Source code for reuse.report

# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
# SPDX-FileCopyrightText: 2023 DB Systel GmbH
# SPDX-FileCopyrightText: 2023 Carmen Bianca BAKKER <carmenbianca@fsfe.org>
# SPDX-FileCopyrightText: 2024 Kerry McAdams <github@klmcadams>
# SPDX-FileCopyrightText: 2024 Sebastien Morais <github@SMoraisAnsys>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Module that contains reports about files and projects for linting."""

import bdb
import contextlib
import datetime
import logging
import multiprocessing as mp
import random
from hashlib import md5
from io import StringIO
from os import cpu_count
from pathlib import Path, PurePath
from typing import (
    Any,
    Collection,
    Iterable,
    NamedTuple,
    Optional,
    Protocol,
    cast,
)
from uuid import uuid4

from . import _LICENSING, __REUSE_version__, __version__
from ._util import (
    _add_plus_to_identifier,
    _checksum,
    _strip_plus_from_identifier,
)
from .extract import _LICENSEREF_PATTERN
from .global_licensing import ReuseDep5
from .i18n import _
from .project import Project, ReuseInfo
from .types import StrPath

_LOGGER = logging.getLogger(__name__)

LINT_VERSION = "1.0"


class _MultiprocessingContainer:
    """Callable container that keeps hold of the data needed to generate a
    :class:`FileReport` for a single path.
    """

    def __init__(
        self, project: Project, do_checksum: bool, add_license_concluded: bool
    ):
        self.do_checksum = do_checksum
        self.add_license_concluded = add_license_concluded
        # Populated lazily by __call__ when a dep5 file has to be re-parsed.
        self.reuse_dep5: Optional[ReuseDep5] = None

        if not isinstance(project.global_licensing, ReuseDep5):
            # Nothing special to do; the project is used as-is.
            self.has_dep5 = False
            self.project = project
            return

        # Remember that a dep5_copyright was (or was not) set prior.
        self.has_dep5 = bool(project.global_licensing)
        # TODO: We create a copy of the project in the following
        # song-and-dance because the debian Copyright object cannot be
        # pickled.
        stripped_project = Project(
            project.root,
            vcs_strategy=project.vcs_strategy,
            license_map=project.license_map,
            licenses=project.licenses.copy(),
            # TODO: adjust this method/class to account for REUSE.toml as
            # well. Unset dep5_copyright
            global_licensing=None,
            include_submodules=project.include_submodules,
            include_meson_subprojects=project.include_meson_subprojects,
        )
        stripped_project.licenses_without_extension = (
            project.licenses_without_extension
        )
        self.project = stripped_project

    def __call__(self, file_: StrPath) -> "_MultiprocessingResult":
        """Generate the report for *file_*.

        Any :class:`Exception` raised while generating the report is captured
        in the returned result instead of being propagated.
        """
        # By remembering that we've parsed the .reuse/dep5, we only parse it
        # once (the first time) inside of each process.
        dep5_pending = self.has_dep5 and not self.reuse_dep5
        if dep5_pending:
            # Best effort: a failure to parse leaves the project without
            # global licensing.
            with contextlib.suppress(Exception):
                parsed = ReuseDep5.from_file(self.project.root / ".reuse/dep5")
                self.reuse_dep5 = parsed
                self.project.global_licensing = parsed

        # pylint: disable=broad-except
        try:
            report = FileReport.generate(
                self.project,
                file_,
                do_checksum=self.do_checksum,
                add_license_concluded=self.add_license_concluded,
            )
        except Exception as exc:
            return _MultiprocessingResult(file_, None, exc)
        return _MultiprocessingResult(file_, report, None)


class _MultiprocessingResult(NamedTuple):
    """Result of :class:`_MultiprocessingContainer`.

    When produced by :meth:`_MultiprocessingContainer.__call__`, exactly one
    of *report* and *error* is set: a successful run carries the report and
    ``None`` as error, a failed run carries ``None`` as report and the caught
    exception as error.
    """

    # The path that was handed to the container.
    path: StrPath
    # The generated report, or None if generating it raised an exception.
    report: Optional["FileReport"]
    # The exception caught while generating the report, or None on success.
    error: Optional[Exception]


def _generate_file_reports(
    project: Project,
    do_checksum: bool = True,
    subset_files: Optional[Collection[StrPath]] = None,
    # os.cpu_count() returns None when the number of CPUs cannot be
    # determined. Treat that as a single CPU instead of failing on
    # ``None > 1`` when this module is imported.
    multiprocessing: bool = (cpu_count() or 1) > 1,
    add_license_concluded: bool = False,
) -> Iterable[_MultiprocessingResult]:
    """Create a :class:`FileReport` for every file in the project, filtered
    by *subset_files*.

    Args:
        project: The project whose files are reported on.
        do_checksum: Passed on to :meth:`FileReport.generate`.
        subset_files: If not :const:`None`, only the files yielded by
            ``project.subset_files(subset_files)`` are processed. Otherwise
            all files of the project are processed.
        multiprocessing: Whether to distribute the work over a process pool.
        add_license_concluded: Passed on to :meth:`FileReport.generate`.

    Returns:
        One :class:`_MultiprocessingResult` per processed file. Errors are
        carried inside the results rather than raised. Without
        multiprocessing, the returned iterable is a lazy :func:`map`.
    """
    container = _MultiprocessingContainer(
        project, do_checksum, add_license_concluded
    )

    files = (
        project.subset_files(subset_files)
        if subset_files is not None
        else project.all_files()
    )
    if multiprocessing:
        with mp.Pool() as pool:
            results: Iterable[_MultiprocessingResult] = pool.map(
                container, files
            )
        # Leaving the ``with`` block terminates the pool; wait for the worker
        # processes to exit.
        pool.join()
    else:
        results = map(container, files)
    return results


def _process_error(error: Exception, path: StrPath) -> None:
    """Log *error*, which occurred while generating the report for *path*.

    Debugger quits and keyboard interrupts are re-raised instead of logged.
    """
    # Facilitate better debugging by being able to quit the program.
    if isinstance(error, (bdb.BdbQuit, KeyboardInterrupt)):
        raise error

    # Pick the message first, then log exactly once.
    if isinstance(error, (OSError, UnicodeError)):
        template = _("Could not read '{path}'")
    else:
        template = _("Unexpected error occurred while parsing '{path}'")
    _LOGGER.error(template.format(path=path), exc_info=error)


class ProjectReportSubsetProtocol(Protocol):
    """A :class:`Protocol` that defines a subset of functionality of
    :class:`ProjectReport`, implemented by :class:`ProjectSubsetReport`.
    """

    # Root path of the project that was reported on.
    path: StrPath
    # License identifier -> paths of the files that use the missing license.
    missing_licenses: dict[str, set[Path]]
    # Paths for which generating a file report failed.
    read_errors: set[Path]
    # The individual file reports that were generated successfully.
    file_reports: set["FileReport"]

    @property
    def files_without_licenses(self) -> set[Path]:
        """Set of paths that have no licensing information."""

    @property
    def files_without_copyright(self) -> set[Path]:
        """Set of paths that have no copyright information."""

    @property
    def is_compliant(self) -> bool:
        """Whether the report subset is compliant with the REUSE Spec."""
class ProjectReport:  # pylint: disable=too-many-instance-attributes
    """Object that holds linting report about the project."""

    def __init__(self, do_checksum: bool = True):
        self.path: StrPath = ""
        self.licenses: dict[str, Path] = {}
        self.missing_licenses: dict[str, set[Path]] = {}
        self.bad_licenses: dict[str, set[Path]] = {}
        self.deprecated_licenses: set[str] = set()
        self.read_errors: set[Path] = set()
        self.file_reports: set[FileReport] = set()
        self.licenses_without_extension: dict[str, Path] = {}

        self.do_checksum = do_checksum

        # Caches for the derived properties below. They are computed on first
        # access and never invalidated afterwards.
        self._unused_licenses: Optional[set[str]] = None
        self._used_licenses: Optional[set[str]] = None
        self._files_without_licenses: Optional[set[Path]] = None
        self._files_without_copyright: Optional[set[Path]] = None
        self._is_compliant: Optional[bool] = None

    def to_dict_lint(self) -> dict[str, Any]:
        """Collects and formats data relevant to linting from report and
        returns it as a dictionary.

        Returns:
            Dictionary containing data from the ProjectReport object. The
            three version keys come first, the remaining keys follow in
            sorted order, and ``recommendations`` comes last.
        """
        number_of_files = len(self.file_reports)
        unsorted_data: dict[str, Any] = {
            "lint_version": LINT_VERSION,
            "reuse_spec_version": __REUSE_version__,
            "reuse_tool_version": __version__,
            "non_compliant": {
                "missing_licenses": self.missing_licenses,
                "unused_licenses": [str(file) for file in self.unused_licenses],
                "deprecated_licenses": [
                    str(file) for file in self.deprecated_licenses
                ],
                "bad_licenses": self.bad_licenses,
                "licenses_without_extension": self.licenses_without_extension,
                "missing_copyright_info": [
                    str(file) for file in self.files_without_copyright
                ],
                "missing_licensing_info": [
                    str(file) for file in self.files_without_licenses
                ],
                "read_errors": [str(file) for file in self.read_errors],
            },
            "files": [
                file_report.to_dict_lint() for file_report in self.file_reports
            ],
            "summary": {
                "used_licenses": list(self.used_licenses),
                "files_total": number_of_files,
                "files_with_copyright_info": number_of_files
                - len(self.files_without_copyright),
                "files_with_licensing_info": number_of_files
                - len(self.files_without_licenses),
                "compliant": self.is_compliant,
            },
            "recommendations": self.recommendations,
        }

        # Sort dictionary keys while keeping the top three keys at the
        # beginning and the recommendations on the bottom.
        leading_keys = [
            "lint_version",
            "reuse_spec_version",
            "reuse_tool_version",
        ]
        trailing_keys = ["recommendations"]
        pinned_keys = {*leading_keys, *trailing_keys}
        middle_keys = sorted(
            key for key in unsorted_data if key not in pinned_keys
        )
        return {
            key: unsorted_data[key]
            for key in leading_keys + middle_keys + trailing_keys
        }

    def bill_of_materials(
        self,
        creator_person: Optional[str] = None,
        creator_organization: Optional[str] = None,
    ) -> str:
        """Generate a bill of materials from the project.

        See https://spdx.org/specifications.

        Args:
            creator_person: Name for the ``Creator: Person`` tag, rendered
                through :func:`format_creator`.
            creator_organization: Name for the ``Creator: Organization`` tag,
                rendered through :func:`format_creator`.

        Returns:
            The SPDX document in tag-value form.

        Raises:
            OSError: If the text of a ``LicenseRef-`` license cannot be read.
        """
        out = StringIO()

        # Write mandatory tags
        out.write("SPDXVersion: SPDX-2.1\n")
        out.write("DataLicense: CC0-1.0\n")
        out.write("SPDXID: SPDXRef-DOCUMENT\n")

        out.write(f"DocumentName: {Path(self.path).resolve().name}\n")
        # TODO: Generate UUID from git revision maybe
        # TODO: Fix the URL
        out.write(
            f"DocumentNamespace: http://spdx.org/spdxdocs/spdx-v2.1-{uuid4()}\n"
        )

        # Author
        out.write(f"Creator: Person: {format_creator(creator_person)}\n")
        out.write(
            f"Creator: Organization: {format_creator(creator_organization)}\n"
        )
        out.write(f"Creator: Tool: reuse-{__version__}\n")

        now = datetime.datetime.now(tz=datetime.timezone.utc)
        out.write(f"Created: {now.strftime('%Y-%m-%dT%H:%M:%SZ')}\n")
        out.write(
            "CreatorComment: <text>This document was created automatically"
            " using available reuse information consistent with"
            " REUSE.</text>\n"
        )

        # Sort by name so that the output order does not depend on set order.
        reports = sorted(self.file_reports, key=lambda x: x.name)

        for report in reports:
            out.write(
                "Relationship: SPDXRef-DOCUMENT DESCRIBES"
                f" {report.spdx_id}\n"
            )

        for report in reports:
            out.write("\n")
            out.write(f"FileName: {report.name}\n")
            out.write(f"SPDXID: {report.spdx_id}\n")
            out.write(f"FileChecksum: SHA1: {report.chk_sum}\n")
            out.write(f"LicenseConcluded: {report.license_concluded}\n")

            for lic in sorted(report.licenses_in_file):
                out.write(f"LicenseInfoInFile: {lic}\n")
            if report.copyright:
                out.write(
                    "FileCopyrightText:"
                    f" <text>{report.copyright}</text>\n"
                )
            else:
                out.write("FileCopyrightText: NONE\n")

        # Licenses
        for lic, path in sorted(self.licenses.items()):
            # Only licenses matching the LicenseRef pattern get their text
            # embedded in the document.
            if _LICENSEREF_PATTERN.match(lic):
                out.write("\n")
                out.write(f"LicenseID: {lic}\n")
                out.write("LicenseName: NOASSERTION\n")

                with (Path(self.path) / path).open(encoding="utf-8") as fp:
                    out.write(f"ExtractedText: <text>{fp.read()}</text>\n")

        return out.getvalue()

    @classmethod
    def generate(
        cls,
        project: Project,
        do_checksum: bool = True,
        # os.cpu_count() returns None when the number of CPUs cannot be
        # determined. Treat that as a single CPU instead of failing on
        # ``None > 1`` when this class is defined.
        multiprocessing: bool = (cpu_count() or 1) > 1,
        add_license_concluded: bool = False,
    ) -> "ProjectReport":
        """Generate a :class:`ProjectReport` from a :class:`Project`.

        Args:
            project: The :class:`Project` to lint.
            do_checksum: Generate a checksum of every file. If this is
                :const:`False`, generate a random checksum for every file.
            multiprocessing: Whether to use multiprocessing.
            add_license_concluded: Whether to aggregate all found SPDX
                expressions into a concluded license.
        """
        project_report = cls(do_checksum=do_checksum)
        project_report.path = project.root
        project_report.licenses = project.licenses
        project_report.licenses_without_extension = (
            project.licenses_without_extension
        )

        results = _generate_file_reports(
            project,
            do_checksum=do_checksum,
            multiprocessing=multiprocessing,
            add_license_concluded=add_license_concluded,
        )
        for result in results:
            if result.error:
                _process_error(result.error, result.path)
                project_report.read_errors.add(Path(result.path))
                continue
            file_report = cast(FileReport, result.report)

            # File report.
            project_report.file_reports.add(file_report)

            # Missing licenses.
            for missing_license in file_report.missing_licenses:
                project_report.missing_licenses.setdefault(
                    missing_license, set()
                ).add(file_report.path)

            # Bad licenses
            for bad_license in file_report.bad_licenses:
                project_report.bad_licenses.setdefault(bad_license, set()).add(
                    file_report.path
                )

        # More bad licenses, and also deprecated licenses
        for name, path in project.licenses.items():
            if name not in project.license_map:
                project_report.bad_licenses.setdefault(name, set()).add(path)
            elif project.license_map[name]["isDeprecatedLicenseId"]:
                project_report.deprecated_licenses.add(name)

        return project_report

    @property
    def used_licenses(self) -> set[str]:
        """Set of license identifiers that are found in file reports."""
        if self._used_licenses is not None:
            return self._used_licenses

        self._used_licenses = {
            lic
            for file_report in self.file_reports
            for lic in file_report.licenses_in_file
        }
        return self._used_licenses

    @property
    def unused_licenses(self) -> set[str]:
        """Set of license identifiers that are not found in any file report."""
        if self._unused_licenses is not None:
            return self._unused_licenses

        used = self.used_licenses
        # A license counts as used when either its plain identifier or its
        # "plus" variant appears in a file report.
        self._unused_licenses = {
            lic
            for lic in self.licenses
            if lic not in used and _add_plus_to_identifier(lic) not in used
        }
        return self._unused_licenses

    @property
    def files_without_licenses(self) -> set[Path]:
        """Set of paths that have no licensing information."""
        if self._files_without_licenses is not None:
            return self._files_without_licenses

        self._files_without_licenses = {
            file_report.path
            for file_report in self.file_reports
            if not file_report.licenses_in_file
        }
        return self._files_without_licenses

    @property
    def files_without_copyright(self) -> set[Path]:
        """Set of paths that have no copyright information."""
        if self._files_without_copyright is not None:
            return self._files_without_copyright

        self._files_without_copyright = {
            file_report.path
            for file_report in self.file_reports
            if not file_report.copyright
        }
        return self._files_without_copyright

    @property
    def is_compliant(self) -> bool:
        """Whether the report is compliant with the REUSE Spec."""
        if self._is_compliant is not None:
            return self._is_compliant

        # Compliant means that every category of problems is empty.
        self._is_compliant = not any(
            (
                self.missing_licenses,
                self.unused_licenses,
                self.bad_licenses,
                self.deprecated_licenses,
                self.licenses_without_extension,
                self.files_without_copyright,
                self.files_without_licenses,
                self.read_errors,
            )
        )
        return self._is_compliant

    @property
    def recommendations(self) -> list[str]:
        """Generate help for next steps based on found REUSE issues"""
        recommendations = []

        # These items should be ordered in the same way as in the summary.
        if self.bad_licenses:
            recommendations.append(
                _(
                    "Fix bad licenses: At least one license in the LICENSES"
                    " directory and/or provided by 'SPDX-License-Identifier'"
                    " tags is invalid. They are either not valid SPDX License"
                    " Identifiers or do not start with 'LicenseRef-'. FAQ about"
                    " custom licenses:"
                    " https://reuse.software/faq/#custom-license"
                )
            )
        if self.deprecated_licenses:
            recommendations.append(
                _(
                    "Fix deprecated licenses: At least one of the licenses in"
                    " the LICENSES directory and/or provided by an"
                    " 'SPDX-License-Identifier' tag or in '.reuse/dep5' has"
                    " been deprecated by SPDX. The current list and their"
                    " respective recommended new identifiers can be found"
                    " here: <https://spdx.org/licenses/#deprecated>"
                )
            )
        if self.licenses_without_extension:
            recommendations.append(
                _(
                    "Fix licenses without file extension: At least one license"
                    " text file in the 'LICENSES' directory does not have a"
                    " '.txt' file extension. Please rename the file(s)"
                    " accordingly."
                )
            )
        if self.missing_licenses:
            recommendations.append(
                _(
                    "Fix missing licenses: For at least one of the license"
                    " identifiers provided by the 'SPDX-License-Identifier'"
                    " tags, there is no corresponding license text file in the"
                    " 'LICENSES' directory. For SPDX license identifiers, you"
                    " can simply run 'reuse download --all' to get any missing"
                    " ones. For custom licenses (starting with 'LicenseRef-'),"
                    " you need to add these files yourself."
                )
            )
        if self.unused_licenses:
            recommendations.append(
                _(
                    "Fix unused licenses: At least one of the license text"
                    " files in 'LICENSES' is not referenced by any file, e.g."
                    " by an 'SPDX-License-Identifier' tag. Please make sure"
                    " that you either tag the accordingly licensed files"
                    " properly, or delete the unused license text if you are"
                    " sure that no file or code snippet is licensed as such."
                )
            )
        if self.read_errors:
            recommendations.append(
                _(
                    "Fix read errors: At least one of the files in your"
                    " directory cannot be read by the tool. Please check the"
                    " file permissions. You will find the affected files at the"
                    " top of the output as part of the logged error messages."
                )
            )
        if self.files_without_copyright or self.files_without_licenses:
            recommendations.append(
                _(
                    "Fix missing copyright/licensing information: For one or"
                    " more files, the tool cannot find copyright and/or"
                    " licensing information. You typically do this by adding"
                    " 'SPDX-FileCopyrightText' and 'SPDX-License-Identifier'"
                    " tags to each file. The tutorial explains additional ways"
                    " to do this: <https://reuse.software/tutorial/>"
                )
            )

        return recommendations
class ProjectSubsetReport:
    """Like a :class:`ProjectReport`, but for a subset of the files using a
    subset of features.
    """

    def __init__(self) -> None:
        self.path: StrPath = ""
        self.missing_licenses: dict[str, set[Path]] = {}
        self.read_errors: set[Path] = set()
        self.file_reports: set[FileReport] = set()

    @classmethod
    def generate(
        cls,
        project: Project,
        subset_files: Collection[StrPath],
        # os.cpu_count() returns None when the number of CPUs cannot be
        # determined. Treat that as a single CPU instead of failing on
        # ``None > 1`` when this class is defined.
        multiprocessing: bool = (cpu_count() or 1) > 1,
    ) -> "ProjectSubsetReport":
        """Generate a :class:`ProjectSubsetReport` from a :class:`Project`.

        Args:
            project: The :class:`Project` to lint.
            subset_files: Only lint the files in this list.
            multiprocessing: Whether to use multiprocessing.
        """
        subset_report = cls()
        subset_report.path = project.root

        # Checksums and concluded licenses are not part of a subset report,
        # so both are switched off.
        results = _generate_file_reports(
            project,
            do_checksum=False,
            subset_files=subset_files,
            multiprocessing=multiprocessing,
            add_license_concluded=False,
        )
        for result in results:
            if result.error:
                _process_error(result.error, result.path)
                subset_report.read_errors.add(Path(result.path))
                continue
            file_report = cast(FileReport, result.report)

            subset_report.file_reports.add(file_report)

            for missing_license in file_report.missing_licenses:
                subset_report.missing_licenses.setdefault(
                    missing_license, set()
                ).add(file_report.path)

        return subset_report

    @property
    def files_without_licenses(self) -> set[Path]:
        """Set of paths that have no licensing information."""
        return {
            file_report.path
            for file_report in self.file_reports
            if not file_report.licenses_in_file
        }

    @property
    def files_without_copyright(self) -> set[Path]:
        """Set of paths that have no copyright information."""
        return {
            file_report.path
            for file_report in self.file_reports
            if not file_report.copyright
        }

    @property
    def is_compliant(self) -> bool:
        """Whether the report subset is compliant with the REUSE Spec."""
        return not any(
            (
                self.missing_licenses,
                self.files_without_copyright,
                self.files_without_licenses,
                self.read_errors,
            )
        )
class FileReport:  # pylint: disable=too-many-instance-attributes
    """Object that holds a linting report about a single file."""

    def __init__(self, name: str, path: StrPath, do_checksum: bool = True):
        self.name = name
        self.path = Path(path)
        self.do_checksum = do_checksum

        self.reuse_infos: list[ReuseInfo] = []

        self.spdx_id: Optional[str] = None
        self.chk_sum: Optional[str] = None
        self.licenses_in_file: list[str] = []
        self.license_concluded: str = ""
        self.copyright: str = ""

        self.bad_licenses: set[str] = set()
        self.missing_licenses: set[str] = set()

    def to_dict_lint(self) -> dict[str, Any]:
        """Turn the report into a json-like dictionary with exclusively
        information relevant for linting.
        """
        return {
            "path": PurePath(self.name).as_posix(),
            "copyrights": [
                {
                    "value": line,
                    "source": reuse_info.source_path,
                    "source_type": (
                        reuse_info.source_type.value
                        if reuse_info.source_type
                        else None
                    ),
                }
                for reuse_info in self.reuse_infos
                for line in reuse_info.copyright_lines
            ],
            "spdx_expressions": [
                {
                    "value": str(expression),
                    "source": reuse_info.source_path,
                    "source_type": (
                        reuse_info.source_type.value
                        if reuse_info.source_type
                        else None
                    ),
                }
                for reuse_info in self.reuse_infos
                for expression in reuse_info.spdx_expressions
            ],
        }

    @classmethod
    def generate(
        cls,
        project: Project,
        path: StrPath,
        do_checksum: bool = True,
        add_license_concluded: bool = False,
    ) -> "FileReport":
        """Generate a FileReport from a path in a Project.

        Args:
            project: The project that *path* belongs to.
            path: The file to report on.
            do_checksum: Compute the checksum of the file. If this is
                :const:`False`, a random 160-bit value is used instead.
            add_license_concluded: Whether to merge all found SPDX
                expressions into a concluded license. If :const:`False`, the
                concluded license is ``NOASSERTION``.

        Raises:
            OSError: If *path* is not a file.
        """
        path = Path(path)
        if not path.is_file():
            raise OSError(f"{path} is not a file")

        relative = project.relative_from_root(path)
        report = cls(f"./{relative}", path, do_checksum=do_checksum)

        # Checksum and ID
        if report.do_checksum:
            report.chk_sum = _checksum(path)
        else:
            # This path avoids a lot of heavy computation, which is handy for
            # scenarios where you only need a unique hash, not a consistent
            # hash.
            report.chk_sum = f"{random.getrandbits(160):040x}"
        spdx_id = md5()
        spdx_id.update(report.name.encode("utf-8"))
        spdx_id.update(report.chk_sum.encode("utf-8"))
        report.spdx_id = f"SPDXRef-{spdx_id.hexdigest()}"

        reuse_infos = project.reuse_info_of(path)
        for reuse_info in reuse_infos:
            for expression in reuse_info.spdx_expressions:
                for identifier in _LICENSING.license_keys(expression):
                    # A license expression akin to Apache-1.0+ should register
                    # correctly if LICENSES/Apache-1.0.txt exists. The set
                    # collapses to one element when there is no plus to strip.
                    identifiers = {
                        identifier,
                        _strip_plus_from_identifier(identifier),
                    }
                    # Bad license
                    if not identifiers.intersection(project.license_map):
                        report.bad_licenses.add(identifier)
                    # Missing license
                    if not identifiers.intersection(project.licenses):
                        report.missing_licenses.add(identifier)

                    # Add license to report.
                    report.licenses_in_file.append(identifier)

        if not add_license_concluded:
            report.license_concluded = "NOASSERTION"
        elif not any(reuse_info.spdx_expressions for reuse_info in reuse_infos):
            report.license_concluded = "NONE"
        else:
            # Merge all the license expressions together, wrapping them in
            # parentheses to make sure an expression doesn't spill into another
            # one. The extra parentheses will be removed by the roundtrip
            # through parse() -> simplify() -> render().
            report.license_concluded = (
                _LICENSING.parse(
                    " AND ".join(
                        f"({expression})"
                        for reuse_info in reuse_infos
                        for expression in reuse_info.spdx_expressions
                    ),
                )
                .simplify()
                .render()
            )

        # Copyright text
        report.copyright = "\n".join(
            sorted(
                line
                for reuse_info in reuse_infos
                for line in reuse_info.copyright_lines
            )
        )

        # Source of licensing and copyright info
        report.reuse_infos = reuse_infos

        return report

    def __hash__(self) -> int:
        # Hash on name and checksum once the checksum is known; fall back to
        # the default object hash otherwise.
        # NOTE(review): no matching __eq__ is visible here, so equality looks
        # like it stays identity-based -- confirm this is intended.
        if self.chk_sum is not None:
            return hash(self.name + self.chk_sum)
        return super().__hash__()
def format_creator(creator: Optional[str]) -> str:
    """Render the creator field based on the provided flag

    Args:
        creator: The creator as provided by the user, or :const:`None` if no
            creator was provided.

    Returns:
        ``"Anonymous ()"`` if *creator* is :const:`None`. *creator* unchanged
        if it contains an opening parenthesis and ends with a closing one.
        Otherwise *creator* followed by an empty pair of parentheses.
    """
    if creator is None:
        return "Anonymous ()"
    if "(" in creator and creator.endswith(")"):
        # The creator field already contains an email address
        return creator
    return creator + " ()"