# Source code for reuse.report

# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
# SPDX-FileCopyrightText: 2023 DB Systel GmbH
# SPDX-FileCopyrightText: 2023 Carmen Bianca BAKKER <carmenbianca@fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Module that contains reports about files and projects for linting."""

import contextlib
import datetime
import logging
import multiprocessing as mp
import random
from gettext import gettext as _
from hashlib import md5
from io import StringIO
from os import cpu_count
from pathlib import Path, PurePath
from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Set, cast
from uuid import uuid4

from debian.copyright import Copyright

from . import __REUSE_version__, __version__
from ._util import (
    _LICENSEREF_PATTERN,
    _LICENSING,
    StrPath,
    _checksum,
    _parse_dep5,
)
from .project import Project, ReuseInfo

_LOGGER = logging.getLogger(__name__)

LINT_VERSION = "1.0"


class _MultiprocessingContainer:
    """Callable that builds a :class:`_MultiprocessingResult` for one file.

    The container keeps a copy of the project plus the report options. The
    copy is created without a dep5 copyright object; whether the original
    project had one is remembered in ``has_dep5`` so that ``.reuse/dep5`` can
    be parsed again when the container is called.
    """

    def __init__(
        self, project: Project, do_checksum: bool, add_license_concluded: bool
    ):
        # TODO: The project is rebuilt here, minus its dep5_copyright, because
        # the debian Copyright object cannot be pickled.
        project_copy = Project(
            project.root,
            vcs_strategy=project.vcs_strategy.__class__,
            license_map=project.license_map,
            licenses=project.licenses.copy(),
            # Deliberately left unset; see __call__.
            dep5_copyright=None,
            include_submodules=project.include_submodules,
            include_meson_subprojects=project.include_meson_subprojects,
        )
        project_copy.licenses_without_extension = (
            project.licenses_without_extension
        )
        self.project = project_copy

        # Record whether the original project carried a dep5_copyright.
        self.has_dep5 = bool(project.dep5_copyright)
        self.dep5_copyright: Optional[Copyright] = None

        self.do_checksum = do_checksum
        self.add_license_concluded = add_license_concluded

    def _load_dep5(self) -> None:
        """Parse ``.reuse/dep5`` and attach it to the project copy, unless
        there is nothing to parse or it has already been parsed.

        Any exception raised while parsing is suppressed; in that case the
        stored copyright stays unset and the next call tries again.
        """
        if not self.has_dep5 or self.dep5_copyright:
            return
        with contextlib.suppress(Exception):
            parsed = _parse_dep5(self.project.root / ".reuse/dep5")
            self.dep5_copyright = parsed
            self.project.dep5_copyright = parsed

    def __call__(self, file_: StrPath) -> "_MultiprocessingResult":
        """Generate the report for *file_*.

        Exceptions raised by the report generation are not propagated; they
        are returned in the ``error`` field of the result instead.
        """
        # Once the dep5 file has been parsed successfully, the stored result
        # is reused for every subsequent call on this container.
        self._load_dep5()
        # pylint: disable=broad-except
        try:
            report = FileReport.generate(
                self.project,
                file_,
                do_checksum=self.do_checksum,
                add_license_concluded=self.add_license_concluded,
            )
        except Exception as exc:
            return _MultiprocessingResult(file_, None, exc)
        return _MultiprocessingResult(file_, report, None)


class _MultiprocessingResult(NamedTuple):
    """Result of :class:`_MultiprocessingContainer`.

    The container fills in either ``report`` (on success) or ``error`` (when
    generating the report raised) and sets the other field to None.
    """

    # Path of the file that the container was called with.
    path: StrPath
    # The generated file report, or None if generation raised an exception.
    report: Optional["FileReport"]
    # The exception raised during generation, or None on success.
    error: Optional[Exception]


class ProjectReport:  # pylint: disable=too-many-instance-attributes
    """Object that holds linting report about the project."""

    def __init__(self, do_checksum: bool = True):
        self.path: StrPath = ""
        self.licenses: Dict[str, Path] = {}
        self.missing_licenses: Dict[str, Set[Path]] = {}
        self.bad_licenses: Dict[str, Set[Path]] = {}
        self.deprecated_licenses: Set[str] = set()
        self.read_errors: Set[Path] = set()
        self.file_reports: Set[FileReport] = set()
        self.licenses_without_extension: Dict[str, Path] = {}

        self.do_checksum = do_checksum

        # Lazily computed caches backing the properties of the same name.
        self._unused_licenses: Optional[Set[str]] = None
        self._used_licenses: Optional[Set[str]] = None
        self._files_without_licenses: Optional[Set[Path]] = None
        self._files_without_copyright: Optional[Set[Path]] = None
        self._is_compliant: Optional[bool] = None

    def to_dict_lint(self) -> Dict[str, Any]:
        """Collects and formats data relevant to linting from report and
        returns it as a dictionary.

        Returns:
            Dictionary containing data from the ProjectReport object. The
            three version keys come first, the remaining keys follow in
            alphabetical order, and 'recommendations' comes last.
        """
        number_of_files = len(self.file_reports)
        data: Dict[str, Any] = {
            "lint_version": LINT_VERSION,
            "reuse_spec_version": __REUSE_version__,
            "reuse_tool_version": __version__,
            "non_compliant": {
                "missing_licenses": self.missing_licenses,
                "unused_licenses": [str(file) for file in self.unused_licenses],
                "deprecated_licenses": [
                    str(file) for file in self.deprecated_licenses
                ],
                "bad_licenses": self.bad_licenses,
                "licenses_without_extension": self.licenses_without_extension,
                "missing_copyright_info": [
                    str(file) for file in self.files_without_copyright
                ],
                "missing_licensing_info": [
                    str(file) for file in self.files_without_licenses
                ],
                "read_errors": [str(file) for file in self.read_errors],
            },
            "files": [
                file_report.to_dict_lint() for file_report in self.file_reports
            ],
            "summary": {
                "used_licenses": list(self.used_licenses),
                "files_total": number_of_files,
                "files_with_copyright_info": number_of_files
                - len(self.files_without_copyright),
                "files_with_licensing_info": number_of_files
                - len(self.files_without_licenses),
                "compliant": self.is_compliant,
            },
            "recommendations": self.recommendations,
        }

        # Sort dictionary keys while keeping the top three keys at the
        # beginning and the recommendations on the bottom.
        leading = ["lint_version", "reuse_spec_version", "reuse_tool_version"]
        trailing = ["recommendations"]
        pinned = {*leading, *trailing}
        ordered_keys = (
            leading + sorted(key for key in data if key not in pinned) + trailing
        )
        return {key: data[key] for key in ordered_keys}

    def bill_of_materials(
        self,
        creator_person: Optional[str] = None,
        creator_organization: Optional[str] = None,
    ) -> str:
        """Generate a bill of materials from the project.

        See https://spdx.org/specifications.

        Args:
            creator_person: Value for the 'Creator: Person' tag; rendered
                through format_creator.
            creator_organization: Value for the 'Creator: Organization' tag;
                rendered through format_creator.

        Returns:
            The SPDX tag-value document as a single string.
        """
        out = StringIO()

        # Write mandatory tags
        out.write("SPDXVersion: SPDX-2.1\n")
        out.write("DataLicense: CC0-1.0\n")
        out.write("SPDXID: SPDXRef-DOCUMENT\n")

        out.write(f"DocumentName: {Path(self.path).resolve().name}\n")
        # TODO: Generate UUID from git revision maybe
        # TODO: Fix the URL
        out.write(
            f"DocumentNamespace: http://spdx.org/spdxdocs/spdx-v2.1-{uuid4()}\n"
        )

        # Author
        out.write(f"Creator: Person: {format_creator(creator_person)}\n")
        out.write(
            f"Creator: Organization: {format_creator(creator_organization)}\n"
        )
        out.write(f"Creator: Tool: reuse-{__version__}\n")

        now = datetime.datetime.now(tz=datetime.timezone.utc)
        out.write(f"Created: {now.strftime('%Y-%m-%dT%H:%M:%SZ')}\n")
        out.write(
            "CreatorComment: <text>This document was created automatically"
            " using available reuse information consistent with"
            " REUSE.</text>\n"
        )

        # Sorting by name keeps the order of the file sections stable.
        reports = sorted(self.file_reports, key=lambda x: x.name)

        for report in reports:
            out.write(
                "Relationship: SPDXRef-DOCUMENT describes"
                f" {report.spdx_id}\n"
            )

        for report in reports:
            out.write("\n")
            out.write(f"FileName: {report.name}\n")
            out.write(f"SPDXID: {report.spdx_id}\n")
            out.write(f"FileChecksum: SHA1: {report.chk_sum}\n")
            out.write(f"LicenseConcluded: {report.license_concluded}\n")

            for lic in sorted(report.licenses_in_file):
                out.write(f"LicenseInfoInFile: {lic}\n")
            if report.copyright:
                out.write(
                    "FileCopyrightText:"
                    f" <text>{report.copyright}</text>\n"
                )
            else:
                out.write("FileCopyrightText: NONE\n")

        # Licenses. Only identifiers matching _LICENSEREF_PATTERN get their
        # text embedded in the document.
        for lic, path in sorted(self.licenses.items()):
            if _LICENSEREF_PATTERN.match(lic):
                out.write("\n")
                out.write(f"LicenseID: {lic}\n")
                out.write("LicenseName: NOASSERTION\n")

                with (Path(self.path) / path).open(encoding="utf-8") as fp:
                    out.write(f"ExtractedText: <text>{fp.read()}</text>\n")

        return out.getvalue()

    @classmethod
    def generate(
        cls,
        project: Project,
        do_checksum: bool = True,
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to a single CPU instead of failing on `None > 1`.
        multiprocessing: bool = (cpu_count() or 1) > 1,
        add_license_concluded: bool = False,
    ) -> "ProjectReport":
        """Generate a ProjectReport from a Project.

        Args:
            project: The project whose files are reported on.
            do_checksum: Passed on to every file report.
            multiprocessing: Generate the file reports in a process pool
                instead of sequentially.
            add_license_concluded: Passed on to every file report.

        Returns:
            The populated report. Files whose report could not be generated
            are logged and recorded in ``read_errors``.
        """
        project_report = cls(do_checksum=do_checksum)
        project_report.path = project.root
        project_report.licenses = project.licenses
        project_report.licenses_without_extension = (
            project.licenses_without_extension
        )

        container = _MultiprocessingContainer(
            project, do_checksum, add_license_concluded
        )

        if multiprocessing:
            with mp.Pool() as pool:
                results: Iterable[_MultiprocessingResult] = pool.map(
                    container, project.all_files()
                )
            # join() must come after the `with` block: it raises ValueError
            # on a pool that has not been closed or terminated, and leaving
            # the block is what terminates the pool.
            pool.join()
        else:
            results = map(container, project.all_files())

        for result in results:
            if result.error is not None:
                if isinstance(result.error, (OSError, UnicodeError)):
                    message = _("Could not read '{path}'")
                else:
                    message = _(
                        "Unexpected error occurred while parsing '{path}'"
                    )
                _LOGGER.error(
                    message.format(path=result.path),
                    exc_info=result.error,
                )
                project_report.read_errors.add(Path(result.path))
                continue

            file_report = cast(FileReport, result.report)

            # File report.
            project_report.file_reports.add(file_report)

            # Missing licenses.
            for missing_license in file_report.missing_licenses:
                project_report.missing_licenses.setdefault(
                    missing_license, set()
                ).add(file_report.path)

            # Bad licenses
            for bad_license in file_report.bad_licenses:
                project_report.bad_licenses.setdefault(bad_license, set()).add(
                    file_report.path
                )

        # More bad licenses, and also deprecated licenses
        for name, path in project.licenses.items():
            if name not in project.license_map:
                project_report.bad_licenses.setdefault(name, set()).add(path)
            elif project.license_map[name]["isDeprecatedLicenseId"]:
                project_report.deprecated_licenses.add(name)

        return project_report

    @property
    def used_licenses(self) -> Set[str]:
        """Set of license identifiers that are found in file reports."""
        if self._used_licenses is not None:
            return self._used_licenses

        self._used_licenses = {
            lic
            for file_report in self.file_reports
            for lic in file_report.licenses_in_file
        }
        return self._used_licenses

    @property
    def unused_licenses(self) -> Set[str]:
        """Set of license identifiers that are not found in any file report."""
        if self._unused_licenses is not None:
            return self._unused_licenses

        # First collect licenses that are suspected to be unused.
        suspected_unused_licenses = {
            lic for lic in self.licenses if lic not in self.used_licenses
        }
        # Remove false positives: a license counts as used when its
        # "or later" form (identifier followed by '+') is used.
        self._unused_licenses = {
            lic
            for lic in suspected_unused_licenses
            if f"{lic}+" not in self.used_licenses
        }
        return self._unused_licenses

    @property
    def files_without_licenses(self) -> Set[Path]:
        """Set of paths that have no licensing information."""
        if self._files_without_licenses is not None:
            return self._files_without_licenses

        self._files_without_licenses = {
            file_report.path
            for file_report in self.file_reports
            if not file_report.licenses_in_file
        }
        return self._files_without_licenses

    @property
    def files_without_copyright(self) -> Set[Path]:
        """Set of paths that have no copyright information."""
        if self._files_without_copyright is not None:
            return self._files_without_copyright

        self._files_without_copyright = {
            file_report.path
            for file_report in self.file_reports
            if not file_report.copyright
        }
        return self._files_without_copyright

    @property
    def is_compliant(self) -> bool:
        """Whether the report is compliant with the REUSE Spec."""
        if self._is_compliant is not None:
            return self._is_compliant

        # Compliant means that every one of these collections is empty.
        self._is_compliant = not any(
            (
                self.missing_licenses,
                self.unused_licenses,
                self.bad_licenses,
                self.deprecated_licenses,
                self.licenses_without_extension,
                self.files_without_copyright,
                self.files_without_licenses,
                self.read_errors,
            )
        )
        return self._is_compliant

    @property
    def recommendations(self) -> List[str]:
        """Generate help for next steps based on found REUSE issues"""
        recommendations: List[str] = []

        # These items should be ordered in the same way as in the summary.
        if self.bad_licenses:
            recommendations.append(
                _(
                    "Fix bad licenses: At least one license in the LICENSES"
                    " directory and/or provided by 'SPDX-License-Identifier'"
                    " tags is invalid. They are either not valid SPDX License"
                    " Identifiers or do not start with 'LicenseRef-'. FAQ about"
                    " custom licenses:"
                    " https://reuse.software/faq/#custom-license"
                )
            )
        if self.deprecated_licenses:
            recommendations.append(
                _(
                    "Fix deprecated licenses: At least one of the licenses in"
                    " the LICENSES directory and/or provided by an"
                    " 'SPDX-License-Identifier' tag or in '.reuse/dep5' has"
                    " been deprecated by SPDX. The current list and their"
                    " respective recommended new identifiers can be found"
                    " here: <https://spdx.org/licenses/#deprecated>"
                )
            )
        if self.licenses_without_extension:
            recommendations.append(
                _(
                    "Fix licenses without file extension: At least one license"
                    " text file in the 'LICENSES' directory does not have a"
                    " '.txt' file extension. Please rename the file(s)"
                    " accordingly."
                )
            )
        if self.missing_licenses:
            recommendations.append(
                _(
                    "Fix missing licenses: For at least one of the license"
                    " identifiers provided by the 'SPDX-License-Identifier'"
                    " tags, there is no corresponding license text file in the"
                    " 'LICENSES' directory. For SPDX license identifiers, you"
                    " can simply run 'reuse download --all' to get any missing"
                    " ones. For custom licenses (starting with 'LicenseRef-'),"
                    " you need to add these files yourself."
                )
            )
        if self.unused_licenses:
            recommendations.append(
                _(
                    "Fix unused licenses: At least one of the license text"
                    " files in 'LICENSES' is not referenced by any file, e.g."
                    " by an 'SPDX-License-Identifier' tag. Please make sure"
                    " that you either tag the accordingly licensed files"
                    " properly, or delete the unused license text if you are"
                    " sure that no file or code snippet is licensed as such."
                )
            )
        if self.read_errors:
            recommendations.append(
                _(
                    "Fix read errors: At least one of the files in your"
                    " directory cannot be read by the tool. Please check the"
                    " file permissions. You will find the affected files at the"
                    " top of the output as part of the logged error messages."
                )
            )
        if self.files_without_copyright or self.files_without_licenses:
            recommendations.append(
                _(
                    "Fix missing copyright/licensing information: For one or"
                    " more files, the tool cannot find copyright and/or"
                    " licensing information. You typically do this by adding"
                    " 'SPDX-FileCopyrightText' and 'SPDX-License-Identifier'"
                    " tags to each file. The tutorial explains additional ways"
                    " to do this: <https://reuse.software/tutorial/>"
                )
            )

        return recommendations
class FileReport:  # pylint: disable=too-many-instance-attributes
    """Object that holds a linting report about a single file."""

    def __init__(self, name: str, path: StrPath, do_checksum: bool = True):
        self.name = name
        self.path = Path(path)
        self.do_checksum = do_checksum

        self.reuse_infos: List[ReuseInfo] = []
        self.spdx_id: Optional[str] = None
        self.chk_sum: Optional[str] = None
        self.licenses_in_file: List[str] = []
        self.license_concluded: str = ""
        self.copyright: str = ""

        self.bad_licenses: Set[str] = set()
        self.missing_licenses: Set[str] = set()

    @staticmethod
    def _lint_entry(value: str, reuse_info: ReuseInfo) -> Dict[str, Any]:
        """Pair *value* with the source information of *reuse_info*."""
        source_type = reuse_info.source_type
        return {
            "value": value,
            "source": reuse_info.source_path,
            "source_type": source_type.value if source_type else None,
        }

    def to_dict_lint(self) -> Dict[str, Any]:
        """Turn the report into a json-like dictionary with exclusively
        information relevant for linting.

        Returns:
            Dictionary with the keys 'path', 'copyrights' and
            'spdx_expressions'. The latter two hold one entry per copyright
            line / SPDX expression of every stored reuse info.
        """
        return {
            # This gets rid of the './' prefix. In Python 3.9, use
            # str.removeprefix.
            "path": PurePath(self.name).as_posix(),
            "copyrights": [
                self._lint_entry(line, reuse_info)
                for reuse_info in self.reuse_infos
                for line in reuse_info.copyright_lines
            ],
            "spdx_expressions": [
                self._lint_entry(str(expression), reuse_info)
                for reuse_info in self.reuse_infos
                for expression in reuse_info.spdx_expressions
            ],
        }

    @classmethod
    def generate(
        cls,
        project: Project,
        path: StrPath,
        do_checksum: bool = True,
        add_license_concluded: bool = False,
    ) -> "FileReport":
        """Generate a FileReport from a path in a Project.

        Args:
            project: Project that *path* belongs to.
            path: The file to report on.
            do_checksum: Compute the checksum of the file. When False, a
                random value is stored as the checksum instead.
            add_license_concluded: Derive ``license_concluded`` from the SPDX
                expressions found for the file. When False it is set to
                'NOASSERTION'.

        Returns:
            The populated report.

        Raises:
            OSError: If *path* is not a file.
        """
        path = Path(path)
        if not path.is_file():
            raise OSError(f"{path} is not a file")

        relative = project.relative_from_root(path)
        report = cls(f"./{relative}", path, do_checksum=do_checksum)

        # Checksum and ID
        if report.do_checksum:
            report.chk_sum = _checksum(path)
        else:
            # This path avoids a lot of heavy computation, which is handy for
            # scenarios where you only need a unique hash, not a consistent
            # hash.
            report.chk_sum = f"{random.getrandbits(160):040x}"
        spdx_id = md5()
        spdx_id.update(report.name.encode("utf-8"))
        spdx_id.update(report.chk_sum.encode("utf-8"))
        report.spdx_id = f"SPDXRef-{spdx_id.hexdigest()}"

        reuse_infos = project.reuse_info_of(path)
        for reuse_info in reuse_infos:
            for expression in reuse_info.spdx_expressions:
                for identifier in _LICENSING.license_keys(expression):
                    # A license expression akin to Apache-1.0+ should register
                    # correctly if LICENSES/Apache-1.0.txt exists.
                    identifiers = {identifier}
                    if identifier.endswith("+"):
                        identifiers.add(identifier[:-1])
                    # Bad license
                    if not identifiers.intersection(project.license_map):
                        report.bad_licenses.add(identifier)
                    # Missing license
                    if not identifiers.intersection(project.licenses):
                        report.missing_licenses.add(identifier)

                    # Add license to report.
                    report.licenses_in_file.append(identifier)

        if not add_license_concluded:
            report.license_concluded = "NOASSERTION"
        elif not any(reuse_info.spdx_expressions for reuse_info in reuse_infos):
            report.license_concluded = "NONE"
        else:
            # Merge all the license expressions together, wrapping them in
            # parentheses to make sure an expression doesn't spill into another
            # one. The extra parentheses will be removed by the roundtrip
            # through parse() -> simplify() -> render().
            report.license_concluded = (
                _LICENSING.parse(
                    " AND ".join(
                        f"({expression})"
                        for reuse_info in reuse_infos
                        for expression in reuse_info.spdx_expressions
                    ),
                )
                .simplify()
                .render()
            )

        # Copyright text
        report.copyright = "\n".join(
            sorted(
                line
                for reuse_info in reuse_infos
                for line in reuse_info.copyright_lines
            )
        )
        # Source of licensing and copyright info
        report.reuse_infos = reuse_infos
        return report

    def __hash__(self) -> int:
        # Hash on name plus checksum once a checksum is known; until then
        # fall back to the default object hash.
        if self.chk_sum is not None:
            return hash(self.name + self.chk_sum)
        return super().__hash__()
def format_creator(creator: Optional[str]) -> str:
    """Render the creator field based on the provided flag.

    Args:
        creator: Name of the creator, optionally already followed by an
            e-mail address in parentheses.

    Returns:
        The creator followed by ' ()' unless it already ends in a
        parenthesised part. A missing, empty or blank creator is rendered as
        'Anonymous ()'.
    """
    # An empty or blank name would otherwise be rendered as a bare ' ()'.
    if creator is None or not creator.strip():
        return "Anonymous ()"
    if "(" in creator and creator.endswith(")"):
        # The creator field already contains an email address
        return creator
    return creator + " ()"