Source code for reuse.report

# SPDX-FileCopyrightText: 2017-2019 Free Software Foundation Europe e.V.
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Module that contains reports about files and projects for linting."""

import datetime
import logging
import random
from gettext import gettext as _
from hashlib import md5
from io import StringIO
from os import PathLike
from pathlib import Path
from typing import Iterable, List, NamedTuple, Set
from uuid import uuid4

from . import __version__
from ._util import _LICENSING, _checksum
from .project import Project

_LOGGER = logging.getLogger(__name__)

FileReportInfo = NamedTuple(
    "FileReportInfo",
    [
        ("file_report", "FileReport"),
        ("bad_licenses", Set[str]),
        ("missing_licenses", Set[str]),
    ],
)


[docs]class ProjectReport: # pylint: disable=too-many-instance-attributes """Object that holds linting report about the project.""" def __init__(self, do_checksum: bool = True): self.path = None self.licenses = dict() self.missing_licenses = dict() self.bad_licenses = dict() self.read_errors = set() self.file_reports = set() self.do_checksum = do_checksum self._unused_licenses = None self._files_without_licenses = None self._files_without_copyright = None
[docs] def to_dict(self): """Turn the report into a json-like dictionary.""" return { "path": str(Path(self.path).resolve()), "licenses": { identifier: str(path) for identifier, path in self.licenses.items() }, "missing_licenses": { lic: [str(file_) for file_ in files] for lic, files in self.missing_licenses.items() }, "bad_licenses": { lic: [str(file_) for file_ in files] for lic, files in self.bad_licenses.items() }, "read_errors": list(map(str, self.read_errors)), "file_reports": [report.to_dict() for report in self.file_reports], }
[docs] def bill_of_materials(self) -> str: """Generate a bill of materials from the project. See https://spdx.org/specifications. """ out = StringIO() # Write mandatory tags out.write("SPDXVersion: SPDX-2.1\n") out.write("DataLicense: CC0-1.0\n") out.write("SPDXID: SPDXRef-DOCUMENT\n") out.write("DocumentName: {}\n".format(Path(self.path).resolve().name)) # TODO: Generate UUID from git revision maybe # TODO: Fix the URL out.write( "DocumentNamespace: " "http://spdx.org/spdxdocs/spdx-v2.1-{}\n".format(uuid4()) ) # Author # TODO: Fix Person and Organization out.write("Creator: Person: Anonymous ()\n") out.write("Creator: Organization: Anonymous ()\n") out.write("Creator: Tool: reuse-{}\n".format(__version__)) now = datetime.datetime.utcnow() now = now.replace(microsecond=0) out.write("Created: {}Z\n".format(now.isoformat())) out.write( "CreatorComment: <text>This document was created automatically " "using available reuse information consistent with " "REUSE.</text>\n" ) reports = sorted(self.file_reports, key=lambda x: x.spdxfile.name) for report in reports: out.write( "Relationship: SPDXRef-DOCUMENT describes {}\n".format( report.spdxfile.spdx_id ) ) for report in reports: out.write("\n") out.write("FileName: {}\n".format(report.spdxfile.name)) out.write("SPDXID: {}\n".format(report.spdxfile.spdx_id)) out.write( "FileChecksum: SHA1: {}\n".format(report.spdxfile.chk_sum) ) # IMPORTANT: Make no assertion about concluded license. This tool # cannot, with full certainty, determine the license of a file. out.write("LicenseConcluded: NOASSERTION\n") for lic in sorted(report.spdxfile.licenses_in_file): out.write("LicenseInfoInFile: {}\n".format(lic)) if report.spdxfile.copyright: out.write( "FileCopyrightText: <text>{}</text>\n".format( report.spdxfile.copyright ) ) else: out.write("FileCopyrightText: NONE\n") # Licenses for lic, path in sorted(self.licenses.items()): if lic.startswith("LicenseRef-"): out.write("\n") out.write("LicenseID: {}\n".format(lic)) out.write("LicenseName: NOASSERTION\n") with (Path(self.path) / path).open() as fp: out.write( "ExtractedText: <text>{}</text>\n".format(fp.read()) ) return out.getvalue()
[docs] @classmethod def generate( cls, project: Project, paths: Iterable[PathLike] = None, do_checksum: bool = True, ) -> "ProjectReport": """Generate a ProjectReport from a Project.""" if paths is None: paths = [project.root] project_report = cls(do_checksum=do_checksum) project_report.path = project.root project_report.licenses = project.licenses for path in paths: for file_ in project.all_files(path): try: lint_file_info = FileReport.generate( project, file_, do_checksum=project_report.do_checksum ) except (OSError, UnicodeError): # Translators: %s is a path. _LOGGER.info(_("Could not read %s"), file_) project_report.read_errors.add(file_) continue # File report. project_report.file_reports.add(lint_file_info.file_report) # Bad and missing licenses. for license in lint_file_info.missing_licenses: project_report.missing_licenses.setdefault( license, set() ).add(lint_file_info.file_report.path) for license in lint_file_info.bad_licenses: project_report.bad_licenses.setdefault(license, set()).add( lint_file_info.file_report.path ) # More bad licenses for name, path in project.licenses.items(): if name not in project.license_map: project_report.bad_licenses.setdefault(name, set()).add(path) return project_report
@property def used_licenses(self) -> Set[str]: """Set of license identifiers that are found in file reports.""" return set(self.licenses) - self.unused_licenses @property def unused_licenses(self) -> Set[str]: """Set of license identifiers that are not found in any file report.""" if self._unused_licenses is not None: return self._unused_licenses used_licenses = set() unused_licenses = set() for file_report in self.file_reports: for lic in file_report.spdxfile.licenses_in_file: used_licenses.add(lic) for lic in self.licenses: if lic not in used_licenses: unused_licenses.add(lic) self._unused_licenses = unused_licenses return unused_licenses @property def files_without_licenses(self) -> Iterable[PathLike]: """Iterable of paths that have no license information.""" if self._files_without_licenses is not None: return self._files_without_licenses files_without_licenses = [] for file_report in self.file_reports: if not file_report.spdxfile.licenses_in_file: files_without_licenses.append(file_report.path) self._files_without_licenses = files_without_licenses return files_without_licenses @property def files_without_copyright(self) -> Iterable[PathLike]: """Iterable of paths that have no copyright information.""" if self._files_without_copyright is not None: return self._files_without_copyright files_without_copyright = [] for file_report in self.file_reports: if not file_report.spdxfile.copyright: files_without_copyright.append(file_report.path) self._files_without_copyright = files_without_copyright return files_without_copyright
class _File: # pylint: disable=too-few-public-methods """Represent an SPDX file. Sufficiently enough for our purposes, in any case. """ def __init__(self, name, spdx_id=None, chk_sum=None): self.name: str = name self.spdx_id: str = spdx_id self.chk_sum: str = chk_sum self.licenses_in_file: List[str] = [] self.copyright: str = None
[docs]class FileReport: """Object that holds a linting report about a single file. Importantly, it also contains SPDX File information in :attr:`spdxfile`. """ def __init__( self, name: PathLike, path: PathLike, do_checksum: bool = True ): self.spdxfile = _File(name) self.path = Path(path) self.do_checksum = do_checksum
[docs] def to_dict(self): """Turn the report into a json-like dictionary.""" return { "path": str(Path(self.path).resolve()), "name": self.spdxfile.name, "spdx_id": self.spdxfile.spdx_id, "chk_sum": self.spdxfile.chk_sum, "licenses_in_file": [ lic for lic in self.spdxfile.licenses_in_file ], "copyright": self.spdxfile.copyright, }
[docs] @classmethod def generate( cls, project: Project, path: PathLike, do_checksum: bool = True ) -> FileReportInfo: """Generate a FileReport from a path in a Project.""" path = Path(path) if not path.is_file(): raise OSError("{} is not a file".format(path)) # pylint: disable=protected-access relative = project._relative_from_root(path) report = cls("./" + str(relative), path, do_checksum=do_checksum) bad_licenses = set() missing_licenses = set() # Checksum and ID if report.do_checksum: report.spdxfile.chk_sum = _checksum(path) else: # This path avoids a lot of heavy computation, which is handy for # scenarios where you only need a unique hash, not a consistent # hash. report.spdxfile.chk_sum = "%040x" % random.getrandbits(40) spdx_id = md5() spdx_id.update(str(relative).encode("utf-8")) spdx_id.update(report.spdxfile.chk_sum.encode("utf-8")) report.spdxfile.spdx_id = "SPDXRef-{}".format(spdx_id.hexdigest()) spdx_info = project.spdx_info_of(path) for expression in spdx_info.spdx_expressions: for identifier in _LICENSING.license_keys(expression): # Bad license if identifier not in project.license_map: bad_licenses.add(identifier) # Missing license elif identifier not in project.licenses: missing_licenses.add(identifier) # Add license to report. report.spdxfile.licenses_in_file.append(identifier) # Copyright text report.spdxfile.copyright = "\n".join( sorted(spdx_info.copyright_lines) ) return FileReportInfo(report, bad_licenses, missing_licenses)
def __hash__(self): if self.spdxfile.chk_sum is not None: return hash(self.spdxfile.name + self.spdxfile.chk_sum) return super().__hash__(self)