# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
# SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
# SPDX-FileCopyrightText: 2023 DB Systel GmbH
# SPDX-FileCopyrightText: 2023 Carmen Bianca BAKKER <carmenbianca@fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Module that contains reports about files and projects for linting."""
import contextlib
import datetime
import logging
import multiprocessing as mp
import random
from gettext import gettext as _
from hashlib import md5
from io import StringIO
from os import cpu_count
from pathlib import Path, PurePath
from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Set, cast
from uuid import uuid4
from debian.copyright import Copyright
from . import __REUSE_version__, __version__
from ._util import (
_LICENSEREF_PATTERN,
_LICENSING,
StrPath,
_checksum,
_parse_dep5,
)
from .project import Project, ReuseInfo
_LOGGER = logging.getLogger(__name__)
LINT_VERSION = "1.0"
class _MultiprocessingContainer:
"""Container that remembers some data in order to generate a FileReport."""
def __init__(
self, project: Project, do_checksum: bool, add_license_concluded: bool
):
# TODO: We create a copy of the project in the following song-and-dance
# because the debian Copyright object cannot be pickled.
new_project = Project(
project.root,
vcs_strategy=project.vcs_strategy.__class__,
license_map=project.license_map,
licenses=project.licenses.copy(),
# Unset dep5_copyright
dep5_copyright=None,
include_submodules=project.include_submodules,
include_meson_subprojects=project.include_meson_subprojects,
)
new_project.licenses_without_extension = (
project.licenses_without_extension
)
self.project = new_project
# Remember that a dep5_copyright was (or was not) set prior.
self.has_dep5 = bool(project.dep5_copyright)
self.dep5_copyright: Optional[Copyright] = None
self.do_checksum = do_checksum
self.add_license_concluded = add_license_concluded
def __call__(self, file_: StrPath) -> "_MultiprocessingResult":
# By remembering that we've parsed the .reuse/dep5, we only parse it
# once (the first time) inside of each process.
if self.has_dep5 and not self.dep5_copyright:
with contextlib.suppress(Exception):
self.dep5_copyright = _parse_dep5(
self.project.root / ".reuse/dep5"
)
self.project.dep5_copyright = self.dep5_copyright
# pylint: disable=broad-except
try:
return _MultiprocessingResult(
file_,
FileReport.generate(
self.project,
file_,
do_checksum=self.do_checksum,
add_license_concluded=self.add_license_concluded,
),
None,
)
except Exception as exc:
return _MultiprocessingResult(file_, None, exc)
class _MultiprocessingResult(NamedTuple):
"""Result of :class:`MultiprocessingContainer`."""
path: StrPath
report: Optional["FileReport"]
error: Optional[Exception]
[docs]class ProjectReport: # pylint: disable=too-many-instance-attributes
"""Object that holds linting report about the project."""
def __init__(self, do_checksum: bool = True):
self.path: StrPath = ""
self.licenses: Dict[str, Path] = {}
self.missing_licenses: Dict[str, Set[Path]] = {}
self.bad_licenses: Dict[str, Set[Path]] = {}
self.deprecated_licenses: Set[str] = set()
self.read_errors: Set[Path] = set()
self.file_reports: Set[FileReport] = set()
self.licenses_without_extension: Dict[str, Path] = {}
self.do_checksum = do_checksum
self._unused_licenses: Optional[Set[str]] = None
self._used_licenses: Optional[Set[str]] = None
self._files_without_licenses: Optional[Set[Path]] = None
self._files_without_copyright: Optional[Set[Path]] = None
self._is_compliant: Optional[bool] = None
[docs] def to_dict_lint(self) -> Dict[str, Any]:
"""Collects and formats data relevant to linting from report and returns
it as a dictionary.
Returns:
Dictionary containing data from the ProjectReport object.
"""
# Setup report data container
data: Dict[str, Any] = {
"non_compliant": {
"missing_licenses": self.missing_licenses,
"unused_licenses": [str(file) for file in self.unused_licenses],
"deprecated_licenses": [
str(file) for file in self.deprecated_licenses
],
"bad_licenses": self.bad_licenses,
"licenses_without_extension": self.licenses_without_extension,
"missing_copyright_info": [
str(file) for file in self.files_without_copyright
],
"missing_licensing_info": [
str(file) for file in self.files_without_licenses
],
"read_errors": [str(file) for file in self.read_errors],
},
"files": [],
"summary": {
"used_licenses": [],
},
"recommendations": self.recommendations,
}
# Populate 'files'
for file_report in self.file_reports:
data["files"].append(file_report.to_dict_lint())
# Populate 'summary'
number_of_files = len(self.file_reports)
data["summary"] = {
"used_licenses": list(self.used_licenses),
"files_total": number_of_files,
"files_with_copyright_info": number_of_files
- len(self.files_without_copyright),
"files_with_licensing_info": number_of_files
- len(self.files_without_licenses),
"compliant": self.is_compliant,
}
# Add the top three keys
unsorted_data = {
"lint_version": LINT_VERSION,
"reuse_spec_version": __REUSE_version__,
"reuse_tool_version": __version__,
**data,
}
# Sort dictionary keys while keeping the top three keys at the beginning
# and the recommendations on the bottom
sorted_keys = sorted(list(unsorted_data.keys()))
sorted_keys.remove("lint_version")
sorted_keys.remove("reuse_spec_version")
sorted_keys.remove("reuse_tool_version")
sorted_keys.remove("recommendations")
sorted_keys = (
[
"lint_version",
"reuse_spec_version",
"reuse_tool_version",
]
+ sorted_keys
+ ["recommendations"]
)
sorted_data = {key: unsorted_data[key] for key in sorted_keys}
return sorted_data
[docs] def bill_of_materials(
self,
creator_person: Optional[str] = None,
creator_organization: Optional[str] = None,
) -> str:
"""Generate a bill of materials from the project.
See https://spdx.org/specifications.
"""
out = StringIO()
# Write mandatory tags
out.write("SPDXVersion: SPDX-2.1\n")
out.write("DataLicense: CC0-1.0\n")
out.write("SPDXID: SPDXRef-DOCUMENT\n")
out.write(f"DocumentName: {Path(self.path).resolve().name}\n")
# TODO: Generate UUID from git revision maybe
# TODO: Fix the URL
out.write(
f"DocumentNamespace: http://spdx.org/spdxdocs/spdx-v2.1-{uuid4()}\n"
)
# Author
out.write(f"Creator: Person: {format_creator(creator_person)}\n")
out.write(
f"Creator: Organization: {format_creator(creator_organization)}\n"
)
out.write(f"Creator: Tool: reuse-{__version__}\n")
now = datetime.datetime.now(tz=datetime.timezone.utc)
out.write(f"Created: {now.strftime('%Y-%m-%dT%H:%M:%SZ')}\n")
out.write(
"CreatorComment: <text>This document was created automatically"
" using available reuse information consistent with"
" REUSE.</text>\n"
)
reports = sorted(self.file_reports, key=lambda x: x.name)
for report in reports:
out.write(
"Relationship: SPDXRef-DOCUMENT describes"
f" {report.spdx_id}\n"
)
for report in reports:
out.write("\n")
out.write(f"FileName: {report.name}\n")
out.write(f"SPDXID: {report.spdx_id}\n")
out.write(f"FileChecksum: SHA1: {report.chk_sum}\n")
out.write(f"LicenseConcluded: {report.license_concluded}\n")
for lic in sorted(report.licenses_in_file):
out.write(f"LicenseInfoInFile: {lic}\n")
if report.copyright:
out.write(
"FileCopyrightText:" f" <text>{report.copyright}</text>\n"
)
else:
out.write("FileCopyrightText: NONE\n")
# Licenses
for lic, path in sorted(self.licenses.items()):
if _LICENSEREF_PATTERN.match(lic):
out.write("\n")
out.write(f"LicenseID: {lic}\n")
out.write("LicenseName: NOASSERTION\n")
with (Path(self.path) / path).open(encoding="utf-8") as fp:
out.write(f"ExtractedText: <text>{fp.read()}</text>\n")
return out.getvalue()
[docs] @classmethod
def generate(
cls,
project: Project,
do_checksum: bool = True,
multiprocessing: bool = cpu_count() > 1, # type: ignore
add_license_concluded: bool = False,
) -> "ProjectReport":
"""Generate a ProjectReport from a Project."""
project_report = cls(do_checksum=do_checksum)
project_report.path = project.root
project_report.licenses = project.licenses
project_report.licenses_without_extension = (
project.licenses_without_extension
)
container = _MultiprocessingContainer(
project, do_checksum, add_license_concluded
)
if multiprocessing:
with mp.Pool() as pool:
results: Iterable[_MultiprocessingResult] = pool.map(
container, project.all_files()
)
pool.join()
else:
results = map(container, project.all_files())
for result in results:
if result.error:
if isinstance(result.error, (OSError, UnicodeError)):
_LOGGER.error(
_("Could not read '{path}'").format(path=result.path),
exc_info=result.error,
)
project_report.read_errors.add(Path(result.path))
continue
_LOGGER.error(
_(
"Unexpected error occurred while parsing '{path}'"
).format(path=result.path),
exc_info=result.error,
)
project_report.read_errors.add(Path(result.path))
continue
file_report = cast(FileReport, result.report)
# File report.
project_report.file_reports.add(file_report)
# Missing licenses.
for missing_license in file_report.missing_licenses:
project_report.missing_licenses.setdefault(
missing_license, set()
).add(file_report.path)
# Bad licenses
for bad_license in file_report.bad_licenses:
project_report.bad_licenses.setdefault(bad_license, set()).add(
file_report.path
)
# More bad licenses, and also deprecated licenses
for name, path in project.licenses.items():
if name not in project.license_map:
project_report.bad_licenses.setdefault(name, set()).add(path)
elif project.license_map[name]["isDeprecatedLicenseId"]:
project_report.deprecated_licenses.add(name)
return project_report
@property
def used_licenses(self) -> Set[str]:
"""Set of license identifiers that are found in file reports."""
if self._used_licenses is not None:
return self._used_licenses
self._used_licenses = {
lic
for file_report in self.file_reports
for lic in file_report.licenses_in_file
}
return self._used_licenses
@property
def unused_licenses(self) -> Set[str]:
"""Set of license identifiers that are not found in any file report."""
if self._unused_licenses is not None:
return self._unused_licenses
# First collect licenses that are suspected to be unused.
suspected_unused_licenses = {
lic for lic in self.licenses if lic not in self.used_licenses
}
# Remove false positives.
self._unused_licenses = {
lic
for lic in suspected_unused_licenses
if f"{lic}+" not in self.used_licenses
}
return self._unused_licenses
@property
def files_without_licenses(self) -> Set[Path]:
"""Set of paths that have no licensing information."""
if self._files_without_licenses is not None:
return self._files_without_licenses
self._files_without_licenses = {
file_report.path
for file_report in self.file_reports
if not file_report.licenses_in_file
}
return self._files_without_licenses
@property
def files_without_copyright(self) -> Set[Path]:
"""Set of paths that have no copyright information."""
if self._files_without_copyright is not None:
return self._files_without_copyright
self._files_without_copyright = {
file_report.path
for file_report in self.file_reports
if not file_report.copyright
}
return self._files_without_copyright
@property
def is_compliant(self) -> bool:
"""Whether the report is compliant with the REUSE Spec."""
if self._is_compliant is not None:
return self._is_compliant
self._is_compliant = not any(
(
self.missing_licenses,
self.unused_licenses,
self.bad_licenses,
self.deprecated_licenses,
self.licenses_without_extension,
self.files_without_copyright,
self.files_without_licenses,
self.read_errors,
)
)
return self._is_compliant
@property
def recommendations(self) -> List[str]:
"""Generate help for next steps based on found REUSE issues"""
recommendations = []
# These items should be ordered in the same way as in the summary.
if self.bad_licenses:
recommendations.append(
_(
"Fix bad licenses: At least one license in the LICENSES"
" directory and/or provided by 'SPDX-License-Identifier'"
" tags is invalid. They are either not valid SPDX License"
" Identifiers or do not start with 'LicenseRef-'. FAQ about"
" custom licenses:"
" https://reuse.software/faq/#custom-license"
)
)
if self.deprecated_licenses:
recommendations.append(
_(
"Fix deprecated licenses: At least one of the licenses in"
" the LICENSES directory and/or provided by an"
" 'SPDX-License-Identifier' tag or in '.reuse/dep5' has"
" been deprecated by SPDX. The current list and their"
" respective recommended new identifiers can be found"
" here: <https://spdx.org/licenses/#deprecated>"
)
)
if self.licenses_without_extension:
recommendations.append(
_(
"Fix licenses without file extension: At least one license"
" text file in the 'LICENSES' directory does not have a"
" '.txt' file extension. Please rename the file(s)"
" accordingly."
)
)
if self.missing_licenses:
recommendations.append(
_(
"Fix missing licenses: For at least one of the license"
" identifiers provided by the 'SPDX-License-Identifier'"
" tags, there is no corresponding license text file in the"
" 'LICENSES' directory. For SPDX license identifiers, you"
" can simply run 'reuse download --all' to get any missing"
" ones. For custom licenses (starting with 'LicenseRef-'),"
" you need to add these files yourself."
)
)
if self.unused_licenses:
recommendations.append(
_(
"Fix unused licenses: At least one of the license text"
" files in 'LICENSES' is not referenced by any file, e.g."
" by an 'SPDX-License-Identifier' tag. Please make sure"
" that you either tag the accordingly licensed files"
" properly, or delete the unused license text if you are"
" sure that no file or code snippet is licensed as such."
)
)
if self.read_errors:
recommendations.append(
_(
"Fix read errors: At least one of the files in your"
" directory cannot be read by the tool. Please check the"
" file permissions. You will find the affected files at the"
" top of the output as part of the logged error messages."
)
)
if self.files_without_copyright or self.files_without_licenses:
recommendations.append(
_(
"Fix missing copyright/licensing information: For one or"
" more files, the tool cannot find copyright and/or"
" licensing information. You typically do this by adding"
" 'SPDX-FileCopyrightText' and 'SPDX-License-Identifier'"
" tags to each file. The tutorial explains additional ways"
" to do this: <https://reuse.software/tutorial/>"
)
)
return recommendations
[docs]class FileReport: # pylint: disable=too-many-instance-attributes
"""Object that holds a linting report about a single file."""
def __init__(self, name: str, path: StrPath, do_checksum: bool = True):
self.name = name
self.path = Path(path)
self.do_checksum = do_checksum
self.reuse_infos: List[ReuseInfo] = []
self.spdx_id: Optional[str] = None
self.chk_sum: Optional[str] = None
self.licenses_in_file: List[str] = []
self.license_concluded: str = ""
self.copyright: str = ""
self.bad_licenses: Set[str] = set()
self.missing_licenses: Set[str] = set()
[docs] def to_dict_lint(self) -> Dict[str, Any]:
"""Turn the report into a json-like dictionary with exclusively
information relevant for linting.
"""
return {
# This gets rid of the './' prefix. In Python 3.9, use
# str.removeprefix.
"path": PurePath(self.name).as_posix(),
"copyrights": [
{
"value": line,
"source": reuse_info.source_path,
"source_type": (
reuse_info.source_type.value
if reuse_info.source_type
else None
),
}
for reuse_info in self.reuse_infos
for line in reuse_info.copyright_lines
],
"spdx_expressions": [
{
"value": str(expression),
"source": reuse_info.source_path,
"source_type": (
reuse_info.source_type.value
if reuse_info.source_type
else None
),
}
for reuse_info in self.reuse_infos
for expression in reuse_info.spdx_expressions
],
}
[docs] @classmethod
def generate(
cls,
project: Project,
path: StrPath,
do_checksum: bool = True,
add_license_concluded: bool = False,
) -> "FileReport":
"""Generate a FileReport from a path in a Project."""
path = Path(path)
if not path.is_file():
raise OSError(f"{path} is not a file")
relative = project.relative_from_root(path)
report = cls(f"./{relative}", path, do_checksum=do_checksum)
# Checksum and ID
if report.do_checksum:
report.chk_sum = _checksum(path)
else:
# This path avoids a lot of heavy computation, which is handy for
# scenarios where you only need a unique hash, not a consistent
# hash.
report.chk_sum = f"{random.getrandbits(160):040x}"
spdx_id = md5()
spdx_id.update(report.name.encode("utf-8"))
spdx_id.update(report.chk_sum.encode("utf-8"))
report.spdx_id = f"SPDXRef-{spdx_id.hexdigest()}"
reuse_infos = project.reuse_info_of(path)
for reuse_info in reuse_infos:
for expression in reuse_info.spdx_expressions:
for identifier in _LICENSING.license_keys(expression):
# A license expression akin to Apache-1.0+ should register
# correctly if LICENSES/Apache-1.0.txt exists.
identifiers = {identifier}
if identifier.endswith("+"):
identifiers.add(identifier[:-1])
# Bad license
if not identifiers.intersection(project.license_map):
report.bad_licenses.add(identifier)
# Missing license
if not identifiers.intersection(project.licenses):
report.missing_licenses.add(identifier)
# Add license to report.
report.licenses_in_file.append(identifier)
if not add_license_concluded:
report.license_concluded = "NOASSERTION"
elif not any(reuse_info.spdx_expressions for reuse_info in reuse_infos):
report.license_concluded = "NONE"
else:
# Merge all the license expressions together, wrapping them in
# parentheses to make sure an expression doesn't spill into another
# one. The extra parentheses will be removed by the roundtrip
# through parse() -> simplify() -> render().
report.license_concluded = (
_LICENSING.parse(
" AND ".join(
f"({expression})"
for reuse_info in reuse_infos
for expression in reuse_info.spdx_expressions
),
)
.simplify()
.render()
)
# Copyright text
report.copyright = "\n".join(
sorted(
line
for reuse_info in reuse_infos
for line in reuse_info.copyright_lines
)
)
# Source of licensing and copyright info
report.reuse_infos = reuse_infos
return report
def __hash__(self) -> int:
if self.chk_sum is not None:
return hash(self.name + self.chk_sum)
return super().__hash__()