Source code for reuse.global_licensing

# SPDX-FileCopyrightText: 2023 Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Code for parsing and validating REUSE.toml."""

# mypy: disable-error-code=attr-defined

import logging
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from enum import Enum
from pathlib import Path, PurePath
from typing import (
    Any,
    Callable,
    Collection,
    Generator,
    Optional,
    Type,
    TypeVar,
    Union,
    cast,
    overload,
)

import attrs
import tomlkit
from attr.validators import _InstanceOfValidator as _AttrInstanceOfValidator
from boolean.boolean import Expression, ParseError
from debian.copyright import Copyright
from debian.copyright import Error as DebianError
from license_expression import ExpressionError

from . import _LICENSING, ReuseInfo, SourceType
from .covered_files import iter_files
from .exceptions import (
    GlobalLicensingParseError,
    GlobalLicensingParseTypeError,
    GlobalLicensingParseValueError,
)
from .i18n import _
from .types import StrPath
from .vcs import VCSStrategy

# Logger named after this module.
_LOGGER = logging.getLogger(__name__)

# Generic type variable used in the signatures of the validators and
# converters defined in this module.
_T = TypeVar("_T")

#: Current version of REUSE.toml.
REUSE_TOML_VERSION = 1

#: Relation between Python attribute names and TOML keys. The keys of this
#: mapping are attribute names of :class:`AnnotationsItem`; the values are the
#: key names looked up in an [[annotations]] table.
_TOML_KEYS: dict[str, str] = {
    "paths": "path",
    "precedence": "precedence",
    "copyright_lines": "SPDX-FileCopyrightText",
    "spdx_expressions": "SPDX-License-Identifier",
}


class PrecedenceType(Enum):
    """The ways in which an entry of a :class:`GlobalLicensing` can be ordered
    relative to the REUSE information found in the covered file itself.
    """

    #: Combine what the file declares with what the global licensing file
    #: declares.
    AGGREGATE = "aggregate"

    #: Prefer whatever is closest to the covered file: normally the file
    #: itself, falling back to the global licensing file when the file holds
    #: no REUSE information.
    CLOSEST = "closest"

    #: Disregard the file; only the global licensing file counts.
    OVERRIDE = "override"
@attrs.define
class _CollectionOfValidator:
    """attrs validator that checks the type of a collection, the type of each
    of its items, and (unless *optional*) that the collection is not empty.
    """

    collection_type: Type[Collection] = attrs.field()
    value_type: Type = attrs.field()
    optional: bool = attrs.field(default=True)

    def __call__(
        self,
        instance: object,
        attribute: attrs.Attribute,
        value: Collection[_T],
    ) -> None:
        """Validate *value*.

        Raises:
            GlobalLicensingParseTypeError: *value* is not a
                ``collection_type``, or one of its items is not a
                ``value_type``.
            GlobalLicensingParseValueError: *value* is empty while the
                validator is not optional.
        """
        # Hack: for an AnnotationsItem, report the TOML key in the error
        # message instead of the name of the Python attribute.
        attr_name = (
            _TOML_KEYS[attribute.name]
            if isinstance(instance, AnnotationsItem)
            else attribute.name
        )
        shown_name = repr(attr_name)
        source = getattr(instance, "source", None)

        if not isinstance(value, self.collection_type):
            message = _(
                "{attr_name} must be a {type_name} (got {value} that is a"
                " {value_class})."
            ).format(
                attr_name=shown_name,
                type_name=self.collection_type.__name__,
                value=repr(value),
                value_class=repr(value.__class__),
            )
            raise GlobalLicensingParseTypeError(message, source=source)

        for element in value:
            if isinstance(element, self.value_type):
                continue
            message = _(
                "Item in {attr_name} collection must be a {type_name}"
                " (got {item_value} that is a {item_class})."
            ).format(
                attr_name=shown_name,
                type_name=self.value_type.__name__,
                item_value=repr(element),
                item_class=repr(element.__class__),
            )
            raise GlobalLicensingParseTypeError(message, source=source)

        if not (self.optional or value):
            message = _("{attr_name} must not be empty.").format(
                attr_name=shown_name,
            )
            raise GlobalLicensingParseValueError(message, source=source)


def _validate_collection_of(
    collection_type: Type[Collection],
    value_type: Type[_T],
    optional: bool = False,
) -> Callable[[Any, attrs.Attribute, Collection[_T]], Any]:
    """Build a :class:`_CollectionOfValidator`. Note that *optional* defaults
    to False here.
    """
    return _CollectionOfValidator(
        collection_type=collection_type,
        value_type=value_type,
        optional=optional,
    )


class _InstanceOfValidator(_AttrInstanceOfValidator):
    """attrs' instance-of validator, re-raising its :class:`TypeError` as a
    :class:`GlobalLicensingParseTypeError`.
    """

    def __call__(self, inst: Any, attr: attrs.Attribute, value: _T) -> None:
        try:
            super().__call__(inst, attr, value)
        except TypeError as error:
            # The failing attribute, the expected type and the offending value
            # are read from positions 1, 2 and 3 of the error's arguments.
            failed_attribute = error.args[1]
            expected_type = error.args[2]
            message = _(
                "{name} must be a {type} (got {value} that is a"
                " {value_type})."
            ).format(
                name=repr(failed_attribute.name),
                type=repr(expected_type.__name__),
                value=repr(error.args[3]),
                value_type=repr(error.args[3].__class__),
            )
            raise GlobalLicensingParseTypeError(
                message, source=getattr(inst, "source", None)
            ) from error


def _instance_of(
    type_: Type[_T],
) -> Callable[[Any, attrs.Attribute, _T], Any]:
    """Build an :class:`_InstanceOfValidator` for *type_*."""
    return _InstanceOfValidator(type_)


def _str_to_global_precedence(value: Any) -> PrecedenceType:
    """Convert *value* into a :class:`PrecedenceType`.

    Raises:
        GlobalLicensingParseValueError: *value* is not the value of any
            member of :class:`PrecedenceType`.
    """
    try:
        precedence = PrecedenceType(value)
    except ValueError as error:
        message = _(
            "The value of 'precedence' must be one of {precedence_vals}"
            " (got {received})"
        ).format(
            precedence_vals=tuple(option.value for option in PrecedenceType),
            received=repr(value),
        )
        raise GlobalLicensingParseValueError(message) from error
    return precedence


@overload
def _str_to_set(value: str) -> set[str]: ...


@overload
def _str_to_set(value: Union[None, _T, Collection[_T]]) -> set[_T]: ...


def _str_to_set(
    value: Union[str, None, _T, Collection[_T]]
) -> Union[set[str], set[_T]]:
    """Convert *value* into a set.

    None becomes the empty set. A non-string object that has ``__iter__``
    becomes the set of its items. Anything else, strings included, becomes a
    set with *value* as its only item.
    """
    if value is None:
        return cast(set[str], set())
    if not isinstance(value, str) and hasattr(value, "__iter__"):
        return set(value)
    return {value}


def _str_to_set_of_expr(value: Any) -> set[Expression]:
    """Convert *value* into a set via :func:`_str_to_set` and parse every
    item of it with ``_LICENSING.parse``.

    Raises:
        GlobalLicensingParseValueError: an item could not be parsed.
    """
    parsed: set[Expression] = set()
    for raw_expression in _str_to_set(value):
        try:
            parsed.add(_LICENSING.parse(raw_expression))
        except (ExpressionError, ParseError) as error:
            message = _("Could not parse '{expression}'").format(
                expression=raw_expression
            )
            raise GlobalLicensingParseValueError(message) from error
    return parsed
@attrs.define
class GlobalLicensing(ABC):
    """Abstract base class for a configuration file that holds licensing
    information which applies to other files in the project.
    """

    # Validated to be a string.
    source: str = attrs.field(validator=_instance_of(str))

    @classmethod
    @abstractmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing":
        """Create a :class:`GlobalLicensing` object out of the contents of the
        file at *path*.

        Raises:
            FileNotFoundError: file doesn't exist.
            OSError: some other error surrounding I/O.
            GlobalLicensingParseError: file could not be parsed.
        """

    @abstractmethod
    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Look up the REUSE information that the configuration defines for
        *path*. *path* must be relative to the root of a
        :class:`reuse.project.Project`.

        Each key of the returned dictionary is the precedence type that
        applies to the information stored under it.
        """
@attrs.define
class ReuseDep5(GlobalLicensing):
    """A soft wrapper around :class:`Copyright`."""

    dep5_copyright: Copyright

    @classmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseDep5":
        """Read the file at *path* as UTF-8 and parse it with
        :class:`Copyright`.

        Raises:
            GlobalLicensingParseError: the file could not be decoded or
                parsed.
        """
        dep5_path = Path(path)
        source = str(dep5_path)
        try:
            with dep5_path.open(encoding="utf-8") as fp:
                return cls(source, Copyright(fp))
        # TODO: Remove ValueError once
        # <https://salsa.debian.org/python-debian-team/python-debian/-/merge_requests/123>
        # is closed
        except (UnicodeDecodeError, DebianError, ValueError) as error:
            raise GlobalLicensingParseError(
                str(error), source=source
            ) from error

    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Return the information of the files paragraph that matches *path*,
        filed under :attr:`PrecedenceType.AGGREGATE`, or an empty dictionary
        if no paragraph matches.
        """
        posix_path = PurePath(path).as_posix()
        paragraph = self.dep5_copyright.find_files_paragraph(posix_path)
        if paragraph is None:
            return {}

        info = ReuseInfo(
            spdx_expressions={_LICENSING.parse(paragraph.license.synopsis)},
            copyright_lines={
                line.strip() for line in paragraph.copyright.splitlines()
            },
            path=posix_path,
            source_type=SourceType.DEP5,
            # This is hardcoded. It must be a relative path from the project
            # root. self.source is not (guaranteed) a relative path.
            source_path=".reuse/dep5",
        )
        return {PrecedenceType.AGGREGATE: [info]}
@attrs.define
class AnnotationsItem:
    """A class that maps to a single [[annotations]] table element in
    REUSE.toml.

    Every path in :attr:`paths` is a glob. A single ``*`` matches any run of
    characters other than ``/``, ``**`` matches any run of characters, and a
    backslash makes the character after it literal (``\\*`` is an asterisk,
    ``\\\\`` is a backslash).
    """

    paths: set[str] = attrs.field(
        converter=_str_to_set,
        validator=_validate_collection_of(set, str, optional=False),
    )
    precedence: PrecedenceType = attrs.field(
        converter=_str_to_global_precedence, default=PrecedenceType.CLOSEST
    )
    copyright_lines: set[str] = attrs.field(
        converter=_str_to_set,
        validator=_validate_collection_of(set, str, optional=True),
        default=None,
    )
    spdx_expressions: set[Expression] = attrs.field(
        converter=_str_to_set_of_expr,
        validator=_validate_collection_of(set, Expression, optional=True),
        default=None,
    )

    # One compiled alternation of the translated globs of all paths.
    _paths_regex: re.Pattern = attrs.field(init=False)

    def __attrs_post_init__(self) -> None:
        self._paths_regex = re.compile(
            "|".join(self._glob_to_regex(path) for path in self.paths)
        )

    @staticmethod
    def _glob_to_regex(path: str) -> str:
        """Translate the glob *path* into an anchored regular expression.

        An escaped character is always emitted as a literal. In particular,
        an escaped asterisk is never mistaken for a wildcard, and a wildcard
        that directly precedes a backslash escape is emitted before the
        escaped character instead of being dropped.
        """
        # pylint: disable=too-many-branches
        blocks: list[str] = []
        # The previous character was a backslash that escapes this one.
        escaping = False
        # An unescaped '*' was read but nothing was emitted for it yet,
        # because a second '*' would turn it into a globstar.
        pending_star = False
        # A globstar was emitted and no literal has followed it yet. In this
        # state further asterisks are redundant and slashes are swallowed, so
        # that '**/foo' also matches 'foo'.
        globstar = False

        for char in path:
            if escaping:
                escaping = False
                if char == "/":
                    # An escaped slash is handled like a plain slash.
                    if not globstar:
                        blocks.append("/")
                else:
                    blocks.append(re.escape(char))
                    globstar = False
                continue

            if char == "*":
                if pending_star:
                    blocks.append(r".*")
                    pending_star = False
                    globstar = True
                elif not globstar:
                    pending_star = True
                continue

            # Anything other than another asterisk settles a pending single
            # asterisk.
            if pending_star:
                blocks.append(r"[^/]*")
                pending_star = False

            if char == "\\":
                escaping = True
            elif char == "/":
                if not globstar:
                    blocks.append("/")
            else:
                blocks.append(re.escape(char))
                globstar = False

        if pending_star:
            blocks.append(r"[^/]*")
        # A trailing backslash that escapes nothing is dropped.
        result = "".join(blocks)
        return f"^({result})$"

    @classmethod
    def from_dict(cls, values: dict[str, Any]) -> "AnnotationsItem":
        """Create an :class:`AnnotationsItem` from a dictionary that uses the
        key-value pairs for an [[annotations]] table in REUSE.toml.

        Raises:
            GlobalLicensingParseError: a value has the wrong type or could not
                be parsed.
        """
        new_dict = {}
        new_dict["paths"] = values.get(_TOML_KEYS["paths"])
        # Only pass the precedence on when it is defined, such that the
        # default of the attribute applies otherwise.
        precedence = values.get(_TOML_KEYS["precedence"])
        if precedence is not None:
            new_dict["precedence"] = precedence
        new_dict["copyright_lines"] = values.get(_TOML_KEYS["copyright_lines"])
        new_dict["spdx_expressions"] = values.get(
            _TOML_KEYS["spdx_expressions"]
        )
        return cls(**new_dict)  # type: ignore

    def matches(self, path: str) -> bool:
        """Determine whether *path* matches any of the paths (or path globs) in
        :class:`AnnotationsItem`.
        """
        return bool(self._paths_regex.match(path))
@attrs.define
class ReuseTOML(GlobalLicensing):
    """A class that contains the data parsed from a REUSE.toml file."""

    version: int = attrs.field(validator=_instance_of(int))
    annotations: list[AnnotationsItem] = attrs.field(
        validator=_validate_collection_of(list, AnnotationsItem, optional=True)
    )

    @classmethod
    def from_dict(cls, values: dict[str, Any], source: str) -> "ReuseTOML":
        """Create a :class:`ReuseTOML` from the dict version of REUSE.toml.

        Raises:
            GlobalLicensingParseError: a value has the wrong type or could not
                be parsed. The error's ``source`` is set to *source*.
        """
        new_dict = {}
        new_dict["version"] = values.get("version")
        new_dict["source"] = source

        annotation_dicts = values.get("annotations", [])
        try:
            annotations = [
                AnnotationsItem.from_dict(annotation)
                for annotation in annotation_dicts
            ]
        except GlobalLicensingParseError as error:
            # An AnnotationsItem has no source of its own, so record it on
            # the error here. Re-raise with a bare raise: chaining the error
            # to itself would make it its own cause.
            error.source = source
            raise

        new_dict["annotations"] = annotations

        return cls(**new_dict)  # type: ignore

    @classmethod
    def from_toml(cls, toml: str, source: str) -> "ReuseTOML":
        """Create a :class:`ReuseTOML` from TOML text.

        Raises:
            GlobalLicensingParseError: *toml* is not valid TOML, or its
                contents could not be parsed.
        """
        try:
            tomldict = tomlkit.loads(toml)
        except tomlkit.exceptions.TOMLKitError as error:
            raise GlobalLicensingParseError(
                str(error), source=source
            ) from error
        return cls.from_dict(tomldict, source)

    @classmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseTOML":
        """Read the file at *path* as UTF-8 and parse it as REUSE.toml.

        Raises:
            GlobalLicensingParseError: the file could not be decoded or
                parsed.
        """
        try:
            with Path(path).open(encoding="utf-8") as fp:
                return cls.from_toml(fp.read(), str(path))
        except UnicodeDecodeError as error:
            raise GlobalLicensingParseError(
                str(error), source=str(path)
            ) from error

    def find_annotations_item(self, path: StrPath) -> Optional[AnnotationsItem]:
        """Find a :class:`AnnotationsItem` that matches *path*. The latest match
        in :attr:`annotations` is returned.
        """
        path = PurePath(path).as_posix()
        for item in reversed(self.annotations):
            if item.matches(path):
                return item
        return None

    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Return the information of the latest annotation that matches
        *path*, filed under the precedence of that annotation, or an empty
        dictionary if no annotation matches.
        """
        path = PurePath(path).as_posix()
        item = self.find_annotations_item(path)
        if item is None:
            return {}
        return {
            item.precedence: [
                ReuseInfo(
                    spdx_expressions=item.spdx_expressions,
                    copyright_lines=item.copyright_lines,
                    path=path,
                    source_path="REUSE.toml",
                    source_type=SourceType.REUSE_TOML,
                )
            ]
        }

    @property
    def directory(self) -> PurePath:
        """The directory in which the REUSE.toml file is located."""
        return PurePath(self.source).parent
@attrs.define
class NestedReuseTOML(GlobalLicensing):
    """A class that represents a hierarchy of :class:`ReuseTOML` objects."""

    reuse_tomls: list[ReuseTOML] = attrs.field()

    @classmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing":
        """Parse every REUSE.toml file found in *path*.

        The keyword arguments ``include_submodules``,
        ``include_meson_subprojects`` and ``vcs_strategy`` are passed on to
        :meth:`find_reuse_tomls`.

        TODO: *path* is a directory instead of a file.
        """
        toml_paths = cls.find_reuse_tomls(
            path,
            include_submodules=kwargs.get("include_submodules", False),
            include_meson_subprojects=kwargs.get(
                "include_meson_subprojects", False
            ),
            vcs_strategy=kwargs.get("vcs_strategy"),
        )
        parsed = [ReuseTOML.from_file(toml_path) for toml_path in toml_paths]
        return cls(reuse_tomls=parsed, source=str(path))

    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Collect the information for *path* from the relevant REUSE.toml
        files, from the topmost to the deepest directory, stopping after the
        first annotation that overrides.
        """
        path = PurePath(path)
        rooted_path = PurePath(self.source) / path

        result: defaultdict[PrecedenceType, list[ReuseInfo]] = defaultdict(
            list
        )
        for toml, item in self._find_relevant_tomls_and_items(path):
            relpath = rooted_path.relative_to(toml.directory)
            # This lookup is not expected to raise a KeyError.
            info = toml.reuse_info_of(relpath)[item.precedence][0]
            # The paths in info are relative to the directory of the
            # respective REUSE.toml. Make them relative to self.source.
            source_path = (
                PurePath(toml.source).relative_to(self.source).as_posix()
            )
            result[item.precedence].append(
                info.copy(path=path.as_posix(), source_path=source_path)
            )
            if item.precedence == PrecedenceType.OVERRIDE:
                # No more!
                break

        # Clean up CLOSEST. Not every collected item is the closest one.
        # Copyright and licensing are considered separately. Mind that this
        # lookup creates the key in the defaultdict if it did not exist.
        closest = result[PrecedenceType.CLOSEST]
        copyright_found = False
        licence_found = False
        kept: list[ReuseInfo] = []
        for info in reversed(closest):
            replacements: dict[str, Any] = {
                "copyright_lines": set(),
                "spdx_expressions": set(),
            }
            if not copyright_found and info.copyright_lines:
                replacements["copyright_lines"] = info.copyright_lines
                copyright_found = True
            if not licence_found and info.spdx_expressions:
                replacements["spdx_expressions"] = info.spdx_expressions
                licence_found = True
            stripped_info = info.copy(**replacements)
            if stripped_info.contains_copyright_or_licensing():
                kept.append(stripped_info)
        kept.reverse()

        if kept:
            result[PrecedenceType.CLOSEST] = kept
        else:
            # Do not leave an empty CLOSEST list behind.
            del result[PrecedenceType.CLOSEST]

        return dict(result)

    @classmethod
    def find_reuse_tomls(
        cls,
        path: StrPath,
        include_submodules: bool = False,
        include_meson_subprojects: bool = False,
        vcs_strategy: Optional[VCSStrategy] = None,
    ) -> Generator[Path, None, None]:
        """Find all REUSE.toml files in *path*."""
        candidates = iter_files(
            path,
            include_submodules=include_submodules,
            include_meson_subprojects=include_meson_subprojects,
            include_reuse_tomls=True,
            vcs_strategy=vcs_strategy,
        )
        return (
            candidate
            for candidate in candidates
            if candidate.name == "REUSE.toml"
        )

    def _find_relevant_tomls(self, path: StrPath) -> list[ReuseTOML]:
        """Return the REUSE.toml files whose directory is *path* or one of
        its parents, sorted from the topmost to the deepest directory.
        """
        target = PurePath(path)
        return sorted(
            (
                toml
                for toml in self.reuse_tomls
                if target.is_relative_to(toml.directory)
            ),
            key=lambda toml: toml.directory.parts,
        )

    def _find_relevant_tomls_and_items(
        self, path: StrPath
    ) -> list[tuple[ReuseTOML, AnnotationsItem]]:
        """Pair every relevant REUSE.toml file with the annotation in it that
        matches *path*, leaving out the files without such an annotation.
        """
        # *path* is relative to the Project root, which is the *source* of
        # NestedReuseTOML, which itself is a relative (to CWD) or absolute
        # path.
        adjusted_path = PurePath(self.source) / PurePath(path)

        pairs: list[tuple[ReuseTOML, AnnotationsItem]] = []
        for toml in self._find_relevant_tomls(adjusted_path):
            item = toml.find_annotations_item(
                adjusted_path.relative_to(toml.directory)
            )
            if item is None:
                continue
            pairs.append((toml, item))
        return pairs