Source code for reuse.global_licensing

# SPDX-FileCopyrightText: 2023 Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Code for parsing and validating REUSE.toml."""

# mypy: disable-error-code=attr-defined

import functools
import logging
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from collections.abc import Callable, Collection, Generator, Iterable
from enum import Enum
from pathlib import Path, PurePath
from typing import Any, TypeVar, cast

import attrs
import tomlkit
from attr.validators import _InstanceOfValidator as _AttrInstanceOfValidator
from debian.copyright import Copyright
from debian.copyright import Error as DebianError

from .copyright import CopyrightNotice, ReuseInfo, SourceType, SpdxExpression
from .covered_files import is_path_ignored
from .exceptions import (
    CopyrightNoticeParseError,
    GlobalLicensingParseError,
    GlobalLicensingParseTypeError,
    GlobalLicensingParseValueError,
)
from .i18n import _
from .types import StrPath
from .vcs import VCSStrategy

_LOGGER = logging.getLogger(__name__)

_T = TypeVar("_T")

#: Current version of REUSE.toml.
REUSE_TOML_VERSION = 1

#: Relation between Python attribute names and TOML keys.
_TOML_KEYS = {
    "paths": "path",
    "precedence": "precedence",
    "_copyright_notices": "SPDX-FileCopyrightText",
    "_spdx_expressions": "SPDX-License-Identifier",
}



[docs]
class PrecedenceType(Enum):
    """An enum of behaviours surrounding order of precedence for entries in a
    :class:`GlobalLicensing`.

    The members are used as the keys of the dictionaries returned by
    :meth:`GlobalLicensing.reuse_info_of`. The string values are the values
    accepted for the ``precedence`` key of an annotation.
    """

    #: Aggregate the results from the file with the results from the global
    #: licensing file.
    AGGREGATE = "aggregate"
    #: Use the results that are closest to the covered file. This is typically
    #: the file itself, or the global licensing file if no REUSE information
    #: exists inside of the file.
    CLOSEST = "closest"
    #: Only use the results from the global licensing file.
    OVERRIDE = "override"



@attrs.define
class _CollectionOfValidator:
    """An attrs validator for collections.

    It verifies, in this order, that the value is an instance of
    *collection_type*, that every item in it is an instance of *value_type*,
    and---unless *optional* is true---that the collection is not empty.

    Raises:
        GlobalLicensingParseTypeError: the collection or one of its items has
            the wrong type.
        GlobalLicensingParseValueError: the collection is empty while
            *optional* is false.
    """

    collection_type: type[Collection] = attrs.field()
    value_type: type = attrs.field()
    optional: bool = attrs.field(default=True)

    def __call__(
        self,
        instance: object,
        attribute: attrs.Attribute,
        value: Collection[_T],
    ) -> None:
        # This is a hack to display the TOML's key names instead of the Python
        # attributes.
        shown_name = (
            _TOML_KEYS[attribute.name]
            if isinstance(instance, AnnotationsItem)
            else attribute.name
        )
        # Not every validated object has a source; fall back to None.
        origin = getattr(instance, "source", None)

        if not isinstance(value, self.collection_type):
            message = _(
                "{attr_name} must be a {type_name} (got {value} that is a"
                " {value_class})."
            ).format(
                attr_name=repr(shown_name),
                type_name=self.collection_type.__name__,
                value=repr(value),
                value_class=repr(value.__class__),
            )
            raise GlobalLicensingParseTypeError(message, source=origin)

        for element in value:
            if isinstance(element, self.value_type):
                continue
            message = _(
                "Item in {attr_name} collection must be a {type_name}"
                " (got {item_value} that is a {item_class})."
            ).format(
                attr_name=repr(shown_name),
                type_name=self.value_type.__name__,
                item_value=repr(element),
                item_class=repr(element.__class__),
            )
            raise GlobalLicensingParseTypeError(message, source=origin)

        if not (self.optional or value):
            message = _("{attr_name} must not be empty.").format(
                attr_name=repr(shown_name),
            )
            raise GlobalLicensingParseValueError(message, source=origin)


def _validate_collection_of(
    collection_type: type[Collection],
    value_type: type[_T],
    optional: bool = False,
) -> Callable[[Any, attrs.Attribute, Collection[_T]], Any]:
    """Create a validator that checks that a value is a *collection_type*
    containing only *value_type* items.

    Note that *optional* defaults to False here, meaning that an empty
    collection is rejected unless explicitly allowed.
    """
    validator = _CollectionOfValidator(
        collection_type, value_type, optional=optional
    )
    return validator


class _InstanceOfValidator(_AttrInstanceOfValidator):
    """The instance-of validator of attrs, except that a failed check is
    reported as a :class:`GlobalLicensingParseTypeError` instead of a
    :class:`TypeError`.
    """

    def __call__(self, inst: Any, attr: attrs.Attribute, value: _T) -> None:
        try:
            super().__call__(inst, attr, value)
        except TypeError as error:
            # The parent raises TypeError(message, attribute, type, value);
            # everything but the message is reused for our own message.
            failed_attr, expected, received = error.args[1:4]
            message = _(
                "{name} must be a {type} (got {value} that is a"
                " {value_type})."
            ).format(
                name=repr(failed_attr.name),
                type=repr(expected.__name__),
                value=repr(received),
                value_type=repr(received.__class__),
            )
            raise GlobalLicensingParseTypeError(
                message, source=getattr(inst, "source", None)
            ) from error


def _instance_of(
    type_: type[_T],
) -> Callable[[Any, attrs.Attribute, _T], Any]:
    """Create a validator that checks that a value is an instance of
    *type_*, raising :class:`GlobalLicensingParseTypeError` if it is not.
    """
    validator = _InstanceOfValidator(type_)
    return validator


def _str_to_global_precedence(value: Any) -> PrecedenceType:
    """Convert *value* into a :class:`PrecedenceType`.

    Raises:
        GlobalLicensingParseValueError: *value* is not the value of any
            :class:`PrecedenceType` member.
    """
    try:
        precedence = PrecedenceType(value)
    except ValueError as error:
        allowed = tuple(member.value for member in PrecedenceType)
        message = _(
            "The value of 'precedence' must be one of {precedence_vals}"
            " (got {received})"
        ).format(
            precedence_vals=allowed,
            received=repr(value),
        )
        raise GlobalLicensingParseValueError(message) from error
    return precedence


def _to_set(value: _T | Iterable[_T] | None) -> set[_T]:
    """Convert *value* into a set.

    None becomes an empty set, an iterable becomes a set of its items, and
    anything else becomes a set that contains only *value*.
    """
    if value is None:
        return set()
    if not isinstance(value, str) and hasattr(value, "__iter__"):
        return set(value)
    # Either not iterable at all, or a string. Strings are iterable, but a
    # string is a single item here, not a collection of characters.
    return {cast(_T, value)}


# attrs derives the types of the __init__ parameters from the signature of the
# converter. mypy gets confused by the signature of _to_set, so this wrapper
# exists solely to offer attrs a simpler signature.
def _to_set_any(value: Any | None) -> set[Any]:
    """Like :func:`_to_set`, but typed with :class:`Any`."""
    converted = _to_set(value)
    return cast(set[Any], converted)


def _to_set_of_expr(
    value: str | Iterable[str] | None,
) -> set[SpdxExpression]:
    """Convert one or more expression strings (or None) into a set of
    :class:`SpdxExpression` objects.
    """
    return set(map(SpdxExpression, _to_set(value)))


def _to_set_of_notice(
    value: str | Iterable[str] | None,
) -> set[CopyrightNotice]:
    """Convert one or more copyright strings (or None) into a set of
    :class:`CopyrightNotice` objects.

    A string that cannot be parsed as-is is parsed a second time with
    ``SPDX-FileCopyrightText: `` prepended to it.

    Raises:
        GlobalLicensingParseValueError: a string could not be parsed either
            way.
    """
    notices: set[CopyrightNotice] = set()
    for raw_notice in _to_set(value):
        try:
            parsed = CopyrightNotice.from_string(raw_notice)
        except CopyrightNoticeParseError as error:
            try:
                parsed = CopyrightNotice.from_string(
                    f"SPDX-FileCopyrightText: {raw_notice}"
                )
            except CopyrightNoticeParseError:
                # Report the failure of the unmodified string as the cause.
                raise GlobalLicensingParseValueError(
                    _("Could not parse '{notice}'").format(notice=raw_notice)
                ) from error
        notices.add(parsed)
    return notices



[docs]
@attrs.define(frozen=True)
class GlobalLicensing(ABC):
    """An abstract class that represents a configuration file that contains
    licensing information that is pertinent to other files in the project.
    """

    source: str = attrs.field(validator=_instance_of(str))


[docs]
    @classmethod
    @abstractmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing":
        """Parse the file and create a :class:`GlobalLicensing` object from its
        contents.

        Args:
            path: location of the file to parse.
            **kwargs: additional options. Which options are honoured is up to
                the implementing subclass.

        Raises:
            FileNotFoundError: file doesn't exist.
            OSError: some other error surrounding I/O.
            GlobalLicensingParseError: file could not be parsed.
        """



[docs]
    @abstractmethod
    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Find the REUSE information of *path* defined in the configuration.
        The path must be relative to the root of a
        :class:`reuse.project.Project`.

        The key indicates the precedence type for the subsequent information.

        Returns:
            A dictionary that maps each precedence type to the list of
            :class:`ReuseInfo` objects found for *path* with that precedence.
        """





[docs]
@attrs.define(frozen=True)
class ReuseDep5(GlobalLicensing):
    """A soft wrapper around :class:`Copyright`."""

    dep5_copyright: Copyright


[docs]
    @classmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseDep5":
        """Parse the DEP5 file at *path* and wrap the result in a
        :class:`ReuseDep5`.

        Raises:
            GlobalLicensingParseError: the file is not valid UTF-8, or could
                not be parsed as a DEP5 file.
        """
        dep5_path = Path(path)
        source = str(dep5_path)
        try:
            with dep5_path.open(encoding="utf-8") as fp:
                return cls(source, Copyright(fp))
        # UnicodeDecodeError is listed explicitly, even though it is a
        # subclass of ValueError, so that it survives the TODO below.
        # TODO: Remove ValueError once
        # <https://salsa.debian.org/python-debian-team/python-debian/-/merge_requests/123>
        # is closed
        except (UnicodeDecodeError, DebianError, ValueError) as error:
            raise GlobalLicensingParseError(
                str(error), source=source
            ) from error



[docs]
    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Look up the files paragraph that covers *path* and return its
        licensing and copyright information with aggregate precedence. If no
        paragraph covers *path*, an empty dictionary is returned.
        """
        posix_path = PurePath(path).as_posix()
        paragraph = self.dep5_copyright.find_files_paragraph(posix_path)
        if paragraph is None:
            return {}

        expressions = _to_set_of_expr(paragraph.license.synopsis)
        # Every line of the copyright field is treated as a separate notice.
        notice_lines = map(str.strip, paragraph.copyright.splitlines())
        info = ReuseInfo(
            spdx_expressions=expressions,
            copyright_notices=_to_set_of_notice(notice_lines),
            path=posix_path,
            source_type=SourceType.DEP5,
            # This is hardcoded. It must be a relative path from the project
            # root. self.source is not (guaranteed) a relative path.
            source_path=".reuse/dep5",
        )
        return {PrecedenceType.AGGREGATE: [info]}





[docs]
@attrs.define(frozen=True)
class AnnotationsItem:
    """A class that maps to a single [[annotations]] table element in
    REUSE.toml.
    """

    paths: set[str] = attrs.field(
        converter=_to_set_any,
        validator=_validate_collection_of(set, str, optional=False),
    )
    precedence: PrecedenceType = attrs.field(
        converter=_str_to_global_precedence, default=PrecedenceType.CLOSEST
    )
    _copyright_notices: set[str] = attrs.field(
        alias="copyright_notices",
        converter=_to_set_any,
        validator=_validate_collection_of(set, str, optional=True),
        default=None,
    )
    _spdx_expressions: set[str] = attrs.field(
        alias="spdx_expressions",
        converter=_to_set_any,
        validator=_validate_collection_of(set, str, optional=True),
        default=None,
    )

    def __attrs_post_init__(self) -> None:
        """Parse the copyright notices and SPDX expressions right after
        initialisation, such that a parsing error surfaces when the object is
        created instead of when the properties are first accessed.
        """
        # Immediately trigger cached properties to get error as needed.
        _ = self.copyright_notices
        _ = self.spdx_expressions

    @functools.cached_property
    def copyright_notices(self) -> set[CopyrightNotice]:
        """The copyright strings of this item, parsed into
        :class:`CopyrightNotice` objects. The result is cached.
        """
        return _to_set_of_notice(self._copyright_notices)

    @functools.cached_property
    def spdx_expressions(self) -> set[SpdxExpression]:
        """The licence strings of this item, converted into
        :class:`SpdxExpression` objects. The result is cached.
        """
        return _to_set_of_expr(self._spdx_expressions)

    @functools.cached_property
    def _paths_regex(self) -> re.Pattern:
        """A compiled regular expression that matches a path if any of the
        globs in :attr:`paths` matches it. The result is cached.

        The globs are translated as follows:

        - ``*`` matches any run of characters except ``/``.
        - ``**`` matches any run of characters, including ``/``. A ``/`` that
          directly follows ``**`` is absorbed by it.
        - ``\\*`` matches a literal asterisk, and ``\\\\`` matches a literal
          backslash. A backslash before any other character is dropped.
        - Every other character matches itself.
        """

        def translate(path: str) -> str:
            """Translate a single glob into an anchored regex string."""
            # pylint: disable=too-many-branches
            blocks = []
            # True directly after an unescaped backslash.
            escaping = False
            # True from the second asterisk of '**' until the next literal.
            globstar = False
            # A single '*' is only emitted once the next character is known,
            # because that is when it is certain that it is not a '**'.
            # prev_char is what tells whether such a '*' is still pending.
            prev_char = ""
            for char in path:
                if char == "\\":
                    if prev_char == "\\" and escaping:
                        # An escaped backslash is a literal character, so it
                        # ends a globstar just like any other literal does.
                        escaping = False
                        globstar = False
                        blocks.append("\\\\")
                    else:
                        # Emit a pending '*' now. prev_char is about to become
                        # a backslash, after which the wildcard would be
                        # forgotten.
                        if prev_char == "*" and not globstar:
                            blocks.append(r"[^/]*")
                        escaping = True
                elif char == "*":
                    if escaping:
                        blocks.append(re.escape("*"))
                        escaping = False
                        globstar = False
                        # An escaped asterisk is a literal character. Do not
                        # record it as '*' in prev_char, or the next character
                        # would mistake it for a pending wildcard.
                        prev_char = ""
                        continue
                    if prev_char == "*" and not globstar:
                        globstar = True
                        blocks.append(r".*")
                elif char == "/":
                    if not globstar:
                        if prev_char == "*":
                            blocks.append("[^/]*")
                        blocks.append("/")
                    escaping = False
                else:
                    if prev_char == "*" and not globstar:
                        blocks.append(r"[^/]*")
                    blocks.append(re.escape(char))
                    globstar = False
                    escaping = False
                prev_char = char
            # A single '*' at the very end of the glob is still pending.
            if prev_char == "*" and not globstar:
                blocks.append(r"[^/]*")
            result = "".join(blocks)
            return f"^({result})$"

        return re.compile("|".join(translate(path) for path in self.paths))


[docs]
    @classmethod
    def from_dict(cls, values: dict[str, Any]) -> "AnnotationsItem":
        """Create an :class:`AnnotationsItem` from a dictionary that uses the
        key-value pairs for an [[annotations]] table in REUSE.toml.
        """
        kwargs: dict[str, Any] = {
            "paths": values.get(_TOML_KEYS["paths"]),
            "copyright_notices": values.get(_TOML_KEYS["_copyright_notices"]),
            "spdx_expressions": values.get(_TOML_KEYS["_spdx_expressions"]),
        }
        # Leave out an absent precedence, such that the default of the
        # attribute applies.
        precedence = values.get(_TOML_KEYS["precedence"])
        if precedence is not None:
            kwargs["precedence"] = precedence
        return cls(**kwargs)  # type: ignore



[docs]
    def matches(self, path: str) -> bool:
        """Determine whether *path* matches any of the paths (or path globs) in
        :class:`AnnotationsItem`.
        """
        match = self._paths_regex.match(path)
        return match is not None





[docs]
@attrs.define(frozen=True)
class ReuseTOML(GlobalLicensing):
    """A class that contains the data parsed from a REUSE.toml file."""

    version: int = attrs.field(validator=_instance_of(int))
    annotations: list[AnnotationsItem] = attrs.field(
        validator=_validate_collection_of(list, AnnotationsItem, optional=True)
    )


[docs]
    @classmethod
    def from_dict(cls, values: dict[str, Any], source: str) -> "ReuseTOML":
        """Create a :class:`ReuseTOML` from the dict version of REUSE.toml.

        Args:
            values: the parsed contents of REUSE.toml.
            source: the location of the REUSE.toml file. It is attached to
                the created object and to any parsing error.

        Raises:
            GlobalLicensingParseError: *values* does not describe a valid
                REUSE.toml. This includes an ``annotations`` value that is not
                a list of tables.
        """
        annotation_dicts = values.get("annotations", [])
        # Strings and dicts are iterable, but iterating over them yields
        # characters and keys instead of annotation tables. Reject them along
        # with everything that cannot be iterated over at all.
        if isinstance(annotation_dicts, (str, dict)) or not isinstance(
            annotation_dicts, Iterable
        ):
            raise GlobalLicensingParseTypeError(
                _(
                    "{attr_name} must be a {type_name} (got {value} that is a"
                    " {value_class})."
                ).format(
                    attr_name=repr("annotations"),
                    type_name=list.__name__,
                    value=repr(annotation_dicts),
                    value_class=repr(annotation_dicts.__class__),
                ),
                source=source,
            )

        annotations = []
        for annotation in annotation_dicts:
            # AnnotationsItem.from_dict needs a dict. Anything else would
            # fail with an error that is not a GlobalLicensingParseError.
            if not isinstance(annotation, dict):
                raise GlobalLicensingParseTypeError(
                    _(
                        "Item in {attr_name} collection must be a {type_name}"
                        " (got {item_value} that is a {item_class})."
                    ).format(
                        attr_name=repr("annotations"),
                        type_name=dict.__name__,
                        item_value=repr(annotation),
                        item_class=repr(annotation.__class__),
                    ),
                    source=source,
                )
            try:
                annotations.append(AnnotationsItem.from_dict(annotation))
            except GlobalLicensingParseError as error:
                # The item does not know which file it came from. Add that
                # information, and re-raise the very same exception.
                error.source = source
                raise

        return cls(  # type: ignore
            version=values.get("version"),
            source=source,
            annotations=annotations,
        )



[docs]
    @classmethod
    def from_toml(cls, toml: str, source: str) -> "ReuseTOML":
        """Create a :class:`ReuseTOML` from TOML text.

        Raises:
            GlobalLicensingParseError: *toml* is not valid TOML, or does not
                describe a valid REUSE.toml.
        """
        try:
            parsed = tomlkit.loads(toml)
        except tomlkit.exceptions.TOMLKitError as toml_error:
            message = str(toml_error)
            raise GlobalLicensingParseError(
                message, source=source
            ) from toml_error
        return cls.from_dict(parsed, source)



[docs]
    @classmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseTOML":
        """Read the REUSE.toml file at *path* as UTF-8 and parse it.

        Raises:
            GlobalLicensingParseError: the file is not valid UTF-8, or could
                not be parsed.
        """
        source = str(path)
        try:
            return cls.from_toml(
                Path(path).read_text(encoding="utf-8"), source
            )
        except UnicodeDecodeError as error:
            raise GlobalLicensingParseError(
                str(error), source=source
            ) from error



[docs]
    def find_annotations_item(self, path: StrPath) -> AnnotationsItem | None:
        """Find a :class:`AnnotationsItem` that matches *path*. The latest match
        in :attr:`annotations` is returned.
        """
        posix_path = PurePath(path).as_posix()
        # Searching back to front makes the first hit the latest match.
        return next(
            (
                item
                for item in reversed(self.annotations)
                if item.matches(posix_path)
            ),
            None,
        )



[docs]
    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Return the information of the latest annotation that matches
        *path*, keyed by the precedence of that annotation. If no annotation
        matches, an empty dictionary is returned.
        """
        posix_path = PurePath(path).as_posix()
        item = self.find_annotations_item(posix_path)
        if not item:
            return {}

        info = ReuseInfo(
            spdx_expressions=item.spdx_expressions,
            copyright_notices=item.copyright_notices,
            path=posix_path,
            source_path="REUSE.toml",
            source_type=SourceType.REUSE_TOML,
        )
        return {item.precedence: [info]}


    @property
    def directory(self) -> PurePath:
        """The directory in which the REUSE.toml file is located."""
        toml_path = PurePath(self.source)
        return toml_path.parent




[docs]
@attrs.define(frozen=True)
class NestedReuseTOML(GlobalLicensing):
    """A class that represents a hierarchy of :class:`ReuseTOML` objects."""

    reuse_tomls: list[ReuseTOML] = attrs.field()


[docs]
    @classmethod
    def from_file(cls, path: StrPath, **kwargs: Any) -> "NestedReuseTOML":
        """TODO: *path* is a directory instead of a file.

        Find and parse every REUSE.toml file in the directory *path*. The
        keyword arguments ``include_submodules``,
        ``include_meson_subprojects`` and ``vcs_strategy`` are passed on to
        :meth:`find_reuse_tomls`.
        """
        search_options: dict[str, Any] = {
            "include_submodules": kwargs.get("include_submodules", False),
            "include_meson_subprojects": kwargs.get(
                "include_meson_subprojects", False
            ),
            "vcs_strategy": kwargs.get("vcs_strategy"),
        }
        toml_paths = cls.find_reuse_tomls(path, **search_options)
        return cls(
            reuse_tomls=[
                ReuseTOML.from_file(toml_path) for toml_path in toml_paths
            ],
            source=str(path),
        )



[docs]
    def reuse_info_of(
        self, path: StrPath
    ) -> dict[PrecedenceType, list[ReuseInfo]]:
        """Collect the information about *path* from every relevant
        REUSE.toml, going from the topmost directory to the deepest one.

        Collection stops after the first annotation with override precedence.
        For closest precedence, copyright and licensing are each only kept
        from the deepest REUSE.toml that provides them.
        """
        path = PurePath(path)
        posix_path = path.as_posix()
        adjusted_path = PurePath(self.source) / path

        collected = defaultdict(list)
        for toml, item in self._find_relevant_tomls_and_items(path):
            relpath = adjusted_path.relative_to(toml.directory)
            # I'm pretty sure there should be no KeyError here.
            info = toml.reuse_info_of(relpath)[item.precedence][0]
            toml_source_path = (
                PurePath(toml.source).relative_to(self.source).as_posix()
            )
            # Fix the paths to be relative to self.source. As-is, they were
            # relative to the directory of the respective REUSE.toml.
            collected[item.precedence].append(
                info.copy(path=posix_path, source_path=toml_source_path)
            )
            if item.precedence == PrecedenceType.OVERRIDE:
                # No more!
                break

        # Clean up CLOSEST. Some items were added that are not the closest.
        # Consider copyright and licensing separately. Walk from the deepest
        # REUSE.toml upwards, such that the first hit is the closest one.
        have_copyright = False
        have_licence = False
        kept: list[ReuseInfo] = []
        for info in reversed(collected[PrecedenceType.CLOSEST]):
            trimmed = info.copy(
                copyright_notices=set(), spdx_expressions=set()
            )
            if info.copyright_notices and not have_copyright:
                trimmed = trimmed.copy(
                    copyright_notices=info.copyright_notices
                )
                have_copyright = True
            if info.spdx_expressions and not have_licence:
                trimmed = trimmed.copy(spdx_expressions=info.spdx_expressions)
                have_licence = True
            if trimmed.contains_copyright_or_licensing():
                kept.append(trimmed)
        # Restore the order from topmost to deepest.
        kept.reverse()
        if kept:
            collected[PrecedenceType.CLOSEST] = kept
        else:
            # Looping over CLOSEST created it in the defaultdict. Remove it
            # because it is empty.
            del collected[PrecedenceType.CLOSEST]

        return dict(collected)



[docs]
    @classmethod
    def find_reuse_tomls(
        cls,
        path: StrPath,
        include_submodules: bool = False,
        include_meson_subprojects: bool = False,
        vcs_strategy: VCSStrategy | None = None,
    ) -> Generator[Path, None, None]:
        """Find all REUSE.toml files in *path*. *path* should be the root of the
        directory. If it is not, REUSE.toml files which are in ignored
        directories may not be correctly ignored.
        """
        root = Path(path)
        for candidate in root.rglob("REUSE.toml"):
            if is_path_ignored(
                candidate,
                include_submodules=include_submodules,
                include_meson_subprojects=include_meson_subprojects,
                include_reuse_tomls=True,
                vcs_strategy=vcs_strategy,
            ):
                continue
            # The file itself is not ignored, but any of the directories
            # between the root and the file may be.
            parts = candidate.relative_to(root).parts
            parent_directories = (
                root.joinpath(*parts[:depth]) for depth in range(1, len(parts))
            )
            in_ignored_directory = any(
                is_path_ignored(
                    directory,
                    include_submodules=include_submodules,
                    include_meson_subprojects=include_meson_subprojects,
                    vcs_strategy=vcs_strategy,
                )
                for directory in parent_directories
            )
            if not in_ignored_directory:
                yield candidate


    def _find_relevant_tomls(self, path: StrPath) -> list[ReuseTOML]:
        """Find every :class:`ReuseTOML` whose directory contains *path*,
        sorted from the topmost to the deepest directory.
        """
        target = PurePath(path)
        relevant = [
            toml
            for toml in self.reuse_tomls
            if target.is_relative_to(toml.directory)
        ]
        return sorted(relevant, key=lambda toml: toml.directory.parts)

    def _find_relevant_tomls_and_items(
        self, path: StrPath
    ) -> list[tuple[ReuseTOML, AnnotationsItem]]:
        """Pair every relevant :class:`ReuseTOML` with its annotation that
        matches *path*. A :class:`ReuseTOML` without a matching annotation is
        left out.
        """
        # *path* is relative to the Project root, which is the *source* of
        # NestedReuseTOML, which itself is a relative (to CWD) or absolute
        # path.
        adjusted_path = PurePath(self.source) / PurePath(path)
        return [
            (toml, item)
            for toml in self._find_relevant_tomls(adjusted_path)
            if (
                item := toml.find_annotations_item(
                    adjusted_path.relative_to(toml.directory)
                )
            )
            is not None
        ]