Source code for reuse.project

# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
# SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
# SPDX-FileCopyrightText: 2023 Carmen Bianca BAKKER <carmenbianca@fsfe.org>
# SPDX-FileCopyrightText: 2023 Matthias Riße
# SPDX-FileCopyrightText: 2023 DB Systel GmbH
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Module that contains the central Project class."""

import contextlib
import errno
import glob
import logging
import os
import warnings
from collections import defaultdict
from gettext import gettext as _
from pathlib import Path
from typing import DefaultDict, Dict, Iterator, List, NamedTuple, Optional, Type

from binaryornot.check import is_binary

from . import (
    _IGNORE_DIR_PATTERNS,
    _IGNORE_FILE_PATTERNS,
    _IGNORE_MESON_PARENT_DIR_PATTERNS,
    IdentifierNotFound,
    ReuseInfo,
)
from ._licenses import EXCEPTION_MAP, LICENSE_MAP
from ._util import (
    _LICENSEREF_PATTERN,
    StrPath,
    _determine_license_path,
    relative_from_root,
    reuse_info_of_file,
)
from .global_licensing import (
    GlobalLicensing,
    NestedReuseTOML,
    PrecedenceType,
    ReuseDep5,
)
from .vcs import VCSStrategy, VCSStrategyNone, all_vcs_strategies

# Module-level logger, named after this module via ``__name__``.
_LOGGER = logging.getLogger(__name__)


class GlobalLicensingConflict(Exception):
    """Raised when a project contains two global licensing files that are not
    compatible with one another.
    """
class GlobalLicensingFound(NamedTuple):
    """Pairing of a filesystem path with the :class:`GlobalLicensing` subclass
    that goes with it.
    """

    # Filesystem location associated with the global licensing information.
    path: Path
    # The GlobalLicensing subclass (the class itself, not an instance).
    cls: Type[GlobalLicensing]
class Project:
    """Simple object that holds the project's root, which is necessary for many
    interactions.

    Attributes set by :meth:`__init__`:

    - ``root``: the project root as a :class:`~pathlib.Path`.
    - ``vcs_strategy``: the result of calling the given strategy class with
      this project (:class:`VCSStrategyNone` when none was given).
    - ``license_map``: a copy of the given map (``LICENSE_MAP`` by default),
      updated with ``EXCEPTION_MAP``.
    - ``licenses``: mapping of SPDX identifier to license file path.
    - ``licenses_without_extension``: licenses whose file name is itself a
      known identifier; filled in by :meth:`_find_licenses`.
    - ``global_licensing``: the global licensing object, or ``None``.
    - ``include_submodules`` / ``include_meson_subprojects``: flags consulted
      while walking the tree.
    """

    # pylint: disable=too-many-arguments
    def __init__(
        self,
        root: StrPath,
        vcs_strategy: Optional[Type[VCSStrategy]] = None,
        license_map: Optional[Dict[str, Dict]] = None,
        licenses: Optional[Dict[str, Path]] = None,
        global_licensing: Optional[GlobalLicensing] = None,
        include_submodules: bool = False,
        include_meson_subprojects: bool = False,
    ):
        """Create a project rooted at *root*.

        Args:
            root: The root of the project.
            vcs_strategy: VCS strategy class to instantiate for this project.
                Defaults to :class:`VCSStrategyNone`.
            license_map: Mapping of license identifiers to their metadata.
                Defaults to ``LICENSE_MAP``. The mapping is copied, so the
                caller's dictionary is not modified.
            licenses: Mapping of identifier to license path. Defaults to an
                empty dictionary.
            global_licensing: The project's global licensing object, if any.
            include_submodules: Whether VCS submodules are included when
                walking the project.
            include_meson_subprojects: Whether Meson subprojects are included
                when walking the project.
        """
        self.root = Path(root)

        if vcs_strategy is None:
            vcs_strategy = VCSStrategyNone
        # The strategy is handed the (still partially initialised) project.
        self.vcs_strategy = vcs_strategy(self)

        if license_map is None:
            license_map = LICENSE_MAP
        # Work on a copy so that neither the caller's map nor the module-level
        # LICENSE_MAP is mutated by the updates below and in _find_licenses().
        self.license_map = license_map.copy()
        self.license_map.update(EXCEPTION_MAP)
        self.licenses_without_extension: Dict[str, Path] = {}

        if licenses is None:
            licenses = {}
        self.licenses = licenses

        self.global_licensing = global_licensing

        self.include_submodules = include_submodules
        self.include_meson_subprojects = include_meson_subprojects

    @classmethod
    def from_directory(
        cls,
        root: StrPath,
        include_submodules: bool = False,
        include_meson_subprojects: bool = False,
    ) -> "Project":
        """A factory method that reads various files in the *root* directory to
        correctly build the :class:`Project` object.

        Args:
            root: The root of the project.
            include_submodules: Whether to also lint VCS submodules.
            include_meson_subprojects: Whether to also lint Meson subprojects.

        Returns:
            A :class:`Project` with its VCS strategy, global licensing and
            licenses populated.

        Raises:
            FileNotFoundError: if root does not exist.
            NotADirectoryError: if root is not a directory.
            UnicodeDecodeError: if the global licensing config file could not
                be decoded.
            GlobalLicensingParseError: if the global licensing config file could
                not be parsed.
            GlobalLicensingConflict: if more than one global licensing config
                file is present.
        """
        root = Path(root)
        if not root.exists():
            raise FileNotFoundError(
                errno.ENOENT,
                os.strerror(errno.ENOENT),
                str(root),
            )
        if not root.is_dir():
            raise NotADirectoryError(
                errno.ENOTDIR,
                os.strerror(errno.ENOTDIR),
                str(root),
            )

        vcs_strategy = cls._detect_vcs_strategy(root)

        global_licensing: Optional[GlobalLicensing] = None
        found = cls.find_global_licensing(root)
        if found:
            global_licensing = found.cls.from_file(found.path)

        project = cls(
            root,
            vcs_strategy=vcs_strategy,
            global_licensing=global_licensing,
            include_submodules=include_submodules,
            include_meson_subprojects=include_meson_subprojects,
        )

        # TODO: Because the `_find_licenses()` method is so broad and depends on
        # some object attributes, we set the attribute after creating the
        # object. Ideally we do this before creating the object, but that would
        # require refactoring the method.
        project.licenses = project._find_licenses()

        return project

    def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]:
        """Yield all files in *directory* and its subdirectories.

        The files that are not yielded are:

        - Files ignored by VCS (e.g., see .gitignore)

        - Files/directories matching IGNORE_*_PATTERNS.

        Symlinks, zero-sized files and (unless ``include_submodules`` is set)
        submodule directories are skipped as well.

        Args:
            directory: Directory to walk. Defaults to the project root.
        """
        if directory is None:
            directory = self.root
        directory = Path(directory)

        for root_str, dirs, files in os.walk(directory):
            root = Path(root_str)
            _LOGGER.debug("currently walking in '%s'", root)

            # Don't walk ignored directories. Iterate over a copy because
            # os.walk() only prunes when *dirs* itself is modified in place.
            for dir_ in list(dirs):
                the_dir = root / dir_
                if self._is_path_ignored(the_dir):
                    _LOGGER.debug("ignoring '%s'", the_dir)
                    dirs.remove(dir_)
                elif the_dir.is_symlink():
                    _LOGGER.debug("skipping symlink '%s'", the_dir)
                    dirs.remove(dir_)
                elif (
                    not self.include_submodules
                    and self.vcs_strategy.is_submodule(the_dir)
                ):
                    _LOGGER.info(
                        "ignoring '%s' because it is a submodule", the_dir
                    )
                    dirs.remove(dir_)

            # Filter files.
            for file_ in files:
                the_file = root / file_
                if self._is_path_ignored(the_file):
                    _LOGGER.debug("ignoring '%s'", the_file)
                    continue
                if the_file.is_symlink():
                    _LOGGER.debug("skipping symlink '%s'", the_file)
                    continue
                # Suppressing this error because I simply don't want to deal
                # with that here.
                with contextlib.suppress(OSError):
                    if the_file.stat().st_size == 0:
                        _LOGGER.debug("skipping 0-sized file '%s'", the_file)
                        continue

                _LOGGER.debug("yielding '%s'", the_file)
                yield the_file

    def reuse_info_of(self, path: StrPath) -> List[ReuseInfo]:
        """Return REUSE info of *path*.

        This function will return any REUSE information that it can find: from
        within the file, the .license file, from REUSE.toml, and/or from the
        .reuse/dep5 file.

        The presence of a .license file always means that the file itself will
        not be parsed for REUSE information.

        When information is found from multiple sources, and if the precedence
        for that file in REUSE.toml is 'aggregate' (or if .reuse/dep5 is used),
        then two (or more) :class:`ReuseInfo` objects are returned in a list,
        each with respective discovered REUSE information and information about
        the source.

        Alternatively, if the precedence is set to 'closest' or 'toml', or if
        information was found in only one source, then a list of one item is
        returned.

        The exact precedence handling is detailed in the specification.

        An empty list is returned if no information was found whatsoever.

        Args:
            path: The file to look up.

        Returns:
            A list of :class:`ReuseInfo` objects, possibly empty.
        """
        # pylint: disable=too-many-branches
        original_path = path
        path = _determine_license_path(path)

        _LOGGER.debug("searching '%s' for REUSE information", path)

        # This means that only one 'source' of licensing/copyright information
        # is captured in ReuseInfo
        global_results: "DefaultDict[PrecedenceType, List[ReuseInfo]]" = (
            defaultdict(list)
        )
        file_result = ReuseInfo()
        result: List[ReuseInfo] = []

        # Search the global licensing file for REUSE information.
        if self.global_licensing:
            relpath = self.relative_from_root(path)
            global_results = defaultdict(
                list, self.global_licensing.reuse_info_of(relpath)
            )
            for info_list in global_results.values():
                for global_result in info_list:
                    if global_result.contains_copyright_or_licensing():
                        _LOGGER.info(
                            _("'{path}' covered by {global_path}").format(
                                path=path, global_path=global_result.source_path
                            )
                        )

        # Membership is tested *before* any indexing below, because indexing a
        # defaultdict would create the key.
        if PrecedenceType.OVERRIDE in global_results:
            _LOGGER.info(
                _(
                    "'{path}' is covered exclusively by REUSE.toml. Not reading"
                    " the file contents."
                ).format(path=path)
            )
        elif is_binary(str(path)):
            _LOGGER.info(
                _(
                    "'{path}' was detected as a binary file; not searching its"
                    " contents for REUSE information."
                ).format(path=path)
            )
        else:
            file_result = reuse_info_of_file(path, original_path, self.root)

        result.extend(global_results[PrecedenceType.OVERRIDE])
        result.extend(global_results[PrecedenceType.AGGREGATE])
        if file_result.contains_info():
            result.append(file_result)
        if not file_result.contains_copyright_or_licensing():
            result.extend(global_results[PrecedenceType.CLOSEST])
        # Special case: If a file contains only copyright, apply the
        # REUSE.toml's licensing if it exists, and vice versa.
        elif file_result.contains_copyright_xor_licensing():
            if global_results[PrecedenceType.CLOSEST]:
                # There should only be a single CLOSEST result in the list.
                closest = global_results[PrecedenceType.CLOSEST][0]
                if file_result.copyright_lines:
                    # The file already supplies copyright; take the closest
                    # result with its copyright lines emptied.
                    result.append(
                        closest.copy(
                            copyright_lines=set(),
                        )
                    )
                elif file_result.spdx_expressions:
                    # The file already supplies licensing; take the closest
                    # result with its SPDX expressions emptied.
                    result.append(
                        closest.copy(
                            spdx_expressions=set(),
                        )
                    )

        return result

    def relative_from_root(self, path: StrPath) -> Path:
        """If the project root is /tmp/project, and *path* is
        /tmp/project/src/file, then return src/file.
        """
        return relative_from_root(path, self.root)

    @classmethod
    def find_global_licensing(
        cls, root: Path
    ) -> Optional[GlobalLicensingFound]:
        """Find the path and corresponding class of a project directory's
        :class:`GlobalLicensing`.

        Args:
            root: The project root to search.

        Returns:
            A :class:`GlobalLicensingFound` for ``.reuse/dep5`` (path of the
            file, :class:`ReuseDep5`) or for REUSE.toml (*root* itself,
            :class:`NestedReuseTOML`), or ``None`` if neither exists.

        Raises:
            GlobalLicensingConflict: if more than one global licensing config
                file is present.
        """
        candidate: Optional[GlobalLicensingFound] = None
        dep5_path = root / ".reuse/dep5"
        if dep5_path.exists():
            # Sneaky workaround to not print this warning.
            if not os.environ.get("_SUPPRESS_DEP5_WARNING"):
                warnings.warn(
                    _(
                        "'.reuse/dep5' is deprecated. You are recommended to"
                        " instead use REUSE.toml. Use `reuse convert-dep5` to"
                        " convert."
                    ),
                    PendingDeprecationWarning,
                )
            candidate = GlobalLicensingFound(dep5_path, ReuseDep5)

        # rglob() is already recursive, so no leading "**/" is needed. Only
        # the first hit matters: any REUSE.toml below *root* is enough.
        toml_path = next(root.rglob("REUSE.toml"), None)
        if toml_path is not None:
            if candidate is not None:
                raise GlobalLicensingConflict(
                    _(
                        "Found both '{new_path}' and '{old_path}'. You"
                        " cannot keep both files simultaneously; they are"
                        " not intercompatible."
                    ).format(new_path=toml_path, old_path=dep5_path)
                )
            candidate = GlobalLicensingFound(root, NestedReuseTOML)

        return candidate

    def _is_path_ignored(self, path: Path) -> bool:
        """Is *path* ignored by some mechanism?

        A path is ignored when its name matches the file or directory ignore
        patterns, when it is a directory whose parent directory name matches
        the Meson patterns (unless ``include_meson_subprojects`` is set), or
        when the VCS strategy reports it as ignored.
        """
        name = path.name
        parent_parts = path.parent.parts
        parent_dir = parent_parts[-1] if parent_parts else ""

        if path.is_file():
            if any(pattern.match(name) for pattern in _IGNORE_FILE_PATTERNS):
                return True
        elif path.is_dir():
            if any(pattern.match(name) for pattern in _IGNORE_DIR_PATTERNS):
                return True
            if not self.include_meson_subprojects and any(
                pattern.match(parent_dir)
                for pattern in _IGNORE_MESON_PARENT_DIR_PATTERNS
            ):
                return True

        return bool(self.vcs_strategy.is_ignored(path))

    def _identifier_of_license(self, path: Path) -> str:
        """Figure out the SPDX License identifier of a license given its path.
        The name of the path (minus its extension) should be a valid SPDX
        License Identifier.

        Raises:
            IdentifierNotFound: if *path* has no file extension, or if its stem
                is neither in the license map nor matches the LicenseRef
                pattern.
        """
        if not path.suffix:
            raise IdentifierNotFound(f"{path} has no file extension")
        if path.stem in self.license_map:
            return path.stem
        if _LICENSEREF_PATTERN.match(path.stem):
            return path.stem

        raise IdentifierNotFound(
            f"Could not find SPDX License Identifier for {path}"
        )

    def _find_licenses(self) -> Dict[str, Path]:
        """Return a dictionary of all licenses in the project, with their SPDX
        identifiers as names and paths as values.

        As side effects, ``licenses_without_extension`` is filled in, and
        LicenseRef identifiers (other than those containing "Unknown") are
        registered in ``license_map``.

        Raises:
            RuntimeError: if two files resolve to the same identifier.
        """
        # TODO: This method does more than one thing. We ought to simplify it.
        license_files: Dict[str, Path] = {}

        directory = str(self.root / "LICENSES/**")
        for path_str in glob.iglob(directory, recursive=True):
            path = Path(path_str)
            # For some reason, LICENSES/** is resolved even though it
            # doesn't exist. I have no idea why. Deal with that here.
            if not path.exists() or path.is_dir():
                continue
            if path.suffix == ".license":
                continue

            path = self.relative_from_root(path)
            _LOGGER.debug(
                _("determining identifier of '{path}'").format(path=path)
            )

            try:
                identifier = self._identifier_of_license(path)
            except IdentifierNotFound:
                if path.name in self.license_map:
                    _LOGGER.info(
                        _("{path} does not have a file extension").format(
                            path=path
                        )
                    )
                    identifier = path.name
                    self.licenses_without_extension[identifier] = path
                else:
                    identifier = path.stem
                    _LOGGER.warning(
                        _(
                            "Could not resolve SPDX License Identifier of"
                            " {path}, resolving to {identifier}. Make sure the"
                            " license is in the license list found at"
                            " <https://spdx.org/licenses/> or that it starts"
                            " with 'LicenseRef-', and that it has a file"
                            " extension."
                        ).format(path=path, identifier=identifier)
                    )

            if identifier in license_files:
                _LOGGER.critical(
                    _(
                        "{identifier} is the SPDX License Identifier of both"
                        " {path} and {other_path}"
                    ).format(
                        identifier=identifier,
                        path=path,
                        other_path=license_files[identifier],
                    )
                )
                # The f-prefix matters: without it the message would contain
                # the literal text "{identifier}".
                raise RuntimeError(
                    f"Multiple licenses resolve to {identifier}"
                )
            # Add the identifiers
            license_files[identifier] = path
            if (
                _LICENSEREF_PATTERN.match(identifier)
                and "Unknown" not in identifier
            ):
                self.license_map[identifier] = {
                    "reference": str(path),
                    "isDeprecatedLicenseId": False,
                    "detailsUrl": None,
                    "referenceNumber": None,
                    "name": identifier,
                    "licenseId": identifier,
                    "seeAlso": [],
                    "isOsiApproved": None,
                }

        return license_files

    @classmethod
    def _detect_vcs_strategy(cls, root: StrPath) -> Type[VCSStrategy]:
        """For each supported VCS, check if the software is available and if the
        directory is a repository. If not, return :class:`VCSStrategyNone`.

        Args:
            root: The directory to probe.

        Returns:
            The first strategy class whose ``EXE`` is truthy and whose
            ``in_repo(root)`` is true; otherwise :class:`VCSStrategyNone`.
        """
        for strategy in all_vcs_strategies():
            if strategy.EXE and strategy.in_repo(root):
                return strategy

        _LOGGER.info(
            _(
                "project '{}' is not a VCS repository or required VCS"
                " software is not installed"
            ).format(root)
        )
        return VCSStrategyNone