Source code for reuse.project

# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Module that contains the central Project class."""

import glob
import logging
import os
from gettext import gettext as _
from pathlib import Path
from typing import Dict, Iterator, Optional

from boolean.boolean import ParseError
from debian.copyright import Copyright
from debian.copyright import Error as DebianError
from license_expression import ExpressionError

from . import (
    _IGNORE_DIR_PATTERNS,
    _IGNORE_FILE_PATTERNS,
    IdentifierNotFound,
    SpdxInfo,
)
from ._licenses import EXCEPTION_MAP, LICENSE_MAP
from ._util import (
    _HEADER_BYTES,
    GIT_EXE,
    HG_EXE,
    PathLike,
    _copyright_from_dep5,
    _determine_license_path,
    decoded_text_from_binary,
    extract_spdx_info,
)
from .vcs import VCSStrategyGit, VCSStrategyHg, VCSStrategyNone, find_root

_LOGGER = logging.getLogger(__name__)


[docs]class Project: """Simple object that holds the project's root, which is necessary for many interactions. """ # pylint: disable=too-many-instance-attributes def __init__(self, root: PathLike, include_submodules: bool = False): self._root = Path(root) if not self._root.is_dir(): raise NotADirectoryError(f"{self._root} is no valid path") if GIT_EXE and VCSStrategyGit.in_repo(self._root): self.vcs_strategy = VCSStrategyGit(self) elif HG_EXE and VCSStrategyHg.in_repo(self._root): self.vcs_strategy = VCSStrategyHg(self) else: _LOGGER.warning(_("could not find supported VCS")) self.vcs_strategy = VCSStrategyNone(self) self.licenses_without_extension = dict() self.license_map = LICENSE_MAP.copy() # TODO: Is this correct? self.license_map.update(EXCEPTION_MAP) self.licenses = self._licenses() # Use '0' as None, because None is a valid value... self._copyright_val = 0 self.include_submodules = include_submodules
[docs] def all_files(self, directory: PathLike = None) -> Iterator[Path]: """Yield all files in *directory* and its subdirectories. The files that are not yielded are: - Files ignored by VCS (e.g., see .gitignore) - Files/directories matching IGNORE_*_PATTERNS. """ if directory is None: directory = self.root directory = Path(directory) for root, dirs, files in os.walk(directory): root = Path(root) _LOGGER.debug("currently walking in '%s'", root) # Don't walk ignored directories for dir_ in list(dirs): the_dir = root / dir_ if self._is_path_ignored(the_dir): _LOGGER.debug("ignoring '%s'", the_dir) dirs.remove(dir_) elif ( the_dir / ".git" ).is_file() and not self.include_submodules: _LOGGER.info( "ignoring '%s' because it is a submodule", the_dir ) dirs.remove(dir_) # Filter files. for file_ in files: the_file = root / file_ if self._is_path_ignored(the_file): _LOGGER.debug("ignoring '%s'", the_file) continue _LOGGER.debug("yielding '%s'", the_file) yield the_file
[docs] def spdx_info_of(self, path: PathLike) -> SpdxInfo: """Return SPDX info of *path*. This function will return any SPDX information that it can find, both from within the file and from the .reuse/dep5 file. """ path = _determine_license_path(path) _LOGGER.debug(f"searching '{path}' for SPDX information") dep5_result = SpdxInfo(set(), set()) file_result = SpdxInfo(set(), set()) # Search the .reuse/dep5 file for SPDX information. if self._copyright: dep5_result = _copyright_from_dep5( self.relative_from_root(path), self._copyright ) if any(dep5_result): _LOGGER.info( _("'{path}' covered by .reuse/dep5").format(path=path) ) # Search the file for SPDX information. with path.open("rb") as fp: try: file_result = extract_spdx_info( decoded_text_from_binary(fp, size=_HEADER_BYTES) ) except (ExpressionError, ParseError): _LOGGER.error( _( "'{path}' holds an SPDX expression that cannot be" " parsed, skipping the file" ).format(path) ) return SpdxInfo( dep5_result.spdx_expressions.union(file_result.spdx_expressions), dep5_result.copyright_lines.union(file_result.copyright_lines), )
[docs] def relative_from_root(self, path: Path) -> Path: """If the project root is /tmp/project, and *path* is /tmp/project/src/file, then return src/file. """ try: return path.relative_to(self.root) except ValueError: return Path(os.path.relpath(path, start=self.root))
def _is_path_ignored(self, path: Path) -> bool: """Is *path* ignored by some mechanism?""" name = path.name if path.is_file(): for pattern in _IGNORE_FILE_PATTERNS: if pattern.match(name): return True elif path.is_dir(): for pattern in _IGNORE_DIR_PATTERNS: if pattern.match(name): return True if self.vcs_strategy.is_ignored(path): return True return False def _identifier_of_license(self, path: Path) -> str: """Figure out the SPDX License identifier of a license given its path. The name of the path (minus its extension) should be a valid SPDX License Identifier. """ if not path.suffix: raise IdentifierNotFound(f"{path} has no file extension") if path.stem in self.license_map: return path.stem if path.stem.startswith("LicenseRef-"): return path.stem raise IdentifierNotFound( f"Could not find SPDX License Identifier for {path}" ) @property def root(self) -> Path: """Path to the root of the project.""" return self._root @property def _copyright(self) -> Optional[Copyright]: if self._copyright_val == 0: copyright_path = self.root / ".reuse/dep5" try: with copyright_path.open() as fp: self._copyright_val = Copyright(fp) except OSError: _LOGGER.debug("no .reuse/dep5 file, or could not read it") except DebianError: _LOGGER.exception(_(".reuse/dep5 has syntax errors")) # This check is a bit redundant, but otherwise I'd have to repeat # this line under each exception. if not self._copyright_val: self._copyright_val = None return self._copyright_val def _licenses(self) -> Dict[str, Path]: """Return a dictionary of all licenses in the project, with their SPDX identifiers as names and paths as values. """ license_files = dict() directory = str(self.root / "LICENSES/**") for path in glob.iglob(directory, recursive=True): path = Path(path) # For some reason, LICENSES/** is resolved even though it # doesn't exist. I have no idea why. Deal with that here. if not Path(path).exists() or Path(path).is_dir(): continue if Path(path).suffix == ".license": continue path = self.relative_from_root(path) _LOGGER.debug( _("determining identifier of '{path}'").format(path=path) ) try: identifier = self._identifier_of_license(path) except IdentifierNotFound: if path.name in self.license_map: _LOGGER.info( _("{path} does not have a file extension").format( path=path ) ) identifier = path.name self.licenses_without_extension[identifier] = path else: identifier = path.stem _LOGGER.warning( _( "Could not resolve SPDX License Identifier of" " {path}, resolving to {identifier}. Make sure the" " license is in the license list found at" " <https://spdx.org/licenses/> or that it starts" " with 'LicenseRef-', and that it has a file" " extension." ).format(path=path, identifier=identifier) ) if identifier in license_files: _LOGGER.critical( _( "{identifier} is the SPDX License Identifier of both" " {path} and {other_path}" ).format( identifier=identifier, path=path, other_path=license_files[identifier], ) ) raise RuntimeError( f"Multiple licenses resolve to {identifier}" ) # Add the identifiers license_files[identifier] = path if ( identifier.startswith("LicenseRef-") and "Unknown" not in identifier ): self.license_map[identifier] = { "reference": str(path), "isDeprecatedLicenseId": False, "detailsUrl": None, "referenceNumber": None, "name": identifier, "licenseId": identifier, "seeAlso": [], "isOsiApproved": None, } return license_files
[docs]def create_project() -> Project: """Create a project object. Try to find the project root from CWD, otherwise treat CWD as root. """ root = find_root() if root is None: root = Path.cwd() return Project(root)