Source code for reuse.project

# SPDX-FileCopyrightText: 2017-2019 Free Software Foundation Europe e.V.
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Module that contains the central Project class."""

import glob
import logging
import os
from gettext import gettext as _
from pathlib import Path
from typing import Dict, Iterator, Optional

from boolean.boolean import ParseError
from debian.copyright import Copyright, NotMachineReadableError
from license_expression import ExpressionError

from . import (
    _IGNORE_DIR_PATTERNS,
    _IGNORE_FILE_PATTERNS,
    IdentifierNotFound,
    SpdxInfo,
)
from ._licenses import EXCEPTION_MAP, LICENSE_MAP
from ._util import (
    _HEADER_BYTES,
    GIT_EXE,
    PathLike,
    _all_files_ignored_by_git,
    _copyright_from_dep5,
    _determine_license_path,
    decoded_text_from_binary,
    extract_spdx_info,
    find_root,
    in_git_repo,
)

# Prefer the exception class shipped by debian.copyright; if that name cannot
# be imported from the installed package, define a local stand-in so that the
# ``except`` clauses in this module can always reference it.
try:
    # pylint: disable=ungrouped-imports
    from debian.copyright import MachineReadableFormatError
except ImportError:

    class MachineReadableFormatError(ValueError):
        """Stand-in used when :mod:`debian.copyright` does not provide
        ``MachineReadableFormatError``.

        Temporary workaround, see:

        https://github.com/fsfe/reuse-tool/issues/103
        """


# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)


class Project:
    """Simple object that holds the project's root, which is necessary for many
    interactions.

    Public attributes set by the constructor:

    - ``license_map``: copy of ``LICENSE_MAP`` extended with ``EXCEPTION_MAP``
      and with any ``LicenseRef-`` licenses found in the project.
    - ``licenses``: mapping of SPDX License Identifier to the license file's
      path relative to the project root (see :meth:`_licenses`).
    """

    def __init__(self, root: PathLike):
        """Create a project rooted at *root*.

        :raises NotADirectoryError: if *root* is not an existing directory.
        :raises RuntimeError: if two license files resolve to the same
            identifier (raised by :meth:`_licenses`).
        """
        self._root = Path(root)
        if not self._root.is_dir():
            raise NotADirectoryError("{} is no valid path".format(self._root))

        self._is_git_repo = False
        self._all_ignored_files = set()
        if GIT_EXE:
            self._is_git_repo = in_git_repo(self._root)
        else:
            _LOGGER.warning(_("could not find Git"))
        if self._is_git_repo:
            self._all_ignored_files = _all_files_ignored_by_git(self._root)

        self.license_map = LICENSE_MAP.copy()
        # TODO: Is this correct?
        self.license_map.update(EXCEPTION_MAP)
        # _licenses() reads self.root and mutates self.license_map, so it must
        # run after both have been set up.
        self.licenses = self._licenses()
        # Use '0' as "not loaded yet", because None is a valid value (it means
        # "no usable .reuse/dep5 file"). See the _copyright property.
        self._copyright_val = 0

    def all_files(
        self, directory: Optional[PathLike] = None
    ) -> Iterator[Path]:
        """Yield all files in *directory* and its subdirectories.

        The files that are not yielded are:

        - Files ignored by VCS (e.g., see .gitignore)

        - Files/directories matching IGNORE_*_PATTERNS.

        If *directory* is a file, yield it if it is not ignored. If
        *directory* is omitted, the project root is used.
        """
        if directory is None:
            directory = self.root
        directory = Path(directory)

        if directory.is_file() and not self._is_path_ignored(directory):
            _LOGGER.debug("yielding %s", directory)
            yield directory

        # os.walk() yields nothing when *directory* is a file, so the loop
        # below is a no-op in that case.
        for root, dirs, files in os.walk(directory):
            root = Path(root)
            _LOGGER.debug("currently walking in %s", root)

            # Don't walk ignored directories. Iterate over a copy, because
            # os.walk() only skips a directory when it is removed from the
            # very list object it handed out.
            for dir_ in list(dirs):
                if self._is_path_ignored(root / dir_):
                    _LOGGER.debug("ignoring %s", root / dir_)
                    dirs.remove(dir_)

            # Filter files.
            for file_ in files:
                if self._is_path_ignored(root / file_):
                    _LOGGER.debug("ignoring %s", root / file_)
                    continue

                _LOGGER.debug(_("yielding %s"), root / file_)
                yield root / file_

    def spdx_info_of(self, path: PathLike) -> SpdxInfo:
        """Return SPDX info of *path*.

        This function will return any SPDX information that it can find, both
        from within the file and from the .reuse/dep5 file. The two results are
        merged (set union) into a single :class:`SpdxInfo`.

        If the file holds an SPDX expression that cannot be parsed, an error
        is logged and only the .reuse/dep5 information (if any) is returned.
        """
        path = _determine_license_path(path)
        _LOGGER.debug("searching %s for SPDX information", path)

        dep5_result = SpdxInfo(set(), set())
        file_result = SpdxInfo(set(), set())

        # Search the .reuse/dep5 file for SPDX information.
        if self._copyright:
            dep5_result = _copyright_from_dep5(
                self._relative_from_root(path), self._copyright
            )
            if any(dep5_result):
                # Translators: %s is a path.
                _LOGGER.info(_("%s covered by .reuse/dep5"), path)

        # Search the file for SPDX information.
        with path.open("rb") as fp:
            try:
                file_result = extract_spdx_info(
                    decoded_text_from_binary(fp, size=_HEADER_BYTES)
                )
            except (ExpressionError, ParseError):
                _LOGGER.error(
                    _(
                        "%s holds an SPDX expression that cannot be parsed, "
                        "skipping the file"
                    ),
                    path,
                )

        return SpdxInfo(
            dep5_result.spdx_expressions.union(file_result.spdx_expressions),
            dep5_result.copyright_lines.union(file_result.copyright_lines),
        )

    def _relative_from_root(self, path: PathLike) -> Path:
        """If the project root is /tmp/project, and *path* is
        /tmp/project/src/file, then return src/file.
        """
        return Path(os.path.relpath(path, start=self.root))

    def _ignored_by_vcs(self, path: PathLike) -> bool:
        """Is *path* covered by the ignore mechanism of the VCS (e.g.,
        .gitignore)?
        """
        if self._is_git_repo:
            return self._ignored_by_git(path)
        return False

    def _ignored_by_git(self, path: PathLike) -> bool:
        """Is *path* covered by the ignore mechanism of git? Always return
        False if the project is not a git repository (which includes the case
        where git is not installed).
        """
        if not self._is_git_repo:
            return False

        # Accept any path-like value, not only Path objects.
        path = Path(path)
        relative = str(self._relative_from_root(path))
        # Directories are looked up with a trailing slash.
        # NOTE(review): presumably this matches how
        # _all_files_ignored_by_git() reports directories -- confirm there.
        if path.is_dir():
            relative = "{}/".format(relative)
        return relative in self._all_ignored_files

    def _is_path_ignored(self, path: PathLike) -> bool:
        """Is *path* ignored by some mechanism?

        A path is ignored when its name matches one of the ignore patterns
        for its kind (file or directory), or when the VCS ignores it.
        """
        path = Path(path)

        if path.is_file():
            patterns = _IGNORE_FILE_PATTERNS
        elif path.is_dir():
            patterns = _IGNORE_DIR_PATTERNS
        else:
            # Neither a file nor a directory: only the VCS check applies.
            patterns = ()

        if any(pattern.match(path.name) for pattern in patterns):
            return True

        return self._ignored_by_vcs(path)

    def _identifier_of_license(self, path: PathLike) -> str:
        """Figure out the SPDX License identifier of a license given its path.
        The name of the path (minus its extension) should be a valid SPDX
        License Identifier.

        :raises IdentifierNotFound: if the stem of *path* is neither a key of
            ``self.license_map`` nor prefixed with ``LicenseRef-``.
        """
        # Accept any path-like value, not only Path objects.
        stem = Path(path).stem
        if stem in self.license_map or stem.startswith("LicenseRef-"):
            return stem

        raise IdentifierNotFound(
            "Could not find SPDX License Identifier for {}".format(path)
        )

    @property
    def root(self) -> Path:
        """Path to the root of the project."""
        return self._root

    @property
    def _copyright(self) -> Optional[Copyright]:
        """The parsed .reuse/dep5 file, or None if it is missing, unreadable
        or has syntax errors. The file is parsed on first access only; the
        result (including None) is cached afterwards.
        """
        if self._copyright_val == 0:
            copyright_path = self.root / ".reuse" / "dep5"
            try:
                # The machine-readable copyright format is defined to be
                # UTF-8, so do not depend on the locale's default encoding.
                with copyright_path.open(encoding="utf-8") as fp:
                    self._copyright_val = Copyright(fp)
            except (IOError, OSError):
                _LOGGER.debug("no .reuse/dep5 file, or could not read it")
            except (NotMachineReadableError, MachineReadableFormatError):
                _LOGGER.exception(_(".reuse/dep5 has syntax errors"))

            # This check is a bit redundant, but otherwise I'd have to repeat
            # this line under each exception.
            if not self._copyright_val:
                self._copyright_val = None
        return self._copyright_val

    def _licenses(self) -> Dict[str, Path]:
        """Return a dictionary of all licenses in the project, with their SPDX
        identifiers as names and paths as values.

        The paths are relative to the project root. As a side effect, every
        ``LicenseRef-`` identifier that does not contain ``Unknown`` is added
        to ``self.license_map``.

        :raises RuntimeError: if two files resolve to the same identifier.
        """
        license_files = dict()

        # Escape the root so that glob metacharacters in the project's own
        # path (e.g. '[' or '*') are matched literally; only the trailing
        # '**' is meant to be a pattern.
        pattern = os.path.join(
            glob.escape(str(self.root.resolve())), "LICENSES", "**"
        )
        for found in glob.iglob(pattern, recursive=True):
            found = Path(found)
            # The recursive pattern also yields the LICENSES directory itself
            # and its subdirectories; skip everything that is not an existing
            # file.
            if not found.exists() or found.is_dir():
                continue
            # Skip side-car files that are not licenses themselves.
            if found.suffix in (".license", ".spdx"):
                continue

            path = self._relative_from_root(found)
            _LOGGER.debug(_("determining identifier of %s"), path)

            try:
                identifier = self._identifier_of_license(path)
            except IdentifierNotFound:
                # Fall back to the bare file stem and tell the user.
                identifier = path.stem
                _LOGGER.warning(
                    _(
                        "Could not resolve SPDX License Identifier of {path}, "
                        "resolving to {identifier}. Make sure the license is "
                        "in the license list found at "
                        "<https://spdx.org/licenses/> or that it starts with "
                        "'LicenseRef-', and that it has a file extension."
                    ).format(path=path, identifier=identifier)
                )

            if identifier in license_files:
                _LOGGER.critical(
                    _(
                        "{identifier} is the SPDX License Identifier of both "
                        "{path} and {other_path}"
                    ).format(
                        identifier=identifier,
                        path=path,
                        other_path=license_files[identifier],
                    )
                )
                raise RuntimeError(
                    "Multiple licenses resolve to {}".format(identifier)
                )

            # Add the identifiers
            license_files[identifier] = path
            if (
                identifier.startswith("LicenseRef-")
                and "Unknown" not in identifier
            ):
                self.license_map[identifier] = path

        return license_files
def create_project() -> Project:
    """Build a :class:`Project` for the current working environment.

    The root reported by ``find_root()`` is used when there is one; when it
    returns None, the current working directory serves as the root instead.
    """
    detected_root = find_root()
    return Project(Path.cwd() if detected_root is None else detected_root)