From 4f92edb91f675aed69d1f9a4728fb3e412a19e62 Mon Sep 17 00:00:00 2001
From: Ben Alkov
Date: Wed, 7 Feb 2024 14:23:26 -0500
Subject: [PATCH] toil(package managers:pip): refactor
 `_process_package_distributions()`, `DistributionPackageInfo`

Collapse the per-kind special cases in `_download_dependencies()` into
shared local helpers (`_check_local()`, `_hash_verify()`, `_finalize()`),
make `_process_package_distributions()` return a single list of
`DistributionPackageInfo` objects, and generalize
`should_download_wheel()` into `should_download()` so that sdists and
wheels are filtered by the same checksum rules.

Signed-off-by: Ben Alkov
---
 cachi2/core/package_managers/pip.py | 208 ++++++++++++++++------------
 1 file changed, 117 insertions(+), 91 deletions(-)

diff --git a/cachi2/core/package_managers/pip.py b/cachi2/core/package_managers/pip.py
index 27e18db26..65bc55976 100644
--- a/cachi2/core/package_managers/pip.py
+++ b/cachi2/core/package_managers/pip.py
@@ -12,7 +12,6 @@
 import zipfile
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from os import PathLike
 from pathlib import Path
 from typing import (
     IO,
@@ -1386,7 +1385,9 @@ def _split_hashes_from_options(cls, options: list[str]) -> tuple[list[str], list


 def _download_dependencies(
-    output_dir: RootedPath, requirements_file: PipRequirementsFile, allow_binary: bool = False
+    output_dir: RootedPath,
+    requirements_file: PipRequirementsFile,
+    allow_binary: bool = False,
 ) -> list[dict[str, Any]]:
     """
     Download sdists (source distributions) of all dependencies in a requirements.txt file.
@@ -1400,6 +1401,7 @@ def _download_dependencies(
     """
     options = _process_options(requirements_file.options)
     trusted_hosts = set(options["trusted_hosts"])
+    downloaded: list[dict[str, Any]] = []

     if options["require_hashes"]:
         log.info("Global --require-hashes option used, will require hashes")

     pip_deps_dir = output_dir.join_within_root("deps", "pip")
     pip_deps_dir.path.mkdir(parents=True, exist_ok=True)

-    downloaded: list[dict[str, Any]] = []
-    to_download: list[DistributionPackageInfo] = []
+    def _check_local(
+        artifacts: list[DistributionPackageInfo],
+    ) -> tuple[list[DistributionPackageInfo], list[DistributionPackageInfo]]:
+        """Split artifacts into those still to download and those already present locally."""
+        to_download: list[DistributionPackageInfo] = []
+        to_check: list[DistributionPackageInfo] = []
+        for artifact in artifacts:
+            if artifact.path.exists():
+                log.info("Artifact '%s' found locally", artifact.path.name)
+                to_check.append(artifact)
+            else:
+                to_download.append(artifact)
+        return to_download, to_check
+
+    def _hash_verify(
+        download_info: dict[str, Any], dpi: Optional[DistributionPackageInfo] = None
+    ) -> dict[str, Any]:
+        if dpi is not None:
+            # pypi artifact: verify against the checksums attached to the DPI
+            download_info = dpi.download_info
+            download_info["hash_verified"] = False
+            if dpi.should_verify_checksums():
+                try:
+                    must_match_any_checksum(dpi.path, dpi.checksums_to_verify)
+                    download_info["hash_verified"] = True
+                except PackageRejected:
+                    dpi.path.unlink()
+                    log.warning("Package '%s' was removed from the output directory", dpi.path.name)
+        else:
+            # vcs/url requirement: fall back to the hashes on the requirement line
+            download_info["hash_verified"] = False
+            if require_hashes or req.kind == "url":
+                hashes = req.hashes or [req.qualifiers["cachito_hash"]]
+                must_match_any_checksum(download_info["path"], list(map(_to_checksum_info, hashes)))
+                download_info["hash_verified"] = True
+
+        return download_info
+
+    def _finalize(req: PipRequirement, download_info: dict[str, Any]) -> dict[str, Any]:
+        log.info(
+            "Successfully downloaded/checked '%s' at path '%s'",
+            req.download_line,
+            download_info["path"].relative_to(output_dir),
+        )
+        download_info["kind"] = req.kind
+        download_info["requirement_file"] = str(requirements_file.file_path.subpath_from_root)
+        return download_info
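The split `_check_local()` performs is just a partition on `Path.exists()`: files already in the deps dir only need their checksums re-verified, everything else must be fetched. A self-contained sketch of the same pattern (standalone function and hypothetical file names, not the closure above):

    from pathlib import Path

    def split_by_existence(paths: list[Path]) -> tuple[list[Path], list[Path]]:
        # (to_download, to_check): missing files must be fetched,
        # existing files only need their checksums re-verified
        to_download = [p for p in paths if not p.exists()]
        to_check = [p for p in paths if p.exists()]
        return to_download, to_check

    paths = [Path("deps/pip/foo-1.0.tar.gz"), Path("deps/pip/foo-1.0-py3-none-any.whl")]
    to_download, to_check = split_by_existence(paths)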

     for req in requirements_file.requirements:
-        log.info("Downloading %s", req.download_line)
+        log.info("Processing requirement line '%s'", req.download_line)

         if req.kind == "pypi":
-            source, wheels = _process_package_distributions(req, pip_deps_dir, allow_binary)
-            if allow_binary:
-                # check if artifact already exists locally
-                to_download.extend(w for w in wheels if not w.path.exists())
-
-            if source is None:
-                # at least one wheel exists -> report in the SBOM
-                downloaded.append(
-                    {
-                        "package": req.package,
-                        "version": req.version_specs[0][1],
-                        "kind": req.kind,
-                        "hash_verified": require_hashes,
-                        "requirement_file": str(requirements_file.file_path.subpath_from_root),
-                    }
-                )
-                continue
-            download_binary_file(source.url, source.path, auth=None)
-            _check_metadata_in_sdist(source.path)
-            download_info = source.download_info
+            artifacts = _process_package_distributions(req, pip_deps_dir, allow_binary)
+            sdist_to_download, sdist_to_check = _check_local(
+                [dpi for dpi in artifacts if dpi.package_type == "sdist"]
+            )
+            wheels_to_download, wheels_to_check = _check_local(
+                [dpi for dpi in artifacts if dpi.package_type == "wheel"]
+            )
+
+            for dpi in sdist_to_download + wheels_to_download:
+                log.info("Processing pypi download '%s'", dpi.path.name)
+                asyncio.run(
+                    async_download_files({dpi.url: dpi.path}, get_config().concurrency_limit)
+                )
+                download_info = _hash_verify({}, dpi)
+                # a failed checksum unlinks the file, so guard the sdist inspection
+                if dpi.package_type == "sdist" and dpi.path.exists():
+                    _check_metadata_in_sdist(dpi.path)
+                downloaded.append(_finalize(req, download_info))
+
+            for dpi in sdist_to_check + wheels_to_check:
+                log.info("Checking existing pypi artifact '%s'", dpi.path.name)
+                download_info = _hash_verify({}, dpi)
+                if dpi.package_type == "sdist" and dpi.path.exists():
+                    _check_metadata_in_sdist(dpi.path)
+                downloaded.append(_finalize(req, download_info))
         elif req.kind == "vcs":
             download_info = _download_vcs_package(req, pip_deps_dir)
+            download_info = _hash_verify(download_info)
+            downloaded.append(_finalize(req, download_info))
         elif req.kind == "url":
             download_info = _download_url_package(req, pip_deps_dir, trusted_hosts)
+            download_info = _hash_verify(download_info)
+            downloaded.append(_finalize(req, download_info))
         else:
             # Should not happen
             raise RuntimeError(f"Unexpected requirement kind: {req.kind!r}")
-
-        log.info(
-            "Successfully downloaded %s to %s",
-            req.download_line,
-            download_info["path"].relative_to(output_dir),
-        )
-
-        if require_hashes or req.kind == "url":
-            hashes = req.hashes or [req.qualifiers["cachito_hash"]]
-            must_match_any_checksum(download_info["path"], list(map(_to_checksum_info, hashes)))
-            download_info["hash_verified"] = True
-        else:
-            download_info["hash_verified"] = False
-
-        download_info["kind"] = req.kind
-        download_info["requirement_file"] = str(requirements_file.file_path.subpath_from_root)
-        downloaded.append(download_info)
-
-    if allow_binary:
-        log.info("Downloading %d wheel(s) ...", len(to_download))
-        files: dict[str, Union[str, PathLike[str]]] = {pkg.url: pkg.path for pkg in to_download}
-        asyncio.run(async_download_files(files, get_config().concurrency_limit))
-
-        for pkg in to_download:
-            try:
-                if pkg.should_verify_checksums():
-                    must_match_any_checksum(pkg.path, pkg.checksums_to_verify)
-            except PackageRejected:
-                pkg.path.unlink()
-                log.warning("The %s is removed from the output directory", pkg.path.name)

     return downloaded
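Note that `asyncio.run()` creates and tears down a fresh event loop on every call, so downloading one artifact per call (as in the pypi loop above) serializes the transfers; `get_config().concurrency_limit` only has an effect when many URLs go into a single `async_download_files()` call. A minimal self-contained sketch of that batching pattern, with a dummy downloader and a hypothetical URL (not cachi2's actual `async_download_files` implementation):

    import asyncio

    async def _fetch(url: str, dest: str) -> None:
        await asyncio.sleep(0)  # stand-in for a real HTTP transfer
        print(f"downloaded {url} -> {dest}")

    async def _fetch_all(files: dict[str, str], limit: int) -> None:
        # one semaphore bounds concurrency across the whole batch
        sem = asyncio.Semaphore(limit)

        async def _bounded(url: str, dest: str) -> None:
            async with sem:
                await _fetch(url, dest)

        await asyncio.gather(*(_bounded(u, d) for u, d in files.items()))

    # one event loop for the whole batch, instead of one loop per artifact
    asyncio.run(_fetch_all({"https://example.com/pkg.tar.gz": "deps/pip/pkg.tar.gz"}, limit=5))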
@@ -1688,8 +1720,7 @@ class DistributionPackageInfo:
     checksums_to_verify: set[ChecksumInfo] = field(init=False, default_factory=set)

     def __post_init__(self) -> None:
-        if self.package_type == "wheel":
-            self.checksums_to_verify = self._determine_checksums_to_verify()
+        self.checksums_to_verify = self._determine_checksums_to_verify()

     def _determine_checksums_to_verify(self) -> set[ChecksumInfo]:
         """Determine the set of checksums to verify for a given distribution package."""
@@ -1711,15 +1742,15 @@ def _determine_checksums_to_verify(self) -> set[ChecksumInfo]:
         log.debug("%s: %s", self.path.name, msg)
         return checksums

-    def should_download_wheel(self) -> bool:
-        """Determine if the wheel should be downloaded.
+    def should_download(self) -> bool:
+        """Determine if this artifact should be downloaded.

         If the user specified any checksums, but they do not match with those
-        reported by PyPI, we do not want to download the wheel.
+        reported by PyPI, we do not want to download the artifact.

         Otherwise, we do.
         """
-        return self.package_type == "wheel" and (
+        return (
             len(self.checksums_to_verify) > 0
             or len(self.pypi_checksums) == 0
             or len(self.user_checksums) == 0
         )
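The predicate above rejects an artifact only when both the user and PyPI supplied checksums yet none overlap; if either side is silent, the artifact is downloaded and verified later. A tiny standalone restatement of that logic (plain sets instead of `ChecksumInfo`, assuming `checksums_to_verify` holds the overlap of the two sets, which is presumably what `_determine_checksums_to_verify()` computes):

    def should_download(to_verify: set[str], pypi: set[str], user: set[str]) -> bool:
        # download unless both sides provided checksums and they share none
        return len(to_verify) > 0 or len(pypi) == 0 or len(user) == 0

    assert should_download(set(), set(), set())  # no checksums anywhere: fetch
    assert should_download({"sha256:abc"}, {"sha256:abc"}, {"sha256:abc"})  # overlap: fetch
    assert not should_download(set(), {"sha256:abc"}, {"sha256:def"})  # mismatch: skip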
@@ -1741,10 +1772,9 @@ def download_info(self) -> dict[str, Any]:

 def _process_package_distributions(
     requirement: PipRequirement, pip_deps_dir: RootedPath, allow_binary: bool = False
-) -> tuple[Optional[DistributionPackageInfo], list[DistributionPackageInfo]]:
+) -> list[DistributionPackageInfo]:
     """
-    Return a DistributionPackageInfo object | a list of DPI objects, for the
-    provided pip package.
+    Return a list of DistributionPackageInfo objects for the provided pip package.

     Scrape the package's PyPI page and generate a list of all available artifacts.

@@ -1752,38 +1782,37 @@ def _process_package_distributions(
     Filter to find the best matching sdist artifact.
     Process wheel artifacts.
-
     :param requirement: which pip package to process
     :param str pip_deps_dir:
     :param bool allow_binary: process wheels?
-    :return: a single DistributionPackageInfo, or a list of DPI
-    :rtype: DistributionPackageInfo
+    :return: a list of DistributionPackageInfo objects
+    :rtype: list[DistributionPackageInfo]
     """
+    allowed_distros = ["sdist", "wheel"] if allow_binary else ["sdist"]
+    artifacts: list[DistributionPackageInfo] = []
+    best_sdist: list[DistributionPackageInfo] = []
+    client = pypi_simple.PyPISimple()
     name = requirement.package
     version = requirement.version_specs[0][1]
     normalized_version = canonicalize_version(version)
+    user_checksums = set(map(_to_checksum_info, requirement.hashes))

-    client = pypi_simple.PyPISimple()
     try:
         timeout = get_config().requests_timeout
         project_page = client.get_project_page(name, timeout)
-        packages = project_page.packages
+        packages: list[pypi_simple.DistributionPackage] = project_page.packages
     except (requests.RequestException, pypi_simple.NoSuchProjectError) as e:
         raise FetchError(f"PyPI query failed: {e}")

-    allowed_distros = ["sdist", "wheel"] if allow_binary else ["sdist"]
-    filtered_packages = filter(
-        lambda x: x.version is not None
-        and canonicalize_version(x.version) == normalized_version
-        and x.package_type is not None
-        and x.package_type in allowed_distros,
-        packages,
-    )
-
-    sdists: list[DistributionPackageInfo] = []
-    wheels: list[DistributionPackageInfo] = []
+    def _package_filter(pkg: pypi_simple.DistributionPackage) -> bool:
+        return (
+            pkg.version is not None
+            and canonicalize_version(pkg.version) == normalized_version
+            and pkg.package_type is not None
+            and pkg.package_type in allowed_distros
+        )

-    user_checksums = set(map(_to_checksum_info, requirement.hashes))
+    filtered_packages = (package for package in packages if _package_filter(package))

     for package in filtered_packages:
         pypi_checksums = {
@@ -1801,17 +1830,16 @@ def _process_package_distributions(
             user_checksums,
         )

-        if dpi.package_type == "sdist":
-            sdists.append(dpi)
+        if dpi.should_download():
+            artifacts.append(dpi)
         else:
-            if dpi.should_download_wheel():
-                wheels.append(dpi)
-            else:
-                log.info("Filtering out %s due to checksum mismatch", package.filename)
+            log.info("Filtering out %s due to checksum mismatch", package.filename)

-    if len(sdists) != 0:
-        best_sdist = max(sdists, key=_sdist_preference)
-        if best_sdist.is_yanked:
+    sdists = [dpi for dpi in artifacts if dpi.package_type == "sdist"]
+    wheels = [dpi for dpi in artifacts if dpi.package_type == "wheel"]
+    if sdists:
+        best_sdist.append(max(sdists, key=_sdist_preference))
+        if best_sdist[0].is_yanked:
             raise PackageRejected(
                 f"All sdists for package {name}=={version} are yanked",
                 solution=(
@@ -1823,7 +1851,6 @@ def _process_package_distributions(
         )
     else:
         log.warning("No source distributions found for package %s==%s", name, version)
-        best_sdist = None

     if len(wheels) == 0:
         if allow_binary:
@@ -1841,14 +1868,13 @@ def _process_package_distributions(
                 "Alternately, allow the use of wheels."
             )
             docs = PIP_NO_SDIST_DOC
-
         raise PackageRejected(
             f"No distributions found for package {name}=={version}",
             solution=solution,
             docs=docs,
         )

-    return best_sdist, wheels
+    return best_sdist + wheels


 def _sdist_preference(sdist_pkg: DistributionPackageInfo) -> tuple[int, int]:
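With the tuple return gone, `_process_package_distributions()` hands back at most one preferred sdist followed by all acceptable wheels, and call sites partition the combined list by `package_type`, as `_download_dependencies()` now does. A hedged usage sketch of that contract (hypothetical paths; the `PipRequirementsFile` constructor argument and the `cachi2.core.rooted_path` import are assumptions, the rest matches names in this diff):

    from cachi2.core.package_managers.pip import (
        PipRequirementsFile,
        _process_package_distributions,
    )
    from cachi2.core.rooted_path import RootedPath

    # hypothetical inputs
    src_root = RootedPath("/src")
    req = PipRequirementsFile(src_root.join_within_root("requirements.txt")).requirements[0]
    pip_deps_dir = RootedPath("/tmp/output").join_within_root("deps", "pip")

    artifacts = _process_package_distributions(req, pip_deps_dir, allow_binary=True)
    sdists = [dpi for dpi in artifacts if dpi.package_type == "sdist"]
    wheels = [dpi for dpi in artifacts if dpi.package_type == "wheel"]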