Skip to content

Commit

Permalink
toil(package managers:pip): refactor `_process_package_distributions()`, `DistributionPackageInfo`
Browse files Browse the repository at this point in the history

Signed-off-by: Ben Alkov <[email protected]>
  • Loading branch information
ben-alkov committed Feb 19, 2024
1 parent 74ace07 commit 4f92edb
Showing 1 changed file with 117 additions and 91 deletions.
208 changes: 117 additions & 91 deletions cachi2/core/package_managers/pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import zipfile
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from os import PathLike
from pathlib import Path
from typing import (
IO,
Expand Down Expand Up @@ -1386,7 +1385,9 @@ def _split_hashes_from_options(cls, options: list[str]) -> tuple[list[str], list


def _download_dependencies(
output_dir: RootedPath, requirements_file: PipRequirementsFile, allow_binary: bool = False
output_dir: RootedPath,
requirements_file: PipRequirementsFile,
allow_binary: bool = False,
) -> list[dict[str, Any]]:
"""
Download sdists (source distributions) of all dependencies in a requirements.txt file.
Expand All @@ -1400,6 +1401,7 @@ def _download_dependencies(
"""
options = _process_options(requirements_file.options)
trusted_hosts = set(options["trusted_hosts"])
downloaded: list[dict[str, Any]] = []

if options["require_hashes"]:
log.info("Global --require-hashes option used, will require hashes")
Expand All @@ -1419,72 +1421,102 @@ def _download_dependencies(
pip_deps_dir = output_dir.join_within_root("deps", "pip")
pip_deps_dir.path.mkdir(parents=True, exist_ok=True)

downloaded: list[dict[str, Any]] = []
to_download: list[DistributionPackageInfo] = []
def _check_local(
    artifacts: list[DistributionPackageInfo],
) -> tuple[list[DistributionPackageInfo], list[DistributionPackageInfo]]:
    """Partition artifacts by whether they already exist on disk.

    :param artifacts: candidate distribution artifacts for one requirement
    :return: ``(to_download, to_check)`` — artifacts missing locally vs.
        artifacts already present (which only need checksum verification)
    """
    to_download: list[DistributionPackageInfo] = []
    to_check: list[DistributionPackageInfo] = []
    # Iterating an empty list is a no-op, so no emptiness guard is needed,
    # and plain if/else replaces the original's `continue`-before-`else`.
    for artifact in artifacts:
        if artifact.path.exists():
            to_check.append(artifact)
            log.info("Artifact '%s' found locally", artifact.path.name)
        else:
            to_download.append(artifact)
    return to_download, to_check

def _hash_verify(
    download_info: dict[str, Any], dpi: Optional[DistributionPackageInfo] = None
) -> dict[str, Any]:
    """Verify checksums for one downloaded artifact and record the outcome.

    Two modes:
    - with ``dpi`` (pypi artifacts): verify against the checksums collected on
      the DPI object; on a mismatch the file is deleted and a warning logged
      (best-effort — the exception is swallowed, the fetch continues).
    - without ``dpi`` (vcs/url requirements): verify against the requirement's
      own hashes (or its ``cachito_hash`` qualifier); here a mismatch is NOT
      caught, so ``must_match_any_checksum`` propagates its error.

    NOTE(review): the else-branch reads ``require_hashes`` and the loop
    variable ``req`` from the enclosing scope — this helper is only safe to
    call from inside the requirements loop. Confirm before reusing elsewhere.

    :param download_info: mutable per-artifact metadata; gains a
        ``hash_verified`` bool
    :param dpi: the pypi artifact being verified, if any
    :return: the (possibly replaced) ``download_info`` dict
    """
    if dpi:
        # For pypi artifacts, the DPI object owns the canonical metadata dict.
        download_info = dpi.download_info

        download_info["hash_verified"] = False
        if dpi.should_verify_checksums():
            try:
                must_match_any_checksum(dpi.path, dpi.checksums_to_verify)
                download_info["hash_verified"] = True
            except PackageRejected:
                # Drop the bad artifact rather than abort the whole fetch.
                dpi.path.unlink()
                log.warning("Package '%s' is removed from the output directory", dpi.path.name)
    else:
        download_info["hash_verified"] = False
        if require_hashes or req.kind == "url":
            hashes = req.hashes or [req.qualifiers.get("cachito_hash", "")]
            must_match_any_checksum(download_info["path"], list(map(_to_checksum_info, hashes)))
            download_info["hash_verified"] = True

    return download_info

def _finalize(req: PipRequirement, download_info: dict[str, Any]) -> dict[str, Any]:
    """Stamp requirement metadata onto *download_info* and log success."""
    relative_path = download_info["path"].relative_to(output_dir)
    log.info(
        "Successfully downloaded/checked '%s' in path '%s'",
        req.download_line,
        relative_path,
    )
    download_info.update(
        kind=req.kind,
        requirement_file=str(requirements_file.file_path.subpath_from_root),
    )
    return download_info

for req in requirements_file.requirements:
log.info("Downloading %s", req.download_line)
log.info("Processing requirement line '%s'", req.download_line)

if req.kind == "pypi":
source, wheels = _process_package_distributions(req, pip_deps_dir, allow_binary)
if allow_binary:
# check if artifact already exists locally
to_download.extend(w for w in wheels if not w.path.exists())

if source is None:
# at least one wheel exists -> report in the SBOM
downloaded.append(
{
"package": req.package,
"version": req.version_specs[0][1],
"kind": req.kind,
"hash_verified": require_hashes,
"requirement_file": str(requirements_file.file_path.subpath_from_root),
}
sdist_to_download: list[DistributionPackageInfo]
sdist_to_check: list[DistributionPackageInfo]
wheels_to_download: list[DistributionPackageInfo]
wheels_to_check: list[DistributionPackageInfo]
artifacts = _process_package_distributions(req, pip_deps_dir, allow_binary)
sdist_to_download, sdist_to_check = _check_local(
[dpi for dpi in artifacts if dpi.package_type == "sdist"]
)
wheels_to_download, wheels_to_check = _check_local(
[dpi for dpi in artifacts if dpi.package_type == "wheel"]
)

download_info: dict[str, Any] = {}
for dpi in sdist_to_download + wheels_to_download:
log.info("Processing pypi download '%s'", dpi.path.name)
asyncio.run(
async_download_files({dpi.url: dpi.path}, get_config().concurrency_limit)
)
continue

download_binary_file(source.url, source.path, auth=None)
_check_metadata_in_sdist(source.path)
download_info = source.download_info
download_info = _hash_verify(download_info, dpi)
if dpi.package_type == "sdist":
_check_metadata_in_sdist(dpi.path)
downloaded.append(_finalize(req, download_info))

download_info = {}
for dpi in sdist_to_check + wheels_to_check:
log.info("Checking existing pypi artifact '%s'", dpi.path.name)
download_info = _hash_verify(download_info, dpi)
if dpi.package_type == "sdist":
_check_metadata_in_sdist(dpi.path)
downloaded.append(_finalize(req, download_info))

elif req.kind == "vcs":
download_info = _download_vcs_package(req, pip_deps_dir)
download_info = _hash_verify(download_info)
downloaded.append(_finalize(req, download_info))
elif req.kind == "url":
download_info = _download_url_package(req, pip_deps_dir, trusted_hosts)
download_info = _hash_verify(download_info)
downloaded.append(_finalize(req, download_info))
else:
# Should not happen
raise RuntimeError(f"Unexpected requirement kind: {req.kind!r}")

log.info(
"Successfully downloaded %s to %s",
req.download_line,
download_info["path"].relative_to(output_dir),
)

if require_hashes or req.kind == "url":
hashes = req.hashes or [req.qualifiers["cachito_hash"]]
must_match_any_checksum(download_info["path"], list(map(_to_checksum_info, hashes)))
download_info["hash_verified"] = True
else:
download_info["hash_verified"] = False

download_info["kind"] = req.kind
download_info["requirement_file"] = str(requirements_file.file_path.subpath_from_root)
downloaded.append(download_info)

if allow_binary:
log.info("Downloading %d wheel(s) ...", len(to_download))
files: dict[str, Union[str, PathLike[str]]] = {pkg.url: pkg.path for pkg in to_download}
asyncio.run(async_download_files(files, get_config().concurrency_limit))

for pkg in to_download:
try:
if pkg.should_verify_checksums():
must_match_any_checksum(pkg.path, pkg.checksums_to_verify)
except PackageRejected:
pkg.path.unlink()
log.warning("The %s is removed from the output directory", pkg.path.name)
raise RuntimeError(f"Unexpected requirement kind: '{req.kind!r}'")

return downloaded

Expand Down Expand Up @@ -1688,8 +1720,7 @@ class DistributionPackageInfo:
checksums_to_verify: set[ChecksumInfo] = field(init=False, default_factory=set)

def __post_init__(self) -> None:
if self.package_type == "wheel":
self.checksums_to_verify = self._determine_checksums_to_verify()
self.checksums_to_verify = self._determine_checksums_to_verify()

def _determine_checksums_to_verify(self) -> set[ChecksumInfo]:
"""Determine the set of checksums to verify for a given distribution package."""
Expand All @@ -1711,15 +1742,15 @@ def _determine_checksums_to_verify(self) -> set[ChecksumInfo]:
log.debug("%s: %s", self.path.name, msg)
return checksums

def should_download_wheel(self) -> bool:
"""Determine if the wheel should be downloaded.
def should_download(self) -> bool:
"""Determine if this artifact should be downloaded.
If the user specified any checksums, but they do not match with those
reported by PyPI, we do not want to download the wheel.
reported by PyPI, we do not want to download the artifact.
Otherwise, we do.
"""
return self.package_type == "wheel" and (
return (
len(self.checksums_to_verify) > 0
or len(self.pypi_checksums) == 0
or len(self.user_checksums) == 0
Expand All @@ -1741,49 +1772,47 @@ def download_info(self) -> dict[str, Any]:

def _process_package_distributions(
requirement: PipRequirement, pip_deps_dir: RootedPath, allow_binary: bool = False
) -> tuple[Optional[DistributionPackageInfo], list[DistributionPackageInfo]]:
) -> list[DistributionPackageInfo]:
"""
Return a DistributionPackageInfo object | a list of DPI objects, for the
provided pip package.
Return a DistributionPackageInfo object | a list of DPI objects, for the provided pip package.
Scrape the package's PyPI page and generate a list of all available
artifacts.
Filter by version and allowed artifact type.
Filter to find the best matching sdist artifact.
Process wheel artifacts.
:param requirement: which pip package to process
:param str pip_deps_dir:
:param bool allow_binary: process wheels?
:return: a single DistributionPackageInfo, or a list of DPI
:rtype: DistributionPackageInfo
"""
allowed_distros = ["sdist", "wheel"] if allow_binary else ["sdist"]
artifacts: list[DistributionPackageInfo] = []
best_sdist: list[DistributionPackageInfo] = []
client = pypi_simple.PyPISimple()
name = requirement.package
version = requirement.version_specs[0][1]
normalized_version = canonicalize_version(version)
user_checksums = set(map(_to_checksum_info, requirement.hashes))

client = pypi_simple.PyPISimple()
try:
timeout = get_config().requests_timeout
project_page = client.get_project_page(name, timeout)
packages = project_page.packages
packages: list[pypi_simple.DistributionPackage] = project_page.packages
except (requests.RequestException, pypi_simple.NoSuchProjectError) as e:
raise FetchError(f"PyPI query failed: {e}")

allowed_distros = ["sdist", "wheel"] if allow_binary else ["sdist"]
filtered_packages = filter(
lambda x: x.version is not None
and canonicalize_version(x.version) == normalized_version
and x.package_type is not None
and x.package_type in allowed_distros,
packages,
)

sdists: list[DistributionPackageInfo] = []
wheels: list[DistributionPackageInfo] = []
def _package_filter(pkg: pypi_simple.DistributionPackage) -> bool:
return (
pkg.version is not None
and canonicalize_version(pkg.version) == normalized_version
and pkg.package_type is not None
and pkg.package_type in allowed_distros
)

user_checksums = set(map(_to_checksum_info, requirement.hashes))
filtered_packages = (package for package in packages if _package_filter(package))

for package in filtered_packages:
pypi_checksums = {
Expand All @@ -1801,17 +1830,16 @@ def _process_package_distributions(
user_checksums,
)

if dpi.package_type == "sdist":
sdists.append(dpi)
if dpi.should_download():
artifacts.append(dpi)
else:
if dpi.should_download_wheel():
wheels.append(dpi)
else:
log.info("Filtering out %s due to checksum mismatch", package.filename)
log.info("Filtering out %s due to checksum mismatch", package.filename)

if len(sdists) != 0:
best_sdist = max(sdists, key=_sdist_preference)
if best_sdist.is_yanked:
sdists = [dpi for dpi in artifacts if dpi.package_type == "sdist"]
wheels = [dpi for dpi in artifacts if dpi.package_type == "wheel"]
if sdists:
best_sdist.append(max(sdists, key=_sdist_preference))
if best_sdist[0].is_yanked:
raise PackageRejected(
f"All sdists for package {name}=={version} are yanked",
solution=(
Expand All @@ -1823,7 +1851,6 @@ def _process_package_distributions(
)
else:
log.warning("No source distributions found for package %s==%s", name, version)
best_sdist = None

if len(wheels) == 0:
if allow_binary:
Expand All @@ -1841,14 +1868,13 @@ def _process_package_distributions(
"Alternately, allow the use of wheels."
)
docs = PIP_NO_SDIST_DOC

raise PackageRejected(
f"No distributions found for package {name}=={version}",
solution=solution,
docs=docs,
)

return best_sdist, wheels
return best_sdist + wheels


def _sdist_preference(sdist_pkg: DistributionPackageInfo) -> tuple[int, int]:
Expand Down

0 comments on commit 4f92edb

Please sign in to comment.