Skip to content

Commit

Permalink
Add pip to SBOM at release stage
Browse files Browse the repository at this point in the history
Co-authored-by: Ezio Melotti <[email protected]>
  • Loading branch information
sethmlarson and ezio-melotti authored Feb 12, 2024
1 parent 69f572e commit d29c9c3
Show file tree
Hide file tree
Showing 4 changed files with 281 additions and 14 deletions.
1 change: 1 addition & 0 deletions dev-requirements.in
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pytest
pytest-mock
6 changes: 6 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ pluggy==1.4.0 \
pytest==8.0.0 \
--hash=sha256:249b1b0864530ba251b7438274c4d251c58d868edaaec8762893ad4a0d71c36c \
--hash=sha256:50fb9cbe836c3f20f0dfa99c565201fb75dc54c8d76373cd1bde06b06657bdb6
# via
# -r dev-requirements.in
# pytest-mock
pytest-mock==3.12.0 \
--hash=sha256:0972719a7263072da3a21c7f4773069bcc7486027d7e8e1f81d98a47e701bc4f \
--hash=sha256:31a40f038c22cad32287bb43932054451ff5583ff094bca6f675df2f8bc1a6e9
# via -r dev-requirements.in
tomli==2.0.1 \
--hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \
Expand Down
228 changes: 215 additions & 13 deletions sbom.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@

import datetime
import hashlib
import io
import json
import os
import re
import subprocess
import sys
import tarfile
import zipfile
from urllib.request import urlopen
import typing


def spdx_id(value: str) -> str:
Expand Down Expand Up @@ -132,6 +136,186 @@ def recursive_sort_in_place(value):
recursive_sort_in_place(sbom_data)


def fetch_package_metadata_from_pypi(project: str, version: str, filename: str | None = None) -> tuple[str, str] | None:
"""
Fetches the SHA256 checksum and download location from PyPI.
If we're given a filename then we match with that, otherwise we use wheels.
"""
# Get the package download URL from PyPI.
try:
raw_text = urlopen(f"https://pypi.org/pypi/{project}/{version}/json").read()
release_metadata = json.loads(raw_text)
url: dict[str, typing.Any]

# Look for a matching artifact filename and then check
# its remote checksum to the local one.
for url in release_metadata["urls"]:
# pip can only use Python-only dependencies, so there's
# no risk of picking the 'incorrect' wheel here.
if (
(filename is None and url["packagetype"] == "bdist_wheel")
or (filename is not None and url["filename"] == filename)
):
break
else:
raise ValueError(f"No matching filename on PyPI for '{filename}'")

# Successfully found the download URL for the matching artifact.
download_url = url["url"]
checksum_sha256 = url["digests"]["sha256"]
return download_url, checksum_sha256

except (OSError, ValueError) as e:
raise ValueError(f"Couldn't fetch metadata for project '{project}' from PyPI: {e}")


def remove_pip_from_sbom(sbom_data: dict[str, typing.Any]) -> None:
    """
    Removes pip and its dependencies from the SBOM data, in place.
    This is only necessary if there's potential we get
    pip SBOM data from the CPython source SBOM.

    A package counts as a pip dependency if a 'DEPENDS_ON' relationship
    exists from the pip package to it. Every relationship that touches a
    removed package on either side is dropped as well, so nothing is left
    pointing at (or from) a package that no longer exists.
    """
    sbom_pip_spdx_id = spdx_id("SPDXRef-PACKAGE-pip")

    # pip itself plus every package SPDXID that pip depends on.
    sbom_spdx_ids_to_remove = {sbom_pip_spdx_id}
    sbom_spdx_ids_to_remove.update(
        sbom_relationship["relatedSpdxElement"]
        for sbom_relationship in sbom_data["relationships"]
        if sbom_relationship["relationshipType"] == "DEPENDS_ON"
        and sbom_relationship["spdxElementId"] == sbom_pip_spdx_id
    )

    # Remove all the packages and relationships.
    sbom_data["packages"] = [
        sbom_package for sbom_package in sbom_data["packages"]
        if sbom_package["SPDXID"] not in sbom_spdx_ids_to_remove
    ]
    # Check both ends of each relationship: filtering only on the related
    # element would keep relationships whose *source* is a removed package.
    sbom_data["relationships"] = [
        sbom_relationship for sbom_relationship in sbom_data["relationships"]
        if sbom_relationship["spdxElementId"] not in sbom_spdx_ids_to_remove
        and sbom_relationship["relatedSpdxElement"] not in sbom_spdx_ids_to_remove
    ]


def create_pip_sbom_from_wheel(
    sbom_data: dict[str, typing.Any],
    pip_wheel_filename: str,
    pip_wheel_bytes: bytes
) -> None:
    """
    pip is a part of a packaging ecosystem (Python, surprise!) so it's actually
    automatable to discover the metadata we need like the version and checksums
    so let's do that on behalf of our friends at the PyPA. This function also
    discovers vendored packages within pip and fetches their metadata.

    'sbom_data' is updated in place: any existing pip entries are removed, then
    one package per pin in the wheel's 'pip/_vendor/vendor.txt' is appended,
    followed by pip itself, 'pip DEPENDS_ON <dependency>' relationships and a
    final 'SPDXRef-PACKAGE-cpython DEPENDS_ON pip' relationship.

    Raises ValueError if the wheel's SHA256 doesn't match the one PyPI reports
    for 'pip_wheel_filename', or if 'vendor.txt' contains a line that isn't a
    plain 'name==version' pin.
    """
    # Remove pip from the SBOM in case it's included in the CPython source code SBOM.
    remove_pip_from_sbom(sbom_data)

    # Wheel filename format puts the version right after the project name.
    pip_version = pip_wheel_filename.split("-")[1]
    pip_checksum_sha256 = hashlib.sha256(pip_wheel_bytes).hexdigest()

    pip_download_url, pip_actual_sha256 = fetch_package_metadata_from_pypi(
        project="pip",
        version=pip_version,
        filename=pip_wheel_filename,
    )
    if pip_actual_sha256 != pip_checksum_sha256:
        raise ValueError("pip wheel checksum doesn't match PyPI")

    # Parse 'pip/_vendor/vendor.txt' from the wheel for sub-dependencies.
    with zipfile.ZipFile(io.BytesIO(pip_wheel_bytes)) as whl:
        vendor_txt_data = whl.read("pip/_vendor/vendor.txt").decode()

    sbom_pip_dependency_spdx_ids = set()
    for project_name, project_version in _parse_vendor_txt_pins(vendor_txt_data):
        # Fetch the metadata from PyPI
        project_download_url, project_checksum_sha256 = (
            fetch_package_metadata_from_pypi(project_name, project_version)
        )

        # Update our SBOM data with what we received from PyPI.
        sbom_project_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{project_name}")
        sbom_pip_dependency_spdx_ids.add(sbom_project_spdx_id)
        sbom_data["packages"].append({
            "SPDXID": sbom_project_spdx_id,
            "name": project_name,
            "versionInfo": project_version,
            "downloadLocation": project_download_url,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": project_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/{project_name}@{project_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
            "licenseConcluded": "NOASSERTION",
        })

    # Now we add pip to the SBOM and dependency relationships
    sbom_pip_spdx_id = spdx_id("SPDXRef-PACKAGE-pip")
    sbom_data["packages"].append(
        {
            "SPDXID": sbom_pip_spdx_id,
            "name": "pip",
            "versionInfo": pip_version,
            "originator": "Organization: Python Packaging Authority",
            "licenseConcluded": "NOASSERTION",
            "downloadLocation": pip_download_url,
            "checksums": [
                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
            ],
            "externalRefs": [
                {
                    "referenceCategory": "SECURITY",
                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
                    "referenceType": "cpe23Type",
                },
                {
                    "referenceCategory": "PACKAGE_MANAGER",
                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
                    "referenceType": "purl",
                },
            ],
            "primaryPackagePurpose": "SOURCE",
        }
    )
    # Sorted so the relationships are appended in a stable order.
    for sbom_dep_spdx_id in sorted(sbom_pip_dependency_spdx_ids):
        sbom_data["relationships"].append({
            "spdxElementId": sbom_pip_spdx_id,
            "relatedSpdxElement": sbom_dep_spdx_id,
            "relationshipType": "DEPENDS_ON"
        })

    # Finally, CPython depends on pip.
    sbom_data["relationships"].append(
        {
            "spdxElementId": "SPDXRef-PACKAGE-cpython",
            "relatedSpdxElement": sbom_pip_spdx_id,
            "relationshipType": "DEPENDS_ON",
        }
    )


def _parse_vendor_txt_pins(vendor_txt_data: str) -> list[tuple[str, str]]:
    """
    Parses the contents of pip's 'vendor.txt' into '(name, version)' pins,
    in file order, with project names lowercased.

    Comments ('#' to end of line) and blank lines are ignored. Raises
    ValueError on any other line that isn't a 'name==version' pin.
    """
    # With this version regex we're assuming that pip isn't using pre-releases.
    # Any version that doesn't match is rejected below, so we're safe doing this.
    version_pin_re = re.compile(r"^([a-zA-Z0-9_.-]+)==([0-9.]*[0-9])$")
    pins = []
    for line in vendor_txt_data.splitlines():
        line = line.partition("#")[0].strip()  # Strip comments and whitespace.
        if not line:  # Skip empty lines.
            continue

        # Non-empty lines we must be able to match. This is input validation,
        # so raise explicitly instead of asserting (asserts vanish under -O).
        match = version_pin_re.match(line)
        if match is None:
            raise ValueError(f"Unparseable line in vendor.txt: {line!r}")

        # Parse out and normalize the project name.
        project_name, project_version = match.groups()
        pins.append((project_name.lower(), project_version))
    return pins


def create_sbom_for_source_tarball(tarball_path: str):
"""Stitches together an SBOM for a source tarball"""
tarball_name = os.path.basename(tarball_path)
Expand Down Expand Up @@ -163,9 +347,9 @@ def create_sbom_for_source_tarball(tarball_path: str):
"Tarball doesn't contain an SBOM at 'Misc/sbom.spdx.json'"
) from None
sbom_bytes = tarball.extractfile(sbom_tarball_member).read()
sbom_data = json.loads(sbom_bytes)

sbom = json.loads(sbom_bytes)
sbom.update({
sbom_data.update({
"SPDXID": "SPDXRef-DOCUMENT",
"spdxVersion": "SPDX-2.3",
"name": "CPython SBOM",
Expand Down Expand Up @@ -211,18 +395,36 @@ def create_sbom_for_source_tarball(tarball_path: str):
}

# The top-level CPython package depends on every vendored sub-package.
for sbom_package in sbom["packages"]:
sbom["relationships"].append({
for sbom_package in sbom_data["packages"]:
sbom_data["relationships"].append({
"spdxElementId": sbom_cpython_package["SPDXID"],
"relatedSpdxElement": sbom_package["SPDXID"],
"relationshipType": "DEPENDS_ON",
})

sbom["packages"].append(sbom_cpython_package)
sbom_data["packages"].append(sbom_cpython_package)

# Find the pip wheel in ensurepip in the tarball
for member in tarball.getmembers():
match = re.match(rf"^Python-{cpython_version}/Lib/ensurepip/_bundled/(pip-.*\.whl)$", member.name)
if match is not None:
pip_wheel_filename = match.group(1)
pip_wheel_bytes = tarball.extractfile(member).read()
break
else:
raise ValueError("Could not find pip wheel in 'Lib/ensurepip/_bundled/...'")

# Now add pip to the SBOM. We do this after the above step to avoid
# CPython being dependent on packages that pip is dependent on.
create_pip_sbom_from_wheel(
sbom_data=sbom_data,
pip_wheel_filename=pip_wheel_filename,
pip_wheel_bytes=pip_wheel_bytes
)

# Extract all currently known files from the SBOM with their checksums.
known_sbom_files = {}
for sbom_file in sbom["files"]:
for sbom_file in sbom_data["files"]:
sbom_filename = sbom_file["fileName"]

# Look for the expected SHA256 checksum.
Expand Down Expand Up @@ -267,7 +469,7 @@ def create_sbom_for_source_tarball(tarball_path: str):
# If this is a new file, then it's a part of the 'CPython' SBOM package.
else:
sbom_file_spdx_id = spdx_id(f"SPDXRef-FILE-{member_name_no_prefix}")
sbom["files"].append(
sbom_data["files"].append(
{
"SPDXID": sbom_file_spdx_id,
"fileName": member_name_no_prefix,
Expand All @@ -283,7 +485,7 @@ def create_sbom_for_source_tarball(tarball_path: str):
],
}
)
sbom["relationships"].append(
sbom_data["relationships"].append(
{
"spdxElementId": sbom_cpython_package["SPDXID"],
"relatedSpdxElement": sbom_file_spdx_id,
Expand All @@ -300,7 +502,7 @@ def create_sbom_for_source_tarball(tarball_path: str):
)

# Final relationship, this SBOM describes the CPython package.
sbom["relationships"].append(
sbom_data["relationships"].append(
{
"spdxElementId": "SPDXRef-DOCUMENT",
"relatedSpdxElement": sbom_cpython_package["SPDXID"],
Expand All @@ -310,17 +512,17 @@ def create_sbom_for_source_tarball(tarball_path: str):

# Apply the 'supplier' tag to every package since we're shipping
# the package in the tarball itself. Originator field is used for maintainers.
for sbom_package in sbom["packages"]:
for sbom_package in sbom_data["packages"]:
sbom_package["supplier"] = "Organization: Python Software Foundation"
sbom_package["filesAnalyzed"] = True

# Calculate the 'packageVerificationCode' values for files in packages.
calculate_package_verification_codes(sbom)
calculate_package_verification_codes(sbom_data)

# Normalize SBOM structures for reproducibility.
normalize_sbom_data(sbom)
normalize_sbom_data(sbom_data)

return sbom
return sbom_data


def main() -> None:
Expand Down
60 changes: 59 additions & 1 deletion tests/test_sbom.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import pytest
import json
import random
import hashlib
import unittest.mock

import pytest

import sbom


Expand Down Expand Up @@ -58,3 +62,57 @@ def test_normalization():
"a": [1, 2, 3, {"b": ["c", 4, ["2", 7, True, {}]]}],
"b": [["a", 1, 2], ["b", 1, 2]]
}


def test_fetch_project_metadata_from_pypi(mocker):
    """Check artifact selection against a canned PyPI JSON response."""
    # A trimmed-down release listing: just one sdist and one wheel entry.
    sdist_entry = {
        "digests": {
            "blake2b_256": "94596638090c25e9bc4ce0c42817b5a234e183872a1129735a9330c472cc2056",
            "md5": "1331aabb4d1a2677f493effeebda3605",
            "sha256": "ea9bd1a847e8c5774a5777bb398c19e80bcd4e2aa16a4b301b718fe6f593aba2"
        },
        "filename": "pip-24.0.tar.gz",
        "packagetype": "sdist",
        "url": "https://files.pythonhosted.org/packages/.../pip-24.0.tar.gz",
    }
    wheel_entry = {
        "digests": {
            "blake2b_256": "8a6a19e9fe04fca059ccf770861c7d5721ab4c2aebc539889e97c7977528a53b",
            "md5": "74e3c5e4082113b1239ca0e9abfd1e82",
            "sha256": "ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc"
        },
        "filename": "pip-24.0-py3-none-any.whl",
        "packagetype": "bdist_wheel",
        "url": "https://files.pythonhosted.org/packages/.../pip-24.0-py3-none-any.whl",
    }

    # Every call to sbom.urlopen() hands back the same fake response object.
    fake_response = unittest.mock.Mock()
    fake_response.read.return_value = json.dumps(
        {"urls": [sdist_entry, wheel_entry]}
    ).encode()
    patched_urlopen = mocker.patch("sbom.urlopen")
    patched_urlopen.return_value = fake_response

    # Without a filename the wheel is the artifact that gets picked.
    wheel_url, wheel_sha256 = sbom.fetch_package_metadata_from_pypi(
        project="pip",
        version="24.0",
    )

    patched_urlopen.assert_called_once_with("https://pypi.org/pypi/pip/24.0/json")
    assert wheel_url == "https://files.pythonhosted.org/packages/.../pip-24.0-py3-none-any.whl"
    assert wheel_sha256 == "ba0d021a166865d2265246961bec0152ff124de910c5cc39f1156ce3fa7c69dc"

    # Naming the sdist explicitly (not something we normally do)
    # selects it instead of the wheel.
    sdist_url, sdist_sha256 = sbom.fetch_package_metadata_from_pypi(
        project="pip",
        version="24.0",
        filename="pip-24.0.tar.gz"
    )

    assert sdist_url == "https://files.pythonhosted.org/packages/.../pip-24.0.tar.gz"
    assert sdist_sha256 == "ea9bd1a847e8c5774a5777bb398c19e80bcd4e2aa16a4b301b718fe6f593aba2"

0 comments on commit d29c9c3

Please sign in to comment.