From 6902c15c28bae6b2d795932531e57f125b641d89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Armin=20T=C3=A4nzer?= Date: Wed, 21 Jun 2023 13:43:34 +0200 Subject: [PATCH] Add SPDX generation using spdx-tools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is set up to produce the same output as the current spdx generation module while utilising the spdx-tools library. The goal is to replace the current module with this new one, which will allow easy migration to more SPDX formats as well as SPDXv3. Signed-off-by: Armin Tänzer --- requirements.txt | 1 + setup.cfg | 2 + tern/formats/spdx_new/__init__.py | 0 tern/formats/spdx_new/constants.py | 15 ++ tern/formats/spdx_new/file_helpers.py | 125 ++++++++++++ tern/formats/spdx_new/general_helpers.py | 127 ++++++++++++ tern/formats/spdx_new/image_helpers.py | 68 +++++++ tern/formats/spdx_new/layer_helpers.py | 189 ++++++++++++++++++ tern/formats/spdx_new/make_spdx_model.py | 113 +++++++++++ tern/formats/spdx_new/package_helpers.py | 168 ++++++++++++++++ tern/formats/spdx_new/spdxjson/__init__.py | 0 tern/formats/spdx_new/spdxjson/generator.py | 50 +++++ .../formats/spdx_new/spdxtagvalue/__init__.py | 0 .../spdx_new/spdxtagvalue/generator.py | 51 +++++ 14 files changed, 909 insertions(+) create mode 100644 tern/formats/spdx_new/__init__.py create mode 100644 tern/formats/spdx_new/constants.py create mode 100644 tern/formats/spdx_new/file_helpers.py create mode 100644 tern/formats/spdx_new/general_helpers.py create mode 100644 tern/formats/spdx_new/image_helpers.py create mode 100644 tern/formats/spdx_new/layer_helpers.py create mode 100644 tern/formats/spdx_new/make_spdx_model.py create mode 100644 tern/formats/spdx_new/package_helpers.py create mode 100644 tern/formats/spdx_new/spdxjson/__init__.py create mode 100644 tern/formats/spdx_new/spdxjson/generator.py create mode 100644 tern/formats/spdx_new/spdxtagvalue/__init__.py create mode 100644 
tern/formats/spdx_new/spdxtagvalue/generator.py diff --git a/requirements.txt b/requirements.txt index bb62be80..fb30b2a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ GitPython~=3.1 prettytable~=3.6 packageurl-python>=0.10.4 license-expression>=30.1 +spdx-tools>=0.8.0a3 diff --git a/setup.cfg b/setup.cfg index e5132f68..0090c3e5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,6 +52,8 @@ tern.formats = yaml = tern.formats.yaml.generator:YAML html = tern.formats.html.generator:HTML cyclonedxjson = tern.formats.cyclonedx.cyclonedxjson.generator:CycloneDXJSON + spdxjson_new = tern.formats.spdx_new.spdxjson.generator:SpdxJSON + spdxtagvalue_new = tern.formats.spdx_new.spdxtagvalue.generator:SpdxTagValue tern.extensions = cve_bin_tool = tern.extensions.cve_bin_tool.executor:CveBinTool scancode = tern.extensions.scancode.executor:Scancode diff --git a/tern/formats/spdx_new/__init__.py b/tern/formats/spdx_new/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tern/formats/spdx_new/constants.py b/tern/formats/spdx_new/constants.py new file mode 100644 index 00000000..b3901e63 --- /dev/null +++ b/tern/formats/spdx_new/constants.py @@ -0,0 +1,15 @@ +from spdx_tools.spdx.model import Version + +DOCUMENT_ID = 'SPDXRef-DOCUMENT' +DOCUMENT_NAME = 'Tern report for {image_name}' +SPDX_VERSION = 'SPDX-2.2' +DATA_LICENSE = 'CC0-1.0' +DOCUMENT_COMMENT = 'This document was generated by ' \ + 'the Tern Project: https://github.com/tern-tools/tern' +DOCUMENT_NAMESPACE = 'https://spdx.org/spdxdocs/tern-' \ + 'report-{version}-{image}-{uuid}' +LICENSE_LIST_VERSION = Version(3, 20) +CREATOR_NAME = 'tern-{version}' +DOCUMENT_NAME_SNAPSHOT = 'Tern SPDX JSON SBoM' # TODO: different name here that is not specific to JSON +DOCUMENT_NAMESPACE_SNAPSHOT = 'https://spdx.org/spdxdocs/tern-report-' \ + '{timestamp}-{uuid}' diff --git a/tern/formats/spdx_new/file_helpers.py b/tern/formats/spdx_new/file_helpers.py new file mode 100644 index 00000000..54caafda 
--- /dev/null
+++ b/tern/formats/spdx_new/file_helpers.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+File level helpers for SPDX document generator
+"""
+from datetime import datetime
+from typing import List
+
+from spdx_tools.spdx.model import File as SpdxFile, SpdxNone, SpdxNoAssertion, Checksum, ChecksumAlgorithm
+
+from tern.classes.file_data import FileData
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.template import Template
+from tern.formats.spdx_new.layer_helpers import get_layer_checksum
+from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_file_spdxref
+
+
+def get_layer_files_list(layer_obj: ImageLayer, template: Template, timestamp: datetime) -> List[SpdxFile]:
+    """Given a layer object and the SPDX template mapping, return a list
+    of SPDX Files, one for each unique file reference in the layer"""
+    spdx_files: List[SpdxFile] = []
+    file_refs = set()
+    layer_id = str(timestamp)  # we do not know the layer's id, so the timestamp is used instead
+    for filedata in layer_obj.files:
+        file_ref = get_file_spdxref(filedata, layer_id)
+        if file_ref not in file_refs:
+            spdx_files.append(get_file_dict(filedata, template, layer_id))
+            file_refs.add(file_ref)
+    return spdx_files
+
+
+def get_files_list(image_obj: Image, template: Template) -> List[SpdxFile]:
+    '''Given an image_obj object, and the SPDX template mapping, return a list
+    of SPDX Files, one for each unique file reference found in the analyzed
+    layers of the image. Layers whose files were not analyzed are skipped.'''
+    file_list: List[SpdxFile] = []
+
+    # use file refs to keep track of duplicate files that may be located
+    # in different places in the filesystem
+    file_refs = set()
+    for layer in image_obj.layers:
+        if layer.files_analyzed:
+            layer_checksum_value = get_layer_checksum(layer).value
+            for filedata in layer.files:
+                # we use the layer checksum as the layer id
+                file_ref = get_file_spdxref(filedata, layer_checksum_value)
+                if file_ref not in file_refs:
+                    file_list.append(get_file_dict(filedata, template, layer_checksum_value))
+                    file_refs.add(file_ref)
+    return file_list
+
+
+def get_file_dict(filedata: FileData, template: Template, layer_id: str) -> SpdxFile:
+    """Given a FileData object and its SPDX template mapping, return an
+    SPDX File for the file. A layer_id is used to
+    distinguish copies of the same file occurring in different places in the
+    image"""
+    mapping = filedata.to_dict(template)
+
+    if filedata.licenses:
+        license_info_in_file = []
+        for lic in sorted(set(filedata.licenses)):
+            # Add the license expression to the list if it is a valid SPDX
+            # identifier; otherwise, add the LicenseRef (sorted for stable output)
+            license_info_in_file.append(get_package_license_declared(lic))
+    else:
+        license_info_in_file = [SpdxNone()]
+
+    file_notice = get_file_notice(filedata)
+    file_comment = get_file_comment(filedata)
+    file_contributors = get_file_contributors(filedata)
+
+    return SpdxFile(
+        spdx_id=get_file_spdxref(filedata, layer_id),
+        name=mapping['FileName'],
+        checksums=[get_file_checksum(filedata)],
+        license_concluded=SpdxNoAssertion(),  # we don't provide this
+        copyright_text=SpdxNoAssertion(),  # we don't know this
+        file_types=[mapping['FileType']] if mapping['FileType'] else None,
+        license_info_in_file=license_info_in_file,
+        notice=file_notice if file_notice else None,
+        comment=file_comment if file_comment else None,
+        contributors=file_contributors if file_contributors else None,
+    )
+
+
+def get_file_checksum(filedata: FileData) -> Checksum:
+    """Given a FileData object, return the checksum required by SPDX.
+    Currently, the spec requires a SHA1 checksum"""
+    return Checksum(ChecksumAlgorithm.SHA1, filedata.get_checksum('sha1'))
+
+
+def get_file_notice(filedata: FileData) -> str:
+    """Return a formatted string with all copyrights found in a file. Return
+    an empty string if there are no copyrights"""
+    # one copyright per line; join once instead of re-building the
+    # string on every loop iteration
+    notice = ''.join(cp + '\n' for cp in filedata.copyrights)
+    return notice
+
+
+def get_file_comment(filedata: FileData) -> str:
+    """Return a formatted comment string with all file level notices. Return
+    an empty string if no notices are present"""
+    comment = ''
+    for origin in filedata.origins.origins:
+        comment = comment + f'{origin.origin_str}:' + '\n'
+        for notice in origin.notices:
+            comment = comment + \
+                f'{notice.level}: {notice.message}' + '\n'
+    return comment
+
+
+def get_file_contributors(filedata: FileData) -> List[str]:
+    """The SPDX spec allows for an optional list of file contributors.
+    If there are any authors found in the file, return a list of authors.
+    If empty, return an empty list"""
+    # build a new list so that callers never share (and mutate) the
+    # author list held by the FileData object
+    contributors: List[str] = list(filedata.authors)
+    return contributors
diff --git a/tern/formats/spdx_new/general_helpers.py b/tern/formats/spdx_new/general_helpers.py
new file mode 100644
index 00000000..7fc682aa
--- /dev/null
+++ b/tern/formats/spdx_new/general_helpers.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+General helpers for SPDX document generator
+"""
+import hashlib
+import io
+import re
+import uuid
+from datetime import datetime
+from typing import Union, Callable, IO, Tuple
+
+from license_expression import get_spdx_licensing, LicenseExpression, Licensing
+from spdx_tools.spdx.model import SpdxNone, Document
+
+from tern.classes.file_data import FileData
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.package import Package
+
+
+def get_uuid() -> str:
+    return str(uuid.uuid4())
+
+
+def get_current_timestamp() -> datetime:
+    return datetime.utcnow().replace(microsecond=0)
+
+
+def get_string_id(string: str) -> str:
+    """Return a short identifier (last 7 hex digits of the SHA-256) for the given string"""
+    return hashlib.sha256(string.encode('utf-8')).hexdigest()[-7:]
+
+
+def get_license_ref(license_string: str) -> str:
+    """For SPDX format, return a LicenseRef string"""
+    return 'LicenseRef-' + get_string_id(str(license_string))
+
+
+def replace_invalid_chars_in_license_expression(license_string: str) -> str:
+    """Given a license string, replace common invalid SPDX license characters."""
+    not_allowed = [',', ';', '/', '&']
+    if any(x in license_string for x in not_allowed):
+        # Try to replace common invalid license characters
+        license_string = license_string.replace(',', ' and')
+        license_string = license_string.replace('/', '-')
+        license_string = license_string.replace(';', '.')
+        license_string = license_string.replace('&', 'and')
+    return license_string
+
+
+def is_valid_license_expression(license_string: str) -> bool:
+    licensing = get_spdx_licensing()
+    try:
+        return licensing.validate(license_string).errors == []
+    # Catch any invalid license chars here
+    except AttributeError:
+        return False
+
+
+def get_package_license_declared(package_license_declared: str) -> Union[LicenseExpression, SpdxNone]:
+    """After substituting common invalid SPDX license characters using
+    replace_invalid_chars_in_license_expression(), determine if the declared
+    license string for a package or file is a valid SPDX license expression.
+    If it is valid after substitutions, return the parsed, updated expression.
+    If not, return the parsed LicenseRef of the original declared license
+    string passed in to the function. If a blank string is passed in, return `NONE`."""
+    if package_license_declared:
+        cleaned_license = replace_invalid_chars_in_license_expression(package_license_declared)
+        if is_valid_license_expression(cleaned_license):
+            return Licensing().parse(cleaned_license)
+
+        return Licensing().parse(get_license_ref(package_license_declared))
+    return SpdxNone()
+
+
+def get_serialized_document_string(spdx_document: Document, writer_function: Callable[[Document, IO[str]], str]) -> str:
+    with io.StringIO() as stream:
+        writer_function(spdx_document, stream, validate=False)
+        return stream.getvalue()
+
+
+###########################################################################################
+# central place for SPDXRef-generators to avoid circular imports as these are widely used #
+###########################################################################################
+
+def get_image_spdxref(image_obj: Image) -> str:
+    """Given the image object, return an SPDX reference ID"""
+    # here we return the image name, tag and id
+    return f'SPDXRef-{image_obj.get_human_readable_id()}'
+
+
+def get_package_spdxref(package_obj: Package) -> Tuple[str, str]:
+    """Given the package obj, return an SPDX reference ID for the binary
+    and source package, if available (the source ref is '' when there is none)"""
+    pkg_ref = f"{package_obj.name}-{package_obj.version}"
+    src_ref = ''
+    if package_obj.src_name:
+        # differentiate between binary and source package refs
+        src_ver = package_obj.src_version + "-src"
+        src_ref = f"{package_obj.src_name}-{src_ver}"
+    # replace all the strings that SPDX doesn't like
+    # allowed characters are: letters, numbers, "." and "-"
+    clean_pkg_ref = re.sub(r'[:+~_/]', r'-', pkg_ref)
+    if src_ref:
+        clean_src_ref = re.sub(r'[:+~_/]', r'-', src_ref)
+        return f'SPDXRef-{clean_pkg_ref}', f'SPDXRef-{clean_src_ref}'
+    return f'SPDXRef-{clean_pkg_ref}', ''
+
+
+def get_layer_spdxref(layer_obj: ImageLayer) -> str:
+    """Given the layer object, return an SPDX reference ID"""
+    # here we return the shortened diff_id of the layer
+    return f'SPDXRef-{layer_obj.diff_id[:10]}'
+
+
+def get_file_spdxref(filedata: FileData, layer_id: str) -> str:
+    """Given a FileData object, return an identifier for the SPDX
+    document. According to the spec, this should be of the form: SPDXRef-[idstring]
+    We will use a combination of the file path, checksum and layer_id and
+    calculate a hash of this string"""
+    file_string = filedata.path + filedata.checksum[:7] + layer_id
+    fileid = get_string_id(file_string)
+    return f'SPDXRef-{fileid}'
diff --git a/tern/formats/spdx_new/image_helpers.py b/tern/formats/spdx_new/image_helpers.py
new file mode 100644
index 00000000..7bacb8bb
--- /dev/null
+++ b/tern/formats/spdx_new/image_helpers.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Image level helpers for SPDX document generator
+Images for SPDX act like a Package
+"""
+from typing import List
+
+from spdx_tools.spdx.model import ExtractedLicensingInfo, Package as SpdxPackage, \
+    SpdxNoAssertion
+
+from tern.classes.image import Image
+from tern.classes.template import Template
+from tern.formats.spdx_new.layer_helpers import get_layer_licenses
+from tern.formats.spdx_new.general_helpers import get_license_ref, get_uuid, is_valid_license_expression, \
+    get_image_spdxref
+from tern.utils.general import get_git_rev_or_version
+
+
+def get_image_extracted_licenses(image_obj: Image) -> List[ExtractedLicensingInfo]:
+    """Given an image_obj, return a unique list of extractedLicensingInfo
+    that contains all the file and package LicenseRef and their corresponding plain text."""
+
+    unique_licenses = set()
+    for layer in image_obj.layers:
+        # Get all of the unique file licenses, if they exist
+        unique_licenses.update(get_layer_licenses(layer))
+        # Next, collect any package licenses not already accounted for
+        for package in layer.packages:
+            if package.pkg_license:
+                unique_licenses.add(package.pkg_license)
+            # Add debian licenses from copyright text as one license
+            if package.pkg_licenses:
+                unique_licenses.add(", ".join(package.pkg_licenses))
+    extracted_licensing_info = []
+    for lic in list(unique_licenses):
+        valid_spdx = is_valid_license_expression(lic)
+        if not valid_spdx:
+            extracted_licensing_info.append(ExtractedLicensingInfo(license_id=get_license_ref(lic), extracted_text=lic))
+
+    return extracted_licensing_info
+
+
+def get_image_dict(image_obj: Image, template: Template) -> SpdxPackage:  # TODO: these kind of functions don't produce dicts anymore, rename them
+    """Given an image object and the template object for SPDX, return the
+    SPDX Package for the given image."""
+    mapping = image_obj.to_dict(template)
+    return SpdxPackage(
+        spdx_id=get_image_spdxref(image_obj),
+        name=mapping["PackageName"],
+        download_location=SpdxNoAssertion(),
+        version=mapping["PackageVersion"],
+        supplier=SpdxNoAssertion(),
+        files_analyzed=False,
+        license_concluded=SpdxNoAssertion(),
+        license_declared=SpdxNoAssertion(),
+        copyright_text=SpdxNoAssertion(),
+    )
+
+
+def get_document_namespace(image_obj: Image) -> str:
+    """Given the image object, return a unique SPDX document uri.
+    This is a combination of the tool name and version, the image name
+    and the uuid"""
+    return f'https://spdx.org/spdxdocs/tern-report-{get_git_rev_or_version()[1]}-{image_obj.name}-{get_uuid()}'
diff --git a/tern/formats/spdx_new/layer_helpers.py b/tern/formats/spdx_new/layer_helpers.py
new file mode 100644
index 00000000..832688ab
--- /dev/null
+++ b/tern/formats/spdx_new/layer_helpers.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Layer level helpers for SPDX document generator
+Layers for SPDX act like a Package
+"""
+import hashlib
+import logging
+import os
+from typing import List, Optional, Tuple
+
+from license_expression import Licensing
+
+from spdx_tools.spdx.model import Package as SpdxPackage, SpdxNoAssertion, SpdxNone, PackageVerificationCode, Checksum, \
+    ChecksumAlgorithm, Relationship, RelationshipType, ExtractedLicensingInfo
+
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.formats.spdx_new.general_helpers import get_license_ref, is_valid_license_expression, \
+    get_image_spdxref, get_package_spdxref, get_layer_spdxref, get_file_spdxref
+from tern.formats.spdx_new.constants import DOCUMENT_ID
+from tern.utils import constants
+from tern.report import content
+
+
+# global logger
+logger = logging.getLogger(constants.logger_name)
+
+
+def get_layer_extracted_licenses(layer_obj: ImageLayer) -> List[ExtractedLicensingInfo]:
+    """Given a layer object, return a unique list of ExtractedLicensingInfo
+    that contains all the file and package LicenseRef and the corresponding plain text."""
+
+    # Get all of the unique file licenses, if they exist
+    unique_licenses = set(get_layer_licenses(layer_obj))
+    # Next, collect any package licenses not already accounted for
+    for package in layer_obj.packages:
+        # deb packages declare their joined copyright licenses, so collect that form as well
+        unique_licenses.update(filter(None, (package.pkg_license, ", ".join(package.pkg_licenses or []))))
+    extracted_texts = []
+    for lic in list(unique_licenses):
+        valid_spdx = is_valid_license_expression(lic)
+        if not valid_spdx:
+            extracted_texts.append(ExtractedLicensingInfo(license_id=get_license_ref(lic), extracted_text=lic))
+    return extracted_texts
+
+
+def get_image_layer_relationships(image_obj: Image) -> List[Relationship]:
+    """Given an image object, return a list of SPDX Relationships describing
+    the relationship between each layer "package" and the image and packages
+    related to it. The first entry is the DOCUMENT DESCRIBES image relationship."""
+    layer_relationships = []
+    image_ref = get_image_spdxref(image_obj)
+
+    # Required - DOCUMENT_DESCRIBES relationship
+    layer_relationships.append(Relationship(DOCUMENT_ID, RelationshipType.DESCRIBES, image_ref))
+
+    for index, layer in enumerate(image_obj.layers):
+        layer_ref = get_layer_spdxref(layer)
+        # First, add the relationship of the layer to the image
+        layer_relationships.append(Relationship(image_ref, RelationshipType.CONTAINS, layer_ref))
+        # Next, add the relationship of the layer to the previous layer
+        if index != 0:
+            prev_layer_ref = get_layer_spdxref(image_obj.layers[index - 1])
+            layer_relationships.append(Relationship(prev_layer_ref, RelationshipType.HAS_PREREQUISITE, layer_ref))
+        # Finally, add package relationships for the layer
+        if layer.packages:
+            for package in layer.packages:
+                pkg_ref, src_ref = get_package_spdxref(package)
+                layer_relationships.append(Relationship(layer_ref, RelationshipType.CONTAINS, pkg_ref))
+                if src_ref:
+                    layer_relationships.append(Relationship(pkg_ref, RelationshipType.GENERATED_FROM, src_ref))
+
+    return layer_relationships
+
+
+def get_layer_file_data_list(layer_obj: ImageLayer) -> List[str]:
+    """Given a layer object return the SPDX list of file refs in the layer.
+    Return an empty list if the files are not analyzed"""
+    file_refs = []
+    if layer_obj.files_analyzed:
+        layer_checksum = get_layer_checksum(layer_obj)
+        file_refs = [get_file_spdxref(filedata, layer_checksum.value) for filedata in layer_obj.files]
+    # some files are located in different places in the filesystem; we drop
+    # the duplicates and sort so the result is the same on every run
+    return sorted(set(file_refs))
+
+
+def get_layer_package_comment(layer_obj: ImageLayer) -> str:
+    """Return a package comment string value for a list of NoticeOrigin
+    objects for the given layer object"""
+    comment = ''
+    if "headers" in layer_obj.extension_info.keys():
+        for header in layer_obj.extension_info.get("headers"):
+            comment += header
+            comment += '\n'
+    if not layer_obj.origins.is_empty():
+        for notice_origin in layer_obj.origins.origins:
+            comment += content.print_notices(notice_origin, '', '\t')
+    return comment
+
+
+def get_layer_dict(layer_obj: ImageLayer) -> Tuple[SpdxPackage, List[Relationship]]:
+    """Given a layer object, return an SPDX Package representation
+    of the layer and the list of CONTAINS relationships to all files in that layer.
+    The analyzed files will go in a separate part of the document."""
+
+    comment = get_layer_package_comment(layer_obj)
+    verification_code = get_layer_verification_code(layer_obj)
+
+    layer_licenses = get_layer_licenses(layer_obj)
+    license_info_from_files = []
+    if layer_licenses:
+        # Use the layer LicenseRef in the list instead of license expression
+        for lic in layer_licenses:
+            license_info_from_files.append(get_license_ref(lic))  # TODO: potential bug here that converts valid expressions to LicenseRef- identifiers
+        license_info_from_files = [Licensing().parse(lic) for lic in license_info_from_files]
+
+    layer_spdx_id = get_layer_spdxref(layer_obj)
+    package = SpdxPackage(
+        spdx_id=layer_spdx_id,
+        name=os.path.basename(layer_obj.tar_file),
+        version=layer_obj.layer_index,
+        supplier=SpdxNoAssertion(),
+        file_name=layer_obj.tar_file,
+        download_location=SpdxNone(),
+        files_analyzed=bool(layer_obj.files_analyzed),
+        verification_code=verification_code if bool(layer_obj.files_analyzed) else None,
+        checksums=[get_layer_checksum(layer_obj)],
+        license_concluded=SpdxNoAssertion(),
+        license_declared=SpdxNoAssertion(),
+        copyright_text=SpdxNoAssertion(),
+        comment=comment if comment else None,
+        license_info_from_files=license_info_from_files,
+    )
+
+    relationships = [
+        Relationship(layer_spdx_id, RelationshipType.CONTAINS, file_ref)
+        for file_ref in get_layer_file_data_list(layer_obj)
+    ]
+
+    return package, relationships
+
+
+def get_layer_licenses(layer_obj: ImageLayer) -> List[str]:
+    """Return a list of unique licenses from the files analyzed
+    in the layer object. It is assumed that the files were analyzed and
+    there should be some license expressions. If there are not, an empty list
+    is returned"""
+    licenses = set()
+    for filedata in layer_obj.files:
+        # we will use the SPDX license expressions here as they will be
+        # valid SPDX license identifiers
+        if filedata.licenses:
+            for lic in list(set(filedata.licenses)):
+                licenses.add(lic)
+    return list(licenses)
+
+
+def get_layer_verification_code(layer_obj: ImageLayer) -> Optional[PackageVerificationCode]:
+    """Calculate the verification code from the files in an image layer. This
+    assumes that layer_obj.files_analyzed is True. The implementation follows
+    the algorithm in the SPDX spec v 2.2 which requires SHA1 to be used to
+    calculate the checksums of the file and the final verification code"""
+    sha1_list = []
+    for filedata in layer_obj.files:
+        filesha = filedata.get_checksum('sha1')
+        if not filesha:
+            # we cannot create a verification code, hence file generation
+            # is aborted
+            logger.critical(
+                'File %s does not have a sha1 checksum. Failed to generate '
+                'an SPDX report', filedata.path)
+            return None
+        sha1_list.append(filesha)
+    sha1_list.sort()
+    sha1s = ''.join(sha1_list)
+    verification_code = hashlib.sha1(sha1s.encode('utf-8')).hexdigest()  # nosec
+    return PackageVerificationCode(verification_code)
+
+
+def get_layer_checksum(layer_obj: ImageLayer) -> Checksum:
+    return Checksum(
+        ChecksumAlgorithm[layer_obj.checksum_type.upper()],
+        layer_obj.checksum
+    )
diff --git a/tern/formats/spdx_new/make_spdx_model.py b/tern/formats/spdx_new/make_spdx_model.py
new file mode 100644
index 00000000..188abd10
--- /dev/null
+++ b/tern/formats/spdx_new/make_spdx_model.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Common functions that are useful for all SPDX serialization formats
+"""
+
+import logging
+from typing import List
+
+from spdx_tools.spdx.model import Document, CreationInfo, Actor, ActorType, Relationship, RelationshipType
+
+from tern.classes.image_layer import ImageLayer
+from tern.classes.template import Template
+from tern.formats.spdx_new.constants import DOCUMENT_ID, DOCUMENT_NAME, SPDX_VERSION, DATA_LICENSE, DOCUMENT_COMMENT, \
+    LICENSE_LIST_VERSION, CREATOR_NAME, DOCUMENT_NAME_SNAPSHOT, DOCUMENT_NAMESPACE_SNAPSHOT
+from tern.formats.spdx_new.file_helpers import get_layer_files_list
+from tern.formats.spdx_new.general_helpers import get_current_timestamp, get_uuid, get_image_spdxref
+from tern.classes.image import Image
+from tern.formats.spdx.spdx import SPDX
+from tern.formats.spdx_new.file_helpers import get_files_list
+from tern.formats.spdx_new.image_helpers import get_image_extracted_licenses, \
+    get_image_dict, get_document_namespace
+from tern.formats.spdx_new.layer_helpers import get_layer_dict, get_image_layer_relationships, get_layer_extracted_licenses
+from tern.formats.spdx_new.package_helpers import get_packages_list, get_layer_packages_list
+from tern.utils import constants
+
+from tern.utils.general import get_git_rev_or_version
+
+# global logger
+logger = logging.getLogger(constants.logger_name)
+
+
+def make_spdx_model(image_obj_list: List[Image]) -> Document:
+    """Build the SPDX Document for an analyzed image. We still don't know how
+    SPDX documents could represent multiple images. Hence, only the first
+    image is used and the input is assumed to be a list of length 1"""
+    template = SPDX()
+    image_obj = image_obj_list[0]
+
+    creation_info = CreationInfo(
+        spdx_version=SPDX_VERSION,
+        spdx_id=DOCUMENT_ID,
+        name=DOCUMENT_NAME.format(image_name=image_obj.name),
+        document_namespace=get_document_namespace(image_obj),
+        creators=[Actor(actor_type=ActorType.TOOL, name=CREATOR_NAME.format(version=get_git_rev_or_version()[1]))],
+        created=get_current_timestamp(),
+        license_list_version=LICENSE_LIST_VERSION,
+        data_license=DATA_LICENSE,
+        document_comment=DOCUMENT_COMMENT,
+    )
+    describes_relationship = Relationship(DOCUMENT_ID, RelationshipType.DESCRIBES, get_image_spdxref(image_obj))
+    packages = [get_image_dict(image_obj, template)]
+    image_layer_relationships = get_image_layer_relationships(image_obj)
+
+    layer_file_relationships = []
+    for layer in image_obj.layers:
+        package, relationships = get_layer_dict(layer)
+        packages.append(package)
+        layer_file_relationships.extend(relationships)
+
+    packages.extend(get_packages_list(image_obj, template))
+    files = get_files_list(image_obj, template)
+    extracted_licensing_info = get_image_extracted_licenses(image_obj)
+
+    return Document(
+        creation_info=creation_info,
+        packages=packages,
+        files=files,
+        relationships=[describes_relationship] + image_layer_relationships + layer_file_relationships,
+        extracted_licensing_info=extracted_licensing_info
+    )
+
+
+def make_spdx_model_snapshot(layer_obj: ImageLayer, template: Template) -> Document:
+    """This is the SPDX document containing just the packages found at
+    container build time"""
+    timestamp = get_current_timestamp()
+
+    creation_info = CreationInfo(
+        spdx_version=SPDX_VERSION,
+        spdx_id=DOCUMENT_ID,
+        name=DOCUMENT_NAME_SNAPSHOT,
+        document_namespace=DOCUMENT_NAMESPACE_SNAPSHOT.format(timestamp=timestamp.strftime('%Y-%m-%dT%H:%M:%SZ'), uuid=get_uuid()),
+        creators=[Actor(actor_type=ActorType.TOOL, name=CREATOR_NAME.format(version=get_git_rev_or_version()[1]))],
+        created=timestamp,
+        license_list_version=LICENSE_LIST_VERSION,
+        data_license=DATA_LICENSE,
+        document_comment=DOCUMENT_COMMENT,
+    )
+
+    # Add list of SPDX packages to packages list, if they exist
+    packages = get_layer_packages_list(layer_obj, template)
+    describes_relationships = [
+        Relationship(DOCUMENT_ID, RelationshipType.DESCRIBES, package.spdx_id)
+        for package in packages
+    ]
+
+    # Add list of SPDX files, if they exist
+    files = get_layer_files_list(layer_obj, template, timestamp)
+
+    # Add package and file extracted license texts, if they exist
+    extracted_licensing_info = get_layer_extracted_licenses(layer_obj)
+
+    return Document(
+        creation_info=creation_info,
+        packages=packages,
+        files=files,
+        relationships=describes_relationships,
+        extracted_licensing_info=extracted_licensing_info
+    )
diff --git a/tern/formats/spdx_new/package_helpers.py b/tern/formats/spdx_new/package_helpers.py
new file mode 100644
index 00000000..f76fd677
--- /dev/null
+++ b/tern/formats/spdx_new/package_helpers.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Package level helpers for SPDX document generator
+"""
+from typing import List
+
+from packageurl import PackageURL
+from spdx_tools.spdx.model import Package as SpdxPackage, SpdxNoAssertion, SpdxNone, Actor, ActorType, \
+    ExternalPackageRef, ExternalPackageRefCategory
+
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.package import Package
+from tern.classes.template import Template
+from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_package_spdxref
+from tern.report import content
+
+
+SOURCE_PACKAGE_COMMENT = 'This package refers to a source package associated' \
+    ' with one or more binary packages installed in this container. ' \
+    'This source pacakge is NOT installed in the container but may be useful' \
+    ' for CVE lookups.'
+
+
+def get_layer_packages_list(layer: ImageLayer, template: Template) -> List[SpdxPackage]:
+    """Given a layer object and an SPDX template object, return a list
+    of SPDX Packages, one for each unique package reference in the layer.
+    Source package references are not included."""
+    package_dicts = []
+    package_refs = set()
+    for package in layer.packages:
+        # Create a list of SpdxPackages, each one representing
+        # one package object in the image
+        pkg_ref, _ = get_package_spdxref(package)  # TODO: Is it ok to ignore source_package_ref here?
+        if pkg_ref not in package_refs:
+            package_dicts.append(get_package_dict(package, template))
+            package_refs.add(pkg_ref)
+    return package_dicts
+
+
+def get_package_comment(package: Package) -> str:
+    """Given a package object, return a PackageComment string for a list of
+    NoticeOrigin objects"""
+    comment = ''
+    if package.origins.origins:
+        for notice_origin in package.origins.origins:
+            comment = comment + content.print_notices(
+                notice_origin, '', '\t')
+    return comment
+
+
+def get_source_package_dict(package: Package, template: Template) -> SpdxPackage:
+    """Given a package object and its SPDX template mapping, return an SPDX Package of the associated source package.
+    The package is marked as files_analyzed=False and carries SOURCE_PACKAGE_COMMENT as its comment."""
+    mapping = package.to_dict(template)
+
+    _, src_ref = get_package_spdxref(package)
+    declared_lic = mapping['PackageLicenseDeclared']
+    # Define debian licenses from copyright text as one license
+    if package.pkg_format == 'deb':
+        declared_lic = ', '.join(package.pkg_licenses)
+
+    return SpdxPackage(
+        spdx_id=src_ref,
+        name=mapping['SourcePackageName'],
+        version=mapping['SourcePackageVersion'] if mapping['SourcePackageVersion'] else 'NOASSERTION',
+        download_location=mapping['PackageDownloadLocation'] if mapping['PackageDownloadLocation'] else SpdxNoAssertion(),
+        files_analyzed=False,
+        license_concluded=SpdxNoAssertion(),
+        license_declared=get_package_license_declared(declared_lic),
+        copyright_text=mapping['PackageCopyrightText'] if mapping['PackageCopyrightText'] else SpdxNone(),
+        comment=SOURCE_PACKAGE_COMMENT,
+    )
+
+
+def get_package_dict(package: Package, template: Template) -> SpdxPackage:
+    """Given a package object and its SPDX template mapping, return an SPDX Package.
+    A purl external reference is attached when get_purl() can build one for the package."""
+    mapping = package.to_dict(template)
+
+    if mapping['PackageSupplier']:
+        supplier = Actor(ActorType.ORGANIZATION, mapping['PackageSupplier'])
+    else:
+        supplier = SpdxNoAssertion()
+
+    external_ref = []
+    purl = get_purl(package)  # build the purl once and reuse it
+    if purl:
+        external_ref.append(ExternalPackageRef(
+            ExternalPackageRefCategory.PACKAGE_MANAGER,
+            "purl", purl
+        ))
+
+    pkg_ref, _ = get_package_spdxref(package)
+    # Define debian licenses from copyright text as one license
+    declared_lic = mapping['PackageLicenseDeclared']
+    if package.pkg_format == 'deb':
+        declared_lic = ', '.join(package.pkg_licenses)
+
+    return SpdxPackage(
+        spdx_id=pkg_ref,
+        name=mapping['PackageName'],
+        version=mapping['PackageVersion'] if mapping['PackageVersion'] else 'NOASSERTION',
+        supplier=supplier,
+        download_location=mapping['PackageDownloadLocation'] if mapping['PackageDownloadLocation'] else SpdxNoAssertion(),
+        files_analyzed=False,
+        license_concluded=SpdxNoAssertion(),
+        license_declared=get_package_license_declared(declared_lic),
+        copyright_text=mapping['PackageCopyrightText'] if mapping['PackageCopyrightText'] else SpdxNone(),
+        external_references=external_ref,
+        comment=get_package_comment(package),
+    )
+
+
+def get_packages_list(image_obj: Image, template: Template) -> List[SpdxPackage]:
+    """Given an image object and the template object for SPDX, return a list
+    of SPDX Packages for each of the unique packages in the image, followed
+    by their source packages where available. The SPDX spec for packages requires:
+        name
+        versionInfo
+        downloadLocation"""
+    packages = []
+    package_refs = set()
+
+    for layer in image_obj.layers:
+        for package in layer.packages:
+            # Create a list of SPDX Packages. Each one represents
+            # one package object in the image
+            pkg_ref, src_ref = get_package_spdxref(package)
+            if pkg_ref not in package_refs and package.name:
+                packages.append(get_package_dict(package, template))
+                package_refs.add(pkg_ref)
+            if src_ref and src_ref not in package_refs:
+                packages.append(get_source_package_dict(
+                    package, template))
+                package_refs.add(src_ref)
+    return packages
+
+
+purl_types_with_namespaces = [
+    'deb',
+    'rpm',
+    'apk',
+    'alpm'
+]
+
+
+def get_purl(package_obj: Package) -> str:
+    """Return a purl string for a given package, or '' if no valid purl can be built"""
+    purl_type = package_obj.pkg_format
+    purl_namespace = ''
+    if purl_type in purl_types_with_namespaces and package_obj.pkg_supplier:
+        # https://github.com/package-url/purl-spec/pull/214
+        supplier_words = package_obj.pkg_supplier.split(' ')
+        # "VMware [distro]" uses the distro word; a bare "VMware" has no second word to use
+        use_second = supplier_words[0] == "VMware" and len(supplier_words) > 1
+        purl_namespace = supplier_words[1 if use_second else 0].lower()
+    # TODO- this might need adjusting for alpm. Currently can't test on M1
+    try:
+        purl = PackageURL(purl_type, purl_namespace, package_obj.name.lower(), package_obj.version,
+                          qualifiers={'arch': package_obj.arch if package_obj.arch else ''})
+        return purl.to_string()
+    except ValueError:
+        return ''
diff --git a/tern/formats/spdx_new/spdxjson/__init__.py b/tern/formats/spdx_new/spdxjson/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tern/formats/spdx_new/spdxjson/generator.py b/tern/formats/spdx_new/spdxjson/generator.py
new file mode 100644
index 00000000..1ab8a6a5
--- /dev/null
+++ b/tern/formats/spdx_new/spdxjson/generator.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
# SPDX-License-Identifier: BSD-2-Clause

"""
SPDX JSON document generator
"""
import logging
from typing import List

from spdx_tools.spdx.model import Document
from spdx_tools.spdx.writer.json.json_writer import write_document_to_stream

from tern.classes.image import Image
from tern.classes.image_layer import ImageLayer
from tern.formats import generator
from tern.formats.spdx.spdx import SPDX
from tern.formats.spdx_new.general_helpers import get_serialized_document_string
from tern.formats.spdx_new.make_spdx_model import make_spdx_model, make_spdx_model_snapshot
from tern.utils import constants

# global logger
logger = logging.getLogger(constants.logger_name)


class SpdxJSON(generator.Generate):
    """Report generator that renders the SPDX model with the JSON writer."""

    @staticmethod
    def _to_json(document: Document) -> str:
        """Serialize an SPDX document using the JSON stream writer."""
        return get_serialized_document_string(document, write_document_to_stream)

    def generate(self, image_obj_list: List[Image], print_inclusive=False) -> str:
        """Build the SPDX model for the given images and return it as a
        JSON string.
        WARNING: The list is expected to hold either a single image, or the
        base image followed by a stub image. The stub image carries no
        metadata because nothing got built, so its information does not
        apply to the SPDX document.

        In SPDX terms the image is a 'Package' that 'CONTAINS' its layers,
        each layer being a 'Package' that 'CONTAINS' the real Packages.
        print_inclusive is accepted but not used by this method."""
        logger.debug("Generating SPDX JSON document...")
        return self._to_json(make_spdx_model(image_obj_list))

    def generate_layer(self, layer: ImageLayer) -> str:
        """Build the SPDX snapshot model for one layer, holding package and
        file information at container build time, and return it as a JSON
        string"""
        logger.debug("Generating SPDX JSON snapshot document...")
        snapshot: Document = make_spdx_model_snapshot(layer, SPDX())
        return self._to_json(snapshot)


# ---- patch boundary: tern/formats/spdx_new/spdxtagvalue/__init__.py (new, empty file) ----
# ---- patch boundary: tern/formats/spdx_new/spdxtagvalue/generator.py (new file) ----
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
# SPDX-License-Identifier: BSD-2-Clause

"""
SPDX Tag-Value document generator
"""

import logging
from typing import List

from spdx_tools.spdx.model import Document
from spdx_tools.spdx.writer.tagvalue.tagvalue_writer import write_document_to_stream

from tern.classes.image import Image
from tern.classes.image_layer import ImageLayer
from tern.formats import generator
from tern.formats.spdx.spdx import SPDX
from tern.formats.spdx_new.general_helpers import get_serialized_document_string
from tern.formats.spdx_new.make_spdx_model import make_spdx_model, make_spdx_model_snapshot
from tern.utils import constants

# global logger
logger = logging.getLogger(constants.logger_name)


class SpdxTagValue(generator.Generate):
    """Report generator that renders the SPDX model with the Tag-Value
    writer."""

    @staticmethod
    def _to_tag_value(document: Document) -> str:
        """Serialize an SPDX document using the Tag-Value stream writer."""
        return get_serialized_document_string(document, write_document_to_stream)

    def generate(self, image_obj_list: List[Image], print_inclusive=False) -> str:
        """Build the SPDX model for the given images and return it as a
        Tag-Value string.
        WARNING: The list is expected to hold either a single image, or the
        base image followed by a stub image. The stub image carries no
        metadata because nothing got built, so its information does not
        apply to the SPDX document.

        In SPDX terms the image is a 'Package' that 'CONTAINS' its layers,
        each layer being a 'Package' that 'CONTAINS' the real Packages.
        print_inclusive is accepted but not used by this method."""
        logger.debug("Generating SPDX Tag-Value document...")
        return self._to_tag_value(make_spdx_model(image_obj_list))

    def generate_layer(self, layer: ImageLayer) -> str:
        """Build the SPDX snapshot model for one layer, holding package and
        file information at container build time, and return it as a
        Tag-Value string"""
        logger.debug("Generating SPDX Tag-Value snapshot document...")
        snapshot: Document = make_spdx_model_snapshot(layer, SPDX())
        return self._to_tag_value(snapshot)