-
Notifications
You must be signed in to change notification settings - Fork 186
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add SPDX generation using spdx-tools
This is set up to produce the same output as the current spdx generation module while utilising the spdx-tools library. The goal is to replace the current module with this new one, which will allow easy migration to more SPDX formats as well as SPDXv3. Signed-off-by: Armin Tänzer <[email protected]>
- Loading branch information
1 parent
62507ed
commit 6902c15
Showing
14 changed files
with
909 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,4 +18,5 @@ GitPython~=3.1 | |
prettytable~=3.6 | ||
packageurl-python>=0.10.4 | ||
license-expression>=30.1 | ||
spdx-tools>=0.8.0a3 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from spdx_tools.spdx.model import Version | ||
|
||
# Fixed values shared by every generated SPDX document
DOCUMENT_ID = 'SPDXRef-DOCUMENT'
DOCUMENT_NAME = 'Tern report for {image_name}'
SPDX_VERSION = 'SPDX-2.2'
DATA_LICENSE = 'CC0-1.0'
DOCUMENT_COMMENT = (
    'This document was generated by '
    'the Tern Project: https://github.com/tern-tools/tern'
)
# Template with str.format-style placeholders: version, image and uuid
DOCUMENT_NAMESPACE = (
    'https://spdx.org/spdxdocs/tern-'
    'report-{version}-{image}-{uuid}'
)
LICENSE_LIST_VERSION = Version(3, 20)
CREATOR_NAME = 'tern-{version}'
# TODO: different name here that is not specific to JSON
DOCUMENT_NAME_SNAPSHOT = 'Tern SPDX JSON SBoM'
# Template with str.format-style placeholders: timestamp and uuid
DOCUMENT_NAMESPACE_SNAPSHOT = (
    'https://spdx.org/spdxdocs/tern-report-'
    '{timestamp}-{uuid}'
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (c) 2021 VMware, Inc. All Rights Reserved. | ||
# SPDX-License-Identifier: BSD-2-Clause | ||
|
||
""" | ||
File level helpers for SPDX document generator | ||
""" | ||
from datetime import datetime | ||
from typing import List | ||
|
||
from spdx_tools.spdx.model import File as SpdxFile, SpdxNone, SpdxNoAssertion, Checksum, ChecksumAlgorithm | ||
|
||
from tern.classes.file_data import FileData | ||
from tern.classes.image import Image | ||
from tern.classes.image_layer import ImageLayer | ||
from tern.classes.template import Template | ||
from tern.formats.spdx_new.layer_helpers import get_layer_checksum | ||
from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_file_spdxref | ||
|
||
|
||
def get_layer_files_list(layer_obj: ImageLayer, template: Template, timestamp: datetime) -> List[SpdxFile]:
    """Build the SPDX File entries for the files of a single layer.

    The layer's id is not known here, so the string form of ``timestamp``
    is used in its place when generating file references. A file whose
    SPDX reference has already been produced is skipped, so each reference
    appears at most once in the returned list."""
    # we do not know the layer's id, so the timestamp stands in for it
    stand_in_layer_id = str(timestamp)
    seen_refs = set()
    layer_files: List[SpdxFile] = []
    for entry in layer_obj.files:
        ref = get_file_spdxref(entry, stand_in_layer_id)
        if ref in seen_refs:
            continue
        layer_files.append(get_file_dict(entry, template, stand_in_layer_id))
        seen_refs.add(ref)
    return layer_files
|
||
|
||
def get_files_list(image_obj: Image, template: Template) -> List[SpdxFile]:
    """Build the SPDX File entries for every analyzed layer of an image.

    Layers whose ``files_analyzed`` flag is falsy are skipped. Within the
    remaining layers, the layer checksum value serves as the layer id when
    generating file references, and a file is only added the first time
    its reference is seen."""
    collected: List[SpdxFile] = []
    # references already emitted; guards against duplicate files that may
    # be located in different places in the filesystem
    seen_refs = set()

    for layer in image_obj.layers:
        if not layer.files_analyzed:
            continue
        # the layer checksum doubles as the layer id
        layer_id = get_layer_checksum(layer).value
        for entry in layer.files:
            ref = get_file_spdxref(entry, layer_id)
            if ref in seen_refs:
                continue
            collected.append(get_file_dict(entry, template, layer_id))
            seen_refs.add(ref)
    return collected
|
||
|
||
def get_file_dict(filedata: FileData, template: Template, layer_id: str) -> SpdxFile:
    """Convert a FileData object into its SPDX File representation.

    ``template`` supplies the mapping used to look up the file name and
    file type. ``layer_id`` is folded into the SPDX reference so that
    copies of the same file occurring in different places in the image
    get distinct ids. Concluded license and copyright text are always
    NOASSERTION; notice, comment and contributors are only set when they
    are non-empty."""
    mapping = filedata.to_dict(template)

    if not filedata.licenses:
        licenses_found = [SpdxNone()]
    else:
        # each distinct license becomes a parsed SPDX expression when it
        # is valid, and a LicenseRef otherwise
        licenses_found = [
            get_package_license_declared(lic)
            for lic in set(filedata.licenses)
        ]

    notice_text = get_file_notice(filedata)
    comment_text = get_file_comment(filedata)
    authors = get_file_contributors(filedata)

    return SpdxFile(
        spdx_id=get_file_spdxref(filedata, layer_id),
        name=mapping['FileName'],
        checksums=[get_file_checksum(filedata)],
        license_concluded=SpdxNoAssertion(),  # we don't provide this
        copyright_text=SpdxNoAssertion(),  # we don't know this
        file_types=[mapping['FileType']] if mapping['FileType'] else None,
        license_info_in_file=licenses_found,
        # empty values are passed on as None
        notice=notice_text or None,
        comment=comment_text or None,
        contributors=authors or None,
    )
|
||
|
||
def get_file_checksum(filedata: FileData) -> Checksum:
    """Return the file's SHA1 checksum wrapped in an SPDX Checksum object.
    SHA1 is used because the spec currently requires it."""
    algorithm = ChecksumAlgorithm.SHA1
    sha1_value = filedata.get_checksum('sha1')
    return Checksum(algorithm, sha1_value)
|
||
|
||
def get_file_notice(filedata: FileData) -> str:
    """Join all copyrights found in a file into one string, each followed
    by a newline. The result is an empty string when there are none."""
    return ''.join(cp + '\n' for cp in filedata.copyrights)
|
||
|
||
def get_file_comment(filedata: FileData) -> str:
    """Format all file level notices as a comment string.

    Each origin contributes a ``<origin_str>:`` line followed by one
    ``<level>: <message>`` line per notice; every line ends with a
    newline. The result is an empty string when there are no origins."""
    lines = []
    for origin in filedata.origins.origins:
        lines.append(f'{origin.origin_str}:')
        lines.extend(
            f'{notice.level}: {notice.message}' for notice in origin.notices
        )
    return ''.join(line + '\n' for line in lines)
|
||
|
||
def get_file_contributors(filedata: FileData) -> List[str]:
    """Return the authors found in the file as a new list, for use as the
    optional SPDX list of file contributors. The list is empty when no
    authors were found."""
    return list(filedata.authors)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (c) 2021 VMware, Inc. All Rights Reserved. | ||
# SPDX-License-Identifier: BSD-2-Clause | ||
|
||
""" | ||
General helpers for SPDX document generator | ||
""" | ||
import hashlib | ||
import io | ||
import re | ||
import uuid | ||
from datetime import datetime | ||
from typing import Union, Callable, IO, Tuple | ||
|
||
from license_expression import get_spdx_licensing, LicenseExpression, Licensing | ||
from spdx_tools.spdx.model import SpdxNone, Document | ||
|
||
from tern.classes.file_data import FileData | ||
from tern.classes.image import Image | ||
from tern.classes.image_layer import ImageLayer | ||
from tern.classes.package import Package | ||
|
||
|
||
def get_uuid() -> str:
    """Return a newly generated random (version 4) UUID as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)
|
||
|
||
def get_current_timestamp() -> datetime:
    """Return the current UTC time as a naive datetime truncated to whole
    seconds (no tzinfo, microsecond set to 0)."""
    # imported locally so this function does not depend on the module's
    # top-level imports being extended
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take a
    # timezone-aware "now" in UTC and strip the tzinfo again so that the
    # returned value is the same naive UTC datetime as before.
    return datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None)
|
||
|
||
def get_string_id(string: str) -> str:
    """Return a short identifier for the given string: the last seven
    hexadecimal characters of the SHA-256 digest of its UTF-8 encoding."""
    encoded = string.encode('utf-8')
    hex_digest = hashlib.sha256(encoded).hexdigest()
    return hex_digest[-7:]
|
||
|
||
def get_license_ref(license_string: str) -> str:
    """Return the SPDX LicenseRef string for the given license text: the
    prefix ``LicenseRef-`` followed by the string id of the text."""
    ref_suffix = get_string_id(str(license_string))
    return 'LicenseRef-' + ref_suffix
|
||
|
||
def replace_invalid_chars_in_license_expression(license_string: str) -> str:
    """Substitute common characters that are invalid in SPDX license
    expressions: ``,`` becomes `` and``, ``/`` becomes ``-``, ``;`` becomes
    ``.`` and ``&`` becomes ``and``. Strings without any of these
    characters are returned unchanged."""
    # None of the replacement texts contains a character that is itself
    # replaced, so one translation pass gives the same result as applying
    # the substitutions one after another.
    substitutions = str.maketrans({
        ',': ' and',
        '/': '-',
        ';': '.',
        '&': 'and',
    })
    return license_string.translate(substitutions)
|
||
|
||
def is_valid_license_expression(license_string: str) -> bool:
    """Return True when validating the string against the SPDX licensing
    reports no errors, and False when it reports errors or when
    validation raises an AttributeError."""
    licensing = get_spdx_licensing()
    try:
        errors = licensing.validate(license_string).errors
    except AttributeError:
        # raised for strings containing invalid license chars
        return False
    else:
        return errors == []
|
||
|
||
def get_package_license_declared(package_license_declared: str) -> Union[LicenseExpression, SpdxNone]:
    """Turn the declared license string of a package or file into an SPDX
    license value.

    Common invalid SPDX license characters are first substituted using
    replace_invalid_chars_in_license_expression(). If the result is a
    valid SPDX license expression, the parsed expression of the
    substituted string is returned. If not, the parsed LicenseRef of the
    original declared license string passed in to the function is
    returned. If a blank string is passed in, SpdxNone is returned."""
    if not package_license_declared:
        return SpdxNone()

    cleaned_license = replace_invalid_chars_in_license_expression(package_license_declared)
    if is_valid_license_expression(cleaned_license):
        return Licensing().parse(cleaned_license)

    # Build the LicenseRef from the string as originally declared, not from
    # the substituted one: get_image_extracted_licenses() keys its
    # ExtractedLicensingInfo entries on the unmodified license text, and a
    # ref hashed from the substituted text would not match any of them.
    return Licensing().parse(get_license_ref(package_license_declared))
|
||
|
||
def get_serialized_document_string(spdx_document: Document, writer_function: Callable[[Document, IO[str]], str]) -> str:
    """Serialize an SPDX document to a string.

    ``writer_function`` is called with the document, an in-memory text
    stream and ``validate=False``; whatever it wrote to the stream is
    returned."""
    buffer = io.StringIO()
    try:
        writer_function(spdx_document, buffer, validate=False)
        return buffer.getvalue()
    finally:
        # release the in-memory buffer whether or not the writer succeeded
        buffer.close()
|
||
|
||
########################################################################################### | ||
# central place for SPDXRef-generators to avoid circular imports as these are widely used # | ||
########################################################################################### | ||
|
||
def get_image_spdxref(image_obj: Image) -> str:
    """Return the SPDX reference ID for the image: ``SPDXRef-`` followed
    by the image's human readable id."""
    # the human readable id carries the image name, tag and id
    readable_id = image_obj.get_human_readable_id()
    return f'SPDXRef-{readable_id}'
|
||
|
||
def get_package_spdxref(package_obj: Package) -> Tuple[str, str]:
    """Return the SPDX reference IDs for a package as a tuple
    ``(binary_ref, source_ref)``.

    The binary reference is built from the package name and version. The
    source reference is built from the source name and source version
    with a ``-src`` suffix, and is an empty string when the package has
    no source name. In both references the characters ``:``, ``+``,
    ``~``, ``_`` and ``/`` are replaced with ``-``."""
    # replace all the characters that SPDX doesn't like;
    # allowed characters are: letters, numbers, "." and "-".
    # The same pattern is used for both refs: "_" is no more allowed in a
    # source package ref than it is in a binary package ref.
    disallowed_chars = r'[:+~_/]'

    pkg_ref = f"{package_obj.name}-{package_obj.version}"
    clean_pkg_ref = re.sub(disallowed_chars, r'-', pkg_ref)
    if not package_obj.src_name:
        return f'SPDXRef-{clean_pkg_ref}', ''

    # differentiate between binary and source package refs
    src_ver = package_obj.src_version + "-src"
    src_ref = f"{package_obj.src_name}-{src_ver}"
    clean_src_ref = re.sub(disallowed_chars, r'-', src_ref)
    return f'SPDXRef-{clean_pkg_ref}', f'SPDXRef-{clean_src_ref}'
|
||
|
||
def get_layer_spdxref(layer_obj: ImageLayer) -> str:
    """Return the SPDX reference ID for the layer: ``SPDXRef-`` followed
    by the first ten characters of the layer's diff_id."""
    shortened_diff_id = layer_obj.diff_id[:10]
    return f'SPDXRef-{shortened_diff_id}'
|
||
|
||
def get_file_spdxref(filedata: FileData, layer_id: str) -> str:
    """Return the SPDX reference ID for a file, of the form SPDXRef-<id>
    as the spec asks for.

    The id is the string id of the file path, the first seven characters
    of the file checksum and the layer_id joined together."""
    short_checksum = filedata.checksum[:7]
    identity = ''.join((filedata.path, short_checksum, layer_id))
    return f'SPDXRef-{get_string_id(identity)}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (c) 2021 VMware, Inc. All Rights Reserved. | ||
# SPDX-License-Identifier: BSD-2-Clause | ||
|
||
""" | ||
Image level helpers for SPDX document generator | ||
Images for SPDX act like a Package | ||
""" | ||
from typing import List | ||
|
||
from spdx_tools.spdx.model import ExtractedLicensingInfo, Package as SpdxPackage, \ | ||
SpdxNoAssertion | ||
|
||
from tern.classes.image import Image | ||
from tern.classes.template import Template | ||
from tern.formats.spdx_new.layer_helpers import get_layer_licenses | ||
from tern.formats.spdx_new.general_helpers import get_license_ref, get_uuid, is_valid_license_expression, \ | ||
get_image_spdxref | ||
from tern.utils.general import get_git_rev_or_version | ||
|
||
|
||
def get_image_extracted_licenses(image_obj: Image) -> List[ExtractedLicensingInfo]:
    """Collect the ExtractedLicensingInfo entries for an image.

    Every distinct file and package license string found in the image's
    layers that is not a valid SPDX license expression yields one entry,
    pairing its LicenseRef with the license string as plain text."""
    licenses_seen = set()
    for layer in image_obj.layers:
        # file licenses of the layer, if there are any
        licenses_seen.update(get_layer_licenses(layer))
        # package licenses not already accounted for
        for package in layer.packages:
            if package.pkg_license:
                licenses_seen.add(package.pkg_license)
            # debian licenses from copyright text count as one license
            if package.pkg_licenses:
                licenses_seen.add(", ".join(package.pkg_licenses))

    # only licenses that are not valid SPDX expressions need an entry
    return [
        ExtractedLicensingInfo(license_id=get_license_ref(lic), extracted_text=lic)
        for lic in licenses_seen
        if not is_valid_license_expression(lic)
    ]
|
||
|
||
# TODO: these kind of functions don't produce dicts anymore, rename them
def get_image_dict(image_obj: Image, template: Template) -> SpdxPackage:
    """Return the SPDX Package that represents the given image.

    Name and version are looked up in the image's mapping for the given
    SPDX template. Download location, supplier, licenses and copyright
    text are all NOASSERTION, and files_analyzed is False."""
    mapping = image_obj.to_dict(template)
    package_fields = dict(
        spdx_id=get_image_spdxref(image_obj),
        name=mapping["PackageName"],
        download_location=SpdxNoAssertion(),
        version=mapping["PackageVersion"],
        supplier=SpdxNoAssertion(),
        files_analyzed=False,
        license_concluded=SpdxNoAssertion(),
        license_declared=SpdxNoAssertion(),
        copyright_text=SpdxNoAssertion(),
    )
    return SpdxPackage(**package_fields)
|
||
|
||
def get_document_namespace(image_obj: Image) -> str:
    """Return a unique SPDX document uri for the given image.

    The uri combines the second element of the value returned by
    get_git_rev_or_version(), the image name and a new uuid."""
    tool_version = get_git_rev_or_version()[1]
    image_name = image_obj.name
    unique_suffix = get_uuid()
    return f'https://spdx.org/spdxdocs/tern-report-{tool_version}-{image_name}-{unique_suffix}'
Oops, something went wrong.