From 6902c15c28bae6b2d795932531e57f125b641d89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Armin=20T=C3=A4nzer?= <armin.taenzer@tngtech.com>
Date: Wed, 21 Jun 2023 13:43:34 +0200
Subject: [PATCH] Add SPDX generation using spdx-tools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is set up to produce the same output as the current
spdx generation module while utilising the spdx-tools
library. The goal is to replace the current module with
this new one, which will allow easy migration to more SPDX
formats as well as SPDXv3.

Signed-off-by: Armin Tänzer <armin.taenzer@tngtech.com>
---
 requirements.txt                              |   1 +
 setup.cfg                                     |   2 +
 tern/formats/spdx_new/__init__.py             |   0
 tern/formats/spdx_new/constants.py            |  15 ++
 tern/formats/spdx_new/file_helpers.py         | 125 ++++++++++++
 tern/formats/spdx_new/general_helpers.py      | 127 ++++++++++++
 tern/formats/spdx_new/image_helpers.py        |  68 +++++++
 tern/formats/spdx_new/layer_helpers.py        | 189 ++++++++++++++++++
 tern/formats/spdx_new/make_spdx_model.py      | 113 +++++++++++
 tern/formats/spdx_new/package_helpers.py      | 168 ++++++++++++++++
 tern/formats/spdx_new/spdxjson/__init__.py    |   0
 tern/formats/spdx_new/spdxjson/generator.py   |  50 +++++
 .../formats/spdx_new/spdxtagvalue/__init__.py |   0
 .../spdx_new/spdxtagvalue/generator.py        |  51 +++++
 14 files changed, 909 insertions(+)
 create mode 100644 tern/formats/spdx_new/__init__.py
 create mode 100644 tern/formats/spdx_new/constants.py
 create mode 100644 tern/formats/spdx_new/file_helpers.py
 create mode 100644 tern/formats/spdx_new/general_helpers.py
 create mode 100644 tern/formats/spdx_new/image_helpers.py
 create mode 100644 tern/formats/spdx_new/layer_helpers.py
 create mode 100644 tern/formats/spdx_new/make_spdx_model.py
 create mode 100644 tern/formats/spdx_new/package_helpers.py
 create mode 100644 tern/formats/spdx_new/spdxjson/__init__.py
 create mode 100644 tern/formats/spdx_new/spdxjson/generator.py
 create mode 100644 tern/formats/spdx_new/spdxtagvalue/__init__.py
 create mode 100644 tern/formats/spdx_new/spdxtagvalue/generator.py

diff --git a/requirements.txt b/requirements.txt
index bb62be80..fb30b2a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,4 +18,5 @@ GitPython~=3.1
 prettytable~=3.6
 packageurl-python>=0.10.4
 license-expression>=30.1
+spdx-tools>=0.8.0a3
 
diff --git a/setup.cfg b/setup.cfg
index e5132f68..0090c3e5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,6 +52,8 @@ tern.formats =
     yaml = tern.formats.yaml.generator:YAML
     html = tern.formats.html.generator:HTML
     cyclonedxjson = tern.formats.cyclonedx.cyclonedxjson.generator:CycloneDXJSON
+    spdxjson_new = tern.formats.spdx_new.spdxjson.generator:SpdxJSON
+    spdxtagvalue_new = tern.formats.spdx_new.spdxtagvalue.generator:SpdxTagValue
 tern.extensions =
     cve_bin_tool = tern.extensions.cve_bin_tool.executor:CveBinTool
     scancode = tern.extensions.scancode.executor:Scancode
diff --git a/tern/formats/spdx_new/__init__.py b/tern/formats/spdx_new/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tern/formats/spdx_new/constants.py b/tern/formats/spdx_new/constants.py
new file mode 100644
index 00000000..b3901e63
--- /dev/null
+++ b/tern/formats/spdx_new/constants.py
@@ -0,0 +1,15 @@
+from spdx_tools.spdx.model import Version
+
+DOCUMENT_ID = 'SPDXRef-DOCUMENT'
+DOCUMENT_NAME = 'Tern report for {image_name}'
+SPDX_VERSION = 'SPDX-2.2'
+DATA_LICENSE = 'CC0-1.0'
+DOCUMENT_COMMENT = 'This document was generated by ' \
+    'the Tern Project: https://github.com/tern-tools/tern'
+DOCUMENT_NAMESPACE = 'https://spdx.org/spdxdocs/tern-' \
+    'report-{version}-{image}-{uuid}'
+LICENSE_LIST_VERSION = Version(3, 20)
+CREATOR_NAME = 'tern-{version}'
+DOCUMENT_NAME_SNAPSHOT = 'Tern SPDX JSON SBoM'  # TODO: different name here that is not specific to JSON
+DOCUMENT_NAMESPACE_SNAPSHOT = 'https://spdx.org/spdxdocs/tern-report-' \
+    '{timestamp}-{uuid}'
diff --git a/tern/formats/spdx_new/file_helpers.py b/tern/formats/spdx_new/file_helpers.py
new file mode 100644
index 00000000..54caafda
--- /dev/null
+++ b/tern/formats/spdx_new/file_helpers.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+File level helpers for SPDX document generator
+"""
+from datetime import datetime
+from typing import List
+
+from spdx_tools.spdx.model import File as SpdxFile, SpdxNone, SpdxNoAssertion, Checksum, ChecksumAlgorithm
+
+from tern.classes.file_data import FileData
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.template import Template
+from tern.formats.spdx_new.layer_helpers import get_layer_checksum
+from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_file_spdxref
+
+
+def get_layer_files_list(layer_obj: ImageLayer, template: Template, timestamp: datetime) -> List[SpdxFile]:
+    """Given a layer object and the SPDX template mapping, return a list
+    of SPDX Files for each file in the layer"""
+    spdx_files: List[SpdxFile] = []
+    file_refs = set()
+    for filedata in layer_obj.files:
+        # we do not know the layer's id, so we will use the timestamp instead
+        file_ref = get_file_spdxref(filedata, str(timestamp))
+        if file_ref not in file_refs:
+            spdx_files.append(get_file_dict(filedata, template, str(timestamp)))
+            file_refs.add(file_ref)
+    return spdx_files
+
+
+def get_files_list(image_obj: Image, template: Template) -> List[SpdxFile]:
+    '''Given an image_obj object, and the SPDX template mapping, return a list
+    of SPDX File objects, one for each unique file reference found in the
+    layers of the image that have files_analyzed set.'''
+    file_list: List[SpdxFile] = []
+
+    # use file refs to keep track of duplicate files that may be located
+    # in different places in the filesystem
+    file_refs = set()
+    for layer in image_obj.layers:
+        if layer.files_analyzed:
+            layer_checksum_value = get_layer_checksum(layer).value
+            for filedata in layer.files:
+                # we use the layer checksum as the layer id
+                file_ref = get_file_spdxref(filedata, layer_checksum_value)
+                if file_ref not in file_refs:
+                    file_list.append(get_file_dict(filedata, template, layer_checksum_value))
+                    file_refs.add(file_ref)
+    return file_list
+
+
+def get_file_dict(filedata: FileData, template: Template, layer_id: str) -> SpdxFile:
+    """Given a FileData object and its SPDX template mapping, return an
+    SPDX representation of the file. A layer_id is used to
+    distinguish copies of the same file occurring in different places in the
+    image"""
+    mapping = filedata.to_dict(template)
+
+    if filedata.licenses:
+        license_info_in_file = []
+        for lic in set(filedata.licenses):
+            # Add the license expression to the list if it is a valid SPDX
+            # identifier; otherwise, add the LicenseRef
+            license_info_in_file.append(get_package_license_declared(lic))
+    else:
+        license_info_in_file = [SpdxNone()]
+
+    file_notice = get_file_notice(filedata)
+    file_comment = get_file_comment(filedata)
+    file_contributors = get_file_contributors(filedata)
+
+    return SpdxFile(
+        spdx_id=get_file_spdxref(filedata, layer_id),
+        name=mapping['FileName'],
+        checksums=[get_file_checksum(filedata)],
+        license_concluded=SpdxNoAssertion(),  # we don't provide this
+        copyright_text=SpdxNoAssertion(),     # we don't know this
+        file_types=[mapping['FileType']] if mapping['FileType'] else None,
+        license_info_in_file=license_info_in_file,
+        notice=file_notice if file_notice else None,
+        comment=file_comment if file_comment else None,
+        contributors=file_contributors if file_contributors else None,
+    )
+
+
+def get_file_checksum(filedata: FileData) -> Checksum:
+    """Given a FileData object, return the checksum required by SPDX.
+    Currently, the spec requires a SHA1 checksum"""
+    return Checksum(ChecksumAlgorithm.SHA1, filedata.get_checksum('sha1'))
+
+
+def get_file_notice(filedata: FileData) -> str:
+    """Return a formatted string with all copyrights found in a file. Return
+    an empty string if there are no copyrights"""
+    notice = ''
+    for cp in filedata.copyrights:
+        notice = notice + cp + '\n'
+    return notice
+
+
+def get_file_comment(filedata: FileData) -> str:
+    """Return a formatted comment string with all file level notices. Return
+    an empty string if no notices are present"""
+    comment = ''
+    for origin in filedata.origins.origins:
+        comment = comment + f'{origin.origin_str}:' + '\n'
+        for notice in origin.notices:
+            comment = comment + \
+                f'{notice.level}: {notice.message}' + '\n'
+    return comment
+
+
+def get_file_contributors(filedata: FileData) -> List[str]:
+    """The SPDX spec allows for an optional list of file contributors.
+    If there are any authors found in the file, return a list of authors.
+    If empty, return an empty list"""
+    contributors = []
+    for author in filedata.authors:
+        contributors.append(author)
+    return contributors
diff --git a/tern/formats/spdx_new/general_helpers.py b/tern/formats/spdx_new/general_helpers.py
new file mode 100644
index 00000000..7fc682aa
--- /dev/null
+++ b/tern/formats/spdx_new/general_helpers.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+General helpers for SPDX document generator
+"""
+import hashlib
+import io
+import re
+import uuid
+from datetime import datetime
+from typing import Union, Callable, IO, Tuple
+
+from license_expression import get_spdx_licensing, LicenseExpression, Licensing
+from spdx_tools.spdx.model import SpdxNone, Document
+
+from tern.classes.file_data import FileData
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.package import Package
+
+
+def get_uuid() -> str:
+    return str(uuid.uuid4())
+
+
+def get_current_timestamp() -> datetime:
+    return datetime.utcnow().replace(microsecond=0)
+
+
+def get_string_id(string: str) -> str:
+    """Return a unique identifier for the given string"""
+    return hashlib.sha256(string.encode('utf-8')).hexdigest()[-7:]
+
+
+def get_license_ref(license_string: str) -> str:
+    """For SPDX format, return a LicenseRef string"""
+    return 'LicenseRef-' + get_string_id(str(license_string))
+
+
+def replace_invalid_chars_in_license_expression(license_string: str) -> str:
+    """Given a license string, replace common invalid SPDX license characters."""
+    not_allowed = [',', ';', '/', '&']
+    if any(x in license_string for x in not_allowed):
+        # Try to replace common invalid license characters
+        license_string = license_string.replace(',', ' and')
+        license_string = license_string.replace('/', '-')
+        license_string = license_string.replace(';', '.')
+        license_string = license_string.replace('&', 'and')
+    return license_string
+
+
+def is_valid_license_expression(license_string: str) -> bool:
+    licensing = get_spdx_licensing()
+    try:
+        return licensing.validate(license_string).errors == []
+    # Catch any invalid license chars here
+    except AttributeError:
+        return False
+
+
+def get_package_license_declared(package_license_declared: str) -> Union[LicenseExpression, SpdxNone]:
+    """Convert a declared license string of a package or file into an SPDX
+    license expression. Common invalid characters are first substituted using
+    replace_invalid_chars_in_license_expression(). If the result is a valid
+    SPDX license expression, the parsed expression is returned. If not, the
+    LicenseRef of the original, unmodified license string is returned.
+    If a blank string is passed in, SpdxNone is returned."""
+    if package_license_declared:
+        cleaned_license = replace_invalid_chars_in_license_expression(package_license_declared)
+        if is_valid_license_expression(cleaned_license):
+            return Licensing().parse(cleaned_license)
+        # hash the unmodified string: extracted license infos use get_license_ref() on it too
+        return Licensing().parse(get_license_ref(package_license_declared))
+    return SpdxNone()
+
+
+def get_serialized_document_string(spdx_document: Document, writer_function: Callable[[Document, IO[str]], str]) -> str:
+    with io.StringIO() as stream:
+        writer_function(spdx_document, stream, validate=False)
+        return stream.getvalue()
+
+
+###########################################################################################
+# central place for SPDXRef-generators to avoid circular imports as these are widely used #
+###########################################################################################
+
+def get_image_spdxref(image_obj: Image) -> str:
+    """Given the image object, return an SPDX reference ID"""
+    # here we return the image name, tag and id
+    return f'SPDXRef-{image_obj.get_human_readable_id()}'
+
+
+def get_package_spdxref(package_obj: Package) -> Tuple[str, str]:
+    """Given the package obj, return a tuple of SPDX reference IDs for the
+    binary and the source package. The source ID is '' if src_name is unset"""
+    pkg_ref = f"{package_obj.name}-{package_obj.version}"
+    src_ref = ''
+    if package_obj.src_name:
+        # differentiate between binary and source package refs
+        src_ver = package_obj.src_version + "-src"
+        src_ref = f"{package_obj.src_name}-{src_ver}"
+    # replace all the strings that SPDX doesn't like (same set for both refs)
+    # allowed characters are: letters, numbers, "." and "-"
+    clean_pkg_ref = re.sub(r'[:+~_/]', r'-', pkg_ref)
+    if src_ref:
+        clean_src_ref = re.sub(r'[:+~_/]', r'-', src_ref)
+        return f'SPDXRef-{clean_pkg_ref}', f'SPDXRef-{clean_src_ref}'
+    return f'SPDXRef-{clean_pkg_ref}', ''
+
+
+def get_layer_spdxref(layer_obj: ImageLayer) -> str:
+    """Given the layer object, return an SPDX reference ID"""
+    # here we return the shortened diff_id of the layer
+    return f'SPDXRef-{layer_obj.diff_id[:10]}'
+
+
+def get_file_spdxref(filedata: FileData, layer_id: str) -> str:
+    """Given a FileData object, return a unique identifier for the SPDX
+    document. According to the spec, this should be of the form: SPDXRef-<id>
+    We will use a combination of the file name, checksum and layer_id and
+    calculate a hash of this string"""
+    file_string = filedata.path + filedata.checksum[:7] + layer_id
+    fileid = get_string_id(file_string)
+    return f'SPDXRef-{fileid}'
diff --git a/tern/formats/spdx_new/image_helpers.py b/tern/formats/spdx_new/image_helpers.py
new file mode 100644
index 00000000..7bacb8bb
--- /dev/null
+++ b/tern/formats/spdx_new/image_helpers.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Image level helpers for SPDX document generator
+Images for SPDX act like a Package
+"""
+from typing import List
+
+from spdx_tools.spdx.model import ExtractedLicensingInfo, Package as SpdxPackage, \
+    SpdxNoAssertion
+
+from tern.classes.image import Image
+from tern.classes.template import Template
+from tern.formats.spdx_new.layer_helpers import get_layer_licenses
+from tern.formats.spdx_new.general_helpers import get_license_ref, get_uuid, is_valid_license_expression, \
+    get_image_spdxref
+from tern.utils.general import get_git_rev_or_version
+
+
+def get_image_extracted_licenses(image_obj: Image) -> List[ExtractedLicensingInfo]:
+    """Given an image_obj, return a unique list of extractedLicensingInfo
+    that contains all the file and package LicenseRef and their corresponding plain text."""
+
+    unique_licenses = set()
+    for layer in image_obj.layers:
+        # Get all of the unique file licenses, if they exist
+        unique_licenses.update(get_layer_licenses(layer))
+        # Next, collect any package licenses not already accounted for
+        for package in layer.packages:
+            if package.pkg_license:
+                unique_licenses.add(package.pkg_license)
+            # Add debian licenses from copyright text as one license
+            if package.pkg_licenses:
+                unique_licenses.add(", ".join(package.pkg_licenses))
+    extracted_licensing_info = []
+    for lic in list(unique_licenses):
+        valid_spdx = is_valid_license_expression(lic)
+        if not valid_spdx:
+            extracted_licensing_info.append(ExtractedLicensingInfo(license_id=get_license_ref(lic), extracted_text=lic))
+
+    return extracted_licensing_info
+
+
+def get_image_dict(image_obj: Image, template: Template) -> SpdxPackage:  # TODO: these kind of functions don't produce dicts anymore, rename them
+    """Given an image object and the template object for SPDX, return the
+    SPDX Package for the given image."""
+    mapping = image_obj.to_dict(template)
+    return SpdxPackage(
+        spdx_id=get_image_spdxref(image_obj),
+        name=mapping["PackageName"],
+        download_location=SpdxNoAssertion(),
+        version=mapping["PackageVersion"],
+        supplier=SpdxNoAssertion(),
+        files_analyzed=False,
+        license_concluded=SpdxNoAssertion(),
+        license_declared=SpdxNoAssertion(),
+        copyright_text=SpdxNoAssertion(),
+    )
+
+
+def get_document_namespace(image_obj: Image) -> str:
+    """Given the image object, return a unique SPDX document uri.
+    This is a combination of the tool name and version, the image name
+    and the uuid"""
+    return f'https://spdx.org/spdxdocs/tern-report-{get_git_rev_or_version()[1]}-{image_obj.name}-{get_uuid()}'
diff --git a/tern/formats/spdx_new/layer_helpers.py b/tern/formats/spdx_new/layer_helpers.py
new file mode 100644
index 00000000..832688ab
--- /dev/null
+++ b/tern/formats/spdx_new/layer_helpers.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Layer level helpers for SPDX document generator
+Layers for SPDX act like a Package
+"""
+import hashlib
+import logging
+import os
+from typing import List, Optional, Tuple
+
+from license_expression import Licensing
+
+from spdx_tools.spdx.model import Package as SpdxPackage, SpdxNoAssertion, SpdxNone, PackageVerificationCode, Checksum, \
+    ChecksumAlgorithm, Relationship, RelationshipType, ExtractedLicensingInfo
+
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.formats.spdx_new.general_helpers import get_license_ref, is_valid_license_expression, \
+    get_image_spdxref, get_package_spdxref, get_layer_spdxref, get_file_spdxref
+from tern.formats.spdx_new.constants import DOCUMENT_ID
+from tern.utils import constants
+from tern.report import content
+
+
+# global logger
+logger = logging.getLogger(constants.logger_name)
+
+
+def get_layer_extracted_licenses(layer_obj: ImageLayer) -> List[ExtractedLicensingInfo]:
+    """Given a layer object, return a list of ExtractedLicensingInfo (LicenseRef id and
+    plain text) for each unique file or package license that is not a valid SPDX expression."""
+
+    # Get all of the unique file licenses, if they exist
+    unique_licenses = set(get_layer_licenses(layer_obj))
+    # Next, collect any package licenses not already accounted for
+    for package in layer_obj.packages:
+        if package.pkg_license:
+            unique_licenses.add(package.pkg_license)
+    extracted_texts = []
+    for lic in list(unique_licenses):
+        valid_spdx = is_valid_license_expression(lic)
+        if not valid_spdx:
+            extracted_texts.append(ExtractedLicensingInfo(license_id=get_license_ref(lic), extracted_text=lic))
+    return extracted_texts
+
+
+def get_image_layer_relationships(image_obj: Image) -> List[Relationship]:
+    """Given an image object, return a list of SPDX Relationships: document
+    DESCRIBES image, image CONTAINS layer, previous layer HAS_PREREQUISITE
+    layer, layer CONTAINS package and package GENERATED_FROM source package."""
+    layer_relationships = []
+    image_ref = get_image_spdxref(image_obj)
+
+    # Required - DOCUMENT_DESCRIBES relationship
+    layer_relationships.append(Relationship(DOCUMENT_ID, RelationshipType.DESCRIBES, image_ref))
+
+    for index, layer in enumerate(image_obj.layers):
+        layer_ref = get_layer_spdxref(layer)
+        # First, relate the layer to the image that contains it
+        layer_relationships.append(Relationship(image_ref, RelationshipType.CONTAINS, layer_ref))
+        # Next, relate the layer to the layer before it (the first layer has none)
+        if index != 0:
+            prev_layer_ref = get_layer_spdxref(image_obj.layers[index - 1])
+            layer_relationships.append(Relationship(prev_layer_ref, RelationshipType.HAS_PREREQUISITE, layer_ref))
+        # Finally, add package relationships for the layer
+        if layer.packages:
+            for package in layer.packages:
+                pkg_ref, src_ref = get_package_spdxref(package)
+                layer_relationships.append(Relationship(layer_ref, RelationshipType.CONTAINS, pkg_ref))
+                if src_ref:
+                    layer_relationships.append(Relationship(pkg_ref, RelationshipType.GENERATED_FROM, src_ref))
+
+    return layer_relationships
+
+
+def get_layer_file_data_list(layer_obj: ImageLayer) -> List[str]:
+    """Given a layer object return the SPDX list of file refs in the layer.
+    Return an empty list if the files are not analyzed"""
+    file_refs = []
+    if layer_obj.files_analyzed:
+        layer_checksum = get_layer_checksum(layer_obj)
+        file_refs = [get_file_spdxref(filedata, layer_checksum.value) for filedata in layer_obj.files]
+    # some files are located in different places in the filesystem
+    # we make sure they don't occur as duplicates in this list
+    return list(set(file_refs))
+
+
+def get_layer_package_comment(layer_obj: ImageLayer) -> str:
+    """Return a package comment string value for a list of NoticeOrigin
+    objects for the given layer object"""
+    comment = ''
+    if "headers" in layer_obj.extension_info.keys():
+        for header in layer_obj.extension_info.get("headers"):
+            comment += header
+            comment += '\n'
+    if not layer_obj.origins.is_empty():
+        for notice_origin in layer_obj.origins.origins:
+            comment += content.print_notices(notice_origin, '', '\t')
+    return comment
+
+
+def get_layer_dict(layer_obj: ImageLayer) -> Tuple[SpdxPackage, List[Relationship]]:
+    """Given a layer object, return an SPDX Package representation
+     of the layer and the list of CONTAINS relationships to all files in that layer.
+     The analyzed files will go in a separate part of the document."""
+
+    comment = get_layer_package_comment(layer_obj)
+    verification_code = get_layer_verification_code(layer_obj)
+
+    layer_licenses = get_layer_licenses(layer_obj)
+    license_info_from_files = []
+    if layer_licenses:
+        # Use the layer LicenseRef in the list instead of license expression
+        for lic in layer_licenses:
+            license_info_from_files.append(get_license_ref(lic))  # TODO: potential bug here that converts valid expressions to LicenseRef- identifiers
+    license_info_from_files = [Licensing().parse(lic) for lic in license_info_from_files]
+
+    layer_spdx_id = get_layer_spdxref(layer_obj)
+    package = SpdxPackage(
+        spdx_id=layer_spdx_id,
+        name=os.path.basename(layer_obj.tar_file),
+        version=layer_obj.layer_index,
+        supplier=SpdxNoAssertion(),
+        file_name=layer_obj.tar_file,
+        download_location=SpdxNone(),
+        files_analyzed=bool(layer_obj.files_analyzed),
+        verification_code=verification_code if bool(layer_obj.files_analyzed) else None,
+        checksums=[get_layer_checksum(layer_obj)],
+        license_concluded=SpdxNoAssertion(),
+        license_declared=SpdxNoAssertion(),
+        copyright_text=SpdxNoAssertion(),
+        comment=comment if comment else None,
+        license_info_from_files=license_info_from_files,
+    )
+
+    relationships = [
+        Relationship(layer_spdx_id, RelationshipType.CONTAINS, file_ref)
+        for file_ref in get_layer_file_data_list(layer_obj)
+    ]
+
+    return package, relationships
+
+
+def get_layer_licenses(layer_obj: ImageLayer) -> List[str]:
+    """Return a list of unique licenses from the files analyzed
+    in the layer object. It is assumed that the files were analyzed and
+    there should be some license expressions. If there are not, an empty list
+    is returned"""
+    licenses = set()
+    for filedata in layer_obj.files:
+        # we will use the SPDX license expressions here as they will be
+        # valid SPDX license identifiers
+        if filedata.licenses:
+            for lic in list(set(filedata.licenses)):
+                licenses.add(lic)
+    return list(licenses)
+
+
+def get_layer_verification_code(layer_obj: ImageLayer) -> Optional[PackageVerificationCode]:
+    """Calculate the verification code from the files in an image layer. This
+    assumes that layer_obj.files_analyzed is True. The implementation follows
+    the algorithm in the SPDX spec v 2.2 which requires SHA1 to be used to
+    calculate the checksums of the file and the final verification code"""
+    sha1_list = []
+    for filedata in layer_obj.files:
+        filesha = filedata.get_checksum('sha1')
+        if not filesha:
+            # we cannot create a verification code, hence file generation
+            # is aborted
+            logger.critical(
+                'File %s does not have a sha1 checksum. Failed to generate '
+                'an SPDX report', filedata.path)
+            return None
+        sha1_list.append(filesha)
+    sha1_list.sort()
+    sha1s = ''.join(sha1_list)
+    verification_code = hashlib.sha1(sha1s.encode('utf-8')).hexdigest()  # nosec
+    return PackageVerificationCode(verification_code)
+
+
+def get_layer_checksum(layer_obj: ImageLayer) -> Checksum:
+    return Checksum(
+        ChecksumAlgorithm[layer_obj.checksum_type.upper()],
+        layer_obj.checksum
+    )
diff --git a/tern/formats/spdx_new/make_spdx_model.py b/tern/formats/spdx_new/make_spdx_model.py
new file mode 100644
index 00000000..188abd10
--- /dev/null
+++ b/tern/formats/spdx_new/make_spdx_model.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Common functions that are useful for all SPDX serialization formats
+"""
+
+import logging
+from typing import List
+
+from spdx_tools.spdx.model import Document, CreationInfo, Actor, ActorType, Relationship, RelationshipType
+
+from tern.classes.image_layer import ImageLayer
+from tern.classes.template import Template
+from tern.formats.spdx_new.constants import DOCUMENT_ID, DOCUMENT_NAME, SPDX_VERSION, DATA_LICENSE, DOCUMENT_COMMENT, \
+    LICENSE_LIST_VERSION, CREATOR_NAME, DOCUMENT_NAME_SNAPSHOT, DOCUMENT_NAMESPACE_SNAPSHOT
+from tern.formats.spdx_new.file_helpers import get_layer_files_list
+from tern.formats.spdx_new.general_helpers import get_current_timestamp, get_uuid, get_image_spdxref
+from tern.classes.image import Image
+from tern.formats.spdx.spdx import SPDX
+from tern.formats.spdx_new.file_helpers import get_files_list
+from tern.formats.spdx_new.image_helpers import get_image_extracted_licenses, \
+    get_image_dict, get_document_namespace
+from tern.formats.spdx_new.layer_helpers import get_layer_dict, get_image_layer_relationships, get_layer_extracted_licenses
+from tern.formats.spdx_new.package_helpers import get_packages_list, get_layer_packages_list
+from tern.utils import constants
+
+from tern.utils.general import get_git_rev_or_version
+
+# global logger
+logger = logging.getLogger(constants.logger_name)
+
+
+def make_spdx_model(image_obj_list: List[Image]) -> Document:
+    """Build the SPDX Document for the first image in image_obj_list"""
+    template = SPDX()
+    # we still don't know how SPDX documents could represent multiple
+    # images. Hence, we will assume only one image is analyzed and the
+    # input is a list of length 1
+    image_obj = image_obj_list[0]
+
+    creation_info = CreationInfo(
+        spdx_version=SPDX_VERSION,
+        spdx_id=DOCUMENT_ID,
+        name=DOCUMENT_NAME.format(image_name=image_obj.name),
+        document_namespace=get_document_namespace(image_obj),
+        creators=[Actor(actor_type=ActorType.TOOL, name=CREATOR_NAME.format(version=get_git_rev_or_version()[1]))],
+        created=get_current_timestamp(),
+        license_list_version=LICENSE_LIST_VERSION,
+        data_license=DATA_LICENSE,
+        document_comment=DOCUMENT_COMMENT,
+    )
+    packages = [get_image_dict(image_obj, template)]
+    # the first entry of this list is already the DOCUMENT DESCRIBES image relationship
+    image_layer_relationships = get_image_layer_relationships(image_obj)
+
+    layer_file_relationships = []
+    for layer in image_obj.layers:
+        package, relationships = get_layer_dict(layer)
+        packages.append(package)
+        layer_file_relationships.extend(relationships)
+
+    packages.extend(get_packages_list(image_obj, template))
+    files = get_files_list(image_obj, template)
+    extracted_licensing_info = get_image_extracted_licenses(image_obj)
+    return Document(
+        creation_info=creation_info,
+        packages=packages,
+        files=files,
+        relationships=image_layer_relationships + layer_file_relationships,
+        extracted_licensing_info=extracted_licensing_info
+    )
+
+
+def make_spdx_model_snapshot(layer_obj: ImageLayer, template: Template) -> Document:
+    """This is the SPDX document containing just the packages and files of
+    one layer, as found at container build time"""
+    timestamp = get_current_timestamp()
+
+    creation_info = CreationInfo(
+        spdx_version=SPDX_VERSION,
+        spdx_id=DOCUMENT_ID,
+        name=DOCUMENT_NAME_SNAPSHOT,
+        document_namespace=DOCUMENT_NAMESPACE_SNAPSHOT.format(timestamp=timestamp, uuid=get_uuid()),
+        creators=[Actor(actor_type=ActorType.TOOL, name=CREATOR_NAME.format(version=get_git_rev_or_version()[1]))],
+        created=timestamp,
+        license_list_version=LICENSE_LIST_VERSION,
+        data_license=DATA_LICENSE,
+        document_comment=DOCUMENT_COMMENT,
+    )
+
+    # Add list of SPDX packages to packages list, if they exist
+    packages = get_layer_packages_list(layer_obj, template)
+    describes_relationships = [
+        Relationship(DOCUMENT_ID, RelationshipType.DESCRIBES, package.spdx_id)
+        for package in packages
+    ]
+
+    # Add list of SPDX files, if they exist
+    files = get_layer_files_list(layer_obj, template, timestamp)
+
+    # Add package and file extracted license texts, if they exist
+    extracted_licensing_info = get_layer_extracted_licenses(layer_obj)
+
+    return Document(
+        creation_info=creation_info,
+        packages=packages,
+        files=files,
+        relationships=describes_relationships,
+        extracted_licensing_info=extracted_licensing_info
+    )
diff --git a/tern/formats/spdx_new/package_helpers.py b/tern/formats/spdx_new/package_helpers.py
new file mode 100644
index 00000000..f76fd677
--- /dev/null
+++ b/tern/formats/spdx_new/package_helpers.py
@@ -0,0 +1,168 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+Package level helpers for SPDX document generator
+"""
+from typing import List
+
+from packageurl import PackageURL
+from spdx_tools.spdx.model import Package as SpdxPackage, SpdxNoAssertion, SpdxNone, Actor, ActorType, \
+    ExternalPackageRef, ExternalPackageRefCategory
+
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.classes.package import Package
+from tern.classes.template import Template
+from tern.formats.spdx_new.general_helpers import get_package_license_declared, get_package_spdxref
+from tern.report import content
+
+
+SOURCE_PACKAGE_COMMENT = (
+    'This package refers to a source package associated with one or more '
+    'binary packages installed in this container. This source package is NOT '
+    'installed in the container but may be useful for CVE lookups.')
+
+
+def get_layer_packages_list(layer: ImageLayer, template: Template) -> List[SpdxPackage]:
+    """Given a layer object and an SPDX template object, return a list
+    of SpdxPackage objects, one for each distinct package in the layer.
+    Packages that map to an already seen SPDX reference are skipped"""
+    spdx_packages = []
+    seen_refs = set()  # set membership is O(1); same approach as get_packages_list
+    for package in layer.packages:
+        # Create a list of SpdxPackages, each one representing
+        # one package object in the image
+        pkg_ref, _ = get_package_spdxref(package)  # TODO: Is it ok to ignore source_package_ref here?
+        if pkg_ref not in seen_refs:
+            spdx_packages.append(get_package_dict(package, template))
+            seen_refs.add(pkg_ref)
+    return spdx_packages
+
+
+def get_package_comment(package: Package) -> str:
+    """Build the PackageComment string for a package by concatenating the
+    printed notices of each of its NoticeOrigin objects. The result is an
+    empty string when the package has no notice origins"""
+    notice_origins = package.origins.origins or []
+    return ''.join(
+        content.print_notices(notice_origin, '', '\t')
+        for notice_origin in notice_origins
+    )
+
+
+def get_source_package_dict(package: Package, template: Template) -> SpdxPackage:
+    """Build the SPDX Package for the source package associated with the given
+    package, using the values from the package's SPDX template mapping."""
+    pkg_map = package.to_dict(template)
+    _, source_ref = get_package_spdxref(package)
+
+    template_license = pkg_map['PackageLicenseDeclared']
+    # Debian licenses are taken from the copyright text and joined into one license
+    license_text = ', '.join(package.pkg_licenses) if package.pkg_format == 'deb' else template_license
+
+    # Empty mapping values fall back to the placeholder given for that field
+    return SpdxPackage(
+        spdx_id=source_ref,
+        name=pkg_map['SourcePackageName'],
+        version=pkg_map['SourcePackageVersion'] or 'NOASSERTION',
+        download_location=pkg_map['PackageDownloadLocation'] or SpdxNoAssertion(),
+        files_analyzed=False,
+        license_concluded=SpdxNoAssertion(),
+        license_declared=get_package_license_declared(license_text),
+        copyright_text=pkg_map['PackageCopyrightText'] or SpdxNone(),
+        comment=SOURCE_PACKAGE_COMMENT,
+    )
+
+
+def get_package_dict(package: Package, template: Template) -> SpdxPackage:
+    """Given a package object and its SPDX template mapping, return an SPDX Package.
+    A purl external reference is attached only when a purl can be built for the
+    package. Empty mapping values fall back to NOASSERTION / NONE placeholders."""
+    mapping = package.to_dict(template)
+
+    if mapping['PackageSupplier']:
+        supplier = Actor(ActorType.ORGANIZATION, mapping['PackageSupplier'])
+    else:
+        supplier = SpdxNoAssertion()
+
+    # Build the purl once and reuse it; get_purl returns '' when there is none
+    external_ref = []
+    purl = get_purl(package)
+    if purl:
+        external_ref.append(
+            ExternalPackageRef(ExternalPackageRefCategory.PACKAGE_MANAGER, "purl", purl))
+
+    pkg_ref, _ = get_package_spdxref(package)
+    # Define debian licenses from copyright text as one license
+    declared_lic = mapping['PackageLicenseDeclared']
+    if package.pkg_format == 'deb':
+        declared_lic = ', '.join(package.pkg_licenses)
+
+    return SpdxPackage(
+        spdx_id=pkg_ref,
+        name=mapping['PackageName'],
+        version=mapping['PackageVersion'] if mapping['PackageVersion'] else 'NOASSERTION',
+        supplier=supplier,
+        download_location=mapping['PackageDownloadLocation'] if mapping['PackageDownloadLocation'] else SpdxNoAssertion(),
+        files_analyzed=False,
+        license_concluded=SpdxNoAssertion(),
+        license_declared=get_package_license_declared(declared_lic),
+        copyright_text=mapping['PackageCopyrightText'] if mapping['PackageCopyrightText'] else SpdxNone(),
+        external_references=external_ref,
+        comment=get_package_comment(package),
+    )
+
+
+def get_packages_list(image_obj: Image, template: Template) -> List[SpdxPackage]:
+    """Collect the SPDX Packages for every package found in the layers of an image.
+    Each named package is added once per SPDX reference, together with its source
+    package when a source package reference exists.
+    The SPDX JSON spec for packages requires:
+        name
+        versionInfo
+        downloadLocation"""
+    spdx_packages = []
+    seen_refs = set()
+
+    all_packages = (pkg for layer in image_obj.layers for pkg in layer.packages)
+    for package in all_packages:
+        binary_ref, source_ref = get_package_spdxref(package)
+        # Packages without a name are left out of the document
+        if binary_ref not in seen_refs and package.name:
+            spdx_packages.append(get_package_dict(package, template))
+            seen_refs.add(binary_ref)
+        # Source packages are deduplicated against the same set of references
+        if source_ref and source_ref not in seen_refs:
+            spdx_packages.append(get_source_package_dict(package, template))
+            seen_refs.add(source_ref)
+    return spdx_packages
+
+
+purl_types_with_namespaces = [  # pkg formats for which get_purl derives a namespace from the supplier
+    'deb',
+    'rpm',
+    'apk',
+    'alpm'
+]
+
+
+def get_purl(package_obj: Package) -> str:
+    """Return a purl string for a given package, or '' if no valid purl can be built"""
+    purl_type = package_obj.pkg_format
+    purl_namespace = ''
+    if purl_type in purl_types_with_namespaces and package_obj.pkg_supplier:
+        # https://github.com/package-url/purl-spec/pull/214
+        supplier_words = package_obj.pkg_supplier.split(' ')
+        # "VMware <name>" suppliers use the second word; a bare "VMware" keeps the first
+        use_second = supplier_words[0] == "VMware" and len(supplier_words) > 1
+        purl_namespace = supplier_words[1 if use_second else 0].lower()
+        # TODO- this might need adjusting for alpm. Currently can't test on M1
+    try:
+        # PackageURL() validates its arguments and can raise ValueError as well
+        return PackageURL(purl_type, purl_namespace, package_obj.name.lower(), package_obj.version,
+                          qualifiers={'arch': package_obj.arch if package_obj.arch else ''}).to_string()
+    except ValueError:
+        return ''
diff --git a/tern/formats/spdx_new/spdxjson/__init__.py b/tern/formats/spdx_new/spdxjson/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tern/formats/spdx_new/spdxjson/generator.py b/tern/formats/spdx_new/spdxjson/generator.py
new file mode 100644
index 00000000..1ab8a6a5
--- /dev/null
+++ b/tern/formats/spdx_new/spdxjson/generator.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+SPDX JSON document generator
+"""
+import logging
+from typing import List
+
+from spdx_tools.spdx.model import Document
+from spdx_tools.spdx.writer.json.json_writer import write_document_to_stream
+
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.formats import generator
+from tern.formats.spdx.spdx import SPDX
+from tern.formats.spdx_new.general_helpers import get_serialized_document_string
+from tern.formats.spdx_new.make_spdx_model import make_spdx_model, make_spdx_model_snapshot
+from tern.utils import constants
+
+# global logger
+logger = logging.getLogger(constants.logger_name)
+
+
+class SpdxJSON(generator.Generate):
+    """Generator plugin that renders SPDX documents in the JSON format"""
+
+    def generate(self, image_obj_list: List[Image], print_inclusive=False) -> str:
+        """Return the SPDX JSON document for the given images as a string
+        WARNING: This assumes that the list consists of one image or the base
+        image and a stub image, in which case, the information in the stub
+        image is not applicable in the SPDX case as it is an empty image
+        object with no metadata as nothing got built.
+
+        For the sake of SPDX, an image is a 'Package' which 'CONTAINS' each
+        layer which is also a 'Package' which 'CONTAINS' the real Packages"""
+        logger.debug("Generating SPDX JSON document...")
+        document: Document = make_spdx_model(image_obj_list)
+        return get_serialized_document_string(document, write_document_to_stream)
+
+    def generate_layer(self, layer: ImageLayer) -> str:
+        """Return the SPDX JSON snapshot document for one layer as a string;
+        it holds the package and file information found at container build time"""
+        logger.debug("Generating SPDX JSON snapshot document...")
+        return get_serialized_document_string(
+            make_spdx_model_snapshot(layer, SPDX()),
+            write_document_to_stream,
+        )
diff --git a/tern/formats/spdx_new/spdxtagvalue/__init__.py b/tern/formats/spdx_new/spdxtagvalue/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tern/formats/spdx_new/spdxtagvalue/generator.py b/tern/formats/spdx_new/spdxtagvalue/generator.py
new file mode 100644
index 00000000..928ccdfd
--- /dev/null
+++ b/tern/formats/spdx_new/spdxtagvalue/generator.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 VMware, Inc. All Rights Reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+"""
+SPDX Tag-Value document generator
+"""
+
+import logging
+from typing import List
+
+from spdx_tools.spdx.model import Document
+from spdx_tools.spdx.writer.tagvalue.tagvalue_writer import write_document_to_stream
+
+from tern.classes.image import Image
+from tern.classes.image_layer import ImageLayer
+from tern.formats import generator
+from tern.formats.spdx.spdx import SPDX
+from tern.formats.spdx_new.general_helpers import get_serialized_document_string
+from tern.formats.spdx_new.make_spdx_model import make_spdx_model, make_spdx_model_snapshot
+from tern.utils import constants
+
+# global logger
+logger = logging.getLogger(constants.logger_name)
+
+
+class SpdxTagValue(generator.Generate):
+    """Generator plugin that renders SPDX documents in the Tag-Value format"""
+
+    def generate(self, image_obj_list: List[Image], print_inclusive=False) -> str:
+        """Return the SPDX Tag-Value document for the given images as a string
+        WARNING: This assumes that the list consists of one image or the base
+        image and a stub image, in which case, the information in the stub
+        image is not applicable in the SPDX case as it is an empty image
+        object with no metadata as nothing got built.
+
+        For the sake of SPDX, an image is a 'Package' which 'CONTAINS' each
+        layer which is also a 'Package' which 'CONTAINS' the real Packages"""
+        logger.debug("Generating SPDX Tag-Value document...")
+        document: Document = make_spdx_model(image_obj_list)
+        return get_serialized_document_string(document, write_document_to_stream)
+
+    def generate_layer(self, layer: ImageLayer) -> str:
+        """Return the SPDX Tag-Value snapshot document for one layer as a string;
+        it holds the package and file information found at container build time"""
+        logger.debug("Generating SPDX Tag-Value snapshot document...")
+        return get_serialized_document_string(
+            make_spdx_model_snapshot(layer, SPDX()),
+            write_document_to_stream,
+        )