From 3434af5929b0530cd1102069a59c127dec73d0f1 Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Wed, 26 Jul 2023 10:23:41 -0700
Subject: [PATCH 1/8] Add ip type check to diff command

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/src/diff/diff.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py
index 69b1edf..54cb7e4 100644
--- a/cli/src/diff/diff.py
+++ b/cli/src/diff/diff.py
@@ -1,4 +1,5 @@
 import json
+import re
 
 import click
 from beartype import beartype
@@ -21,6 +22,9 @@ def flat_type_check(expect: str, actual: object) -> dict[str, dict]:
         case "date":
             if not isinstance(actual, str) and not isinstance(actual, int):
                 return {"expected": expect, "actual": actual}
+        case "ip":
+            if not isinstance(actual, str) or not re.match(r"(\d{1,3}\.){3}\d{1,3}", actual):
+                return {"expected": expect, "actual": actual}
         case _:
             click.secho(f"WARNING: unknown type '{expect}'", err=True, fg="yellow")
     return {}
@@ -116,6 +120,7 @@ def diff(mapping, data, output_json, show_missing):
         click.echo(json.dumps(check, sort_keys=True))
     else:
         output_diff(check)
+    quit(0 if check == {} else 1)
 
 
 if __name__ == "__main__":

From 202fd8dff8498f850c3a31213b37288da523f286 Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Wed, 26 Jul 2023 10:34:05 -0700
Subject: [PATCH 2/8] Add check-all flag to diff

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/src/diff/diff.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py
index 54cb7e4..31cdfc3 100644
--- a/cli/src/diff/diff.py
+++ b/cli/src/diff/diff.py
@@ -107,20 +107,31 @@ def output_diff(difference: dict[str, object], prefix: str = "") -> None:
     is_flag=True,
     help="Output fields that are expected in the mappings but missing in the data",
 )
-def diff(mapping, data, output_json, show_missing):
+@click.option(
+    "--check-all",
+    "check_all",
+    is_flag=True,
+    help="Check every available data record and report the first one with errors (default: only check first record)"
+)
+def diff(mapping, data, output_json, show_missing, check_all):
     """Type check your integration given a sample data record and the appropriate SS4O schema."""
     properties = load_mapping(mapping)
     with open(data, "r") as data_file:
         data_json = json.load(data_file)
-    if isinstance(data_json, list):
-        # Unwrap list of data, assume first record is representative
-        data_json = data_json[0]
-    check = do_check(properties, data_json, show_missing)
-    if output_json:
-        click.echo(json.dumps(check, sort_keys=True))
-    else:
-        output_diff(check)
-    quit(0 if check == {} else 1)
+    if not isinstance(data_json, list):
+        # Wrap individual data record in a list
+        data_json = [data_json]
+    for i, record in enumerate(data_json if check_all else data_json[:1], 1):
+        check = do_check(properties, record, show_missing)
+        if check == {}:
+            continue
+        if check_all:
+            click.echo(f"Validation errors found in record {i}", err=True)
+        if output_json:
+            click.echo(json.dumps(check, sort_keys=True))
+        else:
+            output_diff(check)
+        raise SystemExit(1)  # quit() is a site-module REPL helper and may be absent
 
 
 if __name__ == "__main__":

From dd064f9e781ccfaf33250c1a0839715b193fbb51 Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Wed, 26 Jul 2023 10:56:50 -0700
Subject: [PATCH 3/8] Fix issue with updating nested fields on schema load

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/src/utils/mappings.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/cli/src/utils/mappings.py b/cli/src/utils/mappings.py
index cdca72a..36c20b6 100644
--- a/cli/src/utils/mappings.py
+++ b/cli/src/utils/mappings.py
@@ -6,6 +6,16 @@
 from beartype import beartype
 
 
+@beartype
+def nested_update(base: dict, updates: dict) -> dict:
+    """Recursively merge `updates` into `base` in place and return `base`."""
+    for k, v in updates.items():
+        if isinstance(v, dict):  # a non-dict base[k] is replaced, not merged into
+            v = nested_update(base[k] if isinstance(base.get(k), dict) else {}, v)
+        base[k] = v
+    return base
+
+
 @beartype
 def load_mapping(mapping: str) -> dict[str, dict]:
     with open(mapping, "r") as mapping_file:
@@ -39,5 +49,5 @@ def load_mapping(mapping: str) -> dict[str, dict]:
                 err=True,
                 fg="yellow",
             )
-        properties.update(load_mapping(item_glob[0]))
+        nested_update(properties, load_mapping(item_glob[0]))
     return properties

From bb68442d011edd5e428b1242f02610c4843b0a2a Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Wed, 26 Jul 2023 10:57:02 -0700
Subject: [PATCH 4/8] Add type check for double fields to diff

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/src/diff/diff.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py
index 31cdfc3..8d8aa17 100644
--- a/cli/src/diff/diff.py
+++ b/cli/src/diff/diff.py
@@ -16,6 +16,9 @@ def flat_type_check(expect: str, actual: object) -> dict[str, dict]:
         case "long" | "integer":
             if not isinstance(actual, int):
                 return {"expected": expect, "actual": actual}
+        case "double":
+            if not isinstance(actual, float):
+                return {"expected": expect, "actual": actual}
         case "alias":
             # We assume aliases were already unwrapped by the caller and ignore them.
             return {}

From dbb1c7453dc620bd7f07f21bf37eb56a4e8ea44d Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Thu, 8 Feb 2024 16:22:48 -0800
Subject: [PATCH 5/8] Improve data type recognition

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/src/diff/diff.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py
index 2254267..afb2c8e 100644
--- a/cli/src/diff/diff.py
+++ b/cli/src/diff/diff.py
@@ -13,12 +13,15 @@ def flat_type_check(expect: str, actual: object) -> dict[str, dict]:
         case "text" | "keyword":
             if not isinstance(actual, str):
                 return {"expected": expect, "actual": actual}
-        case "long" | "integer":
+        case "long" | "integer" | "short":
             if not isinstance(actual, int):
                 return {"expected": expect, "actual": actual}
-        case "double":
+        case "double" | "float" | "half_float":
             if not isinstance(actual, float):
                 return {"expected": expect, "actual": actual}
+        case "boolean":
+            if not isinstance(actual, bool):
+                return {"expected": expect, "actual": actual}
         case "alias":
             # We assume aliases were already unwrapped by the caller and ignore them.
             return {}

From d2cd70121301ca63bed602b6424b87a7c96d104a Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Thu, 8 Feb 2024 16:41:06 -0800
Subject: [PATCH 6/8] Copy bundle script as command

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/.gitignore             |   2 +
 cli/requirements.txt       |   1 +
 cli/src/bundle/__init__.py |   1 +
 cli/src/bundle/bundle.py   | 171 +++++++++++++++++++++++++++++++++++++
 cli/src/main.py            |   2 +
 5 files changed, 177 insertions(+)
 create mode 100644 cli/src/bundle/__init__.py
 create mode 100644 cli/src/bundle/bundle.py

diff --git a/cli/.gitignore b/cli/.gitignore
index ad4a1f1..bae087f 100644
--- a/cli/.gitignore
+++ b/cli/.gitignore
@@ -1,3 +1,5 @@
+data
+
 # Created by https://www.toptal.com/developers/gitignore/api/python
 # Edit at https://www.toptal.com/developers/gitignore?templates=python
 
diff --git a/cli/requirements.txt b/cli/requirements.txt
index e9cd148..01df2a1 100644
--- a/cli/requirements.txt
+++ b/cli/requirements.txt
@@ -3,6 +3,7 @@ black==23.3.0
 click==8.1.3
 isort==5.12.0
 mypy-extensions==1.0.0
+ndjson==0.3.1
 packaging==23.1
 pathspec==0.11.1
 platformdirs==3.8.0
diff --git a/cli/src/bundle/__init__.py b/cli/src/bundle/__init__.py
new file mode 100644
index 0000000..44262d8
--- /dev/null
+++ b/cli/src/bundle/__init__.py
@@ -0,0 +1 @@
+from .bundle import bundle
diff --git a/cli/src/bundle/bundle.py b/cli/src/bundle/bundle.py
new file mode 100644
index 0000000..6c16bc7
--- /dev/null
+++ b/cli/src/bundle/bundle.py
@@ -0,0 +1,171 @@
+import base64
+import copy
+import json
+import sys
+import typing
+import uuid
+from datetime import datetime
+from glob import glob
+from pathlib import Path
+
+from beartype import beartype
+import click
+import ndjson
+
+# Random UUID4, use as namespace for UUID5s for helpful collision identification
+NAMESPACE_OS_CATALOG: uuid.UUID = uuid.UUID("f21aff9f-a6b3-43eb-85af-5ce18a880430")
+CONFIG_FIELD_DIR_INFO: dict[str, tuple[str, str]] = {
+    "statics": ("static", "bin"),
+    "components": ("schemas", "mapping.json"),
+    "savedObjects": ("assets", "ndjson"),
+    "queries": ("assets", "text"),
+    "sampleData": ("data", "json"),
+}
+OS_OBJECT_SIZE_LIMIT = 1_048_576  # 1 MB
+
+# Minified JSON serialization helper (no whitespace, keys sorted)
+# Output feeds UUID5 hashing and the object size check, so it must be deterministic
+min_json = lambda obj: json.dumps(obj, separators=(",", ":"), sort_keys=True)
+
+
+def try_attach_assets(config: dict, path: Path, info: None | tuple[str, str]) -> bool:
+    """Attach the asset file referenced by config as config["data"], if any."""
+    # Guard clause: skip when there is no directory info or no file reference
+    names_file = "path" in config or ("name" in config and "version" in config)
+    if not info or not names_file:
+        return False
+    subdir, encoding = info
+
+    # Read data: binary assets as bytes, everything else as UTF-8 text
+    read_mode, charset = ("rb", None) if encoding == "bin" else ("r", "utf-8")
+    match (config.get("path"), encoding):
+        case (
+            None,
+            "text",
+        ):  # If no path and text encoding, rely on language for extension
+            full_path = (
+                path
+                / subdir
+                / f"{config['name']}-{config['version']}.{config['language']}"
+            )
+        case (None, _):  # Otherwise, use encoding as extension with name
+            full_path = (
+                path / subdir / f"{config['name']}-{config['version']}.{encoding}"
+            )
+        case (_, _):  # If a path is present, use it regardless of specified encoding
+            full_path = path / subdir / config["path"]
+    with open(full_path, read_mode, encoding=charset) as data_file:
+        data = data_file.read()
+
+    # Attach data to config
+    match encoding:
+        case "bin":
+            config["data"] = str(base64.b64encode(data), encoding="ascii")
+        case "mapping.json" | "json":
+            config["data"] = min_json(json.loads(data))
+        case "ndjson":
+            config["data"] = min_json(ndjson.loads(data))
+        case "text":
+            config["data"] = data
+    return True
+
+
+def attach_assets_in_place(
+    config: typing.Any, path: Path, info: None | tuple[str, str] = None
+) -> None:
+    """Recursively attach asset file contents to every asset entry in config."""
+    if isinstance(config, list):
+        for item in config:
+            attach_assets_in_place(item, path, info)
+        return
+    if not isinstance(config, dict) or try_attach_assets(config, path, info):
+        return
+    for key, value in config.items():
+        # Resolve per key: rebinding `info` would leak one key's directory info
+        # into unrelated sibling keys that follow it.
+        attach_assets_in_place(value, path, CONFIG_FIELD_DIR_INFO.get(key, info))
+
+
+def attach_assets(config: dict, path: Path) -> dict:
+    populated = copy.deepcopy(config)  # leave the caller's config untouched
+    attach_assets_in_place(populated, path)
+    return populated
+
+
+# Load an integration directory's config with all referenced assets inlined
+def scan_integration(path: Path) -> dict:
+    # TODO detect latest version instead of defaulting to 1.0.0
+    config_path = path / f"{path.stem}-1.0.0.json"
+    with open(config_path, "r") as config_file:
+        raw_config = json.load(config_file)
+    # attach_assets returns a copy with the asset contents attached
+    return attach_assets(raw_config, path)
+
+
+# Wrap an integration config in a saved-object envelope with a deterministic id
+def bundle_integration(integration: dict):
+    # UUID5 over name/type/version: the same integration always gets the same id
+    identity = [integration["name"], integration["type"], integration["version"]]
+    obj_id = uuid.uuid5(NAMESPACE_OS_CATALOG, min_json(identity))
+    saved_object = {
+        "type": "integration-template",
+        "id": str(obj_id),
+        "updated_at": datetime.utcnow().isoformat(),
+        "attributes": integration,
+    }
+    return saved_object
+
+
+# Some integrations ship a lot of sample data; when the serialized object exceeds
+# the size limit, keep only the first 100 sample records and tell the user.
+def check_truncate_sample(integration: dict, dir: str):
+    sample = integration["attributes"].get("sampleData")
+    if not sample or len(min_json(integration)) <= OS_OBJECT_SIZE_LIMIT:
+        return
+    print(f"{dir}: Integration too large! Truncating sample data to 100 records")
+    # The attached data is a JSON string: decode, slice, then re-minify in place
+    records = json.loads(sample["data"])
+    sample["data"] = min_json(records[:100])
+
+
+def convert_integration(integration_dir: str) -> str | None:
+    # Pipeline: read the folder, wrap as a saved object, trim oversized sample data
+    saved_object = bundle_integration(scan_integration(Path(integration_dir)))
+    check_truncate_sample(saved_object, integration_dir)
+    return min_json(saved_object)
+
+
+@click.command()
+@click.option(
+    "--integrations",
+    type=click.Path(exists=True, dir_okay=True, file_okay=False),
+    help="The directory to scan for integrations",
+)
+@click.option(
+    "--output",
+    type=click.Path(writable=True),
+    help="The destination file to put the bundle (.ndjson)",
+)
+@beartype
+def bundle(integrations: str, output: str) -> bool:
+    """Convert local integration folders into an ndjson bundle."""
+    input_files = glob(str(Path(integrations) / "*"))
+    integ_map = map(convert_integration, input_files)
+
+    # The bundle is ndjson: one saved object per line, then a trailing summary line
+    summary = {"exportedCount": 0, "missingRefCount": 0, "missingReferences": []}
+    with open(Path(output), "w") as bundle_file:
+        for serialized in filter(None, integ_map):
+            bundle_file.write(f"{serialized}\n")
+            summary["exportedCount"] += 1
+        bundle_file.write(f"{min_json(summary)}\n")
+    exported = summary["exportedCount"]
+    print(f"Wrote {exported} integrations to {output}")
+    # Status code: 1 when some inputs were skipped, otherwise 0
+    return int(len(input_files) > exported)
+
+if __name__ == "__main__":
+    # `bundle` is a click command, so invoke it with CLI-style arguments.
+    # Inputs: The directory to scan for integrations, and where to put the bundle
+    cli_args = ["--integrations", "repository", "--output", "integrations_bundle.ndjson"]
+    sys.exit(bundle.main(args=cli_args, standalone_mode=False))
diff --git a/cli/src/main.py b/cli/src/main.py
index 2287d08..d885d0e 100644
--- a/cli/src/main.py
+++ b/cli/src/main.py
@@ -2,6 +2,7 @@
 
 from .diff import diff
 from .scanviz import scanviz
+from .bundle import bundle
 
 
 @click.group()
@@ -12,6 +13,7 @@ def cli():
 
 cli.add_command(diff)
 cli.add_command(scanviz)
+cli.add_command(bundle)
 
 
 if __name__ == "__main__":

From a50f6990f3deb210949cd08d20dcd49791db9839 Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Thu, 8 Feb 2024 16:47:07 -0800
Subject: [PATCH 7/8] Upgrade requirements

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/requirements.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cli/requirements.txt b/cli/requirements.txt
index 01df2a1..afd56c0 100644
--- a/cli/requirements.txt
+++ b/cli/requirements.txt
@@ -1,9 +1,9 @@
-beartype==0.14.1
-black==23.3.0
-click==8.1.3
-isort==5.12.0
+beartype==0.17.0
+black==24.1.1
+click==8.1.7
+isort==5.13.2
 mypy-extensions==1.0.0
 ndjson==0.3.1
-packaging==23.1
-pathspec==0.11.1
-platformdirs==3.8.0
+packaging==23.2
+pathspec==0.12.1
+platformdirs==4.2.0

From c536891937422d7e22ef920c84223e4271d9463e Mon Sep 17 00:00:00 2001
From: Simeon Widdis <sawiddis@amazon.com>
Date: Fri, 22 Mar 2024 10:48:35 -0700
Subject: [PATCH 8/8] Update bundling for new format

Signed-off-by: Simeon Widdis <sawiddis@amazon.com>
---
 cli/setup.py             | 2 +-
 cli/src/bundle/bundle.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/cli/setup.py b/cli/setup.py
index 1d4a88d..582ab0d 100644
--- a/cli/setup.py
+++ b/cli/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="osints",
-    version="0.2.0",
+    version="0.3.0",
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
diff --git a/cli/src/bundle/bundle.py b/cli/src/bundle/bundle.py
index 6c16bc7..86f13e4 100644
--- a/cli/src/bundle/bundle.py
+++ b/cli/src/bundle/bundle.py
@@ -17,8 +17,7 @@
 CONFIG_FIELD_DIR_INFO: dict[str, tuple[str, str]] = {
     "statics": ("static", "bin"),
     "components": ("schemas", "mapping.json"),
-    "savedObjects": ("assets", "ndjson"),
-    "queries": ("assets", "text"),
+    "assets": ("assets", "text"),
     "sampleData": ("data", "json"),
 }
 OS_OBJECT_SIZE_LIMIT = 1_048_576  # 1 MB
@@ -46,7 +45,7 @@ def try_attach_assets(config: dict, path: Path, info: None | tuple[str, str]) ->
             full_path = (
                 path
                 / subdir
-                / f"{config['name']}-{config['version']}.{config['language']}"
+                / f"{config['name']}-{config['version']}.{config['extension']}"
             )
         case (None, _):  # Otherwise, use encoding as extension with name
             full_path = (
@@ -147,7 +146,7 @@ def convert_integration(integration_dir: str) -> str | None:
     help="The destination file to put the bundle (.ndjson)",
 )
 @beartype
-def bundle(integrations: str, output: str) -> bool:
+def bundle(integrations: str, output: str) -> int:
     """Convert local integration folders into an ndjson bundle."""
     input_files = glob(str(Path(integrations) / "*"))
     integ_map = map(convert_integration, input_files)