Merge pull request #122 from Swiddis/osints/dev

Add Bundle command to CLI
opensearch-project · Mar 28, 2024 · 5c3ea08 · 5c3ea08
2 parents 7e1b2e9 + c536891
commit 5c3ea08
Show file tree

Hide file tree

Showing 8 changed files with 227 additions and 19 deletions.
diff --git a/cli/.gitignore b/cli/.gitignore
@@ -1,3 +1,5 @@
+data
+
 # Created by https://www.toptal.com/developers/gitignore/api/python
 # Edit at https://www.toptal.com/developers/gitignore?templates=python
 

diff --git a/cli/requirements.txt b/cli/requirements.txt
@@ -1,8 +1,9 @@
-beartype==0.14.1
-black==23.3.0
-click==8.1.3
-isort==5.12.0
+beartype==0.17.0
+black==24.1.1
+click==8.1.7
+isort==5.13.2
 mypy-extensions==1.0.0
-packaging==23.1
-pathspec==0.11.1
-platformdirs==3.8.0
+ndjson==0.3.1
+packaging==23.2
+pathspec==0.12.1
+platformdirs==4.2.0
diff --git a/cli/setup.py b/cli/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="osints",
-    version="0.2.0",
+    version="0.3.0",
     packages=find_packages(),
     include_package_data=True,
     install_requires=[

diff --git a/cli/src/bundle/__init__.py b/cli/src/bundle/__init__.py
@@ -0,0 +1 @@
+from .bundle import bundle
diff --git a/cli/src/bundle/bundle.py b/cli/src/bundle/bundle.py
@@ -0,0 +1,170 @@
+import base64
+import copy
+import json
+import sys
+import typing
+import uuid
+from datetime import datetime
+from glob import glob
+from pathlib import Path
+
+from beartype import beartype
+import click
+import ndjson
+
+# Fixed, randomly generated UUID4. Used as the namespace for the UUID5 ids
+# built in this module, which helps with identifying collisions.
+NAMESPACE_OS_CATALOG: uuid.UUID = uuid.UUID("f21aff9f-a6b3-43eb-85af-5ce18a880430")
+
+# Config field name -> (subdirectory, encoding) used to locate and read the
+# files belonging to that field.
+CONFIG_FIELD_DIR_INFO: dict[str, tuple[str, str]] = {
+    "statics": ("static", "bin"),
+    "components": ("schemas", "mapping.json"),
+    "assets": ("assets", "text"),
+    "sampleData": ("data", "json"),
+}
+
+# Size limit applied to a serialized integration object (1,048,576 characters)
+OS_OBJECT_SIZE_LIMIT = 2**20
+
+# Minified JSON serialization helper
+def min_json(obj: object) -> str:
+    """Serialize ``obj`` as compact JSON with sorted keys.
+
+    Due to hashing and object size limits we want to be consistent about
+    whitespace and key ordering, so every serialization in this module goes
+    through this helper.
+
+    Raises:
+        TypeError: If ``obj`` contains a value ``json.dumps`` cannot serialize.
+    """
+    return json.dumps(obj, separators=(",", ":"), sort_keys=True)
+
+
+def try_attach_assets(config: dict, path: Path, info: None | tuple[str, str]) -> bool:
+    """Read the file referenced by ``config`` and store its contents in ``config["data"]``.
+
+    Args:
+        config: Candidate asset entry. It is only processed when it has a
+            "path" key, or both a "name" and a "version" key.
+        path: Root directory of the integration.
+        info: ``(subdir, encoding)`` pair describing where the file lives under
+            ``path`` and how to read it, or ``None`` if no such pair applies.
+
+    Returns:
+        ``False`` when a guard clause skipped the entry, ``True`` once the file
+        has been read. For an encoding other than "bin", "mapping.json",
+        "json", "ndjson" or "text" the file is read but no data is attached.
+
+    Raises:
+        OSError: If the resolved file cannot be opened.
+        KeyError: If a "text" entry without a path has no "extension" key.
+    """
+    # Guard clauses: Skip anything that can't be read
+    if not info:
+        return False
+    if not ("path" in config or ("name" in config and "version" in config)):
+        return False
+    subdir, encoding = info
+
+    # Work out which file holds the data
+    match (config.get("path"), encoding):
+        case (None, "text"):
+            # If no path and text encoding, rely on language for extension
+            file_name = f"{config['name']}-{config['version']}.{config['extension']}"
+        case (None, _):
+            # Otherwise, use encoding as extension with name
+            file_name = f"{config['name']}-{config['version']}.{encoding}"
+        case (explicit_path, _):
+            # If a path is present, use it regardless of specified encoding
+            file_name = explicit_path
+    full_path = path / subdir / file_name
+
+    # Read data
+    if encoding == "bin":
+        with open(full_path, "rb") as data_file:
+            data = data_file.read()
+    else:
+        # Decode explicitly as UTF-8 so the result does not depend on the
+        # platform's locale encoding
+        with open(full_path, "r", encoding="utf-8") as data_file:
+            data = data_file.read()
+
+    # Attach data to config
+    match encoding:
+        case "bin":
+            config["data"] = str(base64.b64encode(data), encoding="ascii")
+        case "mapping.json" | "json":
+            config["data"] = min_json(json.loads(data))
+        case "ndjson":
+            config["data"] = min_json(ndjson.loads(data))
+        case "text":
+            config["data"] = data
+    return True
+
+
+def attach_assets_in_place(
+    config: typing.Any, path: Path, info: None | tuple[str, str] = None
+) -> None:
+    """Walk ``config`` recursively and attach file data to every asset entry, in place.
+
+    Lists are walked element by element and dicts key by key; any other value
+    is ignored. Once ``try_attach_assets`` accepts a dict, that dict is not
+    descended into any further.
+
+    Args:
+        config: Any value from the integration config tree.
+        path: Root directory of the integration.
+        info: ``(subdir, encoding)`` pair inherited from the enclosing config
+            field, or ``None`` when no enclosing field defines one.
+    """
+    if isinstance(config, list):
+        for item in config:
+            attach_assets_in_place(item, path, info)
+        return
+    if not isinstance(config, dict):
+        return
+    if try_attach_assets(config, path, info):
+        return
+    for key, value in config.items():
+        # Resolve the pair per key without rebinding ``info``: a known field
+        # must only affect its own subtree, not the sibling keys that follow it
+        child_info = CONFIG_FIELD_DIR_INFO.get(key, info)
+        attach_assets_in_place(value, path, child_info)
+
+
+def attach_assets(config: dict, path: Path) -> dict:
+    """Return a deep copy of ``config`` with asset data attached.
+
+    The copy is processed by ``attach_assets_in_place``; the ``config``
+    passed in is left unmodified.
+    """
+    enriched = copy.deepcopy(config)
+    attach_assets_in_place(enriched, path)
+    return enriched
+
+
+# Serialize integration as local dictionary
+def scan_integration(path: Path) -> dict:
+    """Load the integration config found in ``path`` and attach its asset data.
+
+    The config is read from ``<stem>-1.0.0.json`` inside ``path``, where
+    ``<stem>`` is ``path.stem``.
+    """
+    # TODO detect latest version instead of defaulting to 1.0.0
+    config_path = path / f"{path.stem}-1.0.0.json"
+    with open(config_path, "r") as config_file:
+        raw_config = json.load(config_file)
+    return attach_assets(raw_config, path)
+
+
+# Convert an integration json config to a full saved object
+def bundle_integration(integration: dict) -> dict:
+    """Wrap an integration config in an "integration-template" saved object.
+
+    The object id is a UUID5 in ``NAMESPACE_OS_CATALOG`` computed from the
+    integration's name, type and version, so the same triple always yields
+    the same id.
+
+    Raises:
+        KeyError: If ``integration`` lacks "name", "type" or "version".
+    """
+    # Local import: the module only imports ``datetime`` from the datetime package
+    from datetime import timezone
+
+    obj_id = uuid.uuid5(
+        NAMESPACE_OS_CATALOG,
+        min_json([integration["name"], integration["type"], integration["version"]]),
+    )
+    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
+    # tzinfo so the serialized timestamp keeps the same offset-free format
+    updated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
+    return {
+        "type": "integration-template",
+        "id": str(obj_id),
+        "updated_at": updated_at,
+        "attributes": integration,
+    }
+
+
+# If the object is too large, truncate sample data and inform user
+# Necessary as some integrations have lots of sample data
+def check_truncate_sample(integration: dict, dir: str):
+    """Cut the sample data down to its first 100 records when the object is oversized.
+
+    Nothing happens when the integration has no sample data, or when its
+    minified JSON form is within ``OS_OBJECT_SIZE_LIMIT``. Otherwise a notice
+    naming ``dir`` is printed and the sample data is replaced in place.
+    """
+    attributes = integration["attributes"]
+    if not attributes.get("sampleData"):
+        return
+    if len(min_json(integration)) <= OS_OBJECT_SIZE_LIMIT:
+        return
+    print(f"{dir}: Integration too large! Truncating sample data to 100 records")
+    sample = attributes["sampleData"]
+    # The sample data is stored as a JSON string, so decode, slice and re-encode
+    records = json.loads(sample["data"])
+    sample["data"] = min_json(records[:100])
+
+
+def convert_integration(integration_dir: str) -> str | None:
+    """Turn one integration directory into a minified JSON saved-object string.
+
+    The directory is scanned, wrapped as a saved object, checked for oversized
+    sample data, and finally serialized.
+    """
+    saved_object = bundle_integration(scan_integration(Path(integration_dir)))
+    # May shrink the sample data inside ``saved_object`` before serialization
+    check_truncate_sample(saved_object, integration_dir)
+    return min_json(saved_object)
+
+
+@click.command()
+@click.option(
+    "--integrations",
+    type=click.Path(exists=True, dir_okay=True, file_okay=False),
+    help="The directory to scan for integrations",
+)
+@click.option(
+    "--output",
+    type=click.Path(writable=True),
+    help="The destination file to put the bundle (.ndjson)",
+)
+@beartype
+def bundle(integrations: str, output: str) -> int:
+    """Convert local integration folders into an ndjson bundle."""
+    # Every direct child of the integrations directory is treated as one integration
+    input_files = glob(str(Path(integrations) / "*"))
+    # Lazy: each entry is only converted when the write loop below reaches it
+    converted = (convert_integration(entry) for entry in input_files)
+
+    summary = {"exportedCount": 0, "missingRefCount": 0, "missingReferences": []}
+    with open(Path(output), "w") as out_file:
+        for line in converted:
+            if line:
+                out_file.write(f"{line}\n")
+                summary["exportedCount"] += 1
+        # The last line of the bundle is the export summary
+        out_file.write(f"{min_json(summary)}\n")
+    print(f"Wrote {summary['exportedCount']} integrations to {output}")
+
+    # Status code 0 or 1: 1 when fewer integrations were written than were found
+    return int(len(input_files) > summary["exportedCount"])
+
+if __name__ == "__main__":
+    # ``bundle`` is a click command, so it is invoked without positional Python
+    # arguments and click parses the options from the command line.
+    # The script's default input directory and output file are supplied through
+    # ``default_map`` and can be overridden with --integrations / --output.
+    bundle(
+        default_map={
+            "integrations": "repository",
+            "output": "integrations_bundle.ndjson",
+        }
+    )
diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py
@@ -1,4 +1,5 @@
 import json
+import re
 
 import click
 from beartype import beartype
@@ -12,15 +13,24 @@ def flat_type_check(expect: str, actual: object) -> dict[str, dict]:
         case "text" | "keyword":
             if not isinstance(actual, str):
                 return {"expected": expect, "actual": actual}
-        case "long" | "integer":
+        case "long" | "integer" | "short":
             if not isinstance(actual, int):
                 return {"expected": expect, "actual": actual}
+        case "double" | "float" | "half_float":
+            if not isinstance(actual, float):
+                return {"expected": expect, "actual": actual}
+        case "boolean":
+            if not isinstance(actual, bool):
+                return {"expected": expect, "actual": actual}
         case "alias":
             # We assume aliases were already unwrapped by the caller and ignore them.
             return {}
         case "date":
             if not isinstance(actual, str) and not isinstance(actual, int):
                 return {"expected": expect, "actual": actual}
+        case "ip":
+            if not isinstance(actual, str) or not re.match(r"(\d{1,3}\.){3}\d{1,3}", actual):
+                return {"expected": expect, "actual": actual}
         case _:
             click.secho(f"WARNING: unknown type '{expect}'", err=True, fg="yellow")
     return {}
@@ -103,19 +113,31 @@ def output_diff(difference: dict[str, object], prefix: str = "") -> None:
     is_flag=True,
     help="Output fields that are expected in the mappings but missing in the data",
 )
-def diff(mapping, data, output_json, show_missing):
+@click.option(
+    "--check-all",
+    "check_all",
+    is_flag=True,
+    help="Check every available data record and report the first one with errors (default: only check first record)"
+)
+def diff(mapping, data, output_json, show_missing, check_all):
     """Type check your integration given a sample data record and the appropriate SS4O schema."""
     properties = load_mapping(mapping)
     with open(data, "r") as data_file:
         data_json = json.load(data_file)
-    if isinstance(data_json, list):
-        # Unwrap list of data, assume first record is representative
-        data_json = data_json[0]
-    check = do_check(properties, data_json, show_missing)
-    if output_json:
-        click.echo(json.dumps(check, sort_keys=True))
-    else:
-        output_diff(check)
+    if not isinstance(data_json, list):
+        # Wrap individual data record in a list
+        data_json = [data_json]
+    for i, record in enumerate(data_json if check_all else data_json[:1], 1):
+        check = do_check(properties, record, show_missing)
+        if check == {}:
+            continue
+        if check_all:
+            click.echo(f"Validation errors found in record {i}", err=True)
+        if output_json:
+            click.echo(json.dumps(check, sort_keys=True))
+        else:
+            output_diff(check)
+        quit(1)
 
 
 if __name__ == "__main__":

diff --git a/cli/src/main.py b/cli/src/main.py
@@ -2,6 +2,7 @@
 
 from .diff import diff
 from .scanviz import scanviz
+from .bundle import bundle
 
 
 @click.group()
@@ -12,6 +13,7 @@ def cli():
 
 cli.add_command(diff)
 cli.add_command(scanviz)
+cli.add_command(bundle)
 
 
 if __name__ == "__main__":

diff --git a/cli/src/utils/mappings.py b/cli/src/utils/mappings.py
@@ -6,6 +6,16 @@
 from beartype import beartype
 
 
+@beartype
+def nested_update(base: dict, updates: dict) -> dict:
+    """Recursively merge ``updates`` into ``base`` in place and return ``base``.
+
+    Dict values in ``updates`` are merged key by key into the dict already
+    stored under the same key in ``base``; every other value overwrites the
+    existing entry.
+    """
+    for k, v in updates.items():
+        if isinstance(v, dict):
+            existing = base.get(k)
+            # A missing or non-dict value cannot be merged into, so start from
+            # an empty dict instead of passing a non-dict as ``base``
+            target = existing if isinstance(existing, dict) else {}
+            base[k] = nested_update(target, v)
+        else:
+            base[k] = v
+    return base
+
+
 @beartype
 def load_mapping(mapping: str) -> dict[str, dict]:
     with open(mapping, "r") as mapping_file:
@@ -39,5 +49,5 @@ def load_mapping(mapping: str) -> dict[str, dict]:
                 err=True,
                 fg="yellow",
             )
-        properties.update(load_mapping(item_glob[0]))
+        nested_update(properties, load_mapping(item_glob[0]))
     return properties