From 3434af5929b0530cd1102069a59c127dec73d0f1 Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Wed, 26 Jul 2023 10:23:41 -0700 Subject: [PATCH 1/8] Add ip type check to diff command Signed-off-by: Simeon Widdis --- cli/src/diff/diff.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py index 69b1edf..54cb7e4 100644 --- a/cli/src/diff/diff.py +++ b/cli/src/diff/diff.py @@ -1,4 +1,5 @@ import json +import re import click from beartype import beartype @@ -21,6 +22,9 @@ def flat_type_check(expect: str, actual: object) -> dict[str, dict]: case "date": if not isinstance(actual, str) and not isinstance(actual, int): return {"expected": expect, "actual": actual} + case "ip": + if not isinstance(actual, str) or not re.match(r"(\d{1,3}\.){3}\d{1,3}", actual): + return {"expected": expect, "actual": actual} case _: click.secho(f"WARNING: unknown type '{expect}'", err=True, fg="yellow") return {} @@ -116,6 +120,7 @@ def diff(mapping, data, output_json, show_missing): click.echo(json.dumps(check, sort_keys=True)) else: output_diff(check) + quit(0 if check == {} else 1) if __name__ == "__main__": From 202fd8dff8498f850c3a31213b37288da523f286 Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Wed, 26 Jul 2023 10:34:05 -0700 Subject: [PATCH 2/8] Add check-all flag to diff Signed-off-by: Simeon Widdis --- cli/src/diff/diff.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py index 54cb7e4..31cdfc3 100644 --- a/cli/src/diff/diff.py +++ b/cli/src/diff/diff.py @@ -107,20 +107,31 @@ def output_diff(difference: dict[str, object], prefix: str = "") -> None: is_flag=True, help="Output fields that are expected in the mappings but missing in the data", ) -def diff(mapping, data, output_json, show_missing): +@click.option( + "--check-all", + "check_all", + is_flag=True, + help="Check every available data record and report the first one with errors (default: only check first record)" +) +def diff(mapping, data, output_json, show_missing, check_all): """Type check your integration given a sample data record and the appropriate SS4O schema.""" properties = load_mapping(mapping) with open(data, "r") as data_file: data_json = json.load(data_file) - if isinstance(data_json, list): - # Unwrap list of data, assume first record is representative - data_json = data_json[0] - check = do_check(properties, data_json, show_missing) - if output_json: - click.echo(json.dumps(check, sort_keys=True)) - else: - output_diff(check) - quit(0 if check == {} else 1) + if not isinstance(data_json, list): + # Wrap individual data record in a list + data_json = [data_json] + for i, record in enumerate(data_json if check_all else data_json[:1], 1): + check = do_check(properties, record, show_missing) + if check == {}: + continue + if check_all: + click.echo(f"Validation errors found in record {i}", err=True) + if output_json: + click.echo(json.dumps(check, sort_keys=True)) + else: + output_diff(check) + quit(1) if __name__ == "__main__": From dd064f9e781ccfaf33250c1a0839715b193fbb51 Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Wed, 26 Jul 2023 10:56:50 -0700 Subject: [PATCH 3/8] Fix issue with updating nested fields on schema load Signed-off-by: Simeon Widdis --- cli/src/utils/mappings.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cli/src/utils/mappings.py b/cli/src/utils/mappings.py index cdca72a..36c20b6 100644 --- a/cli/src/utils/mappings.py +++ b/cli/src/utils/mappings.py @@ -6,6 +6,16 @@ from beartype import beartype +@beartype +def nested_update(base: dict, updates: dict) -> dict: + for k, v in updates.items(): + if isinstance(v, dict): + base[k] = nested_update(base.get(k, {}), v) + else: + base[k] = v + return base + + @beartype def load_mapping(mapping: str) -> dict[str, dict]: with open(mapping, "r") as mapping_file: @@ -39,5 +49,5 @@ def load_mapping(mapping: str) -> dict[str, dict]: err=True, fg="yellow", ) - properties.update(load_mapping(item_glob[0])) + nested_update(properties, load_mapping(item_glob[0])) return properties From bb68442d011edd5e428b1242f02610c4843b0a2a Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Wed, 26 Jul 2023 10:57:02 -0700 Subject: [PATCH 4/8] Add double checking to diff Signed-off-by: Simeon Widdis --- cli/src/diff/diff.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py index 31cdfc3..8d8aa17 100644 --- a/cli/src/diff/diff.py +++ b/cli/src/diff/diff.py @@ -16,6 +16,9 @@ def flat_type_check(expect: str, actual: object) -> dict[str, dict]: case "long" | "integer": if not isinstance(actual, int): return {"expected": expect, "actual": actual} + case "double": + if not isinstance(actual, float): + return {"expected": expect, "actual": actual} case "alias": # We assume aliases were already unwrapped by the caller and ignore them. return {} From dbb1c7453dc620bd7f07f21bf37eb56a4e8ea44d Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Thu, 8 Feb 2024 16:22:48 -0800 Subject: [PATCH 5/8] Improve data type recognition Signed-off-by: Simeon Widdis --- cli/src/diff/diff.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cli/src/diff/diff.py b/cli/src/diff/diff.py index 2254267..afb2c8e 100644 --- a/cli/src/diff/diff.py +++ b/cli/src/diff/diff.py @@ -13,12 +13,15 @@ def flat_type_check(expect: str, actual: object) -> dict[str, dict]: case "text" | "keyword": if not isinstance(actual, str): return {"expected": expect, "actual": actual} - case "long" | "integer": + case "long" | "integer" | "short": if not isinstance(actual, int): return {"expected": expect, "actual": actual} - case "double": + case "double" | "float" | "half_float": if not isinstance(actual, float): return {"expected": expect, "actual": actual} + case "boolean": + if not isinstance(actual, bool): + return {"expected": expect, "actual": actual} case "alias": # We assume aliases were already unwrapped by the caller and ignore them. return {} From d2cd70121301ca63bed602b6424b87a7c96d104a Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Thu, 8 Feb 2024 16:41:06 -0800 Subject: [PATCH 6/8] Copy bundle script as command Signed-off-by: Simeon Widdis --- cli/.gitignore | 2 + cli/requirements.txt | 1 + cli/src/bundle/__init__.py | 1 + cli/src/bundle/bundle.py | 171 +++++++++++++++++++++++++++++++++++++ cli/src/main.py | 2 + 5 files changed, 177 insertions(+) create mode 100644 cli/src/bundle/__init__.py create mode 100644 cli/src/bundle/bundle.py diff --git a/cli/.gitignore b/cli/.gitignore index ad4a1f1..bae087f 100644 --- a/cli/.gitignore +++ b/cli/.gitignore @@ -1,3 +1,5 @@ +data + # Created by https://www.toptal.com/developers/gitignore/api/python # Edit at https://www.toptal.com/developers/gitignore?templates=python diff --git a/cli/requirements.txt b/cli/requirements.txt index e9cd148..01df2a1 100644 --- a/cli/requirements.txt +++ b/cli/requirements.txt @@ -3,6 +3,7 @@ black==23.3.0 click==8.1.3 isort==5.12.0 mypy-extensions==1.0.0 +ndjson==0.3.1 packaging==23.1 pathspec==0.11.1 platformdirs==3.8.0 diff --git a/cli/src/bundle/__init__.py b/cli/src/bundle/__init__.py new file mode 100644 index 0000000..44262d8 --- /dev/null +++ b/cli/src/bundle/__init__.py @@ -0,0 +1 @@ +from .bundle import bundle diff --git a/cli/src/bundle/bundle.py b/cli/src/bundle/bundle.py new file mode 100644 index 0000000..6c16bc7 --- /dev/null +++ b/cli/src/bundle/bundle.py @@ -0,0 +1,171 @@ +import base64 +import copy +import json +import sys +import typing +import uuid +from datetime import datetime +from glob import glob +from pathlib import Path + +from beartype import beartype +import click +import ndjson + +# Random UUID4, use as namespace for UUID5s for helpful collision identification +NAMESPACE_OS_CATALOG: uuid.UUID = uuid.UUID("f21aff9f-a6b3-43eb-85af-5ce18a880430") +CONFIG_FIELD_DIR_INFO: dict[str, tuple[str, str]] = { + "statics": ("static", "bin"), + "components": ("schemas", "mapping.json"), + "savedObjects": ("assets", "ndjson"), + "queries": ("assets", "text"), + "sampleData": ("data", "json"), +} +OS_OBJECT_SIZE_LIMIT = 1_048_576 # 1 MB + +# Minified JSON serialization helper +# Due to hashing and object size limits we want to be consistent about whitespace and key ordering +min_json = lambda obj: json.dumps(obj, separators=(",", ":"), sort_keys=True) + + +def try_attach_assets(config: dict, path: Path, info: None | tuple[str, str]) -> bool: + # Guard clauses: Skip anything that can't be read + if not info: + return False + if not ("path" in config or ("name" in config and "version" in config)): + return False + subdir, encoding = info + + # Read data + read_mode = "r" if encoding != "bin" else "rb" + match (config.get("path"), encoding): + case ( + None, + "text", + ): # If no path and text encoding, rely on language for extension + full_path = ( + path + / subdir + / f"{config['name']}-{config['version']}.{config['language']}" + ) + case (None, _): # Otherwise, use encoding as extension with name + full_path = ( + path / subdir / f"{config['name']}-{config['version']}.{encoding}" + ) + case (_, _): # If a path is present, use it regardless of specified encoding + full_path = path / subdir / config["path"] + with open(full_path, read_mode) as data_file: + data = data_file.read() + + # Attach data to config + match encoding: + case "bin": + config["data"] = str(base64.b64encode(data), encoding="ascii") + case "mapping.json" | "json": + config["data"] = min_json(json.loads(data)) + case "ndjson": + config["data"] = min_json(ndjson.loads(data)) + case "text": + config["data"] = data + return True + + +def attach_assets_in_place( + config: typing.Any, path: Path, info: None | tuple[str, str] = None +) -> None: + if not isinstance(config, list) and not isinstance(config, dict): + return + if isinstance(config, list): + for item in config: + attach_assets_in_place(item, path, info) + return + if try_attach_assets(config, path, info): + return + for key, value in config.items(): + info = CONFIG_FIELD_DIR_INFO.get(key, info) + attach_assets_in_place(value, path, info) + + +def attach_assets(config: dict, path: Path) -> dict: + config = copy.deepcopy(config) + attach_assets_in_place(config, path) + return config + + +# Serialize integration as local dictionary +def scan_integration(path: Path) -> dict: + integration_name = path.stem + # TODO detect latest version instead of defaulting to 1.0.0 + with open(path / f"{integration_name}-1.0.0.json", "r") as config_file: + config = json.load(config_file) + config = attach_assets(config, path) + return config + + +# Convert an integration json config to a full saved object +def bundle_integration(integration: dict): + obj_id = uuid.uuid5( + NAMESPACE_OS_CATALOG, + min_json([integration["name"], integration["type"], integration["version"]]), + ) + return { + "type": "integration-template", + "id": str(obj_id), + "updated_at": datetime.utcnow().isoformat(), + "attributes": integration, + } + + +# If the object is too large, truncate sample data and inform user +# Necessary as some integrations have lots of sample data +def check_truncate_sample(integration: dict, dir: str): + if not integration["attributes"].get("sampleData"): + return + if len(min_json(integration)) > OS_OBJECT_SIZE_LIMIT: + print(f"{dir}: Integration too large! Truncating sample data to 100 records") + data = integration["attributes"]["sampleData"]["data"] + data = min_json(json.loads(data)[:100]) + integration["attributes"]["sampleData"]["data"] = data + + +def convert_integration(integration_dir: str) -> str | None: + scanned = scan_integration(Path(integration_dir)) + saved_object = bundle_integration(scanned) + check_truncate_sample(saved_object, integration_dir) + return min_json(saved_object) + + +@click.command() +@click.option( + "--integrations", + type=click.Path(exists=True, dir_okay=True, file_okay=False), + help="The directory to scan for integrations", +) +@click.option( + "--output", + type=click.Path(writable=True), + help="The destination file to put the bundle (.ndjson)", +) +@beartype +def bundle(integrations: str, output: str) -> bool: + """Convert local integration folders into an ndjson bundle.""" + input_files = glob(str(Path(integrations) / "*")) + integ_map = map(convert_integration, input_files) + + summary = {"exportedCount": 0, "missingRefCount": 0, "missingReferences": []} + with open(Path(output), "w") as bundle: + for integration in integ_map: + if not integration: + continue + bundle.write(f"{integration}\n") + summary["exportedCount"] += 1 + bundle.write(f"{min_json(summary)}\n") + print(f"Wrote {summary['exportedCount']} integrations to {output}") + + return int(len(input_files) > summary["exportedCount"]) # Status code 0 or 1 + +if __name__ == "__main__": + # Inputs: The directory to scan for integrations, and where to put the bundle + input_files = glob("repository/*") + output_path = Path("integrations_bundle.ndjson") + sys.exit(bundle(input_files, output_path)) diff --git a/cli/src/main.py b/cli/src/main.py index 2287d08..d885d0e 100644 --- a/cli/src/main.py +++ b/cli/src/main.py @@ -2,6 +2,7 @@ from .diff import diff from .scanviz import scanviz +from .bundle import bundle @click.group() @@ -12,6 +13,7 @@ def cli(): cli.add_command(diff) cli.add_command(scanviz) +cli.add_command(bundle) if __name__ == "__main__": From a50f6990f3deb210949cd08d20dcd49791db9839 Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Thu, 8 Feb 2024 16:47:07 -0800 Subject: [PATCH 7/8] Upgrade requirements Signed-off-by: Simeon Widdis --- cli/requirements.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cli/requirements.txt b/cli/requirements.txt index 01df2a1..afd56c0 100644 --- a/cli/requirements.txt +++ b/cli/requirements.txt @@ -1,9 +1,9 @@ -beartype==0.14.1 -black==23.3.0 -click==8.1.3 -isort==5.12.0 +beartype==0.17.0 +black==24.1.1 +click==8.1.7 +isort==5.13.2 mypy-extensions==1.0.0 ndjson==0.3.1 -packaging==23.1 -pathspec==0.11.1 -platformdirs==3.8.0 +packaging==23.2 +pathspec==0.12.1 +platformdirs==4.2.0 From c536891937422d7e22ef920c84223e4271d9463e Mon Sep 17 00:00:00 2001 From: Simeon Widdis Date: Fri, 22 Mar 2024 10:48:35 -0700 Subject: [PATCH 8/8] Update bundling for new format Signed-off-by: Simeon Widdis --- cli/setup.py | 2 +- cli/src/bundle/bundle.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cli/setup.py b/cli/setup.py index 1d4a88d..582ab0d 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -2,7 +2,7 @@ setup( name="osints", - version="0.2.0", + version="0.3.0", packages=find_packages(), include_package_data=True, install_requires=[ diff --git a/cli/src/bundle/bundle.py b/cli/src/bundle/bundle.py index 6c16bc7..86f13e4 100644 --- a/cli/src/bundle/bundle.py +++ b/cli/src/bundle/bundle.py @@ -17,8 +17,7 @@ CONFIG_FIELD_DIR_INFO: dict[str, tuple[str, str]] = { "statics": ("static", "bin"), "components": ("schemas", "mapping.json"), - "savedObjects": ("assets", "ndjson"), - "queries": ("assets", "text"), + "assets": ("assets", "text"), "sampleData": ("data", "json"), } OS_OBJECT_SIZE_LIMIT = 1_048_576 # 1 MB @@ -46,7 +45,7 @@ def try_attach_assets(config: dict, path: Path, info: None | tuple[str, str]) -> full_path = ( path / subdir - / f"{config['name']}-{config['version']}.{config['language']}" + / f"{config['name']}-{config['version']}.{config['extension']}" ) case (None, _): # Otherwise, use encoding as extension with name full_path = ( @@ -147,7 +146,7 @@ def convert_integration(integration_dir: str) -> str | None: help="The destination file to put the bundle (.ndjson)", ) @beartype -def bundle(integrations: str, output: str) -> bool: +def bundle(integrations: str, output: str) -> int: """Convert local integration folders into an ndjson bundle.""" input_files = glob(str(Path(integrations) / "*")) integ_map = map(convert_integration, input_files)