Add sumologic source #1078

**Open** · wants to merge 21 commits into `master`
34 changes: 34 additions & 0 deletions cartography/cli.py
@@ -419,6 +419,31 @@ def _build_parser(self):
'The crowdstrike URL, if using self-hosted. Defaults to the public crowdstrike API URL otherwise.'
),
)
parser.add_argument(
'--sumologic-access-id',
type=str,
default=None,
help=(
'The sumologic access id for authentication.'
),
)
parser.add_argument(
'--sumologic-access-key-env-var',
type=str,
default=None,
help=(
'The name of the environment variable containing the sumologic access key for authentication.'
),
)
parser.add_argument(
'--sumologic-api-url',
type=str,
default=None,
help=(
'The URL of the target Sumologic API instance - https://help.sumologic.com/'
'docs/api/getting-started/#sumo-logic-endpoints-by-deployment-and-firewall-security.'
),
)
parser.add_argument(
'--gsuite-auth-method',
type=str,
@@ -569,6 +594,15 @@ def main(self, argv: str) -> int:
else:
config.crowdstrike_client_secret = None

# Sumologic config
if config.sumologic_access_key_env_var:
logger.debug(
f"Reading access key for sumologic from environment variable {config.sumologic_access_key_env_var}",
)
config.sumologic_access_key = os.environ.get(config.sumologic_access_key_env_var)
else:
config.sumologic_access_key = None

# GSuite config
if config.gsuite_tokens_env_var:
logger.debug(f"Reading config string for GSuite from environment variable {config.gsuite_tokens_env_var}")
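The pattern added above for Sumo Logic mirrors how cartography handles its other secrets: only the *name* of an environment variable crosses the CLI, and the value itself is resolved at startup. A minimal standalone sketch of that indirection (function and variable names here are illustrative, not cartography's API):

```python
import os


def resolve_secret(env_var_name):
    """Resolve a secret from the environment variable whose *name* was
    passed on the CLI; return None when the flag was not supplied or
    the variable is unset."""
    if not env_var_name:
        return None
    return os.environ.get(env_var_name)


os.environ["SUMO_ACCESSKEY"] = "s3cret"   # simulate the operator's shell
print(resolve_secret("SUMO_ACCESSKEY"))   # -> s3cret
print(resolve_secret(None))               # -> None
```

This keeps the secret out of shell history and process listings, which is why the flag is `--sumologic-access-key-env-var` rather than `--sumologic-access-key`.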
6 changes: 6 additions & 0 deletions cartography/config.py
@@ -131,6 +131,9 @@ def __init__(
crowdstrike_client_id=None,
crowdstrike_client_secret=None,
crowdstrike_api_url=None,
sumologic_access_id=None,
sumologic_access_key=None,
sumologic_api_url=None,
gsuite_auth_method=None,
gsuite_config=None,
):
@@ -174,5 +177,8 @@ def __init__(
self.crowdstrike_client_id = crowdstrike_client_id
self.crowdstrike_client_secret = crowdstrike_client_secret
self.crowdstrike_api_url = crowdstrike_api_url
self.sumologic_access_id = sumologic_access_id
self.sumologic_access_key = sumologic_access_key
self.sumologic_api_url = sumologic_api_url
self.gsuite_auth_method = gsuite_auth_method
self.gsuite_config = gsuite_config
2 changes: 2 additions & 0 deletions cartography/data/indexes.cypher
@@ -451,3 +451,5 @@ CREATE INDEX IF NOT EXISTS FOR (n:KubernetesSecret) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:KubernetesService) ON (n.id);
CREATE INDEX IF NOT EXISTS FOR (n:KubernetesService) ON (n.name);
CREATE INDEX IF NOT EXISTS FOR (n:KubernetesService) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:SumologicHost) ON (n.hostname);
CREATE INDEX IF NOT EXISTS FOR (n:SumologicHost) ON (n.lastupdated);
10 changes: 10 additions & 0 deletions cartography/data/jobs/cleanup/sumologic_import_cleanup.json
@@ -0,0 +1,10 @@
{
"statements": [
{
"query": "WITH datetime()-duration('P7D') AS threshold MATCH (h:SumologicHost) WHERE h.lastupdated < threshold WITH h LIMIT $LIMIT_SIZE DETACH DELETE (h)",
> **Collaborator:** Can you tell us how this works?
>
> **Author:** This is the same cleanup as before, but it deletes only entries whose `lastupdated` is older than 7 days, as per #1015.

"iterative": true,
"iterationsize": 100
}
],
"name": "cleanup sumologic"
}
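The reply above describes the intent: delete only `SumologicHost` nodes whose `lastupdated` falls before `now - 7 days`, in batches of `iterationsize` until none remain. A pure-Python illustration of those semantics (the real job runs the Cypher above inside Neo4j; this only models the threshold and batching logic):

```python
from datetime import datetime, timedelta


def cleanup_stale_hosts(hosts, now, batch_size=100):
    """Model of the iterative cleanup: repeatedly delete up to
    batch_size hosts whose lastupdated is older than 7 days,
    until no stale hosts remain."""
    threshold = now - timedelta(days=7)
    deleted = 0
    while True:
        batch = [h for h in hosts if h["lastupdated"] < threshold][:batch_size]
        if not batch:
            return hosts, deleted
        for h in batch:
            hosts.remove(h)
        deleted += len(batch)


now = datetime(2023, 1, 15)
hosts = [
    {"hostname": "fresh", "lastupdated": now - timedelta(days=1)},
    {"hostname": "stale", "lastupdated": now - timedelta(days=30)},
]
remaining, deleted = cleanup_stale_hosts(hosts, now)
print([h["hostname"] for h in remaining], deleted)  # -> ['fresh'] 1
```

Batching with `LIMIT $LIMIT_SIZE` keeps individual transactions small, so a large backlog of stale nodes cannot blow up a single Neo4j transaction.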
63 changes: 63 additions & 0 deletions cartography/intel/sumologic/__init__.py
@@ -0,0 +1,63 @@
"""
cartography/intel/sumologic
"""
import logging

import neo4j

from cartography.config import Config
from cartography.intel.sumologic.endpoints import sync_hosts
from cartography.stats import get_stats_client
from cartography.util import merge_module_sync_metadata
from cartography.util import run_cleanup_job
from cartography.util import timeit

logger = logging.getLogger(__name__)
stat_handler = get_stats_client(__name__)


@timeit
def start_sumologic_ingestion(
neo4j_session: neo4j.Session,
config: Config,
) -> None:
"""
Perform ingestion of Sumologic data.
:param neo4j_session: Neo4J session for database interface
:param config: A cartography.config object
:return: None
"""
common_job_parameters = {
"UPDATE_TAG": config.update_tag,
}
if not config.sumologic_access_id or not config.sumologic_access_key:
logger.error("sumologic config not found")
return

authorization = (
config.sumologic_access_id,
config.sumologic_access_key,
config.sumologic_api_url,
)
sync_hosts(
neo4j_session,
config.update_tag,
authorization,
)
run_cleanup_job(
"sumologic_import_cleanup.json",
neo4j_session,
common_job_parameters,
)

group_id = "public"
if config.sumologic_api_url:
group_id = config.sumologic_api_url
merge_module_sync_metadata(
neo4j_session,
group_type="sumologic",
group_id=group_id,
synced_type="sumologic",
update_tag=config.update_tag,
stat_handler=stat_handler,
)
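Two small decisions in `start_sumologic_ingestion` are worth calling out: the sync is skipped entirely unless both the access id and the access key resolved, and the sync-metadata `group_id` falls back to `"public"` when no API URL was supplied. In isolation (helper names here are illustrative, not the module's API):

```python
def should_sync(access_id, access_key):
    """The ingestion returns early unless both credentials are present."""
    return bool(access_id and access_key)


def resolve_group_id(sumologic_api_url):
    """Use the API URL as the metadata group id, else 'public'."""
    return sumologic_api_url if sumologic_api_url else "public"


print(should_sync("idX", None))   # -> False
print(should_sync("idX", "key"))  # -> True
print(resolve_group_id(None))     # -> public
print(resolve_group_id("https://api.us2.sumologic.com/api"))
```

Logging an error and returning (rather than raising) matches how other cartography intel modules degrade when a module's credentials are absent.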
55 changes: 55 additions & 0 deletions cartography/intel/sumologic/endpoints.py
@@ -0,0 +1,55 @@
"""
cartography/intel/sumologic/endpoints
"""
# pylint: disable=missing-function-docstring,too-many-arguments
import logging
from typing import Dict
from typing import List
from typing import Tuple

import neo4j

from .util import get_sumologic_hosts
from cartography.util import timeit

logger = logging.getLogger(__name__)


@timeit
def sync_hosts(
neo4j_session: neo4j.Session,
update_tag: int,
authorization: Tuple[str, str, str],
) -> None:
sumologic_hosts_list = get_sumologic_hosts(authorization)
for host_data in sumologic_hosts_list:
load_host_data(neo4j_session, host_data, update_tag)


def load_host_data(
neo4j_session: neo4j.Session,
data: List[Dict],
update_tag: int,
) -> None:
"""
Transform and load scan information
"""
ingestion_cypher_query = """
UNWIND $Hosts AS host
MERGE (h:SumologicHost{hostname: host.hostname})
ON CREATE SET h.hostname = host.hostname,
h.sumologic_instance = host.instance,
h.tool_first_seen = host.firstseen
SET h.short_hostname = host.short_hostname,
h.tool_last_seen = host.lastseen,
h.sumologic_bu = host.bu,
h.sumologic_dc = host.dc,
h.modified_timestamp = host.modified_timestamp,
h.lastupdated = $update_tag
"""
logger.info("Loading %s sumologic hosts.", len(data))
neo4j_session.run(
ingestion_cypher_query,
Hosts=data,
update_tag=update_tag,
)
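The Cypher above distinguishes create-only properties (`ON CREATE SET`, e.g. `tool_first_seen`) from properties refreshed on every sync (`SET`, e.g. `tool_last_seen`, `lastupdated`). A dictionary-based sketch of the same upsert semantics, useful for reasoning about what a re-sync does to an existing node (a model, not cartography code):

```python
def upsert_host(graph, host, update_tag):
    """Upsert keyed on hostname: set first-seen fields only on creation,
    refresh last-seen fields on every sync."""
    node = graph.setdefault(host["hostname"], {
        "hostname": host["hostname"],          # ON CREATE SET
        "tool_first_seen": host["firstseen"],  # ON CREATE SET
    })
    node.update({                              # SET (every run)
        "tool_last_seen": host["lastseen"],
        "lastupdated": update_tag,
    })
    return node


graph = {}
upsert_host(graph, {"hostname": "web1", "firstseen": "t0", "lastseen": "t1"}, 100)
upsert_host(graph, {"hostname": "web1", "firstseen": "t5", "lastseen": "t9"}, 200)
node = graph["web1"]
print(node["tool_first_seen"], node["tool_last_seen"], node["lastupdated"])
# -> t0 t9 200
```

Because `MERGE` keys on `hostname` alone, the earliest observation survives across syncs while the freshness markers keep moving forward.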
80 changes: 80 additions & 0 deletions cartography/intel/sumologic/util.py
@@ -0,0 +1,80 @@
"""
cartography/intel/sumologic/util
"""
# pylint: disable=invalid-name,broad-except
import datetime
import json
import logging
from typing import List
from typing import Tuple

from msticpy.data.data_providers import QueryProvider

logger = logging.getLogger(__name__)


def get_sumologic_hosts(
authorization: Tuple[str, str, str],
timeout_max: int = 600,
) -> List:
"""
Get Sumologic (Logging) coverage inventory

The timeout should be adapted to context, mostly the size of the indexes and the searched time period.

https://help.sumologic.com/docs/api/search-job/
https://msticpy.readthedocs.io/en/latest/data_acquisition/DataProv-Sumologic.html
"""

(
sumologic_access_id,
sumologic_access_key,
sumologic_server_url,
) = authorization
end_time = datetime.datetime.now()
start_time = end_time - datetime.timedelta(hours=1)

qry_prov = QueryProvider("Sumologic")
qry_prov.connect(
connection_str=sumologic_server_url,
accessid=sumologic_access_id,
accesskey=sumologic_access_key,
)

# _sourceCategory, _sourceHost, and _collector may yield multiple entries for the same host.
# The exact query depends on your context, on how data is structured in your platform,
# and on the existence of a hostname field (through a Field Extraction Rule, for example);
# alternatively, use the Sumologic Cloud SIEM Enterprise normalized schema if applicable.
# | formatDate(_messageTime,"yyyy/dd/MM HH:mm:ss") as date
# | formatDate(_messageTime,"yyyy/MM/dd'T'HH:mm:ss'Z'") as date - NOK
# | formatDate(_messageTime,"yyyy-MM-dd HH:mm:ss") as date - NOK
vm_assets_query = r"""(_sourceCategory=*/*/SERVER or _sourceCategory=*/*/NXLOG_*)
| formatDate(_messageTime,"yyyy-MM-dd'T'HH:mm:ss'Z'") as date
| tolowercase(replace(hostname, /\..*$/, "")) as short_hostname
| tolowercase(hostname) as hostname
| first(date) as lastseen, last(date) as firstseen by bu,dc,hostname,short_hostname
| count bu,dc,hostname,short_hostname,firstseen,lastseen
| fields bu,dc,hostname,short_hostname,firstseen,lastseen"""
df_vm_assets = qry_prov.exec_query(
vm_assets_query,
start_time=start_time,
end_time=end_time,
timeout=timeout_max,
verbosity=2,
)

# Strip the "map." prefix from column names. Note: str.lstrip("map.") would
# strip any of the characters {m, a, p, .} from the left, not the literal prefix.
df_vm_assets.columns = df_vm_assets.columns.str.replace(r"^map\.", "", regex=True)
df_vm_assets["instance"] = sumologic_server_url.replace(
".sumologic.com/api",
"",
).replace("https://api.", "")

logger.info("SumologicHosts count final: %s", df_vm_assets.shape[0])

if df_vm_assets.shape[0]:
flatten_data = json.loads(df_vm_assets.to_json(orient="records"))
logger.debug("Example: %s", flatten_data[0])
return flatten_data

logger.warning("No data returned")
return []
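The `instance` derivation above turns the API endpoint into a short deployment name by stripping the URL scaffolding. Isolated for clarity (the same two `str.replace` calls as in the function):

```python
def instance_from_api_url(sumologic_server_url):
    """Derive the short deployment name (e.g. 'us2') from the API URL."""
    return sumologic_server_url.replace(
        ".sumologic.com/api", "",
    ).replace("https://api.", "")


print(instance_from_api_url("https://api.us2.sumologic.com/api"))  # -> us2
```

Storing the short deployment name on each host makes it easy to tell which Sumologic instance a node came from when several deployments feed the same graph.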
2 changes: 2 additions & 0 deletions cartography/sync.py
@@ -25,6 +25,7 @@
import cartography.intel.kubernetes
import cartography.intel.oci
import cartography.intel.okta
import cartography.intel.sumologic
from cartography.config import Config
from cartography.stats import set_stats_client
from cartography.util import STATUS_FAILURE
@@ -44,6 +45,7 @@
'cve': cartography.intel.cve.start_cve_ingestion,
'oci': cartography.intel.oci.start_oci_ingestion,
'okta': cartography.intel.okta.start_okta_ingestion,
'sumologic': cartography.intel.sumologic.start_sumologic_ingestion,
'github': cartography.intel.github.start_github_ingestion,
'digitalocean': cartography.intel.digitalocean.start_digitalocean_ingestion,
'kubernetes': cartography.intel.kubernetes.start_k8s_ingestion,
15 changes: 15 additions & 0 deletions docs/root/modules/sumologic/config.md
@@ -0,0 +1,15 @@
## Sumologic Configuration

.. _sumologic_config:

Follow these steps to collect Sumologic hosts data with Cartography:

1. Set up an Access key as per https://help.sumologic.com/docs/manage/security/access-keys/
1. Populate an environment variable with the access key generated in the previous step (for example, `SUMO_ACCESSKEY`)
1. Call the `cartography` CLI with:
```bash
--sumologic-access-id xxx \
--sumologic-access-key-env-var SUMO_ACCESSKEY \
--sumologic-api-url https://api.us2.sumologic.com/api
```
See https://help.sumologic.com/docs/api/getting-started/ for the API URL of your deployment.
18 changes: 18 additions & 0 deletions docs/root/modules/sumologic/schema.md
@@ -0,0 +1,18 @@
## Sumologic Schema

.. _sumologic_schema:

### Sumologic core platform

Representation of a system sending logs to the Sumologic core platform. Be aware that this may vary depending on your environment: the default code expects `_sourceCategory` to be formatted as `BU/DC/SOURCE_TYPE` (business unit; datacenter or location; and source type), with a source type of `SERVER` or `NXLOG_*` to match the targeted systems (Linux, Windows, ...). Some fields may need to be set through [Field Extraction Rules](https://help.sumologic.com/docs/manage/field-extractions/).

| Field | Description |
|-------|-------------|
|tool_first_seen| Timestamp of the earliest log seen for the host since the first sync|
> **Collaborator:** I think you meant to add the schema for the hosts?
>
> **Author:** I don't understand the comment. Regarding `tool_first_seen`: it is bounded by the time range given to the Sumologic query. If the query only covers the last 7 days of logs, the earliest first-seen is only 7 days old, even if the logging platform holds more history. Querying over a long time period is expensive, but if the platform (i.e. the cartography server) lives long enough, the value converges.
|tool_last_seen| Timestamp of the most recent log seen for the host as of the last sync|
|lastupdated| Timestamp of the last time the node was updated|
|**hostname**| The hostname (computer name)|
|short_hostname| The short hostname, lowercase|
|bu| The business unit|
|dc| The datacenter|
2 changes: 2 additions & 0 deletions setup.py
@@ -59,6 +59,8 @@
"pdpyras>=4.3.0",
"crowdstrike-falconpy>=0.5.1",
"python-dateutil",
"msticpy[azure]>=2.2.0",
"sumologic-sdk>=0.1.13",
],
extras_require={
':python_version<"3.7"': [
11 changes: 11 additions & 0 deletions tests/data/sumologic/sumologic_endpoints.py
@@ -0,0 +1,11 @@
GET_HOSTS = [
{
"firstseen": "2020-12-01T10:20:30Z",
"lastseen": "2022-12-15T15:15:15Z",
"hostname": "sumohostname.example.com",
"bu": "myBU",
"short_hostname": "sumohostname",
"dc": "REGION1",
"instance": "us2",
},
]