Merge pull request #193 from splunk/improved_data_sources
better data source handling
pyth0n1c authored Jul 25, 2024
2 parents 7b10d64 + 85bd7c8 commit 5bd600c
Showing 15 changed files with 400 additions and 121 deletions.
15 changes: 14 additions & 1 deletion contentctl/actions/build.py
@@ -10,6 +10,8 @@
from contentctl.output.conf_writer import ConfWriter
from contentctl.output.ba_yml_output import BAYmlOutput
from contentctl.output.api_json_output import ApiJsonOutput
from contentctl.output.data_source_writer import DataSourceWriter
from contentctl.objects.lookup import Lookup
import pathlib
import json
import datetime
@@ -28,9 +30,20 @@ class Build:


def execute(self, input_dto: BuildInputDto) -> DirectorOutputDto:
if input_dto.config.build_app:

updated_conf_files:set[pathlib.Path] = set()
conf_output = ConfOutput(input_dto.config)

# Construct a special lookup whose CSV is created at runtime and
# written directly into the output folder. It is created with model_construct,
# not model_validate, because the CSV does not exist yet.
data_sources_lookup_csv_path = input_dto.config.getPackageDirectoryPath() / "lookups" / "data_sources.csv"
DataSourceWriter.writeDataSourceCsv(input_dto.director_output_dto.data_sources, data_sources_lookup_csv_path)
input_dto.director_output_dto.addContentToDictMappings(Lookup.model_construct(description= "A lookup file that will contain the data source objects for detections.",
filename=data_sources_lookup_csv_path,
name="data_sources"))

updated_conf_files.update(conf_output.writeHeaders())
updated_conf_files.update(conf_output.writeObjects(input_dto.director_output_dto.detections, SecurityContentType.detections))
updated_conf_files.update(conf_output.writeObjects(input_dto.director_output_dto.stories, SecurityContentType.stories))
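For reference, Pydantic v2's model_construct builds an instance without running any validators, while model_validate enforces them (a FilePath field must point to an existing file). A minimal standalone sketch of the distinction, using a hypothetical GeneratedLookup model rather than the real Lookup class:

```python
import pathlib
from pydantic import BaseModel, FilePath, ValidationError

class GeneratedLookup(BaseModel):  # hypothetical stand-in for Lookup
    name: str
    description: str
    filename: FilePath  # FilePath validation requires the file to exist on disk

missing_csv = pathlib.Path("output/lookups/data_sources.csv")  # not created yet

try:
    GeneratedLookup.model_validate(
        {"name": "data_sources", "description": "demo", "filename": missing_csv}
    )
except ValidationError as err:
    print(f"model_validate rejects the missing file ({err.error_count()} error)")

# model_construct bypasses all validation, so the instance can be built for a
# file that is generated at build time rather than checked into the repo.
lookup = GeneratedLookup.model_construct(
    name="data_sources", description="demo", filename=missing_csv
)
print(lookup.filename)  # output/lookups/data_sources.csv
```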
1 change: 1 addition & 0 deletions contentctl/actions/initialize.py
@@ -28,6 +28,7 @@ def execute(self, config: test) -> None:
('../templates/app_template/', 'app_template'),
('../templates/deployments/', 'deployments'),
('../templates/detections/', 'detections'),
('../templates/data_sources/', 'data_sources'),
('../templates/macros/','macros'),
('../templates/stories/', 'stories'),
]:
1 change: 0 additions & 1 deletion contentctl/actions/validate.py
@@ -28,7 +28,6 @@ def execute(self, input_dto: validate) -> DirectorOutputDto:
[],
[],
[],
[],
)

director = Director(director_output_dto)
95 changes: 39 additions & 56 deletions contentctl/input/director.py
@@ -58,7 +58,6 @@ class DirectorOutputDto:
deployments: list[Deployment]
ssa_detections: list[SSADetection]
data_sources: list[DataSource]
event_sources: list[EventSource]
name_to_content_map: dict[str, SecurityContentObject] = field(default_factory=dict)
uuid_to_content_map: dict[UUID, SecurityContentObject] = field(default_factory=dict)

@@ -68,17 +67,19 @@ def addContentToDictMappings(self, content: SecurityContentObject):
# Since SSA detections may have the same name as ESCU detections,
# for this function we prepend 'SSA ' to the name.
content_name = f"SSA {content_name}"

if content_name in self.name_to_content_map:
raise ValueError(
f"Duplicate name '{content_name}' with paths:\n"
f" - {content.file_path}\n"
f" - {self.name_to_content_map[content_name].file_path}"
)
elif content.id in self.uuid_to_content_map:

if content.id in self.uuid_to_content_map:
raise ValueError(
f"Duplicate id '{content.id}' with paths:\n"
f" - {content.file_path}\n"
f" - {self.name_to_content_map[content_name].file_path}"
f" - {self.uuid_to_content_map[content.id].file_path}"
)

if isinstance(content, Lookup):
@@ -99,9 +100,10 @@ def addContentToDictMappings(self, content: SecurityContentObject):
self.detections.append(content)
elif isinstance(content, SSADetection):
self.ssa_detections.append(content)
elif isinstance(content, DataSource):
self.data_sources.append(content)
else:
raise Exception(f"Unknown security content type: {type(content)}")

self.name_to_content_map[content_name] = content
self.uuid_to_content_map[content.id] = content
@@ -124,41 +126,27 @@ def execute(self, input_dto: validate) -> None:
self.createSecurityContent(SecurityContentType.stories)
self.createSecurityContent(SecurityContentType.baselines)
self.createSecurityContent(SecurityContentType.investigations)
self.createSecurityContent(SecurityContentType.event_sources)
self.createSecurityContent(SecurityContentType.data_sources)
self.createSecurityContent(SecurityContentType.playbooks)
self.createSecurityContent(SecurityContentType.detections)
self.createSecurityContent(SecurityContentType.ssa_detections)


from contentctl.objects.abstract_security_content_objects.detection_abstract import MISSING_SOURCES
if len(MISSING_SOURCES) > 0:
missing_sources_string = "\n 🟡 ".join(sorted(list(MISSING_SOURCES)))
print("WARNING: The following data_sources have been used in detections, but are not yet defined.\n"
"This is not yet an error since not all data_sources have been defined, but will be convered to an error soon:\n 🟡 "
f"{missing_sources_string}")
else:
print("No missing data_sources!")

def createSecurityContent(self, contentType: SecurityContentType) -> None:
if contentType == SecurityContentType.ssa_detections:
files = Utils.get_all_yml_files_from_directory(
os.path.join(self.input_dto.path, "ssa_detections")
)
security_content_files = [f for f in files if f.name.startswith("ssa___")]

elif contentType == SecurityContentType.data_sources:
security_content_files = (
Utils.get_all_yml_files_from_directory_one_layer_deep(
os.path.join(self.input_dto.path, "data_sources")
)
)

elif contentType == SecurityContentType.event_sources:
security_content_files = Utils.get_all_yml_files_from_directory(
os.path.join(self.input_dto.path, "data_sources", "cloud", "event_sources")
)
security_content_files.extend(
Utils.get_all_yml_files_from_directory(
os.path.join(self.input_dto.path, "data_sources", "endpoint", "event_sources")
)
)
security_content_files.extend(
Utils.get_all_yml_files_from_directory(
os.path.join(self.input_dto.path, "data_sources", "network", "event_sources")
)
)

elif contentType in [
SecurityContentType.deployments,
SecurityContentType.lookups,
@@ -168,6 +156,7 @@ def createSecurityContent(self, contentType: SecurityContentType) -> None:
SecurityContentType.investigations,
SecurityContentType.playbooks,
SecurityContentType.detections,
SecurityContentType.data_sources,
]:
files = Utils.get_all_yml_files_from_directory(
os.path.join(self.input_dto.path, str(contentType.name))
@@ -190,54 +179,48 @@ def createSecurityContent(self, contentType: SecurityContentType) -> None:
modelDict = YmlReader.load_file(file)

if contentType == SecurityContentType.lookups:
lookup = Lookup.model_validate(modelDict,context={"output_dto":self.output_dto, "config":self.input_dto})
self.output_dto.addContentToDictMappings(lookup)

elif contentType == SecurityContentType.macros:
macro = Macro.model_validate(modelDict,context={"output_dto":self.output_dto})
self.output_dto.addContentToDictMappings(macro)

elif contentType == SecurityContentType.deployments:
deployment = Deployment.model_validate(modelDict,context={"output_dto":self.output_dto})
self.output_dto.addContentToDictMappings(deployment)

elif contentType == SecurityContentType.playbooks:
playbook = Playbook.model_validate(modelDict,context={"output_dto":self.output_dto})
self.output_dto.addContentToDictMappings(playbook)

elif contentType == SecurityContentType.baselines:
baseline = Baseline.model_validate(modelDict,context={"output_dto":self.output_dto})
self.output_dto.addContentToDictMappings(baseline)

elif contentType == SecurityContentType.investigations:
investigation = Investigation.model_validate(modelDict,context={"output_dto":self.output_dto})
self.output_dto.addContentToDictMappings(investigation)

elif contentType == SecurityContentType.stories:
story = Story.model_validate(modelDict,context={"output_dto":self.output_dto})
self.output_dto.addContentToDictMappings(story)

elif contentType == SecurityContentType.detections:
detection = Detection.model_validate(modelDict,context={"output_dto":self.output_dto, "app":self.input_dto.app})
self.output_dto.addContentToDictMappings(detection)

elif contentType == SecurityContentType.ssa_detections:
self.constructSSADetection(self.ssa_detection_builder, self.output_dto,str(file))
ssa_detection = self.ssa_detection_builder.getObject()
if ssa_detection.status in [DetectionStatus.production.value, DetectionStatus.validation.value]:
self.output_dto.addContentToDictMappings(ssa_detection)

elif contentType == SecurityContentType.data_sources:
data_source = DataSource.model_validate(
modelDict, context={"output_dto": self.output_dto}
)
self.output_dto.data_sources.append(data_source)

elif contentType == SecurityContentType.event_sources:
event_source = EventSource.model_validate(
modelDict, context={"output_dto": self.output_dto}
)
self.output_dto.event_sources.append(event_source)
self.output_dto.addContentToDictMappings(data_source)

else:
raise Exception(f"Unsupported type: [{contentType}]")
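The warning block in execute() above relies on the module-level MISSING_SOURCES set imported from detection_abstract. A simplified, self-contained sketch of that collect-then-report-once pattern (the source names are illustrative):

```python
# Unresolved names are accumulated in a module-level set while each file is
# constructed, then reported ONCE after all content has been created, rather
# than printing one warning per detection.
MISSING_SOURCES: set[str] = set()

KNOWN_SOURCES = {"Sysmon EventID 1", "Sysmon EventID 3"}  # stand-in for the real data sources

def resolve(name: str) -> bool:
    if name in KNOWN_SOURCES:
        return True
    MISSING_SOURCES.add(name)  # remember the failure, but keep building
    return False

for detection_sources in (["Sysmon EventID 1"], ["Sysmon EventID 99"]):
    for name in detection_sources:
        resolve(name)

if len(MISSING_SOURCES) > 0:
    missing_sources_string = "\n 🟡 ".join(sorted(MISSING_SOURCES))
    print("WARNING: the following data_sources are used in detections but not yet defined:\n"
          f" 🟡 {missing_sources_string}")
else:
    print("No missing data_sources!")
```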
2 changes: 2 additions & 0 deletions contentctl/input/yml_reader.py
@@ -40,6 +40,8 @@ def load_file(file_path: pathlib.Path, add_fields=True, STRICT_YML_CHECKING=False
if add_fields == False:
return yml_obj


yml_obj['file_path'] = str(file_path)


return yml_obj
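A condensed sketch of what load_file does after this change (error handling and the STRICT_YML_CHECKING path are omitted):

```python
import pathlib
import yaml  # PyYAML

def load_file(file_path: pathlib.Path, add_fields: bool = True) -> dict:
    with open(file_path) as f:
        yml_obj = yaml.safe_load(f)
    if not add_fields:
        return yml_obj
    # Tag the dict with its source path so downstream Pydantic models can
    # populate their file_path field and report useful error locations.
    yml_obj["file_path"] = str(file_path)
    return yml_obj
```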
contentctl/objects/abstract_security_content_objects/detection_abstract.py
@@ -22,25 +22,26 @@
from contentctl.objects.unit_test import UnitTest
from contentctl.objects.test_group import TestGroup
from contentctl.objects.integration_test import IntegrationTest

from contentctl.objects.event_source import EventSource
from contentctl.objects.data_source import DataSource

#from contentctl.objects.playbook import Playbook
from contentctl.objects.enums import DataSource,ProvidingTechnology
from contentctl.objects.enums import ProvidingTechnology
from contentctl.enrichments.cve_enrichment import CveEnrichmentObj

MISSING_SOURCES:set[str] = set()

class Detection_Abstract(SecurityContentObject):
model_config = ConfigDict(use_enum_values=True)

#contentType: SecurityContentType = SecurityContentType.detections
type: AnalyticsType = Field(...)
status: DetectionStatus = Field(...)
data_source: Optional[List[str]] = None
data_source: list[str] = []
tags: DetectionTags = Field(...)
search: Union[str, dict[str,Any]] = Field(...)
how_to_implement: str = Field(..., min_length=4)
known_false_positives: str = Field(..., min_length=4)
data_source_objects: Optional[List[DataSource]] = None

enabled_by_default: bool = False
file_path: FilePath = Field(...)
@@ -53,6 +54,8 @@ class Detection_Abstract(SecurityContentObject):
# A list of groups of tests, relying on the same data
test_groups: Union[list[TestGroup], None] = Field(None,validate_default=True)

data_source_objects: list[DataSource] = []


@field_validator("search", mode="before")
@classmethod
@@ -138,6 +141,7 @@ def datamodel(self)->List[DataModel]:
else:
return []


@computed_field
@property
def source(self)->str:
@@ -161,10 +165,12 @@ def annotations(self)->dict[str,Union[List[str],int,str]]:
annotations_dict["type"] = self.type
#annotations_dict["version"] = self.version

annotations_dict["data_source"] = self.data_source

#The annotations object is a superset of the mappings object.
# So start with the mapping object.
annotations_dict.update(self.mappings)

#Make sure that the results are sorted for readability/easier diffs
return dict(sorted(annotations_dict.items(), key=lambda item: item[0]))

@@ -384,23 +390,37 @@ def model_post_init(self, ctx:dict[str,Any]):
raise ValueError(f"Error, failed to replace detection reference in Baseline '{baseline.name}' to detection '{self.name}'")
baseline.tags.detections = new_detections

self.data_source_objects = []
for data_source_obj in director.data_sources:
for detection_data_source in self.data_source:
if data_source_obj.name in detection_data_source:
self.data_source_objects.append(data_source_obj)

# Remove duplicate data source objects based on their 'name' property
unique_data_sources = {}
for data_source_obj in self.data_source_objects:
if data_source_obj.name not in unique_data_sources:
unique_data_sources[data_source_obj.name] = data_source_obj
self.data_source_objects = list(unique_data_sources.values())
# Data sources may be defined one per line, OR they may be defined as
# SOURCE_1 AND ANOTHERSOURCE AND A_THIRD_SOURCE
# if more than one data source is required for a detection (for example, because it includes a join).
# Parse and update the list to resolve individual names and remove potential duplicates.
updated_data_source_names:set[str] = set()

for ds in self.data_source:
split_data_sources = {d.strip() for d in ds.split('AND')}
updated_data_source_names.update(split_data_sources)

sources = sorted(list(updated_data_source_names))

matched_data_sources:list[DataSource] = []
missing_sources:list[str] = []
for source in sources:
try:
matched_data_sources += DataSource.mapNamesToSecurityContentObjects([source], director)
except Exception as data_source_mapping_exception:
# We gobble this up and add it to a global set so that we
# can print it ONCE at the end of the build of data sources.
# This will be removed later as per the note below.
MISSING_SOURCES.add(source)

if len(missing_sources) > 0:
# This will be changed to ValueError when we have a complete list of data sources
print(f"WARNING: The following exception occurred when mapping the data_source field to DataSource objects:{missing_sources}")

self.data_source_objects = matched_data_sources

for story in self.tags.analytic_story:
story.detections.append(self)
story.data_sources.extend(self.data_source_objects)

story.detections.append(self)
return self


@@ -424,14 +444,16 @@ def mapDetectionNamesToBaselineObjects(cls, v:list[str], info:ValidationInfo)->List[Baseline]:
raise ValueError("Error, baselines are constructed automatically at runtime. Please do not include this field.")


name:Union[str,dict] = info.data.get("name",None)
name:Union[str,None] = info.data.get("name",None)
if name is None:
raise ValueError("Error, cannot get Baselines because the Detection does not have a 'name' defined.")

director:DirectorOutputDto = info.context.get("output_dto",None)
baselines:List[Baseline] = []
for baseline in director.baselines:
if name in baseline.tags.detections:
# This matching is a bit strange, because baseline.tags.detections starts as a list of strings, but
# is eventually updated to a list of Detections as we construct all of the detection objects.
if name in [detection_name for detection_name in baseline.tags.detections if isinstance(detection_name,str)]:
baselines.append(baseline)

return baselines
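A self-contained sketch of the new parsing step in model_post_init, with example input and output (the source names are illustrative):

```python
def parse_data_sources(data_source: list[str]) -> list[str]:
    # Each entry names one source, or several joined with "AND" when a
    # detection needs multiple sources (e.g., a join). Split, trim,
    # deduplicate via a set, and sort for stable ordering.
    updated_data_source_names: set[str] = set()
    for ds in data_source:
        updated_data_source_names.update(d.strip() for d in ds.split("AND"))
    return sorted(updated_data_source_names)

print(parse_data_sources(["Sysmon EventID 1 AND Sysmon EventID 3",
                          "Sysmon EventID 1"]))
# ['Sysmon EventID 1', 'Sysmon EventID 3']
```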