Merge pull request #114 from GSA/add-non-federal-dcatus

add fed & non-fed dcatus1.1 schemas
GSA · Nov 22, 2024 · 15b3c46 · 15b3c46 · github-actions · Nov 22, 2024
2 parents dec3444 + ac01245
commit 15b3c46
Show file tree

Hide file tree

Showing 10 changed files with 833 additions and 19 deletions.
diff --git a/app/forms.py b/app/forms.py
@@ -37,7 +37,13 @@ class HarvestSourceForm(FlaskForm):
     )
     schema_type = SelectField(
         "Schema Type",
-        choices=["iso19115_1", "iso19115_2", "csdgm", "dcatus1.1"],
+        choices=[
+            "iso19115_1",
+            "iso19115_2",
+            "csdgm",
+            "dcatus1.1: federal",
+            "dcatus1.1: non-federal",
+        ],
         validators=[DataRequired()],
     )
     source_type = SelectField(

diff --git a/app/readme.txt b/app/readme.txt
@@ -43,7 +43,7 @@ curl -X POST http://{site}/harvest_source/add -H "Content-Type: application/json
     "notification_emails": "user@example.com",
     "frequency": "daily",
     "url": "http://example2.com",
-    "schema_type": "dcatus1.1",
+    "schema_type": "dcatus1.1: federal",
     "source_type": "json"
 }
 '

diff --git a/database/models.py b/database/models.py
@@ -53,9 +53,17 @@ class HarvestSource(db.Model):
         index=True,
     )
     schema_type = db.Column(
-        db.Enum("iso19115_1", "iso19115_2", "csdgm", "dcatus1.1", name="schema_type"),
+        db.Enum(
+            "iso19115_1",
+            "iso19115_2",
+            "csdgm",
+            "dcatus1.1: federal",
+            "dcatus1.1: non-federal",
+            name="schema_type",
+        ),
         nullable=False,
     )
+
     source_type = db.Column(
         db.Enum("document", "waf", name="source_type"), nullable=False
     )

diff --git a/example_data/dcatus/dcatus_single_record_non-federal.json b/example_data/dcatus/dcatus_single_record_non-federal.json
@@ -0,0 +1,33 @@
+{
+  "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
+  "@id": "http://www.cftc.gov/data.json",
+  "@type": "dcat:Catalog",
+  "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
+  "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
+  "dataset": [
+    {
+      "contactPoint": {
+        "fn": "Harold W. Hild",
+        "hasEmail": "mailto:hhild@CFTC.GOV"
+      },
+      "describedBy": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/ExplanatoryNotes/index.htm",
+      "description": "COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC",
+      "distribution": [
+        {
+          "accessURL": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm"
+        }
+      ],
+      "modified": "R/P1W",
+      "publisher": {
+        "name": "U.S. Commodity Futures Trading Commission",
+        "subOrganizationOf": {
+          "name": "U.S. Government"
+        }
+      },
+      "title": "Commitment of Traders",
+      "accessLevel": "public",
+      "identifier": "cftc-dc1",
+      "keyword": ["commitment of traders", "cot", "open interest"]
+    }
+  ]
+}
diff --git a/harvester/harvest.py b/harvester/harvest.py
@@ -78,10 +78,7 @@ class HarvestSource:
         repr=False,
     )
 
-    _dataset_schema: dict = field(
-        default_factory=lambda: open_json(ROOT_DIR / "schemas" / "dataset.json"),
-        repr=False,
-    )
+    _dataset_schema: dict = field(default_factory=lambda: {}, repr=False)
     _no_harvest_resp: bool = False
 
     # not read-only because these values are added after initialization
@@ -100,6 +97,15 @@ def __post_init__(self) -> None:
         self._db_interface: HarvesterDBInterface = db_interface
         self.get_source_info_from_job_id(self.job_id)
 
+        if self.schema_type == "dcatus1.1: federal":
+            self.dataset_schema = open_json(
+                ROOT_DIR / "schemas" / "federal_dataset.json"
+            )
+        else:
+            self.dataset_schema = open_json(
+                ROOT_DIR / "schemas" / "non-federal_dataset.json"
+            )
+
     @property
     def job_id(self) -> str:
         return self._job_id
@@ -116,6 +122,12 @@ def source_attrs(self) -> list:
     def dataset_schema(self) -> dict:
         return self._dataset_schema
 
+    @dataset_schema.setter
+    def dataset_schema(self, value) -> None:
+        if not isinstance(value, dict):
+            raise ValueError("dataset schema must be a dict")
+        self._dataset_schema = value
+
     @property
     def no_harvest_resp(self) -> bool:
         return self._no_harvest_resp
@@ -151,7 +163,7 @@ def internal_records_to_id_hash(self, records: list[dict]) -> None:
 
     def get_record_identifier(self, record: dict) -> str:
 
-        record_id = "identifier" if self.schema_type == "dcatus1.1" else "url"
+        record_id = "identifier" if self.schema_type.startswith("dcatus") else "url"
 
         if record_id not in record:
             raise Exception
@@ -257,7 +269,7 @@ def write_compare_to_db(self) -> dict:
                 else:
                     record = self.external_records[record_id]
 
-                if self.schema_type == "dcatus1.1":
+                if self.schema_type.startswith("dcatus"):
                     source_raw = json.dumps(record.metadata)
                 else:
                     source_raw = record.metadata["content"]
@@ -320,7 +332,7 @@ def synchronize_records(self) -> None:
                     # no longer setting action in compare so setting it here...
                     record.action = action
 
-                    if self.schema_type != "dcatus1.1":
+                    if not self.schema_type.startswith("dcatus"):
                         record.transform()
                     record.validate()
                     record.sync()
@@ -454,7 +466,6 @@ class Record:
         default_factory=lambda: {
             "iso19115_1": "iso19115_1",
             "iso19115_2": "iso19115_2_datagov",
-            "dcatus1.1": "dcat_us",
             "csdgm": "fgdc",
         }
     )

diff --git a/schemas/dataset.json b/schemas/federal_dataset.json