Skip to content

Commit

Permalink
Merge pull request #114 from GSA/add-non-federal-dcatus
Browse files Browse the repository at this point in the history
add fed & non-fed dcatus1.1 schemas
  • Loading branch information
rshewitt authored Nov 22, 2024
2 parents dec3444 + ac01245 commit 15b3c46
Show file tree
Hide file tree
Showing 10 changed files with 833 additions and 19 deletions.
8 changes: 7 additions & 1 deletion app/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ class HarvestSourceForm(FlaskForm):
)
schema_type = SelectField(
"Schema Type",
choices=["iso19115_1", "iso19115_2", "csdgm", "dcatus1.1"],
choices=[
"iso19115_1",
"iso19115_2",
"csdgm",
"dcatus1.1: federal",
"dcatus1.1: non-federal",
],
validators=[DataRequired()],
)
source_type = SelectField(
Expand Down
2 changes: 1 addition & 1 deletion app/readme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ curl -X POST http://{site}/harvest_source/add -H "Content-Type: application/json
"notification_emails": "[email protected]",
"frequency": "daily",
"url": "http://example2.com",
"schema_type": "dcatus1.1",
"schema_type": "dcatus1.1: federal",
"source_type": "json"
}
'
Expand Down
10 changes: 9 additions & 1 deletion database/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,17 @@ class HarvestSource(db.Model):
index=True,
)
schema_type = db.Column(
db.Enum("iso19115_1", "iso19115_2", "csdgm", "dcatus1.1", name="schema_type"),
db.Enum(
"iso19115_1",
"iso19115_2",
"csdgm",
"dcatus1.1: federal",
"dcatus1.1: non-federal",
name="schema_type",
),
nullable=False,
)

source_type = db.Column(
db.Enum("document", "waf", name="source_type"), nullable=False
)
Expand Down
33 changes: 33 additions & 0 deletions example_data/dcatus/dcatus_single_record_non-federal.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"@id": "http://www.cftc.gov/data.json",
"@type": "dcat:Catalog",
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"dataset": [
{
"contactPoint": {
"fn": "Harold W. Hild",
"hasEmail": "mailto:[email protected]"
},
"describedBy": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/ExplanatoryNotes/index.htm",
"description": "COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC",
"distribution": [
{
"accessURL": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm"
}
],
"modified": "R/P1W",
"publisher": {
"name": "U.S. Commodity Futures Trading Commission",
"subOrganizationOf": {
"name": "U.S. Government"
}
},
"title": "Commitment of Traders",
"accessLevel": "public",
"identifier": "cftc-dc1",
"keyword": ["commitment of traders", "cot", "open interest"]
}
]
}
27 changes: 19 additions & 8 deletions harvester/harvest.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,7 @@ class HarvestSource:
repr=False,
)

_dataset_schema: dict = field(
default_factory=lambda: open_json(ROOT_DIR / "schemas" / "dataset.json"),
repr=False,
)
_dataset_schema: dict = field(default_factory=lambda: {}, repr=False)
_no_harvest_resp: bool = False

# not read-only because these values are added after initialization
Expand All @@ -100,6 +97,15 @@ def __post_init__(self) -> None:
self._db_interface: HarvesterDBInterface = db_interface
self.get_source_info_from_job_id(self.job_id)

if self.schema_type == "dcatus1.1: federal":
self.dataset_schema = open_json(
ROOT_DIR / "schemas" / "federal_dataset.json"
)
else:
self.dataset_schema = open_json(
ROOT_DIR / "schemas" / "non-federal_dataset.json"
)

@property
def job_id(self) -> str:
return self._job_id
Expand All @@ -116,6 +122,12 @@ def source_attrs(self) -> list:
def dataset_schema(self) -> dict:
return self._dataset_schema

@dataset_schema.setter
def dataset_schema(self, value) -> None:
if not isinstance(value, dict):
raise ValueError("dataset schema must be a dict")
self._dataset_schema = value

@property
def no_harvest_resp(self) -> bool:
return self._no_harvest_resp
Expand Down Expand Up @@ -151,7 +163,7 @@ def internal_records_to_id_hash(self, records: list[dict]) -> None:

def get_record_identifier(self, record: dict) -> str:

record_id = "identifier" if self.schema_type == "dcatus1.1" else "url"
record_id = "identifier" if self.schema_type.startswith("dcatus") else "url"

if record_id not in record:
raise Exception
Expand Down Expand Up @@ -257,7 +269,7 @@ def write_compare_to_db(self) -> dict:
else:
record = self.external_records[record_id]

if self.schema_type == "dcatus1.1":
if self.schema_type.startswith("dcatus"):
source_raw = json.dumps(record.metadata)
else:
source_raw = record.metadata["content"]
Expand Down Expand Up @@ -320,7 +332,7 @@ def synchronize_records(self) -> None:
# no longer setting action in compare so setting it here...
record.action = action

if self.schema_type != "dcatus1.1":
if not self.schema_type.startswith("dcatus"):
record.transform()
record.validate()
record.sync()
Expand Down Expand Up @@ -454,7 +466,6 @@ class Record:
default_factory=lambda: {
"iso19115_1": "iso19115_1",
"iso19115_2": "iso19115_2_datagov",
"dcatus1.1": "dcat_us",
"csdgm": "fgdc",
}
)
Expand Down
File renamed without changes.
Loading

2 comments on commit 15b3c46

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

| Tests | Skipped | Failures | Errors | Time |
| ----- | ------- | -------- | ------ | ---- |
| 2 | 0 💤 | 0 ❌ | 0 🔥 | 9.401s ⏱️ |

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

| Tests | Skipped | Failures | Errors | Time |
| ----- | ------- | -------- | ------ | ---- |
| 2 | 0 💤 | 0 ❌ | 0 🔥 | 8.601s ⏱️ |

Please sign in to comment.