Skip to content

Commit

Permalink
Merge pull request #72 from GSA/harvest-records-compare
Browse files Browse the repository at this point in the history
Get latest harvest records from db
  • Loading branch information
rshewitt authored May 17, 2024
2 parents c7049e2 + 9ae19ec commit 5e9fba7
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 4 deletions.
20 changes: 19 additions & 1 deletion database/interface.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import uuid
from sqlalchemy import create_engine, inspect, or_
from sqlalchemy import create_engine, inspect, or_, text
from sqlalchemy.exc import NoResultFound
from sqlalchemy.orm import scoped_session, sessionmaker

Expand Down Expand Up @@ -261,6 +261,24 @@ def get_harvest_record_by_source(self, source_id):
)
return [HarvesterDBInterface._to_dict(rcd) for rcd in harvest_records]

def get_latest_records_by_source(self, source_id):
    """Return the newest successful record per identifier for one source.

    For the given harvest source, only rows with ``status = 'success'`` are
    considered. Per ``identifier`` the row with the greatest
    ``date_created`` is kept (via ``DISTINCT ON``, a PostgreSQL extension),
    and identifiers whose newest successful row has ``action = 'delete'``
    are dropped from the result.

    Args:
        source_id: id of the harvest source to query. It is stringified
            and sent as a bound parameter, never spliced into the SQL.

    Returns:
        A list of dicts, one per record, keyed by the column names of the
        result set and ordered by ``identifier``.
    """
    # datetimes are returned as datetime objs not strs
    sql = text(
        """SELECT * FROM (
            SELECT DISTINCT ON (identifier) *
            FROM harvest_record
            WHERE status = 'success' AND harvest_source_id = :source_id
            ORDER BY identifier, date_created DESC ) sq
            WHERE sq.action != 'delete';"""
    )

    # Bind the value instead of formatting it into the statement so that a
    # crafted or quote-containing source_id cannot alter the query.
    # str() keeps the previous behaviour for non-string ids (e.g. UUIDs).
    res = self.db.execute(sql, {"source_id": str(source_id)})

    fields = list(res.keys())
    return [dict(zip(fields, record)) for record in res.fetchall()]

def get_source_by_jobid(self, jobid):
harvest_job = self.db.query(HarvestJob).filter_by(id=jobid).first()
if harvest_job is None:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "datagov-harvesting-logic"
version = "0.4.4"
version = "0.4.5"
description = ""
# authors = [
# {name = "Jin Sun", email = "[email protected]"},
Expand Down
111 changes: 109 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
def app() -> Flask:
app = create_app()

app.config["TESTING"] = True

with app.app_context():
db.create_all()
yield app
Expand Down Expand Up @@ -73,6 +71,21 @@ def source_data_dcatus(organization_data: dict) -> dict:
}


@pytest.fixture
def source_data_dcatus_2(organization_data: dict) -> dict:
    """Provide a second DCAT-US harvest source definition.

    The source is attached to the organization supplied by the
    ``organization_data`` fixture and points at ``dcatus_2.json`` under
    ``HARVEST_SOURCE_URL``.
    """
    source_url = f"{HARVEST_SOURCE_URL}/dcatus/dcatus_2.json"
    owning_org_id = organization_data["id"]

    harvest_source = {
        "id": "3f2652de-91df-4c63-8b53-bfced20b276b",
        "name": "Test Source",
        "notification_emails": "[email protected]",
        "organization_id": owning_org_id,
        "frequency": "daily",
        "url": source_url,
        "schema_type": "type1",
        "source_type": "dcatus",
        "status": "active",
    }
    return harvest_source


@pytest.fixture
def source_data_waf(organization_data: dict) -> dict:
return {
Expand Down Expand Up @@ -213,6 +226,100 @@ def interface_with_multiple_jobs(
return interface


@pytest.fixture
def latest_records(source_data_dcatus, source_data_dcatus_2):
    """Provide harvest records for exercising the "latest records" query.

    Ten records (identifiers ``a`` through ``e``) belong to the first
    DCAT-US source and one record (identifier ``f``) belongs to the second
    source. They mix creation dates, statuses and actions.
    """
    first_source = source_data_dcatus["id"]
    second_source = source_data_dcatus_2["id"]

    columns = (
        "identifier",
        "date_created",
        "source_raw",
        "status",
        "action",
        "harvest_source_id",
    )

    # One tuple per record, in the same column order as `columns`.
    rows = (
        ("a", "2024-01-01T00:00:00.001Z", "data", "success", "create", first_source),
        ("a", "2024-03-01T00:00:00.001Z", "data_1", "success", "update", first_source),
        ("b", "2024-03-01T00:00:00.001Z", "data_10", "success", "create", first_source),
        ("b", "2022-05-01T00:00:00.001Z", "data_30", "error", "update", first_source),
        ("c", "2024-05-01T00:00:00.001Z", "data_12", "success", "create", first_source),
        ("d", "2024-05-01T00:00:00.001Z", "data_2", "success", "delete", first_source),
        ("d", "2024-04-01T00:00:00.001Z", "data_5", "success", "create", first_source),
        ("e", "2024-04-01T00:00:00.001Z", "data_123", "success", "create", first_source),
        ("e", "2024-04-02T00:00:00.001Z", "data_123", "success", "delete", first_source),
        ("e", "2024-04-03T00:00:00.001Z", "data_123", "success", "create", first_source),
        ("f", "2024-04-03T00:00:00.001Z", "data_123", "success", "create", second_source),
    )

    return [dict(zip(columns, row)) for row in rows]


@pytest.fixture
def internal_compare_data(job_data_dcatus: dict) -> dict:
# ruff: noqa: E501
Expand Down
87 changes: 87 additions & 0 deletions tests/integration/database/test_db.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import datetime


class TestDatabase:
def test_add_organization(self, interface, organization_data):
org = interface.add_organization(organization_data)
Expand Down Expand Up @@ -190,3 +193,87 @@ def test_filter_jobs_by_faceted_filter(
)
== 2
)

def test_get_latest_harvest_records(
    self,
    interface,
    organization_data,
    source_data_dcatus,
    source_data_dcatus_2,
    job_data_dcatus,
    latest_records,
):
    """Check that only the newest successful, non-deleted records come back.

    Loads one organization, two harvest sources, one job and the
    ``latest_records`` fixture, then queries the first source and compares
    the rows (minus their generated ``id``) with the expected four.
    """
    interface.add_organization(organization_data)
    interface.add_harvest_source(source_data_dcatus)
    # another source for querying against. see last record in
    # `latest_records` fixture
    interface.add_harvest_source(source_data_dcatus_2)
    interface.add_harvest_job(job_data_dcatus)
    interface.add_harvest_records(latest_records)

    fetched = interface.get_latest_records_by_source(source_data_dcatus["id"])

    # remove so compare works
    for row in fetched:
        del row["id"]

    def expected_row(identifier, source_raw, created, action):
        # Every expected row shares these values; only the four arguments vary.
        return {
            "identifier": identifier,
            "harvest_job_id": None,
            "harvest_source_id": "2f2652de-91df-4c63-8b53-bfced20b276b",
            "source_hash": None,
            "source_raw": source_raw,
            "date_created": created,
            "date_finished": None,
            "ckan_id": None,
            "type": None,
            "action": action,
            "status": "success",
        }

    expected_records = [
        expected_row(
            "a", "data_1", datetime.datetime(2024, 3, 1, 0, 0, 0, 1000), "update"
        ),
        expected_row(
            "b", "data_10", datetime.datetime(2024, 3, 1, 0, 0, 0, 1000), "create"
        ),
        expected_row(
            "c", "data_12", datetime.datetime(2024, 5, 1, 0, 0, 0, 1000), "create"
        ),
        expected_row(
            "e", "data_123", datetime.datetime(2024, 4, 3, 0, 0, 0, 1000), "create"
        ),
    ]

    assert len(fetched) == 4
    # make sure there aren't records that are different
    assert all(got == want for got, want in zip(fetched, expected_records))

1 comment on commit 5e9fba7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests Skipped Failures Errors Time
40 0 💤 0 ❌ 0 🔥 16.076s ⏱️

Please sign in to comment.