diff --git a/app/__init__.py b/app/__init__.py index 9dcc4682..a2d24691 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,25 +1,28 @@ -from flask import Flask -from .models import db -from flask_migrate import Migrate import os + from dotenv import load_dotenv +from flask import Flask from flask_bootstrap import Bootstrap +from flask_migrate import Migrate + +from .models import db load_dotenv() -DATABASE_URI = os.getenv('DATABASE_URI') +DATABASE_URI = os.getenv("DATABASE_URI") + def create_app(testing=False): app = Flask(__name__) if testing: - app.config['TESTING'] = True - app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:' - app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False + app.config["TESTING"] = True + app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///:memory:" + app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False else: - app.config['SQLALCHEMY_DATABASE_URI'] = os.getenv("DATABASE_URI") - app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False - app.config['SECRET_KEY'] = os.urandom(16) + app.config["SQLALCHEMY_DATABASE_URI"] = os.getenv("DATABASE_URI") + app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False + app.config["SECRET_KEY"] = os.urandom(16) Bootstrap(app) db.init_app(app) @@ -28,6 +31,7 @@ def create_app(testing=False): Migrate(app, db) from .routes import register_routes + register_routes(app) - return app \ No newline at end of file + return app diff --git a/app/forms.py b/app/forms.py index dcb6fc1f..14d3b1c8 100644 --- a/app/forms.py +++ b/app/forms.py @@ -1,35 +1,44 @@ -from flask_wtf import FlaskForm -from wtforms import StringField, SubmitField, SelectField, TextAreaField -from wtforms.validators import DataRequired, URL, ValidationError import re +from flask_wtf import FlaskForm +from wtforms import SelectField, StringField, SubmitField, TextAreaField +from wtforms.validators import URL, DataRequired, ValidationError + + def validate_email_list(form, field): - emails = field.data.split(',') + emails = field.data.split(",") for email in emails: if not re.match(r"[^@]+@[^@]+\.[^@]+", email.strip()): raise ValidationError("Invalid email address: {}".format(email)) - + + class HarvestSourceForm(FlaskForm): - organization_id = SelectField('Organization', - choices=[], validators=[DataRequired()]) - name = StringField('Name', validators=[DataRequired()]) - url = StringField('URL', validators=[DataRequired(), URL()]) - emails = TextAreaField('Notification_emails', - validators=[DataRequired(), validate_email_list]) - frequency = SelectField('Frequency', - choices=['Manual', 'Daily', 'Weekly', 'Biweekly','Monthly'], - validators=[DataRequired()]) - user_requested_frequency = StringField('User_requested_frequency', - validators=[DataRequired()]) - schema_type = SelectField('Schema Type', - choices=['strict', 'other'], - validators=[DataRequired()]) - source_type = SelectField('Source Type', - choices=['Datajson', 'WAF'], - validators=[DataRequired()]) - submit = SubmitField('Submit') + organization_id = SelectField( + "Organization", choices=[], validators=[DataRequired()] + ) + name = StringField("Name", validators=[DataRequired()]) + url = StringField("URL", validators=[DataRequired(), URL()]) + emails = TextAreaField( + "Notification_emails", validators=[DataRequired(), validate_email_list] + ) + frequency = SelectField( + "Frequency", + choices=["Manual", "Daily", "Weekly", "Biweekly", "Monthly"], + validators=[DataRequired()], + ) + user_requested_frequency = StringField( + "User_requested_frequency", validators=[DataRequired()] + ) + schema_type = SelectField( + "Schema Type", choices=["strict", "other"], validators=[DataRequired()] + ) + source_type = SelectField( + "Source Type", choices=["Datajson", "WAF"], validators=[DataRequired()] + ) + submit = SubmitField("Submit") + class OrganizationForm(FlaskForm): - name = StringField('Name', validators=[DataRequired()]) - logo = StringField('Logo', validators=[DataRequired()]) - submit = SubmitField('Submit') \ No newline at end of file + name = StringField("Name", validators=[DataRequired()]) + logo = StringField("Logo", validators=[DataRequired()]) + submit = SubmitField("Submit") diff --git a/app/interface.py b/app/interface.py index 025e016c..a960d586 100644 --- a/app/interface.py +++ b/app/interface.py @@ -1,12 +1,18 @@ from sqlalchemy import create_engine, inspect -from sqlalchemy.orm import sessionmaker, scoped_session from sqlalchemy.exc import NoResultFound +from sqlalchemy.orm import scoped_session, sessionmaker + from app.models import ( - Organization, HarvestSource, - HarvestJob, HarvestError, HarvestRecord + HarvestError, + HarvestJob, + HarvestRecord, + HarvestSource, + Organization, ) + from . import DATABASE_URI + class HarvesterDBInterface: def __init__(self, session=None): if session is None: @@ -15,20 +21,19 @@ def __init__(self, session=None): pool_size=10, max_overflow=20, pool_timeout=60, - pool_recycle=1800 + pool_recycle=1800, + ) + session_factory = sessionmaker( + bind=engine, autocommit=False, autoflush=False ) - session_factory = sessionmaker(bind=engine, - autocommit=False, - autoflush=False) self.db = scoped_session(session_factory) else: self.db = session - + @staticmethod def _to_dict(obj): - return {c.key: getattr(obj, c.key) - for c in inspect(obj).mapper.column_attrs} - + return {c.key: getattr(obj, c.key) for c in inspect(obj).mapper.column_attrs} + def add_organization(self, org_data): try: new_org = Organization(**org_data) @@ -46,10 +51,9 @@ def get_all_organizations(self): if orgs is None: return None else: - orgs_data = [ - HarvesterDBInterface._to_dict(org) for org in orgs] + orgs_data = [HarvesterDBInterface._to_dict(org) for org in orgs] return orgs_data - + def get_organization(self, org_id): result = self.db.query(Organization).filter_by(id=org_id).first() if result is None: @@ -68,11 +72,11 @@ def update_organization(self, org_id, updates): self.db.commit() return self._to_dict(org) - + except NoResultFound: self.db.rollback() - return None - + return None + def delete_organization(self, org_id): org = self.db.get(Organization, org_id) if org is None: @@ -80,7 +84,7 @@ def delete_organization(self, org_id): self.db.delete(org) self.db.commit() return "Organization deleted successfully" - + def add_harvest_source(self, source_data): try: new_source = HarvestSource(**source_data) @@ -99,23 +103,26 @@ def get_all_harvest_sources(self): return None else: harvest_sources_data = [ - HarvesterDBInterface._to_dict(source) for source in harvest_sources] + HarvesterDBInterface._to_dict(source) for source in harvest_sources + ] return harvest_sources_data - + def get_harvest_source(self, source_id): result = self.db.query(HarvestSource).filter_by(id=source_id).first() if result is None: return None return HarvesterDBInterface._to_dict(result) - + def get_harvest_source_by_org(self, org_id): - harvest_source = self.db.query( - HarvestSource).filter_by(organization_id=org_id).all() + harvest_source = ( + self.db.query(HarvestSource).filter_by(organization_id=org_id).all() + ) if harvest_source is None: return None else: harvest_source_data = [ - HarvesterDBInterface._to_dict(src) for src in harvest_source] + HarvesterDBInterface._to_dict(src) for src in harvest_source + ] return harvest_source_data def update_harvest_source(self, source_id, updates): @@ -130,11 +137,11 @@ def update_harvest_source(self, source_id, updates): self.db.commit() return self._to_dict(source) - + except NoResultFound: self.db.rollback() - return None - + return None + def delete_harvest_source(self, source_id): source = self.db.get(HarvestSource, source_id) if source is None: @@ -142,7 +149,7 @@ def delete_harvest_source(self, source_id): self.db.delete(source) self.db.commit() return "Harvest source deleted successfully" - + def add_harvest_job(self, job_data): try: new_job = HarvestJob(**job_data) @@ -155,11 +162,10 @@ def add_harvest_job(self, job_data): self.db.rollback() return None - # for test, will remove later + # for test, will remove later def get_all_harvest_jobs(self): harvest_jobs = self.db.query(HarvestJob).all() - harvest_jobs_data = [ - HarvesterDBInterface._to_dict(job) for job in harvest_jobs] + harvest_jobs_data = [HarvesterDBInterface._to_dict(job) for job in harvest_jobs] return harvest_jobs_data def get_harvest_job(self, job_id): @@ -167,15 +173,17 @@ def get_harvest_job(self, job_id): if result is None: return None return HarvesterDBInterface._to_dict(result) - + def get_harvest_job_by_source(self, source_id): - harvest_job = self.db.query( - HarvestJob).filter_by(harvest_source_id=source_id).all() + harvest_job = ( + self.db.query(HarvestJob).filter_by(harvest_source_id=source_id).all() + ) if harvest_job is None: return None else: harvest_job_data = [ - HarvesterDBInterface._to_dict(job) for job in harvest_job] + HarvesterDBInterface._to_dict(job) for job in harvest_job + ] return harvest_job_data def update_harvest_job(self, job_id, updates): @@ -190,11 +198,11 @@ def update_harvest_job(self, job_id, updates): self.db.commit() return self._to_dict(job) - + except NoResultFound: self.db.rollback() - return None - + return None + def delete_harvest_job(self, job_id): job = self.db.get(HarvestJob, job_id) if job is None: @@ -215,11 +223,12 @@ def add_harvest_error(self, error_data): self.db.rollback() return None - # for test, will remove later + # for test, will remove later def get_all_harvest_errors(self): harvest_errors = self.db.query(HarvestError).all() harvest_errors_data = [ - HarvesterDBInterface._to_dict(err) for err in harvest_errors] + HarvesterDBInterface._to_dict(err) for err in harvest_errors + ] return harvest_errors_data def get_harvest_error(self, error_id): @@ -234,10 +243,10 @@ def get_harvest_error_by_job(self, job_id): return None else: harvest_errors_data = [ - HarvesterDBInterface._to_dict(err) for err in harvest_errors] + HarvesterDBInterface._to_dict(err) for err in harvest_errors + ] return harvest_errors_data - def add_harvest_record(self, record_data): try: new_record = HarvestRecord(**record_data) @@ -250,11 +259,12 @@ def add_harvest_record(self, record_data): self.db.rollback() return None - # for test, will remove later + # for test, will remove later def get_all_harvest_records(self): harvest_records = self.db.query(HarvestRecord).all() harvest_records_data = [ - HarvesterDBInterface._to_dict(err) for err in harvest_records] + HarvesterDBInterface._to_dict(err) for err in harvest_records + ] return harvest_records_data def get_harvest_record(self, record_id): @@ -262,29 +272,27 @@ def get_harvest_record(self, record_id): if result is None: return None return HarvesterDBInterface._to_dict(result) - + def get_harvest_record_by_job(self, job_id): - harvest_records = ( - self.db.query(HarvestRecord) - .filter_by(harvest_job_id=job_id) - ) + harvest_records = self.db.query(HarvestRecord).filter_by(harvest_job_id=job_id) if harvest_records is None: return None else: harvest_records_data = [ - HarvesterDBInterface._to_dict(rcd) for rcd in harvest_records] + HarvesterDBInterface._to_dict(rcd) for rcd in harvest_records + ] return harvest_records_data - + def get_harvest_record_by_source(self, source_id): - harvest_records = ( - self.db.query(HarvestRecord) - .filter_by(harvest_source_id=source_id) + harvest_records = self.db.query(HarvestRecord).filter_by( + harvest_source_id=source_id ) if harvest_records is None: return None else: harvest_records_data = [ - HarvesterDBInterface._to_dict(rcd) for rcd in harvest_records] + HarvesterDBInterface._to_dict(rcd) for rcd in harvest_records + ] return harvest_records_data def get_source_by_jobid(self, jobid): @@ -293,9 +301,9 @@ def get_source_by_jobid(self, jobid): return None else: return HarvesterDBInterface._to_dict(harvest_job.source) - + def close(self): if hasattr(self.db, "remove"): self.db.remove() - elif hasattr(self.db, 'close'): + elif hasattr(self.db, "close"): self.db.close() diff --git a/app/models.py b/app/models.py index 6c6e53ce..0dac4ead 100644 --- a/app/models.py +++ b/app/models.py @@ -1,49 +1,56 @@ +import uuid + from flask_sqlalchemy import SQLAlchemy from sqlalchemy import Enum, func -import uuid db = SQLAlchemy() + class Base(db.Model): __abstract__ = True # Indicates that this class should not be created as a table id = db.Column(db.String(36), primary_key=True, default=lambda: str(uuid.uuid4())) + class Organization(Base): __tablename__ = "organization" - + name = db.Column(db.String(), nullable=False, index=True) logo = db.Column(db.String()) - sources = db.relationship("HarvestSource", backref="org", - cascade="all, delete-orphan", - lazy=True) + sources = db.relationship( + "HarvestSource", backref="org", cascade="all, delete-orphan", lazy=True + ) + class HarvestSource(Base): __tablename__ = "harvest_source" - + name = db.Column(db.String, nullable=False) notification_emails = db.Column(db.String) - organization_id = db.Column(db.String(36), - db.ForeignKey("organization.id"), - nullable=False) + organization_id = db.Column( + db.String(36), db.ForeignKey("organization.id"), nullable=False + ) frequency = db.Column(db.String, nullable=False) user_requested_frequency = db.Column(db.String) url = db.Column(db.String, nullable=False, unique=True) schema_type = db.Column(db.String, nullable=False) source_type = db.Column(db.String, nullable=False) status = db.Column(db.String) - jobs = db.relationship("HarvestJob", backref="source", - cascade="all, delete-orphan", - lazy=True) + jobs = db.relationship( + "HarvestJob", backref="source", cascade="all, delete-orphan", lazy=True + ) + class HarvestJob(Base): __tablename__ = "harvest_job" - - harvest_source_id = db.Column(db.String(36), - db.ForeignKey("harvest_source.id"), - nullable=False) - status = db.Column(Enum("new", "in_progress", "complete", name="job_status"), - nullable=False, - index=True) + + harvest_source_id = db.Column( + db.String(36), db.ForeignKey("harvest_source.id"), nullable=False + ) + status = db.Column( + Enum("new", "in_progress", "complete", name="job_status"), + nullable=False, + index=True, + ) date_created = db.Column(db.DateTime, index=True, default=func.now()) date_finished = db.Column(db.DateTime) records_added = db.Column(db.Integer) @@ -51,39 +58,43 @@ class HarvestJob(Base): records_deleted = db.Column(db.Integer) records_errored = db.Column(db.Integer) records_ignored = db.Column(db.Integer) - errors = db.relationship("HarvestError", backref="job", - cascade="all, delete-orphan", - lazy=True) + errors = db.relationship( + "HarvestError", backref="job", cascade="all, delete-orphan", lazy=True + ) + class HarvestError(Base): __tablename__ = "harvest_error" - - harvest_job_id = db.Column(db.String(36), - db.ForeignKey("harvest_job.id"), - nullable=False) - harvest_record_id = db.Column(db.String, - db.ForeignKey("harvest_record.id"), - nullable=True) + + harvest_job_id = db.Column( + db.String(36), db.ForeignKey("harvest_job.id"), nullable=False + ) + harvest_record_id = db.Column( + db.String, db.ForeignKey("harvest_record.id"), nullable=True + ) date_created = db.Column(db.DateTime, default=func.now()) type = db.Column(db.String) - severity = db.Column(Enum("CRITICAL", "ERROR", "WARN", name="error_serverity"), - nullable=False, - index=True) + severity = db.Column( + Enum("CRITICAL", "ERROR", "WARN", name="error_serverity"), + nullable=False, + index=True, + ) message = db.Column(db.String) reference = db.Column(db.String) + class HarvestRecord(db.Model): __tablename__ = "harvest_record" - + id = db.Column(db.String, primary_key=True) - harvest_job_id = db.Column(db.String(36), - db.ForeignKey("harvest_job.id"), - nullable=True) - harvest_source_id = db.Column(db.String(36), - db.ForeignKey("harvest_source.id"), - nullable=True) + harvest_job_id = db.Column( + db.String(36), db.ForeignKey("harvest_job.id"), nullable=True + ) + harvest_source_id = db.Column( + db.String(36), db.ForeignKey("harvest_source.id"), nullable=True + ) source_hash = db.Column(db.String) date_created = db.Column(db.DateTime, index=True, default=func.now()) ckan_id = db.Column(db.String, index=True) type = db.Column(db.String) - status = db.Column(db.String) \ No newline at end of file + status = db.Column(db.String) diff --git a/app/routes.py b/app/routes.py index 6b908231..8c7bb3f4 100644 --- a/app/routes.py +++ b/app/routes.py @@ -1,14 +1,17 @@ -from flask import Blueprint, request, render_template, jsonify -from .interface import HarvesterDBInterface +from flask import Blueprint, jsonify, render_template, request + from .forms import HarvestSourceForm, OrganizationForm +from .interface import HarvesterDBInterface mod = Blueprint("harvest", __name__) db = HarvesterDBInterface() + @mod.route("/", methods=["GET"]) def index(): return render_template("index.html") + @mod.route("/organization/add", methods=["POST", "GET"]) def add_organization(): form = OrganizationForm() @@ -21,17 +24,15 @@ def add_organization(): else: form = OrganizationForm() if form.validate_on_submit(): - new_org = { - "name": form.name.data, - "logo": form.logo.data - } - org=db.add_organization(new_org) + new_org = {"name": form.name.data, "logo": form.logo.data} + org = db.add_organization(new_org) if org: return f"Added new organization with ID: {org.id}" else: return "Failed to add organization." return render_template("org_form.html", form=form) + @mod.route("/organization", methods=["GET"]) @mod.route("/organization/", methods=["GET"]) def get_organization(org_id=None): @@ -42,24 +43,28 @@ def get_organization(org_id=None): org = db.get_all_organizations() return org + @mod.route("/organization/", methods=["PUT"]) def update_organization(org_id): result = db.update_organization(org_id, request.json) return result + @mod.route("/organization/", methods=["DELETE"]) def delete_organization(org_id): result = db.delete_organization(org_id) return result + @mod.route("/harvest_source/add", methods=["POST", "GET"]) def add_harvest_source(): form = HarvestSourceForm() organizations = db.get_all_organizations() - organization_choices = [(str(org["id"]), f'{org["name"]} - {org["id"]}') - for org in organizations] + organization_choices = [ + (str(org["id"]), f'{org["name"]} - {org["id"]}') for org in organizations + ] form.organization_id.choices = organization_choices - + if request.is_json: org = db.add_harvest_source(request.json) if org: @@ -70,34 +75,36 @@ def add_harvest_source(): if form.validate_on_submit(): new_source = { "name": form.name.data, - "notification_emails": form.emails.data.replace('\r\n', ', '), + "notification_emails": form.emails.data.replace("\r\n", ", "), "frequency": form.frequency.data, "user_requested_frequency": form.frequency.data, "url": form.url.data, "schema_type": form.schema_type.data, "source_type": form.source_type.data, - "organization_id": form.organization_id.data + "organization_id": form.organization_id.data, } - source=db.add_harvest_source(new_source) + source = db.add_harvest_source(new_source) if source: return f"Added new source with ID: {source.id}" else: return "Failed to add harvest source." return render_template("source_form.html", form=form, choices=organization_choices) + # test interface, will remove later -@mod.route("/get_harvest_source", methods=["GET"]) +@mod.route("/get_harvest_source", methods=["GET"]) def get_all_harvest_sources(): source = db.get_all_harvest_sources() org = db.get_all_organizations() return render_template("harvest_source.html", sources=source, organizations=org) -@mod.route("/harvest_source/", methods=["GET"]) + +@mod.route("/harvest_source/", methods=["GET"]) @mod.route("/harvest_source/", methods=["GET"]) def get_harvest_source(source_id=None): if source_id: source = db.get_harvest_source(source_id) - return jsonify(source) if source else ("Not Found", 404) + return jsonify(source) if source else ("Not Found", 404) organization_id = request.args.get("organization_id") if organization_id: @@ -107,17 +114,20 @@ def get_harvest_source(source_id=None): else: source = db.get_all_harvest_sources() return jsonify(source) - + + @mod.route("/harvest_source/", methods=["PUT"]) def update_harvest_source(source_id): result = db.update_harvest_source(source_id, request.json) return result + @mod.route("/harvest_source/", methods=["DELETE"]) def delete_harvest_source(source_id): result = db.delete_harvest_source(source_id) return result + @mod.route("/harvest_job/add", methods=["POST"]) def add_harvest_job(): if request.is_json: @@ -129,7 +139,8 @@ def add_harvest_job(): else: return jsonify({"Please provide harvest job with json format."}) -@mod.route("/harvest_job/", methods=["GET"]) + +@mod.route("/harvest_job/", methods=["GET"]) @mod.route("/harvest_job/", methods=["GET"]) def get_harvest_job(job_id=None): try: @@ -144,21 +155,24 @@ def get_harvest_job(job_id=None): return "No harvest jobs found for this harvest source", 404 else: job = db.get_all_harvest_jobs() - + return jsonify(job) except Exception: return "Please provide correct job_id or harvest_source_id" + @mod.route("/harvest_job/", methods=["PUT"]) def update_harvest_job(job_id): result = db.update_harvest_job(job_id, request.json) return result + @mod.route("/harvest_job/", methods=["DELETE"]) def delete_harvest_job(job_id): result = db.delete_harvest_job(job_id) return result + @mod.route("/harvest_error/add", methods=["POST", "GET"]) def add_harvest_error(): if request.is_json: @@ -170,7 +184,8 @@ def add_harvest_error(): else: return jsonify({"Please provide harvest error with json format."}) -@mod.route("/harvest_error/", methods=["GET"]) + +@mod.route("/harvest_error/", methods=["GET"]) @mod.route("/harvest_error/", methods=["GET"]) def get_harvest_error(error_id=None): try: @@ -186,11 +201,12 @@ def get_harvest_error(error_id=None): else: # for test, will remove later error = db.get_all_harvest_errors() - + return jsonify(error) except Exception: return "Please provide correct error_id or harvest_job_id" + @mod.route("/harvest_record/add", methods=["POST", "GET"]) def add_harvest_record(): if request.is_json: @@ -202,7 +218,8 @@ def add_harvest_record(): else: return jsonify({"Please provide harvest record with json format."}) -@mod.route("/harvest_record/", methods=["GET"]) + +@mod.route("/harvest_record/", methods=["GET"]) @mod.route("/harvest_record/", methods=["GET"]) def get_harvest_record(record_id=None): try: @@ -223,7 +240,7 @@ def get_harvest_record(record_id=None): else: # for test, will remove later record = db.get_all_harvest_records() - + return jsonify(record) except Exception: return "Please provide correct record_id or harvest_job_id" diff --git a/harvester/__init__.py b/harvester/__init__.py index 99a4defd..83abe371 100644 --- a/harvester/__init__.py +++ b/harvester/__init__.py @@ -1,7 +1,9 @@ import logging.config -from harvester.logger_config import LOGGING_CONFIG + from dotenv import load_dotenv +from harvester.logger_config import LOGGING_CONFIG + load_dotenv() logging.config.dictConfig(LOGGING_CONFIG) diff --git a/harvester/exceptions.py b/harvester/exceptions.py index 1bf76e24..756576ee 100644 --- a/harvester/exceptions.py +++ b/harvester/exceptions.py @@ -1,5 +1,6 @@ import logging from datetime import datetime + from app.interface import HarvesterDBInterface @@ -60,7 +61,7 @@ def __init__(self, msg, harvest_job_id, title): "severity": self.severity, "type": self.type, "date_created": datetime.utcnow(), - "harvest_record_id": self.title # to-do + "harvest_record_id": self.title, # to-do } self.db_interface.add_harvest_error(error_data) diff --git a/harvester/harvest.py b/harvester/harvest.py index 32a158b5..b0ef824c 100644 --- a/harvester/harvest.py +++ b/harvester/harvest.py @@ -1,18 +1,27 @@ +import functools import json +import logging import os import re import xml.etree.ElementTree as ET from dataclasses import asdict, dataclass, field from datetime import datetime from pathlib import Path -import functools -import logging import ckanapi import requests from bs4 import BeautifulSoup from jsonschema import Draft202012Validator +from .ckan_utils import munge_tag, munge_title_to_name +from .exceptions import ( + CompareException, + DCATUSToCKANException, + ExtractCKANSourceException, + ExtractHarvestSourceException, + SynchronizeException, + ValidationException, +) from .utils import ( S3Handler, convert_set_to_list, @@ -21,16 +30,6 @@ sort_dataset, ) -from .ckan_utils import munge_tag, munge_title_to_name -from .exceptions import ( - ExtractHarvestSourceException, - ExtractCKANSourceException, - ValidationException, - DCATUSToCKANException, - SynchronizeException, - CompareException, -) - # requests data session = requests.Session() # TODD: make sure this timeout config doesn't change all requests! diff --git a/harvester/utils.py b/harvester/utils.py index 3e31bf7b..dd304a49 100644 --- a/harvester/utils.py +++ b/harvester/utils.py @@ -1,11 +1,10 @@ +import argparse import hashlib import json import os -import argparse import boto3 import sansjson - from cloudfoundry_client.client import CloudFoundryClient from cloudfoundry_client.v3.tasks import TaskManager diff --git a/migrations/env.py b/migrations/env.py index 4c970927..54a60716 100644 --- a/migrations/env.py +++ b/migrations/env.py @@ -1,9 +1,8 @@ import logging from logging.config import fileConfig -from flask import current_app - from alembic import context +from flask import current_app # this is the Alembic Config object, which provides # access to the values within the .ini file in use. @@ -12,32 +11,31 @@ # Interpret the config file for Python logging. # This line sets up loggers basically. fileConfig(config.config_file_name) -logger = logging.getLogger('alembic.env') +logger = logging.getLogger("alembic.env") def get_engine(): try: # this works with Flask-SQLAlchemy<3 and Alchemical - return current_app.extensions['migrate'].db.get_engine() + return current_app.extensions["migrate"].db.get_engine() except (TypeError, AttributeError): # this works with Flask-SQLAlchemy>=3 - return current_app.extensions['migrate'].db.engine + return current_app.extensions["migrate"].db.engine def get_engine_url(): try: - return get_engine().url.render_as_string(hide_password=False).replace( - '%', '%%') + return get_engine().url.render_as_string(hide_password=False).replace("%", "%%") except AttributeError: - return str(get_engine().url).replace('%', '%%') + return str(get_engine().url).replace("%", "%%") # add your model's MetaData object here # for 'autogenerate' support # from myapp import mymodel # target_metadata = mymodel.Base.metadata -config.set_main_option('sqlalchemy.url', get_engine_url()) -target_db = current_app.extensions['migrate'].db +config.set_main_option("sqlalchemy.url", get_engine_url()) +target_db = current_app.extensions["migrate"].db # other values from the config, defined by the needs of env.py, # can be acquired: @@ -46,7 +44,7 @@ def get_engine_url(): def get_metadata(): - if hasattr(target_db, 'metadatas'): + if hasattr(target_db, "metadatas"): return target_db.metadatas[None] return target_db.metadata @@ -64,9 +62,7 @@ def run_migrations_offline(): """ url = config.get_main_option("sqlalchemy.url") - context.configure( - url=url, target_metadata=get_metadata(), literal_binds=True - ) + context.configure(url=url, target_metadata=get_metadata(), literal_binds=True) with context.begin_transaction(): context.run_migrations() @@ -84,13 +80,13 @@ def run_migrations_online(): # when there are no changes to the schema # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html def process_revision_directives(context, revision, directives): - if getattr(config.cmd_opts, 'autogenerate', False): + if getattr(config.cmd_opts, "autogenerate", False): script = directives[0] if script.upgrade_ops.is_empty(): directives[:] = [] - logger.info('No changes in schema detected.') + logger.info("No changes in schema detected.") - conf_args = current_app.extensions['migrate'].configure_args + conf_args = current_app.extensions["migrate"].configure_args if conf_args.get("process_revision_directives") is None: conf_args["process_revision_directives"] = process_revision_directives @@ -98,9 +94,7 @@ def process_revision_directives(context, revision, directives): with connectable.connect() as connection: context.configure( - connection=connection, - target_metadata=get_metadata(), - **conf_args + connection=connection, target_metadata=get_metadata(), **conf_args ) with context.begin_transaction(): diff --git a/migrations/versions/112aacfec4f3_.py b/migrations/versions/112aacfec4f3_.py index 1c58f6a5..87fa903b 100644 --- a/migrations/versions/112aacfec4f3_.py +++ b/migrations/versions/112aacfec4f3_.py @@ -6,12 +6,11 @@ Create Date: 2024-04-08 16:31:41.323203 """ -from alembic import op import sqlalchemy as sa - +from alembic import op # revision identifiers, used by Alembic. -revision = '112aacfec4f3' +revision = "112aacfec4f3" down_revision = None branch_labels = None depends_on = None @@ -19,103 +18,146 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.create_table('organization', - sa.Column('name', sa.String(), nullable=False), - sa.Column('logo', sa.String(), nullable=True), - sa.Column('id', sa.String(length=36), nullable=False), - sa.PrimaryKeyConstraint('id') + op.create_table( + "organization", + sa.Column("name", sa.String(), nullable=False), + sa.Column("logo", sa.String(), nullable=True), + sa.Column("id", sa.String(length=36), nullable=False), + sa.PrimaryKeyConstraint("id"), ) - with op.batch_alter_table('organization', schema=None) as batch_op: - batch_op.create_index(batch_op.f('ix_organization_name'), ['name'], unique=False) - - op.create_table('harvest_source', - sa.Column('name', sa.String(), nullable=False), - sa.Column('notification_emails', sa.String(), nullable=True), - sa.Column('organization_id', sa.String(length=36), nullable=False), - sa.Column('frequency', sa.String(), nullable=False), - sa.Column('user_requested_frequency', sa.String(), nullable=True), - sa.Column('url', sa.String(), nullable=False), - sa.Column('schema_type', sa.String(), nullable=False), - sa.Column('source_type', sa.String(), nullable=False), - sa.Column('status', sa.String(), nullable=True), - sa.Column('id', sa.String(length=36), nullable=False), - sa.ForeignKeyConstraint(['organization_id'], ['organization.id'], ), - sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('url') + with op.batch_alter_table("organization", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_organization_name"), ["name"], unique=False + ) + + op.create_table( + "harvest_source", + sa.Column("name", sa.String(), nullable=False), + sa.Column("notification_emails", sa.String(), nullable=True), + sa.Column("organization_id", sa.String(length=36), nullable=False), + sa.Column("frequency", sa.String(), nullable=False), + sa.Column("user_requested_frequency", sa.String(), nullable=True), + sa.Column("url", sa.String(), nullable=False), + sa.Column("schema_type", sa.String(), nullable=False), + sa.Column("source_type", sa.String(), nullable=False), + sa.Column("status", sa.String(), nullable=True), + sa.Column("id", sa.String(length=36), nullable=False), + sa.ForeignKeyConstraint( + ["organization_id"], + ["organization.id"], + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("url"), ) - op.create_table('harvest_job', - sa.Column('harvest_source_id', sa.String(length=36), nullable=False), - sa.Column('status', sa.Enum('new', 'in_progress', 'complete', name='job_status'), nullable=False), - sa.Column('date_created', sa.DateTime(), nullable=True), - sa.Column('date_finished', sa.DateTime(), nullable=True), - sa.Column('records_added', sa.Integer(), nullable=True), - sa.Column('records_updated', sa.Integer(), nullable=True), - sa.Column('records_deleted', sa.Integer(), nullable=True), - sa.Column('records_errored', sa.Integer(), nullable=True), - sa.Column('records_ignored', sa.Integer(), nullable=True), - sa.Column('id', sa.String(length=36), nullable=False), - sa.ForeignKeyConstraint(['harvest_source_id'], ['harvest_source.id'], ), - sa.PrimaryKeyConstraint('id') + op.create_table( + "harvest_job", + sa.Column("harvest_source_id", sa.String(length=36), nullable=False), + sa.Column( + "status", + sa.Enum("new", "in_progress", "complete", name="job_status"), + nullable=False, + ), + sa.Column("date_created", sa.DateTime(), nullable=True), + sa.Column("date_finished", sa.DateTime(), nullable=True), + sa.Column("records_added", sa.Integer(), nullable=True), + sa.Column("records_updated", sa.Integer(), nullable=True), + sa.Column("records_deleted", sa.Integer(), nullable=True), + sa.Column("records_errored", sa.Integer(), nullable=True), + sa.Column("records_ignored", sa.Integer(), nullable=True), + sa.Column("id", sa.String(length=36), nullable=False), + sa.ForeignKeyConstraint( + ["harvest_source_id"], + ["harvest_source.id"], + ), + sa.PrimaryKeyConstraint("id"), ) - with op.batch_alter_table('harvest_job', schema=None) as batch_op: - batch_op.create_index(batch_op.f('ix_harvest_job_date_created'), ['date_created'], unique=False) - batch_op.create_index(batch_op.f('ix_harvest_job_status'), ['status'], unique=False) - - op.create_table('harvest_record', - sa.Column('id', sa.String(), nullable=False), - sa.Column('harvest_job_id', sa.String(length=36), nullable=True), - sa.Column('harvest_source_id', sa.String(length=36), nullable=True), - sa.Column('source_hash', sa.String(), nullable=True), - sa.Column('date_created', sa.DateTime(), nullable=True), - sa.Column('ckan_id', sa.String(), nullable=True), - sa.Column('type', sa.String(), nullable=True), - sa.Column('status', sa.String(), nullable=True), - sa.ForeignKeyConstraint(['harvest_job_id'], ['harvest_job.id'], ), - sa.ForeignKeyConstraint(['harvest_source_id'], ['harvest_source.id'], ), - sa.PrimaryKeyConstraint('id') + with op.batch_alter_table("harvest_job", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_harvest_job_date_created"), ["date_created"], unique=False + ) + batch_op.create_index( + batch_op.f("ix_harvest_job_status"), ["status"], unique=False + ) + + op.create_table( + "harvest_record", + sa.Column("id", sa.String(), nullable=False), + sa.Column("harvest_job_id", sa.String(length=36), nullable=True), + sa.Column("harvest_source_id", sa.String(length=36), nullable=True), + sa.Column("source_hash", sa.String(), nullable=True), + sa.Column("date_created", sa.DateTime(), nullable=True), + sa.Column("ckan_id", sa.String(), nullable=True), + sa.Column("type", sa.String(), nullable=True), + sa.Column("status", sa.String(), nullable=True), + sa.ForeignKeyConstraint( + ["harvest_job_id"], + ["harvest_job.id"], + ), + sa.ForeignKeyConstraint( + ["harvest_source_id"], + ["harvest_source.id"], + ), + sa.PrimaryKeyConstraint("id"), ) - with op.batch_alter_table('harvest_record', schema=None) as batch_op: - batch_op.create_index(batch_op.f('ix_harvest_record_ckan_id'), ['ckan_id'], unique=False) - batch_op.create_index(batch_op.f('ix_harvest_record_date_created'), ['date_created'], unique=False) - - op.create_table('harvest_error', - sa.Column('harvest_job_id', sa.String(length=36), nullable=False), - sa.Column('harvest_record_id', sa.String(), nullable=True), - sa.Column('date_created', sa.DateTime(), nullable=True), - sa.Column('type', sa.String(), nullable=True), - sa.Column('severity', sa.Enum('CRITICAL', 'ERROR', 'WARN', name='error_serverity'), nullable=False), - sa.Column('message', sa.String(), nullable=True), - sa.Column('reference', sa.String(), nullable=True), - sa.Column('id', sa.String(length=36), nullable=False), - sa.ForeignKeyConstraint(['harvest_job_id'], ['harvest_job.id'], ), - sa.ForeignKeyConstraint(['harvest_record_id'], ['harvest_record.id'], ), - sa.PrimaryKeyConstraint('id') + with op.batch_alter_table("harvest_record", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_harvest_record_ckan_id"), ["ckan_id"], unique=False + ) + batch_op.create_index( + batch_op.f("ix_harvest_record_date_created"), ["date_created"], unique=False + ) + + op.create_table( + "harvest_error", + sa.Column("harvest_job_id", sa.String(length=36), nullable=False), + sa.Column("harvest_record_id", sa.String(), nullable=True), + sa.Column("date_created", sa.DateTime(), nullable=True), + sa.Column("type", sa.String(), nullable=True), + sa.Column( + "severity", + sa.Enum("CRITICAL", "ERROR", "WARN", name="error_serverity"), + nullable=False, + ), + sa.Column("message", sa.String(), nullable=True), + sa.Column("reference", sa.String(), nullable=True), + sa.Column("id", sa.String(length=36), nullable=False), + sa.ForeignKeyConstraint( + ["harvest_job_id"], + ["harvest_job.id"], + ), + sa.ForeignKeyConstraint( + ["harvest_record_id"], + ["harvest_record.id"], + ), + sa.PrimaryKeyConstraint("id"), ) - with op.batch_alter_table('harvest_error', schema=None) as batch_op: - batch_op.create_index(batch_op.f('ix_harvest_error_severity'), ['severity'], unique=False) + with op.batch_alter_table("harvest_error", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_harvest_error_severity"), ["severity"], unique=False + ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### - with op.batch_alter_table('harvest_error', schema=None) as batch_op: - batch_op.drop_index(batch_op.f('ix_harvest_error_severity')) + with op.batch_alter_table("harvest_error", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_harvest_error_severity")) - op.drop_table('harvest_error') - with op.batch_alter_table('harvest_record', schema=None) as batch_op: - batch_op.drop_index(batch_op.f('ix_harvest_record_date_created')) - batch_op.drop_index(batch_op.f('ix_harvest_record_ckan_id')) + op.drop_table("harvest_error") + with op.batch_alter_table("harvest_record", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_harvest_record_date_created")) + batch_op.drop_index(batch_op.f("ix_harvest_record_ckan_id")) - op.drop_table('harvest_record') - with op.batch_alter_table('harvest_job', schema=None) as batch_op: - batch_op.drop_index(batch_op.f('ix_harvest_job_status')) - batch_op.drop_index(batch_op.f('ix_harvest_job_date_created')) + op.drop_table("harvest_record") + with op.batch_alter_table("harvest_job", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_harvest_job_status")) + batch_op.drop_index(batch_op.f("ix_harvest_job_date_created")) - op.drop_table('harvest_job') - op.drop_table('harvest_source') - with op.batch_alter_table('organization', schema=None) as batch_op: - batch_op.drop_index(batch_op.f('ix_organization_name')) + op.drop_table("harvest_job") + op.drop_table("harvest_source") + with op.batch_alter_table("organization", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_organization_name")) - op.drop_table('organization') + op.drop_table("organization") # ### end Alembic commands ### diff --git a/run.py b/run.py index e3252c1e..63fab832 100644 --- a/run.py +++ b/run.py @@ -2,5 +2,5 @@ app = create_app() -if __name__ == '__main__': - app.run(debug=True, port=8080) \ No newline at end of file +if __name__ == "__main__": + app.run(debug=True, port=8080) diff --git a/scripts/load-test.py b/scripts/load-test.py index a24aa2e4..4c696e47 100644 --- a/scripts/load-test.py +++ b/scripts/load-test.py @@ -1,29 +1,23 @@ - import os import sys sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])) -from harvester import HarvestSource # noqa E402 +from harvester import HarvestSource # noqa E402 -title = os.environ['SRC_TITLE'] -url = os.environ['SRC_URL'] -owner_org = os.environ['SRC_OWNER_ORG'] -source_type = os.environ['SRC_SOURCE_TYPE'] +title = os.environ["SRC_TITLE"] +url = os.environ["SRC_URL"] +owner_org = os.environ["SRC_OWNER_ORG"] +source_type = os.environ["SRC_SOURCE_TYPE"] -print('Running load test for the following harvest config') -print(f'title: {title}') -print(f'url: {url}') -print(f'owner_org: {owner_org}') -print(f'source_type: {source_type}') +print("Running load test for the following harvest config") +print(f"title: {title}") +print(f"url: {url}") +print(f"owner_org: {owner_org}") +print(f"source_type: {source_type}") -harvest_source = HarvestSource( - title, - url, - owner_org, - source_type -) +harvest_source = HarvestSource(title, url, owner_org, source_type) -harvest_source.get_record_changes() +harvest_source.get_record_changes() harvest_source.synchronize_records() harvest_source.report() diff --git a/scripts/smoke-test.py b/scripts/smoke-test.py index 73a693fe..37286004 100755 --- a/scripts/smoke-test.py +++ b/scripts/smoke-test.py @@ -2,6 +2,7 @@ import os import sys + from harvester import HarvestSource, Record sys.path.insert(1, "/".join(os.path.realpath(__file__).split("/")[0:-2])) diff --git a/tests/conftest.py b/tests/conftest.py index a0a27f2e..0af03939 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,14 @@ -from pathlib import Path import os +from pathlib import Path import pytest from dotenv import load_dotenv - from sqlalchemy import create_engine, text -from sqlalchemy.orm import sessionmaker, scoped_session -from app.models import Base -from app.interface import HarvesterDBInterface +from sqlalchemy.orm import scoped_session, sessionmaker -from harvester.utils import open_json -from harvester.utils import CFHandler +from app.interface import HarvesterDBInterface +from app.models import Base +from harvester.utils import CFHandler, open_json load_dotenv() diff --git a/tests/database/test_db.py b/tests/database/test_db.py index c72bb921..f037bbee 100644 --- a/tests/database/test_db.py +++ b/tests/database/test_db.py @@ -1,21 +1,24 @@ import pytest +from sqlalchemy.orm import scoped_session, sessionmaker + from app import create_app -from app.models import db from app.interface import HarvesterDBInterface -from sqlalchemy.orm import scoped_session, sessionmaker +from app.models import db -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def app(): _app = create_app(testing=True) - _app.config['TESTING'] = True - _app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:' + _app.config["TESTING"] = True + _app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///:memory:" with _app.app_context(): db.create_all() yield _app db.drop_all() -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def session(app): with app.app_context(): connection = db.engine.connect() @@ -29,41 +32,48 @@ def session(app): transaction.rollback() connection.close() -@pytest.fixture(scope='function') + +@pytest.fixture(scope="function") def interface(session): return HarvesterDBInterface(session=session) @pytest.fixture def org_data(): - return {'name': 'Test Org', 'logo': 'https://example.com/logo.png'} + return {"name": "Test Org", "logo": "https://example.com/logo.png"} + @pytest.fixture def organization(interface, org_data): org = interface.add_organization(org_data) return org + def test_add_organization(interface, org_data): org = interface.add_organization(org_data) assert org is not None - assert org.name == 'Test Org' + assert org.name == "Test Org" + def test_get_all_organizations(interface, org_data): interface.add_organization(org_data) orgs = interface.get_all_organizations() assert len(orgs) > 0 - assert orgs[0]['name'] == 'Test Org' + assert orgs[0]["name"] == "Test Org" + def test_update_organization(interface, organization): - updates = {'name': 'Updated Org'} + updates = {"name": "Updated Org"} updated_org = interface.update_organization(organization.id, updates) - assert updated_org['name'] == 'Updated Org' + assert updated_org["name"] == "Updated Org" + def test_delete_organization(interface, organization): result = interface.delete_organization(organization.id) assert result == "Organization deleted successfully" + @pytest.fixture def source_data(organization): return { @@ -74,26 +84,30 @@ def source_data(organization): "url": "http://example.com", "schema_type": "type1", "source_type": "typeA", - "status": "active" + "status": "active", } + def test_add_harvest_source(interface, source_data): source = interface.add_harvest_source(source_data) assert source is not None assert source.name == source_data["name"] + def test_get_all_harvest_sources(interface, source_data): interface.add_harvest_source(source_data) sources = interface.get_all_harvest_sources() assert len(sources) > 0 assert sources[0]["name"] == source_data["name"] + def test_get_harvest_source(interface, source_data): source = interface.add_harvest_source(source_data) fetched_source = interface.get_harvest_source(source.id) assert fetched_source is not None assert fetched_source["name"] == source_data["name"] + def test_update_harvest_source(interface, source_data): source = interface.add_harvest_source(source_data) updates = {"name": "Updated Test Source"} @@ -101,6 +115,7 @@ def test_update_harvest_source(interface, source_data): assert updated_source is not None assert updated_source["name"] == updates["name"] + def test_delete_harvest_source(interface, source_data): source = interface.add_harvest_source(source_data) assert source is not None @@ -111,6 +126,7 @@ def test_delete_harvest_source(interface, source_data): deleted_source = interface.get_harvest_source(source.id) assert deleted_source is None + @pytest.fixture def job_data(source_data): return {"status": "new"} @@ -124,4 +140,4 @@ def test_harvest_source_by_jobid(interface, source_data, job_data): harvest_job = interface.add_harvest_job(job_data) harvest_source = interface.get_source_by_jobid(harvest_job.id) - assert source.id == harvest_source["id"] \ No newline at end of file + assert source.id == harvest_source["id"] diff --git a/tests/unit/cf/test_cf_tasks.py b/tests/unit/cf/test_cf_tasks.py index cf6e507c..7da22e78 100644 --- a/tests/unit/cf/test_cf_tasks.py +++ b/tests/unit/cf/test_cf_tasks.py @@ -1,4 +1,5 @@ from unittest.mock import patch + from harvester.utils import CFHandler diff --git a/tests/unit/exception/test_exception_handling.py b/tests/unit/exception/test_exception_handling.py index c3a4d430..cb7f5ceb 100644 --- a/tests/unit/exception/test_exception_handling.py +++ b/tests/unit/exception/test_exception_handling.py @@ -1,19 +1,18 @@ from datetime import datetime from unittest.mock import patch +import ckanapi import pytest import harvester -from harvester.harvest import HarvestSource from harvester.exceptions import ( - ExtractHarvestSourceException, - ExtractCKANSourceException, - ValidationException, DCATUSToCKANException, + ExtractCKANSourceException, + ExtractHarvestSourceException, SynchronizeException, + ValidationException, ) - -import ckanapi +from harvester.harvest import HarvestSource # ruff: noqa: F401 # ruff: noqa: F841 @@ -22,9 +21,9 @@ class TestExceptionHandling: def test_add_harvest_source(self, db_interface): organization = { - "id": "919bfb9e-89eb-4032-9abf-eee54be5a00c", - "logo": "url for the logo", - "name": "GSA" + "id": "919bfb9e-89eb-4032-9abf-eee54be5a00c", + "logo": "url for the logo", + "name": "GSA", } harvest_source = { diff --git a/tests/unit/utils/test_utils.py b/tests/unit/utils/test_utils.py index 81ad049b..b81f240d 100644 --- a/tests/unit/utils/test_utils.py +++ b/tests/unit/utils/test_utils.py @@ -1,6 +1,7 @@ +import pytest + from harvester.ckan_utils import munge_tag, munge_title_to_name from harvester.utils import parse_args -import pytest # these tests are copied from # https://github.com/ckan/ckan/blob/master/ckan/tests/lib/test_munge.py