From 2fcd4543922edb968c96967743ccc135715677a4 Mon Sep 17 00:00:00 2001 From: cka-y Date: Thu, 15 Aug 2024 13:09:49 -0400 Subject: [PATCH 01/22] feat: infra code of gbfs validator arc --- functions-python/extract_location/src/main.py | 8 +- functions-python/gbfs_validator/.coveragerc | 10 ++ .../gbfs_validator/function_config.json | 20 +++ .../gbfs_validator/requirements.txt | 15 +++ .../gbfs_validator/requirements_dev.txt | 4 + .../gbfs_validator/src/__init__.py | 0 functions-python/gbfs_validator/src/main.py | 121 ++++++++++++++++++ functions-python/helpers/parser.py | 18 +++ infra/functions-python/main.tf | 109 +++++++++++++++- 9 files changed, 299 insertions(+), 6 deletions(-) create mode 100644 functions-python/gbfs_validator/.coveragerc create mode 100644 functions-python/gbfs_validator/function_config.json create mode 100644 functions-python/gbfs_validator/requirements.txt create mode 100644 functions-python/gbfs_validator/requirements_dev.txt create mode 100644 functions-python/gbfs_validator/src/__init__.py create mode 100644 functions-python/gbfs_validator/src/main.py create mode 100644 functions-python/helpers/parser.py diff --git a/functions-python/extract_location/src/main.py b/functions-python/extract_location/src/main.py index c6346b095..d3970828f 100644 --- a/functions-python/extract_location/src/main.py +++ b/functions-python/extract_location/src/main.py @@ -20,6 +20,7 @@ ) from helpers.database import start_db_session from helpers.logger import Logger +from helpers.parser import jsonify_pubsub from .bounding_box.bounding_box_extractor import ( create_polygon_wkt_element, update_dataset_bounding_box, @@ -61,11 +62,8 @@ def extract_location_pubsub(cloud_event: CloudEvent): logging.info(f"Function triggered with Pub/Sub event data: {data}") # Extract the Pub/Sub message data - try: - message_data = data["message"]["data"] - message_json = json.loads(base64.b64decode(message_data).decode("utf-8")) - except Exception as e: - logging.error(f"Error parsing message data: {e}") + message_json = jsonify_pubsub(data) + if message_json is None: return "Invalid Pub/Sub message data." 
logging.info(f"Parsed message data: {message_json}") diff --git a/functions-python/gbfs_validator/.coveragerc b/functions-python/gbfs_validator/.coveragerc new file mode 100644 index 000000000..ae792ac20 --- /dev/null +++ b/functions-python/gbfs_validator/.coveragerc @@ -0,0 +1,10 @@ +[run] +omit = + */test*/* + */helpers/* + */database_gen/* + */dataset_service/* + +[report] +exclude_lines = + if __name__ == .__main__.: \ No newline at end of file diff --git a/functions-python/gbfs_validator/function_config.json b/functions-python/gbfs_validator/function_config.json new file mode 100644 index 000000000..5cfa6ee5f --- /dev/null +++ b/functions-python/gbfs_validator/function_config.json @@ -0,0 +1,20 @@ +{ + "name": "gbfs_validator", + "description": "Validate GBFS feeds", + "entry_point": "gbfs_validator", + "timeout": 540, + "memory": "2Gi", + "trigger_http": false, + "include_folders": ["database_gen", "helpers", "dataset_service"], + "environment_variables": [], + "secret_environment_variables": [ + { + "key": "FEEDS_DATABASE_URL" + } + ], + "ingress_settings": "ALLOW_INTERNAL_AND_GCLB", + "max_instance_request_concurrency": 1, + "max_instance_count": 5, + "min_instance_count": 0, + "available_cpu": 1 +} diff --git a/functions-python/gbfs_validator/requirements.txt b/functions-python/gbfs_validator/requirements.txt new file mode 100644 index 000000000..250bf4b34 --- /dev/null +++ b/functions-python/gbfs_validator/requirements.txt @@ -0,0 +1,15 @@ +functions-framework==3.* +google-cloud-storage +google-cloud-pubsub +google-cloud-logging +google-api-core +google-cloud-firestore +google-cloud-datastore +psycopg2-binary==2.9.6 +aiohttp +asyncio +urllib3~=2.1.0 +SQLAlchemy==2.0.23 +geoalchemy2==0.14.7 +requests~=2.31.0 +cloudevents~=1.10.1 \ No newline at end of file diff --git a/functions-python/gbfs_validator/requirements_dev.txt b/functions-python/gbfs_validator/requirements_dev.txt new file mode 100644 index 000000000..800a4ac11 --- /dev/null +++ b/functions-python/gbfs_validator/requirements_dev.txt @@ -0,0 +1,4 @@ +Faker +pytest~=7.4.3 +urllib3-mock +requests-mock \ No newline at end of file diff --git a/functions-python/gbfs_validator/src/__init__.py b/functions-python/gbfs_validator/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py new file mode 100644 index 000000000..45ea9b8c9 --- /dev/null +++ b/functions-python/gbfs_validator/src/main.py @@ -0,0 +1,121 @@ +import logging +import os +import uuid + +import functions_framework +from cloudevents.http import CloudEvent +from google.cloud import pubsub_v1 +from sqlalchemy.orm import joinedload + +from database_gen.sqlacodegen_models import Gbfsfeed +from helpers.database import start_db_session +from helpers.logger import Logger +from helpers.parser import jsonify_pubsub + +logging.basicConfig(level=logging.INFO) + + +def get_all_gbfs_feeds(): + """ + Get all GBFS feeds from the database. + @return: A list of all GBFS feeds. 
+ """ + session = None + try: + session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) + gbfs_feeds = ( + session.query(Gbfsfeed).options(joinedload(Gbfsfeed.gbfsversions)).all() + ) + return gbfs_feeds + except Exception as e: + logging.error(f"Error getting all GBFS feeds: {e}") + raise e + finally: + if session: + session.close() + + +@functions_framework.cloud_event +def gbfs_validator_pubsub(cloud_event: CloudEvent): + """ + Main function triggered by a Pub/Sub message to validate a GBFS feed. + @param cloud_event: The CloudEvent containing the Pub/Sub message. + """ + Logger.init_logger() + data = cloud_event.data + logging.info(f"Function triggered with Pub/Sub event data: {data}") + try: + maximum_executions = int(os.getenv("MAXIMUM_EXECUTIONS", 1)) + except ValueError: + maximum_executions = 1 + logging.info(f"Maximum allowed executions: {maximum_executions}") + + message_json = jsonify_pubsub(cloud_event) + if message_json is None: + return "Invalid Pub/Sub message data." + logging.info(f"Parsed message data: {message_json}") + + # TODO: 1. Parse the CloudEvent data to extract the feed information + # TODO: 2. Store all gbfs file and generate new gbfs.json and store it as well + # TODO: 2.5. Store gbfs snapshot information in the database + # TODO: 3. Validate the feed's version otherwise add a version to the feed + # TODO: 4. Validate the feed (summary) and store the results in the database + return + + +@functions_framework.http +def gbfs_validator_batch(_): + """ + HTTP Cloud Function to trigger the GBFS Validator function for multiple datasets. + @param _: The request object. + @return: The response of the function. + """ + Logger.init_logger() + logging.info("Batch function triggered.") + pubsub_topic_name = os.getenv("PUBSUB_TOPIC_NAME", None) + if pubsub_topic_name is None: + logging.error("PUBSUB_TOPIC_NAME environment variable not set.") + return "PUBSUB_TOPIC_NAME environment variable not set.", 500 + + # Get all GBFS feeds from the database + try: + gbfs_feeds = get_all_gbfs_feeds() + except Exception: + return "Error getting all GBFS feeds.", 500 + + feeds_data = [] + execution_id = str(uuid.uuid4()) + + for gbfs_feed in gbfs_feeds: + if len(gbfs_feed.gbfsversions) == 0: + logging.warning(f"Feed {gbfs_feed.stable_id} has no versions.") + latest_version = None + else: + latest_version = sorted( + gbfs_feed.gbfsversions, key=lambda v: v.version, reverse=True + )[0].version + logging.info( + f"Latest version for feed {gbfs_feed.stable_id}: {latest_version}" + ) + feed_data = { + "execution_id": execution_id, + "stable_id": gbfs_feed.stable_id, + "url": gbfs_feed.auto_discovery_url, + "latest_version": latest_version, + } + feeds_data.append(feed_data) + logging.info(f"Feed {gbfs_feed.stable_id} added to the batch.") + + # Publish to Pub/Sub topic + publisher = pubsub_v1.PublisherClient() + topic_path = publisher.topic_path(os.getenv("PROJECT_ID"), pubsub_topic_name) + + for feed_data in feeds_data: + future = publisher.publish(topic_path, data=b"", **feed_data) + future.result() # Ensure message was published + logging.info(f"Published feed {feed_data['stable_id']} to Pub/Sub.") + + return ( + f"GBFS Validator batch function triggered successfully for {len(feeds_data)} feeds.", + 200, + ) diff --git a/functions-python/helpers/parser.py b/functions-python/helpers/parser.py new file mode 100644 index 000000000..352eee5b1 --- /dev/null +++ b/functions-python/helpers/parser.py @@ -0,0 +1,18 @@ +import base64 +import json +import logging +from cloudevents.http import 
CloudEvent + + +def jsonify_pubsub(event: CloudEvent): + """ + Convert the message data passed to a pub/sub triggered function to JSON + @param event: The Pub/Sub message. + """ + try: + message_data = event["message"]["data"] + message_json = json.loads(base64.b64decode(message_data).decode("utf-8")) + return message_json + except Exception as e: + logging.error(f"Error parsing message data: {e}") + return None diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 2ee0d984e..53ae3d80c 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -30,6 +30,9 @@ locals { function_update_validation_report_config = jsondecode(file("${path.module}/../../functions-python/update_validation_report/function_config.json")) function_update_validation_report_zip = "${path.module}/../../functions-python/update_validation_report/.dist/update_validation_report.zip" + + function_gbfs_validation_report_config = jsondecode(file("${path.module}/../../functions-python/gbfs_validator/function_config.json")) + function_gbfs_validation_report_zip = "${path.module}/../../functions-python/gbfs_validator/.dist/gbfs_validator.zip" } locals { @@ -71,7 +74,7 @@ resource "google_storage_bucket_object" "function_token_zip" { bucket = google_storage_bucket.functions_bucket.name source = local.function_tokens_zip } -# 2. Bucket extract bounding box +# 2. Extract location resource "google_storage_bucket_object" "function_extract_location_zip_object" { name = "bucket-extract-bb-${substr(filebase64sha256(local.function_extract_location_zip),0,10)}.zip" bucket = google_storage_bucket.functions_bucket.name @@ -91,6 +94,13 @@ resource "google_storage_bucket_object" "update_validation_report_zip" { source = local.function_update_validation_report_zip } +# 5. GBFS validation report +resource "google_storage_bucket_object" "gbfs_validation_report_zip" { + bucket = google_storage_bucket.functions_bucket.name + name = "gbfs-validator-${substr(filebase64sha256(local.function_gbfs_validation_report_zip), 0, 10)}.zip" + source = local.function_gbfs_validation_report_zip +} + # Secrets access resource "google_secret_manager_secret_iam_member" "secret_iam_member" { for_each = local.unique_secret_keys @@ -382,6 +392,103 @@ resource "google_cloudfunctions2_function" "update_validation_report" { } } +# 5. 
functions/gbfs_validator cloud function +# 5.1 Create Pub/Sub topic +resource "google_pubsub_topic" "validate_gbfs_feed" { + name = "validate-gbfs-feed" +} + +# 5.2 Create batch function that publishes to the Pub/Sub topic +resource "google_cloudfunctions2_function" "gbfs_validator_batch" { + name = "${local.function_gbfs_validation_report_config.name}-batch" + description = local.function_gbfs_validation_report_config.description + location = var.gcp_region + depends_on = [google_project_iam_member.event-receiving, google_secret_manager_secret_iam_member.secret_iam_member] + + build_config { + runtime = var.python_runtime + entry_point = "${local.function_gbfs_validation_report_config.entry_point}_batch" + source { + storage_source { + bucket = google_storage_bucket.functions_bucket.name + object = google_storage_bucket_object.gbfs_validation_report_zip.name + } + } + } + service_config { + environment_variables = { + PROJECT_ID = var.project_id + PUBSUB_TOPIC_NAME = google_pubsub_topic.validate_gbfs_feed.name + PYTHONNODEBUGRANGES = 0 + } + available_memory = "1Gi" + timeout_seconds = local.function_gbfs_validation_report_config.timeout + available_cpu = local.function_gbfs_validation_report_config.available_cpu + max_instance_request_concurrency = local.function_gbfs_validation_report_config.max_instance_request_concurrency + max_instance_count = local.function_gbfs_validation_report_config.max_instance_count + min_instance_count = local.function_gbfs_validation_report_config.min_instance_count + service_account_email = google_service_account.functions_service_account.email + ingress_settings = "ALLOW_ALL" + vpc_connector = data.google_vpc_access_connector.vpc_connector.id + vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" + dynamic "secret_environment_variables" { + for_each = local.function_gbfs_validation_report_config.secret_environment_variables + content { + key = secret_environment_variables.value["key"] + project_id = var.project_id + secret = "${upper(var.environment)}_${secret_environment_variables.value["key"]}" + version = "latest" + } + } + } +} + +# 5.3 Create function that subscribes to the Pub/Sub topic +resource "google_cloudfunctions2_function" "gbfs_validator_pubsub" { + name = "${local.function_gbfs_validation_report_config.name}-pubsub" + description = local.function_gbfs_validation_report_config.description + location = var.gcp_region + depends_on = [google_project_iam_member.event-receiving, google_secret_manager_secret_iam_member.secret_iam_member] + event_trigger { + trigger_region = var.gcp_region + service_account_email = google_service_account.functions_service_account.email + event_type = "google.cloud.pubsub.topic.v1.messagePublished" + pubsub_topic = google_pubsub_topic.validate_gbfs_feed.id + retry_policy = "RETRY_POLICY_RETRY" + } + build_config { + runtime = var.python_runtime + entry_point = "${local.function_gbfs_validation_report_config.entry_point}_pubsub" + source { + storage_source { + bucket = google_storage_bucket.functions_bucket.name + object = google_storage_bucket_object.gbfs_validation_report_zip.name + } + } + } + service_config { + available_memory = local.function_gbfs_validation_report_config.memory + timeout_seconds = local.function_gbfs_validation_report_config.timeout + available_cpu = local.function_gbfs_validation_report_config.available_cpu + max_instance_request_concurrency = local.function_gbfs_validation_report_config.max_instance_request_concurrency + max_instance_count = 
local.function_gbfs_validation_report_config.max_instance_count + min_instance_count = local.function_gbfs_validation_report_config.min_instance_count + service_account_email = google_service_account.functions_service_account.email + ingress_settings = "ALLOW_ALL" + vpc_connector = data.google_vpc_access_connector.vpc_connector.id + vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" + dynamic "secret_environment_variables" { + for_each = local.function_gbfs_validation_report_config.secret_environment_variables + content { + key = secret_environment_variables.value["key"] + project_id = var.project_id + secret = "${upper(var.environment)}_${secret_environment_variables.value["key"]}" + version = "latest" + } + } + } +} + # IAM entry for all users to invoke the function resource "google_cloudfunctions2_function_iam_member" "tokens_invoker" { project = var.project_id From 379a31231a435b5d9f7bd80c2d06560aa04d00d9 Mon Sep 17 00:00:00 2001 From: cka-y Date: Thu, 15 Aug 2024 13:39:26 -0400 Subject: [PATCH 02/22] fix: added pubsub permissions --- .../gbfs_validator/function_config.json | 2 +- functions-python/gbfs_validator/src/main.py | 20 +++++++++++------ infra/functions-python/main.tf | 22 ++++++++++++++----- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/functions-python/gbfs_validator/function_config.json b/functions-python/gbfs_validator/function_config.json index 5cfa6ee5f..1b14dfe4e 100644 --- a/functions-python/gbfs_validator/function_config.json +++ b/functions-python/gbfs_validator/function_config.json @@ -1,5 +1,5 @@ { - "name": "gbfs_validator", + "name": "gbfs-validator", "description": "Validate GBFS feeds", "entry_point": "gbfs_validator", "timeout": 540, diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 45ea9b8c9..b9a7c8d3b 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -1,3 +1,4 @@ +import json import logging import os import uuid @@ -107,13 +108,18 @@ def gbfs_validator_batch(_): logging.info(f"Feed {gbfs_feed.stable_id} added to the batch.") # Publish to Pub/Sub topic - publisher = pubsub_v1.PublisherClient() - topic_path = publisher.topic_path(os.getenv("PROJECT_ID"), pubsub_topic_name) - - for feed_data in feeds_data: - future = publisher.publish(topic_path, data=b"", **feed_data) - future.result() # Ensure message was published - logging.info(f"Published feed {feed_data['stable_id']} to Pub/Sub.") + try: + publisher = pubsub_v1.PublisherClient() + topic_path = publisher.topic_path(os.getenv("PROJECT_ID"), pubsub_topic_name) + + for feed_data in feeds_data: + message_data = json.dumps(feed_data).encode("utf-8") + future = publisher.publish(topic_path, message_data) + future.result() # Ensure message was published + logging.info(f"Published feed {feed_data['stable_id']} to Pub/Sub.") + except Exception as e: + logging.error(f"Error publishing feeds to Pub/Sub: {e}") + return "Error publishing feeds to Pub/Sub.", 500 return ( f"GBFS Validator batch function triggered successfully for {len(feeds_data)} feeds.", diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 53ae3d80c..53cb8a3ad 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -578,19 +578,29 @@ resource "google_cloud_tasks_queue" "update_validation_report_task_queue" { } # Grant permissions to the service account to publish to the pubsub topic -resource "google_pubsub_topic_iam_binding" "functions_publisher" { 
+resource "google_pubsub_topic_iam_member" "functions_publisher" { + for_each = { + dataset_updates = google_pubsub_topic.dataset_updates.name + validate_gbfs_feed = google_pubsub_topic.validate_gbfs_feed.name + } + project = var.project_id role = "roles/pubsub.publisher" - topic = google_pubsub_topic.dataset_updates.name - members = ["serviceAccount:${google_service_account.functions_service_account.email}"] + topic = each.value + member = "serviceAccount:${google_service_account.functions_service_account.email}" } # Grant permissions to the service account to subscribe to the pubsub topic -resource "google_pubsub_topic_iam_binding" "functions_subscriber" { +resource "google_pubsub_topic_iam_member" "functions_subscriber" { + for_each = { + dataset_updates = google_pubsub_topic.dataset_updates.name + validate_gbfs_feed = google_pubsub_topic.validate_gbfs_feed.name + } + project = var.project_id role = "roles/pubsub.subscriber" - topic = google_pubsub_topic.dataset_updates.name - members = ["serviceAccount:${google_service_account.functions_service_account.email}"] + topic = each.value + member = "serviceAccount:${google_service_account.functions_service_account.email}" } # Grant permissions to the service account to write/read in datastore From fce3464f57dd295da4c73ad6871a43679ed02291 Mon Sep 17 00:00:00 2001 From: cka-y Date: Thu, 15 Aug 2024 14:32:20 -0400 Subject: [PATCH 03/22] feat: added snapshot storage to gcp --- functions-python/gbfs_validator/src/main.py | 101 +++++++++++++++++++- functions-python/helpers/utils.py | 5 +- infra/functions-python/main.tf | 13 +++ infra/functions-python/vars.tf | 6 ++ 4 files changed, 118 insertions(+), 7 deletions(-) diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index b9a7c8d3b..28c2996ae 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -2,19 +2,25 @@ import logging import os import uuid +from datetime import datetime import functions_framework from cloudevents.http import CloudEvent from google.cloud import pubsub_v1 from sqlalchemy.orm import joinedload - +import requests from database_gen.sqlacodegen_models import Gbfsfeed from helpers.database import start_db_session from helpers.logger import Logger from helpers.parser import jsonify_pubsub +from helpers.utils import create_bucket +from google.cloud import storage logging.basicConfig(level=logging.INFO) +BUCKET_NAME_PREFIX = os.getenv("BUCKET_NAME", "mobilitydata-gbfs-snapshots") +ENV = os.getenv("ENV", "dev") + def get_all_gbfs_feeds(): """ @@ -36,6 +42,50 @@ def get_all_gbfs_feeds(): session.close() +@functions_framework.cloud_event +def fetch_gbfs_files(url): + """Fetch the GBFS files from the autodiscovery URL.""" + response = requests.get(url) + response.raise_for_status() + return response.json() + + +def store_gbfs_file_in_bucket(bucket, file_url, destination_blob_name): + """Store a GBFS file in a Cloud Storage bucket.""" + response = requests.get(file_url) + response.raise_for_status() + blob = bucket.blob(destination_blob_name) + blob.upload_from_string(response.content) + blob.make_public() + return blob.public_url + + +def generate_new_gbfs_json(bucket, gbfs_data, stable_id): + """Generate a new gbfs.json with paths pointing to Cloud Storage.""" + new_gbfs_data = gbfs_data.copy() + today = datetime.now().strftime("%Y-%m-%d") + + for feed_key, feed in new_gbfs_data["data"].items(): + if isinstance(feed["feeds"], dict): + # Case when 'feeds' is a dictionary keyed by 
language + for feed_language, feed_info in feed["feeds"].items(): + old_url = feed_info["url"] + blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}_{feed_language}.json" + new_url = store_gbfs_file_in_bucket(bucket, old_url, blob_name) + feed_info["url"] = new_url + elif isinstance(feed["feeds"], list): + # Case when 'feeds' is a list without language codes + for feed_info in feed["feeds"]: + old_url = feed_info["url"] + blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}.json" + new_url = store_gbfs_file_in_bucket(bucket, old_url, blob_name) + feed_info["url"] = new_url + else: + logging.warning(f"Unexpected format in feed: {feed_key}") + + return new_gbfs_data + + @functions_framework.cloud_event def gbfs_validator_pubsub(cloud_event: CloudEvent): """ @@ -51,17 +101,58 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): maximum_executions = 1 logging.info(f"Maximum allowed executions: {maximum_executions}") - message_json = jsonify_pubsub(cloud_event) + message_json = jsonify_pubsub(data) if message_json is None: return "Invalid Pub/Sub message data." logging.info(f"Parsed message data: {message_json}") + try: + execution_id, stable_id, url, latest_version = ( + message_json["execution_id"], + message_json["stable_id"], + message_json["url"], + message_json["latest_version"], + ) + except KeyError: + return ( + "Invalid Pub/Sub message data. " + "Missing required field(s) execution_id, stable_id, url, or latest_version." + ) + logging.info(f"Execution ID: {execution_id}") + logging.info(f"Stable ID: {stable_id}") + logging.info(f"URL: {url}") + logging.info(f"Latest version: {latest_version}") + + bucket_name = f"{BUCKET_NAME_PREFIX}-{ENV}" + logging.info(f"Bucket name: {bucket_name}") + create_bucket(bucket_name) + + # Step 2: Store all gbfs files and generate new gbfs.json + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + try: + gbfs_data = fetch_gbfs_files(url) + except Exception as e: + logging.error(f"Error fetching data from autodiscovery URL: {e}") + return "Error fetching data from autodiscovery URL." + try: + new_gbfs_json = generate_new_gbfs_json(bucket, gbfs_data, stable_id) + except Exception as e: + logging.error(f"Error generating new gbfs.json: {e}") + return "Error generating new gbfs.json." + + # Store the new gbfs.json in the bucket + today = datetime.now().strftime("%Y-%m-%d") + new_gbfs_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/gbfs.json") + new_gbfs_blob.upload_from_string( + json.dumps(new_gbfs_json), content_type="application/json" + ) + logging.info(f"Stored new gbfs.json at {new_gbfs_blob.public_url}") - # TODO: 1. Parse the CloudEvent data to extract the feed information - # TODO: 2. Store all gbfs file and generate new gbfs.json and store it as well # TODO: 2.5. Store gbfs snapshot information in the database # TODO: 3. Validate the feed's version otherwise add a version to the feed # TODO: 4. 
Validate the feed (summary) and store the results in the database - return + + return "GBFS files processed and stored successfully.", 200 @functions_framework.http diff --git a/functions-python/helpers/utils.py b/functions-python/helpers/utils.py index b0e508ddc..0bffa19a4 100644 --- a/functions-python/helpers/utils.py +++ b/functions-python/helpers/utils.py @@ -15,6 +15,7 @@ # import hashlib +import logging import os import requests @@ -34,9 +35,9 @@ def create_bucket(bucket_name): bucket = storage_client.lookup_bucket(bucket_name) if bucket is None: bucket = storage_client.create_bucket(bucket_name) - print(f"Bucket {bucket} created.") + logging.info(f"Bucket {bucket} created.") else: - print(f"Bucket {bucket_name} already exists.") + logging.info(f"Bucket {bucket_name} already exists.") def download_url_content(url, with_retry=False): diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 53cb8a3ad..efb31025c 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -477,6 +477,10 @@ resource "google_cloudfunctions2_function" "gbfs_validator_pubsub" { ingress_settings = "ALLOW_ALL" vpc_connector = data.google_vpc_access_connector.vpc_connector.id vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" + environment_variables = { + ENV = var.environment + BUCKET_NAME = "${var.gbfs_bucket_name}-${var.environment}" + } dynamic "secret_environment_variables" { for_each = local.function_gbfs_validation_report_config.secret_environment_variables content { @@ -529,6 +533,15 @@ resource "google_storage_bucket_iam_binding" "bucket_object_viewer" { ] } +# Grant write access to the gbfs bucket for the service account +resource "google_storage_bucket_iam_binding" "gbfs_bucket_object_creator" { + bucket = "${var.gbfs_bucket_name}-${var.environment}" + role = "roles/storage.objectCreator" + members = [ + "serviceAccount:${google_service_account.functions_service_account.email}" + ] +} + # Grant the service account the ability to invoke the workflows resource "google_project_iam_member" "workflows_invoker" { project = var.project_id diff --git a/infra/functions-python/vars.tf b/infra/functions-python/vars.tf index 70287618b..63e2dc1f3 100644 --- a/infra/functions-python/vars.tf +++ b/infra/functions-python/vars.tf @@ -52,3 +52,9 @@ variable "web_validator_url" { description = "URL of the web validator" default = "https://stg-gtfs-validator-web-mbzoxaljzq-ue.a.run.app" } + +variable "gbfs_bucket_name" { + type = string + description = "Name of the bucket where the GBFS feeds are stored" + default = "mobilitydata-gbfs-snapshots" +} From 9f7959024a3eb1ab2fb529ae72f858ead961d234 Mon Sep 17 00:00:00 2001 From: cka-y Date: Thu, 15 Aug 2024 14:50:15 -0400 Subject: [PATCH 04/22] feat: creating the bucket from terraform --- functions-python/gbfs_validator/src/main.py | 9 ++------- infra/functions-python/main.tf | 9 +++++++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 28c2996ae..6963c683c 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -18,8 +18,7 @@ logging.basicConfig(level=logging.INFO) -BUCKET_NAME_PREFIX = os.getenv("BUCKET_NAME", "mobilitydata-gbfs-snapshots") -ENV = os.getenv("ENV", "dev") +BUCKET_NAME = os.getenv("BUCKET_NAME", "mobilitydata-gbfs-snapshots-dev") def get_all_gbfs_feeds(): @@ -122,13 +121,9 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): 
logging.info(f"URL: {url}") logging.info(f"Latest version: {latest_version}") - bucket_name = f"{BUCKET_NAME_PREFIX}-{ENV}" - logging.info(f"Bucket name: {bucket_name}") - create_bucket(bucket_name) - # Step 2: Store all gbfs files and generate new gbfs.json storage_client = storage.Client() - bucket = storage_client.bucket(bucket_name) + bucket = storage_client.bucket(BUCKET_NAME) try: gbfs_data = fetch_gbfs_files(url) except Exception as e: diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index efb31025c..5917054c9 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -67,6 +67,11 @@ resource "google_storage_bucket" "functions_bucket" { location = "us" } +resource "google_storage_bucket" "gbfs_snapshots_bucket" { + location = "us" + name = "${var.gbfs_bucket_name}-${var.environment}" +} + # Cloud function source code zip files: # 1. Tokens resource "google_storage_bucket_object" "function_token_zip" { @@ -479,7 +484,7 @@ resource "google_cloudfunctions2_function" "gbfs_validator_pubsub" { vpc_connector_egress_settings = "PRIVATE_RANGES_ONLY" environment_variables = { ENV = var.environment - BUCKET_NAME = "${var.gbfs_bucket_name}-${var.environment}" + BUCKET_NAME = google_storage_bucket.gbfs_snapshots_bucket.name } dynamic "secret_environment_variables" { for_each = local.function_gbfs_validation_report_config.secret_environment_variables @@ -535,7 +540,7 @@ resource "google_storage_bucket_iam_binding" "bucket_object_viewer" { # Grant write access to the gbfs bucket for the service account resource "google_storage_bucket_iam_binding" "gbfs_bucket_object_creator" { - bucket = "${var.gbfs_bucket_name}-${var.environment}" + bucket = google_storage_bucket.gbfs_snapshots_bucket.name role = "roles/storage.objectCreator" members = [ "serviceAccount:${google_service_account.functions_service_account.email}" From bb0250601e7cc550d8e633ffabef61d1b9aa578a Mon Sep 17 00:00:00 2001 From: cka-y Date: Thu, 15 Aug 2024 14:52:41 -0400 Subject: [PATCH 05/22] fix: lint --- functions-python/gbfs_validator/src/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 6963c683c..f3a5d34e5 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -5,16 +5,16 @@ from datetime import datetime import functions_framework +import requests from cloudevents.http import CloudEvent from google.cloud import pubsub_v1 +from google.cloud import storage from sqlalchemy.orm import joinedload -import requests + from database_gen.sqlacodegen_models import Gbfsfeed from helpers.database import start_db_session from helpers.logger import Logger from helpers.parser import jsonify_pubsub -from helpers.utils import create_bucket -from google.cloud import storage logging.basicConfig(level=logging.INFO) From 50607503b13f07fece93816bd8c4072a9fcec870 Mon Sep 17 00:00:00 2001 From: cka-y Date: Fri, 16 Aug 2024 13:49:32 -0400 Subject: [PATCH 06/22] feat: added trace storage --- .github/workflows/api-deployer.yml | 306 +++++++++--------- .github/workflows/build-test.yml | 40 +-- functions-python/dataset_service/main.py | 19 +- functions-python/extract_location/src/main.py | 22 +- functions-python/gbfs_validator/src/main.py | 73 +++-- 5 files changed, 245 insertions(+), 215 deletions(-) diff --git a/.github/workflows/api-deployer.yml b/.github/workflows/api-deployer.yml index 4d7036876..89f2b4c8a 100644 --- 
a/.github/workflows/api-deployer.yml +++ b/.github/workflows/api-deployer.yml @@ -67,163 +67,163 @@ jobs: api-build-test: uses: ./.github/workflows/build-test.yml name: Build & Test - - create-artifact-repo: - runs-on: ubuntu-latest - permissions: write-all - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.python_version }} - - - name: Authenticate to Google Cloud - id: gcloud_auth - uses: google-github-actions/auth@v2 - with: - credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} - - - name: GCloud Setup - uses: google-github-actions/setup-gcloud@v2 - - - name: Set Variables - run: | - echo "Setting variables" - echo "BUCKET_NAME=${{ inputs.BUCKET_NAME }}" >> $GITHUB_ENV - echo "OBJECT_PREFIX=${{ inputs.OBJECT_PREFIX }}-artifact" >> $GITHUB_ENV - echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV - echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV - echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV - - - name: Populate Variables - run: | - scripts/replace-variables.sh -in_file infra/backend.conf.rename_me -out_file infra/artifact-registry/backend.conf -variables BUCKET_NAME,OBJECT_PREFIX - scripts/replace-variables.sh -in_file infra/artifact-registry/vars.tfvars.rename_me -out_file infra/artifact-registry/vars.tfvars -variables PROJECT_ID,REGION,ENVIRONMENT,ARTIFACT_REPO_NAME - - - uses: hashicorp/setup-terraform@v2 - with: - terraform_version: 1.5.3 - terraform_wrapper: false - - - name: Terraform Init - run: | - cd infra/artifact-registry - terraform init -backend-config=backend.conf - - - name: Terraform Plan - id: plan - run: | - cd infra/artifact-registry - terraform plan -var-file=vars.tfvars -out=tf.plan - terraform show -no-color tf.plan > terraform-plan.txt - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Terraform Apply - if: ${{ inputs.TF_APPLY }} - run: | - cd infra/artifact-registry - terraform apply -auto-approve tf.plan - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Persist TF plan - uses: actions/upload-artifact@v4 - with: - name: terraform-artifact-plan.txt - path: infra/artifact-registry/terraform-plan.txt - overwrite: true - - docker-build-publish: - # Add docker healthy test - runs-on: ubuntu-latest - permissions: write-all - needs: [create-artifact-repo, api-build-test] - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Extract commit hash and version from git - run: ./scripts/extract-hash-and-version.sh - - - name: Upload version_info to workflow artefacts - uses: actions/upload-artifact@v4 - with: - name: version_info - path: api/src/version_info - compression-level: 0 - - - name: Authenticate to Google Cloud - id: gcloud_auth - uses: google-github-actions/auth@v2 - with: - credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} - - - name: Login to Google Artifact Registry - uses: docker/login-action@v2 - with: - registry: ${{inputs.REGION}}-docker.pkg.dev - username: _json_key_base64 - password: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} - - - name: Set up JDK ${{ env.java_version }} - uses: actions/setup-java@v4 - with: - java-version: ${{ env.java_version }} - distribution: 'temurin' - - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.python_version }} - - - name: Update .env file - run: | - echo "POSTGRES_USER=${{ env.local_postgres_user }}" > config/.env.local - echo "PGUSER=${{ env.local_postgres_user }}" >> config/.env.local - echo "POSTGRES_PASSWORD=${{ env.local_postgres_pwd }}" >> 
config/.env.local - echo "POSTGRES_DB=${{ env.local_postgres_db }}" >> config/.env.local - echo "POSTGRES_PORT=${{ env.local_postgres_port }}" >> config/.env.local - echo "POSTGRES_HOST=localhost" >> config/.env.local - echo "ENV=dev" >> config/.env.local - - # db models were generated and uploaded in api-build-test job above. - - uses: actions/download-artifact@v4 - with: - name: database_gen - path: api/src/database_gen/ - - - name: Copy to db models to functions directory - run: | - cp -R api/src/database_gen/ functions-python/database_gen - - # api schema was generated and uploaded in api-build-test job above. - - uses: actions/download-artifact@v4 - with: - name: feeds_gen - path: api/src/feeds_gen/ - - - name: Set Variables - id: set_variables - run: | - echo "Setting variables" - echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV - echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV - echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV - echo "FEED_API_IMAGE_VERSION=${{ inputs.FEED_API_IMAGE_VERSION }}" >> $GITHUB_ENV - - - name: Build & Publish Docker Image - run: | - # We want to generate the image even if it's the same commit that has been tagged. So use the version - # (coming from the tag) in the docker image tag (If the docket tag does not change it's won't be uploaded) - DOCKER_IMAGE_VERSION=$EXTRACTED_VERSION.$FEED_API_IMAGE_VERSION - scripts/docker-build-push.sh -project_id $PROJECT_ID -repo_name feeds-$ENVIRONMENT -service feed-api -region $REGION -version $DOCKER_IMAGE_VERSION +# +# create-artifact-repo: +# runs-on: ubuntu-latest +# permissions: write-all +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 +# +# - uses: actions/setup-python@v4 +# with: +# python-version: ${{ env.python_version }} +# +# - name: Authenticate to Google Cloud +# id: gcloud_auth +# uses: google-github-actions/auth@v2 +# with: +# credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} +# +# - name: GCloud Setup +# uses: google-github-actions/setup-gcloud@v2 +# +# - name: Set Variables +# run: | +# echo "Setting variables" +# echo "BUCKET_NAME=${{ inputs.BUCKET_NAME }}" >> $GITHUB_ENV +# echo "OBJECT_PREFIX=${{ inputs.OBJECT_PREFIX }}-artifact" >> $GITHUB_ENV +# echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV +# echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV +# echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV +# +# - name: Populate Variables +# run: | +# scripts/replace-variables.sh -in_file infra/backend.conf.rename_me -out_file infra/artifact-registry/backend.conf -variables BUCKET_NAME,OBJECT_PREFIX +# scripts/replace-variables.sh -in_file infra/artifact-registry/vars.tfvars.rename_me -out_file infra/artifact-registry/vars.tfvars -variables PROJECT_ID,REGION,ENVIRONMENT,ARTIFACT_REPO_NAME +# +# - uses: hashicorp/setup-terraform@v2 +# with: +# terraform_version: 1.5.3 +# terraform_wrapper: false +# +# - name: Terraform Init +# run: | +# cd infra/artifact-registry +# terraform init -backend-config=backend.conf +# +# - name: Terraform Plan +# id: plan +# run: | +# cd infra/artifact-registry +# terraform plan -var-file=vars.tfvars -out=tf.plan +# terraform show -no-color tf.plan > terraform-plan.txt +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +# +# - name: Terraform Apply +# if: ${{ inputs.TF_APPLY }} +# run: | +# cd infra/artifact-registry +# terraform apply -auto-approve tf.plan +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +# +# - name: Persist TF plan +# uses: actions/upload-artifact@v4 +# with: +# name: 
terraform-artifact-plan.txt +# path: infra/artifact-registry/terraform-plan.txt +# overwrite: true +# +# docker-build-publish: +# # Add docker healthy test +# runs-on: ubuntu-latest +# permissions: write-all +# needs: [create-artifact-repo, api-build-test] +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 +# +# - name: Extract commit hash and version from git +# run: ./scripts/extract-hash-and-version.sh +# +# - name: Upload version_info to workflow artefacts +# uses: actions/upload-artifact@v4 +# with: +# name: version_info +# path: api/src/version_info +# compression-level: 0 +# +# - name: Authenticate to Google Cloud +# id: gcloud_auth +# uses: google-github-actions/auth@v2 +# with: +# credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} +# +# - name: Login to Google Artifact Registry +# uses: docker/login-action@v2 +# with: +# registry: ${{inputs.REGION}}-docker.pkg.dev +# username: _json_key_base64 +# password: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} +# +# - name: Set up JDK ${{ env.java_version }} +# uses: actions/setup-java@v4 +# with: +# java-version: ${{ env.java_version }} +# distribution: 'temurin' +# +# - uses: actions/setup-python@v4 +# with: +# python-version: ${{ env.python_version }} +# +# - name: Update .env file +# run: | +# echo "POSTGRES_USER=${{ env.local_postgres_user }}" > config/.env.local +# echo "PGUSER=${{ env.local_postgres_user }}" >> config/.env.local +# echo "POSTGRES_PASSWORD=${{ env.local_postgres_pwd }}" >> config/.env.local +# echo "POSTGRES_DB=${{ env.local_postgres_db }}" >> config/.env.local +# echo "POSTGRES_PORT=${{ env.local_postgres_port }}" >> config/.env.local +# echo "POSTGRES_HOST=localhost" >> config/.env.local +# echo "ENV=dev" >> config/.env.local +# +# # db models were generated and uploaded in api-build-test job above. +# - uses: actions/download-artifact@v4 +# with: +# name: database_gen +# path: api/src/database_gen/ +# +# - name: Copy to db models to functions directory +# run: | +# cp -R api/src/database_gen/ functions-python/database_gen +# +# # api schema was generated and uploaded in api-build-test job above. +# - uses: actions/download-artifact@v4 +# with: +# name: feeds_gen +# path: api/src/feeds_gen/ +# +# - name: Set Variables +# id: set_variables +# run: | +# echo "Setting variables" +# echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV +# echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV +# echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV +# echo "FEED_API_IMAGE_VERSION=${{ inputs.FEED_API_IMAGE_VERSION }}" >> $GITHUB_ENV +# +# - name: Build & Publish Docker Image +# run: | +# # We want to generate the image even if it's the same commit that has been tagged. 
So use the version +# # (coming from the tag) in the docker image tag (If the docket tag does not change it's won't be uploaded) +# DOCKER_IMAGE_VERSION=$EXTRACTED_VERSION.$FEED_API_IMAGE_VERSION +# scripts/docker-build-push.sh -project_id $PROJECT_ID -repo_name feeds-$ENVIRONMENT -service feed-api -region $REGION -version $DOCKER_IMAGE_VERSION terraform-deploy: runs-on: ubuntu-latest permissions: write-all - needs: docker-build-publish + needs: api-build-test # TODO: restore docker-build-publish before merge steps: - name: Checkout code diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 4fb31756f..220685a29 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -40,10 +40,10 @@ jobs: docker compose --env-file ./config/.env.local up -d postgres postgres-test working-directory: ${{ github.workspace }} - - name: Run lint checks - shell: bash - run: | - scripts/lint-tests.sh +# - name: Run lint checks +# shell: bash +# run: | +# scripts/lint-tests.sh - name: Install Liquibase run: | @@ -80,22 +80,22 @@ jobs: scripts/setup-openapi-generator.sh scripts/api-gen.sh - - name: Unit tests - API - shell: bash - run: | - scripts/api-tests.sh --folder api --html_report - - - name: Unit tests - Python Functions - shell: bash - run: | - scripts/api-tests.sh --folder functions-python --html_report - - - name: Upload coverage report - uses: actions/upload-artifact@v4 - with: - name: coverage_report - path: scripts/coverage_reports/ - overwrite: true +# - name: Unit tests - API +# shell: bash +# run: | +# scripts/api-tests.sh --folder api --html_report +# +# - name: Unit tests - Python Functions +# shell: bash +# run: | +# scripts/api-tests.sh --folder functions-python --html_report +# +# - name: Upload coverage report +# uses: actions/upload-artifact@v4 +# with: +# name: coverage_report +# path: scripts/coverage_reports/ +# overwrite: true - name: Upload DB models uses: actions/upload-artifact@v4 diff --git a/functions-python/dataset_service/main.py b/functions-python/dataset_service/main.py index 1c7f7105a..0d137350b 100644 --- a/functions-python/dataset_service/main.py +++ b/functions-python/dataset_service/main.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - +import logging import uuid from datetime import datetime from enum import Enum @@ -43,6 +43,7 @@ class Status(Enum): class PipelineStage(Enum): DATASET_PROCESSING = "DATASET_PROCESSING" LOCATION_EXTRACTION = "LOCATION_EXTRACTION" + GBFS_VALIDATION = "GBFS_VALIDATION" # Dataset trace class to store the trace of a dataset @@ -72,11 +73,25 @@ class BatchExecution: batch_execution_collection: Final[str] = "batch_execution" +class MaxExecutionsReachedError(Exception): + pass + + # Dataset trace service with CRUD operations for the dataset trace class DatasetTraceService: def __init__(self, client: Client = None): self.client = datastore.Client() if client is None else client + def validate_and_save(self, dataset_trace: DatasetTrace, max_executions: int = 1): + if dataset_trace.execution_id is None or dataset_trace.stable_id is None: + raise ValueError("Execution ID and Stable ID are required.") + trace = self.get_by_execution_and_stable_ids(dataset_trace.execution_id, dataset_trace.stable_id) + executions = len(trace) if trace else 0 + logging.info(f"[{dataset_trace.stable_id}] Executions: {executions}") + if executions > 0 and executions >= max_executions: + raise MaxExecutionsReachedError(f"Maximum executions reached for {dataset_trace.stable_id}.") + self.save(dataset_trace) + # Save the dataset trace def save(self, dataset_trace: DatasetTrace): entity = self._dataset_trace_to_entity(dataset_trace) @@ -94,7 +109,7 @@ def get_by_id(self, trace_id: str) -> [DatasetTrace]: # Get the dataset trace by execution id and stable id def get_by_execution_and_stable_ids( - self, execution_id: str, stable_id: str + self, execution_id: str, stable_id: str ) -> [DatasetTrace]: query = self.client.query(kind=dataset_trace_collection) query.add_filter("execution_id", "=", execution_id) diff --git a/functions-python/extract_location/src/main.py b/functions-python/extract_location/src/main.py index d3970828f..9358b53f6 100644 --- a/functions-python/extract_location/src/main.py +++ b/functions-python/extract_location/src/main.py @@ -16,7 +16,7 @@ DatasetTraceService, DatasetTrace, Status, - PipelineStage, + PipelineStage, MaxExecutionsReachedError, ) from helpers.database import start_db_session from helpers.logger import Logger @@ -86,17 +86,6 @@ def extract_location_pubsub(cloud_event: CloudEvent): execution_id = str(uuid.uuid4()) logging.info(f"[{dataset_id}] Generated execution ID: {execution_id}") trace_service = DatasetTraceService() - trace = trace_service.get_by_execution_and_stable_ids(execution_id, stable_id) - logging.info(f"[{dataset_id}] Trace: {trace}") - executions = len(trace) if trace else 0 - print(f"[{dataset_id}] Executions: {executions}") - print(trace_service.get_by_execution_and_stable_ids(execution_id, stable_id)) - logging.info(f"[{dataset_id}] Executions: {executions}") - if executions > 0 and executions >= maximum_executions: - logging.warning( - f"[{dataset_id}] Maximum executions reached. Skipping processing." - ) - return f"Maximum executions reached for {dataset_id}." 
trace_id = str(uuid.uuid4()) error = None # Saving trace before starting in case we run into memory problems or uncatchable errors @@ -110,7 +99,14 @@ def extract_location_pubsub(cloud_event: CloudEvent): dataset_id=dataset_id, pipeline_stage=PipelineStage.LOCATION_EXTRACTION, ) - trace_service.save(trace) + try: + trace_service.validate_and_save(trace, maximum_executions) + except ValueError as e: + logging.error(f"[{dataset_id}] Error while saving trace: {e}") + return f"Error while saving trace: {e}" + except MaxExecutionsReachedError as e: + logging.warning(f"[{dataset_id}] {e}") + return f"{e}" try: logging.info(f"[{dataset_id}] accessing url: {url}") try: diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index f3a5d34e5..aba1c6817 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -3,6 +3,7 @@ import os import uuid from datetime import datetime +from typing import List, Dict, Any import functions_framework import requests @@ -15,42 +16,44 @@ from helpers.database import start_db_session from helpers.logger import Logger from helpers.parser import jsonify_pubsub +from dataset_service.main import DatasetTraceService, DatasetTrace, Status, PipelineStage, MaxExecutionsReachedError logging.basicConfig(level=logging.INFO) BUCKET_NAME = os.getenv("BUCKET_NAME", "mobilitydata-gbfs-snapshots-dev") -def get_all_gbfs_feeds(): +def fetch_all_gbfs_feeds() -> List[Gbfsfeed]: """ - Get all GBFS feeds from the database. + Fetch all GBFS feeds from the database. @return: A list of all GBFS feeds. """ session = None try: session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) gbfs_feeds = ( - session.query(Gbfsfeed).options(joinedload(Gbfsfeed.gbfsversions)).all() + session.query(Gbfsfeed) + .options(joinedload(Gbfsfeed.gbfsversions)) + .all() ) return gbfs_feeds except Exception as e: - logging.error(f"Error getting all GBFS feeds: {e}") + logging.error(f"Error fetching all GBFS feeds: {e}") raise e finally: if session: session.close() -@functions_framework.cloud_event -def fetch_gbfs_files(url): +def fetch_gbfs_files(url: str) -> Dict[str, Any]: """Fetch the GBFS files from the autodiscovery URL.""" response = requests.get(url) response.raise_for_status() return response.json() -def store_gbfs_file_in_bucket(bucket, file_url, destination_blob_name): - """Store a GBFS file in a Cloud Storage bucket.""" +def upload_gbfs_file_to_bucket(bucket: storage.Bucket, file_url: str, destination_blob_name: str) -> str: + """Upload a GBFS file to a Cloud Storage bucket.""" response = requests.get(file_url) response.raise_for_status() blob = bucket.blob(destination_blob_name) @@ -59,8 +62,8 @@ def store_gbfs_file_in_bucket(bucket, file_url, destination_blob_name): return blob.public_url -def generate_new_gbfs_json(bucket, gbfs_data, stable_id): - """Generate a new gbfs.json with paths pointing to Cloud Storage.""" +def create_gbfs_json_with_bucket_paths(bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str) -> None: + """Create a new gbfs.json with paths pointing to Cloud Storage and upload it.""" new_gbfs_data = gbfs_data.copy() today = datetime.now().strftime("%Y-%m-%d") @@ -70,19 +73,23 @@ def generate_new_gbfs_json(bucket, gbfs_data, stable_id): for feed_language, feed_info in feed["feeds"].items(): old_url = feed_info["url"] blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}_{feed_language}.json" - new_url = store_gbfs_file_in_bucket(bucket, old_url, blob_name) + 
new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) feed_info["url"] = new_url elif isinstance(feed["feeds"], list): # Case when 'feeds' is a list without language codes for feed_info in feed["feeds"]: old_url = feed_info["url"] blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}.json" - new_url = store_gbfs_file_in_bucket(bucket, old_url, blob_name) + new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) feed_info["url"] = new_url else: logging.warning(f"Unexpected format in feed: {feed_key}") - return new_gbfs_data + # Save the new gbfs.json in the bucket + new_gbfs_data["last_updated"] = today + new_gbfs_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/gbfs.json") + new_gbfs_blob.upload_from_string(json.dumps(new_gbfs_data), content_type="application/json") + new_gbfs_blob.make_public() @functions_framework.cloud_event @@ -121,31 +128,43 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): logging.info(f"URL: {url}") logging.info(f"Latest version: {latest_version}") + trace_service = DatasetTraceService() + trace_id = str(uuid.uuid4()) + trace = DatasetTrace( + trace_id=trace_id, + stable_id=stable_id, + execution_id=execution_id, + status=Status.PROCESSING, + timestamp=datetime.now(), + pipeline_stage=PipelineStage.GBFS_VALIDATION, + ) + try: + trace_service.validate_and_save(trace, maximum_executions) + except ValueError as e: + logging.error(f"Error saving trace: {e}") + return "Error saving trace." + except MaxExecutionsReachedError: + logging.error(f"Maximum executions reached for {stable_id}.") + return "Maximum executions reached." + # Step 2: Store all gbfs files and generate new gbfs.json - storage_client = storage.Client() - bucket = storage_client.bucket(BUCKET_NAME) try: + storage_client = storage.Client() + bucket = storage_client.bucket(BUCKET_NAME) gbfs_data = fetch_gbfs_files(url) except Exception as e: logging.error(f"Error fetching data from autodiscovery URL: {e}") return "Error fetching data from autodiscovery URL." try: - new_gbfs_json = generate_new_gbfs_json(bucket, gbfs_data, stable_id) + create_gbfs_json_with_bucket_paths(bucket, gbfs_data, stable_id) except Exception as e: logging.error(f"Error generating new gbfs.json: {e}") return "Error generating new gbfs.json." - # Store the new gbfs.json in the bucket - today = datetime.now().strftime("%Y-%m-%d") - new_gbfs_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/gbfs.json") - new_gbfs_blob.upload_from_string( - json.dumps(new_gbfs_json), content_type="application/json" - ) - logging.info(f"Stored new gbfs.json at {new_gbfs_blob.public_url}") + # Step 3: Store gbfs snapshot information in the database - # TODO: 2.5. Store gbfs snapshot information in the database - # TODO: 3. Validate the feed's version otherwise add a version to the feed - # TODO: 4. Validate the feed (summary) and store the results in the database + # TODO: 4. Validate the feed's version otherwise add a version to the feed + # TODO: 5. 
Validate the feed (summary) and store the results in the database return "GBFS files processed and stored successfully.", 200 @@ -166,7 +185,7 @@ def gbfs_validator_batch(_): # Get all GBFS feeds from the database try: - gbfs_feeds = get_all_gbfs_feeds() + gbfs_feeds = fetch_all_gbfs_feeds() except Exception: return "Error getting all GBFS feeds.", 500 From fcbbf2f9451be78782ef240538468826c6383481 Mon Sep 17 00:00:00 2001 From: cka-y Date: Fri, 16 Aug 2024 14:49:03 -0400 Subject: [PATCH 07/22] feat: added db schema --- functions-python/extract_location/src/main.py | 3 +- functions-python/gbfs_validator/src/main.py | 64 +++++++++++++++---- liquibase/changelog.xml | 1 + liquibase/changes/feat_566.sql | 27 ++++++++ 4 files changed, 82 insertions(+), 13 deletions(-) create mode 100644 liquibase/changes/feat_566.sql diff --git a/functions-python/extract_location/src/main.py b/functions-python/extract_location/src/main.py index 9358b53f6..9d90f1b4e 100644 --- a/functions-python/extract_location/src/main.py +++ b/functions-python/extract_location/src/main.py @@ -16,7 +16,8 @@ DatasetTraceService, DatasetTrace, Status, - PipelineStage, MaxExecutionsReachedError, + PipelineStage, + MaxExecutionsReachedError, ) from helpers.database import start_db_session from helpers.logger import Logger diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index aba1c6817..89c47c964 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -12,11 +12,17 @@ from google.cloud import storage from sqlalchemy.orm import joinedload -from database_gen.sqlacodegen_models import Gbfsfeed +from database_gen.sqlacodegen_models import Gbfsfeed, Gbfssnapshot from helpers.database import start_db_session from helpers.logger import Logger from helpers.parser import jsonify_pubsub -from dataset_service.main import DatasetTraceService, DatasetTrace, Status, PipelineStage, MaxExecutionsReachedError +from dataset_service.main import ( + DatasetTraceService, + DatasetTrace, + Status, + PipelineStage, + MaxExecutionsReachedError, +) logging.basicConfig(level=logging.INFO) @@ -32,9 +38,7 @@ def fetch_all_gbfs_feeds() -> List[Gbfsfeed]: try: session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) gbfs_feeds = ( - session.query(Gbfsfeed) - .options(joinedload(Gbfsfeed.gbfsversions)) - .all() + session.query(Gbfsfeed).options(joinedload(Gbfsfeed.gbfsversions)).all() ) return gbfs_feeds except Exception as e: @@ -52,18 +56,29 @@ def fetch_gbfs_files(url: str) -> Dict[str, Any]: return response.json() -def upload_gbfs_file_to_bucket(bucket: storage.Bucket, file_url: str, destination_blob_name: str) -> str: +def upload_gbfs_file_to_bucket( + bucket: storage.Bucket, file_url: str, destination_blob_name: str +) -> str: """Upload a GBFS file to a Cloud Storage bucket.""" response = requests.get(file_url) response.raise_for_status() blob = bucket.blob(destination_blob_name) blob.upload_from_string(response.content) blob.make_public() + logging.info(f"Uploaded {destination_blob_name} to {bucket.name}.") return blob.public_url -def create_gbfs_json_with_bucket_paths(bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str) -> None: - """Create a new gbfs.json with paths pointing to Cloud Storage and upload it.""" +def create_gbfs_json_with_bucket_paths( + bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str +) -> None: + """ + Create a new gbfs.json with paths pointing to Cloud Storage and upload it. 
+ @param bucket: The Cloud Storage bucket. + @param gbfs_data: The GBFS data. + @param stable_id: The stable ID of the feed. + @return: The public URL of the new gbfs.json. + """ new_gbfs_data = gbfs_data.copy() today = datetime.now().strftime("%Y-%m-%d") @@ -88,8 +103,11 @@ def create_gbfs_json_with_bucket_paths(bucket: storage.Bucket, gbfs_data: Dict[s # Save the new gbfs.json in the bucket new_gbfs_data["last_updated"] = today new_gbfs_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/gbfs.json") - new_gbfs_blob.upload_from_string(json.dumps(new_gbfs_data), content_type="application/json") + new_gbfs_blob.upload_from_string( + json.dumps(new_gbfs_data), content_type="application/json" + ) new_gbfs_blob.make_public() + return new_gbfs_blob.public_url @functions_framework.cloud_event @@ -112,11 +130,12 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): return "Invalid Pub/Sub message data." logging.info(f"Parsed message data: {message_json}") try: - execution_id, stable_id, url, latest_version = ( + execution_id, stable_id, url, latest_version, feed_id = ( message_json["execution_id"], message_json["stable_id"], message_json["url"], message_json["latest_version"], + message_json["feed_id"], ) except KeyError: return ( @@ -127,6 +146,7 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): logging.info(f"Stable ID: {stable_id}") logging.info(f"URL: {url}") logging.info(f"Latest version: {latest_version}") + logging.info(f"Feed ID: {feed_id}") trace_service = DatasetTraceService() trace_id = str(uuid.uuid4()) @@ -156,13 +176,32 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): logging.error(f"Error fetching data from autodiscovery URL: {e}") return "Error fetching data from autodiscovery URL." try: - create_gbfs_json_with_bucket_paths(bucket, gbfs_data, stable_id) + hosted_url = create_gbfs_json_with_bucket_paths(bucket, gbfs_data, stable_id) except Exception as e: logging.error(f"Error generating new gbfs.json: {e}") return "Error generating new gbfs.json." # Step 3: Store gbfs snapshot information in the database - + today = datetime.now().strftime("%Y-%m-%d") + snapshot_id = str(uuid.uuid4()) + snapshot = Gbfssnapshot( + id=snapshot_id, + stable_id=f"{stable_id}-{today}", + feed_id=feed_id, + downloaded_at=datetime.now(), + hosted_url=hosted_url, + ) + session = None + try: + session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) + session.add(snapshot) + session.commit() + except Exception as e: + logging.error(f"Error storing GBFS snapshot in the database: {e}") + return "Error storing GBFS snapshot in the database." + finally: + if session: + session.close() # TODO: 4. Validate the feed's version otherwise add a version to the feed # TODO: 5. 
Validate the feed (summary) and store the results in the database @@ -206,6 +245,7 @@ def gbfs_validator_batch(_): feed_data = { "execution_id": execution_id, "stable_id": gbfs_feed.stable_id, + "feed_id": gbfs_feed.id, "url": gbfs_feed.auto_discovery_url, "latest_version": latest_version, } diff --git a/liquibase/changelog.xml b/liquibase/changelog.xml index d7e4259bc..a70204ad4 100644 --- a/liquibase/changelog.xml +++ b/liquibase/changelog.xml @@ -28,4 +28,5 @@ + \ No newline at end of file diff --git a/liquibase/changes/feat_566.sql b/liquibase/changes/feat_566.sql new file mode 100644 index 000000000..ea4557ad7 --- /dev/null +++ b/liquibase/changes/feat_566.sql @@ -0,0 +1,27 @@ +CREATE TABLE GBFSSnapshot( + id VARCHAR(255) NOT NULL PRIMARY KEY, + feed_id VARCHAR(255) NOT NULL, + hosted_url VARCHAR(255) NOT NULL, + downloaded_at TIMESTAMPTZ NOT NULL, + stable_id VARCHAR(255) NOT NULL UNIQUE, + FOREIGN KEY (feed_id) REFERENCES GBFSFeed(id) +); + +CREATE TABLE GBFSValidationReport( + id VARCHAR(255) NOT NULL PRIMARY KEY, + gbfs_snapshot_id VARCHAR(255) NOT NULL, + validated_at TIMESTAMPTZ NOT NULL, + report_summary_url VARCHAR(255) NOT NULL, + FOREIGN KEY (gbfs_snapshot_id) REFERENCES GBFSSnapshot(id) +); + +CREATE TABLE GBFSNotice( + keyword VARCHAR(255) NOT NULL, + message TEXT NOT NULL, + schema_path VARCHAR(255) NOT NULL, + gbfs_file VARCHAR(255) NOT NULL, + validation_report_id VARCHAR(255) NOT NULL, + count INTEGER NOT NULL, + FOREIGN KEY (validation_report_id) REFERENCES GBFSValidationReport(id), + PRIMARY KEY (validation_report_id, keyword, gbfs_file, schema_path) +); From 5e920c72a667d222e8c9c58eb78f2c5d5709338b Mon Sep 17 00:00:00 2001 From: cka-y Date: Fri, 16 Aug 2024 14:49:48 -0400 Subject: [PATCH 08/22] feat: added db schema --- functions-python/dataset_service/main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/functions-python/dataset_service/main.py b/functions-python/dataset_service/main.py index 0d137350b..e4429a612 100644 --- a/functions-python/dataset_service/main.py +++ b/functions-python/dataset_service/main.py @@ -85,11 +85,15 @@ def __init__(self, client: Client = None): def validate_and_save(self, dataset_trace: DatasetTrace, max_executions: int = 1): if dataset_trace.execution_id is None or dataset_trace.stable_id is None: raise ValueError("Execution ID and Stable ID are required.") - trace = self.get_by_execution_and_stable_ids(dataset_trace.execution_id, dataset_trace.stable_id) + trace = self.get_by_execution_and_stable_ids( + dataset_trace.execution_id, dataset_trace.stable_id + ) executions = len(trace) if trace else 0 logging.info(f"[{dataset_trace.stable_id}] Executions: {executions}") if executions > 0 and executions >= max_executions: - raise MaxExecutionsReachedError(f"Maximum executions reached for {dataset_trace.stable_id}.") + raise MaxExecutionsReachedError( + f"Maximum executions reached for {dataset_trace.stable_id}." 
+ ) self.save(dataset_trace) # Save the dataset trace @@ -109,7 +113,7 @@ def get_by_id(self, trace_id: str) -> [DatasetTrace]: # Get the dataset trace by execution id and stable id def get_by_execution_and_stable_ids( - self, execution_id: str, stable_id: str + self, execution_id: str, stable_id: str ) -> [DatasetTrace]: query = self.client.query(kind=dataset_trace_collection) query.add_filter("execution_id", "=", execution_id) From 103246f170e8c0939ce4a2510d0c24f4119bb28c Mon Sep 17 00:00:00 2001 From: cka-y Date: Fri, 16 Aug 2024 15:39:08 -0400 Subject: [PATCH 09/22] feat: storing validation report --- functions-python/gbfs_validator/src/main.py | 60 +++++++-- liquibase/tmp.json | 138 ++++++++++++++++++++ 2 files changed, 188 insertions(+), 10 deletions(-) create mode 100644 liquibase/tmp.json diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 89c47c964..48af442ae 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -12,7 +12,7 @@ from google.cloud import storage from sqlalchemy.orm import joinedload -from database_gen.sqlacodegen_models import Gbfsfeed, Gbfssnapshot +from database_gen.sqlacodegen_models import Gbfsfeed, Gbfssnapshot, Gbfsvalidationreport, Gbfsnotice from helpers.database import start_db_session from helpers.logger import Logger from helpers.parser import jsonify_pubsub @@ -27,6 +27,8 @@ logging.basicConfig(level=logging.INFO) BUCKET_NAME = os.getenv("BUCKET_NAME", "mobilitydata-gbfs-snapshots-dev") +VALIDATOR_URL = os.getenv("VALIDATOR_URL", + "https://gbfs-validator.mobilitydata.org/.netlify/functions/validator-summary") def fetch_all_gbfs_feeds() -> List[Gbfsfeed]: @@ -57,7 +59,7 @@ def fetch_gbfs_files(url: str) -> Dict[str, Any]: def upload_gbfs_file_to_bucket( - bucket: storage.Bucket, file_url: str, destination_blob_name: str + bucket: storage.Bucket, file_url: str, destination_blob_name: str ) -> str: """Upload a GBFS file to a Cloud Storage bucket.""" response = requests.get(file_url) @@ -70,8 +72,8 @@ def upload_gbfs_file_to_bucket( def create_gbfs_json_with_bucket_paths( - bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str -) -> None: + bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str +) -> str: """ Create a new gbfs.json with paths pointing to Cloud Storage and upload it. @param bucket: The Cloud Storage bucket. @@ -191,20 +193,58 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): downloaded_at=datetime.now(), hosted_url=hosted_url, ) - session = None + try: session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) - session.add(snapshot) - session.commit() except Exception as e: logging.error(f"Error storing GBFS snapshot in the database: {e}") return "Error storing GBFS snapshot in the database." - finally: - if session: - session.close() # TODO: 4. Validate the feed's version otherwise add a version to the feed # TODO: 5. 
Validate the feed (summary) and store the results in the database + try: + json_payload = {"url": hosted_url} + response = requests.post(VALIDATOR_URL, json=json_payload) + response.raise_for_status() + logging.info(f"GBFS feed {hosted_url} validated successfully.") + json_report_summary = response.json() + logging.info(f"Validation summary: {json_report_summary}") + # Store in GCP + report_summary_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/report_summary.json") + report_summary_blob.upload_from_string( + json.dumps(json_report_summary), content_type="application/json" + ) + report_summary_blob.make_public() + # Store in database + validation_report = Gbfsvalidationreport( + id=f"{uuid.uuid4()}", + gbfs_snapshot_id=snapshot_id, + validated_at=datetime.now(), + report_summary_url=report_summary_blob.public_url, + ) + validation_report.gbfsnotices = [] + if 'filesSummary' in json_report_summary: + for file_summary in json_report_summary['filesSummary']: + logging.info(f"File summary: {file_summary}") + if file_summary['hasErrors']: + for error in file_summary['groupedErrors']: + logging.error(f"Error: {error}") + notice = Gbfsnotice( + keyword=error['keyword'], + message=error['message'], + schema_path=error['schemaPath'], + gbfs_file=file_summary['file'], + validation_report_id=validation_report.id, + count=error['count'], + ) + validation_report.gbfsnotices.append(notice) + snapshot.gbfsvalidationreports = [validation_report] + session.add(snapshot) + + except Exception as e: + logging.error(f"Error validating GBFS feed: {e}") + return "Error validating GBFS feed." + return "GBFS files processed and stored successfully.", 200 diff --git a/liquibase/tmp.json b/liquibase/tmp.json new file mode 100644 index 000000000..e8585e241 --- /dev/null +++ b/liquibase/tmp.json @@ -0,0 +1,138 @@ +{ + "summary": { + "validatorVersion": "1.0.12", + "version": { + "detected": "2.3", + "validated": "2.3" + }, + "hasErrors": true, + "errorsCount": 990 + }, + "filesSummary": [ + { + "required": true, + "exists": true, + "file": "gbfs.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": true, + "file": "gbfs_versions.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": true, + "exists": true, + "file": "system_information.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": true, + "file": "vehicle_types.json", + "hasErrors": true, + "errorsCount": 4, + "groupedErrors": [ + { + "keyword": "enum", + "message": "must be equal to one of the allowed values", + "schemaPath": "#/properties/data/properties/vehicle_types/items/properties/default_pricing_plan_id/enum", + "count": 4 + } + ] + }, + { + "required": false, + "exists": true, + "file": "station_information.json", + "hasErrors": true, + "errorsCount": 984, + "groupedErrors": [ + { + "keyword": "enum", + "message": "must be equal to one of the allowed values", + "schemaPath": "#/properties/data/properties/stations/items/properties/rental_methods/items/enum", + "count": 984 + } + ] + }, + { + "required": false, + "exists": true, + "file": "station_status.json", + "hasErrors": true, + "errorsCount": 2, + "groupedErrors": [ + { + "keyword": "required", + "message": "must have required property 'last_reported'", + "schemaPath": "#/properties/data/properties/stations/items/required", + "count": 2 + } + ] + }, + { + "required": false, + "exists": false, + "file": "free_bike_status.json", + 
"hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": false, + "file": "system_hours.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": false, + "file": "system_calendar.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": true, + "file": "system_regions.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": true, + "file": "system_pricing_plans.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": false, + "file": "system_alerts.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + }, + { + "required": false, + "exists": true, + "file": "geofencing_zones.json", + "hasErrors": false, + "errorsCount": 0, + "groupedErrors": [] + } + ] +} \ No newline at end of file From 2cc0d5903b7c5b9b6a5273c8490f57ba77d1a282 Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 12:38:11 -0400 Subject: [PATCH 10/22] fix: added unit tests --- .../gbfs_validator/src/gbfs_utils.py | 155 ++++++++++++ functions-python/gbfs_validator/src/main.py | 222 ++++------------- .../gbfs_validator/tests/test_gbfs_utils.py | 155 ++++++++++++ .../tests/test_gbfs_validator.py | 227 ++++++++++++++++++ 4 files changed, 586 insertions(+), 173 deletions(-) create mode 100644 functions-python/gbfs_validator/src/gbfs_utils.py create mode 100644 functions-python/gbfs_validator/tests/test_gbfs_utils.py create mode 100644 functions-python/gbfs_validator/tests/test_gbfs_validator.py diff --git a/functions-python/gbfs_validator/src/gbfs_utils.py b/functions-python/gbfs_validator/src/gbfs_utils.py new file mode 100644 index 000000000..47e72a3c5 --- /dev/null +++ b/functions-python/gbfs_validator/src/gbfs_utils.py @@ -0,0 +1,155 @@ +import json +import logging +import os +import uuid +from datetime import datetime +from typing import Dict, Any + +import requests +from google.cloud import storage + +from database_gen.sqlacodegen_models import ( + Gbfssnapshot, + Gbfsvalidationreport, + Gbfsnotice, +) +from dataset_service.main import ( + Status, +) + +VALIDATOR_URL = os.getenv( + "VALIDATOR_URL", + "https://gbfs-validator.mobilitydata.org/.netlify/functions/validator-summary", +) + + +def fetch_gbfs_files(url: str) -> Dict[str, Any]: + """Fetch the GBFS files from the autodiscovery URL.""" + response = requests.get(url) + response.raise_for_status() + return response.json() + + +def upload_gbfs_file_to_bucket( + bucket: storage.Bucket, file_url: str, destination_blob_name: str +) -> str: + """Upload a GBFS file to a Cloud Storage bucket.""" + response = requests.get(file_url) + response.raise_for_status() + blob = bucket.blob(destination_blob_name) + blob.upload_from_string(response.content) + blob.make_public() + logging.info(f"Uploaded {destination_blob_name} to {bucket.name}.") + return blob.public_url + + +def create_gbfs_json_with_bucket_paths( + bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str +) -> str: + """ + Create a new gbfs.json with paths pointing to Cloud Storage and upload it. + @param bucket: The Cloud Storage bucket. + @param gbfs_data: The GBFS data. + @param stable_id: The stable ID of the feed. + @return: The public URL of the new gbfs.json. 
+ """ + new_gbfs_data = gbfs_data.copy() + today = datetime.now().strftime("%Y-%m-%d") + + for feed_key, feed in new_gbfs_data["data"].items(): + if isinstance(feed["feeds"], dict): + for feed_language, feed_info in feed["feeds"].items(): + old_url = feed_info["url"] + blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}_{feed_language}.json" + new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) + feed_info["url"] = new_url + elif isinstance(feed["feeds"], list): + for feed_info in feed["feeds"]: + old_url = feed_info["url"] + blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}.json" + new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) + feed_info["url"] = new_url + else: + logging.warning(f"Unexpected format in feed: {feed_key}") + + # Save the new gbfs.json in the bucket + new_gbfs_data["last_updated"] = today + new_gbfs_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/gbfs.json") + new_gbfs_blob.upload_from_string( + json.dumps(new_gbfs_data), content_type="application/json" + ) + new_gbfs_blob.make_public() + return new_gbfs_blob.public_url + + +def save_trace_with_error(trace, error, trace_service): + """Helper function to save trace with an error.""" + trace.error_message = error + trace.status = Status.FAILED + trace_service.save(trace) + + +def create_snapshot(stable_id: str, feed_id: str, hosted_url: str) -> Gbfssnapshot: + """Create a new Gbfssnapshot object.""" + today = datetime.now().strftime("%Y-%m-%d") + snapshot_id = str(uuid.uuid4()) + snapshot = Gbfssnapshot( + id=snapshot_id, + stable_id=f"{stable_id}-{today}", + feed_id=feed_id, + downloaded_at=datetime.now(), + hosted_url=hosted_url, + ) + return snapshot + + +def validate_gbfs_feed( + hosted_url: str, stable_id: str, today: str, bucket: storage.Bucket +) -> Dict[str, Any]: + """Validate the GBFS feed and store the report in Cloud Storage.""" + json_payload = {"url": hosted_url} + response = requests.post(VALIDATOR_URL, json=json_payload) + response.raise_for_status() + + json_report_summary = response.json() + report_summary_blob = bucket.blob( + f"{stable_id}/{stable_id}-{today}/report_summary.json" + ) + report_summary_blob.upload_from_string( + json.dumps(json_report_summary), content_type="application/json" + ) + report_summary_blob.make_public() + + return { + "report_summary_url": report_summary_blob.public_url, + "json_report_summary": json_report_summary, + } + + +def save_snapshot_and_report( + session, snapshot: Gbfssnapshot, validation_result: Dict[str, Any] +): + """Save the snapshot and validation report to the database.""" + validation_report = Gbfsvalidationreport( + id=str(uuid.uuid4()), + gbfs_snapshot_id=snapshot.id, + validated_at=datetime.now(), + report_summary_url=validation_result["report_summary_url"], + ) + json_report_summary = validation_result["json_report_summary"] + validation_report.gbfsnotices = [ + Gbfsnotice( + keyword=error["keyword"], + message=error["message"], + schema_path=error["schemaPath"], + gbfs_file=file_summary["file"], + validation_report_id=validation_report.id, + count=error["count"], + ) + for file_summary in json_report_summary.get("filesSummary", []) + if file_summary["hasErrors"] + for error in file_summary["groupedErrors"] + ] + snapshot.gbfsvalidationreports = [validation_report] + session.add(snapshot) + session.commit() diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 48af442ae..94c9fb2e4 100644 --- a/functions-python/gbfs_validator/src/main.py 
+++ b/functions-python/gbfs_validator/src/main.py @@ -3,19 +3,14 @@ import os import uuid from datetime import datetime -from typing import List, Dict, Any +from typing import List import functions_framework -import requests from cloudevents.http import CloudEvent -from google.cloud import pubsub_v1 -from google.cloud import storage +from google.cloud import pubsub_v1, storage from sqlalchemy.orm import joinedload -from database_gen.sqlacodegen_models import Gbfsfeed, Gbfssnapshot, Gbfsvalidationreport, Gbfsnotice -from helpers.database import start_db_session -from helpers.logger import Logger -from helpers.parser import jsonify_pubsub +from database_gen.sqlacodegen_models import Gbfsfeed from dataset_service.main import ( DatasetTraceService, DatasetTrace, @@ -23,19 +18,24 @@ PipelineStage, MaxExecutionsReachedError, ) +from .gbfs_utils import ( + fetch_gbfs_files, + create_gbfs_json_with_bucket_paths, + save_trace_with_error, + create_snapshot, + validate_gbfs_feed, + save_snapshot_and_report, +) +from helpers.database import start_db_session +from helpers.logger import Logger +from helpers.parser import jsonify_pubsub logging.basicConfig(level=logging.INFO) BUCKET_NAME = os.getenv("BUCKET_NAME", "mobilitydata-gbfs-snapshots-dev") -VALIDATOR_URL = os.getenv("VALIDATOR_URL", - "https://gbfs-validator.mobilitydata.org/.netlify/functions/validator-summary") def fetch_all_gbfs_feeds() -> List[Gbfsfeed]: - """ - Fetch all GBFS feeds from the database. - @return: A list of all GBFS feeds. - """ session = None try: session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) @@ -51,104 +51,25 @@ def fetch_all_gbfs_feeds() -> List[Gbfsfeed]: session.close() -def fetch_gbfs_files(url: str) -> Dict[str, Any]: - """Fetch the GBFS files from the autodiscovery URL.""" - response = requests.get(url) - response.raise_for_status() - return response.json() - - -def upload_gbfs_file_to_bucket( - bucket: storage.Bucket, file_url: str, destination_blob_name: str -) -> str: - """Upload a GBFS file to a Cloud Storage bucket.""" - response = requests.get(file_url) - response.raise_for_status() - blob = bucket.blob(destination_blob_name) - blob.upload_from_string(response.content) - blob.make_public() - logging.info(f"Uploaded {destination_blob_name} to {bucket.name}.") - return blob.public_url - - -def create_gbfs_json_with_bucket_paths( - bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str -) -> str: - """ - Create a new gbfs.json with paths pointing to Cloud Storage and upload it. - @param bucket: The Cloud Storage bucket. - @param gbfs_data: The GBFS data. - @param stable_id: The stable ID of the feed. - @return: The public URL of the new gbfs.json. 
- """ - new_gbfs_data = gbfs_data.copy() - today = datetime.now().strftime("%Y-%m-%d") - - for feed_key, feed in new_gbfs_data["data"].items(): - if isinstance(feed["feeds"], dict): - # Case when 'feeds' is a dictionary keyed by language - for feed_language, feed_info in feed["feeds"].items(): - old_url = feed_info["url"] - blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}_{feed_language}.json" - new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) - feed_info["url"] = new_url - elif isinstance(feed["feeds"], list): - # Case when 'feeds' is a list without language codes - for feed_info in feed["feeds"]: - old_url = feed_info["url"] - blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}.json" - new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) - feed_info["url"] = new_url - else: - logging.warning(f"Unexpected format in feed: {feed_key}") - - # Save the new gbfs.json in the bucket - new_gbfs_data["last_updated"] = today - new_gbfs_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/gbfs.json") - new_gbfs_blob.upload_from_string( - json.dumps(new_gbfs_data), content_type="application/json" - ) - new_gbfs_blob.make_public() - return new_gbfs_blob.public_url - - +@functions_framework.cloud_event @functions_framework.cloud_event def gbfs_validator_pubsub(cloud_event: CloudEvent): - """ - Main function triggered by a Pub/Sub message to validate a GBFS feed. - @param cloud_event: The CloudEvent containing the Pub/Sub message. - """ Logger.init_logger() data = cloud_event.data logging.info(f"Function triggered with Pub/Sub event data: {data}") - try: - maximum_executions = int(os.getenv("MAXIMUM_EXECUTIONS", 1)) - except ValueError: - maximum_executions = 1 - logging.info(f"Maximum allowed executions: {maximum_executions}") message_json = jsonify_pubsub(data) if message_json is None: return "Invalid Pub/Sub message data." - logging.info(f"Parsed message data: {message_json}") + try: - execution_id, stable_id, url, latest_version, feed_id = ( - message_json["execution_id"], - message_json["stable_id"], - message_json["url"], - message_json["latest_version"], - message_json["feed_id"], - ) - except KeyError: - return ( - "Invalid Pub/Sub message data. " - "Missing required field(s) execution_id, stable_id, url, or latest_version." - ) - logging.info(f"Execution ID: {execution_id}") - logging.info(f"Stable ID: {stable_id}") - logging.info(f"URL: {url}") - logging.info(f"Latest version: {latest_version}") - logging.info(f"Feed ID: {feed_id}") + execution_id = message_json["execution_id"] + stable_id = message_json["stable_id"] + url = message_json["url"] + feed_id = message_json["feed_id"] + except KeyError as e: + logging.error(f"Missing required field: {e}") + return f"Invalid Pub/Sub message data. Missing {e}." trace_service = DatasetTraceService() trace_id = str(uuid.uuid4()) @@ -160,92 +81,47 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): timestamp=datetime.now(), pipeline_stage=PipelineStage.GBFS_VALIDATION, ) + try: - trace_service.validate_and_save(trace, maximum_executions) - except ValueError as e: - logging.error(f"Error saving trace: {e}") - return "Error saving trace." - except MaxExecutionsReachedError: - logging.error(f"Maximum executions reached for {stable_id}.") - return "Maximum executions reached." 
+ trace_service.validate_and_save(trace, int(os.getenv("MAXIMUM_EXECUTIONS", 1))) + except (ValueError, MaxExecutionsReachedError) as e: + error_message = str(e) + logging.error(error_message) + save_trace_with_error(trace, error_message, trace_service) + return error_message - # Step 2: Store all gbfs files and generate new gbfs.json + session = None try: storage_client = storage.Client() bucket = storage_client.bucket(BUCKET_NAME) gbfs_data = fetch_gbfs_files(url) - except Exception as e: - logging.error(f"Error fetching data from autodiscovery URL: {e}") - return "Error fetching data from autodiscovery URL." - try: hosted_url = create_gbfs_json_with_bucket_paths(bucket, gbfs_data, stable_id) except Exception as e: - logging.error(f"Error generating new gbfs.json: {e}") - return "Error generating new gbfs.json." - - # Step 3: Store gbfs snapshot information in the database - today = datetime.now().strftime("%Y-%m-%d") - snapshot_id = str(uuid.uuid4()) - snapshot = Gbfssnapshot( - id=snapshot_id, - stable_id=f"{stable_id}-{today}", - feed_id=feed_id, - downloaded_at=datetime.now(), - hosted_url=hosted_url, - ) + error_message = f"Error processing GBFS files: {e}" + logging.error(error_message) + save_trace_with_error(trace, error_message, trace_service) + return error_message try: + today = datetime.now().strftime("%Y-%m-%d") + snapshot = create_snapshot(stable_id, feed_id, hosted_url) session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) - except Exception as e: - logging.error(f"Error storing GBFS snapshot in the database: {e}") - return "Error storing GBFS snapshot in the database." - # TODO: 4. Validate the feed's version otherwise add a version to the feed - # TODO: 5. Validate the feed (summary) and store the results in the database - try: - json_payload = {"url": hosted_url} - response = requests.post(VALIDATOR_URL, json=json_payload) - response.raise_for_status() - logging.info(f"GBFS feed {hosted_url} validated successfully.") - json_report_summary = response.json() - logging.info(f"Validation summary: {json_report_summary}") - # Store in GCP - report_summary_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/report_summary.json") - report_summary_blob.upload_from_string( - json.dumps(json_report_summary), content_type="application/json" - ) - report_summary_blob.make_public() - # Store in database - validation_report = Gbfsvalidationreport( - id=f"{uuid.uuid4()}", - gbfs_snapshot_id=snapshot_id, - validated_at=datetime.now(), - report_summary_url=report_summary_blob.public_url, - ) - validation_report.gbfsnotices = [] - if 'filesSummary' in json_report_summary: - for file_summary in json_report_summary['filesSummary']: - logging.info(f"File summary: {file_summary}") - if file_summary['hasErrors']: - for error in file_summary['groupedErrors']: - logging.error(f"Error: {error}") - notice = Gbfsnotice( - keyword=error['keyword'], - message=error['message'], - schema_path=error['schemaPath'], - gbfs_file=file_summary['file'], - validation_report_id=validation_report.id, - count=error['count'], - ) - validation_report.gbfsnotices.append(notice) - snapshot.gbfsvalidationreports = [validation_report] - session.add(snapshot) + validation_results = validate_gbfs_feed(hosted_url, stable_id, today, bucket) + save_snapshot_and_report(session, snapshot, validation_results) except Exception as e: - logging.error(f"Error validating GBFS feed: {e}") - return "Error validating GBFS feed." 
+ error_message = f"Error validating GBFS feed: {e}" + logging.error(error_message) + save_trace_with_error(trace, error_message, trace_service) + return error_message + finally: + if session: + session.close() - return "GBFS files processed and stored successfully.", 200 + trace.status = Status.SUCCESS + trace_service.save(trace) + return "GBFS files processed and stored successfully." @functions_framework.http diff --git a/functions-python/gbfs_validator/tests/test_gbfs_utils.py b/functions-python/gbfs_validator/tests/test_gbfs_utils.py new file mode 100644 index 000000000..1284912e4 --- /dev/null +++ b/functions-python/gbfs_validator/tests/test_gbfs_utils.py @@ -0,0 +1,155 @@ +import unittest +import uuid +from datetime import datetime +from unittest.mock import patch, MagicMock + +from gbfs_validator.src.gbfs_utils import ( + fetch_gbfs_files, + upload_gbfs_file_to_bucket, + create_gbfs_json_with_bucket_paths, + save_trace_with_error, + create_snapshot, + validate_gbfs_feed, + save_snapshot_and_report, + VALIDATOR_URL, +) +from dataset_service.main import Status + + +class TestGbfsUtils(unittest.TestCase): + @patch("requests.get") + def test_fetch_gbfs_files(self, mock_get): + mock_response = MagicMock() + mock_response.json.return_value = {"key": "value"} + mock_response.status_code = 200 + mock_get.return_value = mock_response + + result = fetch_gbfs_files("http://example.com") + self.assertEqual(result, {"key": "value"}) + mock_get.assert_called_once_with("http://example.com") + + @patch("requests.get") + def test_upload_gbfs_file_to_bucket(self, mock_get): + mock_response = MagicMock() + mock_response.content = b"file_content" + mock_response.status_code = 200 + mock_get.return_value = mock_response + + mock_blob = MagicMock() + mock_blob.public_url = "http://public-url.com" + mock_bucket = MagicMock() + mock_bucket.blob.return_value = mock_blob + + result = upload_gbfs_file_to_bucket( + mock_bucket, "http://file-url.com", "destination_blob" + ) + self.assertEqual(result, "http://public-url.com") + mock_get.assert_called_once_with("http://file-url.com") + mock_blob.upload_from_string.assert_called_once_with(b"file_content") + mock_blob.make_public.assert_called_once() + + @patch("gbfs_validator.src.gbfs_utils.upload_gbfs_file_to_bucket") + def test_create_gbfs_json_with_bucket_paths(self, mock_upload): + mock_upload.return_value = "http://new-url.com" + + mock_bucket = MagicMock() + gbfs_data = { + "data": {"en": {"feeds": [{"url": "http://old-url.com", "name": "feed"}]}} + } + stable_id = "test_stable_id" + mock_bucket.blob.return_value.public_url = "http://new-url.com" + + result = create_gbfs_json_with_bucket_paths(mock_bucket, gbfs_data, stable_id) + self.assertEqual(result, "http://new-url.com") + + def test_save_trace_with_error(self): + mock_trace = MagicMock() + mock_trace_service = MagicMock() + + save_trace_with_error(mock_trace, "An error occurred", mock_trace_service) + + mock_trace_service.save.assert_called_once_with(mock_trace) + self.assertEqual(mock_trace.error_message, "An error occurred") + self.assertEqual(mock_trace.status, Status.FAILED) + + def test_create_snapshot(self): + stable_id = "test_stable_id" + feed_id = "test_feed_id" + hosted_url = "http://hosted-url.com" + + snapshot = create_snapshot(stable_id, feed_id, hosted_url) + + self.assertEqual( + snapshot.stable_id, f"{stable_id}-{datetime.now().strftime('%Y-%m-%d')}" + ) + self.assertEqual(snapshot.feed_id, feed_id) + self.assertEqual(snapshot.hosted_url, hosted_url) + self.assertTrue( + 
uuid.UUID(snapshot.id) + ) # Validates that `snapshot.id` is a valid UUID + + @patch("requests.post") + @patch("google.cloud.storage.Bucket.blob") + def test_validate_gbfs_feed(self, mock_blob, mock_post): + mock_response = MagicMock() + mock_response.json.return_value = {"summary": "validation report"} + mock_response.status_code = 200 + mock_post.return_value = mock_response + + mock_blob_obj = MagicMock() + mock_blob_obj.public_url = "http://public-url.com" + mock_blob.return_value = mock_blob_obj + + hosted_url = "http://hosted-url.com" + stable_id = "test_stable_id" + today = datetime.now().strftime("%Y-%m-%d") + mock_bucket = MagicMock() + mock_bucket.blob.return_value = mock_blob_obj + + result = validate_gbfs_feed(hosted_url, stable_id, today, mock_bucket) + + self.assertEqual( + result["json_report_summary"], {"summary": "validation report"} + ) + self.assertEqual(result["report_summary_url"], mock_blob_obj.public_url) + mock_post.assert_called_once_with(VALIDATOR_URL, json={"url": hosted_url}) + mock_blob_obj.upload_from_string.assert_called_once() + + @patch("gbfs_validator.src.gbfs_utils.Gbfsvalidationreport") + @patch("gbfs_validator.src.gbfs_utils.Gbfsnotice") + def test_save_snapshot_and_report(self, mock_gbfsnotice, mock_gbfsvalidationreport): + mock_session = MagicMock() + mock_snapshot = MagicMock() + validation_result = { + "report_summary_url": "http://report-summary-url.com", + "json_report_summary": { + "filesSummary": [ + { + "file": "file_name", + "hasErrors": True, + "groupedErrors": [ + { + "keyword": "error_keyword", + "message": "error_message", + "schemaPath": "schema_path", + "count": 1, + } + ], + } + ] + }, + } + + save_snapshot_and_report(mock_session, mock_snapshot, validation_result) + + mock_session.add.assert_called_once_with(mock_snapshot) + mock_session.commit.assert_called_once() + + mock_gbfsnotice.assert_called_once_with( + keyword="error_keyword", + message="error_message", + schema_path="schema_path", + gbfs_file="file_name", + validation_report_id=mock_gbfsvalidationreport().id, + count=1, + ) diff --git a/functions-python/gbfs_validator/tests/test_gbfs_validator.py b/functions-python/gbfs_validator/tests/test_gbfs_validator.py new file mode 100644 index 000000000..bfbbe68f5 --- /dev/null +++ b/functions-python/gbfs_validator/tests/test_gbfs_validator.py @@ -0,0 +1,227 @@ +import base64 +import copy +import json +import os +import unittest +import uuid +from unittest.mock import patch, MagicMock + +from cloudevents.http import CloudEvent + +from gbfs_validator.src.main import ( + gbfs_validator_pubsub, + gbfs_validator_batch, + fetch_all_gbfs_feeds, +) +from test_utils.database_utils import default_db_url + + +class TestMainFunctions(unittest.TestCase): + @patch.dict( + os.environ, + { + "FEEDS_DATABASE_URL": default_db_url, + "BUCKET_NAME": "mock-bucket", + "MAXIMUM_EXECUTIONS": "1", + "PUBSUB_TOPIC_NAME": "mock-topic", + "PROJECT_ID": "mock-project", + "VALIDATOR_URL": "https://mock-validator-url.com", + }, + ) + @patch("gbfs_validator.src.main.start_db_session") + @patch("gbfs_validator.src.main.DatasetTraceService") + @patch("gbfs_validator.src.main.fetch_gbfs_files") + @patch("gbfs_validator.src.main.create_gbfs_json_with_bucket_paths") + @patch("gbfs_validator.src.main.create_snapshot") + @patch("gbfs_validator.src.main.validate_gbfs_feed") + @patch("gbfs_validator.src.main.save_snapshot_and_report") + @patch("gbfs_validator.src.main.Logger") + def test_gbfs_validator_pubsub( + self, + _, # mock_logger + mock_save_snapshot_and_report, + 
mock_validate_gbfs_feed, + mock_create_snapshot, + mock_create_gbfs_json, + mock_fetch_gbfs_files, + mock_dataset_trace_service, + mock_start_db_session, + ): + # Prepare mocks + mock_session = MagicMock() + mock_start_db_session.return_value = mock_session + + mock_trace_service = MagicMock() + mock_dataset_trace_service.return_value = mock_trace_service + + mock_create_snapshot.return_value = MagicMock() + + mock_validate_gbfs_feed.return_value = { + "report_summary_url": "http://report-summary-url.com", + "json_report_summary": {"summary": "validation report"}, + } + + # Prepare a mock CloudEvent + data = { + "execution_id": str(uuid.uuid4()), + "stable_id": "mock-stable-id", + "url": "http://mock-url.com", + "feed_id": str(uuid.uuid4()), + } + base64_data = base64.b64encode(json.dumps(data).encode("utf-8")) + cloud_event = CloudEvent( + attributes={ + "type": "com.example.someevent", + "source": "https://example.com/event-source", + }, + data={"message": {"data": base64_data}}, + ) + + # Call the function + result = gbfs_validator_pubsub(cloud_event) + self.assertEqual(result, "GBFS files processed and stored successfully.") + + mock_fetch_gbfs_files.assert_called_once_with("http://mock-url.com") + mock_create_gbfs_json.assert_called_once() + mock_create_snapshot.assert_called_once() + mock_validate_gbfs_feed.assert_called_once() + mock_save_snapshot_and_report.assert_called_once() + + @patch.dict( + os.environ, + { + "PUBSUB_TOPIC_NAME": "mock-topic", + }, + ) + @patch("gbfs_validator.src.main.start_db_session") + @patch("gbfs_validator.src.main.pubsub_v1.PublisherClient") + @patch("gbfs_validator.src.main.fetch_all_gbfs_feeds") + @patch("gbfs_validator.src.main.Logger") + def test_gbfs_validator_batch( + self, _, mock_fetch_all_gbfs_feeds, mock_publisher_client, mock_start_db_session + ): + # Prepare mocks + mock_session = MagicMock() + mock_start_db_session.return_value = mock_session + + mock_publisher = MagicMock() + mock_publisher_client.return_value = mock_publisher + + mock_feed = MagicMock() + mock_feed.stable_id = "mock-stable-id" + mock_feed.id = str(uuid.uuid4()) + mock_feed.auto_discovery_url = "http://mock-url.com" + mock_feed.gbfsversions = [MagicMock(version="1.0")] + mock_feed_2 = copy.deepcopy(mock_feed) + mock_feed_2.gbfsversions = [] + mock_fetch_all_gbfs_feeds.return_value = [mock_feed, mock_feed_2] + + # Call the function + result = gbfs_validator_batch(None) + self.assertEqual(result[1], 200) + + mock_fetch_all_gbfs_feeds.assert_called_once() + self.assertEqual(mock_publisher.publish.call_count, 2) + + @patch("gbfs_validator.src.main.Logger") + def test_gbfs_validator_batch_missing_topic(self, _): # mock_logger + # Call the function + result = gbfs_validator_batch(None) + self.assertEqual(result[1], 500) + + @patch("gbfs_validator.src.main.start_db_session") + @patch("gbfs_validator.src.main.Logger") + def test_fetch_all_gbfs_feeds(self, _, mock_start_db_session): + mock_session = MagicMock() + mock_start_db_session.return_value = mock_session + mock_feed = MagicMock() + mock_session.query.return_value.options.return_value.all.return_value = [ + mock_feed + ] + + result = fetch_all_gbfs_feeds() + self.assertEqual(result, [mock_feed]) + + mock_start_db_session.assert_called_once() + mock_session.close.assert_called_once() + + @patch("gbfs_validator.src.main.start_db_session") + @patch("gbfs_validator.src.main.Logger") + def test_fetch_all_gbfs_feeds_exception(self, _, mock_start_db_session): + mock_session = MagicMock() + mock_start_db_session.return_value = 
mock_session + + # Simulate an exception when querying the database + mock_session.query.side_effect = Exception("Database error") + + with self.assertRaises(Exception) as context: + fetch_all_gbfs_feeds() + + self.assertTrue("Database error" in str(context.exception)) + + mock_start_db_session.assert_called_once() + mock_session.close.assert_called_once() + + @patch("gbfs_validator.src.main.start_db_session") + def test_fetch_all_gbfs_feeds_none_session(self, mock_start_db_session): + mock_start_db_session.return_value = None + + with self.assertRaises(Exception) as context: + fetch_all_gbfs_feeds() + + self.assertTrue("NoneType" in str(context.exception)) + + mock_start_db_session.assert_called_once() + + @patch.dict( + os.environ, + { + "PUBSUB_TOPIC_NAME": "mock-topic", + }, + ) + @patch("gbfs_validator.src.main.fetch_all_gbfs_feeds") + @patch("gbfs_validator.src.main.Logger") + def test_gbfs_validator_batch_fetch_exception(self, _, mock_fetch_all_gbfs_feeds): + # Prepare mocks + mock_fetch_all_gbfs_feeds.side_effect = Exception("Database error") + + # Call the function + result = gbfs_validator_batch(None) + self.assertEqual(result[1], 500) + + mock_fetch_all_gbfs_feeds.assert_called_once() + + @patch.dict( + os.environ, + { + "PUBSUB_TOPIC_NAME": "mock-topic", + }, + ) + @patch("gbfs_validator.src.main.start_db_session") + @patch("gbfs_validator.src.main.pubsub_v1.PublisherClient") + @patch("gbfs_validator.src.main.fetch_all_gbfs_feeds") + @patch("gbfs_validator.src.main.Logger") + def test_gbfs_validator_batch_publish_exception( + self, _, mock_fetch_all_gbfs_feeds, mock_publisher_client, mock_start_db_session + ): + # Prepare mocks + mock_session = MagicMock() + mock_start_db_session.return_value = mock_session + + mock_publisher_client.side_effect = Exception("Pub/Sub error") + + mock_feed = MagicMock() + mock_feed.stable_id = "mock-stable-id" + mock_feed.id = str(uuid.uuid4()) + mock_feed.auto_discovery_url = "http://mock-url.com" + mock_feed.gbfsversions = [MagicMock(version="1.0")] + mock_feed_2 = copy.deepcopy(mock_feed) + mock_feed_2.gbfsversions = [] + mock_fetch_all_gbfs_feeds.return_value = [mock_feed, mock_feed_2] + + # Call the function + result = gbfs_validator_batch(None) + self.assertEqual(result[1], 500) + + mock_fetch_all_gbfs_feeds.assert_called_once() + mock_publisher_client.assert_called_once() From 609aa5ff29f94f23ac535ac3dbd1ba3e7493a686 Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 12:40:45 -0400 Subject: [PATCH 11/22] fix: added country name to gbfs populate script --- api/src/scripts/populate_db_gbfs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/api/src/scripts/populate_db_gbfs.py b/api/src/scripts/populate_db_gbfs.py index 4bb7796eb..d0f22efe3 100644 --- a/api/src/scripts/populate_db_gbfs.py +++ b/api/src/scripts/populate_db_gbfs.py @@ -2,6 +2,7 @@ import pandas as pd import pytz +import pycountry from database.database import generate_unique_id, configure_polymorphic_mappers from database_gen.sqlacodegen_models import Gbfsfeed, Location, Gbfsversion, Externalid @@ -95,6 +96,7 @@ def populate_db(self): location = self.db.session.get(Location, location_id) or Location( id=location_id, country_code=country_code, + country=pycountry.countries.get(alpha_2=country_code).name if country_code else None, municipality=municipality, ) gbfs_feed.locations.clear() From f4c97e4119bb06a235f30b9d1f6b56c6066b06ed Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 13:01:47 -0400 Subject: [PATCH 12/22] fix: added log filter and tests --- 
.../tests/test_dataset_service.py | 21 +++++++++++++++++++ functions-python/gbfs_validator/src/main.py | 5 ++++- functions-python/helpers/logger.py | 14 +++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/functions-python/dataset_service/tests/test_dataset_service.py b/functions-python/dataset_service/tests/test_dataset_service.py index bce63a02a..bf0774e48 100644 --- a/functions-python/dataset_service/tests/test_dataset_service.py +++ b/functions-python/dataset_service/tests/test_dataset_service.py @@ -21,6 +21,27 @@ def test_save_dataset_trace(self, mock_datastore_client): service.save(dataset_trace) mock_datastore_client.put.assert_called_once() + @patch("google.cloud.datastore.Client") + def test_validate_and_save_exception(self, mock_datastore_client): + service = DatasetTraceService(mock_datastore_client) + dataset_trace = DatasetTrace( + stable_id="123", status=Status.PUBLISHED, timestamp=datetime.now() + ) + with self.assertRaises(ValueError): + service.validate_and_save(dataset_trace, 1) + + @patch("google.cloud.datastore.Client") + def test_validate_and_save(self, mock_datastore_client): + service = DatasetTraceService(mock_datastore_client) + dataset_trace = DatasetTrace( + stable_id="123", + execution_id="123", + status=Status.PUBLISHED, + timestamp=datetime.now(), + ) + service.validate_and_save(dataset_trace, 1) + mock_datastore_client.put.assert_called_once() + @patch("google.cloud.datastore.Client") def test_get_dataset_trace_by_id(self, mock_datastore_client): mock_datastore_client.get.return_value = { diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 94c9fb2e4..4c99904b4 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -27,7 +27,7 @@ save_snapshot_and_report, ) from helpers.database import start_db_session -from helpers.logger import Logger +from helpers.logger import Logger, StableIdFilter from helpers.parser import jsonify_pubsub logging.basicConfig(level=logging.INFO) @@ -71,6 +71,9 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): logging.error(f"Missing required field: {e}") return f"Invalid Pub/Sub message data. Missing {e}." 
+ stable_id_filter = StableIdFilter(stable_id) + logging.getLogger().addFilter(stable_id_filter) + trace_service = DatasetTraceService() trace_id = str(uuid.uuid4()) trace = DatasetTrace( diff --git a/functions-python/helpers/logger.py b/functions-python/helpers/logger.py index cc9a66433..30725d222 100644 --- a/functions-python/helpers/logger.py +++ b/functions-python/helpers/logger.py @@ -16,6 +16,20 @@ import google.cloud.logging from google.cloud.logging_v2 import Client +import logging + + +class StableIdFilter(logging.Filter): + """Add a stable_id to the log record""" + + def __init__(self, stable_id=None): + super().__init__() + self.stable_id = stable_id + + def filter(self, record): + if self.stable_id: + record.msg = f"[{self.stable_id}] {record.msg}" + return True class Logger: From 0e0c4ed42335837da11032853e6dd379ec8bf5f8 Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 13:49:08 -0400 Subject: [PATCH 13/22] fix: restore gh workflows --- .github/workflows/api-deployer.yml | 304 ++++++++++---------- .github/workflows/build-test.yml | 40 +-- functions-python/gbfs_validator/README.md | 59 ++++ functions-python/gbfs_validator/src/main.py | 106 +++---- infra/functions-python/main.tf | 30 ++ infra/functions-python/vars.tf | 6 + 6 files changed, 324 insertions(+), 221 deletions(-) create mode 100644 functions-python/gbfs_validator/README.md diff --git a/.github/workflows/api-deployer.yml b/.github/workflows/api-deployer.yml index 89f2b4c8a..5d1a99900 100644 --- a/.github/workflows/api-deployer.yml +++ b/.github/workflows/api-deployer.yml @@ -67,158 +67,158 @@ jobs: api-build-test: uses: ./.github/workflows/build-test.yml name: Build & Test -# -# create-artifact-repo: -# runs-on: ubuntu-latest -# permissions: write-all -# steps: -# - name: Checkout code -# uses: actions/checkout@v4 -# -# - uses: actions/setup-python@v4 -# with: -# python-version: ${{ env.python_version }} -# -# - name: Authenticate to Google Cloud -# id: gcloud_auth -# uses: google-github-actions/auth@v2 -# with: -# credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} -# -# - name: GCloud Setup -# uses: google-github-actions/setup-gcloud@v2 -# -# - name: Set Variables -# run: | -# echo "Setting variables" -# echo "BUCKET_NAME=${{ inputs.BUCKET_NAME }}" >> $GITHUB_ENV -# echo "OBJECT_PREFIX=${{ inputs.OBJECT_PREFIX }}-artifact" >> $GITHUB_ENV -# echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV -# echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV -# echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV -# -# - name: Populate Variables -# run: | -# scripts/replace-variables.sh -in_file infra/backend.conf.rename_me -out_file infra/artifact-registry/backend.conf -variables BUCKET_NAME,OBJECT_PREFIX -# scripts/replace-variables.sh -in_file infra/artifact-registry/vars.tfvars.rename_me -out_file infra/artifact-registry/vars.tfvars -variables PROJECT_ID,REGION,ENVIRONMENT,ARTIFACT_REPO_NAME -# -# - uses: hashicorp/setup-terraform@v2 -# with: -# terraform_version: 1.5.3 -# terraform_wrapper: false -# -# - name: Terraform Init -# run: | -# cd infra/artifact-registry -# terraform init -backend-config=backend.conf -# -# - name: Terraform Plan -# id: plan -# run: | -# cd infra/artifact-registry -# terraform plan -var-file=vars.tfvars -out=tf.plan -# terraform show -no-color tf.plan > terraform-plan.txt -# env: -# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -# -# - name: Terraform Apply -# if: ${{ inputs.TF_APPLY }} -# run: | -# cd infra/artifact-registry -# terraform apply -auto-approve tf.plan -# env: 
-# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -# -# - name: Persist TF plan -# uses: actions/upload-artifact@v4 -# with: -# name: terraform-artifact-plan.txt -# path: infra/artifact-registry/terraform-plan.txt -# overwrite: true -# -# docker-build-publish: -# # Add docker healthy test -# runs-on: ubuntu-latest -# permissions: write-all -# needs: [create-artifact-repo, api-build-test] -# steps: -# - name: Checkout code -# uses: actions/checkout@v4 -# -# - name: Extract commit hash and version from git -# run: ./scripts/extract-hash-and-version.sh -# -# - name: Upload version_info to workflow artefacts -# uses: actions/upload-artifact@v4 -# with: -# name: version_info -# path: api/src/version_info -# compression-level: 0 -# -# - name: Authenticate to Google Cloud -# id: gcloud_auth -# uses: google-github-actions/auth@v2 -# with: -# credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} -# -# - name: Login to Google Artifact Registry -# uses: docker/login-action@v2 -# with: -# registry: ${{inputs.REGION}}-docker.pkg.dev -# username: _json_key_base64 -# password: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} -# -# - name: Set up JDK ${{ env.java_version }} -# uses: actions/setup-java@v4 -# with: -# java-version: ${{ env.java_version }} -# distribution: 'temurin' -# -# - uses: actions/setup-python@v4 -# with: -# python-version: ${{ env.python_version }} -# -# - name: Update .env file -# run: | -# echo "POSTGRES_USER=${{ env.local_postgres_user }}" > config/.env.local -# echo "PGUSER=${{ env.local_postgres_user }}" >> config/.env.local -# echo "POSTGRES_PASSWORD=${{ env.local_postgres_pwd }}" >> config/.env.local -# echo "POSTGRES_DB=${{ env.local_postgres_db }}" >> config/.env.local -# echo "POSTGRES_PORT=${{ env.local_postgres_port }}" >> config/.env.local -# echo "POSTGRES_HOST=localhost" >> config/.env.local -# echo "ENV=dev" >> config/.env.local -# -# # db models were generated and uploaded in api-build-test job above. -# - uses: actions/download-artifact@v4 -# with: -# name: database_gen -# path: api/src/database_gen/ -# -# - name: Copy to db models to functions directory -# run: | -# cp -R api/src/database_gen/ functions-python/database_gen -# -# # api schema was generated and uploaded in api-build-test job above. -# - uses: actions/download-artifact@v4 -# with: -# name: feeds_gen -# path: api/src/feeds_gen/ -# -# - name: Set Variables -# id: set_variables -# run: | -# echo "Setting variables" -# echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV -# echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV -# echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV -# echo "FEED_API_IMAGE_VERSION=${{ inputs.FEED_API_IMAGE_VERSION }}" >> $GITHUB_ENV -# -# - name: Build & Publish Docker Image -# run: | -# # We want to generate the image even if it's the same commit that has been tagged. 
So use the version -# # (coming from the tag) in the docker image tag (If the docket tag does not change it's won't be uploaded) -# DOCKER_IMAGE_VERSION=$EXTRACTED_VERSION.$FEED_API_IMAGE_VERSION -# scripts/docker-build-push.sh -project_id $PROJECT_ID -repo_name feeds-$ENVIRONMENT -service feed-api -region $REGION -version $DOCKER_IMAGE_VERSION + + create-artifact-repo: + runs-on: ubuntu-latest + permissions: write-all + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.python_version }} + + - name: Authenticate to Google Cloud + id: gcloud_auth + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} + + - name: GCloud Setup + uses: google-github-actions/setup-gcloud@v2 + + - name: Set Variables + run: | + echo "Setting variables" + echo "BUCKET_NAME=${{ inputs.BUCKET_NAME }}" >> $GITHUB_ENV + echo "OBJECT_PREFIX=${{ inputs.OBJECT_PREFIX }}-artifact" >> $GITHUB_ENV + echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV + echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV + echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV + + - name: Populate Variables + run: | + scripts/replace-variables.sh -in_file infra/backend.conf.rename_me -out_file infra/artifact-registry/backend.conf -variables BUCKET_NAME,OBJECT_PREFIX + scripts/replace-variables.sh -in_file infra/artifact-registry/vars.tfvars.rename_me -out_file infra/artifact-registry/vars.tfvars -variables PROJECT_ID,REGION,ENVIRONMENT,ARTIFACT_REPO_NAME + + - uses: hashicorp/setup-terraform@v2 + with: + terraform_version: 1.5.3 + terraform_wrapper: false + + - name: Terraform Init + run: | + cd infra/artifact-registry + terraform init -backend-config=backend.conf + + - name: Terraform Plan + id: plan + run: | + cd infra/artifact-registry + terraform plan -var-file=vars.tfvars -out=tf.plan + terraform show -no-color tf.plan > terraform-plan.txt + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Terraform Apply + if: ${{ inputs.TF_APPLY }} + run: | + cd infra/artifact-registry + terraform apply -auto-approve tf.plan + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Persist TF plan + uses: actions/upload-artifact@v4 + with: + name: terraform-artifact-plan.txt + path: infra/artifact-registry/terraform-plan.txt + overwrite: true + + docker-build-publish: + # Add docker healthy test + runs-on: ubuntu-latest + permissions: write-all + needs: [create-artifact-repo, api-build-test] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Extract commit hash and version from git + run: ./scripts/extract-hash-and-version.sh + + - name: Upload version_info to workflow artefacts + uses: actions/upload-artifact@v4 + with: + name: version_info + path: api/src/version_info + compression-level: 0 + + - name: Authenticate to Google Cloud + id: gcloud_auth + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} + + - name: Login to Google Artifact Registry + uses: docker/login-action@v2 + with: + registry: ${{inputs.REGION}}-docker.pkg.dev + username: _json_key_base64 + password: ${{ secrets.GCP_MOBILITY_FEEDS_SA_KEY }} + + - name: Set up JDK ${{ env.java_version }} + uses: actions/setup-java@v4 + with: + java-version: ${{ env.java_version }} + distribution: 'temurin' + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.python_version }} + + - name: Update .env file + run: | + echo "POSTGRES_USER=${{ 
env.local_postgres_user }}" > config/.env.local + echo "PGUSER=${{ env.local_postgres_user }}" >> config/.env.local + echo "POSTGRES_PASSWORD=${{ env.local_postgres_pwd }}" >> config/.env.local + echo "POSTGRES_DB=${{ env.local_postgres_db }}" >> config/.env.local + echo "POSTGRES_PORT=${{ env.local_postgres_port }}" >> config/.env.local + echo "POSTGRES_HOST=localhost" >> config/.env.local + echo "ENV=dev" >> config/.env.local + + # db models were generated and uploaded in api-build-test job above. + - uses: actions/download-artifact@v4 + with: + name: database_gen + path: api/src/database_gen/ + + - name: Copy to db models to functions directory + run: | + cp -R api/src/database_gen/ functions-python/database_gen + + # api schema was generated and uploaded in api-build-test job above. + - uses: actions/download-artifact@v4 + with: + name: feeds_gen + path: api/src/feeds_gen/ + + - name: Set Variables + id: set_variables + run: | + echo "Setting variables" + echo "PROJECT_ID=${{ inputs.PROJECT_ID }}" >> $GITHUB_ENV + echo "REGION=${{ inputs.REGION }}" >> $GITHUB_ENV + echo "ENVIRONMENT=${{ inputs.ENVIRONMENT }}" >> $GITHUB_ENV + echo "FEED_API_IMAGE_VERSION=${{ inputs.FEED_API_IMAGE_VERSION }}" >> $GITHUB_ENV + + - name: Build & Publish Docker Image + run: | + # We want to generate the image even if it's the same commit that has been tagged. So use the version + # (coming from the tag) in the docker image tag (If the docket tag does not change it's won't be uploaded) + DOCKER_IMAGE_VERSION=$EXTRACTED_VERSION.$FEED_API_IMAGE_VERSION + scripts/docker-build-push.sh -project_id $PROJECT_ID -repo_name feeds-$ENVIRONMENT -service feed-api -region $REGION -version $DOCKER_IMAGE_VERSION terraform-deploy: runs-on: ubuntu-latest diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 220685a29..4fb31756f 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -40,10 +40,10 @@ jobs: docker compose --env-file ./config/.env.local up -d postgres postgres-test working-directory: ${{ github.workspace }} -# - name: Run lint checks -# shell: bash -# run: | -# scripts/lint-tests.sh + - name: Run lint checks + shell: bash + run: | + scripts/lint-tests.sh - name: Install Liquibase run: | @@ -80,22 +80,22 @@ jobs: scripts/setup-openapi-generator.sh scripts/api-gen.sh -# - name: Unit tests - API -# shell: bash -# run: | -# scripts/api-tests.sh --folder api --html_report -# -# - name: Unit tests - Python Functions -# shell: bash -# run: | -# scripts/api-tests.sh --folder functions-python --html_report -# -# - name: Upload coverage report -# uses: actions/upload-artifact@v4 -# with: -# name: coverage_report -# path: scripts/coverage_reports/ -# overwrite: true + - name: Unit tests - API + shell: bash + run: | + scripts/api-tests.sh --folder api --html_report + + - name: Unit tests - Python Functions + shell: bash + run: | + scripts/api-tests.sh --folder functions-python --html_report + + - name: Upload coverage report + uses: actions/upload-artifact@v4 + with: + name: coverage_report + path: scripts/coverage_reports/ + overwrite: true - name: Upload DB models uses: actions/upload-artifact@v4 diff --git a/functions-python/gbfs_validator/README.md b/functions-python/gbfs_validator/README.md new file mode 100644 index 000000000..bc75029bc --- /dev/null +++ b/functions-python/gbfs_validator/README.md @@ -0,0 +1,59 @@ +# GBFS Validator Pipeline + +This pipeline consists of two functions that work together to validate GBFS feeds: + +1. 
**`gbfs-validator-batch`**: This function is HTTP-triggered by a Cloud Scheduler job.
+2. **`gbfs-validator-pubsub`**: This function is triggered by a Pub/Sub message.
+
+### Pipeline Overview
+
+- **`gbfs-validator-batch`**: This function checks all GBFS feeds in the database and publishes a message to the Pub/Sub topic for each feed to initiate its validation.
+- **`gbfs-validator-pubsub`**: This function is triggered by the Pub/Sub message generated by the batch function and validates the individual feed.
+
+### Message Format
+
+The message published by the batch function to the Pub/Sub topic follows this format:
+
+```json
+{
+  "message": {
+    "data": {
+      "execution_id": "execution_id",
+      "stable_id": "stable_id",
+      "feed_id": "id",
+      "url": "auto_discovery_url",
+      "latest_version": "version"
+    }
+  }
+}
+```
+
+### Functionality Details
+
+- **`gbfs-validator-batch`**: Triggered once per execution ID, this function iterates over all GBFS feeds, preparing and publishing one message per feed to the Pub/Sub topic.
+- **`gbfs-validator-pubsub`**: Triggered once per feed, this function performs the following steps:
+  1. **Snapshot the feed to Cloud Storage**: Downloads all files referenced by the feed, uploads them to the specified Cloud Storage bucket, and updates the `gbfs.json` file to point to the newly uploaded files.
+  2. **Validate the feed**: Runs the GBFS validator on the feed snapshot.
+  3. **Update the database**: Saves the snapshot information and validation report details to the database.
+
+## Function Configuration
+
+### Batch Function Environment Variables
+
+The `gbfs-validator-batch` function requires the following environment variables:
+
+- **`PUBSUB_TOPIC_NAME`**: The name of the Pub/Sub topic where messages will be published.
+- **`PROJECT_ID`**: The Google Cloud project ID used to construct the full topic path.
+- **`FEEDS_DATABASE_URL`**: The database connection string for accessing the GBFS feeds.
+
+### Pub/Sub Function Environment Variables
+
+The `gbfs-validator-pubsub` function requires the following environment variables:
+
+- **`BUCKET_NAME`**: The name of the Cloud Storage bucket where the GBFS snapshots will be stored. Defaults to `"mobilitydata-gbfs-snapshots-dev"` if not set.
+- **`FEEDS_DATABASE_URL`**: The database connection string for accessing the GBFS feeds.
+- **`MAXIMUM_EXECUTIONS`**: The maximum number of times a trace can be executed before it is considered to have reached its limit. Defaults to `1` if not set.
+
+## Local Development
+
+For local development, these functions should be developed and tested according to standard practices for GCP serverless functions. Refer to the main [README.md](../README.md) file for general instructions on setting up the development environment.
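As an editorial aid for reviewers of the message format documented in the README above, here is a minimal, illustrative sketch of how a publisher could emit one such message per feed. It is not the batch function's actual implementation: the helper name `publish_feed_message`, the keys of the `feed` dictionary, and the fallback topic/project values are placeholders, while `PUBSUB_TOPIC_NAME` and `PROJECT_ID` correspond to the environment variables documented in the README.

```python
import json
import os

from google.cloud import pubsub_v1

# Placeholder configuration; the real values come from the function's
# PUBSUB_TOPIC_NAME and PROJECT_ID environment variables.
PROJECT_ID = os.getenv("PROJECT_ID", "my-project")
PUBSUB_TOPIC_NAME = os.getenv("PUBSUB_TOPIC_NAME", "gbfs-validator-topic")


def publish_feed_message(feed: dict, execution_id: str) -> None:
    """Publish a single validation request for one GBFS feed (illustrative only)."""
    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(PROJECT_ID, PUBSUB_TOPIC_NAME)

    # Field names follow the message format documented above; the feed dict
    # keys used here are assumptions for the sake of the example.
    payload = {
        "execution_id": execution_id,
        "stable_id": feed["stable_id"],
        "feed_id": feed["id"],
        "url": feed["auto_discovery_url"],
        "latest_version": feed["version"],
    }
    future = publisher.publish(topic_path, data=json.dumps(payload).encode("utf-8"))
    future.result()  # Block until Pub/Sub has accepted the message.
```

Pub/Sub delivers the payload base64-encoded under `message.data`, which the pubsub-triggered function decodes back into the JSON object shown above before processing the feed.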
diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 4c99904b4..f152050b9 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -73,58 +73,66 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): stable_id_filter = StableIdFilter(stable_id) logging.getLogger().addFilter(stable_id_filter) - - trace_service = DatasetTraceService() - trace_id = str(uuid.uuid4()) - trace = DatasetTrace( - trace_id=trace_id, - stable_id=stable_id, - execution_id=execution_id, - status=Status.PROCESSING, - timestamp=datetime.now(), - pipeline_stage=PipelineStage.GBFS_VALIDATION, - ) - - try: - trace_service.validate_and_save(trace, int(os.getenv("MAXIMUM_EXECUTIONS", 1))) - except (ValueError, MaxExecutionsReachedError) as e: - error_message = str(e) - logging.error(error_message) - save_trace_with_error(trace, error_message, trace_service) - return error_message - - session = None try: - storage_client = storage.Client() - bucket = storage_client.bucket(BUCKET_NAME) - gbfs_data = fetch_gbfs_files(url) - hosted_url = create_gbfs_json_with_bucket_paths(bucket, gbfs_data, stable_id) - except Exception as e: - error_message = f"Error processing GBFS files: {e}" - logging.error(error_message) - save_trace_with_error(trace, error_message, trace_service) - return error_message - - try: - today = datetime.now().strftime("%Y-%m-%d") - snapshot = create_snapshot(stable_id, feed_id, hosted_url) - session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) - - validation_results = validate_gbfs_feed(hosted_url, stable_id, today, bucket) - save_snapshot_and_report(session, snapshot, validation_results) + trace_service = DatasetTraceService() + trace_id = str(uuid.uuid4()) + trace = DatasetTrace( + trace_id=trace_id, + stable_id=stable_id, + execution_id=execution_id, + status=Status.PROCESSING, + timestamp=datetime.now(), + pipeline_stage=PipelineStage.GBFS_VALIDATION, + ) - except Exception as e: - error_message = f"Error validating GBFS feed: {e}" - logging.error(error_message) - save_trace_with_error(trace, error_message, trace_service) - return error_message + try: + trace_service.validate_and_save( + trace, int(os.getenv("MAXIMUM_EXECUTIONS", 1)) + ) + except (ValueError, MaxExecutionsReachedError) as e: + error_message = str(e) + logging.error(error_message) + save_trace_with_error(trace, error_message, trace_service) + return error_message + + session = None + try: + storage_client = storage.Client() + bucket = storage_client.bucket(BUCKET_NAME) + gbfs_data = fetch_gbfs_files(url) + hosted_url = create_gbfs_json_with_bucket_paths( + bucket, gbfs_data, stable_id + ) + except Exception as e: + error_message = f"Error processing GBFS files: {e}" + logging.error(error_message) + save_trace_with_error(trace, error_message, trace_service) + return error_message + + try: + today = datetime.now().strftime("%Y-%m-%d") + snapshot = create_snapshot(stable_id, feed_id, hosted_url) + session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) + + validation_results = validate_gbfs_feed( + hosted_url, stable_id, today, bucket + ) + save_snapshot_and_report(session, snapshot, validation_results) + + except Exception as e: + error_message = f"Error validating GBFS feed: {e}" + logging.error(error_message) + save_trace_with_error(trace, error_message, trace_service) + return error_message + finally: + if session: + session.close() + + trace.status = Status.SUCCESS + trace_service.save(trace) + return "GBFS files processed 
and stored successfully." finally: - if session: - session.close() - - trace.status = Status.SUCCESS - trace_service.save(trace) - return "GBFS files processed and stored successfully." + logging.getLogger().removeFilter(stable_id_filter) @functions_framework.http diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 5917054c9..7a836e2e9 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -448,6 +448,27 @@ resource "google_cloudfunctions2_function" "gbfs_validator_batch" { } } +# Schedule the batch function to run +resource "google_cloud_scheduler_job" "gbfs_validator_batch_scheduler" { + name = "gbfs-validator-batch-scheduler-${var.environment}" + description = "Schedule the gbfs-validator-batch function" + time_zone = "Etc/UTC" + schedule = var.gbfs_scheduler_schedule + paused = var.environment == "prod" ? false : true + depends_on = [google_cloudfunctions2_function.gbfs_validator_batch, google_cloudfunctions2_function_iam_member.gbfs_validator_batch_invoker] + http_target { + http_method = "POST" + uri = google_cloudfunctions2_function.gbfs_validator_batch.url + oidc_token { + service_account_email = google_service_account.functions_service_account.email + } + headers = { + "Content-Type" = "application/json" + } + attempt_deadline = "320s" + } +} + # 5.3 Create function that subscribes to the Pub/Sub topic resource "google_cloudfunctions2_function" "gbfs_validator_pubsub" { name = "${local.function_gbfs_validation_report_config.name}-pubsub" @@ -595,6 +616,15 @@ resource "google_cloud_tasks_queue" "update_validation_report_task_queue" { name = "update-validation-report-task-queue" } +# Task queue to invoke gbfs_validator_batch function for the scheduler +resource "google_cloudfunctions2_function_iam_member" "gbfs_validator_batch_invoker" { + project = var.project_id + location = var.gcp_region + cloud_function = google_cloudfunctions2_function.gbfs_validator_batch.name + role = "roles/cloudfunctions.invoker" + member = "serviceAccount:${google_service_account.functions_service_account.email}" +} + # Grant permissions to the service account to publish to the pubsub topic resource "google_pubsub_topic_iam_member" "functions_publisher" { for_each = { diff --git a/infra/functions-python/vars.tf b/infra/functions-python/vars.tf index 63e2dc1f3..8658d370e 100644 --- a/infra/functions-python/vars.tf +++ b/infra/functions-python/vars.tf @@ -58,3 +58,9 @@ variable "gbfs_bucket_name" { description = "Name of the bucket where the GBFS feeds are stored" default = "mobilitydata-gbfs-snapshots" } + +variable "gbfs_scheduler_schedule" { + type = string + description = "Schedule for the GBFS scheduler job" + default = "0 0 1 * *" # every month on the first day at 00:00 +} From 16d1c4c0c559a5c3edfb6e62aa7fe56688af840a Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 13:50:19 -0400 Subject: [PATCH 14/22] fix: restore gh workflows --- .github/workflows/api-deployer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/api-deployer.yml b/.github/workflows/api-deployer.yml index 5d1a99900..3586532cd 100644 --- a/.github/workflows/api-deployer.yml +++ b/.github/workflows/api-deployer.yml @@ -223,7 +223,7 @@ jobs: terraform-deploy: runs-on: ubuntu-latest permissions: write-all - needs: api-build-test # TODO: restore docker-build-publish before merge + needs: docker-build-publish steps: - name: Checkout code From 1128abb2699df9d7df89697ed82a6985303759e7 Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 
Aug 2024 13:52:17 -0400 Subject: [PATCH 15/22] fix: restore gh workflows --- liquibase/tmp.json | 138 --------------------------------------------- 1 file changed, 138 deletions(-) delete mode 100644 liquibase/tmp.json diff --git a/liquibase/tmp.json b/liquibase/tmp.json deleted file mode 100644 index e8585e241..000000000 --- a/liquibase/tmp.json +++ /dev/null @@ -1,138 +0,0 @@ -{ - "summary": { - "validatorVersion": "1.0.12", - "version": { - "detected": "2.3", - "validated": "2.3" - }, - "hasErrors": true, - "errorsCount": 990 - }, - "filesSummary": [ - { - "required": true, - "exists": true, - "file": "gbfs.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": true, - "file": "gbfs_versions.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": true, - "exists": true, - "file": "system_information.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": true, - "file": "vehicle_types.json", - "hasErrors": true, - "errorsCount": 4, - "groupedErrors": [ - { - "keyword": "enum", - "message": "must be equal to one of the allowed values", - "schemaPath": "#/properties/data/properties/vehicle_types/items/properties/default_pricing_plan_id/enum", - "count": 4 - } - ] - }, - { - "required": false, - "exists": true, - "file": "station_information.json", - "hasErrors": true, - "errorsCount": 984, - "groupedErrors": [ - { - "keyword": "enum", - "message": "must be equal to one of the allowed values", - "schemaPath": "#/properties/data/properties/stations/items/properties/rental_methods/items/enum", - "count": 984 - } - ] - }, - { - "required": false, - "exists": true, - "file": "station_status.json", - "hasErrors": true, - "errorsCount": 2, - "groupedErrors": [ - { - "keyword": "required", - "message": "must have required property 'last_reported'", - "schemaPath": "#/properties/data/properties/stations/items/required", - "count": 2 - } - ] - }, - { - "required": false, - "exists": false, - "file": "free_bike_status.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": false, - "file": "system_hours.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": false, - "file": "system_calendar.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": true, - "file": "system_regions.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": true, - "file": "system_pricing_plans.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": false, - "file": "system_alerts.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - }, - { - "required": false, - "exists": true, - "file": "geofencing_zones.json", - "hasErrors": false, - "errorsCount": 0, - "groupedErrors": [] - } - ] -} \ No newline at end of file From cb9204b270a0c788f4cebb6223083b3e6db1750f Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 14:01:22 -0400 Subject: [PATCH 16/22] fix: gcloud auth test --- functions-python/gbfs_validator/tests/test_gbfs_validator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/functions-python/gbfs_validator/tests/test_gbfs_validator.py b/functions-python/gbfs_validator/tests/test_gbfs_validator.py index bfbbe68f5..bbd206b93 100644 --- 
a/functions-python/gbfs_validator/tests/test_gbfs_validator.py +++ b/functions-python/gbfs_validator/tests/test_gbfs_validator.py @@ -36,8 +36,10 @@ class TestMainFunctions(unittest.TestCase): @patch("gbfs_validator.src.main.validate_gbfs_feed") @patch("gbfs_validator.src.main.save_snapshot_and_report") @patch("gbfs_validator.src.main.Logger") + @patch("gbfs_validator.src.main.storage.Client") def test_gbfs_validator_pubsub( self, + __, _, # mock_logger mock_save_snapshot_and_report, mock_validate_gbfs_feed, From e88edaf556d342a17a2286a9e11faf055acde4f2 Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 14:32:42 -0400 Subject: [PATCH 17/22] fix: infra --- infra/functions-python/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 7a836e2e9..46504a474 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -465,8 +465,8 @@ resource "google_cloud_scheduler_job" "gbfs_validator_batch_scheduler" { headers = { "Content-Type" = "application/json" } - attempt_deadline = "320s" } + attempt_deadline = "320s" } # 5.3 Create function that subscribes to the Pub/Sub topic From 3bc62aecc2d5fa7d1b828b0cfce21f32ff5e894a Mon Sep 17 00:00:00 2001 From: cka-y Date: Mon, 19 Aug 2024 14:45:08 -0400 Subject: [PATCH 18/22] fix: missing region --- infra/functions-python/main.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 46504a474..0f20944ae 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -454,6 +454,7 @@ resource "google_cloud_scheduler_job" "gbfs_validator_batch_scheduler" { description = "Schedule the gbfs-validator-batch function" time_zone = "Etc/UTC" schedule = var.gbfs_scheduler_schedule + region = var.gcp_region paused = var.environment == "prod" ? 
false : true depends_on = [google_cloudfunctions2_function.gbfs_validator_batch, google_cloudfunctions2_function_iam_member.gbfs_validator_batch_invoker] http_target { From 13d39e525cc9693a724d3cca4b1c5967cbce82ff Mon Sep 17 00:00:00 2001 From: cka-y Date: Tue, 20 Aug 2024 10:02:25 -0400 Subject: [PATCH 19/22] fix: PR comments --- api/src/scripts/populate_db_gbfs.py | 3 +- .../gbfs_validator/src/gbfs_utils.py | 55 ++++++++++++------- functions-python/gbfs_validator/src/main.py | 5 +- .../gbfs_validator/tests/test_gbfs_utils.py | 23 +++++++- 4 files changed, 57 insertions(+), 29 deletions(-) diff --git a/api/src/scripts/populate_db_gbfs.py b/api/src/scripts/populate_db_gbfs.py index d0f22efe3..bb706b776 100644 --- a/api/src/scripts/populate_db_gbfs.py +++ b/api/src/scripts/populate_db_gbfs.py @@ -93,10 +93,11 @@ def populate_db(self): country_code = self.get_safe_value(row, "Country Code", "") municipality = self.get_safe_value(row, "Location", "") location_id = self.get_location_id(country_code, None, municipality) + country = pycountry.countries.get(alpha_2=country_code) if country_code else None location = self.db.session.get(Location, location_id) or Location( id=location_id, country_code=country_code, - country=pycountry.countries.get(alpha_2=country_code).name if country_code else None, + country=country.name if country else None, municipality=municipality, ) gbfs_feed.locations.clear() diff --git a/functions-python/gbfs_validator/src/gbfs_utils.py b/functions-python/gbfs_validator/src/gbfs_utils.py index 47e72a3c5..8c23aa4a1 100644 --- a/functions-python/gbfs_validator/src/gbfs_utils.py +++ b/functions-python/gbfs_validator/src/gbfs_utils.py @@ -3,7 +3,7 @@ import os import uuid from datetime import datetime -from typing import Dict, Any +from typing import Dict, Any, Optional import requests from google.cloud import storage @@ -22,6 +22,13 @@ "https://gbfs-validator.mobilitydata.org/.netlify/functions/validator-summary", ) +today = datetime.now().strftime("%Y-%m-%d") + + +def get_snapshot_id(stable_id: str) -> str: + """Get the file ID from the stable ID.""" + return f"{stable_id}-{today}" + def fetch_gbfs_files(url: str) -> Dict[str, Any]: """Fetch the GBFS files from the autodiscovery URL.""" @@ -32,15 +39,21 @@ def fetch_gbfs_files(url: str) -> Dict[str, Any]: def upload_gbfs_file_to_bucket( bucket: storage.Bucket, file_url: str, destination_blob_name: str -) -> str: +) -> Optional[str]: """Upload a GBFS file to a Cloud Storage bucket.""" - response = requests.get(file_url) - response.raise_for_status() - blob = bucket.blob(destination_blob_name) - blob.upload_from_string(response.content) - blob.make_public() - logging.info(f"Uploaded {destination_blob_name} to {bucket.name}.") - return blob.public_url + try: + response = requests.get(file_url) + response.raise_for_status() + blob = bucket.blob(destination_blob_name) + blob.upload_from_string(response.content) + blob.make_public() + logging.info(f"Uploaded {destination_blob_name} to {bucket.name}.") + return blob.public_url + except requests.exceptions.RequestException as error: + logging.error( + f"Error uploading {destination_blob_name} with access url {file_url}: {error}" + ) + return None def create_gbfs_json_with_bucket_paths( @@ -54,27 +67,29 @@ def create_gbfs_json_with_bucket_paths( @return: The public URL of the new gbfs.json. 
""" new_gbfs_data = gbfs_data.copy() - today = datetime.now().strftime("%Y-%m-%d") + snapshot_id = get_snapshot_id(stable_id) for feed_key, feed in new_gbfs_data["data"].items(): if isinstance(feed["feeds"], dict): for feed_language, feed_info in feed["feeds"].items(): old_url = feed_info["url"] - blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}_{feed_language}.json" + blob_name = f"{stable_id}/{snapshot_id}/{feed_info['name']}_{feed_language}.json" new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) - feed_info["url"] = new_url + if new_url is not None: + feed_info["url"] = new_url elif isinstance(feed["feeds"], list): for feed_info in feed["feeds"]: old_url = feed_info["url"] - blob_name = f"{stable_id}/{stable_id}-{today}/{feed_info['name']}.json" + blob_name = f"{stable_id}/{snapshot_id}/{feed_info['name']}.json" new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) - feed_info["url"] = new_url + if new_url is not None: + feed_info["url"] = new_url else: logging.warning(f"Unexpected format in feed: {feed_key}") # Save the new gbfs.json in the bucket new_gbfs_data["last_updated"] = today - new_gbfs_blob = bucket.blob(f"{stable_id}/{stable_id}-{today}/gbfs.json") + new_gbfs_blob = bucket.blob(f"{stable_id}/{snapshot_id}/gbfs.json") new_gbfs_blob.upload_from_string( json.dumps(new_gbfs_data), content_type="application/json" ) @@ -91,11 +106,10 @@ def save_trace_with_error(trace, error, trace_service): def create_snapshot(stable_id: str, feed_id: str, hosted_url: str) -> Gbfssnapshot: """Create a new Gbfssnapshot object.""" - today = datetime.now().strftime("%Y-%m-%d") snapshot_id = str(uuid.uuid4()) snapshot = Gbfssnapshot( id=snapshot_id, - stable_id=f"{stable_id}-{today}", + stable_id=get_snapshot_id(stable_id), feed_id=feed_id, downloaded_at=datetime.now(), hosted_url=hosted_url, @@ -104,17 +118,16 @@ def create_snapshot(stable_id: str, feed_id: str, hosted_url: str) -> Gbfssnapsh def validate_gbfs_feed( - hosted_url: str, stable_id: str, today: str, bucket: storage.Bucket + hosted_url: str, stable_id: str, bucket: storage.Bucket ) -> Dict[str, Any]: """Validate the GBFS feed and store the report in Cloud Storage.""" json_payload = {"url": hosted_url} + snapshot_id = get_snapshot_id(stable_id) response = requests.post(VALIDATOR_URL, json=json_payload) response.raise_for_status() json_report_summary = response.json() - report_summary_blob = bucket.blob( - f"{stable_id}/{stable_id}-{today}/report_summary.json" - ) + report_summary_blob = bucket.blob(f"{stable_id}/{snapshot_id}/report_summary.json") report_summary_blob.upload_from_string( json.dumps(json_report_summary), content_type="application/json" ) diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index f152050b9..99f895a0a 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -110,13 +110,10 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): return error_message try: - today = datetime.now().strftime("%Y-%m-%d") snapshot = create_snapshot(stable_id, feed_id, hosted_url) session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) - validation_results = validate_gbfs_feed( - hosted_url, stable_id, today, bucket - ) + validation_results = validate_gbfs_feed(hosted_url, stable_id, bucket) save_snapshot_and_report(session, snapshot, validation_results) except Exception as e: diff --git a/functions-python/gbfs_validator/tests/test_gbfs_utils.py 
b/functions-python/gbfs_validator/tests/test_gbfs_utils.py index 1284912e4..b8f54c060 100644 --- a/functions-python/gbfs_validator/tests/test_gbfs_utils.py +++ b/functions-python/gbfs_validator/tests/test_gbfs_utils.py @@ -2,6 +2,7 @@ import uuid from datetime import datetime from unittest.mock import patch, MagicMock +import requests from gbfs_validator.src.gbfs_utils import ( fetch_gbfs_files, @@ -11,12 +12,19 @@ create_snapshot, validate_gbfs_feed, save_snapshot_and_report, - VALIDATOR_URL, + VALIDATOR_URL, get_snapshot_id, ) from dataset_service.main import Status class TestGbfsUtils(unittest.TestCase): + + def test_get_snapshot_id(self): + stable_id = "test_stable_id" + today = datetime.now().strftime("%Y-%m-%d") + result = get_snapshot_id(stable_id) + self.assertEqual(result, f"{stable_id}-{today}") + @patch("requests.get") def test_fetch_gbfs_files(self, mock_get): mock_response = MagicMock() @@ -48,6 +56,16 @@ def test_upload_gbfs_file_to_bucket(self, mock_get): mock_blob.upload_from_string.assert_called_once_with(b"file_content") mock_blob.make_public.assert_called_once() + @patch("requests.get") + def test_upload_gbfs_file_to_bucket_exception(self, mock_get): + mock_get.side_effect = requests.exceptions.RequestException("Error") + mock_bucket = MagicMock() + + result = upload_gbfs_file_to_bucket( + mock_bucket, "http://file-url.com", "destination_blob" + ) + self.assertIsNone(result) + @patch("gbfs_validator.src.gbfs_utils.upload_gbfs_file_to_bucket") def test_create_gbfs_json_with_bucket_paths(self, mock_upload): mock_upload.return_value = "http://new-url.com" @@ -102,11 +120,10 @@ def test_validate_gbfs_feed(self, mock_blob, mock_post): hosted_url = "http://hosted-url.com" stable_id = "test_stable_id" - today = datetime.now().strftime("%Y-%m-%d") mock_bucket = MagicMock() mock_bucket.blob.return_value = mock_blob_obj - result = validate_gbfs_feed(hosted_url, stable_id, today, mock_bucket) + result = validate_gbfs_feed(hosted_url, stable_id, mock_bucket) self.assertEqual( result["json_report_summary"], {"summary": "validation report"} From d37f6e92719c32637844306a5306163ce92e1517 Mon Sep 17 00:00:00 2001 From: cka-y Date: Tue, 20 Aug 2024 10:02:52 -0400 Subject: [PATCH 20/22] fix: lint --- functions-python/gbfs_validator/tests/test_gbfs_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/functions-python/gbfs_validator/tests/test_gbfs_utils.py b/functions-python/gbfs_validator/tests/test_gbfs_utils.py index b8f54c060..ad959cc56 100644 --- a/functions-python/gbfs_validator/tests/test_gbfs_utils.py +++ b/functions-python/gbfs_validator/tests/test_gbfs_utils.py @@ -12,13 +12,13 @@ create_snapshot, validate_gbfs_feed, save_snapshot_and_report, - VALIDATOR_URL, get_snapshot_id, + VALIDATOR_URL, + get_snapshot_id, ) from dataset_service.main import Status class TestGbfsUtils(unittest.TestCase): - def test_get_snapshot_id(self): stable_id = "test_stable_id" today = datetime.now().strftime("%Y-%m-%d") From 440b541d24051ba8bbcfeb7b1f67c5d21b210296 Mon Sep 17 00:00:00 2001 From: cka-y Date: Tue, 20 Aug 2024 11:25:32 -0400 Subject: [PATCH 21/22] fix: PR comment --- .../batch_process_dataset/src/main.py | 2 +- .../gbfs_validator/src/gbfs_utils.py | 230 +++++++++--------- functions-python/gbfs_validator/src/main.py | 21 +- .../gbfs_validator/tests/test_gbfs_utils.py | 39 ++- .../tests/test_gbfs_validator.py | 6 +- 5 files changed, 141 insertions(+), 157 deletions(-) diff --git a/functions-python/batch_process_dataset/src/main.py 
b/functions-python/batch_process_dataset/src/main.py index dd791d9f5..15448347e 100644 --- a/functions-python/batch_process_dataset/src/main.py +++ b/functions-python/batch_process_dataset/src/main.py @@ -75,7 +75,7 @@ def __init__( self.execution_id = execution_id self.authentication_type = authentication_type self.api_key_parameter_name = api_key_parameter_name - self.date = datetime.now().strftime("%Y%m%d%H%S") + self.date = datetime.now().strftime("%Y%m%d%H%M") feeds_credentials = ast.literal_eval(os.getenv("FEED_CREDENTIALS", "{}")) self.feed_credentials = feeds_credentials.get(self.feed_stable_id, None) self.public_hosted_datasets_url = public_hosted_datasets_url diff --git a/functions-python/gbfs_validator/src/gbfs_utils.py b/functions-python/gbfs_validator/src/gbfs_utils.py index 8c23aa4a1..e9eaf98f1 100644 --- a/functions-python/gbfs_validator/src/gbfs_utils.py +++ b/functions-python/gbfs_validator/src/gbfs_utils.py @@ -7,136 +7,95 @@ import requests from google.cloud import storage - from database_gen.sqlacodegen_models import ( Gbfssnapshot, Gbfsvalidationreport, Gbfsnotice, ) -from dataset_service.main import ( - Status, -) - -VALIDATOR_URL = os.getenv( - "VALIDATOR_URL", - "https://gbfs-validator.mobilitydata.org/.netlify/functions/validator-summary", -) - -today = datetime.now().strftime("%Y-%m-%d") - +from dataset_service.main import Status -def get_snapshot_id(stable_id: str) -> str: - """Get the file ID from the stable ID.""" - return f"{stable_id}-{today}" - -def fetch_gbfs_files(url: str) -> Dict[str, Any]: - """Fetch the GBFS files from the autodiscovery URL.""" - response = requests.get(url) - response.raise_for_status() - return response.json() - - -def upload_gbfs_file_to_bucket( - bucket: storage.Bucket, file_url: str, destination_blob_name: str -) -> Optional[str]: - """Upload a GBFS file to a Cloud Storage bucket.""" - try: - response = requests.get(file_url) - response.raise_for_status() - blob = bucket.blob(destination_blob_name) - blob.upload_from_string(response.content) - blob.make_public() - logging.info(f"Uploaded {destination_blob_name} to {bucket.name}.") - return blob.public_url - except requests.exceptions.RequestException as error: - logging.error( - f"Error uploading {destination_blob_name} with access url {file_url}: {error}" +class GBFSValidator: + def __init__(self, stable_id: str): + self.validation_timestamp = datetime.now().strftime("%Y%m%d%H%M") + self.stable_id = stable_id + self.snapshot_id = f"{self.stable_id}-{self.validation_timestamp}" + self.VALIDATOR_URL = os.getenv( + "VALIDATOR_URL", + "https://gbfs-validator.mobilitydata.org/.netlify/functions/validator-summary", ) - return None - - -def create_gbfs_json_with_bucket_paths( - bucket: storage.Bucket, gbfs_data: Dict[str, Any], stable_id: str -) -> str: - """ - Create a new gbfs.json with paths pointing to Cloud Storage and upload it. - @param bucket: The Cloud Storage bucket. - @param gbfs_data: The GBFS data. - @param stable_id: The stable ID of the feed. - @return: The public URL of the new gbfs.json. 
- """ - new_gbfs_data = gbfs_data.copy() - snapshot_id = get_snapshot_id(stable_id) - - for feed_key, feed in new_gbfs_data["data"].items(): - if isinstance(feed["feeds"], dict): - for feed_language, feed_info in feed["feeds"].items(): - old_url = feed_info["url"] - blob_name = f"{stable_id}/{snapshot_id}/{feed_info['name']}_{feed_language}.json" - new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) - if new_url is not None: - feed_info["url"] = new_url - elif isinstance(feed["feeds"], list): - for feed_info in feed["feeds"]: - old_url = feed_info["url"] - blob_name = f"{stable_id}/{snapshot_id}/{feed_info['name']}.json" - new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) - if new_url is not None: - feed_info["url"] = new_url - else: - logging.warning(f"Unexpected format in feed: {feed_key}") - - # Save the new gbfs.json in the bucket - new_gbfs_data["last_updated"] = today - new_gbfs_blob = bucket.blob(f"{stable_id}/{snapshot_id}/gbfs.json") - new_gbfs_blob.upload_from_string( - json.dumps(new_gbfs_data), content_type="application/json" - ) - new_gbfs_blob.make_public() - return new_gbfs_blob.public_url - - -def save_trace_with_error(trace, error, trace_service): - """Helper function to save trace with an error.""" - trace.error_message = error - trace.status = Status.FAILED - trace_service.save(trace) - - -def create_snapshot(stable_id: str, feed_id: str, hosted_url: str) -> Gbfssnapshot: - """Create a new Gbfssnapshot object.""" - snapshot_id = str(uuid.uuid4()) - snapshot = Gbfssnapshot( - id=snapshot_id, - stable_id=get_snapshot_id(stable_id), - feed_id=feed_id, - downloaded_at=datetime.now(), - hosted_url=hosted_url, - ) - return snapshot - + self.hosted_url = None # The hosted URL of the new gbfs.json + + def create_gbfs_json_with_bucket_paths( + self, bucket: storage.Bucket, gbfs_data: Dict[str, Any] + ) -> None: + """ + Create a new gbfs.json with paths pointing to Cloud Storage and upload it. + @param bucket: The Cloud Storage bucket. + @param gbfs_data: The GBFS data. + @return: The public URL of the new gbfs.json. 
+ """ + new_gbfs_data = gbfs_data.copy() + + for feed_key, feed in new_gbfs_data["data"].items(): + if isinstance(feed["feeds"], dict): + for feed_language, feed_info in feed["feeds"].items(): + old_url = feed_info["url"] + blob_name = f"{self.stable_id}/{self.snapshot_id}/{feed_info['name']}_{feed_language}.json" + new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) + if new_url is not None: + feed_info["url"] = new_url + elif isinstance(feed["feeds"], list): + for feed_info in feed["feeds"]: + old_url = feed_info["url"] + blob_name = ( + f"{self.stable_id}/{self.snapshot_id}/{feed_info['name']}.json" + ) + new_url = upload_gbfs_file_to_bucket(bucket, old_url, blob_name) + if new_url is not None: + feed_info["url"] = new_url + else: + logging.warning(f"Unexpected format in feed: {feed_key}") + + # Save the new gbfs.json in the bucket + new_gbfs_blob = bucket.blob(f"{self.stable_id}/{self.snapshot_id}/gbfs.json") + new_gbfs_blob.upload_from_string( + json.dumps(new_gbfs_data), content_type="application/json" + ) + new_gbfs_blob.make_public() + self.hosted_url = new_gbfs_blob.public_url + + def create_snapshot(self, feed_id: str) -> Gbfssnapshot: + """Create a new Gbfssnapshot object.""" + snapshot_id = str(uuid.uuid4()) + snapshot = Gbfssnapshot( + id=snapshot_id, + stable_id=self.snapshot_id, + feed_id=feed_id, + downloaded_at=datetime.now(), + hosted_url=self.hosted_url, + ) + return snapshot -def validate_gbfs_feed( - hosted_url: str, stable_id: str, bucket: storage.Bucket -) -> Dict[str, Any]: - """Validate the GBFS feed and store the report in Cloud Storage.""" - json_payload = {"url": hosted_url} - snapshot_id = get_snapshot_id(stable_id) - response = requests.post(VALIDATOR_URL, json=json_payload) - response.raise_for_status() + def validate_gbfs_feed(self, bucket: storage.Bucket) -> Dict[str, Any]: + """Validate the GBFS feed and store the report in Cloud Storage.""" + json_payload = {"url": self.hosted_url} + response = requests.post(self.VALIDATOR_URL, json=json_payload) + response.raise_for_status() - json_report_summary = response.json() - report_summary_blob = bucket.blob(f"{stable_id}/{snapshot_id}/report_summary.json") - report_summary_blob.upload_from_string( - json.dumps(json_report_summary), content_type="application/json" - ) - report_summary_blob.make_public() + json_report_summary = response.json() + report_summary_blob = bucket.blob( + f"{self.stable_id}/{self.snapshot_id}/report_summary.json" + ) + report_summary_blob.upload_from_string( + json.dumps(json_report_summary), content_type="application/json" + ) + report_summary_blob.make_public() - return { - "report_summary_url": report_summary_blob.public_url, - "json_report_summary": json_report_summary, - } + return { + "report_summary_url": report_summary_blob.public_url, + "json_report_summary": json_report_summary, + } def save_snapshot_and_report( @@ -166,3 +125,36 @@ def save_snapshot_and_report( snapshot.gbfsvalidationreports = [validation_report] session.add(snapshot) session.commit() + + +def fetch_gbfs_files(url: str) -> Dict[str, Any]: + """Fetch the GBFS files from the autodiscovery URL.""" + response = requests.get(url) + response.raise_for_status() + return response.json() + + +def save_trace_with_error(trace, error, trace_service): + """Helper function to save trace with an error.""" + trace.error_message = error + trace.status = Status.FAILED + trace_service.save(trace) + + +def upload_gbfs_file_to_bucket( + bucket: storage.Bucket, file_url: str, destination_blob_name: str +) -> 
Optional[str]: + """Upload a GBFS file to a Cloud Storage bucket.""" + try: + response = requests.get(file_url) + response.raise_for_status() + blob = bucket.blob(destination_blob_name) + blob.upload_from_string(response.content) + blob.make_public() + logging.info(f"Uploaded {destination_blob_name} to {bucket.name}.") + return blob.public_url + except requests.exceptions.RequestException as error: + logging.error( + f"Error uploading {destination_blob_name} with access url {file_url}: {error}" + ) + return None diff --git a/functions-python/gbfs_validator/src/main.py b/functions-python/gbfs_validator/src/main.py index 99f895a0a..802a6de43 100644 --- a/functions-python/gbfs_validator/src/main.py +++ b/functions-python/gbfs_validator/src/main.py @@ -18,17 +18,15 @@ PipelineStage, MaxExecutionsReachedError, ) +from helpers.database import start_db_session +from helpers.logger import Logger, StableIdFilter +from helpers.parser import jsonify_pubsub from .gbfs_utils import ( + GBFSValidator, fetch_gbfs_files, - create_gbfs_json_with_bucket_paths, save_trace_with_error, - create_snapshot, - validate_gbfs_feed, save_snapshot_and_report, ) -from helpers.database import start_db_session -from helpers.logger import Logger, StableIdFilter -from helpers.parser import jsonify_pubsub logging.basicConfig(level=logging.INFO) @@ -51,7 +49,6 @@ def fetch_all_gbfs_feeds() -> List[Gbfsfeed]: session.close() -@functions_framework.cloud_event @functions_framework.cloud_event def gbfs_validator_pubsub(cloud_event: CloudEvent): Logger.init_logger() @@ -100,9 +97,8 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): storage_client = storage.Client() bucket = storage_client.bucket(BUCKET_NAME) gbfs_data = fetch_gbfs_files(url) - hosted_url = create_gbfs_json_with_bucket_paths( - bucket, gbfs_data, stable_id - ) + validator = GBFSValidator(stable_id) + validator.create_gbfs_json_with_bucket_paths(bucket, gbfs_data) except Exception as e: error_message = f"Error processing GBFS files: {e}" logging.error(error_message) @@ -110,10 +106,9 @@ def gbfs_validator_pubsub(cloud_event: CloudEvent): return error_message try: - snapshot = create_snapshot(stable_id, feed_id, hosted_url) + snapshot = validator.create_snapshot(feed_id) + validation_results = validator.validate_gbfs_feed(bucket) session = start_db_session(os.getenv("FEEDS_DATABASE_URL")) - - validation_results = validate_gbfs_feed(hosted_url, stable_id, bucket) save_snapshot_and_report(session, snapshot, validation_results) except Exception as e: diff --git a/functions-python/gbfs_validator/tests/test_gbfs_utils.py b/functions-python/gbfs_validator/tests/test_gbfs_utils.py index ad959cc56..96462b12c 100644 --- a/functions-python/gbfs_validator/tests/test_gbfs_utils.py +++ b/functions-python/gbfs_validator/tests/test_gbfs_utils.py @@ -1,29 +1,23 @@ import unittest import uuid -from datetime import datetime from unittest.mock import patch, MagicMock + import requests +from dataset_service.main import Status from gbfs_validator.src.gbfs_utils import ( fetch_gbfs_files, upload_gbfs_file_to_bucket, - create_gbfs_json_with_bucket_paths, save_trace_with_error, - create_snapshot, - validate_gbfs_feed, save_snapshot_and_report, - VALIDATOR_URL, - get_snapshot_id, + GBFSValidator, ) -from dataset_service.main import Status class TestGbfsUtils(unittest.TestCase): - def test_get_snapshot_id(self): - stable_id = "test_stable_id" - today = datetime.now().strftime("%Y-%m-%d") - result = get_snapshot_id(stable_id) - self.assertEqual(result, f"{stable_id}-{today}") + def 
setUp(self): + self.stable_id = "test_stable_id" + self.validator = GBFSValidator(self.stable_id) @patch("requests.get") def test_fetch_gbfs_files(self, mock_get): @@ -74,11 +68,11 @@ def test_create_gbfs_json_with_bucket_paths(self, mock_upload): gbfs_data = { "data": {"en": {"feeds": [{"url": "http://old-url.com", "name": "feed"}]}} } - stable_id = "test_stable_id" + mock_bucket.blob.return_value.public_url = "http://new-url.com" - result = create_gbfs_json_with_bucket_paths(mock_bucket, gbfs_data, stable_id) - self.assertEqual(result, "http://new-url.com") + self.validator.create_gbfs_json_with_bucket_paths(mock_bucket, gbfs_data) + self.assertEqual(self.validator.hosted_url, "http://new-url.com") def test_save_trace_with_error(self): mock_trace = MagicMock() @@ -91,14 +85,15 @@ def test_save_trace_with_error(self): self.assertEqual(mock_trace.status, Status.FAILED) def test_create_snapshot(self): - stable_id = "test_stable_id" feed_id = "test_feed_id" hosted_url = "http://hosted-url.com" + self.validator.hosted_url = hosted_url - snapshot = create_snapshot(stable_id, feed_id, hosted_url) + snapshot = self.validator.create_snapshot(feed_id) self.assertEqual( - snapshot.stable_id, f"{stable_id}-{datetime.now().strftime('%Y-%m-%d')}" + snapshot.stable_id, + f"{self.stable_id}-{self.validator.validation_timestamp}", ) self.assertEqual(snapshot.feed_id, feed_id) self.assertEqual(snapshot.hosted_url, hosted_url) @@ -119,17 +114,19 @@ def test_validate_gbfs_feed(self, mock_blob, mock_post): mock_blob.return_value = mock_blob_obj hosted_url = "http://hosted-url.com" - stable_id = "test_stable_id" + self.validator.hosted_url = hosted_url mock_bucket = MagicMock() mock_bucket.blob.return_value = mock_blob_obj - result = validate_gbfs_feed(hosted_url, stable_id, mock_bucket) + result = self.validator.validate_gbfs_feed(mock_bucket) self.assertEqual( result["json_report_summary"], {"summary": "validation report"} ) self.assertEqual(result["report_summary_url"], mock_blob_obj.public_url) - mock_post.assert_called_once_with(VALIDATOR_URL, json={"url": hosted_url}) + mock_post.assert_called_once_with( + self.validator.VALIDATOR_URL, json={"url": hosted_url} + ) mock_blob_obj.upload_from_string.assert_called_once() @patch("gbfs_validator.src.gbfs_utils.Gbfsvalidationreport") diff --git a/functions-python/gbfs_validator/tests/test_gbfs_validator.py b/functions-python/gbfs_validator/tests/test_gbfs_validator.py index bbd206b93..26e242941 100644 --- a/functions-python/gbfs_validator/tests/test_gbfs_validator.py +++ b/functions-python/gbfs_validator/tests/test_gbfs_validator.py @@ -31,9 +31,9 @@ class TestMainFunctions(unittest.TestCase): @patch("gbfs_validator.src.main.start_db_session") @patch("gbfs_validator.src.main.DatasetTraceService") @patch("gbfs_validator.src.main.fetch_gbfs_files") - @patch("gbfs_validator.src.main.create_gbfs_json_with_bucket_paths") - @patch("gbfs_validator.src.main.create_snapshot") - @patch("gbfs_validator.src.main.validate_gbfs_feed") + @patch("gbfs_validator.src.main.GBFSValidator.create_gbfs_json_with_bucket_paths") + @patch("gbfs_validator.src.main.GBFSValidator.create_snapshot") + @patch("gbfs_validator.src.main.GBFSValidator.validate_gbfs_feed") @patch("gbfs_validator.src.main.save_snapshot_and_report") @patch("gbfs_validator.src.main.Logger") @patch("gbfs_validator.src.main.storage.Client") From dc5e5fd40092b2e8f1ee8e1353e664a69cebbb7a Mon Sep 17 00:00:00 2001 From: cka-y Date: Tue, 20 Aug 2024 11:33:13 -0400 Subject: [PATCH 22/22] fix: region of gcp bucket --- 
infra/functions-python/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infra/functions-python/main.tf b/infra/functions-python/main.tf index 0f20944ae..ddf768c82 100644 --- a/infra/functions-python/main.tf +++ b/infra/functions-python/main.tf @@ -68,7 +68,7 @@ resource "google_storage_bucket" "functions_bucket" { } resource "google_storage_bucket" "gbfs_snapshots_bucket" { - location = "us" + location = var.gcp_region name = "${var.gbfs_bucket_name}-${var.environment}" }
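As a closing orientation for reviewers, the sketch below shows how the refactored `GBFSValidator` class from this series is wired together in `gbfs_validator_pubsub`, with tracing, error handling, and session cleanup omitted for brevity. The `stable_id`, `feed_id`, and `url` values are placeholders; everything else mirrors the calls visible in the diffs above and assumes the repo's modules are importable.

```python
import os

from google.cloud import storage

from gbfs_validator.src.gbfs_utils import (
    GBFSValidator,
    fetch_gbfs_files,
    save_snapshot_and_report,
)
from helpers.database import start_db_session

# Placeholder identifiers; in production these come from the Pub/Sub message.
stable_id, feed_id = "gbfs-example", "feed-uuid"
url = "https://example.com/gbfs.json"

bucket = storage.Client().bucket(
    os.getenv("BUCKET_NAME", "mobilitydata-gbfs-snapshots-dev")
)

gbfs_data = fetch_gbfs_files(url)  # 1. Fetch the autodiscovery gbfs.json.
validator = GBFSValidator(stable_id)
validator.create_gbfs_json_with_bucket_paths(bucket, gbfs_data)  # 2. Mirror the feed files into the bucket.

snapshot = validator.create_snapshot(feed_id)  # 3. Build the Gbfssnapshot row for this run.
validation_results = validator.validate_gbfs_feed(bucket)  # 4. Run the hosted validator and store the report summary.

session = start_db_session(os.getenv("FEEDS_DATABASE_URL"))
save_snapshot_and_report(session, snapshot, validation_results)  # 5. Persist snapshot + validation report.
session.close()
```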