Merge pull request #136 from openstates/scrape-update-archive-handle-exceptions

Scrape archiving: handle exceptions so archiving doesn't fail scrape
jessemortenson authored Aug 14, 2024
2 parents 7400639 + 4af2454 commit ec0f5e5
Showing 3 changed files with 25 additions and 15 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
```diff
@@ -1,5 +1,9 @@
 # Changelog
 
+## 6.20.2 - Aug 14, 2024
+
+* Prevent failure in Google Cloud Storage archiving from failing a scrape/update operation
+
 ## 6.20.1 - Aug 2, 2024
 
 * Fix permissions issue caused by slightly wrong usage of GCP storage client code
```
34 changes: 20 additions & 14 deletions openstates/cli/update.py
```diff
@@ -217,22 +217,28 @@ def archive_to_cloud_storage(
         return
     logger.info("Beginning archive of scraped files to google cloud storage.")
     logger.info(f"GCP Project is {GCP_PROJECT} and bucket is {BUCKET_NAME}")
-    cloud_storage_client = storage.Client(project=GCP_PROJECT)
-    bucket = cloud_storage_client.bucket(BUCKET_NAME)
-    jurisdiction_id = juris.jurisdiction_id.replace("ocd-jurisdiction/", "")
-    destination_prefx = (
-        f"{SCRAPE_LAKE_PREFIX}/{jurisdiction_id}/{last_scrape_end_datetime.isoformat()}"
-    )
-
-    # read files in directory and upload
-    files_count = 0
-    for file_path in glob.glob(datadir + "/*.json"):
-        files_count += 1
-        blob_name = os.path.join(destination_prefx, os.path.basename(file_path))
-        blob = bucket.blob(blob_name)
-        blob.upload_from_filename(file_path)
-
-    logger.info(f"Completed archive to Google Cloud Storage, {files_count} files were uploaded.")
+    # Catch exceptions so that we do not fail the scrape if transient GCS error occurs
+    try:
+        cloud_storage_client = storage.Client(project=GCP_PROJECT)
+        bucket = cloud_storage_client.bucket(BUCKET_NAME)
+        jurisdiction_id = juris.jurisdiction_id.replace("ocd-jurisdiction/", "")
+        destination_prefx = (
+            f"{SCRAPE_LAKE_PREFIX}/{jurisdiction_id}/{last_scrape_end_datetime.isoformat()}"
+        )
+
+        # read files in directory and upload
+        files_count = 0
+        for file_path in glob.glob(datadir + "/*.json"):
+            files_count += 1
+            blob_name = os.path.join(destination_prefx, os.path.basename(file_path))
+            blob = bucket.blob(blob_name)
+            blob.upload_from_filename(file_path)
+
+        logger.info(f"Completed archive to Google Cloud Storage, {files_count} files were uploaded.")
+    except Exception as e:
+        logger.warning(f"An error occurred during the attempt to archive files to Google Cloud Storage: {e}")
 
 
 def do_import(juris: State, args: argparse.Namespace) -> dict[str, typing.Any]:
```
2 changes: 1 addition & 1 deletion pyproject.toml
```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "openstates"
-version = "6.20.1"
+version = "6.20.2"
 description = "core infrastructure for the openstates project"
 authors = ["James Turk <[email protected]>"]
 license = "MIT"
```
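The pattern in the update.py change generalizes: a best-effort side task (here, archiving scrape output to GCS) is wrapped in a broad try/except so that a transient failure is logged as a warning instead of propagating and failing the primary operation. A minimal self-contained sketch of the same idea — `archive_files` and `run_scrape` are illustrative stand-ins, not functions from the openstates codebase:

```python
import logging

logger = logging.getLogger(__name__)


def archive_files(paths: list[str]) -> None:
    # Stand-in for the GCS upload loop; always raises to simulate
    # a transient storage outage.
    raise ConnectionError("simulated transient storage outage")


def run_scrape(paths: list[str]) -> str:
    # The scrape's real work happens first and determines the result.
    result = "scrape-ok"
    # Archiving is best-effort: catch everything so a storage hiccup
    # cannot turn a successful scrape into a failed one.
    try:
        archive_files(paths)
    except Exception as e:
        logger.warning(f"Archiving failed, continuing anyway: {e}")
    return result


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    # Prints "scrape-ok" even though archive_files always raises.
    print(run_scrape(["data/bill_1.json"]))
```

The trade-off of a broad `except Exception` is that it can also hide non-transient bugs in the archive path, which is why the handler logs the error at warning level rather than swallowing it silently.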
