Merge pull request #136 from openstates/scrape-update-archive-handle-exceptions

Scrape archiving: handle exceptions so archiving doesn't fail scrape
jessemortenson authored Aug 14, 2024
2 parents 7400639 + 4af2454 commit ec0f5e5
Showing 3 changed files with 25 additions and 15 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
```diff
@@ -1,5 +1,9 @@
 # Changelog
 
+## 6.20.2 - Aug 14, 2024
+
+* Prevent failure in Google Cloud Storage archiving from failing a scrape/update operation
+
 ## 6.20.1 - Aug 2, 2024
 
 * Fix permissions issue caused by slightly wrong usage of GCP storage client code
```
34 changes: 20 additions & 14 deletions openstates/cli/update.py
```diff
@@ -217,22 +217,28 @@ def archive_to_cloud_storage(
         return
     logger.info("Beginning archive of scraped files to google cloud storage.")
     logger.info(f"GCP Project is {GCP_PROJECT} and bucket is {BUCKET_NAME}")
-    cloud_storage_client = storage.Client(project=GCP_PROJECT)
-    bucket = cloud_storage_client.bucket(BUCKET_NAME)
-    jurisdiction_id = juris.jurisdiction_id.replace("ocd-jurisdiction/", "")
-    destination_prefx = (
-        f"{SCRAPE_LAKE_PREFIX}/{jurisdiction_id}/{last_scrape_end_datetime.isoformat()}"
-    )
-
-    # read files in directory and upload
-    files_count = 0
-    for file_path in glob.glob(datadir + "/*.json"):
-        files_count += 1
-        blob_name = os.path.join(destination_prefx, os.path.basename(file_path))
-        blob = bucket.blob(blob_name)
-        blob.upload_from_filename(file_path)
-
-    logger.info(f"Completed archive to Google Cloud Storage, {files_count} files were uploaded.")
+    # Catch exceptions so that we do not fail the scrape if transient GCS error occurs
+    try:
+        cloud_storage_client = storage.Client(project=GCP_PROJECT)
+        bucket = cloud_storage_client.bucket(BUCKET_NAME)
+        jurisdiction_id = juris.jurisdiction_id.replace("ocd-jurisdiction/", "")
+        destination_prefx = (
+            f"{SCRAPE_LAKE_PREFIX}/{jurisdiction_id}/{last_scrape_end_datetime.isoformat()}"
+        )
+
+        # read files in directory and upload
+        files_count = 0
+        for file_path in glob.glob(datadir + "/*.json"):
+            files_count += 1
+            blob_name = os.path.join(destination_prefx, os.path.basename(file_path))
+            blob = bucket.blob(blob_name)
+            blob.upload_from_filename(file_path)
+
+        logger.info(f"Completed archive to Google Cloud Storage, {files_count} files were uploaded.")
+    except Exception as e:
+        logger.warning(f"An error occurred during the attempt to archive files to Google Cloud Storage: {e}")
 
 
 def do_import(juris: State, args: argparse.Namespace) -> dict[str, typing.Any]:
```
2 changes: 1 addition & 1 deletion pyproject.toml
```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "openstates"
-version = "6.20.1"
+version = "6.20.2"
 description = "core infrastructure for the openstates project"
 authors = ["James Turk <[email protected]>"]
 license = "MIT"
```
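The pattern in the update.py change generalizes: a best-effort side task (here, archiving scrape output to GCS) is wrapped in a broad try/except so that a transient failure is logged as a warning instead of propagating and failing the primary operation. A minimal self-contained sketch of the same idea — `archive_files` and `run_scrape` are illustrative stand-ins, not functions from the openstates codebase:

```python
import logging

logger = logging.getLogger(__name__)


def archive_files(paths: list[str]) -> None:
    # Stand-in for the GCS upload loop; always raises to simulate
    # a transient storage outage.
    raise ConnectionError("simulated transient storage outage")


def run_scrape(paths: list[str]) -> str:
    # The scrape's real work happens first and determines the result.
    result = "scrape-ok"
    # Archiving is best-effort: catch everything so a storage hiccup
    # cannot turn a successful scrape into a failed one.
    try:
        archive_files(paths)
    except Exception as e:
        logger.warning(f"Archiving failed, continuing anyway: {e}")
    return result


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    # Prints "scrape-ok" even though archive_files always raises.
    print(run_scrape(["data/bill_1.json"]))
```

The trade-off of a broad `except Exception` is that it can also hide non-transient bugs in the archive path, which is why the handler logs the error at warning level rather than swallowing it silently.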
