Update Science museum urls (#4276)
* Update Science Museum DAG to check if urls are reachable

* Add one-time DAG to update existing urls

* Fix docstrings, more performant chain

* Update mocks in tests

* Update dag docs

* Decrease batch size, limit number of concurrent tasks

* Commit ids to update to a temp table and drop it later
stacimc authored May 9, 2024
1 parent 43d4d09 commit cf9e6a8
Showing 4 changed files with 250 additions and 11 deletions.
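
For context, the change boils down to rewriting each stored Science Museum url to drop the `/images` path segment, then verifying that the rewritten url is reachable. A minimal sketch of the rewrite (the url here is illustrative, not taken from the commit):

```python
old_url = "https://coimages.sciencemuseumgroup.org.uk/images/3/563/example.jpg"

# Mirrors the SQL REPLACE(url, '/images', '') that the DAG below applies in bulk
new_url = old_url.replace("/images", "")

assert new_url == "https://coimages.sciencemuseumgroup.org.uk/3/563/example.jpg"
```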
203 changes: 203 additions & 0 deletions catalog/dags/maintenance/update_science_museum_urls.py
@@ -0,0 +1,203 @@
"""
# Update Science Museum URLs
One-time maintenance DAG to update Science Museum records to have valid URLs. See https://github.com/WordPress/openverse/issues/4261.
For each Science Museum record, this DAG:
* updates the url to the new format, excluding `/images/` in the path if it exists
* validates whether the url is reachable. If not, the record ID is added to an `invalid_science_musem_ids` table.
Once complete, we can use the `science_museum_invalid_ids` to identify records to delete. They are not automatically deleted by this DAG, in order to give us an opportunity to first see how many there are.
"""

import itertools
import logging
from datetime import timedelta
from textwrap import dedent

from airflow.decorators import dag, task
from airflow.exceptions import AirflowSkipException
from airflow.models.abstractoperator import AbstractOperator
from airflow.models.param import Param

from common import slack
from common.constants import DAG_DEFAULT_ARGS, POSTGRES_CONN_ID
from common.sql import RETURN_ROW_COUNT, PostgresHook
from common.urls import rewrite_redirected_url


logger = logging.getLogger(__name__)


DAG_ID = "update_science_museum_urls"
INVALID_IDS_TABLE = "science_museum_invalid_ids"
IDS_TO_UPDATE_TABLE = "temp_science_museum_rows_to_validate"

UPDATE_URLS_QUERY = dedent(
    """
    UPDATE image
    SET url=REPLACE(url, '/images', '')
    WHERE provider='sciencemuseum' AND url ILIKE '%/images/%';
    """
)
CREATE_TABLE_QUERY = dedent(
    f"""
    CREATE TABLE IF NOT EXISTS {INVALID_IDS_TABLE}
    (identifier uuid);
    """
)
CREATE_TEMP_TABLE_QUERY = dedent(
    f"""
    CREATE TABLE {IDS_TO_UPDATE_TABLE} AS
    SELECT ROW_NUMBER() OVER() row_id, identifier, url
    FROM image
    WHERE provider='sciencemuseum';
    """
)
DROP_TEMP_TABLE_QUERY = f"DROP TABLE {IDS_TO_UPDATE_TABLE};"
GET_BATCH_QUERY = dedent(
    """
    SELECT identifier, url
    FROM {table_name}
    WHERE row_id > {start} AND row_id <= {end};
    """
)


@task
def run_sql(
    sql: str,
    handler: callable = None,
    postgres_conn_id: str = POSTGRES_CONN_ID,
    task: AbstractOperator = None,
):
    postgres = PostgresHook(
        postgres_conn_id=postgres_conn_id,
        default_statement_timeout=PostgresHook.get_execution_timeout(task),
    )

    return postgres.run(sql, handler=handler)


@task
def get_batches(
    total_count: int,
    batch_size: int,
):
    return [
        {"start": i, "end": i + batch_size} for i in range(0, total_count, batch_size)
    ]
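

# For illustration: total_count=7 with batch_size=3 yields
# [{"start": 0, "end": 3}, {"start": 3, "end": 6}, {"start": 6, "end": 9}];
# each batch is the half-open interval (start, end] consumed by GET_BATCH_QUERY.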


@task(max_active_tis_per_dagrun=3)
def process_batch(
    start: int,
    end: int,
    postgres_conn_id: str = POSTGRES_CONN_ID,
    task: AbstractOperator = None,
):
    invalid_ids = []

    postgres = PostgresHook(
        postgres_conn_id=postgres_conn_id,
        default_statement_timeout=PostgresHook.get_execution_timeout(task),
    )
    records = postgres.get_records(
        GET_BATCH_QUERY.format(table_name=IDS_TO_UPDATE_TABLE, start=start, end=end)
    )

    for identifier, url in records:
        # Failed to reach URL
        if rewrite_redirected_url(url) is None:
            invalid_ids.append(identifier)

    return invalid_ids


@task
def record_invalid_ids(invalid_ids):
    # Flatten the per-batch lists of ids; materialize the chain into a list so
    # the emptiness check below works (a lazy chain object is always truthy)
    ids_to_record = list(itertools.chain.from_iterable(invalid_ids))

    if not ids_to_record:
        raise AirflowSkipException("No invalid urls found!")

    values = ", ".join([f"('{id}')" for id in ids_to_record])
    query = dedent(
        f"""
        INSERT INTO {INVALID_IDS_TABLE} (identifier)
        VALUES {values}
        """
    )
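    # e.g. for two flagged records the rendered statement looks like (illustrative):
    #   INSERT INTO science_museum_invalid_ids (identifier)
    #   VALUES ('11111111-1111-1111-1111-111111111111'), ('22222222-2222-2222-2222-222222222222')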

    return run_sql.function(sql=query, handler=RETURN_ROW_COUNT)


@task
def notify_slack(invalid_count: int):
    slack.send_message(
        f"Detected {invalid_count} invalid Science Museum urls.", dag_id=DAG_ID
    )


@dag(
    dag_id=DAG_ID,
    schedule=None,
    catchup=False,
    tags=["data_normalization"],
    doc_md=__doc__,
    default_args={
        **DAG_DEFAULT_ARGS,
        "retries": 0,
        "execution_timeout": timedelta(days=2),
    },
    render_template_as_native_obj=True,
    params={
        "batch_size": Param(
            default=250,
            type="integer",
            description="The number of records to update per batch.",
        ),
    },
)
def update_science_museum_urls():
    # Update all URLs to have the correct format
    update = run_sql.override(task_id="update_url_format")(sql=UPDATE_URLS_QUERY)

    # Create table to track ids of records that still have an
    # invalid/unreachable url
    create_table = run_sql.override(task_id="create_invalid_id_table")(
        sql=CREATE_TABLE_QUERY
    )

    # Temp table used to store the information that needs to be validated
    create_temp_table = run_sql.override(task_id="create_temp_table")(
        sql=CREATE_TEMP_TABLE_QUERY, handler=RETURN_ROW_COUNT
    )

    # Split records into batches
    batches = get_batches(
        total_count=create_temp_table, batch_size="{{ params.batch_size }}"
    )

    # Verify each record's url
    process = process_batch.expand_kwargs(batches)

    # Record the identifiers of records with invalid urls
    record = record_invalid_ids(process)

    # Drop the temp table.
    drop_temp_table = run_sql.override(task_id="drop_temp_table")(
        sql=DROP_TEMP_TABLE_QUERY
    )

    # Report the number of invalid records to Slack
    notify_slack(record)

    update >> create_table >> process
    record >> drop_temp_table
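
    # NB: the remaining ordering (create_temp_table >> batches >> process >>
    # record >> notify_slack) is already implied by data dependencies; only
    # the edges with no data flow between tasks are declared explicitly here.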


update_science_museum_urls()
11 changes: 8 additions & 3 deletions catalog/dags/providers/provider_api_scripts/science_museum.py
@@ -17,6 +17,7 @@
from common import slack
from common.licenses import LicenseInfo, get_license_info
from common.loader import provider_details as prov
from common.urls import rewrite_redirected_url
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester


@@ -166,9 +167,13 @@ def _get_creator_info(attributes):
    def check_url(url: str | None) -> str | None:
        if not url:
            return None
-       if url.startswith("http"):
-           return url
-       return f"https://coimages.sciencemuseumgroup.org.uk/{url}"
+
+       # Will return None if url 403s
+       return rewrite_redirected_url(
+           url
+           if url.startswith("http")
+           else f"https://coimages.sciencemuseumgroup.org.uk/{url}"
+       )

    @staticmethod
    def _get_dimensions(image_data: dict) -> tuple[int | None, int | None]:
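To make the new `check_url` behavior concrete: relative paths are prefixed with the image host before validation, complete urls are passed through, and anything unreachable comes back as `None`. A rough, self-contained sketch with `rewrite_redirected_url` stubbed to a pass-through, as the updated tests below do (the paths are illustrative):

```python
def rewrite_redirected_url(url: str) -> str | None:
    # Stub: pretend every url is reachable and not redirected
    return url


def check_url(url: str | None) -> str | None:
    if not url:
        return None
    return rewrite_redirected_url(
        url
        if url.startswith("http")
        else f"https://coimages.sciencemuseumgroup.org.uk/{url}"
    )


assert check_url("3/563/thumb.jpg") == (
    "https://coimages.sciencemuseumgroup.org.uk/3/563/thumb.jpg"
)
assert check_url("https://example.com/a.jpg") == "https://example.com/a.jpg"
assert check_url(None) is None
```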
19 changes: 14 additions & 5 deletions catalog/tests/dags/providers/provider_api_scripts/test_science_museum.py
@@ -9,6 +9,7 @@
from common.licenses import get_license_info
from common.loader import provider_details as prov
from common.storage.image import ImageStore
from providers.provider_api_scripts import science_museum
from providers.provider_api_scripts.science_museum import ScienceMuseumDataIngester


@@ -25,6 +26,14 @@
)


@pytest.fixture
def mock_rewrite_redirected_url(monkeypatch):
    def mock_rewrite(url_string):
        return url_string

    monkeypatch.setattr(science_museum, "rewrite_redirected_url", mock_rewrite)


@pytest.fixture(autouse=True)
def after_test():
    yield
@@ -158,7 +167,7 @@ def test_get_record_data_returns_empty_list_for_falsy_image_required_values(reco
    assert actual_record_data == []


-def test_get_record_data_success(object_data):
+def test_get_record_data_success(object_data, mock_rewrite_redirected_url):
    actual_record_data = sm.get_record_data(object_data)
    actual_image_data = actual_record_data[0]
    assert len(actual_record_data) == 3
@@ -209,7 +218,7 @@ def test_creator_info_fail(object_data):
    assert actual_creator is None


-def test_image_info_large():
+def test_image_info_large(mock_rewrite_redirected_url):
    large_image = _get_resource_json("large_image.json")
    actual_image, actual_height, actual_width, actual_filetype, actual_filesize = (
        sm._get_image_info(large_image)
@@ -230,7 +239,7 @@ def test_image_info_large():
    assert actual_filesize == expected_filesize


-def test_image_info_medium():
+def test_image_info_medium(mock_rewrite_redirected_url):
    medium_image = _get_resource_json("medium_image.json")
    actual_url, actual_height, actual_width, actual_filetype, actual_filesize = (
        sm._get_image_info(medium_image)
@@ -262,7 +271,7 @@ def test_image_info_failure():
    assert actual_filesize is None


-def test_check_relative_url():
+def test_check_relative_url(mock_rewrite_redirected_url):
    rel_url = "3/563/large_thumbnail_1999_0299_0001__0002_.jpg"
    actual_url = sm.check_url(rel_url)
    expected_url = (
@@ -273,7 +282,7 @@ def test_check_relative_url():
    assert actual_url == expected_url


-def test_check_complete_url():
+def test_check_complete_url(mock_rewrite_redirected_url):
    url = (
        "https://coimages.sciencemuseumgroup.org.uk/3/563/"
        "large_thumbnail_1999_0299_0001__0002_.jpg"
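One note on the `mock_rewrite_redirected_url` fixture above: it patches `rewrite_redirected_url` on the `science_museum` module rather than on `common.urls`, because `from common.urls import rewrite_redirected_url` binds the name into the importing module's namespace; patching the original location would have no effect on the code under test. The equivalent one-liner, for reference:

```python
# Patch the name where it is looked up, not where it is defined
monkeypatch.setattr(science_museum, "rewrite_redirected_url", lambda url: url)
```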
28 changes: 25 additions & 3 deletions documentation/catalog/reference/DAGs.md
@@ -26,9 +26,10 @@ The following are DAGs grouped by their primary tag:

### Data Normalization

-| DAG ID                                | Schedule Interval |
-| ------------------------------------- | ----------------- |
-| [`add_license_url`](#add_license_url) | `None`            |
+| DAG ID                                                      | Schedule Interval |
+| ----------------------------------------------------------- | ----------------- |
+| [`add_license_url`](#add_license_url)                       | `None`            |
+| [`update_science_museum_urls`](#update_science_museum_urls) | `None`            |

### Data Refresh

@@ -170,6 +171,7 @@ The following is documentation associated with each DAG (where available):
1. [`smk_workflow`](#smk_workflow)
1. [`staging_database_restore`](#staging_database_restore)
1. [`stocksnap_workflow`](#stocksnap_workflow)
1. [`update_science_museum_urls`](#update_science_museum_urls)
1. [`wikimedia_commons_workflow`](#wikimedia_commons_workflow)
1. [`wikimedia_reingestion_workflow`](#wikimedia_commons_workflow)
1. [`wordpress_workflow`](#wordpress_workflow)
@@ -1055,6 +1057,26 @@ authorization required. API is undocumented.

----

### `update_science_museum_urls`

#### Update Science Museum URLs

One-time maintenance DAG to update Science Museum records to have valid URLs.
See https://github.com/WordPress/openverse/issues/4261.

For each Science Museum record, this DAG:

- updates the url to the new format, excluding `/images/` in the path if it
exists
- validates whether the url is reachable. If not, the record ID is added to a
  `science_museum_invalid_ids` table.

Once complete, we can use the `science_museum_invalid_ids` table to identify
records to delete. They are not automatically deleted by this DAG, in order to
give us an opportunity to first see how many there are.
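
A hypothetical follow-up once the DAG has run (not part of this change, and the
connection details below are assumptions): count the flagged records before
deciding whether to delete them.

```python
import psycopg2

# Assumed connection string; substitute the catalog's actual credentials
conn = psycopg2.connect("dbname=openledger user=deploy")

with conn, conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM science_museum_invalid_ids;")
    print(f"{cur.fetchone()[0]} Science Museum records have unreachable urls")
```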

----

### `wikimedia_commons_workflow`

**Content Provider:** Wikimedia Commons
