Update Science museum urls (#4276)
* Update Science Museum DAG to check if urls are reachable

* Add one-time DAG to update existing urls

* Fix docstrings, more performant chain

* Update mocks in tests

* Update dag docs

* Decrease batch size, limit number of concurrent tasks

* Commit ids to update to a temp table and drop it later
stacimc authored May 9, 2024
1 parent 43d4d09 commit cf9e6a8
Showing 4 changed files with 250 additions and 11 deletions.
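
For context, the change boils down to rewriting each stored Science Museum url to drop the `/images` path segment, then verifying that the rewritten url is reachable. A minimal sketch of the rewrite (the url here is illustrative, not taken from the commit):

```python
old_url = "https://coimages.sciencemuseumgroup.org.uk/images/3/563/example.jpg"

# Mirrors the SQL REPLACE(url, '/images', '') that the DAG below applies in bulk
new_url = old_url.replace("/images", "")

assert new_url == "https://coimages.sciencemuseumgroup.org.uk/3/563/example.jpg"
```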
203 changes: 203 additions & 0 deletions catalog/dags/maintenance/update_science_museum_urls.py
@@ -0,0 +1,203 @@
"""
# Update Science Museum URLs
One-time maintenance DAG to update Science Museum records to have valid URLs. See https://github.com/WordPress/openverse/issues/4261.
For each Science Museum record, this DAG:
* updates the url to the new format, excluding `/images/` in the path if it exists
* validates whether the url is reachable. If not, the record ID is added to an `invalid_science_musem_ids` table.
Once complete, we can use the `science_museum_invalid_ids` to identify records to delete. They are not automatically deleted by this DAG, in order to give us an opportunity to first see how many there are.
"""

import itertools
import logging
from datetime import timedelta
from textwrap import dedent

from airflow.decorators import dag, task
from airflow.exceptions import AirflowSkipException
from airflow.models.abstractoperator import AbstractOperator
from airflow.models.param import Param

from common import slack
from common.constants import DAG_DEFAULT_ARGS, POSTGRES_CONN_ID
from common.sql import RETURN_ROW_COUNT, PostgresHook
from common.urls import rewrite_redirected_url


logger = logging.getLogger(__name__)


DAG_ID = "update_science_museum_urls"
INVALID_IDS_TABLE = "science_museum_invalid_ids"
IDS_TO_UPDATE_TABLE = "temp_science_museum_rows_to_validate"

UPDATE_URLS_QUERY = dedent(
    """
    UPDATE image
    SET url=REPLACE(url, '/images', '')
    WHERE provider='sciencemuseum' AND url ILIKE '%/images/%';
    """
)
CREATE_TABLE_QUERY = dedent(
    f"""
    CREATE TABLE IF NOT EXISTS {INVALID_IDS_TABLE}
    (identifier uuid);
    """
)
CREATE_TEMP_TABLE_QUERY = dedent(
    f"""
    CREATE TABLE {IDS_TO_UPDATE_TABLE} AS
    SELECT ROW_NUMBER() OVER() row_id, identifier, url
    FROM image
    WHERE provider='sciencemuseum';
    """
)
DROP_TEMP_TABLE_QUERY = f"DROP TABLE {IDS_TO_UPDATE_TABLE};"
GET_BATCH_QUERY = dedent(
    """
    SELECT identifier, url
    FROM {table_name}
    WHERE row_id > {start} AND row_id <= {end};
    """
)


@task
def run_sql(
    sql: str,
    handler: callable = None,
    postgres_conn_id: str = POSTGRES_CONN_ID,
    task: AbstractOperator = None,
):
    postgres = PostgresHook(
        postgres_conn_id=postgres_conn_id,
        default_statement_timeout=PostgresHook.get_execution_timeout(task),
    )

    return postgres.run(sql, handler=handler)


@task
def get_batches(
    total_count: int,
    batch_size: int,
):
    return [
        {"start": i, "end": i + batch_size} for i in range(0, total_count, batch_size)
    ]
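

# For illustration: total_count=7 with batch_size=3 yields
# [{"start": 0, "end": 3}, {"start": 3, "end": 6}, {"start": 6, "end": 9}];
# each batch is the half-open interval (start, end] consumed by GET_BATCH_QUERY.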


@task(max_active_tis_per_dagrun=3)
def process_batch(
    start: int,
    end: int,
    postgres_conn_id: str = POSTGRES_CONN_ID,
    task: AbstractOperator = None,
):
    invalid_ids = []

    postgres = PostgresHook(
        postgres_conn_id=postgres_conn_id,
        default_statement_timeout=PostgresHook.get_execution_timeout(task),
    )
    records = postgres.get_records(
        GET_BATCH_QUERY.format(table_name=IDS_TO_UPDATE_TABLE, start=start, end=end)
    )

    for identifier, url in records:
        # Failed to reach URL
        if rewrite_redirected_url(url) is None:
            invalid_ids.append(identifier)

    return invalid_ids


@task
def record_invalid_ids(invalid_ids):
    # Flatten the per-batch lists of ids; materialize the chain into a list so
    # the emptiness check below works (a lazy chain object is always truthy)
    ids_to_record = list(itertools.chain.from_iterable(invalid_ids))

    if not ids_to_record:
        raise AirflowSkipException("No invalid urls found!")

    values = ", ".join([f"('{id}')" for id in ids_to_record])
    query = dedent(
        f"""
        INSERT INTO {INVALID_IDS_TABLE} (identifier)
        VALUES {values}
        """
    )
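    # e.g. for two flagged records the rendered statement looks like (illustrative):
    #   INSERT INTO science_museum_invalid_ids (identifier)
    #   VALUES ('11111111-1111-1111-1111-111111111111'), ('22222222-2222-2222-2222-222222222222')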

    return run_sql.function(sql=query, handler=RETURN_ROW_COUNT)


@task
def notify_slack(invalid_count: int):
    slack.send_message(
        f"Detected {invalid_count} invalid Science Museum urls.", dag_id=DAG_ID
    )


@dag(
    dag_id=DAG_ID,
    schedule=None,
    catchup=False,
    tags=["data_normalization"],
    doc_md=__doc__,
    default_args={
        **DAG_DEFAULT_ARGS,
        "retries": 0,
        "execution_timeout": timedelta(days=2),
    },
    render_template_as_native_obj=True,
    params={
        "batch_size": Param(
            default=250,
            type="integer",
            description="The number of records to update per batch.",
        ),
    },
)
def update_science_museum_urls():
    # Update all URLs to have the correct format
    update = run_sql.override(task_id="update_url_format")(sql=UPDATE_URLS_QUERY)

    # Create table to track ids of records that still have an
    # invalid/unreachable url
    create_table = run_sql.override(task_id="create_invalid_id_table")(
        sql=CREATE_TABLE_QUERY
    )

    # Temp table used to store the information that needs to be validated
    create_temp_table = run_sql.override(task_id="create_temp_table")(
        sql=CREATE_TEMP_TABLE_QUERY, handler=RETURN_ROW_COUNT
    )

    # Split records into batches
    batches = get_batches(
        total_count=create_temp_table, batch_size="{{ params.batch_size }}"
    )

    # Verify each record's url
    process = process_batch.expand_kwargs(batches)

    # Record the identifiers of records with invalid urls
    record = record_invalid_ids(process)

    # Drop the temp table.
    drop_temp_table = run_sql.override(task_id="drop_temp_table")(
        sql=DROP_TEMP_TABLE_QUERY
    )

    # Report the number of invalid records to Slack
    notify_slack(record)

    update >> create_table >> process
    record >> drop_temp_table
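
    # NB: the remaining ordering (create_temp_table >> batches >> process >>
    # record >> notify_slack) is already implied by data dependencies; only
    # the edges with no data flow between tasks are declared explicitly here.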


update_science_museum_urls()
11 changes: 8 additions & 3 deletions catalog/dags/providers/provider_api_scripts/science_museum.py
@@ -17,6 +17,7 @@
from common import slack
from common.licenses import LicenseInfo, get_license_info
from common.loader import provider_details as prov
from common.urls import rewrite_redirected_url
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester


@@ -166,9 +167,13 @@ def _get_creator_info(attributes):
    def check_url(url: str | None) -> str | None:
        if not url:
            return None
-       if url.startswith("http"):
-           return url
-       return f"https://coimages.sciencemuseumgroup.org.uk/{url}"
+
+       # Will return None if url 403s
+       return rewrite_redirected_url(
+           url
+           if url.startswith("http")
+           else f"https://coimages.sciencemuseumgroup.org.uk/{url}"
+       )

    @staticmethod
    def _get_dimensions(image_data: dict) -> tuple[int | None, int | None]:
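To make the new `check_url` behavior concrete: relative paths are prefixed with the image host before validation, complete urls are passed through, and anything unreachable comes back as `None`. A rough, self-contained sketch with `rewrite_redirected_url` stubbed to a pass-through, as the updated tests below do (the paths are illustrative):

```python
def rewrite_redirected_url(url: str) -> str | None:
    # Stub: pretend every url is reachable and not redirected
    return url


def check_url(url: str | None) -> str | None:
    if not url:
        return None
    return rewrite_redirected_url(
        url
        if url.startswith("http")
        else f"https://coimages.sciencemuseumgroup.org.uk/{url}"
    )


assert check_url("3/563/thumb.jpg") == (
    "https://coimages.sciencemuseumgroup.org.uk/3/563/thumb.jpg"
)
assert check_url("https://example.com/a.jpg") == "https://example.com/a.jpg"
assert check_url(None) is None
```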
19 changes: 14 additions & 5 deletions catalog/tests/dags/providers/provider_api_scripts/test_science_museum.py
@@ -9,6 +9,7 @@
from common.licenses import get_license_info
from common.loader import provider_details as prov
from common.storage.image import ImageStore
from providers.provider_api_scripts import science_museum
from providers.provider_api_scripts.science_museum import ScienceMuseumDataIngester


@@ -25,6 +26,14 @@
)


@pytest.fixture
def mock_rewrite_redirected_url(monkeypatch):
    def mock_rewrite(url_string):
        return url_string

    monkeypatch.setattr(science_museum, "rewrite_redirected_url", mock_rewrite)


@pytest.fixture(autouse=True)
def after_test():
    yield
@@ -158,7 +167,7 @@ def test_get_record_data_returns_empty_list_for_falsy_image_required_values(reco
    assert actual_record_data == []


-def test_get_record_data_success(object_data):
+def test_get_record_data_success(object_data, mock_rewrite_redirected_url):
    actual_record_data = sm.get_record_data(object_data)
    actual_image_data = actual_record_data[0]
    assert len(actual_record_data) == 3
@@ -209,7 +218,7 @@ def test_creator_info_fail(object_data):
    assert actual_creator is None


-def test_image_info_large():
+def test_image_info_large(mock_rewrite_redirected_url):
    large_image = _get_resource_json("large_image.json")
    actual_image, actual_height, actual_width, actual_filetype, actual_filesize = (
        sm._get_image_info(large_image)
@@ -230,7 +239,7 @@ def test_image_info_large():
    assert actual_filesize == expected_filesize


-def test_image_info_medium():
+def test_image_info_medium(mock_rewrite_redirected_url):
    medium_image = _get_resource_json("medium_image.json")
    actual_url, actual_height, actual_width, actual_filetype, actual_filesize = (
        sm._get_image_info(medium_image)
@@ -262,7 +271,7 @@ def test_image_info_failure():
    assert actual_filesize is None


-def test_check_relative_url():
+def test_check_relative_url(mock_rewrite_redirected_url):
    rel_url = "3/563/large_thumbnail_1999_0299_0001__0002_.jpg"
    actual_url = sm.check_url(rel_url)
    expected_url = (
@@ -273,7 +282,7 @@ def test_check_relative_url():
    assert actual_url == expected_url


-def test_check_complete_url():
+def test_check_complete_url(mock_rewrite_redirected_url):
    url = (
        "https://coimages.sciencemuseumgroup.org.uk/3/563/"
        "large_thumbnail_1999_0299_0001__0002_.jpg"
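One note on the `mock_rewrite_redirected_url` fixture above: it patches `rewrite_redirected_url` on the `science_museum` module rather than on `common.urls`, because `from common.urls import rewrite_redirected_url` binds the name into the importing module's namespace; patching the original location would have no effect on the code under test. The equivalent one-liner, for reference:

```python
# Patch the name where it is looked up, not where it is defined
monkeypatch.setattr(science_museum, "rewrite_redirected_url", lambda url: url)
```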
28 changes: 25 additions & 3 deletions documentation/catalog/reference/DAGs.md
@@ -26,9 +26,10 @@ The following are DAGs grouped by their primary tag:

### Data Normalization

-| DAG ID                                | Schedule Interval |
-| ------------------------------------- | ----------------- |
-| [`add_license_url`](#add_license_url) | `None`            |
+| DAG ID                                                      | Schedule Interval |
+| ----------------------------------------------------------- | ----------------- |
+| [`add_license_url`](#add_license_url)                       | `None`            |
+| [`update_science_museum_urls`](#update_science_museum_urls) | `None`            |

### Data Refresh

@@ -170,6 +171,7 @@ The following is documentation associated with each DAG (where available):
1. [`smk_workflow`](#smk_workflow)
1. [`staging_database_restore`](#staging_database_restore)
1. [`stocksnap_workflow`](#stocksnap_workflow)
1. [`update_science_museum_urls`](#update_science_museum_urls)
1. [`wikimedia_commons_workflow`](#wikimedia_commons_workflow)
1. [`wikimedia_reingestion_workflow`](#wikimedia_commons_workflow)
1. [`wordpress_workflow`](#wordpress_workflow)
@@ -1055,6 +1057,26 @@ authorization required. API is undocumented.

----

### `update_science_museum_urls`

#### Update Science Museum URLs

One-time maintenance DAG to update Science Museum records to have valid URLs.
See https://github.com/WordPress/openverse/issues/4261.

For each Science Museum record, this DAG:

- updates the url to the new format, excluding `/images/` in the path if it
exists
- validates whether the url is reachable. If not, the record ID is added to a
  `science_museum_invalid_ids` table.

Once complete, we can use the `science_museum_invalid_ids` table to identify
records to delete. They are not automatically deleted by this DAG, in order to
give us an opportunity to first see how many there are.
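
A hypothetical follow-up once the DAG has run (not part of this change, and the
connection details below are assumptions): count the flagged records before
deciding whether to delete them.

```python
import psycopg2

# Assumed connection string; substitute the catalog's actual credentials
conn = psycopg2.connect("dbname=openledger user=deploy")

with conn, conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM science_museum_invalid_ids;")
    print(f"{cur.fetchone()[0]} Science Museum records have unreachable urls")
```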

----

### `wikimedia_commons_workflow`

**Content Provider:** Wikimedia Commons
