Skip to content

Commit

Permalink
Fix placing test S3 data into MinIO (#4495)
Browse files Browse the repository at this point in the history
* Fix placing test S3 data into MinIO

* Add inaturalist-open-data to BUCKETS_TO_CREATE

* Test the iNaturalist bucket exists locally
  • Loading branch information
krysal authored Jun 25, 2024
1 parent 722006e commit 04aa4dd
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 8 deletions.
7 changes: 4 additions & 3 deletions catalog/dags/providers/provider_api_scripts/inaturalist.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
}
OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", "/tmp/"))
COL_URL = "https://download.checklistbank.org/col/latest_coldp.zip"
INATURALIST_BUCKET = "inaturalist-open-data"


class INaturalistDataIngester(ProviderDataIngester):
Expand Down Expand Up @@ -202,9 +203,9 @@ def compare_update_dates(
for key in s3_keys:
# this will error out if the files don't exist, and bubble up as an
# informative failure
last_modified = s3_client.head_object(
Bucket="inaturalist-open-data", Key=key
)["LastModified"]
last_modified = s3_client.head_object(Bucket=INATURALIST_BUCKET, Key=key)[
"LastModified"
]
logger.info(
f"{key} was last modified on s3 on "
f"{last_modified.strftime('%Y-%m-%d %H:%M:%S')}."
Expand Down
5 changes: 2 additions & 3 deletions catalog/tests/dags/common/loader/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
TEST_MEDIA_PREFIX = "media"
TEST_STAGING_PREFIX = "test_staging"
S3_LOCAL_ENDPOINT = os.getenv("S3_LOCAL_ENDPOINT")
S3_TEST_BUCKET = f"cccatalog-storage-{TEST_ID}"
ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
SECRET_KEY = os.getenv("AWS_SECRET_KEY")
ACCESS_KEY = os.getenv("AWS_ACCESS_KEY", "test_key")
SECRET_KEY = os.getenv("AWS_SECRET_KEY", "test_secret")


@pytest.fixture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@
from pathlib import Path
from unittest import mock

import boto3
import pendulum
import pytest
from airflow.exceptions import AirflowSkipException
from airflow.models import TaskInstance
from tests.dags.common.loader.test_s3 import (
ACCESS_KEY,
S3_LOCAL_ENDPOINT,
SECRET_KEY,
)

from common.constants import IMAGE
from common.loader.reporting import RecordMetrics
Expand Down Expand Up @@ -232,3 +238,14 @@ def test_compare_update_dates(last_success, s3_dir, expected_msgs, caplog):
assert actual is None
for msg in expected_msgs:
assert msg in caplog.text


def test_bucket_exists():
    """Check that the iNaturalist bucket was created in the local MinIO instance."""
    s3_resource = boto3.resource(
        "s3",
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY,
        endpoint_url=S3_LOCAL_ENDPOINT,
    )
    inaturalist_bucket = s3_resource.Bucket(inaturalist.INATURALIST_BUCKET)

    # A bucket that does not exist reports no creation date.
    assert inaturalist_bucket.creation_date is not None
2 changes: 1 addition & 1 deletion docker/minio/env.template
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ MINIO_ROOT_USER=test_key
MINIO_ROOT_PASSWORD=test_secret

# Comma separated list of buckets to create on startup
BUCKETS_TO_CREATE=openverse-catalog,openverse-airflow-logs
BUCKETS_TO_CREATE=openverse-catalog,openverse-airflow-logs,inaturalist-open-data
2 changes: 1 addition & 1 deletion docker/minio/load_to_s3_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# More info here: https://stackoverflow.com/questions/72867045
set -euxo pipefail

/usr/bin/mc config host add s3 http://s3:5000 "${AWS_ACCESS_KEY}" "${AWS_SECRET_KEY}"
/usr/bin/mc config host add s3 http://s3:5000 "${MINIO_ROOT_USER}" "${MINIO_ROOT_PASSWORD}"
cd /data
for b in */; do
echo "Loading bucket $b"
Expand Down

0 comments on commit 04aa4dd

Please sign in to comment.