-
Notifications
You must be signed in to change notification settings - Fork 216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Staging database restore DAG: base restore #2099
Changes from all commits
c6f5a07
05191e5
5f9c762
f3dbd38
b8502fd
4bd03c8
62078ba
1cdb3c9
8ddf498
56af8ca
5efd289
5731465
939dcc1
4eba5bc
6fe70ad
ca0459e
6d6ed95
82d0fa7
4dcbf06
e373836
1a99bf7
71069cf
a08ea01
a8e4289
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import os | ||
|
||
from common.constants import AWS_CONN_ID | ||
|
||
|
||
# Template for RDS instance identifiers; the placeholder is the environment name.
_ID_FORMAT = "{}-openverse-db"

DAG_ID = "staging_database_restore"
PROD_IDENTIFIER = _ID_FORMAT.format("prod")
STAGING_IDENTIFIER = _ID_FORMAT.format("dev")
TEMP_IDENTIFIER = _ID_FORMAT.format("dev-next")
OLD_IDENTIFIER = _ID_FORMAT.format("dev-old")

# Identifiers that may be mutated. The production identifier is deliberately
# absent from this set.
SAFE_TO_MUTATE = {STAGING_IDENTIFIER, TEMP_IDENTIFIER, OLD_IDENTIFIER}
|
||
# Name of the Airflow Variable that, when set to `true`, skips the restore.
SKIP_VARIABLE = "SKIP_STAGING_DATABASE_RESTORE"
# Connection ID for RDS operations; falls back to the shared AWS connection ID
# when the AWS_RDS_CONN_ID environment variable is not set.
AWS_RDS_CONN_ID = os.environ.get("AWS_RDS_CONN_ID", AWS_CONN_ID)
# Presentation settings for Slack messages sent by this DAG.
SLACK_USERNAME = "Staging Database Restore"
SLACK_ICON = ":database-pink:"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
import logging | ||
from datetime import timedelta | ||
from pprint import pformat | ||
|
||
from airflow.decorators import task | ||
from airflow.models import Variable | ||
from airflow.providers.amazon.aws.hooks.rds import RdsHook | ||
from airflow.providers.amazon.aws.sensors.rds import RdsDbSensor | ||
from airflow.utils.task_group import TaskGroup | ||
from airflow.utils.trigger_rule import TriggerRule | ||
|
||
from common import slack | ||
from database.staging_database_restore import constants | ||
from database.staging_database_restore.utils import ( | ||
ensure_mutate_allowed, | ||
setup_rds_hook, | ||
) | ||
|
||
|
||
# Keys kept from the staging instance description; every other key is dropped
# before the configuration is logged or returned.
REQUIRED_DB_INFO = {
    "MultiAZ",
    "AvailabilityZone",
    "VpcSecurityGroups",
    "DBSubnetGroup",
    "PubliclyAccessible",
    "DBInstanceClass",
    "AllocatedStorage",
}
|
||
|
||
# Module-level logger shared by the tasks in this file.
log = logging.getLogger(__name__)
|
||
|
||
@task.short_circuit
def skip_restore(should_skip: bool = False) -> bool:
    """
    Determine whether to skip the restore process.

    Can be overridden by setting the `SKIP_STAGING_DATABASE_RESTORE` Airflow Variable
    to `true`.
    Should return `True` to have the DAG continue, and `False` to have it skipped.
    https://docs.astronomer.io/learn/airflow-branch-operator#taskshort_circuit-shortcircuitoperator

    :param should_skip: force a skip regardless of the Airflow Variable. This is
        not passed in normal control flow, but is useful for testing.
    :return: `True` if the restore should proceed, `False` if it is skipped.
    """
    should_continue = not (
        should_skip
        or Variable.get(
            constants.SKIP_VARIABLE, default_var=False, deserialize_json=True
        )
    )
    if not should_continue:
        # Call the function behind the `notify_slack` task directly, from
        # within this task.
        notify_slack.function(
            "The staging database restore has been skipped. "
            # The trailing space matters: these adjacent literals are joined,
            # and without it the message reads "`false`to disable".
            f"(Set the `{constants.SKIP_VARIABLE}` Airflow Variable to `false` "
            "to disable this behavior.)"
        )
    return should_continue
|
||
|
||
@task
@setup_rds_hook
def get_latest_prod_snapshot(rds_hook: RdsHook = None) -> str:
    """
    Get the latest automated snapshot for the production database.

    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/describe_db_snapshots.html
    Status is checked using a sensor in a later step, in case a snapshot creation is
    currently in progress.

    :param rds_hook: RDS hook used to reach the boto3 client; defaults to `None`
        and is presumably supplied by `setup_rds_hook` (confirm in utils).
    :return: the `DBSnapshotIdentifier` of the most recently created snapshot.
    :raises ValueError: if no automated snapshots exist for production.
    """
    # Get snapshots
    snapshots = rds_hook.conn.describe_db_snapshots(
        DBInstanceIdentifier=constants.PROD_IDENTIFIER,
        SnapshotType="automated",
    ).get("DBSnapshots", [])
    if not snapshots:
        raise ValueError(f"No snapshots found for {constants.PROD_IDENTIFIER}")
    # Pick the snapshot with the most recent creation time
    latest_snapshot = max(snapshots, key=lambda x: x["SnapshotCreateTime"])
    log.info(f"Latest snapshot: {latest_snapshot}")
    return latest_snapshot["DBSnapshotIdentifier"]
|
||
|
||
@task
@setup_rds_hook
def get_staging_db_details(rds_hook: RdsHook = None) -> dict:
    """
    Retrieve the details of the staging database. Only some details are required (and
    others are actually sensitive) so filter down to only what we need.

    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/describe_db_instances.html

    :param rds_hook: RDS hook used to reach the boto3 client; defaults to `None`
        and is presumably supplied by `setup_rds_hook` (confirm in utils).
    :return: the filtered staging configuration, with `DBSubnetGroup` replaced by
        `DBSubnetGroupName` and `VpcSecurityGroups` by `VpcSecurityGroupIds`.
    :raises ValueError: if no staging instance is found.
    """
    # Get staging DB details
    instances = rds_hook.conn.describe_db_instances(
        DBInstanceIdentifier=constants.STAGING_IDENTIFIER,
    ).get("DBInstances", [])
    if not instances:
        raise ValueError(f"No staging DB found for {constants.STAGING_IDENTIFIER}")
    staging_db = instances[0]
    # While it might be tempting to log this information, it contains sensitive
    # values. Instead, we'll select only the information we need, then log that.
    staging_db = {
        key: value for key, value in staging_db.items() if key in REQUIRED_DB_INFO
    }
    # Pull the DBSubnetGroup name out of the DBSubnetGroup object
    staging_db["DBSubnetGroupName"] = staging_db.pop("DBSubnetGroup")[
        "DBSubnetGroupName"
    ]
    # Pull the VPC IDs out of the VpcSecurityGroups objects
    staging_db["VpcSecurityGroupIds"] = [
        vpc["VpcSecurityGroupId"] for vpc in staging_db.pop("VpcSecurityGroups")
    ]
    log.info(f"Staging DB config: \n{pformat(staging_db)}")
    return staging_db
|
||
|
||
@task
@setup_rds_hook
def restore_staging_from_snapshot(
    latest_snapshot: str, staging_config: dict, rds_hook: RdsHook = None
) -> None:
    """
    Restore the staging database from the latest snapshot.

    Augment the restore operation with the existing details determined from
    a previous step.
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/restore_db_instance_from_db_snapshot.html

    :param latest_snapshot: identifier of the snapshot to restore from.
    :param staging_config: extra keyword arguments passed through to the restore
        call.
    :param rds_hook: RDS hook used to reach the boto3 client; defaults to `None`
        and is presumably supplied by `setup_rds_hook` (confirm in utils).
    """
    log.info(
        f"Creating a new {constants.TEMP_IDENTIFIER} instance from {latest_snapshot} "
        f"with: \n{pformat(staging_config)}"
    )
    # The restore targets the temporary identifier, not the staging identifier.
    rds_hook.conn.restore_db_instance_from_db_snapshot(
        DBInstanceIdentifier=constants.TEMP_IDENTIFIER,
        DBSnapshotIdentifier=latest_snapshot,
        **staging_config,
    )
|
||
|
||
@task
@setup_rds_hook
def rename_db_instance(source: str, target: str, rds_hook: RdsHook = None) -> None:
    """
    Rename a database instance.

    This can only be run on instances where mutation is allowed.
    https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/rds/client/modify_db_instance.html

    :param source: current identifier of the instance to rename.
    :param target: new identifier to give the instance.
    :param rds_hook: RDS hook used to reach the boto3 client; defaults to `None`
        and is presumably supplied by `setup_rds_hook` (confirm in utils).
    """
    log.info("Checking input values")
    # Both names are checked before any change is requested.
    ensure_mutate_allowed(source)
    ensure_mutate_allowed(target)
    log.info(f"Renaming {source} to {target}")
    rds_hook.conn.modify_db_instance(
        DBInstanceIdentifier=source,
        NewDBInstanceIdentifier=target,
        ApplyImmediately=True,
    )
|
||
|
||
@task
def notify_slack(text: str) -> None:
    """
    Send a message to Slack using this DAG's username and icon.

    :param text: the message body to send.
    """
    slack.send_message(
        text,
        username=constants.SLACK_USERNAME,
        icon_emoji=constants.SLACK_ICON,
        dag_id=constants.DAG_ID,
    )
|
||
|
||
def make_rds_sensor(task_id: str, db_identifier: str, retries: int = 0) -> RdsDbSensor:
    """
    Build a sensor that waits for a database instance to report `available`.

    :param task_id: task ID to give the sensor.
    :param db_identifier: identifier of the database instance to watch.
    :param retries: number of retries for the sensor, one minute apart.
    :return: the configured sensor.
    """
    return RdsDbSensor(
        task_id=task_id,
        db_identifier=db_identifier,
        target_statuses=["available"],
        aws_conn_id=constants.AWS_RDS_CONN_ID,
        mode="reschedule",
        timeout=60 * 60,  # 1 hour
        retries=retries,
        retry_delay=timedelta(minutes=1),
    )
|
||
|
||
def make_rename_task_group(
    source: str,
    target: str,
    trigger_rule: TriggerRule = TriggerRule.ALL_SUCCESS,
) -> TaskGroup:
    """
    Create a task group which includes both a rename operation, and a sensor to wait
    for the new database to be ready. This requires retries because the database
    may not be ready immediately after the rename when the first await is tried.

    :param source: current identifier of the instance to rename.
    :param target: new identifier to give the instance.
    :param trigger_rule: trigger rule applied to the rename task.
    :return: the task group containing the rename task followed by the sensor.
    """
    # Drop the shared suffix so group and task IDs stay short.
    source_name = source.removesuffix("-openverse-db")
    target_name = target.removesuffix("-openverse-db")
    with TaskGroup(group_id=f"rename_{source_name}_to_{target_name}") as rename_group:
        rename = rename_db_instance.override(
            task_id=f"rename_{source_name}_to_{target_name}",
            trigger_rule=trigger_rule,
        )(
            source=source,
            target=target,
        )
        await_rename = make_rds_sensor(
            task_id=f"await_{target_name}",
            db_identifier=target,
            retries=2,
        )
        rename >> await_rename

    return rename_group
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very cool, TIL!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stumbled across this one by accident while looking for something else, couldn't help but use it!