Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update ingestion script to support multiple S3 buckets #154

Merged
merged 4 commits into from
Jun 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ ingestion_script = /absolute/path/to/ingest-tarball.sh
metadata_file_extension = .meta.txt

[aws]
staging_bucket = eessi-staging
staging_buckets = eessi-staging, eessi-staging-2023.06

[cvmfs]
ingest_as_root = yes
Expand Down
28 changes: 14 additions & 14 deletions scripts/automated_ingestion/automated_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
REQUIRED_CONFIG = {
'secrets': ['aws_secret_access_key', 'aws_access_key_id', 'github_pat'],
'paths': ['download_dir', 'ingestion_script', 'metadata_file_extension'],
'aws': ['staging_bucket'],
'aws': ['staging_buckets'],
'github': ['staging_repo', 'failed_ingestion_issue_body', 'pr_body'],
}

Expand All @@ -39,10 +39,9 @@ def error(msg, code=1):
def find_tarballs(s3, bucket, extension='.tar.gz', metadata_extension='.meta.txt'):
"""Return a list of all tarballs in an S3 bucket that have a metadata file with the given extension (and same filename)."""
# TODO: list_objects_v2 only returns up to 1000 objects
files = [
object['Key']
for object in s3.list_objects_v2(Bucket=bucket)['Contents']
]
s3_objects = s3.list_objects_v2(Bucket=bucket).get('Contents', [])
files = [obj['Key'] for obj in s3_objects]

tarballs = [
file
for file in files
Expand Down Expand Up @@ -100,15 +99,16 @@ def main():
aws_secret_access_key=config['secrets']['aws_secret_access_key'],
)

tarballs = find_tarballs(s3, config['aws']['staging_bucket'])
if args.list_only:
for num, tarball in enumerate(tarballs):
print(f'{num}: {tarball}')
sys.exit(0)

for tarball in tarballs:
tar = EessiTarball(tarball, config, gh, s3)
tar.run_handler()
buckets = [x.strip() for x in config['aws']['staging_buckets'].split(',')]
for bucket in buckets:
tarballs = find_tarballs(s3, bucket)
if args.list_only:
for num, tarball in enumerate(tarballs):
print(f'{num}: {tarball}')
else:
for tarball in tarballs:
tar = EessiTarball(tarball, config, gh, s3, bucket)
tar.run_handler()


if __name__ == '__main__':
Expand Down
14 changes: 7 additions & 7 deletions scripts/automated_ingestion/eessitarball.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,18 @@ class EessiTarball:
for which it interfaces with the S3 bucket, GitHub, and CVMFS.
"""

def __init__(self, object_name, config, github, s3):
def __init__(self, object_name, config, github, s3, bucket):
"""Initialize the tarball object."""
self.config = config
self.github = github
self.git_repo = github.get_repo(config['github']['staging_repo'])
self.metadata_file = object_name + config['paths']['metadata_file_extension']
self.object = object_name
self.s3 = s3
self.bucket = bucket
self.local_path = os.path.join(config['paths']['download_dir'], os.path.basename(object_name))
self.local_metadata_path = self.local_path + config['paths']['metadata_file_extension']
self.url = f'https://{config["aws"]["staging_bucket"]}.s3.amazonaws.com/{object_name}'
self.url = f'https://{bucket}.s3.amazonaws.com/{object_name}'

self.states = {
'new': {'handler': self.mark_new_tarball_as_staged, 'next_state': 'staged'},
def download(self, force=False):
    """
    Download this tarball and its corresponding metadata file, if this hasn't been already done.

    Args:
        force: when True, re-download both files even if they already exist locally.

    Side effects:
        Writes the tarball to self.local_path and its metadata file to
        self.local_metadata_path. On a failed download the corresponding
        attribute is reset to None so callers can detect the missing file.
    """
    if force or not os.path.exists(self.local_path):
        try:
            self.s3.download_file(self.bucket, self.object, self.local_path)
        # A bare `except:` here would also swallow KeyboardInterrupt/SystemExit;
        # catch Exception and keep the traceback in the log for debugging.
        except Exception:
            logging.exception(
                f'Failed to download tarball {self.object} from {self.bucket} to {self.local_path}.'
            )
            self.local_path = None
    if force or not os.path.exists(self.local_metadata_path):
        try:
            self.s3.download_file(self.bucket, self.metadata_file, self.local_metadata_path)
        except Exception:
            logging.exception(
                f'Failed to download metadata file {self.metadata_file} from {self.bucket} to {self.local_metadata_path}.'
            )
            self.local_metadata_path = None
Expand Down