diff --git a/MANIFEST.in b/MANIFEST.in
index 5f5f127..310b96b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -50,6 +50,7 @@ recursive-include examples *.sh
 recursive-include cds_migrator_kit *.gitkeep
 recursive-include cds_migrator_kit *.md
 recursive-include cds_migrator_kit *.yaml
+recursive-include scripts *.py
 exclude scripts/migrator.sh
diff --git a/cds_migrator_kit/migration_config.py b/cds_migrator_kit/migration_config.py
index 95e42cd..8c75eca 100644
--- a/cds_migrator_kit/migration_config.py
+++ b/cds_migrator_kit/migration_config.py
@@ -355,3 +355,4 @@ def _(x):  # needed to avoid start time failure with lazy strings
 base_path = os.path.dirname(os.path.realpath(__file__))
 logs_dir = os.path.join(base_path, "tmp/logs/")
 CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
+INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG = "cds_migrator_kit/rdm/migration/streams.yaml"
diff --git a/cds_migrator_kit/rdm/migration/README.md b/cds_migrator_kit/rdm/migration/README.md
index a1f5e2a..e4a488b 100644
--- a/cds_migrator_kit/rdm/migration/README.md
+++ b/cds_migrator_kit/rdm/migration/README.md
@@ -135,8 +135,32 @@
 current_rdm_records_service.indexer.bulk_index((rec.id for rec in records))
 ```
 
-### To visualise the errors:
+### To visualise the errors (locally):
 
 ```shell
 gunicorn -b :8080 --timeout 120 --graceful-timeout 60 cds_migrator_kit.app:app
 ```
+
+
+
+### Full migration workflow of one collection
+
+#### Legacy
+
+```shell
+ssh cds-wn-31 # inveniomigrator tool installed here
+kinit cdsrdmeosdev
+cd /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump
+inveniomigrator dump records -q '980__:NOTE 037__:CERN-STUDENTS-Note-* -980:DELETED' --file-prefix summer-studends-notes --latest-only --chunk-size=1000
+python copy_collection_files.py --dump-folder /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/dump --files-destination /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/files
+```
+
+
+#### OpenShift migration pod
+
+```shell
+invenio migration run
+```
+
+Visit https://migration-cds-rdm-dev.app.cern.ch for the report.
diff --git a/cds_migrator_kit/rdm/migration/cli.py b/cds_migrator_kit/rdm/migration/cli.py
index 9c46bab..cae0146 100644
--- a/cds_migrator_kit/rdm/migration/cli.py
+++ b/cds_migrator_kit/rdm/migration/cli.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 
 import click
+from flask import current_app
 from flask.cli import with_appcontext
 
 from cds_migrator_kit.rdm.migration.runner import Runner
@@ -26,8 +27,9 @@ def migration():
 @with_appcontext
 def run():
     """Run."""
+    stream_config = current_app.config["INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG"]
     runner = Runner(
         stream_definitions=[RecordStreamDefinition],
-        config_filepath=Path("cds_migrator_kit/rdm/migration/streams.yaml").absolute(),
+        config_filepath=Path(stream_config).absolute(),
     )
     runner.run()
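For context on the `cli.py` change above: `invenio migration run` now resolves the stream configuration from `current_app.config["INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG"]` (defaulting to the value registered in `migration_config.py`) instead of a hard-coded relative path, so a deployment such as the OpenShift migration pod can point it at a different `streams.yaml`. A minimal sketch of what that resolution amounts to outside the Flask CLI, assuming PyYAML is installed and the snippet is run from the repository root:

```python
from pathlib import Path

import yaml  # assumed available; streams.yaml is plain YAML

# Default registered in migration_config.py; deployments can override the
# INVENIO_CDS_MIGRATOR_KIT_STREAM_CONFIG key to point to another file.
stream_config = "cds_migrator_kit/rdm/migration/streams.yaml"

config_filepath = Path(stream_config).absolute()
with open(config_filepath) as f:
    streams = yaml.safe_load(f)

# With this patch the records stream carries both extract and transform settings:
print(streams["records"]["extract"]["dirpath"])
print(streams["records"]["transform"]["files_dump_dir"])
```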
diff --git a/cds_migrator_kit/rdm/migration/load.py b/cds_migrator_kit/rdm/migration/load.py
index 4a9d18c..93eb704 100644
--- a/cds_migrator_kit/rdm/migration/load.py
+++ b/cds_migrator_kit/rdm/migration/load.py
@@ -7,11 +7,19 @@
 
 """CDS-RDM migration load module."""
 
+import os
+
 from invenio_access.permissions import system_identity
 from invenio_rdm_migrator.load.base import Load
 from invenio_rdm_records.proxies import current_rdm_records_service
 
 
+def import_legacy_files(filepath):
+    """Download file from legacy."""
+    filestream = open(filepath, "rb")
+    return filestream
+
+
 class CDSRecordServiceLoad(Load):
     """CDSRecordServiceLoad."""
@@ -29,8 +37,34 @@ def _prepare(self, entry):
 
     def _load(self, entry):
         """Use the services to load the entries."""
-        identity = system_identity  # Should we create an idenity for the migration?
+        identity = system_identity  # Should we create an identity for the migration?
         draft = current_rdm_records_service.create(identity, entry["record"]["json"])
+        draft_files = entry["draft_files"]
+
+        for file in draft_files:
+            current_rdm_records_service.draft_files.init_files(
+                identity,
+                draft.id,
+                data=[
+                    {
+                        "key": file["key"],
+                        "metadata": file["metadata"],
+                        "access": {"hidden": False},
+                    }
+                ],
+            )
+            current_rdm_records_service.draft_files.set_file_content(
+                identity,
+                draft.id,
+                file["key"],
+                import_legacy_files(file["eos_tmp_path"]),
+            )
+            result = current_rdm_records_service.draft_files.commit_file(
+                identity, draft.id, file["key"]
+            )
+            legacy_checksum = f"md5:{file['checksum']}"
+            new_checksum = result.to_dict()["checksum"]
+            assert legacy_checksum == new_checksum
         current_rdm_records_service.publish(system_identity, draft["id"])
 
     def _cleanup(self, *args, **kwargs):
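The load step above verifies every imported file by comparing the legacy MD5 checksum, prefixed with `md5:`, against the checksum returned by `commit_file`. A minimal sketch of the same check done directly against a file copied to the temporary EOS location, which can help when that assertion fails; the helper name is illustrative, and the `eos_tmp_path`/`checksum` keys are the ones produced by the transform step below:

```python
import hashlib


def verify_copied_file(eos_tmp_path, legacy_md5):
    """Recompute the MD5 of the copied file and compare it in the same
    "md5:<hex>" form used by the load step (illustrative helper)."""
    md5 = hashlib.md5()
    with open(eos_tmp_path, "rb") as f:
        # read in 1 MiB chunks to avoid loading large files into memory
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            md5.update(chunk)
    return f"md5:{md5.hexdigest()}" == f"md5:{legacy_md5}"


# e.g. for one transformed draft_files entry:
# verify_copied_file(file["eos_tmp_path"], file["checksum"])
```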
diff --git a/cds_migrator_kit/rdm/migration/streams.yaml b/cds_migrator_kit/rdm/migration/streams.yaml
index 0384f94..fe961b0 100644
--- a/cds_migrator_kit/rdm/migration/streams.yaml
+++ b/cds_migrator_kit/rdm/migration/streams.yaml
@@ -8,3 +8,5 @@ new_secret_key: CHANGE_ME
 records:
   extract:
     dirpath: cds_migrator_kit/rdm/migration/data/summer_student_reports
+  transform:
+    files_dump_dir: cds_migrator_kit/rdm/migration/data/files/
diff --git a/cds_migrator_kit/rdm/migration/transform/transform.py b/cds_migrator_kit/rdm/migration/transform/transform.py
index d76d9b9..eb0cd02 100644
--- a/cds_migrator_kit/rdm/migration/transform/transform.py
+++ b/cds_migrator_kit/rdm/migration/transform/transform.py
@@ -9,6 +9,7 @@
 
 import datetime
 import logging
+from pathlib import Path
 
 from invenio_rdm_migrator.streams.records.transform import (
     RDMRecordEntry,
@@ -84,7 +85,8 @@ def _pids(self, json_entry):
 
     def _files(self, record_dump):
         """Transform the files of a record."""
-        files = record_dump.prepare_files()
+        record_dump.prepare_files()
+        files = record_dump.files
         return {"enabled": True if files else False}
 
     def _communities(self, json_entry):
@@ -158,6 +160,11 @@ def transform(self, entry):
 class CDSToRDMRecordTransform(RDMRecordTransform):
     """CDSToRDMRecordTransform."""
 
+    def __init__(self, workers=None, throw=False, files_dump_dir=None):
+        """Constructor."""
+        self.files_dump_dir = Path(files_dump_dir).absolute().as_posix()
+        super().__init__(workers, throw)
+
     def _community_id(self, entry, record):
         communities = record.get("communities")
         if communities:
@@ -201,26 +208,67 @@ def _transform(self, entry):
         }
 
     def _record(self, entry):
+        # could be in draft as well, depends on how we decide to publish
         return CDSToRDMRecordEntry().transform(entry)
 
     def _draft(self, entry):
         return None
 
     def _draft_files(self, entry):
-        return None
+        """Point to temporary eos storage to import files from."""
+        _files = entry["files"]
+        draft_files = []
+        legacy_path_root = Path("/opt/cdsweb/var/data/files/")
+        tmp_eos_root = Path(self.files_dump_dir)
+
+        for file in _files:
+            full_path = Path(file["full_path"])
+            draft_files.append(
+                {
+                    "eos_tmp_path": tmp_eos_root
+                    / full_path.relative_to(legacy_path_root),
+                    "key": file["full_name"],
+                    "metadata": {},
+                    "mimetype": file["mime"],
+                    "checksum": file["checksum"],
+                }
+            )
+        return draft_files
 
     def _record_files(self, entry, record):
-        # files = entry["json"].get("_files", [])
-        # return [
-        #     {
-        #         "key": f["key"],
-        #         "object_version": {
-        #             "file": {
-        #                 "size": f["size"],
-        #                 "checksum": f["checksum"],
-        #             },
-        #         },
-        #     }
-        #     for f in files
-        # ]
+        """Record files entries transform."""
+        # TO implement if we decide not to go via draft publish
         return []
+
+    #
+    #
+    # "files": [
+    #   {
+    #     "comment": null,
+    #     "status": "firerole: allow group \"council-full [CERN]\"\ndeny until \"1996-02-01\"\nallow all",
+    #     "version": 1,
+    #     "encoding": null,
+    #     "creation_date": "2009-11-03T12:29:06+00:00",
+    #     "bibdocid": 502379,
+    #     "mime": "application/pdf",
+    #     "full_name": "CM-P00080632-e.pdf",
+    #     "superformat": ".pdf",
+    #     "recids_doctype": [[32097, "Main", "CM-P00080632-e.pdf"]],
+    #     "path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1",
+    #     "size": 5033532,
+    #     "license": {},
+    #     "modification_date": "2009-11-03T12:29:06+00:00",
+    #     "copyright": {},
+    #     "url": "http://cds.cern.ch/record/32097/files/CM-P00080632-e.pdf",
+    #     "checksum": "ed797ce5d024dcff0040db79c3396da9",
+    #     "description": "English",
+    #     "format": ".pdf",
+    #     "name": "CM-P00080632-e",
+    #     "subformat": "",
+    #     "etag": "\"502379.pdf1\"",
+    #     "recid": 32097,
+    #     "flags": [],
+    #     "hidden": false,
+    #     "type": "Main",
+    #     "full_path": "/opt/cdsweb/var/data/files/g50/502379/CM-P00080632-e.pdf;1"
+    #   },]
diff --git a/scripts/copy_collection_files.py b/scripts/copy_collection_files.py
new file mode 100644
index 0000000..f681200
--- /dev/null
+++ b/scripts/copy_collection_files.py
@@ -0,0 +1,60 @@
+import argparse
+import json
+import os
+import shutil
+
+
+def copy_collection_file(dump_files, destination_prefix, working_dir):
+    file_log = open(os.path.join(working_dir, "files.log"), "w")
+
+    for dump_file in dump_files:
+        with open(os.path.join(working_dir, dump_file), "r") as json_dump:
+            data = json.load(json_dump)
+            for record in data:
+                legacy_record_files = record["files"]
+                for legacy_record_file in legacy_record_files:
+                    full_path = legacy_record_file["full_path"]
+                    # important: last slash
+                    path_to_replace = "/opt/cdsweb/var/data/files/"
+
+                    rel_path = full_path.replace(path_to_replace, "")
+                    destination_path = os.path.join(destination_prefix, rel_path)
+                    parent_dest_path = os.path.dirname(destination_path)
+                    if not os.path.exists(parent_dest_path):
+                        os.makedirs(parent_dest_path)
+                    shutil.copy(full_path, destination_path)
+                    file_log.writelines(
+                        [
+                            f"RECID: {record['recid']},"
+                            f" bibdocid: {legacy_record_file['bibdocid']}"
+                            f" file: {legacy_record_file['full_name']},"
+                            f" destination: {destination_path}"
+                        ]
+                    )
+    file_log.close()
+
+
+def get_dump_files_paths(working_dir):
+    dump_files = []
+    # get all dump files in the folder
+    for root, dirs, files in os.walk(working_dir, topdown=True):
+        dump_files += [os.path.join(root, filename) for filename in files]
+    return dump_files
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Copy files over script")
+    parser.add_argument(
+        "--dump-folder", metavar="path", required=True, help="the path to dump folder"
+    )
+    parser.add_argument(
+        "--files-destination",
+        metavar="path",
+        required=True,
+        help="path to destination folder on EOS",
+    )
+    args = parser.parse_args()
+
+    dump_folder = args.dump_folder
+
+    collection_dump_file_list = get_dump_files_paths(dump_folder)