diff --git a/.github/workflows/api-and-integration-tests.yml b/.github/workflows/api-and-integration-tests.yml index 436be2c6a..26d3244ea 100644 --- a/.github/workflows/api-and-integration-tests.yml +++ b/.github/workflows/api-and-integration-tests.yml @@ -34,7 +34,7 @@ jobs: # For other branches, we use the Docker setup of the development branch. - name: Determine Yoda repository branch run: | - if [ "${{ steps.extract_branch.outputs.branch }}" = "release-1.9" ]; then + if [ "${{ steps.extract_branch.outputs.branch }}" = "release-1.9" ] || [[ "${{ steps.extract_branch.outputs.branch }}" == rc-1.9.* ]]; then echo "branch=release-1.9" >> $GITHUB_OUTPUT else echo "branch=development" >> $GITHUB_OUTPUT @@ -74,12 +74,19 @@ jobs: docker exec provider.yoda sh -c "set -x ; cd /etc/irods/yoda-ruleset && sudo -u irods git checkout ${{ steps.extract_branch.outputs.branch }} && sudo -u irods python -m pip --no-cache-dir install --user -r /etc/irods/yoda-ruleset/requirements.txt && sudo -u irods make && sudo -u irods make install" docker exec provider.yoda sh -c "set -x ; sudo -u irods /var/lib/irods/irodsctl restart" + - name: Upgrade UU microservices for testing the RC version of Yoda 1.9.5 + if: github.event.pull_request.head.ref == 'rc-1.9.5' + shell: bash + run: | + docker exec provider.yoda sh -c 'set -x ; sudo yum remove -y irods-uu-microservices ; sudo wget https://github.com/UtrechtUniversity/irods-uu-microservices/releases/download/v1.2.0/irods-uu-microservices-4.2.12_1.2.0-0.rpm ; sudo rpm -ivh irods-uu-microservices-4.2.12_1.2.0-0.rpm' + - name: Pull and install latest version of portal shell: bash run: | cd yoda/docker/compose docker exec portal.yoda sh -c 'set -x ; cd /var/www/yoda && git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && git pull' docker exec portal.yoda sh -c 'set -x ; cd /var/www/yoda && git checkout ${{ steps.extract_branch.outputs.branch }} || git checkout development' + docker exec portal.yoda sh -c 'set -x ; cd 
/var/www/yoda && . venv/bin/activate && venv/bin/pip3 install -r requirements.txt' docker exec portal.yoda sh -c 'set -x ; cd /var/www/yoda && git status' docker exec portal.yoda sh -c 'set -x ; touch /var/www/yoda/*.wsgi' @@ -118,6 +125,12 @@ jobs: run: | docker exec provider.yoda sh -c 'set -x ; cat /var/lib/irods/log/rodsLog*' + + - name: Output web server logs + if: failure() + run: | + docker exec portal.yoda sh -c 'set -x ; for log in error_log portal_access.log ; do echo "${log}:" ; cat "/var/log/httpd/$log" ; echo; done' + # Uncomment section below when needed for debugging. # # - name: Setup tmate session for debugging diff --git a/__init__.py b/__init__.py index f6a4e1f09..e7dae4bfa 100644 --- a/__init__.py +++ b/__init__.py @@ -24,30 +24,31 @@ # Import all modules containing rules into the package namespace, # so that they become visible to iRODS. -from browse import * -from folder import * -from groups import * -from json_datacite import * -from json_landing_page import * -from mail import * -from meta import * -from meta_form import * -from provenance import * -from research import * -from resources import * -from schema import * -from schema_transformation import * -from schema_transformations import * -from vault import * -from datacite import * -from epic import * -from publication import * -from policies import * -from replication import * -from revisions import * -from settings import * -from notifications import * -from integration_tests import * +from browse import * +from folder import * +from groups import * +from json_datacite import * +from json_landing_page import * +from mail import * +from meta import * +from meta_form import * +from provenance import * +from research import * +from resources import * +from schema import * +from schema_transformation import * +from schema_transformations import * +from publication_troubleshoot import * +from vault import * +from datacite import * +from epic import * +from publication import * +from 
policies import * +from replication import * +from revisions import * +from settings import * +from notifications import * +from integration_tests import * # Import certain modules only when enabled. from .util.config import config diff --git a/browse.py b/browse.py index 71fbeb30f..56b870f72 100644 --- a/browse.py +++ b/browse.py @@ -264,7 +264,7 @@ def transform(row): if sort_on == 'modified': cols = ['COLL_NAME', 'COLL_PARENT_NAME', 'MIN(COLL_CREATE_TIME)', 'ORDER(COLL_MODIFY_TIME)'] else: - cols = ['ORDER(COLL_NAME)', 'COLL_PARENT_NAME' 'MIN(COLL_CREATE_TIME)', 'MAX(COLL_MODIFY_TIME)'] + cols = ['ORDER(COLL_NAME)', 'COLL_PARENT_NAME', 'MIN(COLL_CREATE_TIME)', 'MAX(COLL_MODIFY_TIME)'] where = "COLL_PARENT_NAME like '{}%%' AND COLL_NAME like '%%{}%%'".format("/" + zone + "/home", search_string) elif search_type == 'metadata': if sort_on == 'modified': @@ -310,7 +310,7 @@ def _filter_vault_deposit_index(row): :param row: row of results data from GenQuery, containing collection name (COLL_NAME) - :returns: boolean value that indicated whether row should be displayed + :returns: boolean value that indicates whether row should be displayed """ # Remove ORDER_BY etc. wrappers from column names. 
x = {re.sub('.*\((.*)\)', '\\1', k): v for k, v in row.items()} diff --git a/datarequest.py b/datarequest.py index 63a278ef5..76ba69a3a 100644 --- a/datarequest.py +++ b/datarequest.py @@ -496,7 +496,10 @@ def datarequest_owner_get(ctx, request_id): + JSON_EXT) # Get and return data request owner - return jsonutil.read(ctx, file_path)['owner'] + try: + return jsonutil.read(ctx, file_path)['owner'] + except Exception: + return None def datarequest_is_reviewer(ctx, request_id, pending=False): @@ -1046,7 +1049,10 @@ def api_datarequest_get(ctx, request_id): datarequest_action_permitted(ctx, request_id, ["PM", "DM", "DAC", "OWN"], None) # Get request type - datarequest_type = type_get(ctx, request_id).value + try: + datarequest_type = type_get(ctx, request_id).value + except Exception as e: + return api.Error("datarequest_type_fail", "Error: {}".format(e)) # Get request status datarequest_status = status_get(ctx, request_id).value diff --git a/deposit.py b/deposit.py index bd7dacc4a..483cff9e6 100644 --- a/deposit.py +++ b/deposit.py @@ -25,15 +25,16 @@ @api.make() -def api_deposit_copy_data_package(ctx, reference): +def api_deposit_copy_data_package(ctx, reference, deposit_group): """Create deposit collection and copies selected datapackage into the newly created deposit - :param ctx: Combined type of a callback and rei struct - :param reference: Data Package Reference (UUID4) + :param ctx: Combined type of a callback and rei struct + :param reference: Data Package Reference (UUID4) + :param deposit_group: Deposit group to copy to :returns: Path to created deposit collection or API error """ - result = deposit_create(ctx) + result = deposit_create(ctx, deposit_group) if result["deposit_path"] == "not_allowed": return api.Error('not_allowed', 'Could not create deposit collection.') diff --git a/folder.py b/folder.py index d1719dd0e..c35afe982 100644 --- a/folder.py +++ b/folder.py @@ -4,6 +4,7 @@ __copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ 
= 'GPLv3, see LICENSE' +import time import uuid import genquery @@ -11,10 +12,12 @@ import epic import meta +import notifications import policies_folder_status import provenance import vault from util import * +from vault_utils import get_sanity_checks_results_copy_to_vault_paths __all__ = ['rule_collection_group_name', 'api_folder_get_locks', @@ -169,219 +172,433 @@ def api_folder_reject(ctx, coll): return set_status_as_datamanager(ctx, coll, constants.research_package_state.REJECTED) -@rule.make(inputs=[0, 1], outputs=[2]) -def rule_folder_secure(ctx, coll, target): - +@rule.make(inputs=[0], outputs=[1]) +def rule_folder_secure(ctx, coll): """Rule interface for processing vault status transition request. :param ctx: Combined type of a callback and rei struct :param coll: Collection to be copied to vault - :param target: Vault target to copy research package to including license file etc - :return: returns result of securing action + :return: result of securing action (1 for successfully secured or skipped folder) """ - return folder_secure(ctx, coll, target) + if not precheck_folder_secure(ctx, coll): + return '1' + if not folder_secure(ctx, coll): + folder_secure_set_retry(ctx, coll) + return '0' -def folder_secure(ctx, coll, target): - """Secure a folder to the vault. + return '1' - This function should only be called by a rodsadmin - and should not be called from the portal. + +def precheck_folder_secure(ctx, coll): + """Whether to continue with securing. Should not touch the retry attempts, + these are prechecks and don't count toward the retry attempts limit :param ctx: Combined type of a callback and rei struct :param coll: Folder to secure - :param target: Target folder in vault - :returns: '0' when nu error occurred - """ + :returns: True when successful """ - # Following code is overturned by code in the rule language. - # This, as large files were not properly copied to the vault. - # Using the rule language this turned out to work fine. 
+ if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "folder_secure: User is not rodsadmin") + return False - log.write(ctx, 'folder_secure: Start securing folder <{}>'.format(coll)) + found, last_run = get_last_run_time(ctx, coll) + if (not correct_copytovault_start_status(ctx, coll) + or not correct_copytovault_start_location(coll) + or not misc.last_run_time_acceptable(coll, found, last_run, config.vault_copy_backoff_time)): + return False - if user.user_type(ctx) != 'rodsadmin': - log.write(ctx, "folder_secure: User is no rodsadmin") - return '1' + return True - # Check modify access on research folder. - msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf()) - modify_access = msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf())['arguments'][2] +def folder_secure(ctx, coll): + """Secure a folder to the vault. If the previous copy did not finish, retry - # Set cronjob status - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), coll) - except msi.Error as e: - log.write(ctx, "Could not set acl (admin:write) for collection: " + coll) - return '1' + This function should only be called by a rodsadmin + and should not be called from the portal. 
- avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", constants.CRONJOB_STATE['PROCESSING']) + :param ctx: Combined type of a callback and rei struct + :param coll: Folder to secure - found = False - iter = genquery.row_iterator( - "META_COLL_ATTR_VALUE", - "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYPARAMSNAME + "'", - genquery.AS_LIST, ctx - ) - for row in iter: - target = row[0] - found = True + :returns: True when successful + """ - if found: - avu.rm_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, target) + log.write(ctx, 'folder_secure: Start securing folder <{}>'.format(coll)) - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), coll) - except msi.Error as e: - log.write(ctx, "Could not set acl (admin:null) for collection: " + coll) - return '1' + # Checks before start securing + if not check_folder_secure(ctx, coll): + return False - # Determine vault target if it does not exist. - if not found: - target = determine_vault_target(ctx, coll) - if target == "": - log.write(ctx, "folder_secure: No vault target found") - return '1' + # Set cronjob status + if not set_cronjob_status(ctx, constants.CRONJOB_STATE['PROCESSING'], coll): + return False - # Create vault target and set status to INCOMPLETE. 
- msi.coll_create(ctx, target, '', irods_types.BytesBuf()) - avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.INCOMPLETE) + # Get the target folder + target = determine_and_set_vault_target(ctx, coll) + if not target: + return False # Copy all original info to vault - # try: - # vault.copy_folder_to_vault(ctx, coll, target) - # except Exception as e: - # log.write(ctx, e) - # return '1' + if not vault.copy_folder_to_vault(ctx, coll, target): + return False - ctx.iiCopyFolderToVault(coll, target) - """ # Starting point of last part of securing a folder into the vault - msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf()) - modify_access = msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf())['arguments'][2] - # Generate UUID4 and set as Data Package Reference. if config.enable_data_package_reference: - avu.set_on_coll(ctx, target, constants.DATA_PACKAGE_REFERENCE, str(uuid.uuid4())) + if not avu.set_on_coll(ctx, target, constants.DATA_PACKAGE_REFERENCE, str(uuid.uuid4()), True): + return False meta.copy_user_metadata(ctx, coll, target) vault.vault_copy_original_metadata_to_vault(ctx, target) vault.vault_write_license(ctx, target) + group_name = collection_group_name(ctx, coll) # Enable indexing on vault target. - if collection_group_name(ctx, coll).startswith("deposit-"): + if group_name.startswith("deposit-"): vault.vault_enable_indexing(ctx, target) # Copy provenance log from research folder to vault package. provenance.provenance_copy_log(ctx, coll, target) # Try to register EPIC PID if enabled. + if not set_epic_pid(ctx, target): + return False + + # Set vault permissions for new vault package. + if not vault.set_vault_permissions(ctx, coll, target): + return False + + # Vault package is ready, set vault package state to UNPUBLISHED. 
+ if not avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.UNPUBLISHED, True): + return False + + if not set_acl_check(ctx, "recursive", "admin:write", coll, 'Could not set ACL (admin:write) for collection: ' + coll): + return False + set_acl_parents(ctx, "recursive", "admin:write", coll) + + # Save vault package for notification. + set_vault_data_package(ctx, coll, target) + + # Everything is done, set research folder state to SECURED. + if not folder_secure_succeed_avus(ctx, coll, group_name): + return False + + # Deposit group has been deleted once secured status is set, + # so cannot change AVUs on collection + if not group_name.startswith("deposit-"): + set_acl_check(ctx, "recursive", "admin:null", coll, "Could not set ACL (admin:null) for collection: {}".format(coll)) + set_acl_parents(ctx, "default", "admin:null", coll) + + # All (mostly) went well + return True + + +def check_folder_secure(ctx, coll): + """Some initial set up that determines whether folder secure can continue. + These WILL affect the retry attempts. + + :param ctx: Combined type of a callback and rei struct + :param coll: Folder to secure + + :returns: True when successful + """ + if (not set_can_modify(ctx, coll) + or not retry_attempts(ctx, coll) + or not set_last_run_time(ctx, coll)): + return False + + return True + + +def correct_copytovault_start_status(ctx, coll): + """Confirm that the copytovault cronjob avu status is correct state to start securing""" + cronjob_status = get_cronjob_status(ctx, coll) + if cronjob_status in (constants.CRONJOB_STATE['PENDING'], constants.CRONJOB_STATE['RETRY']): + return True + + return False + + +def correct_copytovault_start_location(coll): + """Confirm that the folder to be copied is in the correct location. + For example: in a research or deposit folder and not in the trash. 
+ + :param coll: Source collection (folder being secured) + + :returns: True when a valid start location + """ + space, _, _, _ = pathutil.info(coll) + return space in (pathutil.Space.RESEARCH, pathutil.Space.DEPOSIT) + + +def get_last_run_time(ctx, coll): + """Get the last run time, if found""" + found = False + last_run = 1 + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE", + "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYLASTRUN + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + last_run = int(row[0]) + found = True + + return found, last_run + + +def set_last_run_time(ctx, coll): + """Set last run time, return True for successful set""" + now = int(time.time()) + return avu.set_on_coll(ctx, coll, constants.IICOPYLASTRUN, str(now), True) + + +def set_can_modify(ctx, coll): + """Check if have permission to modify, set if necessary""" + check_access_result = msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf()) + modify_access = check_access_result['arguments'][2] + if modify_access != b'\x01': + # This allows us permission to copy the files + if not set_acl_check(ctx, "recursive", "admin:read", coll, "Could not set ACL (admin:read) for collection: {}".format(coll)): + return False + # This allows us permission to set AVUs + if not set_acl_check(ctx, "default", "admin:write", coll, "Could not set ACL (admin:write) for collection: {}".format(coll)): + return False + + return True + + +def get_retry_count(ctx, coll): + """ Get the retry count, if not such AVU, return 0 """ + retry_count = 0 + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE, COLL_NAME", + "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYRETRYCOUNT + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + retry_count = int(row[0]) + + return retry_count + + +def retry_attempts(ctx, coll): + """ Check if there have been too many retries. 
""" + retry_count = get_retry_count(ctx, coll) + + if retry_count >= config.vault_copy_max_retries: + return False + + return True + + +def folder_secure_succeed_avus(ctx, coll, group_name): + """Set/rm AVUs on source folder when successfully secured folder""" + attributes = [x[0] for x in get_org_metadata(ctx, coll)] + + # In cases where copytovault only ran once, okay that these attributes were not created + if constants.IICOPYRETRYCOUNT in attributes: + if not avu.rmw_from_coll(ctx, coll, constants.IICOPYRETRYCOUNT, "%", True): + return False + if constants.IICOPYLASTRUN in attributes: + if not avu.rmw_from_coll(ctx, coll, constants.IICOPYLASTRUN, "%", True): + return False + + # Set cronjob status to final state before deletion + if not set_cronjob_status(ctx, constants.CRONJOB_STATE['OK'], coll): + return False + + if not rm_cronjob_status(ctx, coll): + return False + + # Note: this is the status that must always be one of the last to be set + # in folder secure, otherwise could be a problem for deposit groups + if not avu.set_on_coll(ctx, coll, constants.IISTATUSATTRNAME, constants.research_package_state.SECURED, True): + return False + + # Remove target AVU on source folder. This should be done after all possibly failing steps + # have occurred in folder_secure (any "return False" steps), so that if those trip a retry state, + # on retry folder_secure can reuse the target from before. + if (not group_name.startswith("deposit-") + and not avu.rmw_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, "%", True)): + return False + + return True + + +def folder_secure_set_retry(ctx, coll): + # When a folder secure fails, try to set the retry AVU and other applicable AVUs on source folder. + # If too many attempts, fail. 
+ new_retry_count = get_retry_count(ctx, coll) + 1 + if new_retry_count > config.vault_copy_max_retries: + folder_secure_fail(ctx, coll) + send_folder_secure_notification(ctx, coll, "Data package failed to copy to vault after maximum retries") + elif not folder_secure_set_retry_avus(ctx, coll, new_retry_count): + send_folder_secure_notification(ctx, coll, "Failed to set retry state on data package") + + +def folder_secure_set_retry_avus(ctx, coll, retry_count): + avu.set_on_coll(ctx, coll, constants.IICOPYRETRYCOUNT, str(retry_count), True) + return set_cronjob_status(ctx, constants.CRONJOB_STATE['RETRY'], coll) + + +def folder_secure_fail(ctx, coll): + """When there are too many retries, give up, set the AVUs and send notifications""" + # Errors are caught here in hopes that will still be able to set UNRECOVERABLE status at least + avu.rmw_from_coll(ctx, coll, constants.IICOPYRETRYCOUNT, "%", True) + # Remove target AVU + avu.rmw_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, "%", True) + set_cronjob_status(ctx, constants.CRONJOB_STATE['UNRECOVERABLE'], coll) + + +def send_folder_secure_notification(ctx, coll, message): + """Send notification about folder secure to relevant datamanagers""" + if datamanager_exists(ctx, coll): + datamanagers = get_datamanagers(ctx, coll) + for datamanager in datamanagers: + datamanager = '{}#{}'.format(*datamanager) + notifications.set(ctx, "system", datamanager, coll, message) + + +def set_epic_pid(ctx, target): + """Try to set epic pid, if fails return False""" if config.epic_pid_enabled: ret = epic.register_epic_pid(ctx, target) url = ret['url'] pid = ret['pid'] http_code = ret['httpCode'] - if (http_code != "0" and http_code != "200" and http_code != "201"): - # Something went wrong while registering EPIC PID, set cronjob state to retry. 
- log.write(ctx, "folder_secure: epid pid returned http <{}>".format(http_code)) - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), coll) - except msi.Error: - return '1' - - avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", constants.CRONJOB_STATE['RETRY']) - avu.set_on_coll(ctx, coll, constants.IICOPYPARAMSNAME, target) - - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:null) for collection: " + coll) - return '1' + if http_code not in ('0', '200', '201'): + # Something went wrong while registering EPIC PID, return false so retry status will be set + log.write(ctx, "folder_secure: epic pid returned http <{}>".format(http_code)) + return False if http_code != "0": # save EPIC Persistent ID in metadata epic.save_epic_pid(ctx, target, url, pid) - # Set vault permissions for new vault package. - group = collection_group_name(ctx, coll) - if group == '': - log.write(ctx, "folder_secure: Cannot determine which deposit or research group <{}> belongs to".format(coll)) - return '1' + return True - vault.set_vault_permissions(ctx, group, coll, target) - # Set cronjob status to OK. 
- if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:write) for collection: " + coll) - return '1' +def get_cronjob_status(ctx, coll): + """Get the cronjob status of given collection""" + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE", + "COLL_NAME = '{}' AND META_COLL_ATTR_NAME = '{}'".format(coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault"), + genquery.AS_LIST, ctx + ) + for row in iter: + return row[0] - avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", constants.CRONJOB_STATE['OK']) - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:null) for collection: " + coll) - return '1' +def rm_cronjob_status(ctx, coll): + """Remove cronjob_copy_to_vault attribute on source collection - # Vault package is ready, set vault package state to UNPUBLISHED. - avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.UNPUBLISHED) + :param ctx: Combined type of a callback and rei struct + :param coll: Source collection (folder that was being secured) - # Everything is done, set research folder state to SECURED. 
- try: - msi.set_acl(ctx, "recursive", "admin:write", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:write) for collection: " + coll) - return '1' + :returns: True when successfully removed + """ + return avu.rmw_from_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", "%", True) - parent, chopped_coll = pathutil.chop(coll) - while parent != "/" + user.zone(ctx) + "/home": - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), parent) - except msi.Error: - log.write(ctx, "Could not set ACL on " + parent) - parent, chopped_coll = pathutil.chop(parent) - # Save vault package for notification. - set_vault_data_package(ctx, coll, target) +def set_cronjob_status(ctx, status, coll): + """Set cronjob_copy_to_vault attribute on source collection - # Set folder status to SECURED. - avu.set_on_coll(ctx, coll, constants.IISTATUSATTRNAME, constants.research_package_state.SECURED) + :param ctx: Combined type of a callback and rei struct + :param status: Status to set on collection + :param coll: Source collection (folder being secured) + + :returns: True when successfully set + """ + return avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", status, True) + +def set_acl_parents(ctx, acl_recurse, acl_type, coll): + """Set ACL for parent collections""" + parent, _ = pathutil.chop(coll) + while parent != "/" + user.zone(ctx) + "/home": + set_acl_check(ctx, acl_recurse, acl_type, parent, "Could not set the ACL ({}) on {}".format(acl_type, parent)) + parent, _ = pathutil.chop(parent) + + +def set_acl_check(ctx, acl_recurse, acl_type, coll, error_msg=''): + """Set the ACL if possible, log error_msg if it goes wrong""" + # TODO turn acl_recurse into a boolean try: - msi.set_acl(ctx, "recursive", "admin:null", user.full_name(ctx), coll) + msi.set_acl(ctx, acl_recurse, acl_type, user.full_name(ctx), coll) except msi.Error: - log.write(ctx, "Could not set acl (admin:null) for 
collection: " + coll) + if error_msg: + log.write(ctx, error_msg) + return False - parent, chopped_coll = pathutil.chop(coll) - while parent != "/" + user.zone(ctx) + "/home": - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), parent) - except msi.Error: - log.write(ctx, "Could not set ACL (admin:null) on " + parent) + return True + + +def get_existing_vault_target(ctx, coll): + """Determine vault target on coll, if it was already determined before """ + found = False + target = "" + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE", + "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYPARAMSNAME + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + target = row[0] + found = True + + return found, target + + +def set_vault_target(ctx, coll, target): + """Create vault target and AVUs""" + msi.coll_create(ctx, target, '', irods_types.BytesBuf()) + if not avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.INCOMPLETE, True): + return False + + # Note on the source the target folder in case a copy stops midway + if not avu.set_on_coll(ctx, coll, constants.IICOPYPARAMSNAME, target, True): + return False - parent, chopped_coll = pathutil.chop(parent) + return True - # All went well - return '0' + +def determine_and_set_vault_target(ctx, coll): + """Determine and set target on coll""" + found, target = get_existing_vault_target(ctx, coll) + + # Overwrite vault target if it does not pass sanity checks. This should usually + # fix any wrong vault target. There's a second check in the copy_folder_to_vault + # function to prevent TOCTOU issues. 
+ sanity_check_results = get_sanity_checks_results_copy_to_vault_paths(coll, target) + if len(sanity_check_results) > 0: + log.write(ctx, "folder_secure: overwriting previous vault target for " + coll + + "(" + target + "), because it did not meet sanity checks: " + + str(sanity_check_results)) + found = False + + # Determine vault target if it does not exist. + if not found: + target = determine_new_vault_target(ctx, coll) + if target == "": + log.write(ctx, "folder_secure: No possible vault target found") + return "" + + # Create vault target and set status to INCOMPLETE. + if not set_vault_target(ctx, coll, target): + return "" + + return target -def determine_vault_target(ctx, folder): +def determine_new_vault_target(ctx, folder): """Determine vault target path for a folder.""" group = collection_group_name(ctx, folder) if group == '': - log.write(ctx, "Cannot determine which deposit or research group " + + " belongs to") + log.write(ctx, "Cannot determine which deposit or research group <{}> belongs to".format(folder)) return "" parts = group.split('-') diff --git a/groups.py b/groups.py index fbc20c22e..30d62e188 100644 --- a/groups.py +++ b/groups.py @@ -64,13 +64,14 @@ def getGroupsData(ctx): "name": name, "managers": [], "members": [], - "read": [] + "read": [], + "invited": [] } groups[name] = group if attr in ["schema_id", "data_classification", "category", "subcategory"]: group[attr] = value - elif attr == "description" or attr == "expiration_date": + elif attr in ('description', 'expiration_date'): # Deal with legacy use of '.' for empty description metadata and expiration date. # See uuGroupGetDescription() in uuGroup.r for correct behavior of the old query interface. group[attr] = '' if value == '.' 
else value @@ -89,7 +90,7 @@ def getGroupsData(ctx): user = row[1] zone = row[2] - if name != user and name != "rodsadmin" and name != "public": + if name not in (user, 'rodsadmin', 'public'): user = user + "#" + zone if name.startswith("read-"): # Match read-* group with research-* or initial-* group. @@ -113,6 +114,22 @@ def getGroupsData(ctx): except KeyError: pass + # Third query: obtain list of invited SRAM users + if config.enable_sram: + iter = genquery.row_iterator( + "META_USER_ATTR_VALUE, USER_NAME, USER_ZONE", + "USER_TYPE != 'rodsgroup' AND META_USER_ATTR_NAME = '{}'".format(constants.UUORGMETADATAPREFIX + "sram_invited"), + genquery.AS_LIST, ctx + ) + for row in iter: + name = row[0] + user = row[1] + "#" + row[2] + try: + group = groups[name] + group["invited"].append(user) + except KeyError: + pass + return groups.values() @@ -163,7 +180,7 @@ def getGroupData(ctx, name): user = row[0] zone = row[1] - if name != user and name != "rodsadmin" and name != "public": + if name not in (user, 'rodsadmin', 'public'): group["members"].append(user + "#" + zone) if name.startswith("research-"): @@ -361,6 +378,10 @@ def api_group_data(ctx): # Filter groups (only return groups user is part of), convert to json and write to stdout. groups = list(filter(lambda group: full_name in group['read'] + group['members'] or group['category'] in categories, groups)) + # Only process group types managed via group manager + managed_prefixes = ("priv-", "deposit-", "research-", "grp-", "datamanager-", "datarequests-", "intake-") + groups = list(filter(lambda group: group['name'].startswith(managed_prefixes), groups)) + # Sort groups on name. 
groups = sorted(groups, key=lambda d: d['name']) @@ -392,6 +413,10 @@ def api_group_data(ctx): for member in group['read']: members[member] = {'access': 'reader'} + # Invited SRAM users + for member in group['invited']: + members[member]['sram'] = 'invited' + if not group_hierarchy.get(group['category']): group_hierarchy[group['category']] = OrderedDict() @@ -498,11 +523,11 @@ def api_group_process_csv(ctx, csv_header_and_data, allow_update, delete_users): return api.Error('errors', validation_errors) # Step 3: Create / update groups. - error = apply_data(ctx, data, allow_update, delete_users) - if len(error): - return api.Error('errors', [error]) + status_msg = apply_data(ctx, data, allow_update, delete_users) + if status_msg['status'] == 'error': + return api.Error('errors', [status_msg['message']]) - return api.Result.ok() + return api.Result.ok(info=[status_msg['message']]) def validate_data(ctx, data, allow_update): @@ -522,7 +547,7 @@ def validate_data(ctx, data, allow_update): for (category, subcategory, groupname, managers, members, viewers, _, _) in data: if group.exists(ctx, groupname) and not allow_update: - errors.append('Group "{}" already exists'.format(groupname)) + errors.append('Group "{}" already exists. It has not been updated.'.format(groupname)) # Is user admin or has category add privileges? 
if not (is_admin or can_add_category): @@ -544,11 +569,13 @@ def apply_data(ctx, data, allow_update, delete_users): :param allow_update: Allow updates in groups :param delete_users: Allow for deleting of users from groups - :returns: Errors if found any + :returns: Dict with 'status' ('ok' or 'error') and 'message' (error description, or summary of the actions performed) """ for (category, subcategory, group_name, managers, members, viewers, schema_id, expiration_date) in data: new_group = False + users_added, users_removed = 0, 0 + message = '' log.write(ctx, 'CSV import - Adding and updating group: {}'.format(group_name)) @@ -559,10 +586,12 @@ def apply_data(ctx, data, allow_update, delete_users): if response: new_group = True + message += "Group '{}' created.".format(group_name) elif response.status == "error_group_exists" and allow_update: log.write(ctx, 'CSV import - WARNING: group "{}" not created, it already exists'.format(group_name)) + message += "Group '{}' already exists.".format(group_name) else: - return "Error while attempting to create group {}. Status/message: {} / {}".format(group_name, response.status, response.status_info) + return {"status": "error", "message": "Error while attempting to create group {}. 
Status/message: {} / {}".format(group_name, response.status, response.status_info)} # Now add the users and set their role if other than member allusers = managers + members + viewers @@ -573,6 +602,7 @@ def apply_data(ctx, data, allow_update, delete_users): if response: currentrole = "normal" log.write(ctx, "CSV import - Notice: added user {} to group {}".format(username, group_name)) + users_added += 1 else: log.write(ctx, "CSV import - Warning: error occurred while attempting to add user {} to group {}".format(username, group_name)) log.write(ctx, "CSV import - Status: {} , Message: {}".format(response.status, response.status_info)) @@ -638,11 +668,21 @@ def apply_data(ctx, data, allow_update, delete_users): response = group_remove_user_from_group(ctx, username, usergroupname) if response: log.write(ctx, "CSV import - Removing user {} from group {}".format(username, usergroupname)) + users_removed += 1 else: log.write(ctx, "CSV import - Warning: error while attempting to remove user {} from group {}".format(username, usergroupname)) log.write(ctx, "CSV import - Status: {} , Message: {}".format(response.status, response.status_info)) - return '' + if users_added > 0: + message += ' Users added ({}).'.format(users_added) + if users_removed > 0: + message += ' Users removed ({}).'.format(users_removed) + + # If no users were added or removed and no new group was created. + if not users_added and not users_removed and not new_group: + message += ' No changes made.' 
+ + return {"status": "ok", "message": message} def _are_roles_equivalent(a, b): @@ -942,12 +982,15 @@ def group_create(ctx, group_name, category, subcategory, schema_id, expiration_d if not sram.sram_connect_service_collaboration(ctx, short_name): return api.Error('sram_error', 'Something went wrong connecting service to group "{}" in SRAM'.format(group_name)) + if group.exists(ctx, group_name): + return api.Error('group_exists', "Group {} not created, it already exists".format(group_name)) + response = ctx.uuGroupAdd(group_name, category, subcategory, schema_id, expiration_date, description, data_classification, co_identifier, '', '')['arguments'] status = response[8] message = response[9] if status == '0': return api.Result.ok() - elif status == '-1089000' or status == '-809000': + elif status == '-1089000' or status == '-809000' or status == '-806000': return api.Error('group_exists', "Group {} not created, it already exists".format(group_name)) else: return api.Error('policy_error', message) @@ -1069,7 +1112,8 @@ def group_user_add(ctx, username, group_name): sram.invitation_mail_group_add_user(ctx, group_name, username.split('#')[0], co_identifier) elif config.sram_flow == 'invitation': sram.sram_put_collaboration_invitation(ctx, group_name, username.split('#')[0], co_identifier) - + # Mark user as invited. + msi.sudo_obj_meta_set(ctx, username, "-u", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") return api.Result.ok() else: return api.Error('policy_error', message) @@ -1200,10 +1244,11 @@ def rule_group_sram_sync(ctx): members = group['members'] + group['read'] managers = group['managers'] description = group['description'] if 'description' in group else '' + invited = group['invited'] log.write(ctx, "Sync group {} with SRAM".format(group_name)) - sram_group, co_identifier = sram_enabled(ctx, group_name) + # Post collaboration group is not yet already SRAM enabled. 
if not sram_group: response_sram = sram.sram_post_collaboration(ctx, group_name, description) @@ -1226,27 +1271,43 @@ def rule_group_sram_sync(ctx): log.write(ctx, "Sync members of group {} with SRAM".format(group_name)) for member in members: - # Validate email + # Validate email. if not yoda_names.is_email_username(member): log.write(ctx, "User {} cannot be added to group {} because user email is invalid".format(member, group_name)) continue - if member.split('#')[0] not in co_members: + # Check if member is invited. + if member in invited: + if member.split('#')[0] in co_members: + log.write(ctx, "User {} added to group {}".format(member, group_name)) + # Remove invitation metadata. + msi.sudo_obj_meta_remove(ctx, member, "-u", "", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") + else: + log.write(ctx, "User {} already invited to group {}".format(member, group_name)) + continue + + # Not invited and not yet in the CO. + if member not in invited and member.split('#')[0] not in co_members: if config.sram_flow == 'join_request': sram.invitation_mail_group_add_user(ctx, group_name, member.split('#')[0], co_identifier) - log.write(ctx, "User {} added to group {}".format(member, group_name)) + msi.sudo_obj_meta_set(ctx, member, "-u", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") + log.write(ctx, "User {} invited to group {}".format(member, group_name)) + continue elif config.sram_flow == 'invitation': sram.sram_put_collaboration_invitation(ctx, group_name, member.split('#')[0], co_identifier) - log.write(ctx, "User {} added to group {}".format(member, group_name)) - else: - if member in managers: - uid = sram.sram_get_uid(ctx, co_identifier, member) - if uid == '': - log.write(ctx, "Something went wrong getting the SRAM user id for user {} of group {}".format(member, group_name)) + msi.sudo_obj_meta_set(ctx, member, "-u", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") + log.write(ctx, "User {} invited to 
group {}".format(member, group_name)) + continue + + # Member is group manager and in the CO. + if member in managers and member.split('#')[0] in co_members: + uid = sram.sram_get_uid(ctx, co_identifier, member) + if uid == '': + log.write(ctx, "Something went wrong getting the SRAM user id for user {} of group {}".format(member, group_name)) + else: + if sram.sram_update_collaboration_membership(ctx, co_identifier, uid, "manager"): + log.write(ctx, "Updated {} user to manager of group {}".format(member, group_name)) else: - if sram.sram_update_collaboration_membership(ctx, co_identifier, uid, "manager"): - log.write(ctx, "Updated {} user to manager of group {}".format(member, group_name)) - else: - log.write(ctx, "Something went wrong updating {} user to manager of group {} in SRAM".format(member, group_name)) + log.write(ctx, "Something went wrong updating {} user to manager of group {} in SRAM".format(member, group_name)) log.write(ctx, "Finished syncing groups with SRAM") diff --git a/iiFolderStatusTransitions.r b/iiFolderStatusTransitions.r index a59933b79..56fa7adab 100644 --- a/iiFolderStatusTransitions.r +++ b/iiFolderStatusTransitions.r @@ -26,6 +26,16 @@ iiScheduleCopyToVault() { } } +# \brief Schedule copy-to-vault for just one coll (asynchronously). +# +# \param[in] coll Path of collection +# +iiScheduleCollCopyToVault(*coll) { + delay ("<INST_NAME>irods_rule_engine_plugin-irods_rule_language-instance</INST_NAME><PLUSET>1s</PLUSET>") { + msiExecCmd("scheduled-copytovault.sh", "'*coll'", "", "", 0, *out); + } +} + # \brief iiFolderDatamanagerAction # @@ -205,144 +215,6 @@ iiRemoveMetadataFromItem(*itemParent, *itemName, *itemIsCollection, *buffer, *er } } -# \brief iiFolderSecure Secure a folder to the vault. This function should only be called by a rodsadmin -# and should not be called from the portal. 
Thus no statusInfo is returned, but -# log messages are sent to stdout instead -# -# \param[in] folder -# -iiFolderSecure(*folder) { - uuGetUserType(uuClientFullName, *userType); - if (*userType != "rodsadmin") { - writeLine("stdout", "iiFolderSecure: Should only be called by a rodsadmin"); - fail; - } - - # Check modify access on research folder. - msiCheckAccess(*folder, "modify object", *modifyAccess); - - # Set cronjob status. - msiString2KeyValPair(UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault=" ++ CRONJOB_PROCESSING, *kvp); - if (*modifyAccess != 1) { - msiSetACL("default", "admin:write", uuClientFullName, *folder); - } - msiSetKeyValuePairsToObj(*kvp, *folder, "-C"); - *found = false; - foreach (*row in SELECT META_COLL_ATTR_VALUE - WHERE COLL_NAME = '*folder' - AND META_COLL_ATTR_NAME = IICOPYPARAMSNAME) { - # retry with previous parameters - *target = *row.META_COLL_ATTR_VALUE; - *found = true; - } - if (*found) { - # Remove parameters from metadata - msiString2KeyValPair("", *kvp); - *key = IICOPYPARAMSNAME; - *kvp."*key" = *target; - msiRemoveKeyValuePairsFromObj(*kvp, *folder, "-C"); - } - if (*modifyAccess != 1) { - msiSetACL("default", "admin:null", uuClientFullName, *folder); - } - - if (!*found) { - # this file - *target = iiDetermineVaultTarget(*folder); - } - - # Copy to vault. - iiCopyFolderToVault(*folder, *target); - - # Continue securing process in PREP. 
- *return = ""; - rule_folder_secure(*folder, *target, *return); -} - - -# \brief iiDetermineVaultTarget -# -# \param[in] folder -# \returnvalue target path -# -iiDetermineVaultTarget(*folder) { - *err = errorcode(iiCollectionGroupName(*folder, *groupName)); - if (*err < 0) { - writeLine("stdout", "iiDetermineVaultTarget: Cannot determine which research group *folder belongs to"); - fail; - } else { - writeLine("stdout", "iiDetermineVaultTarget: *folder belongs to *groupName"); - } - uuChop(*groupName, *_, *baseName, "-", true); - uuChopPath(*folder, *parent, *datapackageName); - - # Make room for the timestamp and sequence number - if (strlen(*datapackageName) > 235) { - *datapackageName = substr(*datapackageName, 0, 235); - } - - msiGetIcatTime(*timestamp, "unix"); - *timestamp = triml(*timestamp, "0"); - *vaultGroupName = IIVAULTPREFIX ++ *baseName; - - *target = "/$rodsZoneClient/home/*vaultGroupName/*datapackageName[*timestamp]"; - - *i = 0; - while (uuCollectionExists(*target)) { - writeLine("stdout", "iiDetermineVaultTarget: *target already exists"); - *i = *i + 1; - *target = "/$rodsZoneClient/home/*vaultGroupName/*datapackageName[*timestamp][*i]"; - } - writeLine("stdout", "iiDetermineVaultTarget: Target is *target"); - *target; -} - - -# \brief Return the name of the group a collection belongs to. 
-# -# \param[in] folder -# \param[out] groupName -# -iiCollectionGroupName(*folder, *groupName) { - if (*folder like regex "/[^/]+/home/deposit-.[^/]*/.*") { - uuChopPath(*folder, *parent, *baseName); - *path = *parent; - } else { - *path = *folder; - } - - *isfound = false; - *groupName = ""; - foreach(*accessid in SELECT COLL_ACCESS_USER_ID WHERE COLL_NAME = *path) { - *id = *accessid.COLL_ACCESS_USER_ID; - foreach(*group in SELECT USER_GROUP_NAME WHERE USER_GROUP_ID = *id) { - *groupName = *group.USER_GROUP_NAME; - } - if (*groupName like regex "(deposit|research|intake)-.*") { - *isfound = true; - break; - } - } - - if (!*isfound) { - foreach(*accessid in SELECT COLL_ACCESS_USER_ID WHERE COLL_NAME = *path) { - *id = *accessid.COLL_ACCESS_USER_ID; - foreach(*group in SELECT USER_GROUP_NAME WHERE USER_GROUP_ID = *id) { - *groupName = *group.USER_GROUP_NAME; - } - if (*groupName like regex "(datamanager|vault)-.*") { - *isfound = true; - break; - } - } - } - if (!*isfound){ - # No results found. Not a group folder - writeLine("serverLog", "*path does not belong to a deposit, research or intake group or is not available to current user"); - } -} - - # \brief Check validity of requested folder status transition in a research area. # # \param[in] fromstatus folder status before requested transition diff --git a/iiVault.r b/iiVault.r index 805b78402..570c5ef0e 100644 --- a/iiVault.r +++ b/iiVault.r @@ -6,24 +6,6 @@ # \license GPLv3, see LICENSE. -# \brief iiCopyFolderToVault -# -# \param[in] folder folder to copy to the vault -# \param[in] target path of the vault package -# -iiCopyFolderToVault(*folder, *target) { - - writeLine("serverLog", "iiCopyFolderToVault: Copying *folder to *target") - *buffer.source = *folder; - *buffer.destination = *target ++ "/original"; - uuTreeWalk("forward", *folder, "iiIngestObject", *buffer, *error); - if (*error != 0) { - msiGetValByKey(*buffer, "msg", *msg); # using . 
syntax here lead to type error - writeLine("stdout", "iiIngestObject: *error: *msg"); - fail; - } -} - # \brief Called by uuTreeWalk for each collection and dataobject to copy to the vault. # # \param[in] itemParent diff --git a/integration_tests.py b/integration_tests.py index c2b8974aa..ea7fb6848 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -7,17 +7,547 @@ __all__ = ['rule_run_integration_tests'] import json +import os +import re +import time import traceback +import uuid + +import data_access_token +import folder +import meta +import schema +from util import avu, collection, config, constants, data_object, group, log, msi, resource, rule, user + + +def _call_msvc_stat_vault(ctx, resc_name, data_path): + ret = msi.stat_vault(ctx, resc_name, data_path, '', '') + return (ret['arguments'][2], ret['arguments'][3]) + + +def _call_msvc_stat_vault_check_exc(ctx, resc_name, data_path): + """Verifies whether a call to the stat vault microservices raises an exception""" + try: + msi.stat_vault(ctx, resc_name, data_path, '', '') + return False + except Exception: + return True + + +def _call_msvc_json_arrayops(ctx, jsonstr, val, ops, index, argument_index): + """Returns an output argument from the json_arrayops microservice""" + return ctx.msi_json_arrayops(jsonstr, val, ops, index)["arguments"][argument_index] + + +def _call_msvc_json_objops(ctx, jsonstr, val, ops, argument_index): + """Returns an output argument from the json_objops microservice""" + return ctx.msi_json_objops(jsonstr, val, ops)["arguments"][argument_index] + + +def _create_tmp_object(ctx): + """Creates a randomly named test data object and returns its name""" + path = "/{}/home/rods/{}.test".format(user.zone(ctx), str(uuid.uuid4())) + data_object.write(ctx, path, "test") + return path + + +def _create_tmp_collection(ctx): + """Creates a randomly named test collection and returns its name""" + path = "/{}/home/rods/{}-test".format(user.zone(ctx), str(uuid.uuid4())) + 
collection.create(ctx, path) + return path + + +def _test_msvc_add_avu_object(ctx): + tmp_object = _create_tmp_object(ctx) + ctx.msi_add_avu('-d', tmp_object, "foo", "bar", "baz") + result = [(m.attr, m.value, m.unit) for m in avu.of_data(ctx, tmp_object)] + data_object.remove(ctx, tmp_object) + return result + + +def _test_msvc_add_avu_collection(ctx): + tmp_coll = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_coll, "foo", "bar", "baz") + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_coll)] + collection.remove(ctx, tmp_coll) + return result + + +def _test_msvc_rmw_avu_object(ctx, rmw_attributes): + tmp_object = _create_tmp_object(ctx) + ctx.msi_add_avu('-d', tmp_object, "foo", "bar", "baz") + ctx.msi_add_avu('-d', tmp_object, "foot", "hand", "head") + ctx.msi_add_avu('-d', tmp_object, "aap", "noot", "mies") + ctx.msi_rmw_avu('-d', tmp_object, rmw_attributes[0], rmw_attributes[1], rmw_attributes[2]) + result = [(m.attr, m.value, m.unit) for m in avu.of_data(ctx, tmp_object)] + data_object.remove(ctx, tmp_object) + return result + + +def _test_msvc_rmw_avu_collection(ctx, rmw_attributes): + tmp_object = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_object, "foo", "bar", "baz") + ctx.msi_add_avu('-c', tmp_object, "foot", "hand", "head") + ctx.msi_add_avu('-c', tmp_object, "aap", "noot", "mies") + ctx.msi_rmw_avu('-c', tmp_object, rmw_attributes[0], rmw_attributes[1], rmw_attributes[2]) + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] + collection.remove(ctx, tmp_object) + return result + + +def _test_avu_set_collection(ctx, catch): + # Test setting avu with catch and without catch + tmp_object = _create_tmp_collection(ctx) + avu.set_on_coll(ctx, tmp_object, "foo", "bar", catch) + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] + collection.remove(ctx, tmp_object) + return result + + +def _test_avu_rmw_collection(ctx, rmw_attributes): + # Test removing with catch and without 
catch + tmp_object = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_object, "foo", "bar", "baz") + ctx.msi_add_avu('-c', tmp_object, "aap", "noot", "mies") + avu.rmw_from_coll(ctx, tmp_object, rmw_attributes[0], rmw_attributes[1], rmw_attributes[2], rmw_attributes[3]) + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] + collection.remove(ctx, tmp_object) + return result + + +def _test_avu_get_attr_val_of_coll(ctx, attr, value): + # Test getting the value of an attribute on a collection + tmp_coll = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_coll, attr, value, "baz") + result = avu.get_attr_val_of_coll(ctx, tmp_coll, attr) + collection.remove(ctx, tmp_coll) + return result + + +def _test_avu_get_attr_val_of_coll_exception(ctx): + # Test that getting a non existing attribute on a collection raises an exception (True for exception raised) + tmp_coll = _create_tmp_collection(ctx) + result = False + try: + result = avu.get_attr_val_of_coll(ctx, tmp_coll, "foo") + except Exception: + result = True + collection.remove(ctx, tmp_coll) + return result + + +def _test_folder_set_retry_avus(ctx): + tmp_coll = _create_tmp_collection(ctx) + folder.folder_secure_set_retry_avus(ctx, tmp_coll, 2) + # Needed to be able to delete collection + msi.set_acl(ctx, "default", "admin:own", user.full_name(ctx), tmp_coll) + collection.remove(ctx, tmp_coll) + return True + + +def _test_msvc_apply_atomic_operations_collection(ctx): + tmp_coll = _create_tmp_collection(ctx) + operations = { + "entity_name": tmp_coll, + "entity_type": "collection", + "operations": [ + { + "operation": "add", + "attribute": "aap", + "value": "noot", + "units": "mies" + }, + { + "operation": "add", + "attribute": "foo", + "value": "bar", + "units": "baz" + }, + { + "operation": "remove", + "attribute": "aap", + "value": "noot", + "units": "mies" + } + ] + } + avu.apply_atomic_operations(ctx, operations) + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, 
tmp_coll)] + collection.remove(ctx, tmp_coll) + return result + + +def _test_msvc_apply_atomic_operations_object(ctx): + tmp_object = _create_tmp_object(ctx) + operations = { + "entity_name": tmp_object, + "entity_type": "data_object", + "operations": [ + { + "operation": "add", + "attribute": "aap", + "value": "noot", + "units": "mies" + }, + { + "operation": "add", + "attribute": "foo", + "value": "bar", + "units": "baz" + }, + { + "operation": "remove", + "attribute": "aap", + "value": "noot", + "units": "mies" + } + ] + } + avu.apply_atomic_operations(ctx, operations) + result = [(m.attr, m.value, m.unit) for m in avu.of_data(ctx, tmp_object)] + data_object.remove(ctx, tmp_object) + return result + + +def _test_folder_cronjob_status(ctx): + tmp_coll = _create_tmp_collection(ctx) + result_set = folder.set_cronjob_status(ctx, constants.CRONJOB_STATE['RETRY'], tmp_coll) + status = folder.get_cronjob_status(ctx, tmp_coll) + correct_status = status == constants.CRONJOB_STATE['RETRY'] + result_rm = folder.rm_cronjob_status(ctx, tmp_coll) + collection.remove(ctx, tmp_coll) + return result_set, correct_status, result_rm + + +def _test_folder_set_get_last_run(ctx): + tmp_coll = _create_tmp_collection(ctx) + result = folder.set_last_run_time(ctx, tmp_coll) + found, last_run = folder.get_last_run_time(ctx, tmp_coll) + collection.remove(ctx, tmp_coll) + return result, found, last_run + + +def _test_schema_active_schema_deposit_from_default(ctx): + avu.rm_from_group(ctx, "deposit-pilot", "schema_id", "dag-0") + result = schema.get_active_schema_path(ctx, "/tempZone/home/deposit-pilot") + avu.associate_to_group(ctx, "deposit-pilot", "schema_id", "dag-0") + return result + + +def _test_schema_active_schema_research_from_default(ctx): + avu.rm_from_group(ctx, "research-core-2", "schema_id", "core-2") + result = schema.get_active_schema_path(ctx, "/tempZone/home/research-core-2") + avu.associate_to_group(ctx, "research-core-2", "schema_id", "core-2") + return result + + +def 
_test_schema_active_schema_vault_research_override(ctx): + avu.associate_to_group(ctx, "vault-core-2", "schema_id", "integration-test-schema-1") + result = schema.get_active_schema_path(ctx, "/tempZone/home/vault-core-2") + avu.rm_from_group(ctx, "vault-core-2", "schema_id", "integration-test-schema-1") + return result + + +def _test_schema_active_schema_vault_without_research(ctx): + ctx.uuGroupAdd("vault-without-research", "test-automation", "something", "", "", "", "", "", "", "") + result = schema.get_active_schema_path(ctx, "/tempZone/home/vault-without-research") + ctx.uuGroupRemove("vault-without-research", "", "") + return result + + +def _test_get_latest_vault_metadata_path_empty(ctx): + tmp_collection = _create_tmp_collection(ctx) + latest_file = meta.get_latest_vault_metadata_path(ctx, tmp_collection) + collection.remove(ctx, tmp_collection) + return latest_file is None + + +def _test_get_latest_vault_metadata_path_normal(ctx): + tmp_collection = _create_tmp_collection(ctx) + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869873].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869875].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869877].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869876].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869874].json"), "test") + latest_file = meta.get_latest_vault_metadata_path(ctx, tmp_collection) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869873].json")) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869875].json")) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869877].json")) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869876].json")) + data_object.remove(ctx, os.path.join(tmp_collection, 
"yoda-metadata[1722869874].json")) + collection.remove(ctx, tmp_collection) + return latest_file == os.path.join(tmp_collection, "yoda-metadata[1722869877].json") + + +def _test_folder_secure_func(ctx, func): + """Create tmp collection, apply func to it and get result, and clean up. + Used for testing functions that modify avu/acls related to folder secure. + Happy flow. + + :param ctx: Combined type of a callback and rei struct + :param func: Function to test + + :returns: Result of action + """ + tmp_coll = _create_tmp_collection(ctx) + # Assume returns True/False, or does not return + result = func(ctx, tmp_coll) + # Needed to be able to delete collection in situations where func changed ACLs + msi.set_acl(ctx, "default", "admin:own", user.full_name(ctx), tmp_coll) + collection.remove(ctx, tmp_coll) + if result is None: + return True + return result -from util import collection, config, data_object, log, resource, rule, user basic_integration_tests = [ + {"name": "msvc.add_avu_collection", + "test": lambda ctx: _test_msvc_add_avu_collection(ctx), + "check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)}, + {"name": "msvc.add_avu_object", + "test": lambda ctx: _test_msvc_add_avu_object(ctx), + "check": lambda x: (("foo", "bar", "baz") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "msvc.json_arrayops.add", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "d", "add", 0, 0), + "check": lambda x: x == '["a", "b", "c", "d"]'}, + {"name": "msvc.json_arrayops.find_exist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "b", "find", 0, 3), + "check": lambda x: x == 1}, + {"name": "msvc.json_arrayops.find_notexist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "d", "find", 0, 3), + "check": lambda x: x == -1}, + {"name": "msvc.json_arrayops.get", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "", "get", 1, 1), + 
"check": lambda x: x == 'b'}, + {"name": "msvc.json_arrayops.rm_exist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "b", "rm", 0, 0), + "check": lambda x: x == '["a", "c"]'}, + {"name": "msvc.json_arrayops.rm_notexist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "d", "rm", 0, 0), + "check": lambda x: x == '["a", "b", "c"]'}, + {"name": "msvc.json_arrayops.size", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "", "size", 0, 3), + "check": lambda x: x == 3}, + {"name": "msvc.json_objops.add_notexist_empty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '', msi.kvpair(ctx, "e", "f"), 'add', 0), + "check": lambda x: x == '{"e": "f"}'}, + {"name": "msvc.json_objops.add_notexist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "f"), 'add', 0), + "check": lambda x: x == '{"a": "b", "e": "f"}'}, + {"name": "msvc.json_objops.add_exist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "g"), 'add', 0), + "check": lambda x: x == '{"a": "b", "e": "g"}'}, + {"name": "msvc.json_objops.get_exist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", ""), 'get', 1), + "check": lambda x: str(x) == "(['c'], ['d'])"}, + {"name": "msvc.json_objops.get_notexist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "e", ""), 'get', 1), + "check": lambda x: str(x) == "(['e'], [''])"}, + {"name": "msvc.json_objops.rm_exist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", "d"), 'rm', 0), + "check": lambda x: x == '{"a": "b"}'}, + {"name": "msvc.json_objops.rm_notexist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", "e"), 'rm', 0), + "check": lambda x: x == '{"a": "b", "c": "d"}'}, + {"name": "msvc.json_objops.set_notexist_empty", + "test": 
lambda ctx: _call_msvc_json_objops(ctx, '', msi.kvpair(ctx, "e", "f"), 'set', 0), + "check": lambda x: x == '{"e": "f"}'}, + {"name": "msvc.json_objops.set_notexist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "f"), 'set', 0), + "check": lambda x: x == '{"a": "b", "e": "f"}'}, + {"name": "msvc.json_objops.set_exist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "g"), 'set', 0), + "check": lambda x: x == '{"a": "b", "e": "g"}'}, + {"name": "msvc.msi_vault_stat.file", + "test": lambda ctx: (_call_msvc_stat_vault(ctx, "dev001_1", "/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public License v3.0.uri"), + _call_msvc_stat_vault(ctx, "dev001_2", "/var/lib/irods/Vault1_2/yoda/licenses/GNU General Public License v3.0.uri")), + "check": lambda x: (x[0][0] == "FILE" and x[0][1] == "45") or (x[1][0] == "FILE" and x[1][1] == "45")}, + {"name": "msvc.msi_vault_stat.dir", + "test": lambda ctx: (_call_msvc_stat_vault(ctx, "dev001_1", "/var/lib/irods/Vault1_1/home"), + _call_msvc_stat_vault(ctx, "dev001_2", "/var/lib/irods/Vault1_2/home")), + "check": lambda x: (x[0][0] == "DIR" and x[0][1] == "0") or (x[1][0] == "DIR" and x[1][1] == "0")}, + {"name": "msvc.msi_vault_stat.notexist", + "test": lambda ctx: _call_msvc_stat_vault(ctx, "dev001_1", "/var/lib/irods/Vault1_1/doesnotexist"), + "check": lambda x: x[0] == "NOTEXIST" and x[1] == "0"}, + {"name": "msvc.msi_vault_stat.resourcenotexist", + "test": lambda ctx: _call_msvc_stat_vault_check_exc(ctx, "doesnotexist", "/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public License v3.0.uri"), + "check": lambda x: x}, + {"name": "msvc.msi_vault_stat.outsidevault1", + "test": lambda ctx: _call_msvc_stat_vault_check_exc(ctx, "dev001_1", "/etc/passwd"), + "check": lambda x: x}, + {"name": "msvc.msi_vault_stat.outsidevault2", + "test": lambda ctx: _call_msvc_stat_vault_check_exc(ctx, "dev001_1", 
"/var/lib/irods/Vault1_2/yoda/licenses/GNU General Public License v3.0.uri"), + "check": lambda x: x}, + {"name": "msvc.msi_file_checksum.file", + "test": lambda ctx: _call_file_checksum_either_resc(ctx, "/var/lib/irods/VaultX/yoda/licenses/GNU General Public License v3.0.txt"), + "check": lambda x: x == "sha2:OXLcl0T2SZ8Pmy2/dmlvKuetivmyPd5m1q+Gyd+zaYY="}, + {"name": "msvc.msi_file_checksum.file_not_exist", + "test": lambda ctx: _call_file_checksum_check_exc(ctx, '/var/lib/irods/Vault1_2/yoda/licenses/doesnotexist.txt', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_file_checksum.resc_not_exist", + "test": lambda ctx: _call_file_checksum_check_exc(ctx, '/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public License v3.0.txt', 'non-existent-resource'), + "check": lambda x: x}, + {"name": "msvc.msi_file_checksum.outside_vault", + "test": lambda ctx: _call_file_checksum_check_exc(ctx, '/etc/passwd', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.dir", + "test": lambda ctx: _call_dir_list(ctx, "/var/lib/irods/Vault1_1/yoda", "dev001_1"), + "check": lambda x: len(x) == len([entry for entry in os.listdir("/var/lib/irods/Vault1_1/yoda") if os.path.isdir("/var/lib/irods/Vault1_1/yoda/" + entry)])}, + {"name": "msvc.msi_dir_list.dir_not_exist", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_2/yoda/doesnotexist', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.file_resc_1", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public License v3.0.txt', 'dev001_1'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.file_resc_2", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_2/yoda/licenses/GNU General Public License v3.0.txt', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.resc_not_exist", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_1/yoda', 
'non-existent-resource'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.outside_vault", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/etc/passwd', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.rmw_avu_collection_literal", + "test": lambda ctx: _test_msvc_rmw_avu_collection(ctx, ("foo", "bar", "baz")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foot", "hand", "head") in x + and len(x) == 2)}, + {"name": "msvc.rmw_avu_object_literal", + "test": lambda ctx: _test_msvc_rmw_avu_object(ctx, ("foo", "bar", "baz")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foot", "hand", "head") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 2 + )}, + {"name": "msvc.rmw_avu_collection_literal_notexist", + "test": lambda ctx: _test_msvc_rmw_avu_collection(ctx, ("does", "not", "exist")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foo", "bar", "baz") in x + and ("foot", "hand", "head") in x + and len(x) == 3)}, + {"name": "msvc.rmw_avu_object_literal_notexist", + "test": lambda ctx: _test_msvc_rmw_avu_object(ctx, ("does", "not", "exist")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foo", "bar", "baz") in x + and ("foot", "hand", "head") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 3 + )}, + {"name": "msvc.rmw_avu_collection_wildcard", + "test": lambda ctx: _test_msvc_rmw_avu_collection(ctx, ("fo%", "%", "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len(x) == 1)}, + {"name": "msvc.rmw_avu_object_wildcard", + "test": lambda ctx: _test_msvc_rmw_avu_object(ctx, ("fo%", "%", "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.set_from_coll.catch.yes", + "test": lambda ctx: _test_avu_set_collection(ctx, True), + "check": lambda x: (("foo", "bar", "") in x + and len([a for a in x if a[0] not in 
["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.set_from_coll.catch.no", + "test": lambda ctx: _test_avu_set_collection(ctx, False), + "check": lambda x: (("foo", "bar", "") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.rmw_from_coll_wildcard.catch.yes", + "test": lambda ctx: _test_avu_rmw_collection(ctx, ("foo", "%", True, "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.rmw_from_coll_wildcard.catch.no", + "test": lambda ctx: _test_avu_rmw_collection(ctx, ("foo", "%", False, "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.get_attr_val_of_coll.exists.yes", + "test": lambda ctx: _test_avu_get_attr_val_of_coll(ctx, "foo", "bar"), + "check": lambda x: x == "bar"}, + {"name": "avu.get_attr_val_of_coll.exists.no", + "test": lambda ctx: _test_avu_get_attr_val_of_coll_exception(ctx), + "check": lambda x: x}, + {"name": "avu.apply_atomic_operations.collection", + "test": lambda ctx: _test_msvc_apply_atomic_operations_collection(ctx), + "check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)}, + {"name": "avu.apply_atomic_operations.object", + "test": lambda ctx: _test_msvc_apply_atomic_operations_object(ctx), + "check": lambda x: (("foo", "bar", "baz") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.apply_atomic_operations.invalid", + "test": lambda ctx: avu.apply_atomic_operations(ctx, {"inspector": "gadget"}), + "check": lambda x: not x}, + {"name": "data_access_token.get_all_tokens", + "test": lambda ctx: data_access_token.get_all_tokens(ctx), + "check": lambda x: isinstance(x, list)}, + {"name": "folder.set_can_modify", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.set_can_modify), + "check": lambda x: x}, + {"name": 
"folder.cronjob_status", + "test": lambda ctx: _test_folder_cronjob_status(ctx), + "check": lambda x: x[0] and x[1] and x[2]}, + {"name": "folder.set_get_last_run_time", + "test": lambda ctx: _test_folder_set_get_last_run(ctx), + "check": lambda x: x[0] and x[1] and x[2] + 25 >= int(time.time())}, + {"name": "folder.set_last_run_time", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.set_last_run_time), + "check": lambda x: x}, + {"name": "folder.check_folder_secure", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.check_folder_secure), + "check": lambda x: x}, + {"name": "folder.folder_secure_fail", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.folder_secure_fail), + "check": lambda x: x}, + {"name": "folder.set_retry_avus", + "test": lambda ctx: _test_folder_set_retry_avus(ctx), + "check": lambda x: x}, + {"name": "folder.determine_new_vault_target.research", + "test": lambda ctx: folder.determine_new_vault_target(ctx, "/tempZone/home/research-initial/testdata"), + "check": lambda x: re.match("^\/tempZone\/home\/vault-initial\/testdata\[[0-9]*\]$", x) is not None}, + {"name": "folder.determine_new_vault_target.deposit", + "test": lambda ctx: folder.determine_new_vault_target(ctx, "/tempZone/home/deposit-pilot/deposit-hi[123123]"), + "check": lambda x: re.match("^\/tempZone\/home\/vault-pilot\/deposit-hi\[[0-9]*\]\[[0-9]*\]$", x) is not None}, + {"name": "folder.determine_new_vault_target.invalid", + "test": lambda ctx: folder.determine_new_vault_target(ctx, "/tempZone/home/not-research-group-not-exist/folder-not-exist"), + "check": lambda x: x == ""}, + {"name": "groups.rule_group_expiration_date_validate.1", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("", ""), + "check": lambda x: x['arguments'][1] == 'true'}, + {"name": "groups.rule_group_expiration_date_validate.2", + "test": lambda ctx: ctx.rule_group_expiration_date_validate(".", ""), + "check": lambda x: x['arguments'][1] == 'true'}, + {"name": 
"groups.rule_group_expiration_date_validate.3", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("abc", ""), + "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "groups.rule_group_expiration_date_validate.4", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("2020-02-02", ""), + "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "groups.rule_group_expiration_date_validate.5", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("2044-01-32", ""), + "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "groups.rule_group_expiration_date_validate.6", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("2044-02-26", ""), + "check": lambda x: x['arguments'][1] == 'true'}, + {"name": "meta.get_latest_vault_metadata_path.empty", + "test": lambda ctx: _test_get_latest_vault_metadata_path_empty(ctx), + "check": lambda x: x}, + {"name": "meta.get_latest_vault_metadata_path.normal", + "test": lambda ctx: _test_get_latest_vault_metadata_path_normal(ctx), + "check": lambda x: x}, {"name": "policies.check_anonymous_access_allowed.local", "test": lambda ctx: ctx.rule_check_anonymous_access_allowed("127.0.0.1", ""), "check": lambda x: x['arguments'][1] == 'true'}, {"name": "policies.check_anonymous_access_allowed.remote", "test": lambda ctx: ctx.rule_check_anonymous_access_allowed("1.2.3.4", ""), "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "policies.check_max_connections_exceeded", + "test": lambda ctx: ctx.rule_check_max_connections_exceeded(""), + # This rule should always return 'false' for user 'rods' + "check": lambda x: x['arguments'][0] == 'false'}, # Vault metadata schema report: only check return value type, not contents {"name": "schema_transformation.batch_vault_metadata_schema_report", "test": lambda ctx: ctx.rule_batch_vault_metadata_schema_report(""), @@ -40,15 +570,64 @@ {"name": "util.data_object.exists.no", "test": lambda ctx: data_object.exists(ctx, 
"/tempZone/home/research-initial/testdata/doesnotexist.txt"), "check": lambda x: not x}, + {"name": "util.data_object.get_properties.by_data_name", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_NAME"] == "lorem.txt"}, + {"name": "util.data_object.get_properties.by_modify_time", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_MODIFY_TIME"].isdigit()}, + {"name": "util.data_object.get_properties.by_owner_name", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_OWNER_NAME"] == "rods"}, + {"name": "util.data_object.get_properties.by_coll_name", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["COLL_NAME"] == "/tempZone/home/research-initial/testdata"}, + {"name": "util.data_object.get_properties.by_coll_id", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["COLL_ID"].isdigit()}, + {"name": "util.data_object.get_properties.by_data_resc_hier", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_RESC_HIER"].startswith('irodsResc')}, + {"name": "util.data_object.get_properties.by_data_size", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_SIZE"].isdigit()}, + # Using the 
resource_id as data_id to ensure no existing data object uses this occupied identifier + {"name": "util.data_object.get_properties.no_data_object", + "test": lambda ctx: data_object.get_properties(ctx, resource.id_from_name(ctx, "irodsResc"), "irodsResc"), + "check": lambda x: x is None}, {"name": "util.data_object.owner", "test": lambda ctx: data_object.owner(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "check": lambda x: x == ('rods', 'tempZone')}, {"name": "util.data_object.size", "test": lambda ctx: data_object.size(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "check": lambda x: x == 1003240}, + {"name": "util.data_object.to_from_id", + "test": lambda ctx: data_object.name_from_id(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt")), + "check": lambda x: x == "/tempZone/home/research-initial/testdata/lorem.txt"}, {"name": "util.data_object.get_group_owners", "test": lambda ctx: data_object.get_group_owners(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "check": lambda x: x == [['research-initial', 'tempZone']]}, + {"name": "util.group.exists.yes", + "test": lambda ctx: group.exists(ctx, "research-initial"), + "check": lambda x: x}, + {"name": "util.group.exists.no", + "test": lambda ctx: group.exists(ctx, "research-doesnotexist"), + "check": lambda x: not x}, + {"name": "util.group.get_category", + "test": lambda ctx: group.get_category(ctx, "research-initial"), + "check": lambda x: x == "test-automation"}, + {"name": "util.group.is_member.yes", + "test": lambda ctx: group.is_member(ctx, "research-initial", "researcher"), + "check": lambda x: x}, + {"name": "util.group.is_member.no", + "test": lambda ctx: group.is_member(ctx, "research-initial", "rods"), + "check": lambda x: not x}, + {"name": "util.group.members.normal", + "test": lambda ctx: group.members(ctx, "research-initial"), + "check": lambda x: sorted([member for member in x]) == sorted([('functionaladminpriv', 'tempZone'), 
('functionaladminpriv@yoda.test', 'tempZone'), ('groupmanager', 'tempZone'), ('groupmanager@yoda.test', 'tempZone'), ('researcher', 'tempZone'), ('researcher@yoda.test', 'tempZone')])}, + {"name": "util.group.members.doesnotexist", + "test": lambda ctx: user.exists(ctx, "research-doesnotexist"), + "check": lambda x: x is False}, {"name": "util.resource.exists.yes", "test": lambda ctx: resource.exists(ctx, "irodsResc"), "check": lambda x: x}, @@ -91,6 +670,9 @@ {"name": "util.user.is_member_of.no", "test": lambda ctx: user.is_member_of(ctx, "research-initial", "datamanager"), "check": lambda x: not x}, + {"name": "util.user.number_of_connection", + "test": lambda ctx: user.number_of_connections(ctx), + "check": lambda x: isinstance(x, int) and x > 0}, {"name": "util.user.usertype.rodsadmin", "test": lambda ctx: user.user_type(ctx, "rods"), "check": lambda x: x == "rodsadmin"}, @@ -100,13 +682,17 @@ ] -@rule.make(inputs=[], outputs=[0]) -def rule_run_integration_tests(ctx): +@rule.make(inputs=[0], outputs=[1]) +def rule_run_integration_tests(ctx, tests): """This function runs the integration tests. It must be run by a rodsadmin user on a development environment. It assumes the standard test data is present. :param ctx: Combined type of a callback and rei struct + :param tests: Indicates which tests to run: + - Empty string means all tests + - String ending with '*' means all tests that start with a prefix, e.g. 'util.user.*' + - Otherwise the string should be the exact name of a test :returns: string with test results. Each line has one test name and its verdict. 
""" @@ -126,8 +712,14 @@ def rule_run_integration_tests(ctx): name = testconfig["name"] test = testconfig["test"] check = testconfig["check"] + exception = False + if (tests != "" + and tests != name + and not (tests.endswith("*") and name.startswith(tests[0:-1]))): + continue + try: result = test(ctx) except BaseException: @@ -144,3 +736,45 @@ def rule_run_integration_tests(ctx): return_value += name + " " + verdict + "\n" return return_value + + +def _call_file_checksum_either_resc(ctx, filename): + """Returns result of file checksum microservice for either of the + two main UFS resources (dev001_1, dev001_2). If one returns an + exception, we try the other. + + :param ctx: combined type of a callback and rei struct + :param filename: name of file to checksum + + :returns: output of file checksum microservice + """ + try: + vault_filename = filename.replace("VaultX", "Vault1_1") + ret = msi.file_checksum(ctx, vault_filename, 'dev001_1', '') + except Exception: + vault_filename = filename.replace("VaultX", "Vault1_2") + ret = msi.file_checksum(ctx, vault_filename, 'dev001_2', '') + return ret['arguments'][2] + + +def _call_file_checksum_check_exc(ctx, filename, resc_name): + """Verifies whether a call to the file checksum microservice raises an exception""" + try: + msi.file_checksum(ctx, filename, resc_name, '') + return False + except Exception: + return True + + +def _call_dir_list(ctx, dirname, resc_name): + ret = msi.dir_list(ctx, dirname, resc_name, "") + print(ret['arguments'][2]) + return json.loads(ret['arguments'][2]) + + +def _call_dir_list_check_exc(ctx, dirname, resc_name): + try: + msi.dir_list(ctx, dirname, resc_name, "") + return False + except Exception: + return True diff --git a/json_datacite.py b/json_datacite.py index b1e248a1b..04bbefafa 100644 --- a/json_datacite.py +++ b/json_datacite.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Functions for transforming Yoda JSON to DataCite 4.4 JSON.""" -__copyright__ = 'Copyright (c) 2019-2023, Utrecht 
University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' from dateutil import parser @@ -218,13 +218,18 @@ def get_funders(combi): def get_creators(combi): - """Return creator information in datacite format.""" + """Return creator information in DataCite format. + + :param combi: Combined JSON file that holds both user and system metadata + + :returns: JSON element with creators in DataCite format + """ all_creators = [] for creator in combi.get('Creator', []): affiliations = [] for aff in creator.get('Affiliation', []): - if isinstance(aff, dict): + if isinstance(aff, dict) and len(aff) > 0: if "Affiliation_Identifier" in aff and len(aff["Affiliation_Identifier"]): affiliations.append({"name": aff['Affiliation_Name'], "affiliationIdentifier": '{}'.format(aff['Affiliation_Identifier']), @@ -255,14 +260,14 @@ def get_contributors(combi): :param combi: Combined JSON file that holds both user and system metadata - :returns: XML element with contributors in DataCite format + :returns: JSON element with contributors in DataCite format """ all = [] # 1) Contributor for person in combi.get('Contributor', []): affiliations = [] for aff in person.get('Affiliation', []): - if isinstance(aff, dict) and len(aff): + if isinstance(aff, dict) and len(aff) > 0: if "Affiliation_Identifier" in aff and len(aff["Affiliation_Identifier"]): affiliations.append({"name": aff['Affiliation_Name'], "affiliationIdentifier": '{}'.format(aff['Affiliation_Identifier']), diff --git a/meta.py b/meta.py index eb329a03a..a91f367e4 100644 --- a/meta.py +++ b/meta.py @@ -14,6 +14,7 @@ from deepdiff import DeepDiff import avu_json +import meta_form import provenance import publication import schema as schema_ @@ -709,4 +710,51 @@ def copy_user_metadata(ctx, source, target): log.write(ctx, "rule_copy_user_metadata: copied user metadata from <{}> to <{}>".format(source, target)) except Exception: - log.write(ctx, "rule_copy_user_metadata: failed to 
copy user metadata from <{}> to <{}>".format(source, target)) + log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target)) + + +def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name, write_stdout): + """Process a single data package to retrieve and validate that its metadata conforms to the schema. + + :param ctx: Combined type of a callback and rei struct + :param coll_name: String representing the data package collection path. + :param schema_cache: Dictionary storing schema blueprints, can be empty. + :param report_name: Name of report script (for logging) + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A dictionary result containing if schema matches and the schema short name. + """ + metadata_path = get_latest_vault_metadata_path(ctx, coll_name) + + if not metadata_path: + log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name), write_stdout) + return None + + try: + metadata = jsonutil.read(ctx, metadata_path) + except Exception as exc: + log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)), write_stdout) + log.write(ctx, "vault_metadata_matches_schema: Error while reading metadata file {} of data package {}: {}".format(metadata_path, coll_name, str(exc)), write_stdout) + return None + + # Determine schema + schema_id = schema_.get_schema_id(ctx, metadata_path) + schema_shortname = schema_id.split("/")[-2] + + # Retrieve schema and cache it for future use + schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id) + if schema_shortname in schema_cache: + schema_contents = schema_cache[schema_shortname] + else: + schema_contents = jsonutil.read(ctx, schema_path) + schema_cache[schema_shortname] = schema_contents + + # Check whether metadata matches schema and log any errors + error_list 
= get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents) + match_schema = len(error_list) == 0 + if not match_schema: + errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list] + log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)), write_stdout) + log.write(ctx, "vault_metadata_matches_schema: Metadata {} of data package {} did not match the schema {}. Error list: {}".format(metadata_path, coll_name, schema_shortname, str(errors_formatted)), write_stdout) + + return {"schema": schema_shortname, "match_schema": match_schema} diff --git a/meta_form.py b/meta_form.py index b322926ca..2de46d63f 100644 --- a/meta_form.py +++ b/meta_form.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """JSON metadata form handling.""" -__copyright__ = 'Copyright (c) 2019-2022, Utrecht University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import re @@ -150,7 +150,8 @@ def load(ctx, coll): if space in [pathutil.Space.RESEARCH, pathutil.Space.DEPOSIT]: is_locked = folder.is_locked(ctx, coll, org_metadata) - can_edit = is_member and not is_locked + # Do not allow editing of files in folders with apostrophes in name + can_edit = is_member and not is_locked and '\'' not in subpath # Analyze a possibly existing metadata JSON file. 
meta_path = meta.get_collection_metadata_path(ctx, coll) @@ -229,9 +230,10 @@ def load(ctx, coll): status = vault.get_coll_vault_status(ctx, coll, org_metadata) can_edit = (groups.user_is_datamanager(ctx, category, user_full_name) - and (status == constants.vault_package_state.UNPUBLISHED - or status == constants.vault_package_state.PUBLISHED - or status == constants.vault_package_state.DEPUBLISHED)) + and status in (constants.vault_package_state.UNPUBLISHED, + constants.vault_package_state.PUBLISHED, + constants.vault_package_state.DEPUBLISHED) + and '\'' not in subpath) meta_path = meta.get_latest_vault_metadata_path(ctx, coll) if meta_path is None: @@ -294,7 +296,6 @@ def save(ctx, coll, metadata): is_vault = space is pathutil.Space.VAULT if is_vault: # It's a vault path - set up a staging area in the datamanager collection. - ret = ctx.iiDatamanagerGroupFromVaultGroup(group, '') datamanager_group = ret['arguments'][1] if datamanager_group == '': @@ -310,6 +311,9 @@ def save(ctx, coll, metadata): # Use staging area instead of trying to write to the vault directly. json_path = '{}/{}'.format(tmp_coll, constants.IIJSONMETADATA) + # Remove empty objects from metadata. + metadata = misc.remove_empty_objects(metadata) + # Add metadata schema id to JSON. meta.metadata_set_schema_id(metadata, schema_.get_active_schema_id(ctx, json_path)) diff --git a/notifications.py b/notifications.py index 8d3015168..babf34997 100644 --- a/notifications.py +++ b/notifications.py @@ -481,9 +481,10 @@ def rule_process_data_access_token_expiry(ctx): exp_time = datetime.strptime(token['exp_time'], '%Y-%m-%d %H:%M:%S.%f') date_exp_time = exp_time - timedelta(hours=config.token_expiration_notification) r = relativedelta.relativedelta(date_exp_time, datetime.now().date()) + total_hours = r.years * 12 * 30 * 24 + r.months * 30 * 24 + r.days * 24 + r.hours - # Send notification if token expires in less than a day. 
- if r.years == 0 and r.months == 0 and r.days <= 1: + # Send notification if token expires in less than configured hours. + if total_hours <= config.token_expiration_notification: actor = 'system' target = str(user.from_str(ctx, token['user'])) message = "Data access password with label <{}> is expiring".format(token["label"]) diff --git a/policies.py b/policies.py index 06cd50cc7..52b4ce092 100644 --- a/policies.py +++ b/policies.py @@ -445,7 +445,7 @@ def py_acPreProcForModifyAVUMetadata_cp(ctx, _, t_src, t_dst, src, dst): return policy.succeed() -# This PEP is called after a AVU is added (option = 'add'), set (option = +# This PEP is called after an AVU is added (option = 'add'), set (option = # 'set') or removed (option = 'rm') in the research area or the vault. Post # conditions defined in folder.py and iiVaultTransitions.r # are called here. diff --git a/policies_folder_status.py b/policies_folder_status.py index 2e64b28f7..a2f00869b 100644 --- a/policies_folder_status.py +++ b/policies_folder_status.py @@ -131,7 +131,7 @@ def post_status_transition(ctx, path, actor, status): # Set state to secure package in vault space. attribute = constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault" avu.set_on_coll(ctx, path, attribute, constants.CRONJOB_STATE['PENDING']) - ctx.iiScheduleCopyToVault() + ctx.iiScheduleCollCopyToVault(path) elif status is constants.research_package_state.FOLDER: # If previous action was submit and new status is FOLDER action is unsubmit. 
diff --git a/publication.py b/publication.py index 2ac7e4d75..33ed10b0a 100644 --- a/publication.py +++ b/publication.py @@ -41,7 +41,7 @@ def get_publication_config(ctx): "davrods_anonymous_vhost": "davrodsAnonymousVHost", "publication_verbose_mode": "verboseMode"} optional_keys = ["publication_verbose_mode"] - configKeys = {} + config_keys = {} found_attrs = [] prefix_length = len(constants.UUORGMETADATAPREFIX) @@ -58,7 +58,7 @@ def get_publication_config(ctx): try: found_attrs.append(attr) - configKeys[attr2keys[attr]] = val + config_keys[attr2keys[attr]] = val except KeyError: continue @@ -67,7 +67,7 @@ def get_publication_config(ctx): if key not in found_attrs and key not in optional_keys: log.write(ctx, 'Missing config key ' + key) - return configKeys + return config_keys def generate_combi_json(ctx, publication_config, publication_state): @@ -151,8 +151,8 @@ def get_publication_state(ctx, vault_package): publ_metadata = get_collection_metadata(ctx, vault_package, constants.UUORGMETADATAPREFIX + 'publication_') # Take over all actual values as saved earlier. - for key in publ_metadata.keys(): - publication_state[key] = publ_metadata[key] + for key, value in publ_metadata.items(): + publication_state[key] = value # Handle access restriction. iter = genquery.row_iterator( @@ -300,7 +300,7 @@ def get_last_modified_datetime(ctx, vault_package): return my_date.strftime('%Y-%m-%dT%H:%M:%S.%f%z') -def generate_preliminary_DOI(ctx, publication_config, publication_state): +def generate_preliminary_doi(ctx, publication_config, publication_state): """Generate a Preliminary DOI. Preliminary, because we check for collision later. 
:param ctx: Combined type of a callback and rei struct @@ -316,7 +316,7 @@ def generate_preliminary_DOI(ctx, publication_config, publication_state): publication_state["versionDOI"] = dataCitePrefix + "/" + yodaPrefix + "-" + randomId -def generate_base_DOI(ctx, publication_config, publication_state): +def generate_base_doi(ctx, publication_config, publication_state): """Generate a base DOI. :param ctx: Combined type of a callback and rei struct @@ -666,17 +666,17 @@ def check_doi_availability(ctx, publication_state, type_flag): :param publication_state: Dict with state of the publication process :param type_flag: Flag indicating DOI type ('version' or 'base') """ - DOI = publication_state[type_flag + "DOI"] + doi = publication_state[type_flag + "DOI"] try: - httpCode = datacite.metadata_get(ctx, DOI) + http_code = datacite.metadata_get(ctx, doi) - if httpCode == 404: + if http_code == 404: publication_state[type_flag + "DOIAvailable"] = "yes" - elif httpCode in [401, 403, 500, 503, 504]: + elif http_code in [401, 403, 500, 503, 504]: # request failed, worth a retry publication_state["status"] = "Retry" - elif httpCode in [200, 204]: + elif http_code in [200, 204]: # DOI already in use publication_state[type_flag + "DOIAvailable"] = "no" publication_state["status"] = "Retry" @@ -745,13 +745,14 @@ def process_publication(ctx, vault_package): if "baseDOI" in previous_publication_state: # Set the link to previous publication state publication_state["baseDOI"] = previous_publication_state["baseDOI"] + publication_state["baseDOIMinted"] = previous_publication_state["baseDOIMinted"] publication_state["baseRandomId"] = previous_publication_state["baseRandomId"] # Create base DOI if it does not exist in the previous publication state. 
elif "baseDOI" not in previous_publication_state: log.write(ctx, "Creating base DOI for the vault package <{}>".format(vault_package)) try: - generate_base_DOI(ctx, publication_config, publication_state) + generate_base_doi(ctx, publication_config, publication_state) check_doi_availability(ctx, publication_state, 'base') publication_state["baseDOIMinted"] = 'no' # Set the link to previous publication state @@ -764,7 +765,7 @@ def process_publication(ctx, vault_package): save_publication_state(ctx, previous_vault_package, previous_publication_state) save_publication_state(ctx, vault_package, publication_state) - if status in ["Retry"]: + if status == "Retry": if verbose: log.write(ctx, "Error status for creating base DOI: " + status) return status @@ -779,7 +780,7 @@ def process_publication(ctx, vault_package): if "versionDOI" not in publication_state: if verbose: log.write(ctx, "Generating preliminary DOI.") - generate_preliminary_DOI(ctx, publication_config, publication_state) + generate_preliminary_doi(ctx, publication_config, publication_state) save_publication_state(ctx, vault_package, publication_state) @@ -788,7 +789,7 @@ def process_publication(ctx, vault_package): if verbose: log.write(ctx, "Version DOI available: no") log.write(ctx, "Generating preliminary DOI.") - generate_preliminary_DOI(ctx, publication_config, publication_state) + generate_preliminary_doi(ctx, publication_config, publication_state) publication_state["combiJsonPath"] = "" publication_state["dataCiteJsonPath"] = "" @@ -857,11 +858,8 @@ def process_publication(ctx, vault_package): # Determine whether an update ('put') or create ('post') message has to be sent to datacite datacite_action = 'post' - try: - if publication_state['versionDOIMinted'] == 'yes': - datacite_action = 'put' - except KeyError: - pass + if publication_state.get('versionDOIMinted') == 'yes': + datacite_action = 'put' # Send DataCite JSON to metadata end point if "dataCiteMetadataPosted" not in publication_state: @@ 
-874,7 +872,7 @@ def process_publication(ctx, vault_package): if update_base_doi: base_doi = None datacite_action = 'post' - if publication_state['baseDOIMinted'] == 'yes': + if publication_state.get('baseDOIMinted') == 'yes': datacite_action = 'put' if verbose: log.write(ctx, "Updating base DOI.") @@ -1329,7 +1327,32 @@ def rule_update_publication(ctx, vault_package, update_datacite, update_landingp :returns: "OK" if all went ok """ - return update_publication(ctx, vault_package, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes') + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is no rodsadmin", True) + return + + log.write(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package), True) + collections = genquery.row_iterator( + "COLL_NAME", + "COLL_NAME like '%%/home/vault-%%' " + "AND META_COLL_ATTR_NAME = '" + constants.UUORGMETADATAPREFIX + "vault_status' " + "AND META_COLL_ATTR_VALUE = '{}'".format(str(constants.vault_package_state.PUBLISHED)), + genquery.AS_LIST, + ctx + ) + + packages_found = False + for collection in collections: + coll_name = collection[0] + if ((vault_package == '*' and re.match(r'/[^/]+/home/vault-.*', coll_name)) or (vault_package != '*' and re.match(r'/[^/]+/home/vault-.*', coll_name) and coll_name == vault_package)): + packages_found = True + output = update_publication(ctx, coll_name, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes') + log.write(ctx, coll_name + ': ' + output, True) + + if not packages_found: + log.write(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package), True) + else: + log.write(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package), True) def update_publication(ctx, vault_package, update_datacite=False, update_landingpage=False, update_moai=False): @@ -1505,7 +1528,7 @@ def get_all_versions(ctx, path, doi): :param ctx: Combined type of a callback and rei struct :param path: Path of the published data 
package - :param doi: Version DOI of the selected publication + :param doi: Base DOI of the selected publication :return: Dict of related version DOIs """ diff --git a/publication_troubleshoot.py b/publication_troubleshoot.py new file mode 100644 index 000000000..6ceafe737 --- /dev/null +++ b/publication_troubleshoot.py @@ -0,0 +1,442 @@ +# -*- coding: utf-8 -*- +"""Functions and rules for troubleshooting published data packages.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +__all__ = [ + 'api_batch_troubleshoot_published_data_packages', + 'rule_batch_troubleshoot_published_data_packages' +] + +import json +from datetime import datetime + +import genquery +import requests +import urllib3 + +import datacite +from meta import vault_metadata_matches_schema +from publication import get_publication_config +from util import * + + +def find_full_package_path(ctx, package_name, write_stdout): + """ + Find the full path of a data package based on its short name. + + :param ctx: Combined type of a callback and rei struct + :param package_name: The short name of the data package to find. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: The full path of the first collection whose name contains package_name, otherwise None. + """ + try: + query_condition = ( + "COLL_NAME like '%{}%'".format(package_name) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Return the path of the first matching collection, if any + for row in iter: + return row[0] + except Exception as e: + log.write(ctx, "find_full_package_path: An error occurred while executing the query: {}".format(e), write_stdout) + return None + + +def find_data_packages(ctx, write_stdout): + """ + Find all vault data packages that have a publication status AVU, regardless of its value.
+ + :param ctx: Combined type of a callback and rei struct + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A list of vault collection names that have a publication status AVU (empty list if the query fails) + """ + user_zone = user.zone(ctx) + + try: + # Get all the vault packages that have org_publication_status in metadata + query_condition = ( + "COLL_NAME like '/{}/home/vault-%' AND " + "META_COLL_ATTR_NAME = '{}publication_status'".format(user_zone, constants.UUORGMETADATAPREFIX) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Collecting only the collection names + return [row[0] for row in iter] + + except Exception as e: + log.write(ctx, "find_data_packages: An error occurred while executing the query: {}".format(e), write_stdout) + return [] + + +def check_print_data_package_system_avus(ctx, data_package, write_stdout): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e., 'org_'). + This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. + This also reports any missing or unexpected AVUs. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path.
+ :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A 2-tuple of booleans: (no missing AVUs, no unexpected AVUs) + """ + extracted_avus = avu.of_coll(ctx, data_package) + results = misc.check_data_package_system_avus(extracted_avus) + + if not results["no_missing_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some missing AVUs in data package <{}> - {}".format(data_package, list(results["missing_avus"])), write_stdout) + + if not results["no_unexpected_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some unexpected AVUs in data package <{}> - {}".format(data_package, list(results["unexpected_avus"])), write_stdout) + + return (results["no_missing_avus"], results["no_unexpected_avus"]) + + +def check_one_datacite_doi_reg(ctx, data_package, doi_name, write_stdout): + try: + doi = get_val_for_attr_with_pub_prefix(ctx, data_package, doi_name) + except ValueError as e: + log.write(ctx, "check_datacite_doi_registration: Error while trying to get {} - {}".format(doi_name, e), write_stdout) + return False + + status_code = datacite.metadata_get(ctx, doi) + return status_code == 200 + + +def check_datacite_doi_registration(ctx, data_package, write_stdout): + """ + Check the registration status of both versionDOI and baseDOI with the DataCite API, + ensuring that both DOIs return a 200 status code, which indicates successful registration. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A tuple of booleans indicating check success or not (base doi check may be None if not relevant).
+ """ + version_doi_check = check_one_datacite_doi_reg(ctx, data_package, "versionDOI", write_stdout) + + previous_version = '' + try: + previous_version = get_val_for_attr_with_pub_prefix(ctx, data_package, "previous_version") + except Exception: + pass + + if previous_version: + base_doi_check = check_one_datacite_doi_reg(ctx, data_package, "baseDOI", write_stdout) + return version_doi_check, base_doi_check + + return (version_doi_check, None) + + +def get_val_for_attr_with_pub_prefix(ctx, data_package, attribute_suffix): + """ + Retrieves the value given the suffix of the attribute from a data package. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param attribute_suffix: Suffix of the attribute before adding prefix such as "org_publication_" + + :returns: Value of the attribute. + """ + attr = constants.UUORGMETADATAPREFIX + "publication_" + attribute_suffix + return avu.get_attr_val_of_coll(ctx, data_package, attr) + + +def get_landingpage_paths(ctx, data_package, write_stdout): + """Given a data package get what the path and remote url should be""" + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPagePath") + url = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPageUrl") + return file_path, url + + except Exception: + log.write(ctx, "get_landingpage_paths: Could not find landing page for data package: {}".format(data_package), write_stdout) + return '', '' + + +def compare_local_remote_landingpage(ctx, file_path, url, offline, api_call): + """ + Compares file contents between a file in irods and its remote version to verify their integrity. 
+ + :param ctx: Combined type of a callback and rei struct + :param file_path: Path to file in irods + :param url: URL of file on remote + :param offline: Whether to skip requests.get call + :param api_call: Boolean representing whether was called by api and not a script + + :returns: True if the file contents match, False otherwise + """ + write_stdout = not api_call + # Local/irods file + if api_call: + # If called by technicaladmin, only check that the file exists since we don't have access to the contents + return data_object.exists(ctx, file_path) + else: + try: + local_data = data_object.read(ctx, file_path) + except Exception: + log.write(ctx, "compare_local_remote_landingpage: Local file not found at path {}.".format(file_path), write_stdout) + return False + + if offline: + return len(local_data) > 0 + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "compare_local_remote_landingpage: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "compare_local_remote_landingpage: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "compare_local_remote_landingpage: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Set encoding to utf-8 for the response text (otherwise will not match local_data) + # response.text is then returned as unicode + response.encoding = 'utf-8' + local_data_uni = local_data.decode("utf-8") + + if local_data_uni == response.text: + return True + + log.write(ctx, "compare_local_remote_landingpage: File contents at irods path <{}> and remote landing page <{}> do not match.".format(file_path, url), write_stdout) + return False + + +def check_landingpage(ctx, data_package, offline, api_call): + """ + Checks the integrity of landing page by comparing the contents + + :param ctx: 
Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param offline: Whether to skip any checks that require external server access + :param api_call: Boolean of whether this is for an api call version of the troubleshooting script + + :returns: True if the landing page check passed, False otherwise + """ + irods_file_path, landing_page_url = get_landingpage_paths(ctx, data_package, not api_call) + if len(irods_file_path) == 0 or len(landing_page_url) == 0: + return False + + return compare_local_remote_landingpage(ctx, irods_file_path, landing_page_url, offline, api_call) + + +def check_combi_json(ctx, data_package, publication_config, offline, write_stdout): + """ + Checks the integrity of combi JSON by checking URL and existence of file. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param publication_config: Dictionary of publication config + :param offline: Whether to skip any checks that require external server access + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: True if the combi JSON checks passed, False otherwise + """ + # Check that the combi json in irods exists + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "combiJsonPath") + except Exception: + pass + exists = data_object.exists(ctx, file_path) + if not exists: + log.write(ctx, "check_combi_json: combi JSON file in irods does not exist: {}".format(file_path), write_stdout) + return False + + if offline: + return True + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + # Get the version doi + version_doi = '' + try: + version_doi = get_val_for_attr_with_pub_prefix(ctx, data_package, "versionDOI") + except Exception: + pass + url =
"https://{}/oai/oai?verb=GetRecord&metadataPrefix=oai_datacite&identifier=oai:{}".format(publication_config["publicVHost"], version_doi) + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "check_combi_json: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "check_combi_json: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "check_combi_json: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Look at the first few parts of the response for signs of error. + if "idDoesNotExist" in response.text[:5000]: + log.write(ctx, "check_combi_json: combiJson not found in oai for data package <{}>".format(data_package), write_stdout) + return False + + return True + + +def print_troubleshoot_result(ctx, data_package, result, datacite_check): + """Print the result of troubleshooting one package in human-friendly format""" + pass_all_tests = all(result.values()) + + log.write(ctx, "Results for: {}".format(data_package), True) + if pass_all_tests: + log.write(ctx, "Package passed all tests.", True) + else: + log.write(ctx, "Package FAILED one or more tests:", True) + log.write(ctx, "Schema matches: {}".format(result['schema_check']), True) + log.write(ctx, "All expected AVUs exist: {}".format(result['no_missing_AVUs_check']), True) + log.write(ctx, "No unexpected AVUs: {}".format(result['no_unexpected_AVUs_check']), True) + + if datacite_check: + log.write(ctx, "Version DOI matches: {}".format(result['versionDOI_check']), True) + if 'baseDOI_check' in result: + log.write(ctx, "Base DOI matches: {}".format(result['baseDOI_check']), True) + + log.write(ctx, "Landing page matches: {}".format(result['landingPage_check']), True) + log.write(ctx, "Combined JSON matches: {}".format(result['combiJson_check']), True) + + log.write(ctx, "", True) + + +def collect_troubleshoot_data_packages(ctx, 
requested_package, write_stdout): + data_packages = [] + + if requested_package == 'None': + # Retrieve all data packages + all_packages = find_data_packages(ctx, write_stdout) + if not all_packages: + log.write(ctx, "collect_troubleshoot_data_packages: No packages found.", write_stdout) + return None + + data_packages = all_packages + else: + # Get full path of the given package + full_package_path = find_full_package_path(ctx, requested_package, write_stdout) + + if not full_package_path: + log.write(ctx, "collect_troubleshoot_data_packages: Data package '{}' cannot be found.".format(requested_package), write_stdout) + return None + + data_packages.append(full_package_path) + + return data_packages + + +def batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, api_call, check_datacite): + """ + Troubleshoots published data packages. + + :param ctx: Context that combines a callback and rei struct. + :param requested_package: A string naming a specific data package, or 'None' to check all packages that have a publication status. + :param log_file: A boolean representing whether to also write the results to a log file. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. + :param api_call: Boolean of whether this is run by an api test (True) or by a script (False). + :param check_datacite: Boolean representing whether to do the datacite checks + + :returns: A dictionary mapping each data package path to a dictionary of its check results (empty if nothing was checked).
+ """ + write_stdout = not api_call + # Check permissions - rodsadmin only + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is not rodsadmin", write_stdout) + return {} + + data_packages = collect_troubleshoot_data_packages(ctx, requested_package, write_stdout) + if not data_packages: + return {} + schema_cache = {} + results = {} + + # Troubleshooting + for data_package in data_packages: + log.write(ctx, "Troubleshooting data package: {}".format(data_package), write_stdout) + result = {} + # Cannot check the metadata as technicaladmin + if not api_call: + schema_check_dict = vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-publications", write_stdout) + result['schema_check'] = schema_check_dict['match_schema'] if schema_check_dict else False + + result['no_missing_AVUs_check'], result['no_unexpected_AVUs_check'] = check_print_data_package_system_avus(ctx, data_package, write_stdout) + + # Only check datacite if enabled + if check_datacite: + result['versionDOI_check'], base_doi_check = check_datacite_doi_registration(ctx, data_package, write_stdout) + if base_doi_check is not None: + result['baseDOI_check'] = base_doi_check + + result['landingPage_check'] = check_landingpage(ctx, data_package, offline, api_call) + publication_config = get_publication_config(ctx) + result['combiJson_check'] = check_combi_json(ctx, data_package, publication_config, offline, write_stdout) + + results[data_package] = result + + if not api_call: + print_troubleshoot_result(ctx, data_package, result, check_datacite) + + if log_file: + log_loc = "/var/lib/irods/log/troubleshoot_publications.log" + with open(log_loc, "a") as writer: + writer.writelines("Batch run date and time: {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))) + writer.writelines('\n') + writer.writelines("Troubleshooting data package: {}".format(data_package)) + writer.writelines('\n') + json.dump(result, writer) + writer.writelines('\n') + + return results + + 
+@api.make() +def api_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline): + """ + Wrapper for the batch script for troubleshooting published data packages. + Runs a subset of the tests since "technicaladmin" is usually more restricted than "rods". + + :param ctx: Combined type of a callback and rei struct + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A boolean representing whether to also write the results to a log file. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. + + :returns: A dictionary of dictionaries providing the results of the job. + """ + return batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, True, False) + + +@rule.make(inputs=[0, 1, 2, 3], outputs=[]) +def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, no_datacite): + """ + Troubleshoots published data packages. + + Prints results of the following checks: + 1. Metadata schema compliance. + 2. Presence and correctness of expected AVUs. + 3. Registration with DataCite. + 4. File integrity of landing page and combi JSON files. + + Operates on either a single specified package or all published packages, depending on the input. + + :param ctx: Context that combines a callback and rei struct. + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A string boolean representing whether to also write the results to a log file. + :param offline: A string boolean representing whether to perform all checks without connecting to external servers.
+ :param no_datacite: A string boolean representing whether to skip the datacite checks + """ + offline = offline == "True" + log_file = log_file == "True" + check_datacite = no_datacite == "False" + + batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, False, check_datacite) diff --git a/research.py b/research.py index 4648be5b6..ff61fd41e 100644 --- a/research.py +++ b/research.py @@ -27,6 +27,26 @@ 'api_research_manifest'] +def folder_new_name_check(folder_name): + if len(folder_name) == 0: + return False, api.Error('missing_foldername', 'Missing folder name. Please add a folder name') + + # TODO remove when upgrade to GenQuery 2 + # This check should only be done on new folders, since may have old folders with apostrophes + if '\'' in folder_name: + return False, api.Error('invalid_foldername', 'It is not allowed to use apostrophes in a folder name') + + # Name should not contain '\\' or '/' + if '/' in folder_name or '\\' in folder_name: + return False, api.Error('invalid_foldername', 'It is not allowed to use slashes in the new folder name') + + # name should not be '.' or '..' + if folder_name in ('.', '..'): + return False, api.Error('invalid_foldername', 'it is not allowed to name the folder {}'.format(folder_name)) + + return True, "" + + @api.make() def api_research_folder_add(ctx, coll, new_folder_name): """Add a new folder to a research folder. @@ -39,8 +59,9 @@ def api_research_folder_add(ctx, coll, new_folder_name): """ coll_target = coll + '/' + new_folder_name - if len(new_folder_name) == 0: - return api.Error('missing_foldername', 'Missing folder name. 
Please add a folder name') + valid_folder_name, error_response = folder_new_name_check(new_folder_name) + if not valid_folder_name: + return error_response try: validate_filepath(coll_target.decode('utf-8')) @@ -51,14 +72,6 @@ def api_research_folder_add(ctx, coll, new_folder_name): if not len(coll.split('/')) > 2: return api.Error('invalid_destination', 'It is not possible to add folder ' + new_folder_name + ' at this location') - # Name should not contain '\\' or '/' - if '/' in new_folder_name or '\\' in new_folder_name: - return api.Error('invalid_foldername', 'It is not allowed to use slashes in a folder name') - - # Name should not be '.' or '..' - if new_folder_name == '.' or new_folder_name == '..': - return api.Error('invalid_foldername', 'It is not allowed to name the folder {}'.format(new_folder_name)) - # in vault? target_group_name = coll_target.split('/')[3] if target_group_name.startswith('vault-'): @@ -90,58 +103,83 @@ def api_research_folder_add(ctx, coll, new_folder_name): return api.Result.ok() -@api.make() -def api_research_folder_copy(ctx, folder_path, new_folder_path, overwrite=False): - """Copy a folder in a research folder. +def folder_copy_check(ctx, folder_path, new_folder_path, overwrite, copy=True): + """Check whether can copy (or move) folder to new folder location. :param ctx: Combined type of a callback and rei struct :param folder_path: Path to the folder to copy :param new_folder_path: Path to the new copy of the folder :param overwrite: Overwrite folder if it already exists + :param copy: Whether a copy operation (True) or move (False) (just for logging purposes) - :returns: Dict with API status result + :returns: 2-Tuple containing whether can copy/move, and the error if cannot """ + # Whether copy or move + verb = 'copy' if copy else 'move' + verb_past = 'copied' if copy else 'moved' if len(new_folder_path) == 0: - return api.Error('missing_folder_path', 'Missing folder path. 
Please add a folder path') + return False, api.Error('missing_folder_path', 'Missing folder path. Please add a folder path') + + # TODO remove when upgrade to GenQuery 2 + if '\'' in new_folder_path: + return False, api.Error('invalid_foldername', 'It is not allowed to use apostrophes in a folder name') try: validate_filepath(new_folder_path.decode('utf-8')) except ValidationError: - return api.Error('invalid_foldername', 'This is not a valid folder name. Please choose another name for your folder') + return False, api.Error('invalid_foldername', 'This is not a valid folder name. Please choose another name for your folder') # Same folder path makes no sense. if folder_path == new_folder_path: - return api.Error('invalid_folder_path', 'Origin and copy folder paths are equal. Please choose another destination') + return False, api.Error('invalid_folder_path', 'Origin and {} folder paths are equal. Please choose another destination'.format(verb)) # Inside the same path makes no sense. if "{}/".format(folder_path) in new_folder_path: - return api.Error('invalid_folder_path', 'Cannot copy folder inside itself. Please choose another destination') + return False, api.Error('invalid_folder_path', 'Cannot {} folder inside itself. Please choose another destination'.format(verb)) # not in home - a groupname must be present ie at least 2!? if not len(new_folder_path.split('/')) > 2: - return api.Error('invalid_destination', 'It is not possible to copy folder at this location') + return False, api.Error('invalid_destination', 'It is not possible to {} folder at this location'.format(verb)) # in vault? target_group_name = new_folder_path.split('/')[3] if target_group_name.startswith('vault-'): - return api.Error('invalid_destination', 'It is not possible to copy folder to the vault') + return False, api.Error('invalid_destination', 'It is not possible to {} folder to the vault'.format(verb)) # permissions ok for group? 
user_full_name = user.full_name(ctx) if groups.user_role(ctx, user_full_name, target_group_name) in ['none', 'reader']: - return api.Error('not_allowed', 'You do not have sufficient permissions to copy the selected folder') + return False, api.Error('not_allowed', 'You do not have sufficient permissions to {} the selected folder'.format(verb)) # Folder not locked? if folder.is_locked(ctx, new_folder_path): - return api.Error('not_allowed', 'The indicated folder is locked and therefore the folder can not be copied') + return False, api.Error('not_allowed', 'The indicated folder is locked and therefore the folder can not be {}'.format(verb_past)) # Does original folder exist? if not collection.exists(ctx, folder_path): - return api.Error('invalid_source', 'The original folder ' + folder_path + ' can not be found') + return False, api.Error('invalid_source', 'The original folder ' + folder_path + ' can not be found') # Collection exists in destination? if not overwrite and collection.exists(ctx, new_folder_path): - return api.Error('invalid_destination', 'Folder with this name already exists in destination') + return False, api.Error('invalid_destination', 'Folder with this name already exists in destination') + + return True, "" + + +@api.make() +def api_research_folder_copy(ctx, folder_path, new_folder_path, overwrite=False): + """Copy a folder in a research folder. 
+ + :param ctx: Combined type of a callback and rei struct + :param folder_path: Path to the folder to copy + :param new_folder_path: Path to the new copy of the folder + :param overwrite: Overwrite folder if it already exists + + :returns: Dict with API status result + """ + valid, errorResponse = folder_copy_check(ctx, folder_path, new_folder_path, overwrite, True) + if not valid: + return errorResponse # All requirements OK try: @@ -163,47 +201,9 @@ def api_research_folder_move(ctx, folder_path, new_folder_path, overwrite=False) :returns: Dict with API status result """ - if len(new_folder_path) == 0: - return api.Error('missing_folder_path', 'Missing folder path. Please add a folder path') - - try: - validate_filepath(new_folder_path.decode('utf-8')) - except ValidationError: - return api.Error('invalid_foldername', 'This is not a valid folder name. Please choose another name for your folder') - - # Same folder path makes no sense. - if folder_path == new_folder_path: - return api.Error('invalid_folder_path', 'Origin and move folder paths are equal. Please choose another destination') - - # Inside the same path makes no sense. - if "{}/".format(folder_path) in new_folder_path: - return api.Error('invalid_folder_path', 'Cannot move folder inside itself. Please choose another destination') - - # not in home - a groupname must be present ie at least 2!? - if not len(new_folder_path.split('/')) > 2: - return api.Error('invalid_destination', 'It is not possible to move folder at this location') - - # in vault? - target_group_name = new_folder_path.split('/')[3] - if target_group_name.startswith('vault-'): - return api.Error('invalid_destination', 'It is not possible to move folder to the vault') - - # permissions ok for group? 
- user_full_name = user.full_name(ctx) - if groups.user_role(ctx, user_full_name, target_group_name) in ['none', 'reader']: - return api.Error('not_allowed', 'You do not have sufficient permissions to move the selected folder') - - # Folder not locked? - if folder.is_locked(ctx, new_folder_path): - return api.Error('not_allowed', 'The indicated folder is locked and therefore the folder can not be moved') - - # Does original folder exist? - if not collection.exists(ctx, folder_path): - return api.Error('invalid_source', 'The original folder ' + folder_path + ' can not be found') - - # Collection exists in destination? - if not overwrite and collection.exists(ctx, new_folder_path): - return api.Error('invalid_destination', 'Folder with this name already exists in destination') + valid, errorResponse = folder_copy_check(ctx, folder_path, new_folder_path, overwrite, False) + if not valid: + return errorResponse # All requirements OK try: @@ -227,8 +227,9 @@ def api_research_folder_rename(ctx, new_folder_name, coll, org_folder_name): """ coll_target = coll + '/' + new_folder_name - if len(new_folder_name) == 0: - return api.Error('missing_foldername', 'Missing folder name. Please add a folder name') + valid_folder_name, error_response = folder_new_name_check(new_folder_name) + if not valid_folder_name: + return error_response try: validate_filepath(coll_target.decode('utf-8')) @@ -241,15 +242,7 @@ def api_research_folder_rename(ctx, new_folder_name, coll, org_folder_name): # not in home - a groupname must be present ie at least 2!? if not len(coll.split('/')) > 2: - return api.Error('invalid_destination', 'It is not possible to add folder ' + folder_name + ' at this location') - - # Name should not contain '\\' or '/' - if '/' in new_folder_name or '\\' in new_folder_name: - return api.Error('invalid_foldername', 'It is not allowed to use slashes in the new folder name') - - # Name should not be '.' or '..' - if new_folder_name == '.' 
or new_folder_name == '..': - return api.Error('invalid_foldername', 'It is not allowed to name the folder {}'.format(new_folder_name)) + return api.Error('invalid_destination', 'It is not possible to add folder ' + org_folder_name + ' at this location') # in vault? target_group_name = coll_target.split('/')[3] @@ -300,7 +293,7 @@ def api_research_folder_delete(ctx, coll, folder_name): # Name should not contain '\\' or '/'. if '/' in folder_name or '\\' in folder_name: - return api.Error('invalid_foldername', 'It is not allowed to use slashes in folder name to be delete') + return api.Error('invalid_foldername', 'It is not allowed to use slashes in folder name that will be deleted') # in vault? target_group_name = coll_target.split('/')[3] @@ -379,13 +372,19 @@ def api_research_file_copy(ctx, filepath, new_filepath, overwrite=False): if filepath == new_filepath: return api.Error('invalid_filepath', 'Origin and copy file paths are equal. Please choose another destination') - coll = pathutil.chop(new_filepath)[0] - data_name = pathutil.chop(new_filepath)[1] + _, org_data_name = pathutil.chop(filepath) + # These are of the NEW filepath + coll, data_name = pathutil.chop(new_filepath) try: validate_filename(data_name.decode('utf-8')) except Exception: return api.Error('invalid_filename', 'This is not a valid file name. Please choose another name') + # TODO remove when upgrade to GenQuery 2 + # This check should only be done on new folders, since may have old folders with apostrophes + if '\'' in coll: + return api.Error('invalid_filepath', 'It is not allowed to copy a file to a folder with an apostrophe in the name') + # not in home - a groupname must be present ie at least 2!? if not len(coll.split('/')) > 2: return api.Error('invalid_destination', 'It is not possible to copy files at this location') @@ -410,7 +409,7 @@ def api_research_file_copy(ctx, filepath, new_filepath, overwrite=False): # Does org file exist? 
if not data_object.exists(ctx, filepath): - return api.Error('invalid_source', 'The original file ' + data_name + ' can not be found') + return api.Error('invalid_source', 'The original file ' + org_data_name + ' can not be found') # new filename already exists? if not overwrite and data_object.exists(ctx, new_filepath): @@ -513,13 +512,18 @@ def api_research_file_move(ctx, filepath, new_filepath, overwrite=False): if filepath == new_filepath: return api.Error('invalid_filepath', 'Origin and move file paths are equal. Please choose another destination') - coll = pathutil.chop(new_filepath)[0] - data_name = pathutil.chop(new_filepath)[1] + # These are of the NEW filepath + coll, data_name = pathutil.chop(new_filepath) try: validate_filename(data_name.decode('utf-8')) except Exception: return api.Error('invalid_filename', 'This is not a valid file name. Please choose another name') + # TODO remove when upgrade to GenQuery 2 + # This check should only be done on new folders, since may have old folders with apostrophes + if '\'' in coll: + return api.Error('invalid_filepath', 'It is not allowed to move a file to a folder with an apostrophe in the name') + # not in home - a groupname must be present ie at least 2!? if not len(coll.split('/')) > 2: return api.Error('invalid_destination', 'It is not possible to move files to this location') diff --git a/resources.py b/resources.py index 9f715bf8c..1b3a2b6f6 100644 --- a/resources.py +++ b/resources.py @@ -218,81 +218,85 @@ def api_resource_category_stats(ctx): if len(categories) == 0: return {'categories': [], 'external_filter': ''} - # Continue for admins and datamanagers - storage = {} - - # Go through current groups of current categories. - # This function has no historic value so it is allowed to do so - for category in categories: - storage[category] = {'total': 0, 'research': 0, 'vault': 0, 'revision': 0, 'internal': 0, 'external': 0} + # Retrieve storage statistics of groups. 
+ iter = list(genquery.Query(ctx, + ['USER_GROUP_NAME', 'ORDER_DESC(META_USER_ATTR_NAME)', 'META_USER_ATTR_VALUE'], + "META_USER_ATTR_NAME like '{}%%'".format(constants.UUMETADATAGROUPSTORAGETOTALS), + output=genquery.AS_LIST)) - # for all groups in category - groups = get_groups_on_categories(ctx, [category]) - for groupname in groups: - if groupname.startswith(('research', 'deposit', 'intake', 'grp')): - # Only check the most recent storage measurement - iter = list(genquery.Query(ctx, - ['META_USER_ATTR_VALUE', 'ORDER_DESC(META_USER_ATTR_NAME)', 'USER_NAME', 'USER_GROUP_NAME'], - "META_USER_ATTR_VALUE like '[\"{}\",%%' AND META_USER_ATTR_NAME like '{}%%' AND USER_NAME = '{}'".format(category, constants.UUMETADATAGROUPSTORAGETOTALS, groupname), - offset=0, limit=1, output=genquery.AS_LIST)) - - for row in iter: - temp = jsonutil.parse(row[0]) - - storage[category]['total'] += temp[4] - storage[category]['research'] += temp[1] - storage[category]['vault'] += temp[2] - storage[category]['revision'] += temp[3] + # Go through storage statistics of groups. + storage = {} + group_counted = [] + for group_name, _storage_attribute, storage_json in iter: + # Check if group is valid and has not been counted yet. + if group_name.startswith(('research-', 'deposit-', 'intake-', 'grp-')) and group_name not in group_counted: + # Add group to list of groups counted for category statistics. + group_counted.append(group_name) + + # Add group to category statistics. + category, research, vault, revisions, total = jsonutil.parse(storage_json) + storage.setdefault(category, {'research': 0, 'vault': 0, 'revision': 0, 'total': 0}) + storage[category]['research'] += research + storage[category]['vault'] += vault + storage[category]['revision'] += revisions + storage[category]['total'] += total + + # Retrieve groups and their members. 
+ iter = list(genquery.Query(ctx, + ['USER_GROUP_NAME', 'USER_NAME'], + "USER_TYPE != 'rodsgroup'", + output=genquery.AS_LIST)) + + # Calculate number of members per type per group. + members = {} + for group_name, user_name in iter: + members.setdefault(group_name, {'internal': set(), 'external': set()}) + if yoda_names.is_internal_user(user_name): + members[group_name]['internal'].add(user_name) + else: + members[group_name]['external'].add(user_name) - # Now go through all totals + # Calculate category members and storage totals. + instance_totals = {'total': 0, 'research': 0, 'vault': 0, 'revision': 0, 'internals': set(), 'externals': set()} all_storage = [] - - # Totalization for the entire instance. - instance_totals = {'total': 0, 'research': 0, 'vault': 0, 'revision': 0} - - # Member counts - cat_members = {} - members_total = [] for category in categories: - members = [] - # this information is only available for yoda-admins - for groupname in get_groups_on_categories(ctx, [category]): - group_members = list(group.members(ctx, groupname)) - for gm in group_members: - members.append(gm[0]) - members_total.append(gm[0]) - # deduplicate member list - cat_members[category] = list(set(members)) + if category not in storage: + continue - cat_members['YODA_INSTANCE_TOTAL'] = list(set(members_total)) + # Calculate category members and totals. + internals = set() + externals = set() + for group_name in get_groups_on_categories(ctx, [category]): + members.setdefault(group_name, {'internal': set(), 'external': set()}) + internals.update(members[group_name]['internal']) + externals.update(members[group_name]['external']) - def count_externals(members): - return len([member for member in members if not yoda_names.is_internal_user(member)]) + # Deduplicate group members. 
+ users = {'internals': len(internals), 'externals': len(externals)} - def count_internals(members): - return len([member for member in members if yoda_names.is_internal_user(member)]) + # Count instance totals. + instance_totals['internals'].update(internals) + instance_totals['externals'].update(externals) - for category in categories: + # Humanize storage sizes for the frontend and calculate instance totals. storage_humanized = {} - # humanize storage sizes for the frontend - for type in ['total', 'research', 'vault', 'revision']: - storage_humanized[type] = misc.human_readable_size(1.0 * storage[category][type]) - instance_totals[type] += 1.0 * storage[category][type] + for storage_type in ['research', 'vault', 'revision', 'total']: + storage_humanized[storage_type] = misc.human_readable_size(1.0 * storage[category][storage_type]) + instance_totals[storage_type] += 1.0 * storage[category][storage_type] - users = {'internals': count_internals(cat_members[category]), 'externals': count_externals(cat_members[category])} all_storage.append({'category': category, 'storage': storage_humanized, 'users': users}) - # Add the yoda instance information as an extra row with category name YODA_INSTANCE_TOTAL - # So the frontend can distinguish instance totals from real category totals - users = {'internals': count_internals(cat_members['YODA_INSTANCE_TOTAL']), 'externals': count_externals(cat_members['YODA_INSTANCE_TOTAL'])} + # Add the Yoda instance information as an extra row with category name YODA_INSTANCE_TOTAL. + # So the frontend can distinguish instance totals from real category totals. 
all_storage.append({'category': "YODA_INSTANCE_TOTAL", 'storage': {'total': misc.human_readable_size(instance_totals['total']), 'research': misc.human_readable_size(instance_totals['research']), 'vault': misc.human_readable_size(instance_totals['vault']), 'revision': misc.human_readable_size(instance_totals['revision'])}, - 'users': users}) + 'users': {'internals': len(instance_totals['internals']), + 'externals': len(instance_totals['externals'])}}) return {'categories': sorted(all_storage, key=lambda d: d['category']), 'external_filter': ', '.join(config.external_users_domain_filter)} diff --git a/revisions.py b/revisions.py index 321d5307b..96918c8b5 100644 --- a/revisions.py +++ b/revisions.py @@ -349,6 +349,10 @@ def rule_revision_batch(ctx, verbose, balance_id_min, balance_id_max, batch_size minimum_timestamp = int(time.time() - config.async_revision_delay_time) + # Remove revision creation AVUs from deleted data objects. + # This makes it easier to monitor the number of data objects waiting for revision creation. + remove_revision_creation_avu_from_deleted_data_objects(ctx, print_verbose) + # Get list of up to batch size limit of data objects (in research space) scheduled for revision, taking into account # modification time. 
log.write(ctx, "verbose = {}".format(verbose)) @@ -514,26 +518,6 @@ def is_revision_blocked_by_admin(ctx): return collection.exists(ctx, path) -def get_data_object(ctx, data_id, resource): - """Return data on data object necessary to create a revision.""" - iter = genquery.row_iterator( - "DATA_ID, DATA_MODIFY_TIME, DATA_OWNER_NAME, DATA_SIZE, COLL_ID, DATA_RESC_HIER, DATA_NAME, COLL_NAME", - "DATA_ID = '{}' AND DATA_RESC_HIER like '{}%'".format(data_id, resource), - genquery.AS_LIST, ctx - ) - for row in iter: - data_id = row[0] - modify_time = row[1] - data_size = row[3] - coll_id = row[4] - data_owner = row[2] - basename = row[6] - parent = row[7] - break - - return modify_time, data_size, coll_id, data_owner, basename, parent - - def get_revision_store(ctx, group_name): """Get path to revision store for group if the path exists. @@ -563,7 +547,22 @@ def revision_create(ctx, print_verbose, data_id, resource, group_name, revision_ :returns: True / False as an indication whether a revision was successfully created """ revision_created = False - modify_time, data_size, coll_id, data_owner, basename, parent = get_data_object(ctx, data_id, resource) + + # Retrieve properties of the data object + data_properties = data_object.get_properties(ctx, data_id, resource) + + # Skip current revision task if data object is not found + if data_properties is None: + log.write(ctx, "ERROR - No data object found for data_id {} on resource {}, move to the next revision creation".format(data_id, resource)) + return False + + modify_time = data_properties["DATA_MODIFY_TIME"] + data_size = data_properties["DATA_SIZE"] + coll_id = data_properties["COLL_ID"] + data_owner = data_properties["DATA_OWNER_NAME"] + basename = data_properties["DATA_NAME"] + parent = data_properties["COLL_NAME"] + path = '{}/{}'.format(parent, basename) # Allow rodsadmin to create subcollections. 
@@ -1035,3 +1034,28 @@ def memory_limit_exceeded(rss_limit): """ rss_limit = int(rss_limit) return rss_limit and memory_rss_usage() > rss_limit + + +def remove_revision_creation_avu_from_deleted_data_objects(ctx, print_verbose): + """ + Removes revision creation AVUs from deleted data objects [marked with 'org_revision_scheduled' metadata]. + + :param ctx: Combined type of a callback and rei struct + :param print_verbose: Whether to log verbose messages for troubleshooting (Boolean) + """ + revision_avu_name = constants.UUORGMETADATAPREFIX + "revision_scheduled" + + iter = genquery.row_iterator( + "COLL_NAME, DATA_NAME", + "COLL_NAME like '%{}/trash/home/%' AND META_DATA_ATTR_NAME = '{}'".format(user.zone(ctx), revision_avu_name), + genquery.AS_LIST, ctx + ) + + for coll_name, data_name in iter: + path = coll_name + '/' + data_name + try: + avu.rmw_from_data(ctx, path, revision_avu_name, "%") # use wildcard cause rm_from_data causes problems + if print_verbose: + log.write(ctx, 'Removed revision creation AVUs from data object: {}'.format(path)) + except Exception as e: + log.write(ctx, "Error processing data object {}: {}".format(path, str(e))) diff --git a/schema_transformation.py b/schema_transformation.py index 817da02e9..e4ef569b7 100644 --- a/schema_transformation.py +++ b/schema_transformation.py @@ -19,7 +19,6 @@ import session_vars import meta -import meta_form import schema import schema_transformations from util import * @@ -137,14 +136,19 @@ def copy_acls_from_parent(ctx, path, recursive_flag): user_name = user.name_from_id(ctx, user_id) + # iRODS keeps ACLs for deleted users in the iCAT database (https://github.com/irods/irods/issues/7778), + # so we need to skip ACLs referring to users that no longer exist. 
+ if user_name == "": + continue + if access_name == "own": log.write(ctx, "iiCopyACLsFromParent: granting own to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") msi.set_acl(ctx, recursive_flag, "own", user_name, path) elif access_name == "read object": - log.write(ctx, "iiCopyACLsFromParent: granting own to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") + log.write(ctx, "iiCopyACLsFromParent: granting read to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") msi.set_acl(ctx, recursive_flag, "read", user_name, path) elif access_name == "modify object": - log.write(ctx, "iiCopyACLsFromParent: granting own to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") + log.write(ctx, "iiCopyACLsFromParent: granting write to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") msi.set_acl(ctx, recursive_flag, "write", user_name, path) @@ -400,41 +404,13 @@ def rule_batch_vault_metadata_schema_report(ctx): genquery.AS_LIST, ctx) for row in iter: - coll_name = row[0] - metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name) - - if metadata_path == '' or metadata_path is None: - log.write(ctx, "Vault metadata schema report skips %s, because metadata could not be found." - % (coll_name)) - continue - try: - metadata = jsonutil.read(ctx, metadata_path) - except Exception as exc: - log.write(ctx, "Vault metadata report skips %s, because of exception while reading metadata file %s: %s." 
- % (coll_name, metadata_path, str(exc))) + coll_name = row[0] + result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report", True) + if result: + results[coll_name] = result + except Exception as e: + log.write(ctx, "Error processing collection {}: {}".format(coll_name, str(e))) continue - # Determine schema - schema_id = schema.get_schema_id(ctx, metadata_path) - schema_shortname = schema_id.split("/")[-2] - - # Retrieve schema and cache it for future use - schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id) - if schema_shortname in schema_cache: - schema_contents = schema_cache[schema_shortname] - else: - schema_contents = jsonutil.read(ctx, schema_path) - schema_cache[schema_shortname] = schema_contents - - # Check whether metadata matches schema and log any errors - error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents) - match_schema = len(error_list) == 0 - if not match_schema: - log.write(ctx, "Vault metadata schema report: metadata %s did not match schema %s: %s" % - (metadata_path, schema_shortname, str([meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list]))) - - # Update results - results[coll_name] = {"schema": schema_shortname, "match_schema": match_schema} - return json.dumps(results) diff --git a/schema_transformations.py b/schema_transformations.py index e2b57098f..98e412b65 100644 --- a/schema_transformations.py +++ b/schema_transformations.py @@ -6,6 +6,8 @@ import re +from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_researcher_id, correctify_scopus + import meta from util import * @@ -128,21 +130,44 @@ def _default2_default3(ctx, m): person_identifiers = [] for person_identifier in creator.get('Person_Identifier', []): + # Check ORCID if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID': # Check for incorrect ORCID format. 
if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)): corrected_orcid = correctify_orcid(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_orcid is None: log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually." % (person_identifier['Name_Identifier'])) elif corrected_orcid != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_orcid + # Check Scopus + elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)': + # Check for incorrect Scopus format. + if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)): + corrected_scopus = correctify_scopus(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_scopus is None: + log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_scopus != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_scopus + # Check ISNI + elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI': + # Check for incorrect ISNI format. + if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)): + corrected_isni = correctify_isni(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_isni is None: + log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually." 
+ % (person_identifier['Name_Identifier'])) + elif corrected_isni != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_isni elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)': # Check for incorrect ResearcherID format. if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)): corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_researcher_id != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_researcher_id elif 'Name_Identifier_Scheme' not in person_identifier: @@ -164,21 +189,44 @@ def _default2_default3(ctx, m): person_identifiers = [] for person_identifier in contributor.get('Person_Identifier', []): + # Check ORCID if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID': # Check for incorrect ORCID format. if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)): corrected_orcid = correctify_orcid(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_orcid is None: log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually." 
% (person_identifier['Name_Identifier'])) elif corrected_orcid != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_orcid + # Check Scopus + elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)': + # Check for incorrect Scopus format. + if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)): + corrected_scopus = correctify_scopus(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_scopus is None: + log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_scopus != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_scopus + # Check ISNI + elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI': + # Check for incorrect ISNI format. + if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)): + corrected_isni = correctify_isni(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_isni is None: + log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_isni != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_isni elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)': # Check for incorrect ResearcherID format. 
if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)): corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_researcher_id != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_researcher_id elif 'Name_Identifier_Scheme' not in person_identifier: @@ -702,36 +750,3 @@ def get(src_id, dst_id): x = transformations.get(src_id) return None if x is None else x.get(dst_id) - - -def correctify_orcid(org_orcid): - """Correct illformatted ORCID.""" - # Get rid of all spaces. - orcid = org_orcid.replace(' ', '') - - # Upper-case X. - orcid = org_orcid.replace('x', 'X') - - # The last part should hold a valid id like eg: 1234-1234-1234-123X. - # If not, it is impossible to correct it to the valid orcid format - orcs = orcid.split('/') - if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcs[-1]): - # Return original value. - return org_orcid - - return "https://orcid.org/{}".format(orcs[-1]) - - -def correctify_researcher_id(org_researcher_id): - """Correct illformatted ResearcherID.""" - # Get rid of all spaces. - researcher_id = org_researcher_id.replace(' ', '') - - # The last part should hold a valid id like eg: A-1234-1234 - # If not, it is impossible to correct it to the valid ResearcherID format - orcs = researcher_id.split('/') - if not re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", orcs[-1]): - # Return original value. 
- return org_researcher_id - - return "https://www.researcherid.com/rid/{}".format(orcs[-1]) diff --git a/schema_transformations_utils.py b/schema_transformations_utils.py new file mode 100644 index 000000000..d5cf58f68 --- /dev/null +++ b/schema_transformations_utils.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +"""JSON schema transformation utility functions.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import re + + +def correctify_orcid(org_orcid): + """Correct illformatted ORCID.""" + # Get rid of all spaces. + orcid = org_orcid.replace(' ', '') + + # Upper-case X. + orcid = orcid.replace('x', 'X') + + # The last part should hold a valid id like eg: 1234-1234-1234-123X. + # If not, it is impossible to correct it to the valid orcid format + orcs = orcid.split('/') + if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcs[-1]): + return None + + return "https://orcid.org/{}".format(orcs[-1]) + + +def correctify_scopus(org_scopus): + """Correct illformatted Scopus.""" + # Get rid of all spaces. + new_scopus = org_scopus.replace(' ', '') + + if not re.search("^\d{1,11}$", new_scopus): + return None + + return new_scopus + + +def correctify_isni(org_isni): + """Correct ill-formatted ISNI.""" + # Remove all spaces. + new_isni = org_isni.replace(' ', '') + + # Upper-case X. + new_isni = new_isni.replace('x', 'X') + + # The last part should hold a valid id like eg: 123412341234123X. + # If not, it is impossible to correct it to the valid isni format + new_isni = new_isni.split('/') + if not re.search("^[0-9]{15}[0-9X]$", new_isni[-1]): + return None + + return "https://isni.org/isni/{}".format(new_isni[-1]) + + +def correctify_researcher_id(org_researcher_id): + """Correct illformatted ResearcherID.""" + # Get rid of all spaces. 
+ researcher_id = org_researcher_id.replace(' ', '') + + # The last part should hold a valid id like eg: A-1234-1234 + # If not, it is impossible to correct it to the valid ResearcherID format + orcs = researcher_id.split('/') + if not re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", orcs[-1]): + # Return original value. + return org_researcher_id + + return "https://www.researcherid.com/rid/{}".format(orcs[-1]) diff --git a/schemas/dag-0/metadata.json b/schemas/dag-0/metadata.json index 32c41ed77..eb9d8cbea 100644 --- a/schemas/dag-0/metadata.json +++ b/schemas/dag-0/metadata.json @@ -50,11 +50,11 @@ "50" ], "enumNames": [ - "2 - appriopiate period when data can be considered as a snapshot which is outdated in the short term", - "5 - appriopiate period when the value of data decreases significantly after a longer period of time", + "2 - appropriate period when data can be considered as a snapshot which is outdated in the short term", + "5 - appropriate period when the value of data decreases significantly after a longer period of time", "10 - default retention period according to UU's policy framework for research data, maximum retention period for personal data (GDPR)", - "20 - appriopiate period when the value of the data decreases slowly over a long period of time", - "50 - appriopiate period when data will always be relevant" + "20 - appropriate period when the value of the data decreases slowly over a long period of time", + "50 - appropriate period when data will always be relevant" ] }, "optionsOwnerRole": { diff --git a/schemas/dag-0/uischema.json b/schemas/dag-0/uischema.json index 947d10895..23424fc96 100644 --- a/schemas/dag-0/uischema.json +++ b/schemas/dag-0/uischema.json @@ -49,7 +49,7 @@ "ui:description": "Free text field to add characteristic words or terms that typify and describe the data, so it becomes better searchable. 
Please fill in one word or term per field, use the + if you want to add more keywords" }, "Related_Datapackage": { - "ui:description": "Reference to other resources which are used to create the data set, such as another data package an online publication. Please fill in the title or citing information of the resource, together with type persistant identifier (select an option) and the identifier itself", + "ui:description": "Reference to other resources which are used to create the data set, such as another data package an online publication. Please fill in the title or citing information of the resource, together with type persistent identifier (select an option) and the identifier itself", "items": { "Relation_Type": { "ui:help": "Relation to this data package", @@ -108,7 +108,7 @@ } }, "Affiliation": { - "ui:help": "Organizational or institutional affliation of the data owner" + "ui:help": "Organizational or institutional affiliation of the data owner" }, "Owner_Role": { "ui:help": "Which role does the data owner have in the context in which the data package originated?" 
diff --git a/setup.cfg b/setup.cfg index 1bac72b55..ca6511c23 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,5 +4,5 @@ import-order-style=smarkets strictness=short docstring_style=sphinx max-line-length=127 -exclude=__init__.py,tools,tests/env/ -application-import-names=avu_json,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,intake,intake_dataset,intake_lock,intake_scan,intake_utils,intake_vault,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils +exclude=__init__.py,tools,tests +application-import-names=avu_json,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,intake,intake_dataset,intake_lock,intake_scan,intake_utils,intake_vault,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils,vault_utils diff --git a/tests/conftest.py b/tests/conftest.py index 4ab47e948..14f59d0bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,7 @@ import json import re +import sys import pytest import requests @@ -201,7 +202,12 @@ def login(user, password): # Retrieve the login CSRF token. 
content = client.get(url, verify=False).content.decode() p = re.compile("tokenValue: '([a-zA-Z0-9._-]*)'") - csrf = p.findall(content)[0] + found_csrf_tokens = p.findall(content) + if len(found_csrf_tokens) == 0: + print(f"Error: could not find login CSRF token in response from server for login of user {user}. Response was:") + print(content) + sys.exit(1) + csrf = found_csrf_tokens[0] # Login as user. if verbose_test: @@ -214,7 +220,12 @@ def login(user, password): # Retrieve the authenticated CSRF token. content = response.content.decode() p = re.compile("tokenValue: '([a-zA-Z0-9._-]*)'") - csrf = p.findall(content)[0] + found_csrf_tokens = p.findall(content) + if len(found_csrf_tokens) == 0: + print(f"Error: could not find authenticated CSRF token in response from server for login of user {user}. Response was:") + print(content) + sys.exit(1) + csrf = found_csrf_tokens[0] # Return CSRF and session cookies. if verbose_test: diff --git a/tests/features/api/api_group.feature b/tests/features/api/api_group.feature index 202165e66..dcb248e5c 100644 --- a/tests/features/api/api_group.feature +++ b/tests/features/api/api_group.feature @@ -8,10 +8,12 @@ Feature: Group API And group exists Examples: - | user | group | - | researcher | research-initial | - | researcher | research-initial1 | - | datamanager | datamanager-test-automation | + | user | group | + | researcher | research-initial | + | groupmanager | research-initial | + | functionaladminpriv | research-initial | + | datamanager | datamanager-test-automation | + | technicaladmin | priv-category-add | Scenario Outline: Group categories @@ -179,7 +181,7 @@ Feature: Group API Given user technicaladmin is authenticated And the Yoda API for processing csv group data API is queried for data "" Then the response status code is "400" - + Examples: | group_name | | csv-missing-header | diff --git a/tests/features/api/api_research.feature b/tests/features/api/api_research.feature index 39a9fe31d..43994a824 100644 --- 
a/tests/features/api/api_research.feature +++ b/tests/features/api/api_research.feature @@ -21,6 +21,29 @@ Feature: Research API | /tempZone/home/research-initial | api_test_1234567890 | + Scenario Outline: Research folder with apostrophe add + Given user researcher is authenticated + And the Yoda research folder add API is queried with and + Then the response status code is "400" + And folder does not exist in + + Examples: + | collection | folder | + | /tempZone/home/research-initial | api_test_folder's | + + + @deposit + Scenario Outline: Deposit folder with apostrophe add + Given user researcher is authenticated + And the Yoda research folder add API is queried with and + Then the response status code is "400" + And folder does not exist in + + Examples: + | collection | folder | + | /tempZone/home/deposit-pilot | api_test_folder's | + + Scenario Outline: Research folder copy Given user researcher is authenticated And the Yoda research folder copy API is queried with , , and @@ -34,6 +57,18 @@ Feature: Research API | /tempZone/home/research-initial | api_test_copy | api_test_move1 | + Scenario Outline: Research folder copy with apostrophe + Given user researcher is authenticated + And the Yoda research folder copy API is queried with , , and + Then the response status code is "400" + And folder exists in + And folder does not exist in + + Examples: + | collection | folder | copy | + | /tempZone/home/research-initial | api_test_copy | api_test_copy2's | + + Scenario Outline: Research folder move Given user researcher is authenticated And the Yoda research folder move API is queried with , , and @@ -46,6 +81,17 @@ Feature: Research API | /tempZone/home/research-initial | api_test_move1 | api_test_move2 | + Scenario Outline: Research folder move with apostrophe + Given user researcher is authenticated + And the Yoda research folder move API is queried with , , and + Then the response status code is "400" + And folder does not exist in + + Examples: + | collection 
| folder | move | + | /tempZone/home/research-initial | api_test_move1 | api_test_move2's | + + Scenario Outline: Research folder rename Given user researcher is authenticated And the Yoda research folder rename API is queried with , and @@ -58,6 +104,18 @@ Feature: Research API | /tempZone/home/research-initial | api_test_folder | api_test_folder_renamed | + Scenario Outline: Research folder rename with apostrophe + Given user researcher is authenticated + And the Yoda research folder rename API is queried with , and + Then the response status code is "400" + And folder exists in + And folder does not exist in + + Examples: + | collection | folder_old | folder | + | /tempZone/home/research-initial | api_test_folder_renamed | api_test_folder_renamed's | + + Scenario Outline: Research file copy Given user researcher is authenticated And the Yoda research file copy API is queried with , , and diff --git a/tests/features/api/api_vault.feature b/tests/features/api/api_vault.feature index ac3f90e2b..4ed3d018b 100644 --- a/tests/features/api/api_vault.feature +++ b/tests/features/api/api_vault.feature @@ -98,6 +98,17 @@ Feature: Vault API | /tempZone/home/vault-default-2 | | /tempZone/home/vault-core-2 | | /tempZone/home/vault-default-3 | + + + Scenario Outline: Published vault package passes troubleshooting script checks + Given user technicaladmin is authenticated + And data package exists in + Then data package in passes troubleshooting script checks + + Examples: + | vault | + | /tempZone/home/vault-default-2 | + | /tempZone/home/vault-default-3 | Scenario Outline: Vault preservable formats lists @@ -182,7 +193,7 @@ Feature: Vault API | /tempZone/home/vault-default-3 | - Scenario Outline: Revoke grant access to research group + Scenario Outline: Grant read access to research group Given user datamanager is authenticated And data package exists in And the Yoda vault grant read access research group API is queried on datapackage in diff --git 
a/tests/features/smoke/smoke_test.feature b/tests/features/smoke/smoke_test.feature index df8ba09db..269775f9e 100644 --- a/tests/features/smoke/smoke_test.feature +++ b/tests/features/smoke/smoke_test.feature @@ -255,19 +255,19 @@ Feature: Smoke tests Then the response status code is "200" Examples: - | vault | - | /tempZone/home/vault-smoke-test | + | vault | + | /tempZone/home/vault-smoke-test | - Scenario Outline: Vault revoke grant access to research group + Scenario Outline: Vault grant read access to research group Given user smoke_account is authenticated And data package exists in And the Yoda vault grant read access research group API is queried on datapackage in Then the response status code is "200" Examples: - | vault | - | /tempZone/home/vault-smoke-test | + | vault | + | /tempZone/home/vault-smoke-test | Scenario Outline: Vault get publication terms diff --git a/tests/features/ui/ui_browse.feature b/tests/features/ui/ui_browse.feature index c85a241e5..de2d0a3ec 100644 --- a/tests/features/ui/ui_browse.feature +++ b/tests/features/ui/ui_browse.feature @@ -26,7 +26,7 @@ Feature: Browse UI #Then content of sub-folder is shown Examples: - | user | folder | + | user | folder | | researcher | vault-core-1 | | researcher | vault-default-2 | | researcher | vault-core-2 | diff --git a/tests/features/ui/ui_data_transfer.feature b/tests/features/ui/ui_data_transfer.feature new file mode 100644 index 000000000..502a70c88 --- /dev/null +++ b/tests/features/ui/ui_data_transfer.feature @@ -0,0 +1,87 @@ +@ui +Feature: Data Transfer UI + + Scenario Outline: Data Transfer page + Given user is logged in + When user opens the Data Transfer page + Then Data Transfer is shown + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User clicks on the iCommands docs page + Given user is logged in + When user opens the Data Transfer page + And user clicks on the iCommands docs page + Then iCommands docs page is displayed + + Examples: + | user 
| + | researcher | + | technicaladmin | + + + Scenario Outline: User copies iCommands configuration + Given user is logged in + When user opens the Data Transfer page + And user clicks on iCommands copy button + Then iCommands configuration is copied + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User downloads iCommands configuration file + Given user is logged in + When user opens the Data Transfer page + And user clicks on iCommands download button + Then iCommands configuration file is downloaded as + + Examples: + | user | format | + | researcher | json | + | technicaladmin | json | + + + Scenario Outline: User clicks on the Gocommands docs page + Given user is logged in + When user opens the Data Transfer page + And user clicks on Gocommands tab + And user clicks on the Gocommands docs page + Then Gocommands docs page is displayed + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User copies Gocommands configuration + Given user is logged in + When user opens the Data Transfer page + And user clicks on Gocommands tab + And user clicks on Gocommands copy button + Then Gocommands configuration is copied + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User downloads Gocommands configuration file + Given user is logged in + When user opens the Data Transfer page + And user clicks on Gocommands tab + And user clicks on Gocommands download button + Then Gocommands configuration file is downloaded as + + Examples: + | user | format | + | researcher | yml | + | technicaladmin | yml | diff --git a/tests/features/ui/ui_vault.feature b/tests/features/ui/ui_vault.feature index 33f5f2d09..7ccfaa159 100644 --- a/tests/features/ui/ui_vault.feature +++ b/tests/features/ui/ui_vault.feature @@ -170,26 +170,50 @@ Feature: Vault UI Given user datamanager is logged in And module "vault" is shown When user browses to data package in - And user clicks action menu to 
revoke access - Then action menu holds option to grant access to research group + And user clicks action menu to change access + Then revoke text is displayed + When user confirms revoke read permissions Examples: | vault | | vault-initial1 | + Scenario Outline: Research group user has had access revoked to vault package + Given user is logged in + When user browses to previous vault package url + Then user does not have access to folder + + Examples: + | user | + | researcher | + | viewer | + + Scenario Outline: Grant read access to research group Given user datamanager is logged in And module "vault" is shown When user browses to data package in - And clicks action menu to grant access - Then action menu holds option to revoke access from research group + And user clicks action menu to change access + Then grant text is displayed + When user confirms grant read permissions Examples: | vault | | vault-initial1 | + Scenario Outline: Research group user has been granted access to vault package + Given user is logged in + When user browses to previous vault package url + Then contents of folder are shown + + Examples: + | user | vault | + | researcher | vault-initial1 | + | viewer | vault-initial1 | + + Scenario Outline: Copy datapackage to research space Given user datamanager is logged in And module "vault" is shown @@ -238,4 +262,4 @@ Feature: Vault UI Examples: | vault | group | - | vault-initial1 | research-initial1 | \ No newline at end of file + | vault-initial1 | research-initial1 | diff --git a/tests/requirements.txt b/tests/requirements.txt index 83db9541c..9df72093d 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -5,3 +5,4 @@ pytest-splinter==3.3.2 pytest_bdd==7.0.1 pytest==7.4.4 deepdiff==6.6.1 +pyperclip==1.9.0 diff --git a/tests/step_defs/api/common_vault.py b/tests/step_defs/api/common_vault.py index 2cfa8fa55..9b2706221 100644 --- a/tests/step_defs/api/common_vault.py +++ b/tests/step_defs/api/common_vault.py @@ -174,6 +174,21 @@ 
def data_package_status(user, vault, data_package, status): raise AssertionError() +@then(parsers.parse('data package in {vault} passes troubleshooting script checks')) +def api_vault_batch_troubleshoot(user, vault, data_package): + http_status, result = api_request( + user, + "batch_troubleshoot_published_data_packages", + {"requested_package": data_package, "log_file": True, "offline": True} + ) + assert http_status == 200 + data = result['data'] + assert len(data) == 1 + # Confirm that all checks passed for this data package + for checks in data.values(): + assert all(checks.values()) + + @then('preservable formats lists are returned') def preservable_formats_lists(api_response): http_status, body = api_response diff --git a/tests/step_defs/ui/test_ui_data_transfer.py b/tests/step_defs/ui/test_ui_data_transfer.py new file mode 100644 index 000000000..abcaada4a --- /dev/null +++ b/tests/step_defs/ui/test_ui_data_transfer.py @@ -0,0 +1,136 @@ +# coding=utf-8 +"""Data Transfer UI feature tests.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import os +import time +from pathlib import Path +from urllib.parse import urlparse + +import pyperclip +from pytest_bdd import parsers, scenarios, then, when + +from conftest import portal_url + +scenarios('../../features/ui/ui_data_transfer.feature') + +icommands_url = "https://docs.irods.org/4.2.12/icommands/user/" +gocommands_url = "https://github.com/cyverse/gocommands/blob/main/README.md" + + +@when("user opens the Data Transfer page") +def ui_data_transfer_page(browser): + url = "{}/user/data_transfer".format(portal_url) + browser.visit(url) + + +@then(parsers.parse("{title} is shown")) +def ui_data_transfer_page_content(browser, title): + assert browser.is_text_present(title) + + +@when("user clicks on the iCommands docs page") +def ui_data_transfer_icommands_page(browser): + browser.links.find_by_href(icommands_url).first.click() + time.sleep(2) + + # change to the 
new tab + browser.windows.current = browser.windows[-1] + + +@then("iCommands docs page is displayed") +def ui_data_transfer_icommands_page_content(browser): + assert browser.url == icommands_url + assert urlparse(browser.url).path == urlparse(icommands_url).path + + +@when('user clicks on iCommands copy button') +def ui_data_transfer_icommands_configuration_copy_button(browser): + browser.find_by_id('button1').click() + + +@then('iCommands configuration is copied') +def ui_data_transfer_icommands_configuration_copied(): + clipboard_content = pyperclip.paste() + assert clipboard_content is not None + + +@when("user clicks on iCommands download button") +def ui_data_transfer_icommands_configuration_download_button(browser): + browser.find_by_id('download-button1').click() + + +@then(parsers.parse("iCommands configuration file is downloaded as {format}")) +def ui_data_transfer_icommands_configuration_file_downloaded(browser, tmpdir, format): + if os.name == "nt": + assert True + return + + root_dir = Path(tmpdir).parent + if os.name == "nt": + download_dir = root_dir.joinpath("pytest-splinter0/splinter/download/") + else: + download_dir = root_dir.joinpath("pytest-splintercurrent/splinter/download/") + + for child in download_dir.iterdir(): + if os.path.basename(str(child)) == "irods_environment.{}".format(format): + assert True + return + raise AssertionError() + + +@when('user clicks on Gocommands tab') +def ui_data_transfer_gocommands_tab(browser): + browser.find_by_text('GoCommands').click() + + +@when("user clicks on the Gocommands docs page") +def ui_data_transfer_gocommands_page(browser): + browser.links.find_by_href(gocommands_url).first.click() + time.sleep(2) + + # change to the new tab + browser.windows.current = browser.windows[-1] + + +@then("Gocommands docs page is displayed") +def ui_data_transfer_gocommands_page_content(browser): + assert browser.url == gocommands_url + assert urlparse(browser.url).path == urlparse(gocommands_url).path + + 
+@when('user clicks on Gocommands copy button') +def ui_data_transfer_gocommands_configuration_copy_button(browser): + browser.find_by_id('button2').click() + + +@then("Gocommands configuration is copied") +def ui_data_transfer_gocommands_configuration_is_copied(): + clipboard_content = pyperclip.paste() + assert clipboard_content is not None + + +@when("user clicks on Gocommands download button") +def ui_data_transfer_gocommands_configuration_download_button(browser): + browser.find_by_id('download-button2').click() + + +@then(parsers.parse("Gocommands configuration file is downloaded as {format}")) +def ui_data_transfer_gocommands_configuration_downloaded(browser, tmpdir, format): + if os.name == "nt": + assert True + return + + root_dir = Path(tmpdir).parent + if os.name == "nt": + download_dir = root_dir.joinpath("pytest-splinter0/splinter/download/") + else: + download_dir = root_dir.joinpath("pytest-splintercurrent/splinter/download/") + + for child in download_dir.iterdir(): + if os.path.basename(str(child)) == "config.{}".format(format): + assert True + return + raise AssertionError() diff --git a/tests/step_defs/ui/test_ui_publication.py b/tests/step_defs/ui/test_ui_publication.py index a2d5750c6..1bda48371 100644 --- a/tests/step_defs/ui/test_ui_publication.py +++ b/tests/step_defs/ui/test_ui_publication.py @@ -98,7 +98,14 @@ def ui_check_version_provenance_vault(browser): action_log_rows = browser.find_by_css('.list-group-item-action') # Chronological (backward) status changes - prov_statuses = ['Published', 'Approved for publication', 'Added metadata: related datapackage', 'Submitted for publication', 'Secured in vault', 'Accepted for vault', 'Submitted for vault'] + prov_statuses = ['Published', + 'Approved for publication', + 'Removed metadata: additional lab', + 'Added metadata: related datapackage', + 'Submitted for publication', + 'Secured in vault', + 'Accepted for vault', + 'Submitted for vault'] for index in range(0, len(prov_statuses)): assert 
action_log_rows[index].value.find(prov_statuses[index]) != -1 @@ -243,7 +250,7 @@ def ui_data_package_approve(browser): @then(parsers.parse('the data package status is "{status}"')) def ui_data_package_status(browser, status): for _i in range(30): - if browser.is_text_present(status, wait_time=3): + if browser.is_text_present(status, wait_time=4): return True browser.reload() diff --git a/tests/step_defs/ui/test_ui_statistics.py b/tests/step_defs/ui/test_ui_statistics.py index 4d0a27057..05cc4b640 100644 --- a/tests/step_defs/ui/test_ui_statistics.py +++ b/tests/step_defs/ui/test_ui_statistics.py @@ -43,7 +43,7 @@ def ui_statistics_group_view(browser, group): @when('export statistics button is clicked') def ui_statistics_export(browser): - # For now prevent downloading on windows platforn + # For now prevent downloading on windows platform if os.name == "nt": return # Only click when not in Windows diff --git a/tests/step_defs/ui/test_ui_vault.py b/tests/step_defs/ui/test_ui_vault.py index 1ed185ebe..4b0739cb8 100644 --- a/tests/step_defs/ui/test_ui_vault.py +++ b/tests/step_defs/ui/test_ui_vault.py @@ -1,7 +1,7 @@ # coding=utf-8 """Vault UI feature tests.""" -__copyright__ = 'Copyright (c) 2020-2022, Utrecht University' +__copyright__ = 'Copyright (c) 2020-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import time @@ -15,9 +15,12 @@ scenarios('../../features/ui/ui_vault.feature') +previous_vault_path = '' + @when(parsers.parse("user browses to data package in {vault}")) def ui_browse_data_package(browser, vault): + global previous_vault_path link = [] while len(link) == 0: link = browser.links.find_by_partial_text(vault) @@ -31,6 +34,7 @@ def ui_browse_data_package(browser, vault): research = vault.replace("vault-", "research-") data_packages = browser.links.find_by_partial_text(research) data_packages.click() + previous_vault_path = browser.driver.current_url @when('user submits the data package for publication') @@ -158,36 +162,59 @@ def 
ui_data_package_provenance_information_is_visible(browser): assert browser.is_element_visible_by_css('.actionlog') -@when('user clicks action menu to revoke access') -def ui_data_package_revoke_vault_access(browser): +@when('user clicks action menu to change access') +def ui_data_package_change_vault_access(browser): browser.find_by_id('actionMenu').click() - browser.find_by_css('a.action-revoke-vault-access').click() + browser.find_by_css('a.action-change-vault-access').click() -@then('action menu holds option to grant access to research group') -def ui_data_package_grant_option_present(browser): - browser.find_by_id('actionMenu').click() - assert browser.is_element_present_by_css('.action-grant-vault-access') +@then('revoke text is displayed') +def ui_data_package_revoke_message(browser): + time.sleep(3) + assert browser.is_text_present('revoke') -@when('clicks action menu to grant access') -def ui_data_package_grant_vault_access(browser): - browser.find_by_id('actionMenu').click() - browser.find_by_css('a.action-grant-vault-access').click() +@then('grant text is displayed') +def ui_data_package_grant_message(browser): + time.sleep(3) + assert browser.is_text_present('grant') -@then('action menu holds option to revoke access from research group') -def ui_data_package_revoke_option_present(browser): - browser.find_by_id('actionMenu').click() - assert browser.is_element_present_by_css('.action-revoke-vault-access') +@when("user confirms revoke read permissions") +def ui_data_package_revoke_read_permissions_confirm(browser): + browser.find_by_css(".action-confirm-revoke-read-permissions").click() + + +@when("user confirms grant read permissions") +def ui_data_package_grant_read_permissions_confirm(browser): + browser.find_by_css(".action-confirm-grant-read-permissions").click() @when('user clicks action menu to copy data package to research') -def ui_data_package_copy_to_resarch(browser): +def ui_data_package_copy_to_research(browser): 
browser.find_by_id('actionMenu').click() browser.find_by_css('a.action-copy-vault-package-to-research').click() +@when('user browses to previous vault package url') +def ui_data_package_browses_previous_url(browser): + if len(previous_vault_path): + browser.visit(previous_vault_path) + else: + assert False + + +@then('contents of folder are shown') +def ui_data_package_contents(browser): + assert browser.is_text_present('yoda-metadata') + assert browser.is_text_present('original') + + +@then('user does not have access to folder') +def ui_data_package_no_access(browser): + assert browser.is_text_present('This vault space path does not exist') + + @when(parsers.parse("user chooses research folder corresponding to {vault}")) def ui_browse_research_to_copy_data_package_to(browser, vault): research = vault.replace("vault-", "research-") @@ -208,8 +235,8 @@ def ui_user_presses_copy_package_button(browser): @then('data package is copied to research area') def ui_data_package_is_copied_to_research(browser): - browser.find_by_id('actionMenu').click() - browser.is_element_present_by_css('.action-revoke-vault-access') + # TODO + pass @when('user clicks clicks action menu to check compliance') diff --git a/tools/copy-accepted-folders-to-vault.r b/tools/copy-accepted-folders-to-vault.r index 73f6173b8..ebc7c8334 100644 --- a/tools/copy-accepted-folders-to-vault.r +++ b/tools/copy-accepted-folders-to-vault.r @@ -3,35 +3,9 @@ copyToVault { # Copy research folder to vault. # This script is kept as dumb as possible. 
- # All processing and error handling is done by rule_folder_secure - *ContInxOld = 1; - msiAddSelectFieldToGenQuery("COLL_NAME", "", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_NAME", "=", UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_VALUE", "=", CRONJOB_PENDING, *GenQInp); - - msiExecGenQuery(*GenQInp, *GenQOut); - msiGetContInxFromGenQueryOut(*GenQOut, *ContInxNew); - - while(*ContInxOld > 0) { - foreach(*row in *GenQOut) { - *folder = *row.COLL_NAME; - # When rule_folder_secure fails continue with the other folders. - # *errorcode = '0'; - # rule_folder_secure(*folder, *errorcode); - # if (*errorcode == '0') { - if (errorcode(iiFolderSecure(*folder)) == 0) { - *cronjobState = UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault=" ++ CRONJOB_OK; - msiString2KeyValPair(*cronjobState, *cronjobStateKvp); - *err = errormsg(msiRemoveKeyValuePairsFromObj(*cronjobStateKvp, *folder, "-C"), *msg); - } - } - - *ContInxOld = *ContInxNew; - if(*ContInxOld > 0) { - msiGetMoreRows(*GenQInp, *GenQOut, *ContInxNew); - } - } - msiCloseGenQuery(*GenQInp, *GenQOut); + # All processing and error handling is done by rule_vault_copy_to_vault + *state = "CRONJOB_PENDING" + rule_vault_copy_to_vault(*state); } input null output ruleExecOut diff --git a/tools/copy-one-coll-to-vault.r b/tools/copy-one-coll-to-vault.r new file mode 100644 index 000000000..ee216a250 --- /dev/null +++ b/tools/copy-one-coll-to-vault.r @@ -0,0 +1,9 @@ +#!/usr/bin/irule -F + +copyOneCollToVault { + # Copy research folder to vault. + *return = ""; + rule_folder_secure(*coll, *return); +} +input *coll="" +output ruleExecOut diff --git a/tools/edit-vault-metadata.py b/tools/edit-vault-metadata.py new file mode 100755 index 000000000..d6c99c1e4 --- /dev/null +++ b/tools/edit-vault-metadata.py @@ -0,0 +1,259 @@ +#!/usr/local/bin/python3 + +""" + edit-vault-metadata : script for manually editing metadata of a data package + in the vault.
+ + By default, the script lets the vault ingest workflow handle ingestion of new metadata + into the vault. In case where that is not possible (e.g. because the vault group no longer + has a research group, because the category does not have a datamanager group, etc.), you + can use the --direct option to make the script update the vault metadata directly, bypassing + the normal vault ingest workflow. + + In direct mode, this script takes care of: + - Finding the current (latest) metadata file of the data package + - Downloading it + - Starting an editor to edit it + - Re-uploading the metadata file as a new version + - Setting the right ACLs + - Updating the provenance log of the data package +""" + +import argparse +import filecmp +import os +import re +import subprocess +import sys +import tempfile +import time +from typing import List, Tuple, Union + + +def get_args(): + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument( + 'collection', + help='Vault collection') + + parser.add_argument( + '-m', '--log-message', + default="metadata manually updated by technical admin", + required=False, + help="Message to be logged in the provenance log for this edit (only applies in direct mode)") + parser.add_argument( + '-d', '--direct', + action='store_true', + default=False, + help="Edit file directly in vault collection. 
This side-steps the normal ingestion process, but can be needed for vault groups without a research group, categories without a datamanager group, and other situations not support by the default ingestion process.") + + parsed_args = parser.parse_args() + + if not parsed_args.collection.startswith("/"): + sys.exit("Error: collection must be an absolute path.") + + return parsed_args + + +def start_editor(filename: str): + editor = os.environ.get('EDITOR', 'vim') + subprocess.call([editor, filename]) + + +def check_edited_file_changed(filename: str) -> bool: + return not filecmp.cmp(filename, filename + ".orig") + + +def get_latest_metadata_file(collection: str) -> Union[str, None]: + latest_timestamp = None + latest_filename = None + lines = subprocess.check_output(["ils", collection]) + for line in lines.decode("utf-8").split("\n"): + match = re.search(r"^ (yoda-metadata\[(\d+)\]\.json)\s*$", line) + if match and (latest_timestamp is None or match.group(2) + > latest_timestamp): + latest_filename = match.group(1) + latest_timestamp = match.group(2) + return latest_filename + + +def apply_acls(path: str, acls: List[Tuple[str, str]]): + for acl in acls: + retcode = subprocess.call(["ichmod", "-M", acl[1], acl[0], path]) + if retcode != 0: + sys.exit("Could not set ACL {}:{} for {}".format(acl[1], acl[0], path)) + + +def create_collection(path: str): + retcode = subprocess.call(["imkdir", path]) + if retcode != 0: + sys.exit("Error: could not create collection " + path) + + +def create_collection_and_apply_acls_recursively(path: str, acls: List[Tuple[str, str]]): + path_components = path.split("/") + for (level, _) in enumerate(path_components): + current_collection = "/".join(path_components[:level + 1]) + current_collection_exists = collection_exists(current_collection) + if level >= 2 and current_collection_exists: + apply_acls(current_collection, acls) + elif level >= 3 and not current_collection_exists: + create_collection(current_collection) + 
apply_acls(current_collection, acls) + + +def get_dataobject_acls(path: str) -> List[Tuple[str, str]]: + results = [] + lines = subprocess.check_output(["ils", "-A", path]) + for line in lines.decode("utf-8").split("\n"): + match = re.search(r"^ ACL - ([\S\s]+)$", line) + if match: + acl_line = match.group(1) + for acl_entry in acl_line.replace("read object", "read").replace("g:", "").split(): + (acl_group, acl_priv) = acl_entry.split(":") + acl_clean_group = acl_group.split("#")[0] + results.append((acl_clean_group, acl_priv)) + return results + + +def upload_new_metadata_file(local_filename: str, remote_filename: str): + print("Uploading {} to {}".format(local_filename, remote_filename)) + retcode = subprocess.call(["iput", local_filename, remote_filename]) + if retcode != 0: + sys.exit("Error: could not upload metadata file {} to {}.".format( + local_filename, + remote_filename)) + + +def download_metadata_file(destination_dir: str, remote_path: str) -> str: + local_path_edit = os.path.join(destination_dir, + os.path.basename(remote_path)) + retcode = subprocess.call(["iget", remote_path, local_path_edit]) + if retcode != 0: + sys.exit("Error: could not download metadata file {} to {}.".format( + remote_path, + local_path_edit)) + + local_path_orig = os.path.join(destination_dir, + os.path.basename(remote_path)) + ".orig" + retcode = subprocess.call(["iget", remote_path, local_path_orig]) + if retcode != 0: + sys.exit("Error: could not download metadata file {} to {}.".format( + remote_path, + local_path_orig)) + + return local_path_edit + + +def get_datamanager_vault_subcollection(datamanager_collection: str, vault_path: str): + vault_group = vault_path.split("/")[3] + return os.path.join(os.path.join(datamanager_collection, vault_group), os.path.basename(vault_path)) + + +def get_new_metadata_name(collection: str, zone_name: str, direct_mode: bool) -> str: + if direct_mode: + return os.path.join(collection, 
"yoda-metadata[{}].json".format(str(int(time.time())))) + + research_collection = get_research_collection_for_vault_path(collection) + if not collection_exists(research_collection): + sys.exit("Error: cannot use default workflow. This vault group does not have a research group anymore. You can bypass the default workflow using --direct mode.") + research_group = get_research_group_for_research_collection(research_collection) + category = get_category_research_group(research_group) + dm_collection = get_datamanager_collection_for_category(category, zone_name) + if dm_collection is None: + sys.exit("Error: cannot use default workflow. The research group for this vault group does not have a datamanager group. You can bypass the default workflow using --direct mode.") + dm_subcollection = get_datamanager_vault_subcollection(dm_collection, collection) + return os.path.join(dm_subcollection, "yoda-metadata.json") + + +def update_provenance_log(vault_collection: str, log_message: str): + retcode = subprocess.call(["/etc/irods/yoda-ruleset/tools/log-provenance-action.sh", vault_collection, "rods", log_message]) + if retcode != 0: + sys.exit("Error: could not update provenance log for {}.".format(vault_collection)) + + +def collection_exists(path: str) -> bool: + result = subprocess.run(["iquest", "%s", "--no-page", "SELECT COLL_NAME WHERE COLL_NAME ='{}'".format(path)], capture_output=True, text=True) + if result.returncode == 0 and path in result.stdout: + return True + elif result.returncode == 1 and "CAT_NO_ROWS_FOUND" in result.stdout: + return False + else: + sys.exit("Unexpected result when checking for existence of collection " + path) + + +def get_research_collection_for_vault_path(path: str) -> str: + if not path.startswith("/"): + sys.exit("Error: need absolute vault path to determine research group.") + vault_main_collection = "/".join(path.split("/")[:4]) + research_collection = vault_main_collection.replace("vault-", "research-", 1) + return research_collection + + +def
get_research_group_for_research_collection(path: str) -> str: + if not path.startswith("/"): + sys.exit("Error: need absolute research collection path to determine research group.") + return path.split("/")[3] + + +def get_zone_name_from_path(path: str) -> str: + if not path.startswith("/"): + sys.exit("Error: need absolute collection path to determine zone name.") + return path.split("/")[1] + + +def get_research_group_for_vault_path(path: str) -> Union[str, None]: + research_collection = get_research_collection_for_vault_path(path) + if collection_exists(research_collection): + return get_research_group_for_research_collection(research_collection) + else: + return None + + +def get_datamanager_collection_for_category(category: str, zone_name: str) -> Union[str, None]: + datamanager_collection = "/{}/home/datamanager-{}".format(zone_name, category) + return datamanager_collection if collection_exists(datamanager_collection) else None + + +def get_category_research_group(research_group: str) -> str: + result = subprocess.run(["iquest", "%s", "--no-page", "SELECT META_USER_ATTR_VALUE WHERE USER_NAME = '{}' and META_USER_ATTR_NAME = 'category'".format(research_group)], capture_output=True, text=True) + if result.returncode == 0: + return result.stdout.split("\n")[0] + else: + sys.exit("Error: could not find category for research group " + research_group) + + +def main(): + args = get_args() + if not collection_exists(args.collection): + sys.exit("Error: collection {} does not exist.".format(args.collection)) + zone_name = get_zone_name_from_path(args.collection) + with tempfile.TemporaryDirectory() as tempdir: + metadata_file = get_latest_metadata_file(args.collection) + metadata_file_path = os.path.join(args.collection, metadata_file) + metadata_acls = get_dataobject_acls(metadata_file_path) + print("Metadata data object: " + metadata_file_path) + local_filename = download_metadata_file(tempdir, metadata_file_path) + start_editor(local_filename) + if
check_edited_file_changed(local_filename): + remote_filename = get_new_metadata_name(args.collection, zone_name, args.direct) + if not args.direct: + dm_subcollection = os.path.dirname(remote_filename) + print("Creating datamanager subcollection for vault group " + dm_subcollection + " recursively.") + create_collection_and_apply_acls_recursively(dm_subcollection, [("rods", "own")]) + print("Uploading new version of metadata.") + upload_new_metadata_file(local_filename, remote_filename) + if args.direct: + print("Applying ACLs to new metadata.") + apply_acls(remote_filename, metadata_acls) + print("Updating provenance log ...") + update_provenance_log(args.collection, args.log_message) + print("Done.") + else: + print("Not updating metadata, since it wasn't changed.") + + +if __name__ == "__main__": + main() diff --git a/tools/grant-readers-access-to-vault-packages.r b/tools/grant-readers-access-to-vault-packages.r new file mode 100644 index 000000000..db650a521 --- /dev/null +++ b/tools/grant-readers-access-to-vault-packages.r @@ -0,0 +1,9 @@ +#!/usr/bin/irule -F + +grantReadersAccessVaultPackages { + # Grant read- groups access to corresponding vault packages + *return = ""; + rule_vault_grant_readers_vault_access(*dryRun, *verbose, *return); +} +input *dryRun="", *verbose="" +output ruleExecOut diff --git a/tools/grant-readers-access-to-vault-packages.sh b/tools/grant-readers-access-to-vault-packages.sh new file mode 100755 index 000000000..ef81bb13d --- /dev/null +++ b/tools/grant-readers-access-to-vault-packages.sh @@ -0,0 +1,2 @@ +#!/bin/bash +irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/grant-readers-access-to-vault-packages.r '*dryRun="'$1'"' '*verbose="'$2'"' diff --git a/tools/log-provenance-action.r b/tools/log-provenance-action.r new file mode 100644 index 000000000..6cfea5bfe --- /dev/null +++ b/tools/log-provenance-action.r @@ -0,0 +1,14 @@ +#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance 
-F +# +# Logs an action in the provenance log +# +import genquery + +def main(rule_args, callback, rei): + collection = global_vars["*collection"].strip('"') + actor = global_vars["*actor"].strip('"') + action = global_vars["*action"].strip('"') + callback.rule_provenance_log_action(actor, collection, action) + +INPUT *collection="", *actor="rods", *action="" +OUTPUT ruleExecOut diff --git a/tools/log-provenance-action.sh b/tools/log-provenance-action.sh new file mode 100755 index 000000000..960c8202f --- /dev/null +++ b/tools/log-provenance-action.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +COLLECTION="$1" +ACTOR="$2" +ACTION="$3" + +if [ -z "$COLLECTION" ] +then echo "Error: missing collection parameter value." + exit 1 +fi + +if [ -z "$ACTOR" ] +then echo "Error: missing actor parameter value." + exit 1 +fi + +if [ -z "$ACTION" ] +then echo "Error: missing action parameter value." + exit 1 +fi + +/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F /etc/irods/yoda-ruleset/tools/log-provenance-action.r "*collection=$COLLECTION" "*actor=$ACTOR" "*action=$ACTION" diff --git a/tools/retry-copy-to-vault.r b/tools/retry-copy-to-vault.r index f45304849..7b072c0aa 100644 --- a/tools/retry-copy-to-vault.r +++ b/tools/retry-copy-to-vault.r @@ -1,35 +1,9 @@ retryCopyToVault { # Copy research folder to vault. # This script is kept as dumb as possible. - # All processing and error handling is done by rule_folder_secure - *ContInxOld = 1; - msiAddSelectFieldToGenQuery("COLL_NAME", "", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_NAME", "=", UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_VALUE", "=", CRONJOB_RETRY, *GenQInp); - - msiExecGenQuery(*GenQInp, *GenQOut); - msiGetContInxFromGenQueryOut(*GenQOut, *ContInxNew); - - while(*ContInxOld > 0) { - foreach(*row in *GenQOut) { - *folder = *row.COLL_NAME; - # When rule_folder_secure fails continue with the other folders. 
- # *errorcode = '0'; - # rule_folder_secure(ctx, *folder, *errorcode); - # if (*errorcode == '0') { - if (errorcode(iiFolderSecure(*folder)) == 0) { - *cronjobState = UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault=" ++ CRONJOB_OK; - msiString2KeyValPair(*cronjobState, *cronjobStateKvp); - *err = errormsg(msiRemoveKeyValuePairsFromObj(*cronjobStateKvp, *folder, "-C"), *msg); - } - } - - *ContInxOld = *ContInxNew; - if(*ContInxOld > 0) { - msiGetMoreRows(*GenQInp, *GenQOut, *ContInxNew); - } - } - msiCloseGenQuery(*GenQInp, *GenQOut); + # All processing and error handling is done by rule_vault_copy_accepted_retry_to_vault + *state = "CRONJOB_RETRY" + rule_vault_copy_to_vault(*state); } input null output ruleExecOut diff --git a/tools/run-integration-tests.r b/tools/run-integration-tests.r index 5407ed5f6..5d128cd0d 100644 --- a/tools/run-integration-tests.r +++ b/tools/run-integration-tests.r @@ -6,9 +6,10 @@ import genquery def main(rule_args, callback, rei): - result = callback.rule_run_integration_tests("") - callback.writeLine("stdout", result["arguments"][0]) + tests = global_vars["*tests"].strip('"') + result = callback.rule_run_integration_tests(tests, "") + callback.writeLine("stdout", result["arguments"][1]) -INPUT null +INPUT *tests="" OUTPUT ruleExecOut diff --git a/tools/run-integration-tests.sh b/tools/run-integration-tests.sh new file mode 100755 index 000000000..e1b4e03b7 --- /dev/null +++ b/tools/run-integration-tests.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# +# This script runs the integration tests, or a subset of them. 
+# +# Run all tests: ./run-integration-tests.sh +# Run tests with a specific prefix: ./run-integration-tests.sh util.collection.* +# Run one specific test: ./run-integration-tests.sh util.collection.owner + + +TESTS="$1" +TOOLSDIR=$(dirname "$0") +/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F "$TOOLSDIR/run-integration-tests.r" "$TESTS" diff --git a/tools/scheduled-copytovault.sh b/tools/scheduled-copytovault.sh index 6b57d8ba6..566edbb55 100755 --- a/tools/scheduled-copytovault.sh +++ b/tools/scheduled-copytovault.sh @@ -1,2 +1,2 @@ #!/bin/sh -irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/copy-accepted-folders-to-vault.r +irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/copy-one-coll-to-vault.r '*coll="'$1'"' diff --git a/tools/transform-existing-publications.r b/tools/transform-existing-publications.r index 91526ad75..9429c0790 100644 --- a/tools/transform-existing-publications.r +++ b/tools/transform-existing-publications.r @@ -15,13 +15,13 @@ def main(rule_args, callback, rei): # Changing yoda prefix -> version iter = genquery.row_iterator( "COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "USER_ZONE = '{}' AND META_COLL_ATTR_NAME LIKE 'org_publication_yoda%'".format(zone), + "COLL_ZONE_NAME = '{}' AND META_COLL_ATTR_NAME LIKE 'org_publication_yoda%'".format(zone), genquery.AS_TUPLE, callback) iter2 = genquery.row_iterator( "COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "USER_ZONE = '{}' AND META_COLL_ATTR_NAME in ('org_publication_DOIAvailable', 'org_publication_DOIMinted')".format(zone), + "COLL_ZONE_NAME = '{}' AND META_COLL_ATTR_NAME in ('org_publication_DOIAvailable', 'org_publication_DOIMinted')".format(zone), genquery.AS_TUPLE, callback) @@ -33,4 +33,4 @@ def main(rule_args, callback, rei): subprocess.call(["imeta", "mod", "-C", row[0], row[1], row[2], "n:{}".format(attr_name), "v:{}".format(row[2])]) INPUT null -OUTPUT
ruleExecOut \ No newline at end of file +OUTPUT ruleExecOut diff --git a/tools/troubleshoot-published-data.py b/tools/troubleshoot-published-data.py new file mode 100644 index 000000000..bba14bc72 --- /dev/null +++ b/tools/troubleshoot-published-data.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +"""This script collects all published packages and checks that they have all the required info. + +Example: +To check all published packages: +python3 troubleshoot-published-data.py + +To check one specific package by name: +python3 troubleshoot-published-data.py -p research-initial[1725262507] + +To put results into a log file and complete the checks offline: +python3 troubleshoot-published-data.py -l -o +""" +import argparse +import subprocess + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="troubleshoot-published-data.py", + description=__doc__, + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("-l", "--log-file", action='store_true', + help="If log file parameter is true then write to log at: /var/lib/irods/log/troubleshoot_publications.log") + parser.add_argument("-o", "--offline", action='store_true', + help="If actions should be performed without connecting to external servers (needed for the Yoda team's development setup).") + parser.add_argument("-n", "--no-datacite", action='store_true', + help="If datacite check should be skipped (needed for the Yoda team's development environment in some cases).") + parser.add_argument("-p", "--package", type=str, required=False, + help="Troubleshoot a specific data package by name (default: troubleshoot all packages)") + return parser.parse_args() + + +def main(): + args = parse_args() + rule_name = "/etc/irods/yoda-ruleset/tools/troubleshoot_data.r" + data_package = f"*data_package={args.package}" + log_loc = f"*log_loc={args.log_file if args.log_file else ''}" + offline = f"*offline={args.offline}" + no_datacite = f"*no_datacite={args.no_datacite}" + subprocess.call(['irule', '-r', 
'irods_rule_engine_plugin-python-instance', '-F', + rule_name, data_package, log_loc, offline, no_datacite]) + + +if __name__ == '__main__': + main() diff --git a/tools/troubleshoot_data.r b/tools/troubleshoot_data.r new file mode 100644 index 000000000..3caac4671 --- /dev/null +++ b/tools/troubleshoot_data.r @@ -0,0 +1,11 @@ +#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F + +def main(rule_args, callback, rei): + data_package = global_vars["*data_package"].strip('"') + log_loc = global_vars["*log_loc"].strip('"') + offline = global_vars["*offline"].strip('"') + no_datacite = global_vars["*no_datacite"].strip('"') + callback.rule_batch_troubleshoot_published_data_packages(data_package, log_loc, offline, no_datacite) + +INPUT *data_package="", *log_loc="", *offline="", *no_datacite="" +OUTPUT ruleExecOut diff --git a/tools/update-publications.r b/tools/update-publications.r index 374bcdb9b..e4ef73069 100644 --- a/tools/update-publications.r +++ b/tools/update-publications.r @@ -1,5 +1,18 @@ +#!/usr/bin/irule -r irods_rule_engine_plugin-irods_rule_language-instance -F +# +# Updates publication endpoints (Landing page, MOAI, DataCite) for either all data +# packages or one selected data package. +# +# To update one data package: +# $ irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/update-publications.r \ +# '*package="/tempZone/home/vault-mygroup/package[123456789]"' +# +# To update all data packages: +# $ irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/update-publications.r +# updatePublications() { - writeLine("stdout", "[UPDATE PUBLICATIONS] Start scan"); + writeLine("stdout", "[UPDATE PUBLICATIONS] Start for *package"); + *packagesFound = 0; # Scan for published vault packages. 
*ContInxOld = 1; @@ -15,8 +28,10 @@ updatePublications() { foreach(*row in *GenQ2Out) { *collName = *row.COLL_NAME; - # Check if this really is a vault package - if (*collName like regex "/[^/]+/home/vault-.*") { + # Check if this really is a vault package, or selected vault package + if ((*package == '*' && *collName like regex "/[^/]+/home/vault-.*") || + (*package != '*' && *collName like regex "/[^/]+/home/vault-.*" && *collName == *package ) ) { + *packagesFound = 1; *status = '' *statusInfo = ''; rule_update_publication(*collName, *updateDatacite, *updateLandingpage, *updateMOAI, *status, *statusInfo); @@ -30,7 +45,14 @@ updatePublications() { } } msiCloseGenQuery(*GenQ2Inp, *GenQ2Out); - writeLine("stdout", "[UPDATE PUBLICATIONS] Finished scan"); + + if (*packagesFound == 0) { + writeLine("stdout", "[UPDATE PUBLICATIONS] No packages found for *package") + } + else { + writeLine("stdout", "[UPDATE PUBLICATIONS] Finished for *package"); + } } -input *updateDatacite="Yes", *updateLandingpage="Yes", *updateMOAI="Yes" + +input *updateDatacite="Yes", *updateLandingpage="Yes", *updateMOAI="Yes", *package='*' output ruleExecOut diff --git a/unit-tests/test_schema_transformations.py b/unit-tests/test_schema_transformations.py new file mode 100644 index 000000000..d273365ca --- /dev/null +++ b/unit-tests/test_schema_transformations.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +"""Unit tests for the correctify functions in schema_transformations""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import sys +from unittest import TestCase + +sys.path.append('..') + +from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_scopus + + +class CorrectifyIsniTest(TestCase): + def test_isni_correct_format(self): + """Test ISNI with correct format""" + isni = "https://isni.org/isni/1234123412341234" + self.assertEqual(correctify_isni(isni), isni) + + def test_isni_correct_format_containing_x(self): 
+ """Test ISNI with correct format""" + isni = "https://isni.org/isni/123412341234123x" + correct_isni = "https://isni.org/isni/123412341234123X" + self.assertEqual(correctify_isni(isni), correct_isni) + + def test_isni_invalid_format(self): + """Test ISNI with invalid format (1 less number)""" + isni = "123412341234123" + self.assertIsNone(correctify_isni(isni)) + + def test_isni_malformed_format(self): + """Test ISNI with invalid format""" + isni = "foobar0123456789" + self.assertIsNone(correctify_isni(isni)) + + def test_isni_with_spaces(self): + """Test ISNI that contains spaces and should be corrected""" + isni = " https://isni.org/isni/123412341234123x " + corrected_isni = "https://isni.org/isni/123412341234123X" + self.assertEqual(correctify_isni(isni), corrected_isni) + + +class CorrectifyOrcidTest(TestCase): + def test_orcid_correct_format(self): + """Test ORCID with correct format""" + orcid = "https://orcid.org/1234-1234-1234-1234" + self.assertEqual(correctify_orcid(orcid), orcid) + + def test_orcid_correct_format_containing_x(self): + """Test ORCID with correct format""" + orcid = "https://orcid.org/1234-1234-1234-123x" + correct_orcid = "https://orcid.org/1234-1234-1234-123X" + self.assertEqual(correctify_orcid(orcid), correct_orcid) + + def test_orcid_invalid_format(self): + """Test ORCID with invalid format (1 less number)""" + orcid = "1234-1234-1234-123" + self.assertIsNone(correctify_orcid(orcid)) + + def test_orcid_malformed_format(self): + """Test ORCID with invalid format""" + orcid = "1234-foo-bar-1234" + self.assertIsNone(correctify_orcid(orcid)) + + def test_orcid_with_spaces(self): + """Test ORCID that contains spaces and should be corrected""" + orcid = " https://orcid.org/1234-1234-1234-123x " + corrected_orcid = "https://orcid.org/1234-1234-1234-123X" + self.assertEqual(correctify_orcid(orcid), corrected_orcid) + + +class CorrectifyScopusTest(TestCase): + def test_correctify_format(self): + """Test SCOPUS with correct format""" + scopus 
= "12345678901" + self.assertEqual(correctify_scopus(scopus), scopus) + + def test_correctify_invalid_format(self): + """Test SCOPUS with invalid format""" + scopus = "123456789012" + self.assertIsNone(correctify_scopus(scopus)) + + def test_malformed_format(self): + """Test SCOPUS with invalid format""" + scopus = "foobar1234" + self.assertIsNone(correctify_scopus(scopus)) + + def test_orcid_with_spaces(self): + """Test SCOPUS that contains spaces and should be corrected""" + scopus = " 01234567890 " + corrected_scopus = "01234567890" + self.assertEqual(correctify_scopus(scopus), corrected_scopus) diff --git a/unit-tests/test_util_misc.py b/unit-tests/test_util_misc.py index be9ef703c..5962f2164 100644 --- a/unit-tests/test_util_misc.py +++ b/unit-tests/test_util_misc.py @@ -1,31 +1,228 @@ # -*- coding: utf-8 -*- """Unit tests for the misc utils module""" -__copyright__ = 'Copyright (c) 2023, Utrecht University' +__copyright__ = 'Copyright (c) 2023-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import sys +import time +from collections import namedtuple, OrderedDict from unittest import TestCase sys.path.append('../util') -from misc import human_readable_size +from misc import check_data_package_system_avus, human_readable_size, last_run_time_acceptable, remove_empty_objects + +# AVs of a successfully published data package, that is the first version of the package +avs_success_data_package = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/ICGVFV-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/ICGVFV-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/ICGVFV.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": 
"https://public.yoda.test/allinone/UU01/ICGVFV.html", + "org_publication_lastModifiedDateTime": "2024-10-04T15:32:46.000000", + "org_publication_license": "Creative Commons Attribution 4.0 International Public License", + "org_publication_licenseUri": "https://creativecommons.org/licenses/by/4.0/legalcode", + "org_publication_oaiUploaded": "yes", + "org_publication_publicationDate": "2024-10-04T15:33:17.853806", + "org_publication_randomId": "ICGVFV", + "org_publication_status": "OK", + "org_publication_submission_actor": "researcher#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-default-3/research-default-3[1728048679]", + "org_publication_versionDOI": "10.00012/UU01-ICGVFV", + "org_publication_versionDOIMinted": "yes", +} + +avs_success_data_package_multiversion = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_baseDOI": "10.00012/UU01-X0GU3S", + "org_publication_baseDOIMinted": "yes", + "org_publication_baseRandomId": "X0GU3S", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/YU0JDH-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/YU0JDH-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/YU0JDH.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/YU0JDH.html", + "org_publication_lastModifiedDateTime": "2024-10-11T08:49:17.000000", + "org_publication_license": "Custom", + "org_publication_oaiUploaded": "yes", + "org_publication_previous_version": "/tempZone/home/vault-initial1/new-group01[1728550839]", + "org_publication_publicationDate": "2024-10-11T08:50:01.812220", + "org_publication_randomId": "YU0JDH", + "org_publication_status": "OK", + "org_publication_submission_actor": "datamanager#tempZone", 
+ "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728629336]", + "org_publication_versionDOI": "10.00012/UU01-YU0JDH", + "org_publication_versionDOIMinted": "yes" +} + +avs_success_data_package_multiversion_first = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_baseDOI": "10.00012/UU01-X0GU3S", + "org_publication_baseRandomId": "X0GU3S", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/T8D8QU-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/T8D8QU-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/T8D8QU.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/T8D8QU.html", + "org_publication_lastModifiedDateTime": "2024-10-10T09:06:05.000000", + "org_publication_license": "Creative Commons Attribution 4.0 International Public License", + "org_publication_licenseUri": "https://creativecommons.org/licenses/by/4.0/legalcode", + "org_publication_next_version": "/tempZone/home/vault-initial1/new-group01[1728545387]", + "org_publication_oaiUploaded": "yes", + "org_publication_publicationDate": "2024-10-10T09:06:02.177810", + "org_publication_randomId": "T8D8QU", + "org_publication_status": "OK", + "org_publication_submission_actor": "datamanager#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728543897]", + "org_publication_versionDOI": "10.00012/UU01-T8D8QU", + "org_publication_versionDOIMinted": "yes", +} + +# From avu.py +Avu = namedtuple('Avu', list('avu')) +Avu.attr = Avu.a +Avu.value = Avu.v +Avu.unit = Avu.u class UtilMiscTest(TestCase): + def test_check_data_package_system_avus(self): + # Success + avs = avs_success_data_package + avus_success = [Avu(attr, 
val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, extra optional avu + avs['org_publication_baseDOIAvailable'] = 'yes' + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + del avs['org_publication_baseDOIAvailable'] + + # Missing license Uri for non-custom license + del avs['org_publication_licenseUri'] + avus_missing_license_uri = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing_license_uri) + self.assertFalse(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Custom license, no license Uri (happy flow) + avs['org_publication_license'] = "Custom" + avus_custom_license = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_custom_license) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Unexpected + avs['org_publication_userAddedSomethingWeird'] = "yodayoda:)" + avus_unexpected = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_unexpected) + self.assertTrue(result['no_missing_avus']) + self.assertFalse(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + 
self.assertTrue(len(result['unexpected_avus']) == 1) + + # Missing and unexpected + del avs['org_publication_landingPagePath'] + avus_missing_unexpected = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing_unexpected) + self.assertFalse(result['no_missing_avus']) + self.assertFalse(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 1) + + # Missing + del avs['org_publication_userAddedSomethingWeird'] + avus_missing = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing) + self.assertFalse(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, latest version of a publication + avs = avs_success_data_package_multiversion + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, first version of a publication that has had other versions + avs = avs_success_data_package_multiversion_first + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + def test_last_run_time_acceptable(self): + """Test the last run time for copy to vault""" + # No last run time (job hasn't been tried before) + found = False + last_run = 1 + self.assertEqual(last_run_time_acceptable("b", found, last_run, 300), True) + + # Last run
time greater than the backoff, so can run + now = int(time.time()) + found = True + copy_backoff_time = 300 + last_run = now - copy_backoff_time - 1 + self.assertEqual(last_run_time_acceptable("b", found, last_run, copy_backoff_time), True) + + # Last run time more recent than the backoff, so should not run + found = True + copy_backoff_time = 300 + last_run = now + self.assertEqual(last_run_time_acceptable("b", found, int(time.time()), copy_backoff_time), False) + def test_human_readable_size(self): output = human_readable_size(0) - self.assertEquals(output, "0 B") + self.assertEqual(output, "0 B") output = human_readable_size(1024) - self.assertEquals(output, "1.0 KiB") + self.assertEqual(output, "1.0 KiB") output = human_readable_size(1048576) - self.assertEquals(output, "1.0 MiB") + self.assertEqual(output, "1.0 MiB") output = human_readable_size(26843550000) - self.assertEquals(output, "25.0 GiB") + self.assertEqual(output, "25.0 GiB") output = human_readable_size(989560500000000) - self.assertEquals(output, "900.0 TiB") + self.assertEqual(output, "900.0 TiB") output = human_readable_size(112590000000000000) - self.assertEquals(output, "100.0 PiB") + self.assertEqual(output, "100.0 PiB") output = human_readable_size(3931462330709348188) - self.assertEquals(output, "3.41 EiB") + self.assertEqual(output, "3.41 EiB") + + def test_remove_empty_objects(self): + d = OrderedDict({"key1": None, "key2": "", "key3": {}, "key4": []}) + self.assertDictEqual(remove_empty_objects(d), OrderedDict({})) + d = OrderedDict({"key1": "value1", "key2": {"key1": None, "key2": "", "key3": {}, "key4": []}}) + self.assertDictEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) + d = OrderedDict({"key1": "value1", "key2": {"key1": None, "key2": "", "key3": {}, "key4": [], "key5": "value5"}}) + self.assertDictEqual(remove_empty_objects(d), OrderedDict({"key1": "value1", "key2": {"key5": "value5"}})) + d = OrderedDict({"key1": "value1", "key2": [{}]}) + 
self.assertDictEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) diff --git a/unit-tests/test_vault.py b/unit-tests/test_vault.py new file mode 100644 index 000000000..c0bd7c5fe --- /dev/null +++ b/unit-tests/test_vault.py @@ -0,0 +1,58 @@ +"""Unit tests for the vault functions""" + +__copyright__ = 'Copyright (c) 2023-2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import sys +from unittest import TestCase + +sys.path.append('..') + +from vault_utils import get_copy_folder_to_vault_irsync_command, get_sanity_checks_results_copy_to_vault_paths + + +class VaultTest(TestCase): + + def test_get_copy_folder_to_vault_irsync_command_with_vault_resc(self): + output = get_copy_folder_to_vault_irsync_command("/zoneName/home/research-foo/abc", "/zoneName/home/vault-foo/abc", "vaultResc", True) + self.assertEqual(output, ["irsync", "-rK", "-R", "vaultResc", "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]) + + def test_get_copy_folder_to_vault_irsync_command_without_vault_resc(self): + output = get_copy_folder_to_vault_irsync_command("/zoneName/home/research-foo/abc", "/zoneName/home/vault-foo/abc", None, True) + self.assertEqual(output, ["irsync", "-rK", "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]) + + def test_get_copy_folder_to_vault_irsync_command_no_multithreading(self): + output = get_copy_folder_to_vault_irsync_command("/zoneName/home/research-foo/abc", "/zoneName/home/vault-foo/abc", "vaultResc", False) + self.assertEqual(output, ["irsync", "-rK", "-R", "vaultResc", "-N", "0", "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]) + + def test_get_sanity_check_results_copy_to_vault_paths_ok(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/vault-foo") + self.assertEqual(output, []) + + def test_get_sanity_check_results_copy_to_vault_paths_relative_source(self): + output = 
get_sanity_checks_results_copy_to_vault_paths("research-foo", "/tempZone/home/vault-foo") + self.assertEqual(output, ["Source path is not absolute."]) + + def test_get_sanity_check_results_copy_to_vault_paths_relative_target(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "vault-foo") + self.assertEqual(output, ["Target path is not absolute."]) + + def test_get_sanity_check_results_copy_to_vault_paths_dotdot_source(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo/..", "/tempZone/home/vault-foo") + self.assertEqual(output, ["Source path contains parent references (..)"]) + + def test_get_sanity_check_results_copy_to_vault_paths_dotdot_target(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/../vault-foo") + self.assertEqual(output, ["Target path contains parent references (..)"]) + + def test_get_sanity_check_results_copy_to_vault_paths_wrong_source_space(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/vault-foo", "/tempZone/home/vault-foo") + self.assertEqual(output, ["Source path not in research or deposit group."]) + + def test_get_sanity_check_results_copy_to_vault_paths_wrong_target_space(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/deposit-foo") + self.assertEqual(output, ["Target path not in vault group."]) + + def test_get_sanity_check_results_copy_to_vault_paths_source_target_mismatch(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/vault-bar") + self.assertEqual(output, ["Source and target group are not in same compartment."]) diff --git a/unit-tests/unit_tests.py b/unit-tests/unit_tests.py index a008c8607..58b0e6f87 100644 --- a/unit-tests/unit_tests.py +++ b/unit-tests/unit_tests.py @@ -9,13 +9,18 @@ from test_intake import IntakeTest from 
test_policies import PoliciesTest from test_revisions import RevisionTest +from test_schema_transformations import CorrectifyIsniTest, CorrectifyOrcidTest, CorrectifyScopusTest from test_util_misc import UtilMiscTest from test_util_pathutil import UtilPathutilTest from test_util_yoda_names import UtilYodaNamesTest +from test_vault import VaultTest def suite(): test_suite = TestSuite() + test_suite.addTest(makeSuite(CorrectifyIsniTest)) + test_suite.addTest(makeSuite(CorrectifyOrcidTest)) + test_suite.addTest(makeSuite(CorrectifyScopusTest)) test_suite.addTest(makeSuite(GroupImportTest)) test_suite.addTest(makeSuite(IntakeTest)) test_suite.addTest(makeSuite(PoliciesTest)) @@ -23,4 +28,5 @@ def suite(): test_suite.addTest(makeSuite(UtilMiscTest)) test_suite.addTest(makeSuite(UtilPathutilTest)) test_suite.addTest(makeSuite(UtilYodaNamesTest)) + test_suite.addTest(makeSuite(VaultTest)) return test_suite diff --git a/util/avu.py b/util/avu.py index 9653b96e4..7d4bfcfb4 100644 --- a/util/avu.py +++ b/util/avu.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- """Utility / convenience functions for dealing with AVUs.""" -__copyright__ = 'Copyright (c) 2019-2021, Utrecht University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import itertools +import json from collections import namedtuple import genquery import irods_types +import log import msi import pathutil @@ -33,6 +35,70 @@ def of_data(ctx, path): "COLL_NAME = '{}' AND DATA_NAME = '{}'".format(*pathutil.chop(path)))) +def get_attr_val_of_coll(ctx, coll, attr): + """Get the value corresponding to an attr for a given collection.""" + iter = genquery.Query( + ctx, + "META_COLL_ATTR_VALUE", + "META_COLL_ATTR_NAME = '{}' AND COLL_NAME = '{}'".format(attr, coll)) + + for row in iter: + return row + raise ValueError("Attribute {} not found in AVUs of collection {}".format(attr, coll)) + + +def inside_coll(ctx, path, recursive=False): + """Get a list of all AVUs inside a 
collection with corresponding paths. + + Note: the returned value is a generator / lazy list, so that large + collections can be handled without keeping everything in memory. + use list(...) on the result to get an actual list if necessary. + + The returned paths are absolute paths (e.g. '/tempZone/home/x'). + + :param ctx: Combined type of a callback and rei struct + :param path: Path of collection + :param recursive: List AVUs recursively + + :returns: List of all AVUs inside a collection with corresponding paths + """ + # coll+name -> path + def to_absolute(row, type): + if type == "collection": + return (row[1], type, row[2], row[3], row[4]) + else: + return ('{}/{}'.format(row[0], row[1]), type, row[2], row[3], row[4]) + + collection_root = genquery.row_iterator( + "COLL_PARENT_NAME, COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE, META_COLL_ATTR_UNITS", + "COLL_PARENT_NAME = '{}'".format(path), + genquery.AS_LIST, ctx) + collection_root = itertools.imap(lambda x: to_absolute(x, "collection"), collection_root) + + data_objects_root = genquery.row_iterator( + "COLL_NAME, DATA_NAME, META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, META_DATA_ATTR_UNITS", + "COLL_NAME = '{}'".format(path), + genquery.AS_LIST, ctx) + data_objects_root = itertools.imap(lambda x: to_absolute(x, "data_object"), data_objects_root) + + if not recursive: + return itertools.chain(collection_root, data_objects_root) + + collection_sub = genquery.row_iterator( + "COLL_PARENT_NAME, COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE, META_COLL_ATTR_UNITS", + "COLL_PARENT_NAME like '{}/%'".format(path), + genquery.AS_LIST, ctx) + collection_sub = itertools.imap(lambda x: to_absolute(x, "collection"), collection_sub) + + data_objects_sub = genquery.row_iterator( + "COLL_NAME, DATA_NAME, META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, META_DATA_ATTR_UNITS", + "COLL_NAME like '{}/%'".format(path), + genquery.AS_LIST, ctx) + data_objects_sub = itertools.imap(lambda x: to_absolute(x, "data_object"), 
data_objects_sub) + + return itertools.chain(collection_root, data_objects_root, collection_sub, data_objects_sub) + + def of_group(ctx, group): """Get (a,v,u) triplets for a given group.""" return itertools.imap(lambda x: Avu(*x), @@ -46,12 +112,40 @@ def set_on_data(ctx, path, a, v): msi.set_key_value_pairs_to_obj(ctx, x['arguments'][1], path, '-d') -def set_on_coll(ctx, coll, a, v): - """Set key/value metadata on a collection.""" +def set_on_coll(ctx, coll, a, v, catch=False): + """Set key/value metadata on a collection. Optionally catch any exceptions that occur. + + :param ctx: Combined type of a callback and rei struct + :param coll: Collection to get paginated contents of + :param a: Attribute + :param v: Value + :param catch: Whether to catch any exceptions that occur + + :returns: True if catch=True and no exceptions occurred during operation + """ + if catch: + return _set_on_coll_catch(ctx, coll, a, v) + + _set_on_coll(ctx, coll, a, v) + return True + + +def _set_on_coll(ctx, coll, a, v): x = msi.string_2_key_val_pair(ctx, '{}={}'.format(a, v), irods_types.BytesBuf()) msi.set_key_value_pairs_to_obj(ctx, x['arguments'][1], coll, '-C') +def _set_on_coll_catch(ctx, coll, a, v): + """Set AVU, but catch exception.""" + try: + _set_on_coll(ctx, coll, a, v) + except Exception: + log.write(ctx, "Failed to set AVU {} on coll {}".format(a, coll)) + return False + + return True + + def set_on_resource(ctx, resource, a, v): """Set key/value metadata on a resource.""" x = msi.string_2_key_val_pair(ctx, '{}={}'.format(a, v), irods_types.BytesBuf()) @@ -100,11 +194,39 @@ def rm_from_group(ctx, group, a, v): msi.remove_key_value_pairs_from_obj(ctx, x['arguments'][1], group, '-u') -def rmw_from_coll(ctx, obj, a, v, u=''): - """Remove AVU from collection with wildcards.""" +def rmw_from_coll(ctx, obj, a, v, catch=False, u=''): + """Remove AVU from collection with wildcards. Optionally catch any exceptions that occur. 
+ + :param ctx: Combined type of a callback and rei struct + :param obj: Collection to remove metadata from + :param a: Attribute + :param v: Value + :param catch: Whether to catch any exceptions that occur + :param u: Unit + + :returns: True if the operation succeeded; False if catch=True and an exception occurred + """ + if catch: + return _rmw_from_coll_catch(ctx, obj, a, v, u) + + _rmw_from_coll(ctx, obj, a, v, u) + return True + + +def _rmw_from_coll(ctx, obj, a, v, u=''): msi.rmw_avu(ctx, '-C', obj, a, v, u) +def _rmw_from_coll_catch(ctx, obj, a, v, u=''): + try: + _rmw_from_coll(ctx, obj, a, v, u) + except Exception: + log.write(ctx, "Failed to rm AVU {} on coll {}".format(a, obj)) + return False + + return True + + def rmw_from_data(ctx, obj, a, v, u=''): """Remove AVU from data object with wildcards.""" msi.rmw_avu(ctx, '-d', obj, a, v, u) @@ -113,3 +235,30 @@ def rmw_from_data(ctx, obj, a, v, u=''): def rmw_from_group(ctx, group, a, v, u=''): """Remove AVU from group with wildcards.""" msi.rmw_avu(ctx, '-u', group, a, v, u) + + +def apply_atomic_operations(ctx, operations): + """Sequentially executes all operations as a single transaction. + + Operations should be a dict with structure as defined in + https://docs.irods.org/4.2.12/doxygen/libmsi__atomic__apply__metadata__operations_8cpp.html + If an error occurs, all updates are rolled back and an error is returned. 
+ + :param ctx: Combined type of a callback and rei struct + :param operations: Dict containing the batch of metadata operations + + :returns: Boolean indicating if all metadata operations were executed + """ + try: + msi.atomic_apply_metadata_operations(ctx, json.dumps(operations), "") + return True + except msi.Error as e: + # iRODS errorcode -1811000 (INVALID_OPERATION) + if str(e).find("-1811000") > -1: + log.write(ctx, "apply_atomic_operations: invalid metadata operation") + # iRODS errorcode -130000 (SYS_INVALID_INPUT_PARAM) + elif str(e).find("-130000") > -1: + log.write(ctx, "apply_atomic_operations: invalid entity name or entity type") + else: + log.write(ctx, "apply_atomic_operations: {}".format(e)) + return False diff --git a/util/config.py b/util/config.py index 2054bc27a..704eb688c 100644 --- a/util/config.py +++ b/util/config.py @@ -145,7 +145,10 @@ def __repr__(self): text_file_extensions=[], user_max_connections_enabled=False, user_max_connections_number=4, - vault_copy_multithread_enabled=True) + vault_copy_backoff_time=300, + vault_copy_max_retries=5, + vault_copy_multithread_enabled=True, + python3_interpreter='/usr/local/bin/python3') # }}} diff --git a/util/constants.py b/util/constants.py index b679d090b..2d91034a0 100644 --- a/util/constants.py +++ b/util/constants.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Constants that apply to all Yoda environments.""" -__copyright__ = 'Copyright (c) 2016-2023, Utrecht University' +__copyright__ = 'Copyright (c) 2016-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' from enum import Enum @@ -87,6 +87,8 @@ IIARCHIVEATTRNAME = UUORGMETADATAPREFIX + 'archival_status' IIBAGITOR = UUORGMETADATAPREFIX + 'bagitor' IICOPYPARAMSNAME = UUORGMETADATAPREFIX + 'copy_to_vault_params' +IICOPYRETRYCOUNT = UUORGMETADATAPREFIX + 'retry_count' +IICOPYLASTRUN = UUORGMETADATAPREFIX + 'last_run' DATA_PACKAGE_REFERENCE = UUORGMETADATAPREFIX + 'data_package_reference' diff --git a/util/data_object.py 
b/util/data_object.py index b625672fb..73fed02c2 100644 --- a/util/data_object.py +++ b/util/data_object.py @@ -24,6 +24,39 @@ def exists(ctx, path): genquery.AS_LIST, ctx))) > 0 +def get_properties(ctx, data_id, resource): + """ Retrieves default properties of a data object from iRODS. + + :param ctx: Combined type of a callback and rei struct + :param data_id: data_id of the data object + :param resource: Name of resource + + :returns: dictionary mapping each requested property to its retrieved value, or None if not found. + """ + # Default properties available for retrieval + properties = [ + "DATA_ID", "DATA_MODIFY_TIME", "DATA_OWNER_NAME", "DATA_SIZE", + "COLL_ID", "DATA_RESC_HIER", "DATA_NAME", "COLL_NAME", + ] + + # Retrieve data object with default properties + query_fields = ", ".join(properties) + iter = genquery.row_iterator( + query_fields, + "DATA_ID = '{}' AND DATA_RESC_HIER like '{}%'".format(data_id, resource), + genquery.AS_LIST, ctx + ) + + # Return a None when no data object is found + prop_dict = None + + for row in iter: + prop_dict = {prop: value for prop, value in zip(properties, row)} + break + + return prop_dict + + def owner(ctx, path): """Find the owner of a data object. Returns (name, zone) or None.""" owners = list(genquery.row_iterator( @@ -198,6 +231,18 @@ def name_from_id(ctx, data_id): return '/'.join(x) +def id_from_path(ctx, path): + """Get data object id from data object path at its first appearance. + + :param ctx: Combined type of a callback and rei struct + :param path: Path to iRODS data object + + :returns: Data object id + """ + return genquery.Query(ctx, "DATA_ID", + "COLL_NAME = '%s' AND DATA_NAME = '%s'" % pathutil.chop(path)).first() + + def decode_checksum(checksum): """Decode data object checksum. 
diff --git a/util/jsonutil.py b/util/jsonutil.py index 6d775c741..114ecf798 100644 --- a/util/jsonutil.py +++ b/util/jsonutil.py @@ -69,7 +69,7 @@ def _promote_strings(json_data): :returns: JSON structure with UTF-8 encoded strings transformed to unicode strings """ return _fold(json_data, - str=lambda x: x.decode('utf-8'), + str=lambda x: x.decode('utf-8', errors='replace'), OrderedDict=lambda x: OrderedDict([(k.decode('utf-8'), v) for k, v in x.items()]), dict=lambda x: OrderedDict([(k.decode('utf-8'), v) for k, v in x.items()])) diff --git a/util/log.py b/util/log.py index 994f42e4f..729f8fdd5 100644 --- a/util/log.py +++ b/util/log.py @@ -17,15 +17,20 @@ import user -def write(ctx, message): - """Write a message to the log, including client name and originating module. +def write(ctx, message, write_stdout=False): + """Write a message to the log or stdout. + Includes client name and originating module if writing to log. - :param ctx: Combined type of a callback and rei struct - :param message: Message to write to log + :param ctx: Combined type of a callback and rei struct + :param message: Message to write to log + :param write_stdout: Whether to write to stdout (used for a few of our scripts) """ - stack = inspect.stack()[1] - module = inspect.getmodule(stack[0]) - _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) + if write_stdout: + ctx.writeLine("stdout", message) + else: + stack = inspect.stack()[1] + module = inspect.getmodule(stack[0]) + _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) def _write(ctx, message): diff --git a/util/misc.py b/util/misc.py index 062a594f0..6c1e54623 100644 --- a/util/misc.py +++ b/util/misc.py @@ -1,10 +1,106 @@ # -*- coding: utf-8 -*- """Miscellaneous util functions.""" -__copyright__ = 'Copyright (c) 2019-2023, Utrecht University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import math +import time +from 
collections import OrderedDict + +import constants + + +def check_data_package_system_avus(extracted_avus): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e, 'org_'). + This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. + + :param extracted_avus: AVUs of the data package in AVU form + + :returns: Dictionary of the results of the check + """ + # Filter those starting with 'org_publication' + extracted_avs = {} + for m in extracted_avus: + if m.attr.startswith(constants.UUORGMETADATAPREFIX + 'publication_'): + extracted_avs[m.attr] = m.value + extracted_attrs = set(extracted_avs.keys()) + + # Define the set of ground truth AVUs + avu_names_suffix = { + 'approval_actor', 'randomId', + 'versionDOI', 'dataCiteJsonPath', 'license', + 'anonymousAccess', 'versionDOIMinted', + 'accessRestriction', 'landingPagePath', + 'publicationDate', + 'vaultPackage', 'submission_actor', 'status', + 'lastModifiedDateTime', 'combiJsonPath', + 'landingPageUploaded', 'oaiUploaded', + 'landingPageUrl', 'dataCiteMetadataPosted' + } + + # If the license is not Custom, it must have a licenseUri + if constants.UUORGMETADATAPREFIX + 'publication_license' in extracted_attrs: + if extracted_avs[constants.UUORGMETADATAPREFIX + 'publication_license'] != "Custom": + avu_names_suffix.add('licenseUri') + + # Define additional set of AVUs with more than one version of publication + avu_names_version_suffix = { + 'previous_version', 'baseDOI', 'baseRandomId', + 'baseDOIMinted' + } + + # Define additional set of AVUs expected for the first version of a publication, when there are multiple versions + avu_names_first_version_suffix = { + 'baseRandomId', 'baseDOI', 'next_version' + } + + # for the second version, all we need is next_version in addition to avu_names_version_suffix + avu_names_previous_version_suffix = {'next_version'} + + # 
optional avus + avu_names_optional_suffix = { + 'versionDOIAvailable', 'baseDOIAvailable' + } + + combined_avu_names_suffix = avu_names_suffix + + if constants.UUORGMETADATAPREFIX + 'publication_previous_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_version_suffix) + if constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_previous_version_suffix) + elif constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_first_version_suffix) + + ground_truth_avus = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + combined_avu_names_suffix.update(avu_names_optional_suffix) + ground_truth_avus_with_optional = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + # Find missing and unexpected AVUs + missing_avus = ground_truth_avus - extracted_attrs + unexpected_avus = extracted_attrs - ground_truth_avus_with_optional + + results = { + 'no_missing_avus': not bool(missing_avus), + 'missing_avus': list(missing_avus), + 'no_unexpected_avus': not bool(unexpected_avus), + 'unexpected_avus': list(unexpected_avus) + } + + return results + + +def last_run_time_acceptable(coll, found, last_run, config_backoff_time): + """Return whether the last run time is acceptable to continue with task.""" + now = int(time.time()) + + if found: + # Too soon to run + if now < last_run + config_backoff_time: + return False + + return True def human_readable_size(size_bytes): @@ -16,3 +112,23 @@ def human_readable_size(size_bytes): p = math.pow(1024, i) s = round(size_bytes / p, 2) return '{} {}'.format(s, size_name[i]) + + +def remove_empty_objects(d): + """Remove empty objects (None, '', {}, []) from OrderedDict.""" + if isinstance(d, dict): + # Create OrderedDict to maintain order. 
+ cleaned_dict = OrderedDict() + for k, v in d.items(): + # Recursively remove empty objects. + cleaned_value = remove_empty_objects(v) + # Only add non-empty values. + if cleaned_value not in (None, '', {}, []): + cleaned_dict[k] = cleaned_value + return cleaned_dict + elif isinstance(d, list): + # Clean lists by filtering out empty objects. + return [remove_empty_objects(item) for item in d if remove_empty_objects(item) not in (None, '', {}, [])] + else: + # Return the value because it is not a dict or list. + return d diff --git a/util/msi.py b/util/msi.py index 882fdac7d..74c5431da 100644 --- a/util/msi.py +++ b/util/msi.py @@ -6,7 +6,7 @@ all errors to unambiguous Python exceptions. """ -__copyright__ = 'Copyright (c) 2019-2023, Utrecht University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import irods_types @@ -111,6 +111,9 @@ def _make_exception(name, message): get_icat_time, GetIcatTimeError = make('GetIcatTime', 'Could not get Icat time') get_obj_type, GetObjTypeError = make('GetObjType', 'Could not get object type') mod_avu_metadata, ModAVUMetadataError = make('ModAVUMetadata', 'Could not modify AVU metadata') +stat_vault, MSIStatVaultError = make("_stat_vault", 'Could not stat file system object in vault.') +file_checksum, FileChecksumError = make("_file_checksum", 'Could not calculate non-persistent checksum of vault file.') +dir_list, DirListError = make("_dir_list", 'Could not list vault directory contents.') archive_create, ArchiveCreateError = make('ArchiveCreate', 'Could not create archive') archive_index, ArchiveIndexError = make('ArchiveIndex', 'Could not index archive') @@ -135,7 +138,11 @@ def _make_exception(name, message): add_avu, AddAvuError = make('_add_avu', 'Could not add metadata to object') rmw_avu, RmwAvuError = make('_rmw_avu', 'Could not remove metadata to object') +atomic_apply_metadata_operations, AtomicApplyMetadataOperationsError = make('_atomic_apply_metadata_operations', 
'Could not apply atomic metadata operations') + sudo_obj_acl_set, SudoObjAclSetError = make('SudoObjAclSet', 'Could not set ACLs as admin') +sudo_obj_meta_set, SudoObjMetaSetError = make('SudoObjMetaSet', 'Could not set metadata as admin') +sudo_obj_meta_remove, SudoObjMetaRemoveError = make('SudoObjMetaRemove', 'Could not remove metadata as admin') touch, TouchError = make('_touch', 'Could not update the data object or collection') obj_stat, ObjStatError = make('ObjStat', 'Could not get the stat of data object or collection') diff --git a/uuGroupPolicyChecks.r b/uuGroupPolicyChecks.r index 7a649644d..54d08ee04 100644 --- a/uuGroupPolicyChecks.r +++ b/uuGroupPolicyChecks.r @@ -639,10 +639,13 @@ uuUserPolicyCanUserModify(*actor, *userName, *attribute, *allowed, *reason) { } else { *reason = "Cannot modify settings of other user."; } + # User SRAM invitation + } else if (*attribute == "org_sram_invited") { + *allowed = 1; # User notifications } else if (trimr(*attribute, "_") == "org_notification") { *allowed = 1; } else { - *reason = "Invalid user attribute name."; + *reason = "Invalid user attribute name."; } } diff --git a/uuPolicies.r b/uuPolicies.r index 54923686a..7e8dcba60 100644 --- a/uuPolicies.r +++ b/uuPolicies.r @@ -45,7 +45,7 @@ acCreateUserZoneCollections { uuGetUserType($otherUserName, *type); if (*type == "rodsuser") { # Do not create home directories for regular users. - # but do create trash directories as iRODS always uses the personal trash folder evan when in a group directory + # but do create trash directories as iRODS always uses the personal trash folder even when in a group directory acCreateCollByAdmin("/"++$rodsZoneProxy++"/trash/home", $otherUserName); } else if (*type == "rodsgroup" && ($otherUserName like "read-*")) { # Do not create home directories for read- groups. 
diff --git a/uuTreeWalk.r b/uuTreeWalk.r index a61673716..bd610d91e 100644 --- a/uuTreeWalk.r +++ b/uuTreeWalk.r @@ -7,8 +7,8 @@ # \brief Walks through a collection tree and calls an arbitrary rule for each tree-item. # # \param[in] direction can be "forward" or "reverse" -# forward means process collection itself, then childs -# reverse means process childs first +# forward means process collection itself, then children +# reverse means process children first # reverse is useful e.g. to delete collection trees # \param[in] topLevelCollection pathname of the root of the tree, must be collection # NB: the root itself is also processed diff --git a/vault.py b/vault.py index f949973f0..1e8d9f653 100644 --- a/vault.py +++ b/vault.py @@ -22,6 +22,7 @@ import policies_datamanager import policies_datapackage_status from util import * +from vault_utils import get_copy_folder_to_vault_irsync_command, get_sanity_checks_results_copy_to_vault_paths __all__ = ['api_vault_submit', 'api_vault_approve', @@ -30,12 +31,14 @@ 'api_vault_republish', 'api_vault_preservable_formats_lists', 'api_vault_unpreservable_files', + 'rule_vault_copy_to_vault', 'rule_vault_copy_numthreads', 'rule_vault_copy_original_metadata_to_vault', 'rule_vault_write_license', 'rule_vault_enable_indexing', 'rule_vault_disable_indexing', 'rule_vault_process_status_transitions', + 'rule_vault_grant_readers_vault_access', 'api_vault_system_metadata', 'api_vault_collection_details', 'api_vault_get_package_by_reference', @@ -205,7 +208,7 @@ def api_vault_copy_to_research(ctx, coll_origin, coll_target): if not collection.exists(ctx, coll_target): return api.Error('TargetPathNotExists', 'The target you specified does not exist') - # Check if user has READ ACCESS to specific vault packatge in collection coll_origin. + # Check if user has READ ACCESS to specific vault package in collection coll_origin. 
user_full_name = user.full_name(ctx) category = groups.group_category(ctx, group_name) is_datamanager = groups.user_is_datamanager(ctx, category, user.full_name(ctx)) @@ -483,22 +486,6 @@ def api_vault_system_metadata(ctx, coll): landinpage_url = row[0] system_metadata["Landingpage"] = "{}".format(landinpage_url, landinpage_url) - # Check for previous version. - previous_version = get_previous_version(ctx, coll) - if previous_version: - previous_version_doi = get_doi(ctx, previous_version) - system_metadata["Persistent Identifier DOI"] = persistent_identifier_doi = "previous version: {}".format(previous_version_doi, previous_version_doi) - - # Persistent Identifier DOI. - package_doi = get_doi(ctx, coll) - - if package_doi: - if previous_version: - persistent_identifier_doi = "{} (previous version: {})".format(package_doi, package_doi, previous_version_doi, previous_version_doi) - else: - persistent_identifier_doi = "{}".format(package_doi, package_doi) - system_metadata["Persistent Identifier DOI"] = persistent_identifier_doi - # Data Package Reference. 
data_package_reference = "" iter = genquery.row_iterator( @@ -559,6 +546,46 @@ def get_coll_vault_status(ctx, path, org_metadata=None): return constants.vault_package_state.EMPTY +def get_all_published_versions(ctx, path): + """Get all published versions of a data package.""" + base_doi = get_doi(ctx, path, 'base') + package_doi = get_doi(ctx, path) + coll_parent_name = path.rsplit('/', 1)[0] + + org_publ_info, data_packages, grouped_base_dois = get_all_doi_versions(ctx, coll_parent_name) + + count = 0 + all_versions = [] + + for data in data_packages: + if data[2] == package_doi: + count += 1 + + if count == 1: # Base DOI does not exist as it is first version of the publication + # Convert the date into two formats for display and tooltip (Jan 1, 1990 and 1990-01-01 00:00:00) + data_packages = [[x[0], datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime("%b %d, %Y"), x[2], + datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime('%Y-%m-%d %H:%M:%S%z'), x[3]] for x in data_packages] + + for item in data_packages: + if item[2] == package_doi: + all_versions.append([item[1], item[2], item[3]]) + else: # Base DOI exists + # Sort by publication date + sorted_publ = [sorted(x, key=lambda x: datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f"), reverse=True) for x in grouped_base_dois] + + sorted_publ = [element for innerList in sorted_publ for element in innerList] + + # Convert the date into two formats for display and tooltip (Jan 1, 1990 and 1990-01-01 00:00:00) + sorted_publ = [[x[0], datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime("%b %d, %Y"), x[2], + datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime('%Y-%m-%d %H:%M:%S%z'), x[3]] for x in sorted_publ] + + for item in sorted_publ: + if item[0] == base_doi: + all_versions.append([item[1], item[2], item[3]]) + + return base_doi, package_doi, all_versions + + @api.make() def api_vault_collection_details(ctx, path): """Return details of a vault collection. 
@@ -571,7 +598,7 @@ def api_vault_collection_details(ctx, path): if not collection.exists(ctx, path): return api.Error('nonexistent', 'The given path does not exist') - # Check if collection is in vault spcae. + # Check if collection is in vault space. space, _, group, subpath = pathutil.info(path) if space is not pathutil.Space.VAULT: return {} @@ -603,6 +630,8 @@ def api_vault_collection_details(ctx, path): return {'member_type': member_type, 'is_datamanager': is_datamanager} else: metadata = True + # Retrieve all published versions + base_doi, package_doi, all_versions = get_all_published_versions(ctx, path) # Check if a vault action is pending. vault_action_pending = False @@ -652,7 +681,10 @@ def api_vault_collection_details(ctx, path): "has_datamanager": has_datamanager, "is_datamanager": is_datamanager, "vault_action_pending": vault_action_pending, - "research_group_access": research_group_access + "research_group_access": research_group_access, + "all_versions": all_versions, + "base_doi": base_doi, + "package_doi": package_doi } if config.enable_data_package_archive: import vault_archive @@ -764,25 +796,74 @@ def api_vault_get_publication_terms(ctx): return api.Error('TermsReadFailed', 'Could not open Terms and Agreements.') -@api.make() -def api_grant_read_access_research_group(ctx, coll): - """Grant read rights of research group for datapackage in vault. +def change_read_access_group(ctx, coll, actor, group, grant=True): + """Grant/revoke research group read access to vault package. 
- :param ctx: Combined type of a callback and rei struct - :param coll: Collection of data package to remove read rights from + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to grant/remove read rights from + :param actor: User changing the permissions + :param group: Group to grant/revoke read access to vault package + :param grant: Whether to grant or revoke access - :returns: API status + :returns: 2-Tuple of boolean successfully changed, API status if error + """ + try: + acl_kv = msi.kvpair(ctx, "actor", actor) + if grant: + msi.sudo_obj_acl_set(ctx, "recursive", "read", group, coll, acl_kv) + else: + msi.sudo_obj_acl_set(ctx, "recursive", "null", group, coll, acl_kv) + except Exception: + policy_error = policies_datamanager.can_datamanager_acl_set(ctx, coll, actor, group, "1", "read") + if bool(policy_error): + return False, api.Error('ErrorACLs', 'Could not acquire datamanager access to {}.'.format(coll)) + else: + return False, api.Error('ErrorACLs', str(policy_error)) + + return True, '' + + +def check_change_read_access_research_group(ctx, coll, grant=True): + """Initial checks when changing read rights of research group for datapackage in vault. 
+ + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to revoke/grant read rights from + :param grant: Whether to grant or revoke read rights + + :returns: 2-Tuple of boolean whether ok to continue and API status if error """ + verb = "grant" if grant else "revoke" + if not collection.exists(ctx, coll): - return api.Error('nonexistent', 'The given path does not exist') + return False, api.Error('nonexistent', 'The given path does not exist') coll_parts = coll.split('/') if len(coll_parts) != 5: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') + return False, api.Error('invalid_collection', 'The datamanager can only {} permissions to vault packages'.format(verb)) - space, zone, group, subpath = pathutil.info(coll) + space, _, _, _ = pathutil.info(coll) if space is not pathutil.Space.VAULT: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') + return False, api.Error('invalid_collection', 'The datamanager can only {} permissions to vault packages'.format(verb)) + + return True, '' + + +def change_read_access_research_group(ctx, coll, grant=True): + """Grant/revoke read rights of members of research group to a + datapackage in vault. This operation also includes read only members. 
+ + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to grant/remove read rights from + :param grant: Whether to grant or revoke access + + :returns: API status + """ + verb = "granting" if grant else "revoking" + response, api_error = check_change_read_access_research_group(ctx, coll, True) + if not response: + return api_error + + _, _, group, subpath = pathutil.info(coll) # Find category group_parts = group.split('-') @@ -791,26 +872,34 @@ def api_grant_read_access_research_group(ctx, coll): else: research_group_name = 'research-' + '-'.join(group_parts[1:]) category = groups.group_category(ctx, group) + read_group_name = 'read-' + '-'.join(group_parts[1:]) # Is datamanager? actor = user.full_name(ctx) if groups.user_role(ctx, actor, 'datamanager-' + category) in ['normal', 'manager']: - # Grant research group read access to vault package. - try: - acl_kv = msi.kvpair(ctx, "actor", actor) - msi.sudo_obj_acl_set(ctx, "recursive", "read", research_group_name, coll, acl_kv) - except Exception: - policy_error = policies_datamanager.can_datamanager_acl_set(ctx, coll, actor, research_group_name, "1", "read") - if bool(policy_error): - return api.Error('ErrorACLs', 'Could not acquire datamanager access to {}.'.format(coll)) - else: - return api.Error('ErrorACLs', str(policy_error)) + # Grant/revoke research group read access to vault package. + for group_name in (research_group_name, read_group_name): + response, api_error = change_read_access_group(ctx, coll, actor, group_name, grant) + if not response: + return api_error else: - return api.Error('NoDatamanager', 'Actor must be a datamanager for granting access') + return api.Error('NoDatamanager', 'Actor must be a datamanager for {} access'.format(verb)) return {'status': 'Success', 'statusInfo': ''} +@api.make() +def api_grant_read_access_research_group(ctx, coll): + """Grant read rights of research group for datapackage in vault. 
+ + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to remove read rights from + + :returns: API status + """ + return change_read_access_research_group(ctx, coll, True) + + @api.make() def api_revoke_read_access_research_group(ctx, coll): """Revoke read rights of research group for datapackage in vault. @@ -820,62 +909,75 @@ def api_revoke_read_access_research_group(ctx, coll): :returns: API status """ - if not collection.exists(ctx, coll): - return api.Error('nonexistent', 'The given path does not exist') + return change_read_access_research_group(ctx, coll, False) - coll_parts = coll.split('/') - if len(coll_parts) != 5: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') - space, zone, group, subpath = pathutil.info(coll) - if space is not pathutil.Space.VAULT: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') +@rule.make() +def rule_vault_copy_to_vault(ctx, state): + """ Collect all folders with a given cronjob state + and try to copy them to the vault. - # Find category - group_parts = group.split('-') - if subpath.startswith("deposit-"): - research_group_name = 'deposit-' + '-'.join(group_parts[1:]) - else: - research_group_name = 'research-' + '-'.join(group_parts[1:]) - category = groups.group_category(ctx, group) + :param ctx: Combined type of a callback and rei struct + :param state: one of constants.CRONJOB_STATE + """ + iter = get_copy_to_vault_colls(ctx, state) + for row in iter: + coll = row[0] + log.write(ctx, "copy_to_vault {}: {}".format(state, coll)) + if not folder.precheck_folder_secure(ctx, coll): + continue - # Is datamanager? - actor = user.full_name(ctx) - if groups.user_role(ctx, actor, 'datamanager-' + category) in ['normal', 'manager']: - # Grant research group read access to vault package. 
- try: - acl_kv = msi.kvpair(ctx, "actor", actor) - msi.sudo_obj_acl_set(ctx, "recursive", "null", research_group_name, coll, acl_kv) - except Exception: - policy_error = policies_datamanager.can_datamanager_acl_set(ctx, coll, actor, research_group_name, "1", "read") - if bool(policy_error): - return api.Error('ErrorACLs', 'Could not acquire datamanager access to {}.'.format(coll)) - else: - return api.Error('ErrorACLs', str(policy_error)) - else: - return api.Error('NoDatamanager', 'Actor must be a datamanager for revoking access') + # failed copy + if not folder.folder_secure(ctx, coll): + log.write(ctx, "copy_to_vault {} failed for collection <{}>".format(state, coll)) + folder.folder_secure_set_retry(ctx, coll) - return {'status': 'Success', 'statusInfo': ''} + +def get_copy_to_vault_colls(ctx, cronjob_state): + iter = list(genquery.Query(ctx, + ['COLL_NAME'], + "META_COLL_ATTR_NAME = '{}' AND META_COLL_ATTR_VALUE = '{}'".format( + constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", + cronjob_state), + output=genquery.AS_LIST)) + return iter -def copy_folder_to_vault(ctx, folder, target): - """Copy folder and all its contents to target in vault. +def copy_folder_to_vault(ctx, coll, target): + """Copy folder and all its contents to target in vault using irsync. - The data will reside onder folder '/original' within the vault. + The data will reside under folder '/original' within the vault. 
:param ctx: Combined type of a callback and rei struct - :param folder: Path of a folder in the research space + :param coll: Path of a folder in the research space :param target: Path of a package in the vault space - :raises Exception: Raises exception when treewalk_and_ingest did not finish correctly + :returns: True for successful copy """ - destination = target + '/original' - origin = folder + sanity_check_results = get_sanity_checks_results_copy_to_vault_paths(coll, target) + if len(sanity_check_results) > 0: + log.write(ctx, "Not copying folder to vault because of sanity check failures: " + + str(sanity_check_results)) + return False + + returncode = 0 + irsync_command = get_copy_folder_to_vault_irsync_command(coll, + target, + config.resource_vault, + config.vault_copy_multithread_enabled) - # Origin is a never changing value to be able to designate a relative path within ingest_object - error = 0 # Initial error state. Should stay 0. - if treewalk_and_ingest(ctx, folder, destination, origin, error): - raise Exception('copy_folder_to_vault: Error copying folder to vault') + try: + returncode = subprocess.call(irsync_command) + except Exception as e: + log.write(ctx, "irsync failure: " + e) + log.write(ctx, "irsync failure for coll <{}> and target <{}>".format(coll, target)) + return False + + if returncode != 0: + log.write(ctx, "irsync failure for coll <{}> and target <{}>".format(coll, target)) + return False + + return True def treewalk_and_ingest(ctx, folder, target, origin, error): @@ -930,6 +1032,7 @@ def ingest_object(ctx, parent, item, item_is_collection, destination, origin): source_path = parent + "/" + item read_access = msi.check_access(ctx, source_path, 'read object', irods_types.BytesBuf())['arguments'][2] + # TODO use set_acl_check? 
if read_access != b'\x01': try: msi.set_acl(ctx, "default", "admin:read", user.full_name(ctx), source_path) @@ -973,13 +1076,21 @@ def ingest_object(ctx, parent, item, item_is_collection, destination, origin): return 0 -def set_vault_permissions(ctx, group_name, folder, target): +def set_vault_permissions(ctx, coll, target): """Set permissions in the vault as such that data can be copied to the vault.""" + group_name = folder.collection_group_name(ctx, coll) + if group_name == '': + log.write(ctx, "set_vault_permissions: Cannot determine which deposit or research group <{}> belongs to".format(coll)) + return False + parts = group_name.split('-') base_name = '-'.join(parts[1:]) + valid_read_groups = [group_name] - parts = folder.split('/') vault_group_name = constants.IIVAULTPREFIX + base_name + if parts[0] != 'deposit': + read_group_name = "read-" + base_name + valid_read_groups.append(read_group_name) # Check if noinherit is set zone = user.zone(ctx) @@ -1018,11 +1129,12 @@ def set_vault_permissions(ctx, group_name, folder, target): if access_name != "read object": # Grant the research group read-only access to the collection to enable browsing through the vault. 
def reader_needs_access(ctx, group_name, coll):
    """Tell whether a research group is listed on a collection while no reader group is.

    :param ctx:        Combined type of a callback and rei struct
    :param group_name: Name of the research group to look for
    :param coll:       Path of the collection whose access entries are inspected

    :returns: True if group_name is among the collection's access users and
              no user whose name starts with 'read-' is
    """
    access_rows = genquery.row_iterator(
        "COLL_ACCESS_USER_ID",
        "COLL_NAME = '" + coll + "'",
        genquery.AS_LIST, ctx
    )
    has_reader = False
    has_research = False

    for access_row in access_rows:
        access_name = user.name_from_id(ctx, access_row[0])
        if access_name.startswith('read-'):
            # Any reader group counts here, not only the one belonging to group_name.
            has_reader = True
        elif access_name == group_name:
            has_research = True

    return has_research and not has_reader


def set_reader_vault_permissions(ctx, group_name, zone, dry_run):
    """Give the reader group of a research group read access to the group's vault.

    Access is granted non-recursively on the vault collection itself and
    recursively on every collection directly below it, but only where
    reader_needs_access() reports that it is missing.

    :param ctx:        Combined type of a callback and rei struct
    :param group_name: Research group name
    :param zone:       Zone
    :param dry_run:    Whether to only log which collections would be changed
                       without changing them

    :returns: Boolean whether completed successfully (True) or there were errors (False)
    """
    base_name = '-'.join(group_name.split('-')[1:])
    read_group_name = 'read-' + base_name
    vault_path = "/" + zone + "/home/" + constants.IIVAULTPREFIX + base_name

    # Nothing to do when this vault does not hold any packages.
    if collection.empty(ctx, vault_path):
        return True

    success = True

    if reader_needs_access(ctx, group_name, vault_path):
        # Non-recursive read access on the vault collection itself,
        # so that readers can browse through the vault.
        try:
            if not dry_run:
                msi.set_acl(ctx, "default", "admin:read", read_group_name, vault_path)
                log.write(ctx, "Granted " + read_group_name + " read access to " + vault_path)
            else:
                log.write(ctx, "Would have granted " + read_group_name + " read access to " + vault_path)
        except msi.Error:
            success = False
            log.write(ctx, "Failed to grant " + read_group_name + " read access to " + vault_path)

    package_rows = genquery.row_iterator(
        "COLL_NAME",
        "COLL_PARENT_NAME = '{}'".format(vault_path),
        genquery.AS_LIST, ctx
    )
    for package_row in package_rows:
        package = package_row[0]
        if not reader_needs_access(ctx, group_name, package):
            continue

        # Recursive read access on each collection directly below the vault collection.
        try:
            if not dry_run:
                msi.set_acl(ctx, "recursive", "admin:read", read_group_name, package)
                log.write(ctx, "Granted " + read_group_name + " read access to " + package)
            else:
                log.write(ctx, "Would have granted " + read_group_name + " read access to " + package)
        except Exception:
            success = False
            log.write(ctx, "Failed to set read permissions for <{}> on coll <{}>".format(read_group_name, package))

    return success


@rule.make(inputs=[0, 1], outputs=[2])
def rule_vault_grant_readers_vault_access(ctx, dry_run, verbose):
    """Rule that grants the reader groups of all research groups access to the
    vault packages of their group, where they do not have that access yet.

    Only a rodsadmin user may run this rule.

    :param ctx:     Combined type of a callback and rei struct
    :param dry_run: '1' to only log which groups would be changed without making changes
    :param verbose: '1' to log every research group that is checked

    :returns: String status: '0' if completed successfully, '1' if there were errors
              or the caller is not a rodsadmin
    """
    # The rule arguments arrive as strings; '1' switches an option on.
    dry_run = (dry_run == '1')
    verbose = (verbose == '1')

    log.write(ctx, "grant_readers_vault_access started.")

    if user.user_type(ctx) != 'rodsadmin':
        log.write(ctx, "User is not rodsadmin")
        return '1'

    active_modes = [label
                    for label, enabled in (("dry run", dry_run), ("verbose", verbose))
                    if enabled]
    if active_modes:
        log.write(ctx, "Running grant_readers_vault_access in {} mode.".format(" and ".join(active_modes)))

    zone = user.zone(ctx)

    # All research groups in this zone.
    group_rows = genquery.row_iterator(
        "USER_GROUP_NAME",
        "USER_TYPE = 'rodsgroup' AND USER_ZONE = '{}' AND USER_GROUP_NAME like 'research-%'".format(zone),
        genquery.AS_LIST,
        ctx)

    all_succeeded = True
    for group_row in group_rows:
        research_group = group_row[0]
        if verbose:
            log.write(ctx, "{}: checking permissions".format(research_group))
        if not set_reader_vault_permissions(ctx, research_group, zone, dry_run):
            all_succeeded = False

    if all_succeeded:
        log.write(ctx, "grant_readers_vault_access completed successfully.")
        return '0'

    log.write(ctx, "grant_readers_vault_access completed, with errors.")
    return '1'
:param ctx: Combined type of a callback and rei struct :param path: Vault package to get the DOI of + :param doi: 'base' or 'version' to retrieve required DOI :return: Data package DOI or None """ + if doi != 'base': + doi = 'version' + iter = genquery.row_iterator( "META_COLL_ATTR_VALUE", - "COLL_NAME = '%s' AND META_COLL_ATTR_NAME = 'org_publication_versionDOI'" % (path), + "COLL_NAME = '{}' AND META_COLL_ATTR_NAME = 'org_publication_{}DOI'".format(path, doi), genquery.AS_LIST, ctx ) diff --git a/vault_archive.py b/vault_archive.py index d52da9edd..ea034a42a 100644 --- a/vault_archive.py +++ b/vault_archive.py @@ -134,11 +134,11 @@ def create_archive(ctx, coll): def extract_archive(ctx, coll): while True: state = ctx.dmattr(package_archive_path(ctx, coll), config.data_package_archive_fqdn, "")["arguments"][2] - if state != "UNM" and state != "MIG": + if state not in ("UNM", "MIG"): break time.sleep(10) - if state != "DUL" and state != "REG" and state != "INV": + if state not in ("DUL", "REG", "INV"): log.write(ctx, "Archive of data package <{}> is not available, state is <{}>".format(coll, state)) raise Exception("Archive is not available") @@ -253,7 +253,7 @@ def vault_extract_archive(ctx, coll): def update(ctx, coll, attr): - if pathutil.info(coll).space == pathutil.Space.VAULT and attr != constants.IIARCHIVEATTRNAME and attr != constants.UUPROVENANCELOG and vault_archival_status(ctx, coll) == "archived": + if pathutil.info(coll).space == pathutil.Space.VAULT and attr not in (constants.IIARCHIVEATTRNAME, constants.UUPROVENANCELOG) and vault_archival_status(ctx, coll) == "archived": avu.set_on_coll(ctx, coll, constants.IIARCHIVEATTRNAME, "update") ctx.dmget(package_archive_path(ctx, coll), config.data_package_archive_fqdn, "OFL") diff --git a/vault_utils.py b/vault_utils.py new file mode 100644 index 000000000..5875b52f4 --- /dev/null +++ b/vault_utils.py @@ -0,0 +1,74 @@ +"""Utility functions for vault module.""" + +__copyright__ = 'Copyright (c) 2019-2024, 
def get_copy_folder_to_vault_irsync_command(coll, target, vault_resource, multi_threading):
    """Internal function to determine the irsync command for copy-to-vault.

    The source collection is synchronised recursively into the 'original'
    subcollection of the target collection.

    :param coll:            source collection
    :param target:          target collection
    :param vault_resource:  resource to store vault data on (can be None, in
                            which case no resource option is passed to irsync)
    :param multi_threading: if set to false, disable multi threading,
                            otherwise use server default

    :returns: irsync command with parameters in list format
    """
    resource_options = [] if vault_resource is None else ["-R", vault_resource]
    # "-N 0" means no multi threading.
    threading_options = [] if multi_threading else ["-N", "0"]

    return (["irsync", "-rK"]
            + resource_options
            + threading_options
            + ["i:{}/".format(coll), "i:{}/original".format(target)])


def get_sanity_checks_results_copy_to_vault_paths(source, target):
    """Internal function to determine whether a source and destination path for
    archiving data in the vault pass sanity checks.

    The syntactic checks (absolute path, no parent references) run first. The
    space/zone/group checks are only performed when all of those pass, because
    they assume absolute paths without parent references.

    :param source: source collection
    :param target: target collection

    :returns: list of sanity check fails (empty list means all tests passed)
    """
    labelled_paths = (("Source", source), ("Target", target))

    # Same check for both paths; messages are ordered per check, source first.
    failed = ["{} path is not absolute.".format(label)
              for label, path in labelled_paths
              if not path.startswith("/")]
    failed.extend("{} path contains parent references (..)".format(label)
                  for label, path in labelled_paths
                  if ".." in path.split("/"))

    if failed:
        # The remaining tests assume absolute paths without parent references,
        # so skip these tests if previous tests did not pass.
        return failed

    (source_space, source_zone, source_group, _) = pathutil.info(source)
    (target_space, target_zone, target_group, _) = pathutil.info(target)

    if source_space not in (pathutil.Space.DEPOSIT, pathutil.Space.RESEARCH):
        failed.append("Source path not in research or deposit group.")

    if target_space != pathutil.Space.VAULT:
        failed.append("Target path not in vault group.")

    # Compare the group names without their prefix (everything after the first "-").
    source_base_name = source_group.partition("-")[2]
    target_base_name = target_group.partition("-")[2]
    if source_zone != target_zone or source_base_name != target_base_name:
        failed.append("Source and target group are not in same compartment.")

    return failed