From 34df8fe6a753606e1882bcf33236dc44b3d34a69 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Fri, 19 Jul 2024 15:20:44 +0200 Subject: [PATCH 01/57] Add integr. tests for counting user connections --- integration_tests.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/integration_tests.py b/integration_tests.py index c2b8974aa..b18ba6386 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -18,6 +18,10 @@ {"name": "policies.check_anonymous_access_allowed.remote", "test": lambda ctx: ctx.rule_check_anonymous_access_allowed("1.2.3.4", ""), "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "policies.check_max_connections_exceeded", + "test": lambda ctx: ctx.rule_check_max_connections_exceeded(""), + # This rule should always return 'false' for user 'rods' + "check": lambda x: x['arguments'][0] == 'false'}, # Vault metadata schema report: only check return value type, not contents {"name": "schema_transformation.batch_vault_metadata_schema_report", "test": lambda ctx: ctx.rule_batch_vault_metadata_schema_report(""), @@ -91,6 +95,9 @@ {"name": "util.user.is_member_of.no", "test": lambda ctx: user.is_member_of(ctx, "research-initial", "datamanager"), "check": lambda x: not x}, + {"name": "util.user.number_of_connection", + "test": lambda ctx: user.number_of_connections(ctx), + "check": lambda x: isinstance(x, int) and x > 0}, {"name": "util.user.usertype.rodsadmin", "test": lambda ctx: user.user_type(ctx, "rods"), "check": lambda x: x == "rodsadmin"}, From 30480b742b03589b74b78170ebd341d127dec4a7 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Fri, 19 Jul 2024 16:10:35 +0200 Subject: [PATCH 02/57] Integration tests: enable running subset By default, when calling the integration test rule, we run all tests. This adds an option to run a single test, or all tests with a particular prefix (e.g. "util.collection.*"). --- integration_tests.py | 14 ++++++++++++-- tools/run-integration-tests.r | 7 ++++--- tools/run-integration-tests.sh | 12 ++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) create mode 100755 tools/run-integration-tests.sh diff --git a/integration_tests.py b/integration_tests.py index b18ba6386..e1840b7fe 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -107,13 +107,17 @@ ] -@rule.make(inputs=[], outputs=[0]) -def rule_run_integration_tests(ctx): +@rule.make(inputs=[0], outputs=[1]) +def rule_run_integration_tests(ctx, tests): """This function runs the integration tests. It must be run by a rodsadmin user on a development environment. It assumes the standard test data is present. :param ctx: Combined type of a callback and rei struct + :param tests: Indicates which tests to run: + - Empty string means all tests + - String ending with '*' means all tests that start with a prefix, e.g. 'util.user.*' + - Otherwise the string should be the exact name of a test :returns: string with test results. Each line has one test name and its verdict. 
""" @@ -133,8 +137,14 @@ def rule_run_integration_tests(ctx): name = testconfig["name"] test = testconfig["test"] check = testconfig["check"] + exception = False + if (tests != "" + and tests != name + and not (tests.endswith("*") and name.startswith(tests[0:-1]))): + continue + try: result = test(ctx) except BaseException: diff --git a/tools/run-integration-tests.r b/tools/run-integration-tests.r index 5407ed5f6..5d128cd0d 100644 --- a/tools/run-integration-tests.r +++ b/tools/run-integration-tests.r @@ -6,9 +6,10 @@ import genquery def main(rule_args, callback, rei): - result = callback.rule_run_integration_tests("") - callback.writeLine("stdout", result["arguments"][0]) + tests = global_vars["*tests"].strip('"') + result = callback.rule_run_integration_tests(tests, "") + callback.writeLine("stdout", result["arguments"][1]) -INPUT null +INPUT *tests="" OUTPUT ruleExecOut diff --git a/tools/run-integration-tests.sh b/tools/run-integration-tests.sh new file mode 100755 index 000000000..e1b4e03b7 --- /dev/null +++ b/tools/run-integration-tests.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# +# This script runs the integration tests, or a subset of them. +# +# Run all tests: ./run-integration-tests.sh +# Run tests with a specific prefix: ./run-integration-tests.sh util.collection.* +# Run one specific test: ./run-integration-test.ssh util.collection.owner + + +TESTS="$1" +TOOLSDIR=$(dirname "$0") +/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F "$TOOLSDIR/run-integration-tests.r" "$TESTS" From 5b8e4b21bc7f340f452ce49ca39929af70cf8990 Mon Sep 17 00:00:00 2001 From: claravox Date: Mon, 22 Jul 2024 14:06:14 +0200 Subject: [PATCH 03/57] Fix call to deposit create --- deposit.py | 9 +++++---- vault.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/deposit.py b/deposit.py index bd7dacc4a..483cff9e6 100644 --- a/deposit.py +++ b/deposit.py @@ -25,15 +25,16 @@ @api.make() -def api_deposit_copy_data_package(ctx, reference): +def api_deposit_copy_data_package(ctx, reference, deposit_group): """Create deposit collection and copies selected datapackage into the newly created deposit - :param ctx: Combined type of a callback and rei struct - :param reference: Data Package Reference (UUID4) + :param ctx: Combined type of a callback and rei struct + :param reference: Data Package Reference (UUID4) + :param deposit_group: Deposit group to copy to :returns: Path to created deposit collection or API error """ - result = deposit_create(ctx) + result = deposit_create(ctx, deposit_group) if result["deposit_path"] == "not_allowed": return api.Error('not_allowed', 'Could not create deposit collection.') diff --git a/vault.py b/vault.py index f949973f0..beeb86f85 100644 --- a/vault.py +++ b/vault.py @@ -205,7 +205,7 @@ def api_vault_copy_to_research(ctx, coll_origin, coll_target): if not collection.exists(ctx, coll_target): return api.Error('TargetPathNotExists', 'The target you specified does not exist') - # Check if user has READ ACCESS to specific vault packatge in collection coll_origin. + # Check if user has READ ACCESS to specific vault package in collection coll_origin. 
user_full_name = user.full_name(ctx) category = groups.group_category(ctx, group_name) is_datamanager = groups.user_is_datamanager(ctx, category, user.full_name(ctx)) From 1ce16420491b4abc1050d2fcb0b70d41fe50eb15 Mon Sep 17 00:00:00 2001 From: claravox Date: Mon, 22 Jul 2024 14:08:54 +0200 Subject: [PATCH 04/57] YDA-5393: Remove iiCopyFolderToVault --- iiVault.r | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/iiVault.r b/iiVault.r index 805b78402..570c5ef0e 100644 --- a/iiVault.r +++ b/iiVault.r @@ -6,24 +6,6 @@ # \license GPLv3, see LICENSE. -# \brief iiCopyFolderToVault -# -# \param[in] folder folder to copy to the vault -# \param[in] target path of the vault package -# -iiCopyFolderToVault(*folder, *target) { - - writeLine("serverLog", "iiCopyFolderToVault: Copying *folder to *target") - *buffer.source = *folder; - *buffer.destination = *target ++ "/original"; - uuTreeWalk("forward", *folder, "iiIngestObject", *buffer, *error); - if (*error != 0) { - msiGetValByKey(*buffer, "msg", *msg); # using . syntax here lead to type error - writeLine("stdout", "iiIngestObject: *error: *msg"); - fail; - } -} - # \brief Called by uuTreeWalk for each collection and dataobject to copy to the vault. # # \param[in] itemParent From 16704da272b76d526e116f3b634c76c9afc8a94d Mon Sep 17 00:00:00 2001 From: claravox Date: Fri, 19 Jul 2024 16:04:20 +0200 Subject: [PATCH 05/57] YDA-5778: Disallow folders with apostrophes --- meta_form.py | 10 ++-- research.py | 161 ++++++++++++++++++++++++++------------------------- 2 files changed, 88 insertions(+), 83 deletions(-) diff --git a/meta_form.py b/meta_form.py index b322926ca..619a30db3 100644 --- a/meta_form.py +++ b/meta_form.py @@ -150,7 +150,8 @@ def load(ctx, coll): if space in [pathutil.Space.RESEARCH, pathutil.Space.DEPOSIT]: is_locked = folder.is_locked(ctx, coll, org_metadata) - can_edit = is_member and not is_locked + # Do not allow editing of files in folders with apostrophes in name + can_edit = is_member and not is_locked and '\'' not in subpath # Analyze a possibly existing metadata JSON file. meta_path = meta.get_collection_metadata_path(ctx, coll) @@ -229,9 +230,10 @@ def load(ctx, coll): status = vault.get_coll_vault_status(ctx, coll, org_metadata) can_edit = (groups.user_is_datamanager(ctx, category, user_full_name) - and (status == constants.vault_package_state.UNPUBLISHED - or status == constants.vault_package_state.PUBLISHED - or status == constants.vault_package_state.DEPUBLISHED)) + and status in (constants.vault_package_state.UNPUBLISHED, + constants.vault_package_state.PUBLISHED, + constants.vault_package_state.DEPUBLISHED) + and '\'' not in subpath) meta_path = meta.get_latest_vault_metadata_path(ctx, coll) if meta_path is None: diff --git a/research.py b/research.py index 4648be5b6..c03fec258 100644 --- a/research.py +++ b/research.py @@ -27,6 +27,26 @@ 'api_research_manifest'] +def folder_new_name_check(folder_name): + if len(folder_name) == 0: + return False, api.Error('missing_foldername', 'Missing folder name. 
Please add a folder name') + + # TODO remove when upgrade to GenQuery 2 + # This check should only be done on new folders, since may have old folders with apostrophes + if '\'' in folder_name: + return False, api.Error('invalid_foldername', 'It is not allowed to use apostrophes in a folder name') + + # Name should not contain '\\' or '/' + if '/' in folder_name or '\\' in folder_name: + return False, api.Error('invalid_foldername', 'It is not allowed to use slashes in the new folder name') + + # name should not be '.' or '..' + if folder_name in ('.', '..'): + return False, api.Error('invalid_foldername', 'it is not allowed to name the folder {}'.format(folder_name)) + + return True, "" + + @api.make() def api_research_folder_add(ctx, coll, new_folder_name): """Add a new folder to a research folder. @@ -39,8 +59,9 @@ def api_research_folder_add(ctx, coll, new_folder_name): """ coll_target = coll + '/' + new_folder_name - if len(new_folder_name) == 0: - return api.Error('missing_foldername', 'Missing folder name. Please add a folder name') + valid_folder_name, error_response = folder_new_name_check(new_folder_name) + if not valid_folder_name: + return error_response try: validate_filepath(coll_target.decode('utf-8')) @@ -51,14 +72,6 @@ def api_research_folder_add(ctx, coll, new_folder_name): if not len(coll.split('/')) > 2: return api.Error('invalid_destination', 'It is not possible to add folder ' + new_folder_name + ' at this location') - # Name should not contain '\\' or '/' - if '/' in new_folder_name or '\\' in new_folder_name: - return api.Error('invalid_foldername', 'It is not allowed to use slashes in a folder name') - - # Name should not be '.' or '..' - if new_folder_name == '.' or new_folder_name == '..': - return api.Error('invalid_foldername', 'It is not allowed to name the folder {}'.format(new_folder_name)) - # in vault? target_group_name = coll_target.split('/')[3] if target_group_name.startswith('vault-'): @@ -90,58 +103,83 @@ def api_research_folder_add(ctx, coll, new_folder_name): return api.Result.ok() -@api.make() -def api_research_folder_copy(ctx, folder_path, new_folder_path, overwrite=False): - """Copy a folder in a research folder. +def folder_copy_check(ctx, folder_path, new_folder_path, overwrite, copy=True): + """Check whether can copy (or move) folder to new folder location. :param ctx: Combined type of a callback and rei struct :param folder_path: Path to the folder to copy :param new_folder_path: Path to the new copy of the folder :param overwrite: Overwrite folder if it already exists + :param copy: Whether a copy operation (True) or move (False) (just for logging purposes) - :returns: Dict with API status result + :returns: 2-Tuple containing whether can copy/move, and the error if cannot """ + # Whether copy or move + verb = 'copy' if copy else 'move' + verb_past = 'copied' if copy else 'moved' if len(new_folder_path) == 0: - return api.Error('missing_folder_path', 'Missing folder path. Please add a folder path') + return False, api.Error('missing_folder_path', 'Missing folder path. Please add a folder path') + + # TODO remove when upgrade to GenQuery 2 + if '\'' in new_folder_path: + return False, api.Error('invalid_foldername', 'It is not allowed to use apostrophes in a folder name') try: validate_filepath(new_folder_path.decode('utf-8')) except ValidationError: - return api.Error('invalid_foldername', 'This is not a valid folder name. 
Please choose another name for your folder') + return False, api.Error('invalid_foldername', 'This is not a valid folder name. Please choose another name for your folder') # Same folder path makes no sense. if folder_path == new_folder_path: - return api.Error('invalid_folder_path', 'Origin and copy folder paths are equal. Please choose another destination') + return False, api.Error('invalid_folder_path', 'Origin and {} folder paths are equal. Please choose another destination'.format(verb)) # Inside the same path makes no sense. if "{}/".format(folder_path) in new_folder_path: - return api.Error('invalid_folder_path', 'Cannot copy folder inside itself. Please choose another destination') + return False, api.Error('invalid_folder_path', 'Cannot {} folder inside itself. Please choose another destination'.format(verb)) # not in home - a groupname must be present ie at least 2!? if not len(new_folder_path.split('/')) > 2: - return api.Error('invalid_destination', 'It is not possible to copy folder at this location') + return False, api.Error('invalid_destination', 'It is not possible to {} folder at this location'.format(verb)) # in vault? target_group_name = new_folder_path.split('/')[3] if target_group_name.startswith('vault-'): - return api.Error('invalid_destination', 'It is not possible to copy folder to the vault') + return False, api.Error('invalid_destination', 'It is not possible to {} folder to the vault'.format(verb)) # permissions ok for group? user_full_name = user.full_name(ctx) if groups.user_role(ctx, user_full_name, target_group_name) in ['none', 'reader']: - return api.Error('not_allowed', 'You do not have sufficient permissions to copy the selected folder') + return False, api.Error('not_allowed', 'You do not have sufficient permissions to {} the selected folder'.format(verb)) # Folder not locked? if folder.is_locked(ctx, new_folder_path): - return api.Error('not_allowed', 'The indicated folder is locked and therefore the folder can not be copied') + return False, api.Error('not_allowed', 'The indicated folder is locked and therefore the folder can not be {}'.format(verb_past)) # Does original folder exist? if not collection.exists(ctx, folder_path): - return api.Error('invalid_source', 'The original folder ' + folder_path + ' can not be found') + return False, api.Error('invalid_source', 'The original folder ' + folder_path + ' can not be found') # Collection exists in destination? if not overwrite and collection.exists(ctx, new_folder_path): - return api.Error('invalid_destination', 'Folder with this name already exists in destination') + return False, api.Error('invalid_destination', 'Folder with this name already exists in destination') + + return True, "" + + +@api.make() +def api_research_folder_copy(ctx, folder_path, new_folder_path, overwrite=False): + """Copy a folder in a research folder. + + :param ctx: Combined type of a callback and rei struct + :param folder_path: Path to the folder to copy + :param new_folder_path: Path to the new copy of the folder + :param overwrite: Overwrite folder if it already exists + + :returns: Dict with API status result + """ + valid, errorResponse = folder_copy_check(ctx, folder_path, new_folder_path, overwrite, True) + if not valid: + return errorResponse # All requirements OK try: @@ -163,47 +201,9 @@ def api_research_folder_move(ctx, folder_path, new_folder_path, overwrite=False) :returns: Dict with API status result """ - if len(new_folder_path) == 0: - return api.Error('missing_folder_path', 'Missing folder path. 
Please add a folder path') - - try: - validate_filepath(new_folder_path.decode('utf-8')) - except ValidationError: - return api.Error('invalid_foldername', 'This is not a valid folder name. Please choose another name for your folder') - - # Same folder path makes no sense. - if folder_path == new_folder_path: - return api.Error('invalid_folder_path', 'Origin and move folder paths are equal. Please choose another destination') - - # Inside the same path makes no sense. - if "{}/".format(folder_path) in new_folder_path: - return api.Error('invalid_folder_path', 'Cannot move folder inside itself. Please choose another destination') - - # not in home - a groupname must be present ie at least 2!? - if not len(new_folder_path.split('/')) > 2: - return api.Error('invalid_destination', 'It is not possible to move folder at this location') - - # in vault? - target_group_name = new_folder_path.split('/')[3] - if target_group_name.startswith('vault-'): - return api.Error('invalid_destination', 'It is not possible to move folder to the vault') - - # permissions ok for group? - user_full_name = user.full_name(ctx) - if groups.user_role(ctx, user_full_name, target_group_name) in ['none', 'reader']: - return api.Error('not_allowed', 'You do not have sufficient permissions to move the selected folder') - - # Folder not locked? - if folder.is_locked(ctx, new_folder_path): - return api.Error('not_allowed', 'The indicated folder is locked and therefore the folder can not be moved') - - # Does original folder exist? - if not collection.exists(ctx, folder_path): - return api.Error('invalid_source', 'The original folder ' + folder_path + ' can not be found') - - # Collection exists in destination? - if not overwrite and collection.exists(ctx, new_folder_path): - return api.Error('invalid_destination', 'Folder with this name already exists in destination') + valid, errorResponse = folder_copy_check(ctx, folder_path, new_folder_path, overwrite, False) + if not valid: + return errorResponse # All requirements OK try: @@ -227,8 +227,9 @@ def api_research_folder_rename(ctx, new_folder_name, coll, org_folder_name): """ coll_target = coll + '/' + new_folder_name - if len(new_folder_name) == 0: - return api.Error('missing_foldername', 'Missing folder name. Please add a folder name') + valid_folder_name, error_response = folder_new_name_check(new_folder_name) + if not valid_folder_name: + return error_response try: validate_filepath(coll_target.decode('utf-8')) @@ -241,15 +242,7 @@ def api_research_folder_rename(ctx, new_folder_name, coll, org_folder_name): # not in home - a groupname must be present ie at least 2!? if not len(coll.split('/')) > 2: - return api.Error('invalid_destination', 'It is not possible to add folder ' + folder_name + ' at this location') - - # Name should not contain '\\' or '/' - if '/' in new_folder_name or '\\' in new_folder_name: - return api.Error('invalid_foldername', 'It is not allowed to use slashes in the new folder name') - - # Name should not be '.' or '..' - if new_folder_name == '.' or new_folder_name == '..': - return api.Error('invalid_foldername', 'It is not allowed to name the folder {}'.format(new_folder_name)) + return api.Error('invalid_destination', 'It is not possible to add folder ' + org_folder_name + ' at this location') # in vault? target_group_name = coll_target.split('/')[3] @@ -300,7 +293,7 @@ def api_research_folder_delete(ctx, coll, folder_name): # Name should not contain '\\' or '/'. 
if '/' in folder_name or '\\' in folder_name: - return api.Error('invalid_foldername', 'It is not allowed to use slashes in folder name to be delete') + return api.Error('invalid_foldername', 'It is not allowed to use slashes in folder name that will be deleted') # in vault? target_group_name = coll_target.split('/')[3] @@ -386,6 +379,11 @@ def api_research_file_copy(ctx, filepath, new_filepath, overwrite=False): except Exception: return api.Error('invalid_filename', 'This is not a valid file name. Please choose another name') + # TODO remove when upgrade to GenQuery 2 + # This check should only be done on new folders, since may have old folders with apostrophes + if '\'' in coll: + return api.Error('invalid_filepath', 'It is not allowed to copy a file to a folder with an apostrophe in the name') + # not in home - a groupname must be present ie at least 2!? if not len(coll.split('/')) > 2: return api.Error('invalid_destination', 'It is not possible to copy files at this location') @@ -513,13 +511,18 @@ def api_research_file_move(ctx, filepath, new_filepath, overwrite=False): if filepath == new_filepath: return api.Error('invalid_filepath', 'Origin and move file paths are equal. Please choose another destination') - coll = pathutil.chop(new_filepath)[0] - data_name = pathutil.chop(new_filepath)[1] + # These are of the NEW filepath + coll, data_name = pathutil.chop(new_filepath) try: validate_filename(data_name.decode('utf-8')) except Exception: return api.Error('invalid_filename', 'This is not a valid file name. Please choose another name') + # TODO remove when upgrade to GenQuery 2 + # This check should only be done on new folders, since may have old folders with apostrophes + if '\'' in coll: + return api.Error('invalid_filepath', 'It is not allowed to move a file to a folder with an apostrophe in the name') + # not in home - a groupname must be present ie at least 2!? 
if not len(coll.split('/')) > 2: return api.Error('invalid_destination', 'It is not possible to move files to this location') From ed7604fac33b77ddbd83626ca467c6e19096245a Mon Sep 17 00:00:00 2001 From: claravox Date: Mon, 17 Jun 2024 11:52:16 +0200 Subject: [PATCH 06/57] YDA-5393: refactor copy to vault with irsync (backport to Yoda 1.9) Co-authored-by: Lazlo Westerhof --- folder.py | 470 +++++++++++++++++-------- groups.py | 4 +- iiFolderStatusTransitions.r | 148 +------- integration_tests.py | 312 +++++++++++++++- policies.py | 2 +- policies_folder_status.py | 2 +- research.py | 7 +- tools/copy-accepted-folders-to-vault.r | 32 +- tools/copy-one-coll-to-vault.r | 9 + tools/retry-copy-to-vault.r | 32 +- tools/scheduled-copytovault.sh | 2 +- unit-tests/test_util_misc.py | 39 +- util/avu.py | 67 +++- util/config.py | 5 +- util/constants.py | 4 +- util/misc.py | 15 +- uuPolicies.r | 2 +- uuTreeWalk.r | 4 +- vault.py | 73 +++- 19 files changed, 851 insertions(+), 378 deletions(-) create mode 100644 tools/copy-one-coll-to-vault.r diff --git a/folder.py b/folder.py index d1719dd0e..55779cd45 100644 --- a/folder.py +++ b/folder.py @@ -4,6 +4,7 @@ __copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' +import time import uuid import genquery @@ -11,6 +12,7 @@ import epic import meta +import notifications import policies_folder_status import provenance import vault @@ -169,219 +171,409 @@ def api_folder_reject(ctx, coll): return set_status_as_datamanager(ctx, coll, constants.research_package_state.REJECTED) -@rule.make(inputs=[0, 1], outputs=[2]) -def rule_folder_secure(ctx, coll, target): - +@rule.make(inputs=[0], outputs=[1]) +def rule_folder_secure(ctx, coll): """Rule interface for processing vault status transition request. :param ctx: Combined type of a callback and rei struct :param coll: Collection to be copied to vault - :param target: Vault target to copy research package to including license file etc - :return: returns result of securing action + :return: result of securing action (1 for successfully secured or skipped folder) """ - return folder_secure(ctx, coll, target) + if not precheck_folder_secure(ctx, coll): + return '1' + if not folder_secure(ctx, coll): + folder_secure_set_retry(ctx, coll) + return '0' -def folder_secure(ctx, coll, target): - """Secure a folder to the vault. + return '1' - This function should only be called by a rodsadmin - and should not be called from the portal. + +def precheck_folder_secure(ctx, coll): + """Whether to continue with securing. Should not touch the retry attempts, + these are prechecks and don't count toward the retry attempts limit :param ctx: Combined type of a callback and rei struct :param coll: Folder to secure - :param target: Target folder in vault - :returns: '0' when nu error occurred - """ + :returns: True when successful """ - # Following code is overturned by code in the rule language. - # This, as large files were not properly copied to the vault. - # Using the rule language this turned out to work fine. 
+ if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "folder_secure: User is not rodsadmin") + return False - log.write(ctx, 'folder_secure: Start securing folder <{}>'.format(coll)) + found, last_run = get_last_run_time(ctx, coll) + if (not correct_copytovault_start_status(ctx, coll) + or not misc.last_run_time_acceptable(coll, found, last_run, config.vault_copy_backoff_time)): + return False - if user.user_type(ctx) != 'rodsadmin': - log.write(ctx, "folder_secure: User is no rodsadmin") - return '1' + return True - # Check modify access on research folder. - msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf()) - modify_access = msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf())['arguments'][2] +def folder_secure(ctx, coll): + """Secure a folder to the vault. If the previous copy did not finish, retry - # Set cronjob status - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), coll) - except msi.Error as e: - log.write(ctx, "Could not set acl (admin:write) for collection: " + coll) - return '1' + This function should only be called by a rodsadmin + and should not be called from the portal. - avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", constants.CRONJOB_STATE['PROCESSING']) + :param ctx: Combined type of a callback and rei struct + :param coll: Folder to secure - found = False - iter = genquery.row_iterator( - "META_COLL_ATTR_VALUE", - "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYPARAMSNAME + "'", - genquery.AS_LIST, ctx - ) - for row in iter: - target = row[0] - found = True + :returns: True when successful + """ - if found: - avu.rm_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, target) + log.write(ctx, 'folder_secure: Start securing folder <{}>'.format(coll)) - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), coll) - except msi.Error as e: - log.write(ctx, "Could not set acl (admin:null) for collection: " + coll) - return '1' + # Checks before start securing + if not check_folder_secure(ctx, coll): + return False - # Determine vault target if it does not exist. - if not found: - target = determine_vault_target(ctx, coll) - if target == "": - log.write(ctx, "folder_secure: No vault target found") - return '1' + # Set cronjob status + if not set_cronjob_status(ctx, constants.CRONJOB_STATE['PROCESSING'], coll): + return False - # Create vault target and set status to INCOMPLETE. - msi.coll_create(ctx, target, '', irods_types.BytesBuf()) - avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.INCOMPLETE) + # Get the target folder + target = determine_and_set_vault_target(ctx, coll) + if not target: + return False # Copy all original info to vault - # try: - # vault.copy_folder_to_vault(ctx, coll, target) - # except Exception as e: - # log.write(ctx, e) - # return '1' + if not vault.copy_folder_to_vault(ctx, coll, target): + return False - ctx.iiCopyFolderToVault(coll, target) - """ # Starting point of last part of securing a folder into the vault - msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf()) - modify_access = msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf())['arguments'][2] - # Generate UUID4 and set as Data Package Reference. 
if config.enable_data_package_reference: - avu.set_on_coll(ctx, target, constants.DATA_PACKAGE_REFERENCE, str(uuid.uuid4())) + if not avu.set_on_coll(ctx, target, constants.DATA_PACKAGE_REFERENCE, str(uuid.uuid4()), True): + return False meta.copy_user_metadata(ctx, coll, target) vault.vault_copy_original_metadata_to_vault(ctx, target) vault.vault_write_license(ctx, target) + group_name = collection_group_name(ctx, coll) # Enable indexing on vault target. - if collection_group_name(ctx, coll).startswith("deposit-"): + if group_name.startswith("deposit-"): vault.vault_enable_indexing(ctx, target) # Copy provenance log from research folder to vault package. provenance.provenance_copy_log(ctx, coll, target) # Try to register EPIC PID if enabled. + if not set_epic_pid(ctx, target): + return False + + # Set vault permissions for new vault package. + if not vault.set_vault_permissions(ctx, coll, target): + return False + + # Set cronjob status to OK. + if not set_cronjob_status(ctx, constants.CRONJOB_STATE['OK'], coll): + return False + + # Vault package is ready, set vault package state to UNPUBLISHED. + if not avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.UNPUBLISHED, True): + return False + + if not set_acl_check(ctx, "recursive", "admin:write", coll, 'Could not set ACL (admin:write) for collection: ' + coll): + return False + set_acl_parents(ctx, "recursive", "admin:write", coll) + + # Save vault package for notification. + set_vault_data_package(ctx, coll, target) + + # Everything is done, set research folder state to SECURED. + if not folder_secure_succeed_avus(ctx, coll): + return False + + # Deposit group has been deleted once secured status is set, + # so cannot change AVUs on collection + if not group_name.startswith("deposit-"): + if not set_acl_check(ctx, "recursive", "admin:null", coll, "Could not set ACL (admin:null) for collection: {}".format(coll)): + return False + + set_acl_parents(ctx, "default", "admin:null", coll) + + # All went well + return True + + +def check_folder_secure(ctx, coll): + """Some initial set up that determines whether folder secure can continue. + These WILL affect the retry attempts. 
+ + :param ctx: Combined type of a callback and rei struct + :param coll: Folder to secure + + :returns: True when successful + """ + if (not set_can_modify(ctx, coll) + or not retry_attempts(ctx, coll) + or not set_last_run_time(ctx, coll)): + return False + + return True + + +def correct_copytovault_start_status(ctx, coll): + """Confirm that the copytovault cronjob avu status is correct state to start securing""" + cronjob_status = get_cronjob_status(ctx, coll) + if cronjob_status in (constants.CRONJOB_STATE['PENDING'], constants.CRONJOB_STATE['RETRY']): + return True + + return False + + +def get_last_run_time(ctx, coll): + """Get the last run time, if found""" + found = False + last_run = 1 + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE", + "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYLASTRUN + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + last_run = int(row[0]) + found = True + + return found, last_run + + +def set_last_run_time(ctx, coll): + """Set last run time, return True for successful set""" + now = int(time.time()) + return avu.set_on_coll(ctx, coll, constants.IICOPYLASTRUN, str(now), True) + + +def set_can_modify(ctx, coll): + """Check if have permission to modify, set if necessary""" + check_access_result = msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf()) + modify_access = check_access_result['arguments'][2] + if modify_access != b'\x01': + # TODO set to a lower read? + # This allows us permission to copy the files + if not set_acl_check(ctx, "recursive", "admin:read", coll, "Could not set ACL (admin:read) for collection: {}".format(coll)): + return False + # This allows us permission to set AVUs + if not set_acl_check(ctx, "default", "admin:write", coll, "Could not set ACL (admin:write) for collection: {}".format(coll)): + return False + + return True + + +def get_retry_count(ctx, coll): + """ Get the retry count, if not such AVU, return 0 """ + retry_count = 0 + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE, COLL_NAME", + "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYRETRYCOUNT + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + retry_count = int(row[0]) + + return retry_count + + +def retry_attempts(ctx, coll): + """ Check if there have been too many retries. 
""" + retry_count = get_retry_count(ctx, coll) + + if retry_count >= config.vault_copy_max_retries: + return False + + return True + + +def folder_secure_succeed_avus(ctx, coll): + """Set/rm AVUs on source folder when successfully secured folder""" + # attributes = [x[0] for x in avu.of_coll(ctx, coll)] + attributes = [x[0] for x in get_org_metadata(ctx, coll)] + + # In cases where copytovault only ran once, okay that these attributes were not created + if constants.IICOPYRETRYCOUNT in attributes: + if not avu.rmw_from_coll(ctx, coll, constants.IICOPYRETRYCOUNT, "%", True): + return False + if constants.IICOPYLASTRUN in attributes: + if not avu.rmw_from_coll(ctx, coll, constants.IICOPYLASTRUN, "%", True): + return False + + if (not avu.rmw_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, "%", True) + or not rm_cronjob_status(ctx, coll)): + return False + + # Note: this is the AVU that must always be *last* to be set in folder secure, + # otherwise could be a problem for deposit groups + if not avu.set_on_coll(ctx, coll, constants.IISTATUSATTRNAME, constants.research_package_state.SECURED, True): + return False + + return True + + +def folder_secure_set_retry(ctx, coll): + # When a folder secure fails, try to set the retry AVU and other applicable AVUs on source folder. + # If too many attempts, fail. + new_retry_count = get_retry_count(ctx, coll) + 1 + if new_retry_count > config.vault_copy_max_retries: + folder_secure_fail(ctx, coll) + send_fail_folder_secure_notification(ctx, coll) + else: + folder_secure_set_retry_avus(ctx, coll, new_retry_count) + + +def folder_secure_set_retry_avus(ctx, coll, retry_count): + avu.set_on_coll(ctx, coll, constants.IICOPYRETRYCOUNT, str(retry_count), True) + set_cronjob_status(ctx, constants.CRONJOB_STATE['RETRY'], coll) + + +def folder_secure_fail(ctx, coll): + """When there are too many retries, give up, set the AVUs and send notifications""" + # Errors are caught here in hopes that will still be able to set UNRECOVERABLE status at least + avu.rmw_from_coll(ctx, coll, constants.IICOPYRETRYCOUNT, "%", True) + # Remove target AVU + avu.rmw_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, "%", True) + set_cronjob_status(ctx, constants.CRONJOB_STATE['UNRECOVERABLE'], coll) + + +def send_fail_folder_secure_notification(ctx, coll): + """Send notifications to datamanagers that copy to vault failed""" + if datamanager_exists(ctx, coll): + datamanagers = get_datamanagers(ctx, coll) + message = "Data package failed to copy to vault after maximum retries" + for datamanager in datamanagers: + datamanager = '{}#{}'.format(*datamanager) + notifications.set(ctx, "system", datamanager, coll, message) + + +def set_epic_pid(ctx, target): + """Try to set epic pid, if fails return False""" if config.epic_pid_enabled: ret = epic.register_epic_pid(ctx, target) url = ret['url'] pid = ret['pid'] http_code = ret['httpCode'] - if (http_code != "0" and http_code != "200" and http_code != "201"): - # Something went wrong while registering EPIC PID, set cronjob state to retry. 
- log.write(ctx, "folder_secure: epid pid returned http <{}>".format(http_code)) - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), coll) - except msi.Error: - return '1' - - avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", constants.CRONJOB_STATE['RETRY']) - avu.set_on_coll(ctx, coll, constants.IICOPYPARAMSNAME, target) - - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:null) for collection: " + coll) - return '1' + if http_code not in ('0', '200', '201'): + # Something went wrong while registering EPIC PID, return false so retry status will be set + log.write(ctx, "folder_secure: epic pid returned http <{}>".format(http_code)) + return False if http_code != "0": # save EPIC Persistent ID in metadata epic.save_epic_pid(ctx, target, url, pid) - # Set vault permissions for new vault package. - group = collection_group_name(ctx, coll) - if group == '': - log.write(ctx, "folder_secure: Cannot determine which deposit or research group <{}> belongs to".format(coll)) - return '1' + return True - vault.set_vault_permissions(ctx, group, coll, target) - # Set cronjob status to OK. - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:write) for collection: " + coll) - return '1' +def get_cronjob_status(ctx, coll): + """Get the cronjob status of given collection""" + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE", + "COLL_NAME = '{}' AND META_COLL_ATTR_NAME = '{}'".format(coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault"), + genquery.AS_LIST, ctx + ) + for row in iter: + return row[0] - avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", constants.CRONJOB_STATE['OK']) - if modify_access != b'\x01': - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:null) for collection: " + coll) - return '1' +def rm_cronjob_status(ctx, coll): + """Remove cronjob_copy_to_vault attribute on source collection - # Vault package is ready, set vault package state to UNPUBLISHED. - avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.UNPUBLISHED) + :param ctx: Combined type of a callback and rei struct + :param coll: Source collection (folder that was being secured) - # Everything is done, set research folder state to SECURED. - try: - msi.set_acl(ctx, "recursive", "admin:write", user.full_name(ctx), coll) - except msi.Error: - log.write(ctx, "Could not set acl (admin:write) for collection: " + coll) - return '1' + :returns: True when successfully removed + """ + return avu.rmw_from_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", "%", True) - parent, chopped_coll = pathutil.chop(coll) - while parent != "/" + user.zone(ctx) + "/home": - try: - msi.set_acl(ctx, "default", "admin:write", user.full_name(ctx), parent) - except msi.Error: - log.write(ctx, "Could not set ACL on " + parent) - parent, chopped_coll = pathutil.chop(parent) - # Save vault package for notification. 
- set_vault_data_package(ctx, coll, target) +def set_cronjob_status(ctx, status, coll): + """Set cronjob_copy_to_vault attribute on source collection + + :param ctx: Combined type of a callback and rei struct + :param status: Status to set on collection + :param coll: Source collection (folder being secured) + + :returns: True when successfully set + """ + return avu.set_on_coll(ctx, coll, constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault", status, True) - # Set folder status to SECURED. - avu.set_on_coll(ctx, coll, constants.IISTATUSATTRNAME, constants.research_package_state.SECURED) +def set_acl_parents(ctx, acl_recurse, acl_type, coll): + """Set ACL for parent collections""" + parent, _ = pathutil.chop(coll) + while parent != "/" + user.zone(ctx) + "/home": + set_acl_check(ctx, acl_recurse, acl_type, parent, "Could not set the ACL ({}) on {}".format(acl_type, parent)) + parent, _ = pathutil.chop(parent) + + +def set_acl_check(ctx, acl_recurse, acl_type, coll, error_msg=''): + """Set the ACL if possible, log error_msg if it goes wrong""" + # TODO turn acl_recurse into a boolean try: - msi.set_acl(ctx, "recursive", "admin:null", user.full_name(ctx), coll) + msi.set_acl(ctx, acl_recurse, acl_type, user.full_name(ctx), coll) except msi.Error: - log.write(ctx, "Could not set acl (admin:null) for collection: " + coll) + if error_msg: + log.write(ctx, error_msg) + return False - parent, chopped_coll = pathutil.chop(coll) - while parent != "/" + user.zone(ctx) + "/home": - try: - msi.set_acl(ctx, "default", "admin:null", user.full_name(ctx), parent) - except msi.Error: - log.write(ctx, "Could not set ACL (admin:null) on " + parent) + return True - parent, chopped_coll = pathutil.chop(parent) - # All went well - return '0' +def get_existing_vault_target(ctx, coll): + """Determine vault target on coll, if it was already determined before """ + found = False + target = "" + iter = genquery.row_iterator( + "META_COLL_ATTR_VALUE", + "COLL_NAME = '" + coll + "' AND META_COLL_ATTR_NAME = '" + constants.IICOPYPARAMSNAME + "'", + genquery.AS_LIST, ctx + ) + for row in iter: + target = row[0] + found = True + + return found, target + + +def set_vault_target(ctx, coll, target): + """Create vault target and AVUs""" + msi.coll_create(ctx, target, '', irods_types.BytesBuf()) + if not avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.INCOMPLETE, True): + return False + + # Note on the source the target folder in case a copy stops midway + if not avu.set_on_coll(ctx, coll, constants.IICOPYPARAMSNAME, target, True): + return False + + return True + + +def determine_and_set_vault_target(ctx, coll): + """Determine and set target on coll""" + found, target = get_existing_vault_target(ctx, coll) + + # Determine vault target if it does not exist. + if not found: + target = determine_new_vault_target(ctx, coll) + if target == "": + log.write(ctx, "folder_secure: No possible vault target found") + return "" + + # Create vault target and set status to INCOMPLETE. 
+ if not set_vault_target(ctx, coll, target): + return "" + + return target -def determine_vault_target(ctx, folder): +def determine_new_vault_target(ctx, folder): """Determine vault target path for a folder.""" group = collection_group_name(ctx, folder) if group == '': - log.write(ctx, "Cannot determine which deposit or research group " + + " belongs to") + log.write(ctx, "Cannot determine which deposit or research group <{}> belongs to".format(folder)) return "" parts = group.split('-') diff --git a/groups.py b/groups.py index fbc20c22e..1cad41faf 100644 --- a/groups.py +++ b/groups.py @@ -70,7 +70,7 @@ def getGroupsData(ctx): if attr in ["schema_id", "data_classification", "category", "subcategory"]: group[attr] = value - elif attr == "description" or attr == "expiration_date": + elif attr in ('description', 'expiration_date'): # Deal with legacy use of '.' for empty description metadata and expiration date. # See uuGroupGetDescription() in uuGroup.r for correct behavior of the old query interface. group[attr] = '' if value == '.' else value @@ -163,7 +163,7 @@ def getGroupData(ctx, name): user = row[0] zone = row[1] - if name != user and name != "rodsadmin" and name != "public": + if name not in (user, 'rodsadmin', 'public'): group["members"].append(user + "#" + zone) if name.startswith("research-"): diff --git a/iiFolderStatusTransitions.r b/iiFolderStatusTransitions.r index a59933b79..56fa7adab 100644 --- a/iiFolderStatusTransitions.r +++ b/iiFolderStatusTransitions.r @@ -26,6 +26,16 @@ iiScheduleCopyToVault() { } } +# \brief Schedule copy-to-vault for just one coll (asynchronously). +# +# \param[in] folder Path of folder +# +iiScheduleCollCopyToVault(*coll) { + delay ("irods_rule_engine_plugin-irods_rule_language-instance1s") { + msiExecCmd("scheduled-copytovault.sh", "'*coll'", "", "", 0, *out); + } +} + # \brief iiFolderDatamanagerAction # @@ -205,144 +215,6 @@ iiRemoveMetadataFromItem(*itemParent, *itemName, *itemIsCollection, *buffer, *er } } -# \brief iiFolderSecure Secure a folder to the vault. This function should only be called by a rodsadmin -# and should not be called from the portal. Thus no statusInfo is returned, but -# log messages are sent to stdout instead -# -# \param[in] folder -# -iiFolderSecure(*folder) { - uuGetUserType(uuClientFullName, *userType); - if (*userType != "rodsadmin") { - writeLine("stdout", "iiFolderSecure: Should only be called by a rodsadmin"); - fail; - } - - # Check modify access on research folder. - msiCheckAccess(*folder, "modify object", *modifyAccess); - - # Set cronjob status. - msiString2KeyValPair(UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault=" ++ CRONJOB_PROCESSING, *kvp); - if (*modifyAccess != 1) { - msiSetACL("default", "admin:write", uuClientFullName, *folder); - } - msiSetKeyValuePairsToObj(*kvp, *folder, "-C"); - *found = false; - foreach (*row in SELECT META_COLL_ATTR_VALUE - WHERE COLL_NAME = '*folder' - AND META_COLL_ATTR_NAME = IICOPYPARAMSNAME) { - # retry with previous parameters - *target = *row.META_COLL_ATTR_VALUE; - *found = true; - } - if (*found) { - # Remove parameters from metadata - msiString2KeyValPair("", *kvp); - *key = IICOPYPARAMSNAME; - *kvp."*key" = *target; - msiRemoveKeyValuePairsFromObj(*kvp, *folder, "-C"); - } - if (*modifyAccess != 1) { - msiSetACL("default", "admin:null", uuClientFullName, *folder); - } - - if (!*found) { - # this file - *target = iiDetermineVaultTarget(*folder); - } - - # Copy to vault. - iiCopyFolderToVault(*folder, *target); - - # Continue securing process in PREP. 
- *return = ""; - rule_folder_secure(*folder, *target, *return); -} - - -# \brief iiDetermineVaultTarget -# -# \param[in] folder -# \returnvalue target path -# -iiDetermineVaultTarget(*folder) { - *err = errorcode(iiCollectionGroupName(*folder, *groupName)); - if (*err < 0) { - writeLine("stdout", "iiDetermineVaultTarget: Cannot determine which research group *folder belongs to"); - fail; - } else { - writeLine("stdout", "iiDetermineVaultTarget: *folder belongs to *groupName"); - } - uuChop(*groupName, *_, *baseName, "-", true); - uuChopPath(*folder, *parent, *datapackageName); - - # Make room for the timestamp and sequence number - if (strlen(*datapackageName) > 235) { - *datapackageName = substr(*datapackageName, 0, 235); - } - - msiGetIcatTime(*timestamp, "unix"); - *timestamp = triml(*timestamp, "0"); - *vaultGroupName = IIVAULTPREFIX ++ *baseName; - - *target = "/$rodsZoneClient/home/*vaultGroupName/*datapackageName[*timestamp]"; - - *i = 0; - while (uuCollectionExists(*target)) { - writeLine("stdout", "iiDetermineVaultTarget: *target already exists"); - *i = *i + 1; - *target = "/$rodsZoneClient/home/*vaultGroupName/*datapackageName[*timestamp][*i]"; - } - writeLine("stdout", "iiDetermineVaultTarget: Target is *target"); - *target; -} - - -# \brief Return the name of the group a collection belongs to. -# -# \param[in] folder -# \param[out] groupName -# -iiCollectionGroupName(*folder, *groupName) { - if (*folder like regex "/[^/]+/home/deposit-.[^/]*/.*") { - uuChopPath(*folder, *parent, *baseName); - *path = *parent; - } else { - *path = *folder; - } - - *isfound = false; - *groupName = ""; - foreach(*accessid in SELECT COLL_ACCESS_USER_ID WHERE COLL_NAME = *path) { - *id = *accessid.COLL_ACCESS_USER_ID; - foreach(*group in SELECT USER_GROUP_NAME WHERE USER_GROUP_ID = *id) { - *groupName = *group.USER_GROUP_NAME; - } - if (*groupName like regex "(deposit|research|intake)-.*") { - *isfound = true; - break; - } - } - - if (!*isfound) { - foreach(*accessid in SELECT COLL_ACCESS_USER_ID WHERE COLL_NAME = *path) { - *id = *accessid.COLL_ACCESS_USER_ID; - foreach(*group in SELECT USER_GROUP_NAME WHERE USER_GROUP_ID = *id) { - *groupName = *group.USER_GROUP_NAME; - } - if (*groupName like regex "(datamanager|vault)-.*") { - *isfound = true; - break; - } - } - } - if (!*isfound){ - # No results found. Not a group folder - writeLine("serverLog", "*path does not belong to a deposit, research or intake group or is not available to current user"); - } -} - - # \brief Check validity of requested folder status transition in a research area. 
# # \param[in] fromstatus folder status before requested transition diff --git a/integration_tests.py b/integration_tests.py index e1840b7fe..411581051 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -7,11 +7,321 @@ __all__ = ['rule_run_integration_tests'] import json +import re +import time import traceback +import uuid + +import folder +from util import avu, collection, config, constants, data_object, log, msi, resource, rule, user + + +def _call_msvc_stat_vault(ctx, resc_name, data_path): + ret = msi.stat_vault(ctx, resc_name, data_path, '', '') + return (ret['arguments'][2], ret['arguments'][3]) + + +def _call_msvc_stat_vault_check_exc(ctx, resc_name, data_path): + """Verifies whether a call to the stat vault microservices raises an exception""" + try: + msi.stat_vault(ctx, resc_name, data_path, '', '') + return False + except Exception: + return True + + +def _call_msvc_json_arrayops(ctx, jsonstr, val, ops, index, argument_index): + """Returns an output argument from the json_arrayops microservice""" + return ctx.msi_json_arrayops(jsonstr, val, ops, index)["arguments"][argument_index] + + +def _call_msvc_json_objops(ctx, jsonstr, val, ops, argument_index): + """Returns an output argument from the json_objops microservice""" + return ctx.msi_json_objops(jsonstr, val, ops)["arguments"][argument_index] + + +def _create_tmp_object(ctx): + """Creates a randomly named test data object and returns its name""" + path = "/{}/home/rods/{}.test".format(user.zone(ctx), str(uuid.uuid4())) + data_object.write(ctx, path, "test") + return path + + +def _create_tmp_collection(ctx): + """Creates a randomly named test collection and returns its name""" + path = "/{}/home/rods/{}-test".format(user.zone(ctx), str(uuid.uuid4())) + collection.create(ctx, path) + return path + + +def _test_msvc_add_avu_object(ctx): + tmp_object = _create_tmp_object(ctx) + ctx.msi_add_avu('-d', tmp_object, "foo", "bar", "baz") + result = [(m.attr, m.value, m.unit) for m in avu.of_data(ctx, tmp_object)] + data_object.remove(ctx, tmp_object) + return result + + +def _test_msvc_add_avu_collection(ctx): + tmp_object = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_object, "foo", "bar", "baz") + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] + collection.remove(ctx, tmp_object) + return result + + +def _test_msvc_rmw_avu_object(ctx, rmw_attributes): + tmp_object = _create_tmp_object(ctx) + ctx.msi_add_avu('-d', tmp_object, "foo", "bar", "baz") + ctx.msi_add_avu('-d', tmp_object, "foot", "hand", "head") + ctx.msi_add_avu('-d', tmp_object, "aap", "noot", "mies") + ctx.msi_rmw_avu('-d', tmp_object, rmw_attributes[0], rmw_attributes[1], rmw_attributes[2]) + result = [(m.attr, m.value, m.unit) for m in avu.of_data(ctx, tmp_object)] + data_object.remove(ctx, tmp_object) + return result + + +def _test_msvc_rmw_avu_collection(ctx, rmw_attributes): + tmp_object = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_object, "foo", "bar", "baz") + ctx.msi_add_avu('-c', tmp_object, "foot", "hand", "head") + ctx.msi_add_avu('-c', tmp_object, "aap", "noot", "mies") + ctx.msi_rmw_avu('-c', tmp_object, rmw_attributes[0], rmw_attributes[1], rmw_attributes[2]) + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] + collection.remove(ctx, tmp_object) + return result + +def _test_avu_set_collection(ctx, catch): + # Test setting avu with catch and without catch + tmp_object = _create_tmp_collection(ctx) + avu.set_on_coll(ctx, tmp_object, "foo", "bar", catch) + result = 
[(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] + collection.remove(ctx, tmp_object) + return result + + +def _test_avu_rmw_collection(ctx, rmw_attributes): + # Test removing with catch and without catch + tmp_object = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_object, "foo", "bar", "baz") + ctx.msi_add_avu('-c', tmp_object, "aap", "noot", "mies") + avu.rmw_from_coll(ctx, tmp_object, rmw_attributes[0], rmw_attributes[1], rmw_attributes[2], rmw_attributes[3]) + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] + collection.remove(ctx, tmp_object) + return result + + +def _test_folder_set_retry_avus(ctx): + tmp_coll = _create_tmp_collection(ctx) + folder.folder_secure_set_retry_avus(ctx, tmp_coll, 2) + # Needed to be able to delete collection + msi.set_acl(ctx, "default", "admin:own", user.full_name(ctx), tmp_coll) + collection.remove(ctx, tmp_coll) + return True + + +def _test_folder_cronjob_status(ctx): + tmp_coll = _create_tmp_collection(ctx) + result_set = folder.set_cronjob_status(ctx, constants.CRONJOB_STATE['RETRY'], tmp_coll) + status = folder.get_cronjob_status(ctx, tmp_coll) + correct_status = status == constants.CRONJOB_STATE['RETRY'] + result_rm = folder.rm_cronjob_status(ctx, tmp_coll) + collection.remove(ctx, tmp_coll) + return result_set, correct_status, result_rm + + +def _test_folder_set_get_last_run(ctx): + tmp_coll = _create_tmp_collection(ctx) + result = folder.set_last_run_time(ctx, tmp_coll) + found, last_run = folder.get_last_run_time(ctx, tmp_coll) + collection.remove(ctx, tmp_coll) + return result, found, last_run + + +def _test_folder_secure_func(ctx, func): + """Create tmp collection, apply func to it and get result, and clean up. + Used for testing functions that modify avu/acls related to folder secure. + Happy flow. 
+ + :param ctx: Combined type of a callback and rei struct + :param func: Function to test + + :returns: Result of action + """ + tmp_coll = _create_tmp_collection(ctx) + # Assume returns True/False, or does not return + result = func(ctx, tmp_coll) + # Needed to be able to delete collection in situations where func changed ACLs + msi.set_acl(ctx, "default", "admin:own", user.full_name(ctx), tmp_coll) + collection.remove(ctx, tmp_coll) + if result is None: + return True + return result -from util import collection, config, data_object, log, resource, rule, user basic_integration_tests = [ + {"name": "msvc.add_avu_collection", + "test": lambda ctx: _test_msvc_add_avu_collection(ctx), + "check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)}, + {"name": "msvc.add_avu_object", + "test": lambda ctx: _test_msvc_add_avu_object(ctx), + "check": lambda x: (("foo", "bar", "baz") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "msvc.json_arrayops.add", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "d", "add", 0, 0), + "check": lambda x: x == '["a", "b", "c", "d"]'}, + {"name": "msvc.json_arrayops.find_exist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "b", "find", 0, 3), + "check": lambda x: x == 1}, + {"name": "msvc.json_arrayops.find_notexist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "d", "find", 0, 3), + "check": lambda x: x == -1}, + {"name": "msvc.json_arrayops.get", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "", "get", 1, 1), + "check": lambda x: x == 'b'}, + {"name": "msvc.json_arrayops.rm_exist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "b", "rm", 0, 0), + "check": lambda x: x == '["a", "c"]'}, + {"name": "msvc.json_arrayops.rm_notexist", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "d", "rm", 0, 0), + "check": lambda x: x == '["a", "b", "c"]'}, + {"name": "msvc.json_arrayops.size", + "test": lambda ctx: _call_msvc_json_arrayops(ctx, '["a", "b", "c"]', "", "size", 0, 3), + "check": lambda x: x == 3}, + {"name": "msvc.json_objops.add_notexist_empty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '', msi.kvpair(ctx, "e", "f"), 'add', 0), + "check": lambda x: x == '{"e": "f"}'}, + {"name": "msvc.json_objops.add_notexist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "f"), 'add', 0), + "check": lambda x: x == '{"a": "b", "e": "f"}'}, + {"name": "msvc.json_objops.add_exist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "g"), 'add', 0), + "check": lambda x: x == '{"a": "b", "e": "g"}'}, + {"name": "msvc.json_objops.get_exist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", ""), 'get', 1), + "check": lambda x: str(x) == "(['c'], ['d'])"}, + {"name": "msvc.json_objops.get_notexist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "e", ""), 'get', 1), + "check": lambda x: str(x) == "(['e'], [''])"}, + {"name": "msvc.json_objops.rm_exist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", "d"), 'rm', 0), + "check": lambda x: x == '{"a": "b"}'}, + {"name": "msvc.json_objops.rm_notexist", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b", "c": "d"}', msi.kvpair(ctx, "c", "e"), 'rm', 0), + "check": lambda x: x == '{"a": "b", "c": "d"}'}, 
+ {"name": "msvc.json_objops.set_notexist_empty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '', msi.kvpair(ctx, "e", "f"), 'set', 0), + "check": lambda x: x == '{"e": "f"}'}, + {"name": "msvc.json_objops.set_notexist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "f"), 'set', 0), + "check": lambda x: x == '{"a": "b", "e": "f"}'}, + {"name": "msvc.json_objops.set_exist_nonempty", + "test": lambda ctx: _call_msvc_json_objops(ctx, '{"a": "b"}', msi.kvpair(ctx, "e", "g"), 'set', 0), + "check": lambda x: x == '{"a": "b", "e": "g"}'}, + {"name": "msvc.msi_vault_stat.file", + "test": lambda ctx: (_call_msvc_stat_vault(ctx, "dev001_1", "/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public License v3.0.uri"), + _call_msvc_stat_vault(ctx, "dev001_2", "/var/lib/irods/Vault1_2/yoda/licenses/GNU General Public License v3.0.uri")), + "check": lambda x: (x[0][0] == "FILE" and x[0][1] == "45") or (x[1][0] == "FILE" and x[1][1] == "45")}, + {"name": "msvc.msi_vault_stat.dir", + "test": lambda ctx: (_call_msvc_stat_vault(ctx, "dev001_1", "/var/lib/irods/Vault1_1/home"), + _call_msvc_stat_vault(ctx, "dev001_2", "/var/lib/irods/Vault1_2/home")), + "check": lambda x: (x[0][0] == "DIR" and x[0][1] == "0") or (x[1][0] == "DIR" and x[1][1] == "0")}, + {"name": "msvc.msi_vault_stat.notexist", + "test": lambda ctx: _call_msvc_stat_vault(ctx, "dev001_1", "/var/lib/irods/Vault1_1/doesnotexist"), + "check": lambda x: x[0] == "NOTEXIST" and x[1] == "0"}, + {"name": "msvc.msi_vault_stat.resourcenotexist", + "test": lambda ctx: _call_msvc_stat_vault_check_exc(ctx, "doesnotexist", "/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public License v3.0.uri"), + "check": lambda x: x}, + {"name": "msvc.msi_vault_stat.outsidevault1", + "test": lambda ctx: _call_msvc_stat_vault_check_exc(ctx, "dev001_1", "/etc/passwd"), + "check": lambda x: x}, + {"name": "msvc.msi_vault_stat.outsidevault2", + "test": lambda ctx: _call_msvc_stat_vault_check_exc(ctx, "dev001_1", "/var/lib/irods/Vault1_2/yoda/licenses/GNU General Public License v3.0.uri"), + "check": lambda x: x}, + {"name": "msvc.rmw_avu_collection_literal", + "test": lambda ctx: _test_msvc_rmw_avu_collection(ctx, ("foo", "bar", "baz")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foot", "hand", "head") in x + and len(x) == 2)}, + {"name": "msvc.rmw_avu_object_literal", + "test": lambda ctx: _test_msvc_rmw_avu_object(ctx, ("foo", "bar", "baz")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foot", "hand", "head") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 2 + )}, + {"name": "msvc.rmw_avu_collection_literal_notexist", + "test": lambda ctx: _test_msvc_rmw_avu_collection(ctx, ("does", "not", "exist")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foo", "bar", "baz") in x + and ("foot", "hand", "head") in x + and len(x) == 3)}, + {"name": "msvc.rmw_avu_object_literal_notexist", + "test": lambda ctx: _test_msvc_rmw_avu_object(ctx, ("does", "not", "exist")), + "check": lambda x: (("aap", "noot", "mies") in x + and ("foo", "bar", "baz") in x + and ("foot", "hand", "head") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 3 + )}, + {"name": "msvc.rmw_avu_collection_wildcard", + "test": lambda ctx: _test_msvc_rmw_avu_collection(ctx, ("fo%", "%", "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len(x) == 1)}, + {"name": "msvc.rmw_avu_object_wildcard", + "test": lambda ctx: _test_msvc_rmw_avu_object(ctx, 
("fo%", "%", "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.set_from_coll.catch.yes", + "test": lambda ctx: _test_avu_set_collection(ctx, True), + "check": lambda x: (("foo", "bar", "") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.set_from_coll.catch.no", + "test": lambda ctx: _test_avu_set_collection(ctx, False), + "check": lambda x: (("foo", "bar", "") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.rmw_from_coll_wildcard.catch.yes", + "test": lambda ctx: _test_avu_rmw_collection(ctx, ("foo", "%", True, "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "avu.rmw_from_coll_wildcard.catch.no", + "test": lambda ctx: _test_avu_rmw_collection(ctx, ("foo", "%", False, "%")), + "check": lambda x: (("aap", "noot", "mies") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "folder.set_can_modify", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.set_can_modify), + "check": lambda x: x}, + {"name": "folder.cronjob_status", + "test": lambda ctx: _test_folder_cronjob_status(ctx), + "check": lambda x: x[0] and x[1] and x[2]}, + {"name": "folder.set_get_last_run_time", + "test": lambda ctx: _test_folder_set_get_last_run(ctx), + "check": lambda x: x[0] and x[1] and x[2] + 25 >= int(time.time())}, + {"name": "folder.set_last_run_time", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.set_last_run_time), + "check": lambda x: x}, + {"name": "folder.check_folder_secure", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.check_folder_secure), + "check": lambda x: x}, + {"name": "folder.folder_secure_fail", + "test": lambda ctx: _test_folder_secure_func(ctx, folder.folder_secure_fail), + "check": lambda x: x}, + {"name": "folder.set_retry_avus", + "test": lambda ctx: _test_folder_set_retry_avus(ctx), + "check": lambda x: x}, + {"name": "folder.determine_new_vault_target.research", + "test": lambda ctx: folder.determine_new_vault_target(ctx, "/tempZone/home/research-initial/testdata"), + "check": lambda x: re.match("^\/tempZone\/home\/vault-initial\/testdata\[[0-9]*\]$", x) is not None}, + {"name": "folder.determine_new_vault_target.deposit", + "test": lambda ctx: folder.determine_new_vault_target(ctx, "/tempZone/home/deposit-pilot/deposit-hi[123123]"), + "check": lambda x: re.match("^\/tempZone\/home\/vault-pilot\/deposit-hi\[[0-9]*\]\[[0-9]*\]$", x) is not None}, + {"name": "folder.determine_new_vault_target.invalid", + "test": lambda ctx: folder.determine_new_vault_target(ctx, "/tempZone/home/not-research-group-not-exist/folder-not-exist"), + "check": lambda x: x == ""}, {"name": "policies.check_anonymous_access_allowed.local", "test": lambda ctx: ctx.rule_check_anonymous_access_allowed("127.0.0.1", ""), "check": lambda x: x['arguments'][1] == 'true'}, diff --git a/policies.py b/policies.py index 06cd50cc7..52b4ce092 100644 --- a/policies.py +++ b/policies.py @@ -445,7 +445,7 @@ def py_acPreProcForModifyAVUMetadata_cp(ctx, _, t_src, t_dst, src, dst): return policy.succeed() -# This PEP is called after a AVU is added (option = 'add'), set (option = +# This PEP is called after an AVU is added (option = 'add'), set (option = # 'set') or removed (option = 'rm') in the research area or the vault. 
Post # conditions defined in folder.py and iiVaultTransitions.r # are called here. diff --git a/policies_folder_status.py b/policies_folder_status.py index 2e64b28f7..a2f00869b 100644 --- a/policies_folder_status.py +++ b/policies_folder_status.py @@ -131,7 +131,7 @@ def post_status_transition(ctx, path, actor, status): # Set state to secure package in vault space. attribute = constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault" avu.set_on_coll(ctx, path, attribute, constants.CRONJOB_STATE['PENDING']) - ctx.iiScheduleCopyToVault() + ctx.iiScheduleCollCopyToVault(path) elif status is constants.research_package_state.FOLDER: # If previous action was submit and new status is FOLDER action is unsubmit. diff --git a/research.py b/research.py index c03fec258..ff61fd41e 100644 --- a/research.py +++ b/research.py @@ -372,8 +372,9 @@ def api_research_file_copy(ctx, filepath, new_filepath, overwrite=False): if filepath == new_filepath: return api.Error('invalid_filepath', 'Origin and copy file paths are equal. Please choose another destination') - coll = pathutil.chop(new_filepath)[0] - data_name = pathutil.chop(new_filepath)[1] + _, org_data_name = pathutil.chop(filepath) + # These are of the NEW filepath + coll, data_name = pathutil.chop(new_filepath) try: validate_filename(data_name.decode('utf-8')) except Exception: @@ -408,7 +409,7 @@ def api_research_file_copy(ctx, filepath, new_filepath, overwrite=False): # Does org file exist? if not data_object.exists(ctx, filepath): - return api.Error('invalid_source', 'The original file ' + data_name + ' can not be found') + return api.Error('invalid_source', 'The original file ' + org_data_name + ' can not be found') # new filename already exists? if not overwrite and data_object.exists(ctx, new_filepath): diff --git a/tools/copy-accepted-folders-to-vault.r b/tools/copy-accepted-folders-to-vault.r index 73f6173b8..ebc7c8334 100644 --- a/tools/copy-accepted-folders-to-vault.r +++ b/tools/copy-accepted-folders-to-vault.r @@ -3,35 +3,9 @@ copyToVault { # Copy research folder to vault. # This script is kept as dumb as possible. - # All processing and error handling is done by rule_folder_secure - *ContInxOld = 1; - msiAddSelectFieldToGenQuery("COLL_NAME", "", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_NAME", "=", UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_VALUE", "=", CRONJOB_PENDING, *GenQInp); - - msiExecGenQuery(*GenQInp, *GenQOut); - msiGetContInxFromGenQueryOut(*GenQOut, *ContInxNew); - - while(*ContInxOld > 0) { - foreach(*row in *GenQOut) { - *folder = *row.COLL_NAME; - # When rule_folder_secure fails continue with the other folders. 
- # *errorcode = '0'; - # rule_folder_secure(*folder, *errorcode); - # if (*errorcode == '0') { - if (errorcode(iiFolderSecure(*folder)) == 0) { - *cronjobState = UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault=" ++ CRONJOB_OK; - msiString2KeyValPair(*cronjobState, *cronjobStateKvp); - *err = errormsg(msiRemoveKeyValuePairsFromObj(*cronjobStateKvp, *folder, "-C"), *msg); - } - } - - *ContInxOld = *ContInxNew; - if(*ContInxOld > 0) { - msiGetMoreRows(*GenQInp, *GenQOut, *ContInxNew); - } - } - msiCloseGenQuery(*GenQInp, *GenQOut); + # All processing and error handling is done by rule_vault_copy_accepted_to_vault + *state = "CRONJOB_PENDING" + rule_vault_copy_to_vault(*state); } input null output ruleExecOut diff --git a/tools/copy-one-coll-to-vault.r b/tools/copy-one-coll-to-vault.r new file mode 100644 index 000000000..ee216a250 --- /dev/null +++ b/tools/copy-one-coll-to-vault.r @@ -0,0 +1,9 @@ +#!/usr/bin/irule -F + +copyOneCollToVault { + # Copy research folder to vault. + *return = ""; + rule_folder_secure(*coll, *return); +} +input *coll="" +output ruleExecOut diff --git a/tools/retry-copy-to-vault.r b/tools/retry-copy-to-vault.r index f45304849..7b072c0aa 100644 --- a/tools/retry-copy-to-vault.r +++ b/tools/retry-copy-to-vault.r @@ -1,35 +1,9 @@ retryCopyToVault { # Copy research folder to vault. # This script is kept as dumb as possible. - # All processing and error handling is done by rule_folder_secure - *ContInxOld = 1; - msiAddSelectFieldToGenQuery("COLL_NAME", "", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_NAME", "=", UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault", *GenQInp); - msiAddConditionToGenQuery("META_COLL_ATTR_VALUE", "=", CRONJOB_RETRY, *GenQInp); - - msiExecGenQuery(*GenQInp, *GenQOut); - msiGetContInxFromGenQueryOut(*GenQOut, *ContInxNew); - - while(*ContInxOld > 0) { - foreach(*row in *GenQOut) { - *folder = *row.COLL_NAME; - # When rule_folder_secure fails continue with the other folders. 
-        #       *errorcode = '0';
-        #       rule_folder_secure(ctx, *folder, *errorcode);
-        #       if (*errorcode == '0') {
-            if (errorcode(iiFolderSecure(*folder)) == 0) {
-                *cronjobState = UUORGMETADATAPREFIX ++ "cronjob_copy_to_vault=" ++ CRONJOB_OK;
-                msiString2KeyValPair(*cronjobState, *cronjobStateKvp);
-                *err = errormsg(msiRemoveKeyValuePairsFromObj(*cronjobStateKvp, *folder, "-C"), *msg);
-            }
-        }
-
-        *ContInxOld = *ContInxNew;
-        if(*ContInxOld > 0) {
-            msiGetMoreRows(*GenQInp, *GenQOut, *ContInxNew);
-        }
-    }
-    msiCloseGenQuery(*GenQInp, *GenQOut);
+    # All processing and error handling is done by rule_vault_copy_accepted_retry_to_vault
+    *state = "CRONJOB_RETRY"
+    rule_vault_copy_to_vault(*state);
 }
 input null
 output ruleExecOut
diff --git a/tools/scheduled-copytovault.sh b/tools/scheduled-copytovault.sh
index 6b57d8ba6..566edbb55 100755
--- a/tools/scheduled-copytovault.sh
+++ b/tools/scheduled-copytovault.sh
@@ -1,2 +1,2 @@
 #!/bin/sh
-irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/copy-accepted-folders-to-vault.r
+irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/copy-one-coll-to-vault.r '*coll="'$1'"'
diff --git a/unit-tests/test_util_misc.py b/unit-tests/test_util_misc.py
index be9ef703c..4b81c7676 100644
--- a/unit-tests/test_util_misc.py
+++ b/unit-tests/test_util_misc.py
@@ -1,31 +1,52 @@
 # -*- coding: utf-8 -*-
 """Unit tests for the misc utils module"""
-__copyright__ = 'Copyright (c) 2023, Utrecht University'
+__copyright__ = 'Copyright (c) 2023-2024, Utrecht University'
 __license__ = 'GPLv3, see LICENSE'
 import sys
+import time
 from unittest import TestCase
 sys.path.append('../util')
-from misc import human_readable_size
+from misc import human_readable_size, last_run_time_acceptable
 class UtilMiscTest(TestCase):
+    def test_last_run_time_acceptable(self):
+        """Test the last run time for copy to vault"""
+        # No last run time (job hasn't been tried before)
+        found = False
+        last_run = 1
+        self.assertEqual(last_run_time_acceptable("b", found, last_run, 300), True)
+
+        # Last run was longer ago than the backoff time, so the job can run
+        now = int(time.time())
+        found = True
+        copy_backoff_time = 300
+        last_run = now - copy_backoff_time - 1
+        self.assertEqual(last_run_time_acceptable("b", found, last_run, copy_backoff_time), True)
+
+        # Last run was within the backoff time, so the job should not run
+        found = True
+        copy_backoff_time = 300
+        last_run = now
+        self.assertEqual(last_run_time_acceptable("b", found, last_run, copy_backoff_time), False)
+
     def test_human_readable_size(self):
         output = human_readable_size(0)
-        self.assertEquals(output, "0 B")
+        self.assertEqual(output, "0 B")
         output = human_readable_size(1024)
-        self.assertEquals(output, "1.0 KiB")
+        self.assertEqual(output, "1.0 KiB")
         output = human_readable_size(1048576)
-        self.assertEquals(output, "1.0 MiB")
+        self.assertEqual(output, "1.0 MiB")
         output = human_readable_size(26843550000)
-        self.assertEquals(output, "25.0 GiB")
+        self.assertEqual(output, "25.0 GiB")
         output = human_readable_size(989560500000000)
-        self.assertEquals(output, "900.0 TiB")
+        self.assertEqual(output, "900.0 TiB")
        output = human_readable_size(112590000000000000)
-        self.assertEquals(output, "100.0 PiB")
+        self.assertEqual(output, "100.0 PiB")
         output = human_readable_size(3931462330709348188)
-        self.assertEquals(output, "3.41 EiB")
+        self.assertEqual(output, "3.41 EiB")
diff --git a/util/avu.py b/util/avu.py
index 9653b96e4..227c4b640 100644
--- a/util/avu.py
+++ b/util/avu.py
@@ 
-1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Utility / convenience functions for dealing with AVUs."""
-__copyright__ = 'Copyright (c) 2019-2021, Utrecht University'
+__copyright__ = 'Copyright (c) 2019-2024, Utrecht University'
 __license__ = 'GPLv3, see LICENSE'
 import itertools
@@ -10,6 +10,7 @@
 import genquery
 import irods_types
+import log
 import msi
 import pathutil
@@ -46,12 +47,40 @@ def set_on_data(ctx, path, a, v):
     msi.set_key_value_pairs_to_obj(ctx, x['arguments'][1], path, '-d')
-def set_on_coll(ctx, coll, a, v):
-    """Set key/value metadata on a collection."""
+def set_on_coll(ctx, coll, a, v, catch=False):
+    """Set key/value metadata on a collection. Optionally catch any exceptions that occur.
+
+    :param ctx:   Combined type of a callback and rei struct
+    :param coll:  Collection to set the AVU on
+    :param a:     Attribute
+    :param v:     Value
+    :param catch: Whether to catch any exceptions that occur
+
+    :returns: True if the AVU was set, False if catch=True and setting the AVU failed
+    """
+    if catch:
+        return _set_on_coll_catch(ctx, coll, a, v)
+
+    _set_on_coll(ctx, coll, a, v)
+    return True
+
+
+def _set_on_coll(ctx, coll, a, v):
     x = msi.string_2_key_val_pair(ctx, '{}={}'.format(a, v), irods_types.BytesBuf())
     msi.set_key_value_pairs_to_obj(ctx, x['arguments'][1], coll, '-C')
+def _set_on_coll_catch(ctx, coll, a, v):
+    """Set AVU, but catch exception."""
+    try:
+        _set_on_coll(ctx, coll, a, v)
+    except Exception:
+        log.write(ctx, "Failed to set AVU {} on coll {}".format(a, coll))
+        return False
+
+    return True
+
+
 def set_on_resource(ctx, resource, a, v):
     """Set key/value metadata on a resource."""
     x = msi.string_2_key_val_pair(ctx, '{}={}'.format(a, v), irods_types.BytesBuf())
@@ -100,11 +129,39 @@ def rm_from_group(ctx, group, a, v):
     msi.remove_key_value_pairs_from_obj(ctx, x['arguments'][1], group, '-u')
-def rmw_from_coll(ctx, obj, a, v, u=''):
-    """Remove AVU from collection with wildcards."""
+def rmw_from_coll(ctx, obj, a, v, catch=False, u=''):
+    """Remove AVU from collection with wildcards. Optionally catch any exceptions that occur. 
+
+    :param ctx:   Combined type of a callback and rei struct
+    :param obj:   Collection to remove the AVU from
+    :param a:     Attribute
+    :param v:     Value
+    :param catch: Whether to catch any exceptions that occur
+    :param u:     Unit
+
+    :returns: True if the AVU was removed, False if catch=True and removal failed
+    """
+    if catch:
+        return _rmw_from_coll_catch(ctx, obj, a, v, u)
+
+    _rmw_from_coll(ctx, obj, a, v, u)
+    return True
+
+
+def _rmw_from_coll(ctx, obj, a, v, u=''):
     msi.rmw_avu(ctx, '-C', obj, a, v, u)
+def _rmw_from_coll_catch(ctx, obj, a, v, u=''):
+    try:
+        _rmw_from_coll(ctx, obj, a, v, u)
+    except Exception:
+        log.write(ctx, "Failed to rm AVU {} on coll {}".format(a, obj))
+        return False
+
+    return True
+
+
 def rmw_from_data(ctx, obj, a, v, u=''):
     """Remove AVU from data object with wildcards."""
     msi.rmw_avu(ctx, '-d', obj, a, v, u)
diff --git a/util/config.py b/util/config.py
index 2054bc27a..704eb688c 100644
--- a/util/config.py
+++ b/util/config.py
@@ -145,7 +145,10 @@ def __repr__(self):
              text_file_extensions=[],
              user_max_connections_enabled=False,
              user_max_connections_number=4,
-             vault_copy_multithread_enabled=True)
+             vault_copy_backoff_time=300,
+             vault_copy_max_retries=5,
+             vault_copy_multithread_enabled=True,
+             python3_interpreter='/usr/local/bin/python3')
 # }}}
diff --git a/util/constants.py b/util/constants.py
index b679d090b..2d91034a0 100644
--- a/util/constants.py
+++ b/util/constants.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Constants that apply to all Yoda environments."""
-__copyright__ = 'Copyright (c) 2016-2023, Utrecht University'
+__copyright__ = 'Copyright (c) 2016-2024, Utrecht University'
 __license__ = 'GPLv3, see LICENSE'
 from enum import Enum
@@ -87,6 +87,8 @@
 IIARCHIVEATTRNAME = UUORGMETADATAPREFIX + 'archival_status'
 IIBAGITOR = UUORGMETADATAPREFIX + 'bagitor'
 IICOPYPARAMSNAME = UUORGMETADATAPREFIX + 'copy_to_vault_params'
+IICOPYRETRYCOUNT = UUORGMETADATAPREFIX + 'retry_count'
+IICOPYLASTRUN = UUORGMETADATAPREFIX + 'last_run'
 DATA_PACKAGE_REFERENCE = UUORGMETADATAPREFIX + 'data_package_reference'
diff --git a/util/misc.py b/util/misc.py
index 062a594f0..2ae4169d6 100644
--- a/util/misc.py
+++ b/util/misc.py
@@ -1,10 +1,23 @@
 # -*- coding: utf-8 -*-
 """Miscellaneous util functions."""
-__copyright__ = 'Copyright (c) 2019-2023, Utrecht University'
+__copyright__ = 'Copyright (c) 2019-2024, Utrecht University'
 __license__ = 'GPLv3, see LICENSE'
 import math
+import time
+
+
+def last_run_time_acceptable(coll, found, last_run, config_backoff_time):
+    """Return whether the last run time is acceptable to continue with task."""
+    now = int(time.time())
+
+    if found:
+        # Too soon to run
+        if now < last_run + config_backoff_time:
+            return False
+
+    return True
 def human_readable_size(size_bytes):
diff --git a/uuPolicies.r b/uuPolicies.r
index 54923686a..7e8dcba60 100644
--- a/uuPolicies.r
+++ b/uuPolicies.r
@@ -45,7 +45,7 @@ acCreateUserZoneCollections {
    uuGetUserType($otherUserName, *type);
    if (*type == "rodsuser") {
        # Do not create home directories for regular users.
-       # but do create trash directories as iRODS always uses the personal trash folder evan when in a group directory
+       # but do create trash directories as iRODS always uses the personal trash folder even when in a group directory
        acCreateCollByAdmin("/"++$rodsZoneProxy++"/trash/home", $otherUserName);
    } else if (*type == "rodsgroup" && ($otherUserName like "read-*")) {
        # Do not create home directories for read- groups. 
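For context: the new vault_copy_backoff_time setting, the IICOPYLASTRUN / IICOPYRETRYCOUNT attributes and the last_run_time_acceptable() helper above are meant to work together when deciding whether a pending copy-to-vault folder may be processed again. A rough sketch of that interplay (illustrative only; the actual precheck is implemented in folder.py and may differ in detail):

    # Sketch, not the actual folder.py code: decide whether a folder's copy-to-vault
    # job is still within its backoff window, based on the org last_run AVU.
    from util import avu, config, constants, misc

    def copy_too_soon(ctx, coll):
        found = False
        last_run = 1
        for a, v, _ in avu.of_coll(ctx, coll):
            if a == constants.IICOPYLASTRUN:
                found = True
                last_run = int(v)
        return not misc.last_run_time_acceptable(coll, found, last_run, config.vault_copy_backoff_time)
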
diff --git a/uuTreeWalk.r b/uuTreeWalk.r
index a61673716..bd610d91e 100644
--- a/uuTreeWalk.r
+++ b/uuTreeWalk.r
@@ -7,8 +7,8 @@
 # \brief Walks through a collection tree and calls an arbitrary rule for each tree-item.
 #
 # \param[in] direction can be "forward" or "reverse"
-# forward means process collection itself, then childs
-# reverse means process childs first
+# forward means process collection itself, then children
+# reverse means process children first
 # reverse is useful e.g. to delete collection trees
 # \param[in] topLevelCollection pathname of the root of the tree, must be collection
 # NB: the root itself is also processed
diff --git a/vault.py b/vault.py
index beeb86f85..237364a31 100644
--- a/vault.py
+++ b/vault.py
@@ -30,6 +30,7 @@
     'api_vault_republish',
     'api_vault_preservable_formats_lists',
     'api_vault_unpreservable_files',
+    'rule_vault_copy_to_vault',
     'rule_vault_copy_numthreads',
     'rule_vault_copy_original_metadata_to_vault',
     'rule_vault_write_license',
@@ -858,24 +859,61 @@ def api_revoke_read_access_research_group(ctx, coll):
     return {'status': 'Success', 'statusInfo': ''}
-def copy_folder_to_vault(ctx, folder, target):
-    """Copy folder and all its contents to target in vault.
+@rule.make()
+def rule_vault_copy_to_vault(ctx, state):
+    """Collect all folders with a given cronjob state
+    and try to copy them to the vault.
+
+    :param ctx:   Combined type of a callback and rei struct
+    :param state: One of constants.CRONJOB_STATE
+    """
+    iter = get_copy_to_vault_colls(ctx, state)
+    for row in iter:
+        coll = row[0]
+        log.write(ctx, "copy_to_vault {}: {}".format(state, coll))
+        if not folder.precheck_folder_secure(ctx, coll):
+            continue
+
+        # failed copy
+        if not folder.folder_secure(ctx, coll):
+            log.write(ctx, "copy_to_vault {} failed for collection <{}>".format(state, coll))
+            folder.folder_secure_set_retry(ctx, coll)
+
+
+def get_copy_to_vault_colls(ctx, cronjob_state):
+    iter = list(genquery.Query(ctx,
+                ['COLL_NAME'],
+                "META_COLL_ATTR_NAME = '{}' AND META_COLL_ATTR_VALUE = '{}'".format(
+                    constants.UUORGMETADATAPREFIX + "cronjob_copy_to_vault",
+                    cronjob_state),
+                output=genquery.AS_LIST))
+    return iter
-    The data will reside onder folder '/original' within the vault.
+
+def copy_folder_to_vault(ctx, coll, target):
+    """Copy folder and all its contents to target in vault using irsync.
+
+    The data will reside under folder '/original' within the vault.
     :param ctx:    Combined type of a callback and rei struct
-    :param folder: Path of a folder in the research space
+    :param coll:   Path of a folder in the research space
     :param target: Path of a package in the vault space
-    :raises Exception: Raises exception when treewalk_and_ingest did not finish correctly
+    :returns: True for successful copy
     """
-    destination = target + '/original'
-    origin = folder
+    returncode = 0
+    try:
+        returncode = subprocess.call(["irsync", "-rK", "i:{}/".format(coll), "i:{}/original".format(target)])
+    except Exception as e:
+        log.write(ctx, "irsync failure: " + str(e))
+        log.write(ctx, "irsync failure for coll <{}> and target <{}>".format(coll, target))
+        return False
-    # Origin is a never changing value to be able to designate a relative path within ingest_object
-    error   = 0  # Initial error state. Should stay 0. 
- if treewalk_and_ingest(ctx, folder, destination, origin, error): - raise Exception('copy_folder_to_vault: Error copying folder to vault') + if returncode != 0: + log.write(ctx, "irsync failure for coll <{}> and target <{}>".format(coll, target)) + return False + + return True def treewalk_and_ingest(ctx, folder, target, origin, error): @@ -930,6 +968,7 @@ def ingest_object(ctx, parent, item, item_is_collection, destination, origin): source_path = parent + "/" + item read_access = msi.check_access(ctx, source_path, 'read object', irods_types.BytesBuf())['arguments'][2] + # TODO use set_acl_check? if read_access != b'\x01': try: msi.set_acl(ctx, "default", "admin:read", user.full_name(ctx), source_path) @@ -973,12 +1012,16 @@ def ingest_object(ctx, parent, item, item_is_collection, destination, origin): return 0 -def set_vault_permissions(ctx, group_name, folder, target): +def set_vault_permissions(ctx, coll, target): """Set permissions in the vault as such that data can be copied to the vault.""" + group_name = folder.collection_group_name(ctx, coll) + if group_name == '': + log.write(ctx, "set_vault_permissions: Cannot determine which deposit or research group <{}> belongs to".format(coll)) + return False + parts = group_name.split('-') base_name = '-'.join(parts[1:]) - parts = folder.split('/') vault_group_name = constants.IIVAULTPREFIX + base_name # Check if noinherit is set @@ -1056,6 +1099,8 @@ def set_vault_permissions(ctx, group_name, folder, target): # Grant research group read access to vault package. msi.set_acl(ctx, "recursive", "admin:read", group_name, target) + return True + @rule.make(inputs=range(4), outputs=range(4, 6)) def rule_vault_process_status_transitions(ctx, coll, new_coll_status, actor, previous_version): @@ -1187,7 +1232,7 @@ def vault_request_status_transitions(ctx, coll, new_vault_status, previous_versi # Except for status transition to PUBLISHED/DEPUBLISHED, # because it is requested by the system before previous pending # transition is removed. - if new_vault_status != constants.vault_package_state.PUBLISHED and new_vault_status != constants.vault_package_state.DEPUBLISHED: + if new_vault_status not in (constants.vault_package_state.PUBLISHED, constants.vault_package_state.DEPUBLISHED): action_status = constants.UUORGMETADATAPREFIX + '"vault_status_action_' + coll_id iter = genquery.row_iterator( "COLL_ID", From eef54ae322a5fd9ec354aec2c80822a4e2105725 Mon Sep 17 00:00:00 2001 From: claravox Date: Thu, 20 Jun 2024 10:29:51 +0200 Subject: [PATCH 07/57] YDA-5393: reorder end of copy to vault --- folder.py | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/folder.py b/folder.py index 55779cd45..32fa9c378 100644 --- a/folder.py +++ b/folder.py @@ -267,10 +267,6 @@ def folder_secure(ctx, coll): if not vault.set_vault_permissions(ctx, coll, target): return False - # Set cronjob status to OK. - if not set_cronjob_status(ctx, constants.CRONJOB_STATE['OK'], coll): - return False - # Vault package is ready, set vault package state to UNPUBLISHED. if not avu.set_on_coll(ctx, target, constants.IIVAULTSTATUSATTRNAME, constants.vault_package_state.UNPUBLISHED, True): return False @@ -283,18 +279,16 @@ def folder_secure(ctx, coll): set_vault_data_package(ctx, coll, target) # Everything is done, set research folder state to SECURED. 
- if not folder_secure_succeed_avus(ctx, coll): + if not folder_secure_succeed_avus(ctx, coll, group_name): return False # Deposit group has been deleted once secured status is set, # so cannot change AVUs on collection if not group_name.startswith("deposit-"): - if not set_acl_check(ctx, "recursive", "admin:null", coll, "Could not set ACL (admin:null) for collection: {}".format(coll)): - return False - + set_acl_check(ctx, "recursive", "admin:null", coll, "Could not set ACL (admin:null) for collection: {}".format(coll)) set_acl_parents(ctx, "default", "admin:null", coll) - # All went well + # All (mostly) went well return True @@ -386,9 +380,8 @@ def retry_attempts(ctx, coll): return True -def folder_secure_succeed_avus(ctx, coll): +def folder_secure_succeed_avus(ctx, coll, group_name): """Set/rm AVUs on source folder when successfully secured folder""" - # attributes = [x[0] for x in avu.of_coll(ctx, coll)] attributes = [x[0] for x in get_org_metadata(ctx, coll)] # In cases where copytovault only ran once, okay that these attributes were not created @@ -399,15 +392,25 @@ def folder_secure_succeed_avus(ctx, coll): if not avu.rmw_from_coll(ctx, coll, constants.IICOPYLASTRUN, "%", True): return False - if (not avu.rmw_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, "%", True) - or not rm_cronjob_status(ctx, coll)): + # Set cronjob status to final state before deletion + if not set_cronjob_status(ctx, constants.CRONJOB_STATE['OK'], coll): + return False + + if not rm_cronjob_status(ctx, coll): return False - # Note: this is the AVU that must always be *last* to be set in folder secure, - # otherwise could be a problem for deposit groups + # Note: this is the status that must always be one of the last to be set + # in folder secure, otherwise could be a problem for deposit groups if not avu.set_on_coll(ctx, coll, constants.IISTATUSATTRNAME, constants.research_package_state.SECURED, True): return False + # Remove target AVU on source folder. This should be done after all possibly failing steps + # have occurred in folder_secure (any "return False" steps), so that if those trip a retry state, + # on retry folder_secure can reuse the target from before. 
+ if (not group_name.startswith("deposit-") + and not avu.rmw_from_coll(ctx, coll, constants.IICOPYPARAMSNAME, "%", True)): + return False + return True @@ -417,14 +420,15 @@ def folder_secure_set_retry(ctx, coll): new_retry_count = get_retry_count(ctx, coll) + 1 if new_retry_count > config.vault_copy_max_retries: folder_secure_fail(ctx, coll) - send_fail_folder_secure_notification(ctx, coll) + send_folder_secure_notification(ctx, coll, "Data package failed to copy to vault after maximum retries") else: - folder_secure_set_retry_avus(ctx, coll, new_retry_count) + if not folder_secure_set_retry_avus(ctx, coll, new_retry_count): + send_folder_secure_notification(ctx, coll, "Failed to set retry state on data package") def folder_secure_set_retry_avus(ctx, coll, retry_count): avu.set_on_coll(ctx, coll, constants.IICOPYRETRYCOUNT, str(retry_count), True) - set_cronjob_status(ctx, constants.CRONJOB_STATE['RETRY'], coll) + return set_cronjob_status(ctx, constants.CRONJOB_STATE['RETRY'], coll) def folder_secure_fail(ctx, coll): @@ -436,11 +440,10 @@ def folder_secure_fail(ctx, coll): set_cronjob_status(ctx, constants.CRONJOB_STATE['UNRECOVERABLE'], coll) -def send_fail_folder_secure_notification(ctx, coll): - """Send notifications to datamanagers that copy to vault failed""" +def send_folder_secure_notification(ctx, coll, message): + """Send notification about folder secure to relevant datamanagers""" if datamanager_exists(ctx, coll): datamanagers = get_datamanagers(ctx, coll) - message = "Data package failed to copy to vault after maximum retries" for datamanager in datamanagers: datamanager = '{}#{}'.format(*datamanager) notifications.set(ctx, "system", datamanager, coll, message) From 6811673efc12f24f06b87ec930d126a0669876db Mon Sep 17 00:00:00 2001 From: claravox Date: Fri, 19 Jul 2024 16:11:47 +0200 Subject: [PATCH 08/57] YDA-5778: Folder with apostrophe api tests --- tests/features/api/api_research.feature | 58 +++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/features/api/api_research.feature b/tests/features/api/api_research.feature index 39a9fe31d..43994a824 100644 --- a/tests/features/api/api_research.feature +++ b/tests/features/api/api_research.feature @@ -21,6 +21,29 @@ Feature: Research API | /tempZone/home/research-initial | api_test_1234567890 | + Scenario Outline: Research folder with apostrophe add + Given user researcher is authenticated + And the Yoda research folder add API is queried with and + Then the response status code is "400" + And folder does not exist in + + Examples: + | collection | folder | + | /tempZone/home/research-initial | api_test_folder's | + + + @deposit + Scenario Outline: Deposit folder with apostrophe add + Given user researcher is authenticated + And the Yoda research folder add API is queried with and + Then the response status code is "400" + And folder does not exist in + + Examples: + | collection | folder | + | /tempZone/home/deposit-pilot | api_test_folder's | + + Scenario Outline: Research folder copy Given user researcher is authenticated And the Yoda research folder copy API is queried with , , and @@ -34,6 +57,18 @@ Feature: Research API | /tempZone/home/research-initial | api_test_copy | api_test_move1 | + Scenario Outline: Research folder copy with apostrophe + Given user researcher is authenticated + And the Yoda research folder copy API is queried with , , and + Then the response status code is "400" + And folder exists in + And folder does not exist in + + Examples: + | collection | folder | copy | + | 
/tempZone/home/research-initial | api_test_copy | api_test_copy2's | + + Scenario Outline: Research folder move Given user researcher is authenticated And the Yoda research folder move API is queried with , , and @@ -46,6 +81,17 @@ Feature: Research API | /tempZone/home/research-initial | api_test_move1 | api_test_move2 | + Scenario Outline: Research folder move with apostrophe + Given user researcher is authenticated + And the Yoda research folder move API is queried with , , and + Then the response status code is "400" + And folder does not exist in + + Examples: + | collection | folder | move | + | /tempZone/home/research-initial | api_test_move1 | api_test_move2's | + + Scenario Outline: Research folder rename Given user researcher is authenticated And the Yoda research folder rename API is queried with , and @@ -58,6 +104,18 @@ Feature: Research API | /tempZone/home/research-initial | api_test_folder | api_test_folder_renamed | + Scenario Outline: Research folder rename with apostrophe + Given user researcher is authenticated + And the Yoda research folder rename API is queried with , and + Then the response status code is "400" + And folder exists in + And folder does not exist in + + Examples: + | collection | folder_old | folder | + | /tempZone/home/research-initial | api_test_folder_renamed | api_test_folder_renamed's | + + Scenario Outline: Research file copy Given user researcher is authenticated And the Yoda research file copy API is queried with , , and From faf838f5a28c83e43e9feaf9e0c741f2dd0f75e1 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Tue, 23 Jul 2024 17:36:03 +0200 Subject: [PATCH 09/57] YDA-5835: fix zone selection in transform script In the script for transforming publication metadata to support base DOIs, adjust the query condition for selecting the zone of a collection. Selecting on USER_ZONE does not reliably return all data package collections in the zone. We need to either check the zone of the collection itself, or check the beginning of the collection name to select its zone. 
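To illustrate the difference (this snippet is explanatory and not part of the patch; it assumes the same genquery API the script already uses), selecting on the collection's own zone looks like this:

    # COLL_ZONE_NAME constrains the collection itself, whereas USER_ZONE constrains
    # the user side of the query and therefore does not reliably match every
    # data package collection in the zone.
    iter = genquery.row_iterator(
        "COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE",
        "COLL_ZONE_NAME = '{}' AND META_COLL_ATTR_NAME LIKE 'org_publication_yoda%'".format(zone),
        genquery.AS_TUPLE,
        callback)

    # Equivalent alternative: match the start of the collection name.
    # "COLL_NAME LIKE '/{}/%' AND META_COLL_ATTR_NAME LIKE 'org_publication_yoda%'".format(zone)
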
--- tools/transform-existing-publications.r | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/transform-existing-publications.r b/tools/transform-existing-publications.r index 91526ad75..9429c0790 100644 --- a/tools/transform-existing-publications.r +++ b/tools/transform-existing-publications.r @@ -15,13 +15,13 @@ def main(rule_args, callback, rei): # Changing yoda prefix -> version iter = genquery.row_iterator( "COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "USER_ZONE = '{}' AND META_COLL_ATTR_NAME LIKE 'org_publication_yoda%'".format(zone), + "COLL_ZONE_NAME = '{}' AND META_COLL_ATTR_NAME LIKE 'org_publication_yoda%'".format(zone), genquery.AS_TUPLE, callback) iter2 = genquery.row_iterator( "COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE", - "USER_ZONE = '{}' AND META_COLL_ATTR_NAME in ('org_publication_DOIAvailable', 'org_publication_DOIMinted')".format(zone), + "COLL_ZONE_NAME = '{}' AND META_COLL_ATTR_NAME in ('org_publication_DOIAvailable', 'org_publication_DOIMinted')".format(zone), genquery.AS_TUPLE, callback) @@ -33,4 +33,4 @@ def main(rule_args, callback, rei): subprocess.call(["imeta", "mod", "-C", row[0], row[1], row[2], "n:{}".format(attr_name), "v:{}".format(row[2])]) INPUT null -OUTPUT ruleExecOut \ No newline at end of file +OUTPUT ruleExecOut From db0f5787722f49e18f09023f00fa88a4f1e084b5 Mon Sep 17 00:00:00 2001 From: claravox Date: Tue, 30 Jul 2024 11:05:34 +0200 Subject: [PATCH 10/57] YDA-5747: give readers access to vault package --- browse.py | 2 +- groups.py | 2 +- tests/features/api/api_vault.feature | 2 +- tests/features/smoke/smoke_test.feature | 10 +- tests/features/ui/ui_browse.feature | 2 +- tests/features/ui/ui_vault.feature | 34 ++++- tests/step_defs/ui/test_ui_vault.py | 65 +++++++--- vault.py | 158 ++++++++++++++---------- 8 files changed, 177 insertions(+), 98 deletions(-) diff --git a/browse.py b/browse.py index 71fbeb30f..8687c6a5f 100644 --- a/browse.py +++ b/browse.py @@ -310,7 +310,7 @@ def _filter_vault_deposit_index(row): :param row: row of results data from GenQuery, containing collection name (COLL_NAME) - :returns: boolean value that indicated whether row should be displayed + :returns: boolean value that indicates whether row should be displayed """ # Remove ORDER_BY etc. wrappers from column names. x = {re.sub('.*\((.*)\)', '\\1', k): v for k, v in row.items()} diff --git a/groups.py b/groups.py index 1cad41faf..6845b88da 100644 --- a/groups.py +++ b/groups.py @@ -89,7 +89,7 @@ def getGroupsData(ctx): user = row[1] zone = row[2] - if name != user and name != "rodsadmin" and name != "public": + if name not in (user, 'rodsadmin', 'public'): user = user + "#" + zone if name.startswith("read-"): # Match read-* group with research-* or initial-* group. 
diff --git a/tests/features/api/api_vault.feature b/tests/features/api/api_vault.feature index ac3f90e2b..0039a709b 100644 --- a/tests/features/api/api_vault.feature +++ b/tests/features/api/api_vault.feature @@ -182,7 +182,7 @@ Feature: Vault API | /tempZone/home/vault-default-3 | - Scenario Outline: Revoke grant access to research group + Scenario Outline: Grant read access to research group Given user datamanager is authenticated And data package exists in And the Yoda vault grant read access research group API is queried on datapackage in diff --git a/tests/features/smoke/smoke_test.feature b/tests/features/smoke/smoke_test.feature index df8ba09db..269775f9e 100644 --- a/tests/features/smoke/smoke_test.feature +++ b/tests/features/smoke/smoke_test.feature @@ -255,19 +255,19 @@ Feature: Smoke tests Then the response status code is "200" Examples: - | vault | - | /tempZone/home/vault-smoke-test | + | vault | + | /tempZone/home/vault-smoke-test | - Scenario Outline: Vault revoke grant access to research group + Scenario Outline: Vault grant read access to research group Given user smoke_account is authenticated And data package exists in And the Yoda vault grant read access research group API is queried on datapackage in Then the response status code is "200" Examples: - | vault | - | /tempZone/home/vault-smoke-test | + | vault | + | /tempZone/home/vault-smoke-test | Scenario Outline: Vault get publication terms diff --git a/tests/features/ui/ui_browse.feature b/tests/features/ui/ui_browse.feature index c85a241e5..de2d0a3ec 100644 --- a/tests/features/ui/ui_browse.feature +++ b/tests/features/ui/ui_browse.feature @@ -26,7 +26,7 @@ Feature: Browse UI #Then content of sub-folder is shown Examples: - | user | folder | + | user | folder | | researcher | vault-core-1 | | researcher | vault-default-2 | | researcher | vault-core-2 | diff --git a/tests/features/ui/ui_vault.feature b/tests/features/ui/ui_vault.feature index 33f5f2d09..7ccfaa159 100644 --- a/tests/features/ui/ui_vault.feature +++ b/tests/features/ui/ui_vault.feature @@ -170,26 +170,50 @@ Feature: Vault UI Given user datamanager is logged in And module "vault" is shown When user browses to data package in - And user clicks action menu to revoke access - Then action menu holds option to grant access to research group + And user clicks action menu to change access + Then revoke text is displayed + When user confirms revoke read permissions Examples: | vault | | vault-initial1 | + Scenario Outline: Research group user has had access revoked to vault package + Given user is logged in + When user browses to previous vault package url + Then user does not have access to folder + + Examples: + | user | + | researcher | + | viewer | + + Scenario Outline: Grant read access to research group Given user datamanager is logged in And module "vault" is shown When user browses to data package in - And clicks action menu to grant access - Then action menu holds option to revoke access from research group + And user clicks action menu to change access + Then grant text is displayed + When user confirms grant read permissions Examples: | vault | | vault-initial1 | + Scenario Outline: Research group user has been granted access to vault package + Given user is logged in + When user browses to previous vault package url + Then contents of folder are shown + + Examples: + | user | vault | + | researcher | vault-initial1 | + | viewer | vault-initial1 | + + Scenario Outline: Copy datapackage to research space Given user datamanager is logged in And module 
"vault" is shown @@ -238,4 +262,4 @@ Feature: Vault UI Examples: | vault | group | - | vault-initial1 | research-initial1 | \ No newline at end of file + | vault-initial1 | research-initial1 | diff --git a/tests/step_defs/ui/test_ui_vault.py b/tests/step_defs/ui/test_ui_vault.py index 1ed185ebe..4b0739cb8 100644 --- a/tests/step_defs/ui/test_ui_vault.py +++ b/tests/step_defs/ui/test_ui_vault.py @@ -1,7 +1,7 @@ # coding=utf-8 """Vault UI feature tests.""" -__copyright__ = 'Copyright (c) 2020-2022, Utrecht University' +__copyright__ = 'Copyright (c) 2020-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import time @@ -15,9 +15,12 @@ scenarios('../../features/ui/ui_vault.feature') +previous_vault_path = '' + @when(parsers.parse("user browses to data package in {vault}")) def ui_browse_data_package(browser, vault): + global previous_vault_path link = [] while len(link) == 0: link = browser.links.find_by_partial_text(vault) @@ -31,6 +34,7 @@ def ui_browse_data_package(browser, vault): research = vault.replace("vault-", "research-") data_packages = browser.links.find_by_partial_text(research) data_packages.click() + previous_vault_path = browser.driver.current_url @when('user submits the data package for publication') @@ -158,36 +162,59 @@ def ui_data_package_provenance_information_is_visible(browser): assert browser.is_element_visible_by_css('.actionlog') -@when('user clicks action menu to revoke access') -def ui_data_package_revoke_vault_access(browser): +@when('user clicks action menu to change access') +def ui_data_package_change_vault_access(browser): browser.find_by_id('actionMenu').click() - browser.find_by_css('a.action-revoke-vault-access').click() + browser.find_by_css('a.action-change-vault-access').click() -@then('action menu holds option to grant access to research group') -def ui_data_package_grant_option_present(browser): - browser.find_by_id('actionMenu').click() - assert browser.is_element_present_by_css('.action-grant-vault-access') +@then('revoke text is displayed') +def ui_data_package_revoke_message(browser): + time.sleep(3) + assert browser.is_text_present('revoke') -@when('clicks action menu to grant access') -def ui_data_package_grant_vault_access(browser): - browser.find_by_id('actionMenu').click() - browser.find_by_css('a.action-grant-vault-access').click() +@then('grant text is displayed') +def ui_data_package_grant_message(browser): + time.sleep(3) + assert browser.is_text_present('grant') -@then('action menu holds option to revoke access from research group') -def ui_data_package_revoke_option_present(browser): - browser.find_by_id('actionMenu').click() - assert browser.is_element_present_by_css('.action-revoke-vault-access') +@when("user confirms revoke read permissions") +def ui_data_package_revoke_read_permissions_confirm(browser): + browser.find_by_css(".action-confirm-revoke-read-permissions").click() + + +@when("user confirms grant read permissions") +def ui_data_package_grant_read_permissions_confirm(browser): + browser.find_by_css(".action-confirm-grant-read-permissions").click() @when('user clicks action menu to copy data package to research') -def ui_data_package_copy_to_resarch(browser): +def ui_data_package_copy_to_research(browser): browser.find_by_id('actionMenu').click() browser.find_by_css('a.action-copy-vault-package-to-research').click() +@when('user browses to previous vault package url') +def ui_data_package_browses_previous_url(browser): + if len(previous_vault_path): + browser.visit(previous_vault_path) + else: + assert False + + 
+@then('contents of folder are shown') +def ui_data_package_contents(browser): + assert browser.is_text_present('yoda-metadata') + assert browser.is_text_present('original') + + +@then('user does not have access to folder') +def ui_data_package_no_access(browser): + assert browser.is_text_present('This vault space path does not exist') + + @when(parsers.parse("user chooses research folder corresponding to {vault}")) def ui_browse_research_to_copy_data_package_to(browser, vault): research = vault.replace("vault-", "research-") @@ -208,8 +235,8 @@ def ui_user_presses_copy_package_button(browser): @then('data package is copied to research area') def ui_data_package_is_copied_to_research(browser): - browser.find_by_id('actionMenu').click() - browser.is_element_present_by_css('.action-revoke-vault-access') + # TODO + pass @when('user clicks clicks action menu to check compliance') diff --git a/vault.py b/vault.py index 237364a31..d70fc8323 100644 --- a/vault.py +++ b/vault.py @@ -572,7 +572,7 @@ def api_vault_collection_details(ctx, path): if not collection.exists(ctx, path): return api.Error('nonexistent', 'The given path does not exist') - # Check if collection is in vault spcae. + # Check if collection is in vault space. space, _, group, subpath = pathutil.info(path) if space is not pathutil.Space.VAULT: return {} @@ -765,25 +765,74 @@ def api_vault_get_publication_terms(ctx): return api.Error('TermsReadFailed', 'Could not open Terms and Agreements.') -@api.make() -def api_grant_read_access_research_group(ctx, coll): - """Grant read rights of research group for datapackage in vault. +def change_read_access_group(ctx, coll, actor, group, grant=True): + """Grant/revoke research group read access to vault package. - :param ctx: Combined type of a callback and rei struct - :param coll: Collection of data package to remove read rights from + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to grant/remove read rights from + :param actor: User changing the permissions + :param group: Group to grant/revoke read access to vault package + :param grant: Whether to grant or revoke access - :returns: API status + :returns: 2-Tuple of boolean successfully changed, API status if error + """ + try: + acl_kv = msi.kvpair(ctx, "actor", actor) + if grant: + msi.sudo_obj_acl_set(ctx, "recursive", "read", group, coll, acl_kv) + else: + msi.sudo_obj_acl_set(ctx, "recursive", "null", group, coll, acl_kv) + except Exception: + policy_error = policies_datamanager.can_datamanager_acl_set(ctx, coll, actor, group, "1", "read") + if bool(policy_error): + return False, api.Error('ErrorACLs', 'Could not acquire datamanager access to {}.'.format(coll)) + else: + return False, api.Error('ErrorACLs', str(policy_error)) + + return True, '' + + +def check_change_read_access_research_group(ctx, coll, grant=True): + """Initial checks when changing read rights of research group for datapackage in vault. 
+ + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to revoke/grant read rights from + :param grant: Whether to grant or revoke read rights + + :returns: 2-Tuple of boolean whether ok to continue and API status if error """ + verb = "grant" if grant else "revoke" + if not collection.exists(ctx, coll): - return api.Error('nonexistent', 'The given path does not exist') + return False, api.Error('nonexistent', 'The given path does not exist') coll_parts = coll.split('/') if len(coll_parts) != 5: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') + return False, api.Error('invalid_collection', 'The datamanager can only {} permissions to vault packages'.format(verb)) - space, zone, group, subpath = pathutil.info(coll) + space, _, _, _ = pathutil.info(coll) if space is not pathutil.Space.VAULT: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') + return False, api.Error('invalid_collection', 'The datamanager can only {} permissions to vault packages'.format(verb)) + + return True, '' + + +def change_read_access_research_group(ctx, coll, grant=True): + """Grant/revoke read rights of members of research group to a + datapackage in vault. This operation also includes read only members. + + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to grant/remove read rights from + :param grant: Whether to grant or revoke access + + :returns: API status + """ + verb = "granting" if grant else "revoking" + response, api_error = check_change_read_access_research_group(ctx, coll, True) + if not response: + return api_error + + _, _, group, subpath = pathutil.info(coll) # Find category group_parts = group.split('-') @@ -792,71 +841,44 @@ def api_grant_read_access_research_group(ctx, coll): else: research_group_name = 'research-' + '-'.join(group_parts[1:]) category = groups.group_category(ctx, group) + read_group_name = 'read-' + '-'.join(group_parts[1:]) # Is datamanager? actor = user.full_name(ctx) if groups.user_role(ctx, actor, 'datamanager-' + category) in ['normal', 'manager']: - # Grant research group read access to vault package. - try: - acl_kv = msi.kvpair(ctx, "actor", actor) - msi.sudo_obj_acl_set(ctx, "recursive", "read", research_group_name, coll, acl_kv) - except Exception: - policy_error = policies_datamanager.can_datamanager_acl_set(ctx, coll, actor, research_group_name, "1", "read") - if bool(policy_error): - return api.Error('ErrorACLs', 'Could not acquire datamanager access to {}.'.format(coll)) - else: - return api.Error('ErrorACLs', str(policy_error)) + # Grant/revoke research group read access to vault package. + for group_name in (research_group_name, read_group_name): + response, api_error = change_read_access_group(ctx, coll, actor, group_name, grant) + if not response: + return api_error else: - return api.Error('NoDatamanager', 'Actor must be a datamanager for granting access') + return api.Error('NoDatamanager', 'Actor must be a datamanager for {} access'.format(verb)) return {'status': 'Success', 'statusInfo': ''} @api.make() -def api_revoke_read_access_research_group(ctx, coll): - """Revoke read rights of research group for datapackage in vault. +def api_grant_read_access_research_group(ctx, coll): + """Grant read rights of research group for datapackage in vault. 
:param ctx: Combined type of a callback and rei struct :param coll: Collection of data package to remove read rights from :returns: API status """ - if not collection.exists(ctx, coll): - return api.Error('nonexistent', 'The given path does not exist') + return change_read_access_research_group(ctx, coll, True) - coll_parts = coll.split('/') - if len(coll_parts) != 5: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') - - space, zone, group, subpath = pathutil.info(coll) - if space is not pathutil.Space.VAULT: - return api.Error('invalid_collection', 'The datamanager can only revoke permissions to vault packages') - # Find category - group_parts = group.split('-') - if subpath.startswith("deposit-"): - research_group_name = 'deposit-' + '-'.join(group_parts[1:]) - else: - research_group_name = 'research-' + '-'.join(group_parts[1:]) - category = groups.group_category(ctx, group) +@api.make() +def api_revoke_read_access_research_group(ctx, coll): + """Revoke read rights of research group for datapackage in vault. - # Is datamanager? - actor = user.full_name(ctx) - if groups.user_role(ctx, actor, 'datamanager-' + category) in ['normal', 'manager']: - # Grant research group read access to vault package. - try: - acl_kv = msi.kvpair(ctx, "actor", actor) - msi.sudo_obj_acl_set(ctx, "recursive", "null", research_group_name, coll, acl_kv) - except Exception: - policy_error = policies_datamanager.can_datamanager_acl_set(ctx, coll, actor, research_group_name, "1", "read") - if bool(policy_error): - return api.Error('ErrorACLs', 'Could not acquire datamanager access to {}.'.format(coll)) - else: - return api.Error('ErrorACLs', str(policy_error)) - else: - return api.Error('NoDatamanager', 'Actor must be a datamanager for revoking access') + :param ctx: Combined type of a callback and rei struct + :param coll: Collection of data package to remove read rights from - return {'status': 'Success', 'statusInfo': ''} + :returns: API status + """ + return change_read_access_research_group(ctx, coll, False) @rule.make() @@ -1021,8 +1043,12 @@ def set_vault_permissions(ctx, coll, target): parts = group_name.split('-') base_name = '-'.join(parts[1:]) + valid_read_groups = [group_name] vault_group_name = constants.IIVAULTPREFIX + base_name + if parts[0] != 'deposit': + read_group_name = "read-" + base_name + valid_read_groups.append(read_group_name) # Check if noinherit is set zone = user.zone(ctx) @@ -1061,11 +1087,12 @@ def set_vault_permissions(ctx, coll, target): if access_name != "read object": # Grant the research group read-only access to the collection to enable browsing through the vault. - try: - msi.set_acl(ctx, "default", "admin:read", group_name, vault_path) - log.write(ctx, "Granted " + group_name + " read access to " + vault_path) - except msi.Error: - log.write(ctx, "Failed to grant " + group_name + " read access to " + vault_path) + for name in valid_read_groups: + try: + msi.set_acl(ctx, "default", "admin:read", name, vault_path) + log.write(ctx, "Granted " + name + " read access to " + vault_path) + except msi.Error: + log.write(ctx, "Failed to grant " + name + " read access to " + vault_path) # Check if vault group has ownership iter = genquery.row_iterator( @@ -1096,8 +1123,9 @@ def set_vault_permissions(ctx, coll, target): if group.exists(ctx, datamanager_group_name): msi.set_acl(ctx, "recursive", "admin:read", datamanager_group_name, target) - # Grant research group read access to vault package. 
- msi.set_acl(ctx, "recursive", "admin:read", group_name, target) + # Grant research group, research group readers read access to vault package. + for name in valid_read_groups: + msi.set_acl(ctx, "recursive", "admin:read", name, target) return True @@ -1161,7 +1189,7 @@ def vault_process_status_transitions(ctx, coll, new_coll_status, actor, previous iter = genquery.row_iterator( "META_COLL_ATTR_VALUE", "COLL_NAME = '%s' AND META_COLL_ATTR_NAME = 'org_publication_landingPageUrl'" % (coll), - genquery.AS_LIST, callback + genquery.AS_LIST, ctx ) for row in iter: @@ -1172,7 +1200,7 @@ def vault_process_status_transitions(ctx, coll, new_coll_status, actor, previous iter = genquery.row_iterator( "META_COLL_ATTR_VALUE", "COLL_NAME = '%s' AND META_COLL_ATTR_NAME = 'org_publication_versionDOI'" % (coll), - genquery.AS_LIST, callback + genquery.AS_LIST, ctx ) for row in iter: From a2a43bf8fc8e1bf9c407606add6da3ae9b670ea5 Mon Sep 17 00:00:00 2001 From: Leonidas Triantafyllou Date: Thu, 1 Aug 2024 08:56:18 +0200 Subject: [PATCH 11/57] YDA-5775: fix data request internal error --- datarequest.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/datarequest.py b/datarequest.py index 63a278ef5..76ba69a3a 100644 --- a/datarequest.py +++ b/datarequest.py @@ -496,7 +496,10 @@ def datarequest_owner_get(ctx, request_id): + JSON_EXT) # Get and return data request owner - return jsonutil.read(ctx, file_path)['owner'] + try: + return jsonutil.read(ctx, file_path)['owner'] + except Exception: + return None def datarequest_is_reviewer(ctx, request_id, pending=False): @@ -1046,7 +1049,10 @@ def api_datarequest_get(ctx, request_id): datarequest_action_permitted(ctx, request_id, ["PM", "DM", "DAC", "OWN"], None) # Get request type - datarequest_type = type_get(ctx, request_id).value + try: + datarequest_type = type_get(ctx, request_id).value + except Exception as e: + return api.Error("datarequest_type_fail", "Error: {}".format(e)) # Get request status datarequest_status = status_get(ctx, request_id).value From f7608d8401834d1835371cdc9d300db0dea2fb8f Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Thu, 1 Aug 2024 14:40:35 +0200 Subject: [PATCH 12/57] Add basic integration tests util.group --- integration_tests.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/integration_tests.py b/integration_tests.py index 411581051..68229f49b 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -13,7 +13,8 @@ import uuid import folder -from util import avu, collection, config, constants, data_object, log, msi, resource, rule, user +import schema +from util import avu, collection, config, constants, data_object, group, log, msi, resource, rule, user def _call_msvc_stat_vault(ctx, resc_name, data_path): @@ -363,6 +364,27 @@ def _test_folder_secure_func(ctx, func): {"name": "util.data_object.get_group_owners", "test": lambda ctx: data_object.get_group_owners(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "check": lambda x: x == [['research-initial', 'tempZone']]}, + {"name": "util.group.exists.yes", + "test": lambda ctx: group.exists(ctx, "research-initial"), + "check": lambda x: x}, + {"name": "util.group.exists.no", + "test": lambda ctx: group.exists(ctx, "research-doesnotexist"), + "check": lambda x: not x}, + {"name": "util.group.get_category", + "test": lambda ctx: group.get_category(ctx, "research-initial"), + "check": lambda x: x == "test-automation"}, + {"name": "util.group.is_member.yes", + "test": lambda ctx: 
group.is_member(ctx, "research-initial", "researcher"), + "check": lambda x: x}, + {"name": "util.group.is_member.no", + "test": lambda ctx: group.is_member(ctx, "research-initial", "rods"), + "check": lambda x: not x}, + {"name": "util.group.members.normal", + "test": lambda ctx: group.members(ctx, "research-initial"), + "check": lambda x: sorted([member for member in x]) == sorted([('functionaladminpriv', 'tempZone'), ('functionaladminpriv@yoda.test', 'tempZone'), ('groupmanager', 'tempZone'), ('groupmanager@yoda.test', 'tempZone'), ('researcher', 'tempZone'), ('researcher@yoda.test', 'tempZone')])}, + {"name": "util.group.members.doesnotexist", + "test": lambda ctx: user.exists(ctx, "research-doesnotexist"), + "check": lambda x: x is False}, {"name": "util.resource.exists.yes", "test": lambda ctx: resource.exists(ctx, "irodsResc"), "check": lambda x: x}, From 370dd3a00cae52d675907ab6acdd006c55bfb4d6 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Thu, 1 Aug 2024 17:04:39 +0200 Subject: [PATCH 13/57] YDA-5863: fix log messages schema transformation The log messages for copying ACLs did not match the actions that the system was performing. --- schema_transformation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schema_transformation.py b/schema_transformation.py index 817da02e9..1fd7a0984 100644 --- a/schema_transformation.py +++ b/schema_transformation.py @@ -141,10 +141,10 @@ def copy_acls_from_parent(ctx, path, recursive_flag): log.write(ctx, "iiCopyACLsFromParent: granting own to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") msi.set_acl(ctx, recursive_flag, "own", user_name, path) elif access_name == "read object": - log.write(ctx, "iiCopyACLsFromParent: granting own to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") + log.write(ctx, "iiCopyACLsFromParent: granting read to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") msi.set_acl(ctx, recursive_flag, "read", user_name, path) elif access_name == "modify object": - log.write(ctx, "iiCopyACLsFromParent: granting own to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") + log.write(ctx, "iiCopyACLsFromParent: granting write to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") msi.set_acl(ctx, recursive_flag, "write", user_name, path) From e654d8208b3501792578f15fe69a8f82d0341772 Mon Sep 17 00:00:00 2001 From: claravox Date: Fri, 2 Aug 2024 11:09:20 +0200 Subject: [PATCH 14/57] YDA-5747: script for granting read members access to vault packages --- browse.py | 2 +- folder.py | 1 - .../grant-readers-access-to-vault-packages.r | 9 ++ .../grant-readers-access-to-vault-packages.sh | 2 + vault.py | 134 ++++++++++++++++++ 5 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 tools/grant-readers-access-to-vault-packages.r create mode 100755 tools/grant-readers-access-to-vault-packages.sh diff --git a/browse.py b/browse.py index 8687c6a5f..56b870f72 100644 --- a/browse.py +++ b/browse.py @@ -264,7 +264,7 @@ def transform(row): if sort_on == 'modified': cols = ['COLL_NAME', 'COLL_PARENT_NAME', 'MIN(COLL_CREATE_TIME)', 'ORDER(COLL_MODIFY_TIME)'] else: - cols = ['ORDER(COLL_NAME)', 'COLL_PARENT_NAME' 'MIN(COLL_CREATE_TIME)', 'MAX(COLL_MODIFY_TIME)'] + cols = ['ORDER(COLL_NAME)', 'COLL_PARENT_NAME', 'MIN(COLL_CREATE_TIME)', 'MAX(COLL_MODIFY_TIME)'] where = "COLL_PARENT_NAME like '{}%%' AND COLL_NAME like '%%{}%%'".format("/" + zone + 
"/home", search_string) elif search_type == 'metadata': if sort_on == 'modified': diff --git a/folder.py b/folder.py index 32fa9c378..8f5a15ac5 100644 --- a/folder.py +++ b/folder.py @@ -345,7 +345,6 @@ def set_can_modify(ctx, coll): check_access_result = msi.check_access(ctx, coll, 'modify object', irods_types.BytesBuf()) modify_access = check_access_result['arguments'][2] if modify_access != b'\x01': - # TODO set to a lower read? # This allows us permission to copy the files if not set_acl_check(ctx, "recursive", "admin:read", coll, "Could not set ACL (admin:read) for collection: {}".format(coll)): return False diff --git a/tools/grant-readers-access-to-vault-packages.r b/tools/grant-readers-access-to-vault-packages.r new file mode 100644 index 000000000..db650a521 --- /dev/null +++ b/tools/grant-readers-access-to-vault-packages.r @@ -0,0 +1,9 @@ +#!/usr/bin/irule -F + +grantReadersAccessVaultPackages { + # Grant read- groups access to corresponding vault packages + *return = ""; + rule_vault_grant_readers_vault_access(*dryRun, *verbose, *return); +} +input *dryRun="", *verbose="" +output ruleExecOut diff --git a/tools/grant-readers-access-to-vault-packages.sh b/tools/grant-readers-access-to-vault-packages.sh new file mode 100755 index 000000000..ef81bb13d --- /dev/null +++ b/tools/grant-readers-access-to-vault-packages.sh @@ -0,0 +1,2 @@ +#!/bin/bash +irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/grant-readers-access-to-vault-packages.r '*dryRun="'$1'"' '*verbose="'$2'"' diff --git a/vault.py b/vault.py index d70fc8323..793143f55 100644 --- a/vault.py +++ b/vault.py @@ -37,6 +37,7 @@ 'rule_vault_enable_indexing', 'rule_vault_disable_indexing', 'rule_vault_process_status_transitions', + 'rule_vault_grant_readers_vault_access', 'api_vault_system_metadata', 'api_vault_collection_details', 'api_vault_get_package_by_reference', @@ -1130,6 +1131,139 @@ def set_vault_permissions(ctx, coll, target): return True +def reader_needs_access(ctx, group_name, coll): + """Return if research group has access to this group but readers do not""" + iter = genquery.row_iterator( + "COLL_ACCESS_USER_ID", + "COLL_NAME = '" + coll + "'", + genquery.AS_LIST, ctx + ) + reader_found = False + research_found = False + + for row in iter: + user_id = row[0] + user_name = user.name_from_id(ctx, user_id) + # Check if there are *any* readers + if user_name.startswith('read-'): + reader_found = True + elif user_name == group_name: + research_found = True + + return not reader_found and research_found + + +def set_reader_vault_permissions(ctx, group_name, zone, dry_run): + """Given a research group name, give reader group access to + vault packages if they don't have that access already. + + :param ctx: Combined type of a callback and rei struct + :param group_name: Research group name + :param zone: Zone + :param dry_run: Whether to only print which groups would be changed without changing them + + :return: Boolean whether completed successfully or there were errors. + """ + parts = group_name.split('-') + base_name = '-'.join(parts[1:]) + read_group_name = 'read-' + base_name + vault_group_name = constants.IIVAULTPREFIX + base_name + vault_path = "/" + zone + "/home/" + vault_group_name + no_errors = True + + # Do not change the permissions if there aren't any vault packages in this vault. 
+ if collection.empty(ctx, vault_path): + return True + + if reader_needs_access(ctx, group_name, vault_path): + # Grant the research group readers read-only access to the collection + # to enable browsing through the vault. + try: + if dry_run: + log.write(ctx, "Would have granted " + read_group_name + " read access to " + vault_path) + else: + msi.set_acl(ctx, "default", "admin:read", read_group_name, vault_path) + log.write(ctx, "Granted " + read_group_name + " read access to " + vault_path) + except msi.Error: + no_errors = False + log.write(ctx, "Failed to grant " + read_group_name + " read access to " + vault_path) + + iter = genquery.row_iterator( + "COLL_NAME", + "COLL_PARENT_NAME = '{}'".format(vault_path), + genquery.AS_LIST, ctx + ) + for row in iter: + target = row[0] + if reader_needs_access(ctx, group_name, target): + try: + if dry_run: + log.write(ctx, "Would have granted " + read_group_name + " read access to " + target) + else: + msi.set_acl(ctx, "recursive", "admin:read", read_group_name, target) + log.write(ctx, "Granted " + read_group_name + " read access to " + target) + except Exception: + no_errors = False + log.write(ctx, "Failed to set read permissions for <{}> on coll <{}>".format(read_group_name, target)) + + return no_errors + + +@rule.make(inputs=[0, 1], outputs=[2]) +def rule_vault_grant_readers_vault_access(ctx, dry_run, verbose): + """Rule for granting reader members of research groups access to vault packages in their + group if they don't have access already + + :param ctx: Combined type of a callback and rei struct + :param dry_run: Whether to only print which groups would be changed without making changes + :param verbose: Whether to be more verbose + + :return: String status of completed successfully ('0') or there were errors ('1') + """ + dry_run = (dry_run == '1') + verbose = (verbose == '1') + no_errors = True + + log.write(ctx, "grant_readers_vault_access started.") + + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is not rodsadmin") + return '1' + + if dry_run or verbose: + modes = [] + if dry_run: + modes.append("dry run") + if verbose: + modes.append("verbose") + log.write(ctx, "Running grant_readers_vault_access in {} mode.".format((" and ").join(modes))) + + zone = user.zone(ctx) + + # Get the group names + userIter = genquery.row_iterator( + "USER_GROUP_NAME", + "USER_TYPE = 'rodsgroup' AND USER_ZONE = '{}' AND USER_GROUP_NAME like 'research-%'".format(zone), + genquery.AS_LIST, + ctx) + + for row in userIter: + name = row[0] + if verbose: + log.write(ctx, "{}: checking permissions".format(name)) + if not set_reader_vault_permissions(ctx, name, zone, dry_run): + no_errors = False + + message = "" + if no_errors: + message = "grant_readers_vault_access completed successfully." + else: + message = "grant_readers_vault_access completed, with errors." + log.write(ctx, message) + + return '0' if no_errors else '1' + + @rule.make(inputs=range(4), outputs=range(4, 6)) def rule_vault_process_status_transitions(ctx, coll, new_coll_status, actor, previous_version): """Rule interface for processing vault status transition request. From 42dd5bc562cf0bd3fb6d228a5a8eedc7770380d7 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Fri, 2 Aug 2024 11:52:55 +0200 Subject: [PATCH 15/57] YDA-5864: ignore deleted users in schema transf. In the schema transformation code, ignore ACLs of non-existent users when copying ACLs from a parent. 
This is needed because iRODS keeps ACLs referring to deleted users / groups around in the database (see https://github.com/irods/irods/issues/7778). --- schema_transformation.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/schema_transformation.py b/schema_transformation.py index 1fd7a0984..35bc35dd9 100644 --- a/schema_transformation.py +++ b/schema_transformation.py @@ -137,6 +137,11 @@ def copy_acls_from_parent(ctx, path, recursive_flag): user_name = user.name_from_id(ctx, user_id) + # iRODS keeps ACLs for deleted users in the iCAT database (https://github.com/irods/irods/issues/7778), + # so we need to skip ACLs referring to users that no longer exist. + if user_name == "": + continue + if access_name == "own": log.write(ctx, "iiCopyACLsFromParent: granting own to <" + user_name + "> on <" + path + "> with recursiveFlag <" + recursive_flag + ">") msi.set_acl(ctx, recursive_flag, "own", user_name, path) From 9609b49d4efffd2e37d88ab3b93f18b8d696a882 Mon Sep 17 00:00:00 2001 From: Leonidas Triantafyllou Date: Tue, 6 Aug 2024 08:23:10 +0200 Subject: [PATCH 16/57] YDA-5711: use DAP expiry date notification configuration --- notifications.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/notifications.py b/notifications.py index 8d3015168..babf34997 100644 --- a/notifications.py +++ b/notifications.py @@ -481,9 +481,10 @@ def rule_process_data_access_token_expiry(ctx): exp_time = datetime.strptime(token['exp_time'], '%Y-%m-%d %H:%M:%S.%f') date_exp_time = exp_time - timedelta(hours=config.token_expiration_notification) r = relativedelta.relativedelta(date_exp_time, datetime.now().date()) + total_hours = r.years * 12 * 30 * 24 + r.months * 30 * 24 + r.days * 24 + r.hours - # Send notification if token expires in less than a day. - if r.years == 0 and r.months == 0 and r.days <= 1: + # Send notification if token expires in less than configured hours. + if total_hours <= config.token_expiration_notification: actor = 'system' target = str(user.from_str(ctx, token['user'])) message = "Data access password with label <{}> is expiring".format(token["label"]) From b310e6fc62342e300d0d56f424dae9612d0427f0 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Fri, 5 Jul 2024 15:38:11 +0200 Subject: [PATCH 17/57] Add tool for editing vault metadata This also includes a script for manually updating provenance logs. --- tools/edit-vault-metadata.py | 259 +++++++++++++++++++++++++++++++++ tools/log-provenance-action.r | 14 ++ tools/log-provenance-action.sh | 22 +++ 3 files changed, 295 insertions(+) create mode 100755 tools/edit-vault-metadata.py create mode 100644 tools/log-provenance-action.r create mode 100755 tools/log-provenance-action.sh diff --git a/tools/edit-vault-metadata.py b/tools/edit-vault-metadata.py new file mode 100755 index 000000000..d099f7ebb --- /dev/null +++ b/tools/edit-vault-metadata.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 + +""" + edit-vault-metadata : script for manually editing metadata of a data package + in the vault. + + By default, the script lets the vault ingest workflow handle ingestion of new metadata + into the vault. In case where that is not possible (e.g. because the vault group no longer + has a research group, because the category does not have a datamanager group, etc.), you + can use the --direct option to make the script update the vault metadata directly, bypassing + the normal vault ingest workflow. 
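    Example invocations (a sketch only; the vault collection path is a placeholder, the options are the ones defined by the argument parser below):

        ./edit-vault-metadata.py /tempZone/home/vault-mygroup/my-package[1234567890]
        ./edit-vault-metadata.py --direct -m "metadata corrected by admin" /tempZone/home/vault-mygroup/my-package[1234567890]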
+ + In direct mode, this script takes care of: + - Finding the current (latest) metadata file of the data package + - Downloading it + - Starting an editor to edit it + - Re-uploading the metadata file as a new version + - Setting the right ACLs + - Updating the provenance log of the data package +""" + +import argparse +import filecmp +import os +import re +import subprocess +import sys +import tempfile +import time +from typing import List, Tuple, Union + + +def get_args(): + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument( + 'collection', + help='Vault collection') + + parser.add_argument( + '-m', '--log-message', + default="metadata manually updated by technical admin", + required=False, + help="Message to be logged in the provenance log for this edit (only applies in direct mode)") + parser.add_argument( + '-d', '--direct', + action='store_true', + default=False, + help="Edit file directly in vault collection. This side-steps the normal ingestion process, but can be needed for vault groups without a research group, categories without a datamanager group, and other situations not support by the default ingestion process.") + + parsed_args = parser.parse_args() + + if not parsed_args.collection.startswith("/"): + sys.exit("Error: collection must be an absolute path.") + + return parsed_args + + +def start_editor(filename: str): + editor = os.environ.get('EDITOR', 'vim') + subprocess.call([editor, filename]) + + +def check_edited_file_changed(filename: str) -> bool: + return not filecmp.cmp(filename, filename + ".orig") + + +def get_latest_metadata_file(collection: str) -> Union[str, None]: + latest_timestamp = None + latest_filename = None + lines = subprocess.check_output(["ils", collection]) + for line in lines.decode("utf-8").split("\n"): + match = re.search(r"^ (yoda-metadata\[(\d+)\]\.json)\s*$", line) + if match and (latest_timestamp is None or match.group(2) + > latest_timestamp): + latest_filename = match.group(1) + latest_timestamp = match.group(2) + return latest_filename + + +def apply_acls(path: str, acls: List[Tuple[str, str]]): + for acl in acls: + retcode = subprocess.call(["ichmod", "-M", acl[1], acl[0], path]) + if retcode != 0: + sys.exit("Could not set ACL {}:{} for {}".format(acl[1], acl[0], path)) + + +def create_collection(path: str): + retcode = subprocess.call(["imkdir", path]) + if retcode != 0: + sys.exit("Error: could not create collection " + path) + + +def create_collection_and_apply_acls_recursively(path: str, acls: List[Tuple[str, str]]): + path_components = path.split("/") + for (level, _) in enumerate(path_components): + current_collection = "/".join(path_components[:level + 1]) + current_collection_exists = collection_exists(current_collection) + if level >= 2 and current_collection_exists: + apply_acls(current_collection, acls) + elif level >= 3 and not current_collection_exists: + create_collection(current_collection) + apply_acls(current_collection, acls) + + +def get_dataobject_acls(path: str) -> List[Tuple[str, str]]: + results = [] + lines = subprocess.check_output(["ils", "-A", path]) + for line in lines.decode("utf-8").split("\n"): + match = re.search(r"^ ACL - ([\S\s]+)$", line) + if match: + acl_line = match.group(1) + for acl_entry in acl_line.replace("read object", "read").replace("g:", "").split(): + (acl_group, acl_priv) = acl_entry.split(":") + acl_clean_group = acl_group.split("#")[0] + results.append((acl_clean_group, acl_priv)) + return results + + +def 
upload_new_metadata_file(local_filename: str, remote_filename: str): + print("Uploading {} to {}".format(local_filename, remote_filename)) + retcode = subprocess.call(["iput", local_filename, remote_filename]) + if retcode != 0: + sys.exit("Error: could not upload metadata file {} to {}.".format( + local_filename, + remote_filename)) + + +def download_metadata_file(destination_dir: str, remote_path: str) -> str: + local_path_edit = os.path.join(destination_dir, + os.path.basename(remote_path)) + retcode = subprocess.call(["iget", remote_path, local_path_edit]) + if retcode != 0: + sys.exit("Error: could not download metadata file {} to {}.".format( + remote_path, + local_path_edit)) + + local_path_orig = os.path.join(destination_dir, + os.path.basename(remote_path)) + ".orig" + retcode = subprocess.call(["iget", remote_path, local_path_orig]) + if retcode != 0: + sys.exit("Error: could not download metadata file {} to {}.".format( + remote_path, + local_path_orig)) + + return local_path_edit + + +def get_datamanager_vault_subcollection(datamanager_collection: str, vault_path: str): + vault_group = vault_path.split("/")[3] + return os.path.join(os.path.join(datamanager_collection, vault_group), os.path.basename(vault_path)) + + +def get_new_metadata_name(collection: str, zone_name: str, direct_mode: bool) -> str: + if direct_mode: + return os.path.join(collection, "yoda-metadata[{}].json".format(str(int(time.time())))) + + research_collection = get_research_collection_for_vault_path(collection) + if research_collection is None: + sys.exit("Error: cannot use default workflow. This vault group does not have a research group anymore. You can bypass the default workflow using --direct mode.") + research_group = get_research_group_for_research_collection(research_collection) + category = get_category_research_group(research_group) + dm_collection = get_datamanager_collection_for_category(category, zone_name) + if dm_collection is None: + sys.exit("Error: cannot use default workflow. The research group for this vault group does not have a datamanager group. 
You can bypass the default workflow using --direct mode.") + dm_subcollection = get_datamanager_vault_subcollection(dm_collection, collection) + return os.path.join(dm_subcollection, "yoda-metadata.json") + + +def update_provenance_log(vault_collection: str, log_message: str): + retcode = subprocess.call(["/etc/irods/yoda-ruleset/tools/log-provenance-action.sh", vault_collection, "rods", log_message]) + if retcode != 0: + sys.exit("Error: could not update provenance log for {}.".format(vault_collection)) + + +def collection_exists(path: str) -> bool: + result = subprocess.run(["iquest", "%s", "--no-page", "SELECT COLL_NAME WHERE COLL_NAME ='{}'".format(path)], capture_output=True, text=True) + if result.returncode == 0 and path in result.stdout: + return True + elif result.returncode == 1 and "CAT_NO_ROWS_FOUND" in result.stdout: + return False + else: + sys.exit("Unexpected result when checking for existence of collection " + path) + + +def get_research_collection_for_vault_path(path: str) -> str: + if not path.startswith("/"): + sys.exit("Error: need absolute vault path to determine research group.") + vault_main_collection = "/".join(path.split("/")[:4]) + research_collection = vault_main_collection.replace("vault-", "research-", 1) + return research_collection + + +def get_research_group_for_research_collection(path: str) -> str: + if not path.startswith("/"): + sys.exit("Error: need absolute research collectoin path to determine research group.") + return path.split("/")[3] + + +def get_zone_name_from_path(path: str) -> str: + if not path.startswith("/"): + sys.exit("Error: need absolute research collection path to determine research group.") + return path.split("/")[1] + + +def get_research_group_for_vault_path(path: str) -> Union[str, None]: + research_collection = get_research_collection_for_vault_path(path) + if collection_exists(research_collection): + return get_research_group_for_research_collection(path) + else: + return None + + +def get_datamanager_collection_for_category(category: str, zone_name: str) -> Union[str, None]: + datamanager_collection = "/{}/home/datamanager-{}".format(zone_name, category) + return datamanager_collection if collection_exists(datamanager_collection) else None + + +def get_category_research_group(research_group: str) -> str: + result = subprocess.run(["iquest", "%s", "--no-page", "SELECT META_USER_ATTR_VALUE WHERE USER_NAME = '{}' and META_USER_ATTR_NAME = 'category'".format(research_group)], capture_output=True, text=True) + if result.returncode == 0: + return result.stdout.split("\n")[0] + else: + sys.exit("Error: could not find category for research group " + research_group) + + +def main(): + args = get_args() + if not collection_exists(args.collection): + sys.exit("Error: collection {} does not exist.".format(args.collection)) + zone_name = get_zone_name_from_path(args.collection) + with tempfile.TemporaryDirectory() as tempdir: + metadata_file = get_latest_metadata_file(args.collection) + metadata_file_path = os.path.join(args.collection, metadata_file) + metadata_acls = get_dataobject_acls(metadata_file_path) + print("Metadata data object: " + metadata_file_path) + local_filename = download_metadata_file(tempdir, metadata_file_path) + start_editor(local_filename) + if check_edited_file_changed(local_filename): + remote_filename = get_new_metadata_name(args.collection, zone_name, args.direct) + if not args.direct: + dm_subcollection = os.path.dirname(remote_filename) + print("Creating datamanager subcollection for vault group " + 
dm_subcollection + " recursively.") + create_collection_and_apply_acls_recursively(dm_subcollection, [("rods", "own")]) + print("Uploading new version of metadata.") + upload_new_metadata_file(local_filename, remote_filename) + if args.direct: + print("Applying ACLs to new metadata.") + apply_acls(remote_filename, metadata_acls) + print("Updating provenance log ...") + update_provenance_log(args.collection, args.log_message) + print("Done.") + else: + print("Not updating metadata, since it wasn't changed.") + + +if __name__ == "__main__": + main() diff --git a/tools/log-provenance-action.r b/tools/log-provenance-action.r new file mode 100644 index 000000000..6cfea5bfe --- /dev/null +++ b/tools/log-provenance-action.r @@ -0,0 +1,14 @@ +#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F +# +# Logs an action in the provenance log +# +import genquery + +def main(rule_args, callback, rei): + collection = global_vars["*collection"].strip('"') + actor = global_vars["*actor"].strip('"') + action = global_vars["*action"].strip('"') + callback.rule_provenance_log_action(actor, collection, action) + +INPUT *collection="", *actor="rods", *action="" +OUTPUT ruleExecOut diff --git a/tools/log-provenance-action.sh b/tools/log-provenance-action.sh new file mode 100755 index 000000000..960c8202f --- /dev/null +++ b/tools/log-provenance-action.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +COLLECTION="$1" +ACTOR="$2" +ACTION="$3" + +if [ -z "$COLLECTION" ] +then echo "Error: missing collection parameter value." + exit 1 +fi + +if [ -z "$ACTOR" ] +then echo "Error: missing actor parameter value." + exit 1 +fi + +if [ -z "$ACTION" ] +then echo "Error: missing action parameter value." + exit 1 +fi + +/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F /etc/irods/yoda-ruleset/tools/log-provenance-action.r "*collection=$COLLECTION" "*actor=$ACTOR" "*action=$ACTION" From e93427ac2d6c1790705740885ab4811514c277cd Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Tue, 6 Aug 2024 16:36:23 +0200 Subject: [PATCH 18/57] Add integration tests latest vault metadata Add basic integration tests for meta.get_latest_vault_metadata_path --- integration_tests.py | 79 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/integration_tests.py b/integration_tests.py index 68229f49b..1a7d9ffd5 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -7,12 +7,14 @@ __all__ = ['rule_run_integration_tests'] import json +import os import re import time import traceback import uuid import folder +import meta import schema from util import avu, collection, config, constants, data_object, group, log, msi, resource, rule, user @@ -92,6 +94,7 @@ def _test_msvc_rmw_avu_collection(ctx, rmw_attributes): collection.remove(ctx, tmp_object) return result + def _test_avu_set_collection(ctx, catch): # Test setting avu with catch and without catch tmp_object = _create_tmp_collection(ctx) @@ -139,6 +142,58 @@ def _test_folder_set_get_last_run(ctx): return result, found, last_run +def _test_schema_active_schema_deposit_from_default(ctx): + avu.rm_from_group(ctx, "deposit-pilot", "schema_id", "dag-0") + result = schema.get_active_schema_path(ctx, "/tempZone/home/deposit-pilot") + avu.associate_to_group(ctx, "deposit-pilot", "schema_id", "dag-0") + return result + + +def _test_schema_active_schema_research_from_default(ctx): + avu.rm_from_group(ctx, "research-core-2", "schema_id", "core-2") + result = schema.get_active_schema_path(ctx, "/tempZone/home/research-core-2") + avu.associate_to_group(ctx, 
"research-core-2", "schema_id", "core-2") + return result + + +def _test_schema_active_schema_vault_research_override(ctx): + avu.associate_to_group(ctx, "vault-core-2", "schema_id", "integration-test-schema-1") + result = schema.get_active_schema_path(ctx, "/tempZone/home/vault-core-2") + avu.rm_from_group(ctx, "vault-core-2", "schema_id", "integration-test-schema-1") + return result + + +def _test_schema_active_schema_vault_without_research(ctx): + ctx.uuGroupAdd("vault-without-research", "test-automation", "something", "", "", "", "", "", "", "") + result = schema.get_active_schema_path(ctx, "/tempZone/home/vault-without-research") + ctx.uuGroupRemove("vault-without-research", "", "") + return result + + +def _test_get_latest_vault_metadata_path_empty(ctx): + tmp_collection = _create_tmp_collection(ctx) + latest_file = meta.get_latest_vault_metadata_path(ctx, tmp_collection) + collection.remove(ctx, tmp_collection) + return latest_file is None + + +def _test_get_latest_vault_metadata_path_normal(ctx): + tmp_collection = _create_tmp_collection(ctx) + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869873].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869875].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869877].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869876].json"), "test") + data_object.write(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869874].json"), "test") + latest_file = meta.get_latest_vault_metadata_path(ctx, tmp_collection) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869873].json")) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869875].json")) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869877].json")) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869876].json")) + data_object.remove(ctx, os.path.join(tmp_collection, "yoda-metadata[1722869874].json")) + collection.remove(ctx, tmp_collection) + return latest_file == os.path.join(tmp_collection, "yoda-metadata[1722869877].json") + + def _test_folder_secure_func(ctx, func): """Create tmp collection, apply func to it and get result, and clean up. Used for testing functions that modify avu/acls related to folder secure. 
@@ -323,6 +378,30 @@ def _test_folder_secure_func(ctx, func): {"name": "folder.determine_new_vault_target.invalid", "test": lambda ctx: folder.determine_new_vault_target(ctx, "/tempZone/home/not-research-group-not-exist/folder-not-exist"), "check": lambda x: x == ""}, + {"name": "groups.rule_group_expiration_date_validate.1", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("", ""), + "check": lambda x: x['arguments'][1] == 'true'}, + {"name": "groups.rule_group_expiration_date_validate.2", + "test": lambda ctx: ctx.rule_group_expiration_date_validate(".", ""), + "check": lambda x: x['arguments'][1] == 'true'}, + {"name": "groups.rule_group_expiration_date_validate.3", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("abc", ""), + "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "groups.rule_group_expiration_date_validate.4", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("2020-02-02", ""), + "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "groups.rule_group_expiration_date_validate.5", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("2044-01-32", ""), + "check": lambda x: x['arguments'][1] == 'false'}, + {"name": "groups.rule_group_expiration_date_validate.6", + "test": lambda ctx: ctx.rule_group_expiration_date_validate("2044-02-26", ""), + "check": lambda x: x['arguments'][1] == 'true'}, + {"name": "meta.get_latest_vault_metadata_path.empty", + "test": lambda ctx: _test_get_latest_vault_metadata_path_empty(ctx), + "check": lambda x: x}, + {"name": "meta.get_latest_vault_metadata_path.normal", + "test": lambda ctx: _test_get_latest_vault_metadata_path_normal(ctx), + "check": lambda x: x}, {"name": "policies.check_anonymous_access_allowed.local", "test": lambda ctx: ctx.rule_check_anonymous_access_allowed("127.0.0.1", ""), "check": lambda x: x['arguments'][1] == 'true'}, From eaac61975e4311797bddbea9f07253eb27321578 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Wed, 7 Aug 2024 11:45:58 +0200 Subject: [PATCH 19/57] YDA-5859: prevent unrecoverable error when creating DataCite JSON with invalid affiliation --- json_datacite.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/json_datacite.py b/json_datacite.py index b1e248a1b..04bbefafa 100644 --- a/json_datacite.py +++ b/json_datacite.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Functions for transforming Yoda JSON to DataCite 4.4 JSON.""" -__copyright__ = 'Copyright (c) 2019-2023, Utrecht University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' from dateutil import parser @@ -218,13 +218,18 @@ def get_funders(combi): def get_creators(combi): - """Return creator information in datacite format.""" + """Return creator information in DataCite format. 
+ + :param combi: Combined JSON file that holds both user and system metadata + + :returns: JSON element with creators in DataCite format + """ all_creators = [] for creator in combi.get('Creator', []): affiliations = [] for aff in creator.get('Affiliation', []): - if isinstance(aff, dict): + if isinstance(aff, dict) and len(aff) > 0: if "Affiliation_Identifier" in aff and len(aff["Affiliation_Identifier"]): affiliations.append({"name": aff['Affiliation_Name'], "affiliationIdentifier": '{}'.format(aff['Affiliation_Identifier']), @@ -255,14 +260,14 @@ def get_contributors(combi): :param combi: Combined JSON file that holds both user and system metadata - :returns: XML element with contributors in DataCite format + :returns: JSON element with contributors in DataCite format """ all = [] # 1) Contributor for person in combi.get('Contributor', []): affiliations = [] for aff in person.get('Affiliation', []): - if isinstance(aff, dict) and len(aff): + if isinstance(aff, dict) and len(aff) > 0: if "Affiliation_Identifier" in aff and len(aff["Affiliation_Identifier"]): affiliations.append({"name": aff['Affiliation_Name'], "affiliationIdentifier": '{}'.format(aff['Affiliation_Identifier']), From 26a7d1b19cb79777d3279620e8d6ee4223524c64 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Wed, 7 Aug 2024 15:01:02 +0200 Subject: [PATCH 20/57] YDA-5859: remove empty objects from metadata --- meta_form.py | 6 ++++-- unit-tests/test_util_misc.py | 13 ++++++++++++- util/misc.py | 21 +++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/meta_form.py b/meta_form.py index 619a30db3..2de46d63f 100644 --- a/meta_form.py +++ b/meta_form.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """JSON metadata form handling.""" -__copyright__ = 'Copyright (c) 2019-2022, Utrecht University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import re @@ -296,7 +296,6 @@ def save(ctx, coll, metadata): is_vault = space is pathutil.Space.VAULT if is_vault: # It's a vault path - set up a staging area in the datamanager collection. - ret = ctx.iiDatamanagerGroupFromVaultGroup(group, '') datamanager_group = ret['arguments'][1] if datamanager_group == '': @@ -312,6 +311,9 @@ def save(ctx, coll, metadata): # Use staging area instead of trying to write to the vault directly. json_path = '{}/{}'.format(tmp_coll, constants.IIJSONMETADATA) + # Remove empty objects from metadata. + metadata = misc.remove_empty_objects(metadata) + # Add metadata schema id to JSON. 
meta.metadata_set_schema_id(metadata, schema_.get_active_schema_id(ctx, json_path)) diff --git a/unit-tests/test_util_misc.py b/unit-tests/test_util_misc.py index 4b81c7676..45ac1343b 100644 --- a/unit-tests/test_util_misc.py +++ b/unit-tests/test_util_misc.py @@ -6,11 +6,12 @@ import sys import time +from collections import OrderedDict from unittest import TestCase sys.path.append('../util') -from misc import human_readable_size, last_run_time_acceptable +from misc import human_readable_size, last_run_time_acceptable, remove_empty_objects class UtilMiscTest(TestCase): @@ -50,3 +51,13 @@ def test_human_readable_size(self): self.assertEqual(output, "100.0 PiB") output = human_readable_size(3931462330709348188) self.assertEqual(output, "3.41 EiB") + + def test_remove_empty_objects(self): + d = OrderedDict({"key1": None, "key2": "", "key3": {}, "key4": []}) + self.assertEqual(remove_empty_objects(d), OrderedDict({})) + d = OrderedDict({"key1": "value1", "key2": {"key1": None, "key2": "", "key3": {}, "key4": []}}) + self.assertEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) + d = OrderedDict({"key1": "value1", "key2": {"key1": None, "key2": "", "key3": {}, "key4": [], "key5": "value5"}}) + self.assertEqual(remove_empty_objects(d), OrderedDict({"key1": "value1", "key2": {"key5": "value5"}})) + d = OrderedDict({"key1": "value1", "key2": [{}]}) + self.assertEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) diff --git a/util/misc.py b/util/misc.py index 2ae4169d6..12df2a0af 100644 --- a/util/misc.py +++ b/util/misc.py @@ -6,6 +6,7 @@ import math import time +from collections import OrderedDict def last_run_time_acceptable(coll, found, last_run, config_backoff_time): @@ -29,3 +30,23 @@ def human_readable_size(size_bytes): p = math.pow(1024, i) s = round(size_bytes / p, 2) return '{} {}'.format(s, size_name[i]) + + +def remove_empty_objects(d): + """Remove empty objects (None, '', {}, []) from OrderedDict.""" + if isinstance(d, dict): + # Create OrderedDict to maintain order. + cleaned_dict = OrderedDict() + for k, v in d.items(): + # Recursively remove empty objects. + cleaned_value = remove_empty_objects(v) + # Only add non-empty values. + if cleaned_value not in (None, '', {}, []): + cleaned_dict[k] = cleaned_value + return cleaned_dict + elif isinstance(d, list): + # Clean lists by filtering out empty objects. + return [remove_empty_objects(item) for item in d if remove_empty_objects(item) not in (None, '', {}, [])] + else: + # Return the value abecause it is not a dict or list. 
+ return d From 654300f12bc7bc8d831b8514b5395d195088f961 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Wed, 7 Aug 2024 16:02:19 +0200 Subject: [PATCH 21/57] YDA-5859: use assertDictEqual when comparing dictionaries in test cases --- unit-tests/test_util_misc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unit-tests/test_util_misc.py b/unit-tests/test_util_misc.py index 45ac1343b..cddbe5fcd 100644 --- a/unit-tests/test_util_misc.py +++ b/unit-tests/test_util_misc.py @@ -54,10 +54,10 @@ def test_human_readable_size(self): def test_remove_empty_objects(self): d = OrderedDict({"key1": None, "key2": "", "key3": {}, "key4": []}) - self.assertEqual(remove_empty_objects(d), OrderedDict({})) + self.assertDictEqual(remove_empty_objects(d), OrderedDict({})) d = OrderedDict({"key1": "value1", "key2": {"key1": None, "key2": "", "key3": {}, "key4": []}}) - self.assertEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) + self.assertDictEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) d = OrderedDict({"key1": "value1", "key2": {"key1": None, "key2": "", "key3": {}, "key4": [], "key5": "value5"}}) - self.assertEqual(remove_empty_objects(d), OrderedDict({"key1": "value1", "key2": {"key5": "value5"}})) + self.assertDictEqual(remove_empty_objects(d), OrderedDict({"key1": "value1", "key2": {"key5": "value5"}})) d = OrderedDict({"key1": "value1", "key2": [{}]}) - self.assertEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) + self.assertDictEqual(remove_empty_objects(d), OrderedDict({"key1": "value1"})) From b5b7ce89238872195222d83c75b81a91f758fdf5 Mon Sep 17 00:00:00 2001 From: kaur16 <126662478+kaur16@users.noreply.github.com> Date: Fri, 9 Aug 2024 13:20:42 +0200 Subject: [PATCH 22/57] YDA-5808: add all the versions of data package to vault space Co-authored-by: Lazlo Westerhof --- publication.py | 2 +- vault.py | 71 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/publication.py b/publication.py index 2ac7e4d75..4b0aafc6a 100644 --- a/publication.py +++ b/publication.py @@ -1505,7 +1505,7 @@ def get_all_versions(ctx, path, doi): :param ctx: Combined type of a callback and rei struct :param path: Path of the published data package - :param doi: Version DOI of the selected publication + :param doi: Base DOI of the selected publication :return: Dict of related version DOIs """ diff --git a/vault.py b/vault.py index 793143f55..18ae49d34 100644 --- a/vault.py +++ b/vault.py @@ -485,22 +485,6 @@ def api_vault_system_metadata(ctx, coll): landinpage_url = row[0] system_metadata["Landingpage"] = "{}".format(landinpage_url, landinpage_url) - # Check for previous version. - previous_version = get_previous_version(ctx, coll) - if previous_version: - previous_version_doi = get_doi(ctx, previous_version) - system_metadata["Persistent Identifier DOI"] = persistent_identifier_doi = "previous version: {}".format(previous_version_doi, previous_version_doi) - - # Persistent Identifier DOI. - package_doi = get_doi(ctx, coll) - - if package_doi: - if previous_version: - persistent_identifier_doi = "{} (previous version: {})".format(package_doi, package_doi, previous_version_doi, previous_version_doi) - else: - persistent_identifier_doi = "{}".format(package_doi, package_doi) - system_metadata["Persistent Identifier DOI"] = persistent_identifier_doi - # Data Package Reference. 
data_package_reference = "" iter = genquery.row_iterator( @@ -561,6 +545,46 @@ def get_coll_vault_status(ctx, path, org_metadata=None): return constants.vault_package_state.EMPTY +def get_all_published_versions(ctx, path): + """Get all published versions of a data package.""" + base_doi = get_doi(ctx, path, 'base') + package_doi = get_doi(ctx, path) + coll_parent_name = path.rsplit('/', 1)[0] + + org_publ_info, data_packages, grouped_base_dois = get_all_doi_versions(ctx, coll_parent_name) + + count = 0 + all_versions = [] + + for data in data_packages: + if data[2] == package_doi: + count += 1 + + if count == 1: # Base DOI does not exist as it is first version of the publication + # Convert the date into two formats for display and tooltip (Jan 1, 1990 and 1990-01-01 00:00:00) + data_packages = [[x[0], datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime("%b %d, %Y"), x[2], + datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime('%Y-%m-%d %H:%M:%S%z'), x[3]] for x in data_packages] + + for item in data_packages: + if item[2] == package_doi: + all_versions.append([item[1], item[2], item[3]]) + else: # Base DOI exists + # Sort by publication date + sorted_publ = [sorted(x, key=lambda x: datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f"), reverse=True) for x in grouped_base_dois] + + sorted_publ = [element for innerList in sorted_publ for element in innerList] + + # Convert the date into two formats for display and tooltip (Jan 1, 1990 and 1990-01-01 00:00:00) + sorted_publ = [[x[0], datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime("%b %d, %Y"), x[2], + datetime.strptime(x[1], "%Y-%m-%dT%H:%M:%S.%f").strftime('%Y-%m-%d %H:%M:%S%z'), x[3]] for x in sorted_publ] + + for item in sorted_publ: + if item[0] == base_doi: + all_versions.append([item[1], item[2], item[3]]) + + return base_doi, package_doi, all_versions + + @api.make() def api_vault_collection_details(ctx, path): """Return details of a vault collection. @@ -605,6 +629,8 @@ def api_vault_collection_details(ctx, path): return {'member_type': member_type, 'is_datamanager': is_datamanager} else: metadata = True + # Retreive all published versions + base_doi, package_doi, all_versions = get_all_published_versions(ctx, path) # Check if a vault action is pending. vault_action_pending = False @@ -654,7 +680,10 @@ def api_vault_collection_details(ctx, path): "has_datamanager": has_datamanager, "is_datamanager": is_datamanager, "vault_action_pending": vault_action_pending, - "research_group_access": research_group_access + "research_group_access": research_group_access, + "all_versions": all_versions, + "base_doi": base_doi, + "package_doi": package_doi } if config.enable_data_package_archive: import vault_archive @@ -1462,17 +1491,21 @@ def get_approver(ctx, path): return None -def get_doi(ctx, path): +def get_doi(ctx, path, doi='version'): """Get the DOI of a data package in the vault. 
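A short note on how the new parameter selects the underlying metadata attribute (names as used in the query below; any value other than 'base' falls back to 'version'):

    # get_doi(ctx, coll)         -> reads org_publication_versionDOI
    # get_doi(ctx, coll, 'base') -> reads org_publication_baseDOI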
:param ctx: Combined type of a callback and rei struct :param path: Vault package to get the DOI of + :param doi: 'base' or 'version' to retrieve required DOI :return: Data package DOI or None """ + if doi != 'base': + doi = 'version' + iter = genquery.row_iterator( "META_COLL_ATTR_VALUE", - "COLL_NAME = '%s' AND META_COLL_ATTR_NAME = 'org_publication_versionDOI'" % (path), + "COLL_NAME = '{}' AND META_COLL_ATTR_NAME = 'org_publication_{}DOI'".format(path, doi), genquery.AS_LIST, ctx ) From 99c7d0a4ec022e9bd7c303bc458e8c98a4a3cf3f Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Thu, 8 Aug 2024 11:14:03 +0200 Subject: [PATCH 23/57] YDA-5857: add user metadata indicating user is invited in SRAM --- groups.py | 3 ++- util/msi.py | 3 ++- uuGroupPolicyChecks.r | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/groups.py b/groups.py index 6845b88da..5343e1e01 100644 --- a/groups.py +++ b/groups.py @@ -1069,7 +1069,8 @@ def group_user_add(ctx, username, group_name): sram.invitation_mail_group_add_user(ctx, group_name, username.split('#')[0], co_identifier) elif config.sram_flow == 'invitation': sram.sram_put_collaboration_invitation(ctx, group_name, username.split('#')[0], co_identifier) - + # Mark user as invited. + msi.sudo_obj_meta_set(ctx, username, "-u", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") return api.Result.ok() else: return api.Error('policy_error', message) diff --git a/util/msi.py b/util/msi.py index 882fdac7d..f5f4b8280 100644 --- a/util/msi.py +++ b/util/msi.py @@ -6,7 +6,7 @@ all errors to unambiguous Python exceptions. """ -__copyright__ = 'Copyright (c) 2019-2023, Utrecht University' +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' import irods_types @@ -136,6 +136,7 @@ def _make_exception(name, message): rmw_avu, RmwAvuError = make('_rmw_avu', 'Could not remove metadata to object') sudo_obj_acl_set, SudoObjAclSetError = make('SudoObjAclSet', 'Could not set ACLs as admin') +sudo_obj_meta_set, SudoObjMetaSetError = make('SudoObjMetaSet', 'Could not set metadata as admin') touch, TouchError = make('_touch', 'Could not update the data object or collection') obj_stat, ObjStatError = make('ObjStat', 'Could not get the stat of data object or collection') diff --git a/uuGroupPolicyChecks.r b/uuGroupPolicyChecks.r index 7a649644d..54d08ee04 100644 --- a/uuGroupPolicyChecks.r +++ b/uuGroupPolicyChecks.r @@ -639,10 +639,13 @@ uuUserPolicyCanUserModify(*actor, *userName, *attribute, *allowed, *reason) { } else { *reason = "Cannot modify settings of other user."; } + # User SRAM invitation + } else if (*attribute == "org_sram_invited") { + *allowed = 1; # User notifications } else if (trimr(*attribute, "_") == "org_notification") { *allowed = 1; } else { - *reason = "Invalid user attribute name."; + *reason = "Invalid user attribute name."; } } From 7c08c97520889184db1ca85fb17e07a044884d93 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Thu, 8 Aug 2024 16:19:53 +0200 Subject: [PATCH 24/57] YDA-5857: return SRAM invitation status in groups API --- groups.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/groups.py b/groups.py index 5343e1e01..a3759b5d1 100644 --- a/groups.py +++ b/groups.py @@ -64,7 +64,8 @@ def getGroupsData(ctx): "name": name, "managers": [], "members": [], - "read": [] + "read": [], + "invited": [] } groups[name] = group @@ -113,6 +114,22 @@ def getGroupsData(ctx): except KeyError: pass + # Third query: obtain list of invited 
SRAM users + if config.enable_sram: + iter = genquery.row_iterator( + "META_USER_ATTR_VALUE, USER_NAME, USER_ZONE", + "USER_TYPE != 'rodsgroup' AND META_USER_ATTR_NAME = '{}'".format(constants.UUORGMETADATAPREFIX + "sram_invited"), + genquery.AS_LIST, ctx + ) + for row in iter: + name = row[0] + user = row[1] + "#" + row[2] + try: + group = groups[name] + group["invited"].append(user) + except KeyError: + pass + return groups.values() @@ -392,6 +409,10 @@ def api_group_data(ctx): for member in group['read']: members[member] = {'access': 'reader'} + # Invited SRAM users + for member in group['invited']: + members[member]['sram'] = 'invited' + if not group_hierarchy.get(group['category']): group_hierarchy[group['category']] = OrderedDict() From e0172eacd91625936048aac17d15029abe0d1b91 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Tue, 13 Aug 2024 12:23:49 +0200 Subject: [PATCH 25/57] Add missing priv group type managed via group manager --- groups.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/groups.py b/groups.py index a3759b5d1..de3d3f668 100644 --- a/groups.py +++ b/groups.py @@ -378,6 +378,10 @@ def api_group_data(ctx): # Filter groups (only return groups user is part of), convert to json and write to stdout. groups = list(filter(lambda group: full_name in group['read'] + group['members'] or group['category'] in categories, groups)) + # Only process group types managed via group manager + managed_prefixes = ("priv-", "deposit-", "research-", "grp-", "datamanager-", "datarequests-", "intake-") + groups = list(filter(lambda group: group['name'].startswith(managed_prefixes), groups)) + # Sort groups on name. groups = sorted(groups, key=lambda d: d['name']) From fcb97ae31d14ad256ef575145a1002b84bfd9f7e Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Tue, 13 Aug 2024 12:43:32 +0200 Subject: [PATCH 26/57] API tests: ensure privileged users have access to privilege groups (backport to Yoda 1.9) --- tests/features/api/api_group.feature | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/features/api/api_group.feature b/tests/features/api/api_group.feature index 202165e66..dcb248e5c 100644 --- a/tests/features/api/api_group.feature +++ b/tests/features/api/api_group.feature @@ -8,10 +8,12 @@ Feature: Group API And group exists Examples: - | user | group | - | researcher | research-initial | - | researcher | research-initial1 | - | datamanager | datamanager-test-automation | + | user | group | + | researcher | research-initial | + | groupmanager | research-initial | + | functionaladminpriv | research-initial | + | datamanager | datamanager-test-automation | + | technicaladmin | priv-category-add | Scenario Outline: Group categories @@ -179,7 +181,7 @@ Feature: Group API Given user technicaladmin is authenticated And the Yoda API for processing csv group data API is queried for data "" Then the response status code is "400" - + Examples: | group_name | | csv-missing-header | From 6605e7ca12c8f3ad3c1455b6138fe3a71ab2ad84 Mon Sep 17 00:00:00 2001 From: kaur16 <126662478+kaur16@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:18:54 +0200 Subject: [PATCH 27/57] YDA-5865: handle invalid UTF-8 sequence in file Added errors='replace' parameter in decode function to replace invalid UTF-8 sequences in file. 
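For reference, a minimal illustration of the new decode behaviour: undecodable byte sequences become U+FFFD replacement characters instead of raising UnicodeDecodeError.

# 0xFE can never occur in valid UTF-8, so it is replaced rather than raising.
assert b'lorem \xfe ipsum'.decode('utf-8', errors='replace') == u'lorem \ufffd ipsum'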
--- util/jsonutil.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/jsonutil.py b/util/jsonutil.py index 6d775c741..114ecf798 100644 --- a/util/jsonutil.py +++ b/util/jsonutil.py @@ -69,7 +69,7 @@ def _promote_strings(json_data): :returns: JSON structure with UTF-8 encoded strings transformed to unicode strings """ return _fold(json_data, - str=lambda x: x.decode('utf-8'), + str=lambda x: x.decode('utf-8', errors='replace'), OrderedDict=lambda x: OrderedDict([(k.decode('utf-8'), v) for k, v in x.items()]), dict=lambda x: OrderedDict([(k.decode('utf-8'), v) for k, v in x.items()])) From de372a5cfb6ff025750232f02e31fa6b9a2578b7 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Tue, 13 Aug 2024 16:09:51 +0200 Subject: [PATCH 28/57] YDA-5857: sync SRAM invitation status --- groups.py | 45 +++++++++++++++++++++++++++++++-------------- util/msi.py | 1 + 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/groups.py b/groups.py index de3d3f668..f7a477ae7 100644 --- a/groups.py +++ b/groups.py @@ -1226,10 +1226,11 @@ def rule_group_sram_sync(ctx): members = group['members'] + group['read'] managers = group['managers'] description = group['description'] if 'description' in group else '' + invited = group['invited'] log.write(ctx, "Sync group {} with SRAM".format(group_name)) - sram_group, co_identifier = sram_enabled(ctx, group_name) + # Post collaboration group is not yet already SRAM enabled. if not sram_group: response_sram = sram.sram_post_collaboration(ctx, group_name, description) @@ -1252,27 +1253,43 @@ def rule_group_sram_sync(ctx): log.write(ctx, "Sync members of group {} with SRAM".format(group_name)) for member in members: - # Validate email + # Validate email. if not yoda_names.is_email_username(member): log.write(ctx, "User {} cannot be added to group {} because user email is invalid".format(member, group_name)) continue - if member.split('#')[0] not in co_members: + # Check if member is invited. + if member in invited: + if member.split('#')[0] in co_members: + log.write(ctx, "User {} added to group {}".format(member, group_name)) + # Remove invitation metadata. + msi.sudo_obj_meta_remove(ctx, member, "-u", "", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") + else: + log.write(ctx, "User {} already invited to group {}".format(member, group_name)) + continue + + # Not invited and not yet in the CO. + if member not in invited and member.split('#')[0] not in co_members: if config.sram_flow == 'join_request': sram.invitation_mail_group_add_user(ctx, group_name, member.split('#')[0], co_identifier) - log.write(ctx, "User {} added to group {}".format(member, group_name)) + msi.sudo_obj_meta_set(ctx, member, "-u", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") + log.write(ctx, "User {} invited to group {}".format(member, group_name)) + continue elif config.sram_flow == 'invitation': sram.sram_put_collaboration_invitation(ctx, group_name, member.split('#')[0], co_identifier) - log.write(ctx, "User {} added to group {}".format(member, group_name)) - else: - if member in managers: - uid = sram.sram_get_uid(ctx, co_identifier, member) - if uid == '': - log.write(ctx, "Something went wrong getting the SRAM user id for user {} of group {}".format(member, group_name)) + msi.sudo_obj_meta_set(ctx, member, "-u", constants.UUORGMETADATAPREFIX + "sram_invited", group_name, "", "") + log.write(ctx, "User {} invited to group {}".format(member, group_name)) + continue + + # Member is group manager and in the CO. 
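Summarised, the per-member handling in this loop amounts to the following decision; a pure-Python sketch with the actual SRAM and metadata calls left out:

def sram_sync_action(is_invited, in_co, is_manager):
    if is_invited and in_co:
        return 'clear invited flag'           # invitation was accepted
    if is_invited and not in_co:
        return 'wait'                         # already invited, nothing to do yet
    if not in_co:
        return 'invite and set invited flag'  # join_request or invitation flow
    if is_manager:
        return 'promote member to manager in SRAM'
    return 'no action'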
+ if member in managers and member.split('#')[0] in co_members: + uid = sram.sram_get_uid(ctx, co_identifier, member) + if uid == '': + log.write(ctx, "Something went wrong getting the SRAM user id for user {} of group {}".format(member, group_name)) + else: + if sram.sram_update_collaboration_membership(ctx, co_identifier, uid, "manager"): + log.write(ctx, "Updated {} user to manager of group {}".format(member, group_name)) else: - if sram.sram_update_collaboration_membership(ctx, co_identifier, uid, "manager"): - log.write(ctx, "Updated {} user to manager of group {}".format(member, group_name)) - else: - log.write(ctx, "Something went wrong updating {} user to manager of group {} in SRAM".format(member, group_name)) + log.write(ctx, "Something went wrong updating {} user to manager of group {} in SRAM".format(member, group_name)) log.write(ctx, "Finished syncing groups with SRAM") diff --git a/util/msi.py b/util/msi.py index f5f4b8280..37f938b77 100644 --- a/util/msi.py +++ b/util/msi.py @@ -137,6 +137,7 @@ def _make_exception(name, message): sudo_obj_acl_set, SudoObjAclSetError = make('SudoObjAclSet', 'Could not set ACLs as admin') sudo_obj_meta_set, SudoObjMetaSetError = make('SudoObjMetaSet', 'Could not set metadata as admin') +sudo_obj_meta_remove, SudoObjMetaRemoveError = make('SudoObjMetaRemove', 'Could not remove metadata as admin') touch, TouchError = make('_touch', 'Could not update the data object or collection') obj_stat, ObjStatError = make('ObjStat', 'Could not get the stat of data object or collection') From e99780858265681ee6d4cb87955767181fa71b32 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Thu, 15 Aug 2024 16:07:34 +0200 Subject: [PATCH 29/57] =?UTF-8?q?YDA-5728:=20reduce=20queries=20for=20cate?= =?UTF-8?q?gory=20statistics=20from=20O(n=C2=B2)=20to=20O(1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- resources.py | 116 ++++++++++++++++++++++++++------------------------- 1 file changed, 60 insertions(+), 56 deletions(-) diff --git a/resources.py b/resources.py index 9f715bf8c..1b3a2b6f6 100644 --- a/resources.py +++ b/resources.py @@ -218,81 +218,85 @@ def api_resource_category_stats(ctx): if len(categories) == 0: return {'categories': [], 'external_filter': ''} - # Continue for admins and datamanagers - storage = {} - - # Go through current groups of current categories. - # This function has no historic value so it is allowed to do so - for category in categories: - storage[category] = {'total': 0, 'research': 0, 'vault': 0, 'revision': 0, 'internal': 0, 'external': 0} + # Retrieve storage statistics of groups. 
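The attribute values consumed here are JSON arrays; an illustrative (made-up) value, with the element order matching the unpacking below:

import json

# ["<category>", <research bytes>, <vault bytes>, <revision bytes>, <total bytes>]
category, research, vault, revisions, total = json.loads('["test-automation", 1048576, 52428800, 262144, 53739520]')
# category == 'test-automation'; 1048576 + 52428800 + 262144 == 53739520 == total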
+ iter = list(genquery.Query(ctx, + ['USER_GROUP_NAME', 'ORDER_DESC(META_USER_ATTR_NAME)', 'META_USER_ATTR_VALUE'], + "META_USER_ATTR_NAME like '{}%%'".format(constants.UUMETADATAGROUPSTORAGETOTALS), + output=genquery.AS_LIST)) - # for all groups in category - groups = get_groups_on_categories(ctx, [category]) - for groupname in groups: - if groupname.startswith(('research', 'deposit', 'intake', 'grp')): - # Only check the most recent storage measurement - iter = list(genquery.Query(ctx, - ['META_USER_ATTR_VALUE', 'ORDER_DESC(META_USER_ATTR_NAME)', 'USER_NAME', 'USER_GROUP_NAME'], - "META_USER_ATTR_VALUE like '[\"{}\",%%' AND META_USER_ATTR_NAME like '{}%%' AND USER_NAME = '{}'".format(category, constants.UUMETADATAGROUPSTORAGETOTALS, groupname), - offset=0, limit=1, output=genquery.AS_LIST)) - - for row in iter: - temp = jsonutil.parse(row[0]) - - storage[category]['total'] += temp[4] - storage[category]['research'] += temp[1] - storage[category]['vault'] += temp[2] - storage[category]['revision'] += temp[3] + # Go through storage statistics of groups. + storage = {} + group_counted = [] + for group_name, _storage_attribute, storage_json in iter: + # Check if group is valid and has not been counted yet. + if group_name.startswith(('research-', 'deposit-', 'intake-', 'grp-')) and group_name not in group_counted: + # Add group to list of groups counted for category statistics. + group_counted.append(group_name) + + # Add group to category statistics. + category, research, vault, revisions, total = jsonutil.parse(storage_json) + storage.setdefault(category, {'research': 0, 'vault': 0, 'revision': 0, 'total': 0}) + storage[category]['research'] += research + storage[category]['vault'] += vault + storage[category]['revision'] += revisions + storage[category]['total'] += total + + # Retrieve groups and their members. + iter = list(genquery.Query(ctx, + ['USER_GROUP_NAME', 'USER_NAME'], + "USER_TYPE != 'rodsgroup'", + output=genquery.AS_LIST)) + + # Calculate number of members per type per group. + members = {} + for group_name, user_name in iter: + members.setdefault(group_name, {'internal': set(), 'external': set()}) + if yoda_names.is_internal_user(user_name): + members[group_name]['internal'].add(user_name) + else: + members[group_name]['external'].add(user_name) - # Now go through all totals + # Calculate category members and storage totals. + instance_totals = {'total': 0, 'research': 0, 'vault': 0, 'revision': 0, 'internals': set(), 'externals': set()} all_storage = [] - - # Totalization for the entire instance. - instance_totals = {'total': 0, 'research': 0, 'vault': 0, 'revision': 0} - - # Member counts - cat_members = {} - members_total = [] for category in categories: - members = [] - # this information is only available for yoda-admins - for groupname in get_groups_on_categories(ctx, [category]): - group_members = list(group.members(ctx, groupname)) - for gm in group_members: - members.append(gm[0]) - members_total.append(gm[0]) - # deduplicate member list - cat_members[category] = list(set(members)) + if category not in storage: + continue - cat_members['YODA_INSTANCE_TOTAL'] = list(set(members_total)) + # Calculate category members and totals. 
+ internals = set() + externals = set() + for group_name in get_groups_on_categories(ctx, [category]): + members.setdefault(group_name, {'internal': set(), 'external': set()}) + internals.update(members[group_name]['internal']) + externals.update(members[group_name]['external']) - def count_externals(members): - return len([member for member in members if not yoda_names.is_internal_user(member)]) + # Deduplicate group members. + users = {'internals': len(internals), 'externals': len(externals)} - def count_internals(members): - return len([member for member in members if yoda_names.is_internal_user(member)]) + # Count instance totals. + instance_totals['internals'].update(internals) + instance_totals['externals'].update(externals) - for category in categories: + # Humanize storage sizes for the frontend and calculate instance totals. storage_humanized = {} - # humanize storage sizes for the frontend - for type in ['total', 'research', 'vault', 'revision']: - storage_humanized[type] = misc.human_readable_size(1.0 * storage[category][type]) - instance_totals[type] += 1.0 * storage[category][type] + for storage_type in ['research', 'vault', 'revision', 'total']: + storage_humanized[storage_type] = misc.human_readable_size(1.0 * storage[category][storage_type]) + instance_totals[storage_type] += 1.0 * storage[category][storage_type] - users = {'internals': count_internals(cat_members[category]), 'externals': count_externals(cat_members[category])} all_storage.append({'category': category, 'storage': storage_humanized, 'users': users}) - # Add the yoda instance information as an extra row with category name YODA_INSTANCE_TOTAL - # So the frontend can distinguish instance totals from real category totals - users = {'internals': count_internals(cat_members['YODA_INSTANCE_TOTAL']), 'externals': count_externals(cat_members['YODA_INSTANCE_TOTAL'])} + # Add the Yoda instance information as an extra row with category name YODA_INSTANCE_TOTAL. + # So the frontend can distinguish instance totals from real category totals. 
all_storage.append({'category': "YODA_INSTANCE_TOTAL", 'storage': {'total': misc.human_readable_size(instance_totals['total']), 'research': misc.human_readable_size(instance_totals['research']), 'vault': misc.human_readable_size(instance_totals['vault']), 'revision': misc.human_readable_size(instance_totals['revision'])}, - 'users': users}) + 'users': {'internals': len(instance_totals['internals']), + 'externals': len(instance_totals['externals'])}}) return {'categories': sorted(all_storage, key=lambda d: d['category']), 'external_filter': ', '.join(config.external_users_domain_filter)} From 9a799a3cbc806ed7eb909b74d2d3a56356d0f698 Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Wed, 21 Aug 2024 14:31:24 +0200 Subject: [PATCH 30/57] YDA-5912: add support for msi_atomic_apply_metadata_operations (backport to Yoda 1.9) --- integration_tests.py | 84 +++++++++++++++++++++++++++++++++++++++++--- util/avu.py | 19 ++++++++++ util/msi.py | 2 ++ 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/integration_tests.py b/integration_tests.py index 1a7d9ffd5..095e28097 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -13,6 +13,7 @@ import traceback import uuid +import data_access_token import folder import meta import schema @@ -66,10 +67,10 @@ def _test_msvc_add_avu_object(ctx): def _test_msvc_add_avu_collection(ctx): - tmp_object = _create_tmp_collection(ctx) - ctx.msi_add_avu('-c', tmp_object, "foo", "bar", "baz") - result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_object)] - collection.remove(ctx, tmp_object) + tmp_coll = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_coll, "foo", "bar", "baz") + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_coll)] + collection.remove(ctx, tmp_coll) return result @@ -124,6 +125,70 @@ def _test_folder_set_retry_avus(ctx): return True +def _test_msvc_apply_atomic_operations_collection(ctx): + tmp_coll = _create_tmp_collection(ctx) + operations = { + "entity_name": tmp_coll, + "entity_type": "collection", + "operations": [ + { + "operation": "add", + "attribute": "aap", + "value": "noot", + "units": "mies" + }, + { + "operation": "add", + "attribute": "foo", + "value": "bar", + "units": "baz" + }, + { + "operation": "remove", + "attribute": "aap", + "value": "noot", + "units": "mies" + } + ] + } + avu.apply_atomic_operations(ctx, operations) + result = [(m.attr, m.value, m.unit) for m in avu.of_coll(ctx, tmp_coll)] + collection.remove(ctx, tmp_coll) + return result + + +def _test_msvc_apply_atomic_operations_object(ctx): + tmp_object = _create_tmp_object(ctx) + operations = { + "entity_name": tmp_object, + "entity_type": "data_object", + "operations": [ + { + "operation": "add", + "attribute": "aap", + "value": "noot", + "units": "mies" + }, + { + "operation": "add", + "attribute": "foo", + "value": "bar", + "units": "baz" + }, + { + "operation": "remove", + "attribute": "aap", + "value": "noot", + "units": "mies" + } + ] + } + avu.apply_atomic_operations(ctx, operations) + result = [(m.attr, m.value, m.unit) for m in avu.of_data(ctx, tmp_object)] + data_object.remove(ctx, tmp_object) + return result + + def _test_folder_cronjob_status(ctx): tmp_coll = _create_tmp_collection(ctx) result_set = folder.set_cronjob_status(ctx, constants.CRONJOB_STATE['RETRY'], tmp_coll) @@ -348,6 +413,17 @@ def _test_folder_secure_func(ctx, func): "check": lambda x: (("aap", "noot", "mies") in x and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 )}, + {"name": 
"avu.apply_atomic_operations.collection", + "test": lambda ctx: _test_msvc_apply_atomic_operations_collection(ctx), + "check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)}, + {"name": "avu.apply_atomic_operations.object", + "test": lambda ctx: _test_msvc_apply_atomic_operations_object(ctx), + "check": lambda x: (("foo", "bar", "baz") in x + and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 + )}, + {"name": "data_access_token.get_all_tokens", + "test": lambda ctx: data_access_token.get_all_tokens(ctx), + "check": lambda x: isinstance(x, list)}, {"name": "folder.set_can_modify", "test": lambda ctx: _test_folder_secure_func(ctx, folder.set_can_modify), "check": lambda x: x}, diff --git a/util/avu.py b/util/avu.py index 227c4b640..9f47a0fa6 100644 --- a/util/avu.py +++ b/util/avu.py @@ -5,6 +5,7 @@ __license__ = 'GPLv3, see LICENSE' import itertools +import json from collections import namedtuple import genquery @@ -170,3 +171,21 @@ def rmw_from_data(ctx, obj, a, v, u=''): def rmw_from_group(ctx, group, a, v, u=''): """Remove AVU from group with wildcards.""" msi.rmw_avu(ctx, '-u', group, a, v, u) + + +def apply_atomic_operations(ctx, operations): + """Sequentially executes all operations as a single transaction. + + Operations should be a dict with structure as defined in + https://docs.irods.org/4.2.12/doxygen/libmsi__atomic__apply__metadata__operations_8cpp.html + + If an error occurs, all updates are rolled back and an error is returned. + Result will contain specific information about the error. + + :param ctx: Combined type of a callback and rei struct + :param operations: Dict containing the batch of metadata operations + + :returns: Dict containing the error information on failure + """ + ret = msi.atomic_apply_metadata_operations(ctx, json.dumps(operations), "") + return json.loads(ret['arguments'][1]) diff --git a/util/msi.py b/util/msi.py index 37f938b77..68393f95c 100644 --- a/util/msi.py +++ b/util/msi.py @@ -135,6 +135,8 @@ def _make_exception(name, message): add_avu, AddAvuError = make('_add_avu', 'Could not add metadata to object') rmw_avu, RmwAvuError = make('_rmw_avu', 'Could not remove metadata to object') +atomic_apply_metadata_operations, AtomicApplyMetadataOperationsError = make('_atomic_apply_metadata_operations', 'Could not apply atomic metadata operations') + sudo_obj_acl_set, SudoObjAclSetError = make('SudoObjAclSet', 'Could not set ACLs as admin') sudo_obj_meta_set, SudoObjMetaSetError = make('SudoObjMetaSet', 'Could not set metadata as admin') sudo_obj_meta_remove, SudoObjMetaRemoveError = make('SudoObjMetaRemove', 'Could not remove metadata as admin') From 42052c69c0f6213d5bc3cb4f8b9eb807d20ec592 Mon Sep 17 00:00:00 2001 From: Dylan Hsin Date: Fri, 23 Aug 2024 09:40:49 +0200 Subject: [PATCH 31/57] YDA-5866 fix rev creation job stopping on error The revision creation job used to stop with a failure when a data object scheduled for revision creation could no longer be found on the resource where the revision creation job was looking for it. This can happen e.g. after a resource tree change. The revision creation job now logs an error message and continues with processing the next data object scheduled for revision creation, so that one such error can't stop the entire revision creation process. This also extracts the function to gather data object properties to util.data_object and covers it with integration tests, so that this function can be re-used in other parts of the ruleset. 
--- integration_tests.py | 25 ++++++++++++++++++++++++ revisions.py | 37 ++++++++++++++++-------------------- util/data_object.py | 45 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 21 deletions(-) diff --git a/integration_tests.py b/integration_tests.py index 095e28097..33475fc24 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -510,6 +510,31 @@ def _test_folder_secure_func(ctx, func): {"name": "util.data_object.exists.no", "test": lambda ctx: data_object.exists(ctx, "/tempZone/home/research-initial/testdata/doesnotexist.txt"), "check": lambda x: not x}, + {"name": "util.data_object.get_properties.by_data_name", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_NAME"] == "lorem.txt"}, + {"name": "util.data_object.get_properties.by_modify_time", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_MODIFY_TIME"].isdigit()}, + {"name": "util.data_object.get_properties.by_owner_name", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_OWNER_NAME"] == "rods"}, + {"name": "util.data_object.get_properties.by_coll_name", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["COLL_NAME"] == "/tempZone/home/research-initial/testdata"}, + {"name": "util.data_object.get_properties.by_coll_id", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["COLL_ID"].isdigit()}, + {"name": "util.data_object.get_properties.by_data_resc_hier", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_RESC_HIER"].startswith('irodsResc')}, + {"name": "util.data_object.get_properties.by_data_size", + "test": lambda ctx: data_object.get_properties(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "irodsResc"), + "check": lambda x: x["DATA_SIZE"].isdigit()}, + # Using the resource_id as data_id to ensure no existing data object uses this occupied identifier + {"name": "util.data_object.get_properties.no_data_object", + "test": lambda ctx: data_object.get_properties(ctx, resource.id_from_name(ctx, "irodsResc"), "irodsResc"), + "check": lambda x: x is None}, {"name": "util.data_object.owner", "test": lambda ctx: data_object.owner(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "check": lambda x: x == ('rods', 'tempZone')}, diff --git a/revisions.py b/revisions.py index 321d5307b..783845158 100644 --- a/revisions.py +++ b/revisions.py @@ -514,26 +514,6 @@ def is_revision_blocked_by_admin(ctx): return collection.exists(ctx, path) -def get_data_object(ctx, data_id, resource): - """Return data on data object necessary to create a revision.""" - iter = genquery.row_iterator( - "DATA_ID, DATA_MODIFY_TIME, DATA_OWNER_NAME, DATA_SIZE, COLL_ID, DATA_RESC_HIER, DATA_NAME, COLL_NAME", - "DATA_ID = '{}' AND DATA_RESC_HIER like '{}%'".format(data_id, resource), - genquery.AS_LIST, ctx - ) 
-    for row in iter:
-        data_id = row[0]
-        modify_time = row[1]
-        data_size = row[3]
-        coll_id = row[4]
-        data_owner = row[2]
-        basename = row[6]
-        parent = row[7]
-        break
-
-    return modify_time, data_size, coll_id, data_owner, basename, parent
-
-
 def get_revision_store(ctx, group_name):
     """Get path to revision store for group if the path exists.
 
@@ -563,7 +543,22 @@ def revision_create(ctx, print_verbose, data_id, resource, group_name, revision_
     :returns: True / False as an indication whether a revision was successfully created
     """
     revision_created = False
-    modify_time, data_size, coll_id, data_owner, basename, parent = get_data_object(ctx, data_id, resource)
+
+    # Retrieve properties of the data object
+    data_properties = data_object.get_properties(ctx, data_id, resource)
+
+    # Skip current revision task if data object is not found
+    if data_properties is None:
+        log.write(ctx, "ERROR - No data object found for data_id {} on resource {}, move to the next revision creation".format(data_id, resource))
+        return False
+
+    modify_time = data_properties["DATA_MODIFY_TIME"]
+    data_size = data_properties["DATA_SIZE"]
+    coll_id = data_properties["COLL_ID"]
+    data_owner = data_properties["DATA_OWNER_NAME"]
+    basename = data_properties["DATA_NAME"]
+    parent = data_properties["COLL_NAME"]
+
     path = '{}/{}'.format(parent, basename)
 
     # Allow rodsadmin to create subcollections.
diff --git a/util/data_object.py b/util/data_object.py
index b625672fb..73fed02c2 100644
--- a/util/data_object.py
+++ b/util/data_object.py
@@ -24,6 +24,39 @@ def exists(ctx, path):
                             genquery.AS_LIST, ctx))) > 0
 
 
+def get_properties(ctx, data_id, resource):
+    """ Retrieves default properties of a data object from iRODS.
+
+    :param ctx:      Combined type of a callback and rei struct
+    :param data_id:  data_id of the data object
+    :param resource: Name of resource
+
+    :returns: dictionary mapping each requested property to its retrieved value, or None if not found.
+    """
+    # Default properties available for retrieval
+    properties = [
+        "DATA_ID", "DATA_MODIFY_TIME", "DATA_OWNER_NAME", "DATA_SIZE",
+        "COLL_ID", "DATA_RESC_HIER", "DATA_NAME", "COLL_NAME",
+    ]
+
+    # Retrieve data object with default properties
+    query_fields = ", ".join(properties)
+    iter = genquery.row_iterator(
+        query_fields,
+        "DATA_ID = '{}' AND DATA_RESC_HIER like '{}%'".format(data_id, resource),
+        genquery.AS_LIST, ctx
+    )
+
+    # Return None when no data object is found
+    prop_dict = None
+
+    for row in iter:
+        prop_dict = {prop: value for prop, value in zip(properties, row)}
+        break
+
+    return prop_dict
+
+
 def owner(ctx, path):
     """Find the owner of a data object. Returns (name, zone) or None."""
     owners = list(genquery.row_iterator(
@@ -198,6 +231,18 @@ def name_from_id(ctx, data_id):
     return '/'.join(x)
 
 
+def id_from_path(ctx, path):
+    """Get data object id from data object path at its first appearance.
+
+    :param ctx:  Combined type of a callback and rei struct
+    :param path: Path to iRODS data object
+
+    :returns: Data object id
+    """
+    return genquery.Query(ctx, "DATA_ID",
+                          "COLL_NAME = '%s' AND DATA_NAME = '%s'" % pathutil.chop(path)).first()
+
+
 def decode_checksum(checksum):
     """Decode data object checksum.
From 261a8ff096c022a1948bdcd2623396c984965a9b Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Fri, 23 Aug 2024 10:50:28 +0200 Subject: [PATCH 32/57] YDA-5912: improve error handling in apply_atomic_operations --- integration_tests.py | 3 +++ util/avu.py | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/integration_tests.py b/integration_tests.py index 33475fc24..46c9ab8d3 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -421,6 +421,9 @@ def _test_folder_secure_func(ctx, func): "check": lambda x: (("foo", "bar", "baz") in x and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 )}, + {"name": "avu.apply_atomic_operations.invalid", + "test": lambda ctx: avu.apply_atomic_operations(ctx, {"inspector": "gadget"}), + "check": lambda x: not x}, {"name": "data_access_token.get_all_tokens", "test": lambda ctx: data_access_token.get_all_tokens(ctx), "check": lambda x: isinstance(x, list)}, diff --git a/util/avu.py b/util/avu.py index 9f47a0fa6..e92876d9b 100644 --- a/util/avu.py +++ b/util/avu.py @@ -178,14 +178,23 @@ def apply_atomic_operations(ctx, operations): Operations should be a dict with structure as defined in https://docs.irods.org/4.2.12/doxygen/libmsi__atomic__apply__metadata__operations_8cpp.html - If an error occurs, all updates are rolled back and an error is returned. - Result will contain specific information about the error. :param ctx: Combined type of a callback and rei struct :param operations: Dict containing the batch of metadata operations - :returns: Dict containing the error information on failure + :returns: Boolean indicating if all metadata operations were executed """ - ret = msi.atomic_apply_metadata_operations(ctx, json.dumps(operations), "") - return json.loads(ret['arguments'][1]) + try: + msi.atomic_apply_metadata_operations(ctx, json.dumps(operations), "") + return True + except msi.Error as e: + # iRODS errorcode -1811000 (INVALID_OPERATION) + if str(e).find("-1811000") > -1: + log.write(ctx, "apply_atomic_operations: invalid metadata operation") + # iRODS errorcode -130000 (SYS_INVALID_INPUT_PARAM) + elif str(e).find("-130000") > -1: + log.write(ctx, "apply_atomic_operations: invalid entity name or entity type") + else: + log.write(ctx, "apply_atomic_operations: {}".format(e)) + return False From 1b0eef979dfb8f43274c7438afbf48ea9b6b8526 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Mon, 26 Aug 2024 11:21:29 +0200 Subject: [PATCH 33/57] YDA-5929: add parameter to update one publication Add optional parameter to the update-publications rule so that it updates only a single publication, rather than all publications on the system. This is useful when troubleshooting an issue with a single publication. --- tools/update-publications.r | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/tools/update-publications.r b/tools/update-publications.r index 374bcdb9b..e4ef73069 100644 --- a/tools/update-publications.r +++ b/tools/update-publications.r @@ -1,5 +1,18 @@ +#!/usr/bin/irule -r irods_rule_engine_plugin-irods_rule_language-instance -F +# +# Updates publication endpoints (Landing page, MOAI, DataCite) for either all data +# packages or one selected data package. 
+# +# To update one data package: +# $ irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/update-publications.r \ +# '*package="/tempZone/home/vault-mygroup/package[123456789]"' +# +# To update all data packages: +# $ irule -r irods_rule_engine_plugin-irods_rule_language-instance -F /etc/irods/yoda-ruleset/tools/update-publications.r +# updatePublications() { - writeLine("stdout", "[UPDATE PUBLICATIONS] Start scan"); + writeLine("stdout", "[UPDATE PUBLICATIONS] Start for *package"); + *packagesFound = 0; # Scan for published vault packages. *ContInxOld = 1; @@ -15,8 +28,10 @@ updatePublications() { foreach(*row in *GenQ2Out) { *collName = *row.COLL_NAME; - # Check if this really is a vault package - if (*collName like regex "/[^/]+/home/vault-.*") { + # Check if this really is a vault package, or selected vault package + if ((*package == '*' && *collName like regex "/[^/]+/home/vault-.*") || + (*package != '*' && *collName like regex "/[^/]+/home/vault-.*" && *collName == *package ) ) { + *packagesFound = 1; *status = '' *statusInfo = ''; rule_update_publication(*collName, *updateDatacite, *updateLandingpage, *updateMOAI, *status, *statusInfo); @@ -30,7 +45,14 @@ updatePublications() { } } msiCloseGenQuery(*GenQ2Inp, *GenQ2Out); - writeLine("stdout", "[UPDATE PUBLICATIONS] Finished scan"); + + if (*packagesFound == 0) { + writeLine("stdout", "[UPDATE PUBLICATIONS] No packages found for *package") + } + else { + writeLine("stdout", "[UPDATE PUBLICATIONS] Finished for *package"); + } } -input *updateDatacite="Yes", *updateLandingpage="Yes", *updateMOAI="Yes" + +input *updateDatacite="Yes", *updateLandingpage="Yes", *updateMOAI="Yes", *package='*' output ruleExecOut From af3aa91d0416894af003a5bae2e13fa77952457b Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Mon, 26 Aug 2024 20:10:28 +0200 Subject: [PATCH 34/57] Integration tests: add util.data_object.to_from_id --- integration_tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/integration_tests.py b/integration_tests.py index 46c9ab8d3..0e4f8edc6 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -544,6 +544,9 @@ def _test_folder_secure_func(ctx, func): {"name": "util.data_object.size", "test": lambda ctx: data_object.size(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "check": lambda x: x == 1003240}, + {"name": "util.data_object.to_from_id", + "test": lambda ctx: data_object.name_from_id(ctx, data_object.id_from_path(ctx, "/tempZone/home/research-initial/testdata/lorem.txt")), + "check": lambda x: x == "/tempZone/home/research-initial/testdata/lorem.txt"}, {"name": "util.data_object.get_group_owners", "test": lambda ctx: data_object.get_group_owners(ctx, "/tempZone/home/research-initial/testdata/lorem.txt"), "check": lambda x: x == [['research-initial', 'tempZone']]}, From d1952609447d14595932b6e770a706cf68d62ad3 Mon Sep 17 00:00:00 2001 From: claravox Date: Wed, 28 Aug 2024 11:06:04 +0200 Subject: [PATCH 35/57] Schemas: Fix dag-0 typos --- schemas/dag-0/metadata.json | 8 ++++---- schemas/dag-0/uischema.json | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/schemas/dag-0/metadata.json b/schemas/dag-0/metadata.json index 32c41ed77..eb9d8cbea 100644 --- a/schemas/dag-0/metadata.json +++ b/schemas/dag-0/metadata.json @@ -50,11 +50,11 @@ "50" ], "enumNames": [ - "2 - appriopiate period when data can be considered as a snapshot which is outdated in the short term", - "5 - appriopiate period when the value of data decreases significantly after 
a longer period of time", + "2 - appropriate period when data can be considered as a snapshot which is outdated in the short term", + "5 - appropriate period when the value of data decreases significantly after a longer period of time", "10 - default retention period according to UU's policy framework for research data, maximum retention period for personal data (GDPR)", - "20 - appriopiate period when the value of the data decreases slowly over a long period of time", - "50 - appriopiate period when data will always be relevant" + "20 - appropriate period when the value of the data decreases slowly over a long period of time", + "50 - appropriate period when data will always be relevant" ] }, "optionsOwnerRole": { diff --git a/schemas/dag-0/uischema.json b/schemas/dag-0/uischema.json index 947d10895..23424fc96 100644 --- a/schemas/dag-0/uischema.json +++ b/schemas/dag-0/uischema.json @@ -49,7 +49,7 @@ "ui:description": "Free text field to add characteristic words or terms that typify and describe the data, so it becomes better searchable. Please fill in one word or term per field, use the + if you want to add more keywords" }, "Related_Datapackage": { - "ui:description": "Reference to other resources which are used to create the data set, such as another data package an online publication. Please fill in the title or citing information of the resource, together with type persistant identifier (select an option) and the identifier itself", + "ui:description": "Reference to other resources which are used to create the data set, such as another data package an online publication. Please fill in the title or citing information of the resource, together with type persistent identifier (select an option) and the identifier itself", "items": { "Relation_Type": { "ui:help": "Relation to this data package", @@ -108,7 +108,7 @@ } }, "Affiliation": { - "ui:help": "Organizational or institutional affliation of the data owner" + "ui:help": "Organizational or institutional affiliation of the data owner" }, "Owner_Role": { "ui:help": "Which role does the data owner have in the context in which the data package originated?" From 1fd7b9207d0e5779fbc392c058c70680de09145f Mon Sep 17 00:00:00 2001 From: claravox Date: Tue, 27 Aug 2024 14:33:15 +0200 Subject: [PATCH 36/57] YDA-5924: Copytovault: check if in research space --- folder.py | 18 +++++++++++++++--- vault_archive.py | 4 ++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/folder.py b/folder.py index 8f5a15ac5..6931067f5 100644 --- a/folder.py +++ b/folder.py @@ -204,6 +204,7 @@ def precheck_folder_secure(ctx, coll): found, last_run = get_last_run_time(ctx, coll) if (not correct_copytovault_start_status(ctx, coll) + or not correct_copytovault_start_location(coll) or not misc.last_run_time_acceptable(coll, found, last_run, config.vault_copy_backoff_time)): return False @@ -318,6 +319,18 @@ def correct_copytovault_start_status(ctx, coll): return False +def correct_copytovault_start_location(coll): + """Confirm that the folder to be copied is in the correct location. + For example: in a research or deposit folder and not in the trash. 
+ + :param coll: Source collection (folder being secured) + + :returns: True when a valid start location + """ + space, _, _, _ = pathutil.info(coll) + return space in (pathutil.Space.RESEARCH, pathutil.Space.DEPOSIT) + + def get_last_run_time(ctx, coll): """Get the last run time, if found""" found = False @@ -420,9 +433,8 @@ def folder_secure_set_retry(ctx, coll): if new_retry_count > config.vault_copy_max_retries: folder_secure_fail(ctx, coll) send_folder_secure_notification(ctx, coll, "Data package failed to copy to vault after maximum retries") - else: - if not folder_secure_set_retry_avus(ctx, coll, new_retry_count): - send_folder_secure_notification(ctx, coll, "Failed to set retry state on data package") + elif not folder_secure_set_retry_avus(ctx, coll, new_retry_count): + send_folder_secure_notification(ctx, coll, "Failed to set retry state on data package") def folder_secure_set_retry_avus(ctx, coll, retry_count): diff --git a/vault_archive.py b/vault_archive.py index d52da9edd..6bac3bb83 100644 --- a/vault_archive.py +++ b/vault_archive.py @@ -134,11 +134,11 @@ def create_archive(ctx, coll): def extract_archive(ctx, coll): while True: state = ctx.dmattr(package_archive_path(ctx, coll), config.data_package_archive_fqdn, "")["arguments"][2] - if state != "UNM" and state != "MIG": + if state not in ("UNM", "MIG"): break time.sleep(10) - if state != "DUL" and state != "REG" and state != "INV": + if state not in ("DUL", "REG", "INV"): log.write(ctx, "Archive of data package <{}> is not available, state is <{}>".format(coll, state)) raise Exception("Archive is not available") From 9834e598da47151fbee3c0a04873e24705f72c3c Mon Sep 17 00:00:00 2001 From: claravox Date: Wed, 28 Aug 2024 10:44:49 +0200 Subject: [PATCH 37/57] UI Tests: Fix failing publication test --- tests/step_defs/ui/test_ui_publication.py | 11 +++++++++-- tests/step_defs/ui/test_ui_statistics.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/step_defs/ui/test_ui_publication.py b/tests/step_defs/ui/test_ui_publication.py index a2d5750c6..1bda48371 100644 --- a/tests/step_defs/ui/test_ui_publication.py +++ b/tests/step_defs/ui/test_ui_publication.py @@ -98,7 +98,14 @@ def ui_check_version_provenance_vault(browser): action_log_rows = browser.find_by_css('.list-group-item-action') # Chronological (backward) status changes - prov_statuses = ['Published', 'Approved for publication', 'Added metadata: related datapackage', 'Submitted for publication', 'Secured in vault', 'Accepted for vault', 'Submitted for vault'] + prov_statuses = ['Published', + 'Approved for publication', + 'Removed metadata: additional lab', + 'Added metadata: related datapackage', + 'Submitted for publication', + 'Secured in vault', + 'Accepted for vault', + 'Submitted for vault'] for index in range(0, len(prov_statuses)): assert action_log_rows[index].value.find(prov_statuses[index]) != -1 @@ -243,7 +250,7 @@ def ui_data_package_approve(browser): @then(parsers.parse('the data package status is "{status}"')) def ui_data_package_status(browser, status): for _i in range(30): - if browser.is_text_present(status, wait_time=3): + if browser.is_text_present(status, wait_time=4): return True browser.reload() diff --git a/tests/step_defs/ui/test_ui_statistics.py b/tests/step_defs/ui/test_ui_statistics.py index 4d0a27057..05cc4b640 100644 --- a/tests/step_defs/ui/test_ui_statistics.py +++ b/tests/step_defs/ui/test_ui_statistics.py @@ -43,7 +43,7 @@ def ui_statistics_group_view(browser, group): @when('export statistics button is 
clicked') def ui_statistics_export(browser): - # For now prevent downloading on windows platforn + # For now prevent downloading on windows platform if os.name == "nt": return # Only click when not in Windows From 91f791bb6c3802fe16387e23d4e5d4cb976100bd Mon Sep 17 00:00:00 2001 From: claravox Date: Tue, 3 Sep 2024 09:57:40 +0200 Subject: [PATCH 38/57] YDA-5939: fix publication fail on third+ version Get baseDOIMinted from the previous version, use Python get rather than try except --- publication.py | 42 ++++++++++++++++++++---------------------- vault_archive.py | 2 +- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/publication.py b/publication.py index 4b0aafc6a..1e6b70bde 100644 --- a/publication.py +++ b/publication.py @@ -41,7 +41,7 @@ def get_publication_config(ctx): "davrods_anonymous_vhost": "davrodsAnonymousVHost", "publication_verbose_mode": "verboseMode"} optional_keys = ["publication_verbose_mode"] - configKeys = {} + config_keys = {} found_attrs = [] prefix_length = len(constants.UUORGMETADATAPREFIX) @@ -58,7 +58,7 @@ def get_publication_config(ctx): try: found_attrs.append(attr) - configKeys[attr2keys[attr]] = val + config_keys[attr2keys[attr]] = val except KeyError: continue @@ -67,7 +67,7 @@ def get_publication_config(ctx): if key not in found_attrs and key not in optional_keys: log.write(ctx, 'Missing config key ' + key) - return configKeys + return config_keys def generate_combi_json(ctx, publication_config, publication_state): @@ -151,8 +151,8 @@ def get_publication_state(ctx, vault_package): publ_metadata = get_collection_metadata(ctx, vault_package, constants.UUORGMETADATAPREFIX + 'publication_') # Take over all actual values as saved earlier. - for key in publ_metadata.keys(): - publication_state[key] = publ_metadata[key] + for key, value in publ_metadata.items(): + publication_state[key] = value # Handle access restriction. iter = genquery.row_iterator( @@ -300,7 +300,7 @@ def get_last_modified_datetime(ctx, vault_package): return my_date.strftime('%Y-%m-%dT%H:%M:%S.%f%z') -def generate_preliminary_DOI(ctx, publication_config, publication_state): +def generate_preliminary_doi(ctx, publication_config, publication_state): """Generate a Preliminary DOI. Preliminary, because we check for collision later. :param ctx: Combined type of a callback and rei struct @@ -316,7 +316,7 @@ def generate_preliminary_DOI(ctx, publication_config, publication_state): publication_state["versionDOI"] = dataCitePrefix + "/" + yodaPrefix + "-" + randomId -def generate_base_DOI(ctx, publication_config, publication_state): +def generate_base_doi(ctx, publication_config, publication_state): """Generate a base DOI. 
:param ctx: Combined type of a callback and rei struct @@ -666,17 +666,17 @@ def check_doi_availability(ctx, publication_state, type_flag): :param publication_state: Dict with state of the publication process :param type_flag: Flag indicating DOI type ('version' or 'base') """ - DOI = publication_state[type_flag + "DOI"] + doi = publication_state[type_flag + "DOI"] try: - httpCode = datacite.metadata_get(ctx, DOI) + http_code = datacite.metadata_get(ctx, doi) - if httpCode == 404: + if http_code == 404: publication_state[type_flag + "DOIAvailable"] = "yes" - elif httpCode in [401, 403, 500, 503, 504]: + elif http_code in [401, 403, 500, 503, 504]: # request failed, worth a retry publication_state["status"] = "Retry" - elif httpCode in [200, 204]: + elif http_code in [200, 204]: # DOI already in use publication_state[type_flag + "DOIAvailable"] = "no" publication_state["status"] = "Retry" @@ -745,13 +745,14 @@ def process_publication(ctx, vault_package): if "baseDOI" in previous_publication_state: # Set the link to previous publication state publication_state["baseDOI"] = previous_publication_state["baseDOI"] + publication_state["baseDOIMinted"] = previous_publication_state["baseDOIMinted"] publication_state["baseRandomId"] = previous_publication_state["baseRandomId"] # Create base DOI if it does not exist in the previous publication state. elif "baseDOI" not in previous_publication_state: log.write(ctx, "Creating base DOI for the vault package <{}>".format(vault_package)) try: - generate_base_DOI(ctx, publication_config, publication_state) + generate_base_doi(ctx, publication_config, publication_state) check_doi_availability(ctx, publication_state, 'base') publication_state["baseDOIMinted"] = 'no' # Set the link to previous publication state @@ -764,7 +765,7 @@ def process_publication(ctx, vault_package): save_publication_state(ctx, previous_vault_package, previous_publication_state) save_publication_state(ctx, vault_package, publication_state) - if status in ["Retry"]: + if status == "Retry": if verbose: log.write(ctx, "Error status for creating base DOI: " + status) return status @@ -779,7 +780,7 @@ def process_publication(ctx, vault_package): if "versionDOI" not in publication_state: if verbose: log.write(ctx, "Generating preliminary DOI.") - generate_preliminary_DOI(ctx, publication_config, publication_state) + generate_preliminary_doi(ctx, publication_config, publication_state) save_publication_state(ctx, vault_package, publication_state) @@ -788,7 +789,7 @@ def process_publication(ctx, vault_package): if verbose: log.write(ctx, "Version DOI available: no") log.write(ctx, "Generating preliminary DOI.") - generate_preliminary_DOI(ctx, publication_config, publication_state) + generate_preliminary_doi(ctx, publication_config, publication_state) publication_state["combiJsonPath"] = "" publication_state["dataCiteJsonPath"] = "" @@ -857,11 +858,8 @@ def process_publication(ctx, vault_package): # Determine whether an update ('put') or create ('post') message has to be sent to datacite datacite_action = 'post' - try: - if publication_state['versionDOIMinted'] == 'yes': - datacite_action = 'put' - except KeyError: - pass + if publication_state.get('versionDOIMinted') == 'yes': + datacite_action = 'put' # Send DataCite JSON to metadata end point if "dataCiteMetadataPosted" not in publication_state: @@ -874,7 +872,7 @@ def process_publication(ctx, vault_package): if update_base_doi: base_doi = None datacite_action = 'post' - if publication_state['baseDOIMinted'] == 'yes': + if 
publication_state.get('baseDOIMinted') == 'yes': datacite_action = 'put' if verbose: log.write(ctx, "Updating base DOI.") diff --git a/vault_archive.py b/vault_archive.py index 6bac3bb83..ea034a42a 100644 --- a/vault_archive.py +++ b/vault_archive.py @@ -253,7 +253,7 @@ def vault_extract_archive(ctx, coll): def update(ctx, coll, attr): - if pathutil.info(coll).space == pathutil.Space.VAULT and attr != constants.IIARCHIVEATTRNAME and attr != constants.UUPROVENANCELOG and vault_archival_status(ctx, coll) == "archived": + if pathutil.info(coll).space == pathutil.Space.VAULT and attr not in (constants.IIARCHIVEATTRNAME, constants.UUPROVENANCELOG) and vault_archival_status(ctx, coll) == "archived": avu.set_on_coll(ctx, coll, constants.IIARCHIVEATTRNAME, "update") ctx.dmget(package_archive_path(ctx, coll), config.data_package_archive_fqdn, "OFL") From 8cc2bb638e058a91efc1faa6a3fff4005809e208 Mon Sep 17 00:00:00 2001 From: claravox Date: Wed, 4 Sep 2024 15:18:26 +0200 Subject: [PATCH 39/57] Log: write stdout option --- util/log.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/util/log.py b/util/log.py index 994f42e4f..ab96e96f9 100644 --- a/util/log.py +++ b/util/log.py @@ -40,6 +40,15 @@ def _write(ctx, message): ctx.writeLine('serverLog', message) +def write_stdout(ctx, message): + """Write a message to stdout. Used for some of our scripts. + + :param ctx: Combined type of a callback and rei struct + :param message: Message to write to log + """ + ctx.writeLine("stdout", message) + + def debug(ctx, message): """"Write a message to the log, if in a development environment. From c49c616e02183b0581e8b6f5ee5a70e5248e97c7 Mon Sep 17 00:00:00 2001 From: Leonidas Triantafyllou Date: Wed, 25 Sep 2024 11:36:51 +0200 Subject: [PATCH 40/57] YDA-5892: delete revision avu for data objects in trash The job for revision creation now removes revision creation AVUs from data objects in trash. This makes it easier to monitor the number of data objects waiting for revision creation. --- revisions.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/revisions.py b/revisions.py index 783845158..96918c8b5 100644 --- a/revisions.py +++ b/revisions.py @@ -349,6 +349,10 @@ def rule_revision_batch(ctx, verbose, balance_id_min, balance_id_max, batch_size minimum_timestamp = int(time.time() - config.async_revision_delay_time) + # Remove revision creation AVUs from deleted data objects. + # This makes it easier to monitor the number of data objects waiting for revision creation. + remove_revision_creation_avu_from_deleted_data_objects(ctx, print_verbose) + # Get list of up to batch size limit of data objects (in research space) scheduled for revision, taking into account # modification time. log.write(ctx, "verbose = {}".format(verbose)) @@ -1030,3 +1034,28 @@ def memory_limit_exceeded(rss_limit): """ rss_limit = int(rss_limit) return rss_limit and memory_rss_usage() > rss_limit + + +def remove_revision_creation_avu_from_deleted_data_objects(ctx, print_verbose): + """ + Removes revision creation AVUs from deleted data objects [marked with 'org_revision_scheduled' metadata]. 
+ + :param ctx: Combined type of a callback and rei struct + :param print_verbose: Whether to log verbose messages for troubleshooting (Boolean) + """ + revision_avu_name = constants.UUORGMETADATAPREFIX + "revision_scheduled" + + iter = genquery.row_iterator( + "COLL_NAME, DATA_NAME", + "COLL_NAME like '%{}/trash/home/%' AND META_DATA_ATTR_NAME = '{}'".format(user.zone(ctx), revision_avu_name), + genquery.AS_LIST, ctx + ) + + for coll_name, data_name in iter: + path = coll_name + '/' + data_name + try: + avu.rmw_from_data(ctx, path, revision_avu_name, "%") # use wildcard cause rm_from_data causes problems + if print_verbose: + log.write(ctx, 'Removed revision creation AVUs from data object: {}'.format(path)) + except Exception as e: + log.write(ctx, "Error processing data object {}: {}".format(path, str(e))) From b7b96a88cd6316a4802c6d0b5607e383afcf8daf Mon Sep 17 00:00:00 2001 From: Leonidas Triantafyllou Date: Thu, 26 Sep 2024 09:45:19 +0200 Subject: [PATCH 41/57] YDA-5951 add transformation default-3 ISNI and Scopus ID The metadata schema transformation code from default-2 to default-3 transformed ORCID-IDs and Researcher IDs, but not Scopus IDs and ISNI IDs. Because of this, metadata files with the default-2 schema containing Scopus or ISNI IDs can usually not be converted automatically to default-3. Solution: Add a transformation function that handles transformation of an ISNI and Scopus ID that consists of a series of digits (with optional spaces) to the format that is specified in the default-3 schema --- schema_transformations.py | 89 +++++++++++++--------- schema_transformations_utils.py | 61 +++++++++++++++ unit-tests/test_schema_transformations.py | 93 +++++++++++++++++++++++ unit-tests/unit_tests.py | 4 + 4 files changed, 210 insertions(+), 37 deletions(-) create mode 100644 schema_transformations_utils.py create mode 100644 unit-tests/test_schema_transformations.py diff --git a/schema_transformations.py b/schema_transformations.py index e2b57098f..98e412b65 100644 --- a/schema_transformations.py +++ b/schema_transformations.py @@ -6,6 +6,8 @@ import re +from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_researcher_id, correctify_scopus + import meta from util import * @@ -128,21 +130,44 @@ def _default2_default3(ctx, m): person_identifiers = [] for person_identifier in creator.get('Person_Identifier', []): + # Check ORCID if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID': # Check for incorrect ORCID format. if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)): corrected_orcid = correctify_orcid(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_orcid is None: log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually." % (person_identifier['Name_Identifier'])) elif corrected_orcid != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_orcid + # Check Scopus + elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)': + # Check for incorrect Scopus format. 
+ if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)): + corrected_scopus = correctify_scopus(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_scopus is None: + log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_scopus != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_scopus + # Check ISNI + elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI': + # Check for incorrect ISNI format. + if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)): + corrected_isni = correctify_isni(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_isni is None: + log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_isni != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_isni elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)': # Check for incorrect ResearcherID format. if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)): corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_researcher_id != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_researcher_id elif 'Name_Identifier_Scheme' not in person_identifier: @@ -164,21 +189,44 @@ def _default2_default3(ctx, m): person_identifiers = [] for person_identifier in contributor.get('Person_Identifier', []): + # Check ORCID if person_identifier.get('Name_Identifier_Scheme', None) == 'ORCID': # Check for incorrect ORCID format. if not re.search("^(https://orcid.org/)[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", person_identifier.get('Name_Identifier', None)): corrected_orcid = correctify_orcid(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_orcid is None: log.write(ctx, "Warning: could not correct ORCID %s during schema transformation. It needs to be fixed manually." % (person_identifier['Name_Identifier'])) elif corrected_orcid != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_orcid + # Check Scopus + elif person_identifier.get('Name_Identifier_Scheme', None) == 'Author identifier (Scopus)': + # Check for incorrect Scopus format. + if not re.search("^\d{1,11}$", person_identifier.get('Name_Identifier', None)): + corrected_scopus = correctify_scopus(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_scopus is None: + log.write(ctx, "Warning: could not correct Scopus %s during schema transformation. It needs to be fixed manually." 
+ % (person_identifier['Name_Identifier'])) + elif corrected_scopus != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_scopus + # Check ISNI + elif person_identifier.get('Name_Identifier_Scheme', None) == 'ISNI': + # Check for incorrect ISNI format. + if not re.search("^(https://isni.org/isni/)[0-9]{15}[0-9X]$", person_identifier.get('Name_Identifier', None)): + corrected_isni = correctify_isni(person_identifier['Name_Identifier']) + # Only if an actual correction took place change the value and mark this data as 'changed'. + if corrected_isni is None: + log.write(ctx, "Warning: could not correct ISNI %s during schema transformation. It needs to be fixed manually." + % (person_identifier['Name_Identifier'])) + elif corrected_isni != person_identifier['Name_Identifier']: + person_identifier['Name_Identifier'] = corrected_isni elif person_identifier.get('Name_Identifier_Scheme', None) == 'ResearcherID (Web of Science)': # Check for incorrect ResearcherID format. if not re.search("^(https://www.researcherid.com/rid/)[A-Z]-[0-9]{4}-[0-9]{4}$", person_identifier.get('Name_Identifier', None)): corrected_researcher_id = correctify_researcher_id(person_identifier['Name_Identifier']) - # Only it an actual correction took place change the value and mark this data as 'changed'. + # Only if an actual correction took place change the value and mark this data as 'changed'. if corrected_researcher_id != person_identifier['Name_Identifier']: person_identifier['Name_Identifier'] = corrected_researcher_id elif 'Name_Identifier_Scheme' not in person_identifier: @@ -702,36 +750,3 @@ def get(src_id, dst_id): x = transformations.get(src_id) return None if x is None else x.get(dst_id) - - -def correctify_orcid(org_orcid): - """Correct illformatted ORCID.""" - # Get rid of all spaces. - orcid = org_orcid.replace(' ', '') - - # Upper-case X. - orcid = org_orcid.replace('x', 'X') - - # The last part should hold a valid id like eg: 1234-1234-1234-123X. - # If not, it is impossible to correct it to the valid orcid format - orcs = orcid.split('/') - if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcs[-1]): - # Return original value. - return org_orcid - - return "https://orcid.org/{}".format(orcs[-1]) - - -def correctify_researcher_id(org_researcher_id): - """Correct illformatted ResearcherID.""" - # Get rid of all spaces. - researcher_id = org_researcher_id.replace(' ', '') - - # The last part should hold a valid id like eg: A-1234-1234 - # If not, it is impossible to correct it to the valid ResearcherID format - orcs = researcher_id.split('/') - if not re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", orcs[-1]): - # Return original value. - return org_researcher_id - - return "https://www.researcherid.com/rid/{}".format(orcs[-1]) diff --git a/schema_transformations_utils.py b/schema_transformations_utils.py new file mode 100644 index 000000000..34904a39a --- /dev/null +++ b/schema_transformations_utils.py @@ -0,0 +1,61 @@ +import re + + +def correctify_orcid(org_orcid): + """Correct illformatted ORCID.""" + # Get rid of all spaces. + orcid = org_orcid.replace(' ', '') + + # Upper-case X. + orcid = orcid.replace('x', 'X') + + # The last part should hold a valid id like eg: 1234-1234-1234-123X. 
+ # If not, it is impossible to correct it to the valid orcid format + orcs = orcid.split('/') + if not re.search("^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]$", orcs[-1]): + return None + + return "https://orcid.org/{}".format(orcs[-1]) + + +def correctify_scopus(org_scopus): + """Correct illformatted Scopus.""" + # Get rid of all spaces. + new_scopus = org_scopus.replace(' ', '') + + if not re.search("^\d{1,11}$", new_scopus): + return None + + return new_scopus + + +def correctify_isni(org_isni): + """Correct ill-formatted ISNI.""" + # Remove all spaces. + new_isni = org_isni.replace(' ', '') + + # Upper-case X. + new_isni = new_isni.replace('x', 'X') + + # The last part should hold a valid id like eg: 123412341234123X. + # If not, it is impossible to correct it to the valid isni format + new_isni = new_isni.split('/') + if not re.search("^[0-9]{15}[0-9X]$", new_isni[-1]): + return None + + return "https://isni.org/isni/{}".format(new_isni[-1]) + + +def correctify_researcher_id(org_researcher_id): + """Correct illformatted ResearcherID.""" + # Get rid of all spaces. + researcher_id = org_researcher_id.replace(' ', '') + + # The last part should hold a valid id like eg: A-1234-1234 + # If not, it is impossible to correct it to the valid ResearcherID format + orcs = researcher_id.split('/') + if not re.search("^[A-Z]-[0-9]{4}-[0-9]{4}$", orcs[-1]): + # Return original value. + return org_researcher_id + + return "https://www.researcherid.com/rid/{}".format(orcs[-1]) diff --git a/unit-tests/test_schema_transformations.py b/unit-tests/test_schema_transformations.py new file mode 100644 index 000000000..d273365ca --- /dev/null +++ b/unit-tests/test_schema_transformations.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +"""Unit tests for the correctify functions in schema_transformations""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import sys +from unittest import TestCase + +sys.path.append('..') + +from schema_transformations_utils import correctify_isni, correctify_orcid, correctify_scopus + + +class CorrectifyIsniTest(TestCase): + def test_isni_correct_format(self): + """Test ISNI with correct format""" + isni = "https://isni.org/isni/1234123412341234" + self.assertEqual(correctify_isni(isni), isni) + + def test_isni_correct_format_containing_x(self): + """Test ISNI with correct format""" + isni = "https://isni.org/isni/123412341234123x" + correct_isni = "https://isni.org/isni/123412341234123X" + self.assertEqual(correctify_isni(isni), correct_isni) + + def test_isni_invalid_format(self): + """Test ISNI with invalid format (1 less number)""" + isni = "123412341234123" + self.assertIsNone(correctify_isni(isni)) + + def test_isni_malformed_format(self): + """Test ISNI with invalid format""" + isni = "foobar0123456789" + self.assertIsNone(correctify_isni(isni)) + + def test_isni_with_spaces(self): + """Test ISNI that contains spaces and should be corrected""" + isni = " https://isni.org/isni/123412341234123x " + corrected_isni = "https://isni.org/isni/123412341234123X" + self.assertEqual(correctify_isni(isni), corrected_isni) + + +class CorrectifyOrcidTest(TestCase): + def test_orcid_correct_format(self): + """Test ORCID with correct format""" + orcid = "https://orcid.org/1234-1234-1234-1234" + self.assertEqual(correctify_orcid(orcid), orcid) + + def test_orcid_correct_format_containing_x(self): + """Test ORCID with correct format""" + orcid = "https://orcid.org/1234-1234-1234-123x" + correct_orcid = "https://orcid.org/1234-1234-1234-123X" 
+ self.assertEqual(correctify_orcid(orcid), correct_orcid) + + def test_orcid_invalid_format(self): + """Test ORCID with invalid format (1 less number)""" + orcid = "1234-1234-1234-123" + self.assertIsNone(correctify_orcid(orcid)) + + def test_orcid_malformed_format(self): + """Test ORCID with invalid format""" + orcid = "1234-foo-bar-1234" + self.assertIsNone(correctify_orcid(orcid)) + + def test_orcid_with_spaces(self): + """Test ORCID that contains spaces and should be corrected""" + orcid = " https://orcid.org/1234-1234-1234-123x " + corrected_orcid = "https://orcid.org/1234-1234-1234-123X" + self.assertEqual(correctify_orcid(orcid), corrected_orcid) + + +class CorrectifyScopusTest(TestCase): + def test_correctify_format(self): + """Test SCOPUS with correct format""" + scopus = "12345678901" + self.assertEqual(correctify_scopus(scopus), scopus) + + def test_correctify_invalid_format(self): + """Test SCOPUS with invalid format""" + scopus = "123456789012" + self.assertIsNone(correctify_scopus(scopus)) + + def test_malformed_format(self): + """Test SCOPUS with invalid format""" + scopus = "foobar1234" + self.assertIsNone(correctify_scopus(scopus)) + + def test_orcid_with_spaces(self): + """Test SCOPUS that contains spaces and should be corrected""" + scopus = " 01234567890 " + corrected_scopus = "01234567890" + self.assertEqual(correctify_scopus(scopus), corrected_scopus) diff --git a/unit-tests/unit_tests.py b/unit-tests/unit_tests.py index a008c8607..3bd9d873e 100644 --- a/unit-tests/unit_tests.py +++ b/unit-tests/unit_tests.py @@ -9,6 +9,7 @@ from test_intake import IntakeTest from test_policies import PoliciesTest from test_revisions import RevisionTest +from test_schema_transformations import CorrectifyIsniTest, CorrectifyOrcidTest, CorrectifyScopusTest from test_util_misc import UtilMiscTest from test_util_pathutil import UtilPathutilTest from test_util_yoda_names import UtilYodaNamesTest @@ -16,6 +17,9 @@ def suite(): test_suite = TestSuite() + test_suite.addTest(makeSuite(CorrectifyIsniTest)) + test_suite.addTest(makeSuite(CorrectifyOrcidTest)) + test_suite.addTest(makeSuite(CorrectifyScopusTest)) test_suite.addTest(makeSuite(GroupImportTest)) test_suite.addTest(makeSuite(IntakeTest)) test_suite.addTest(makeSuite(PoliciesTest)) From b912d776d8615570d953cfd6ea17ee5b63f8b4a2 Mon Sep 17 00:00:00 2001 From: Leonidas Triantafyllou Date: Tue, 1 Oct 2024 08:20:04 +0200 Subject: [PATCH 42/57] YDA-5942: improve messages CSV group import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem When modifying groups with a CSV, I use groups that are already created (to add users for example) . I check the “Allow updates” button and “Process CSV”. I then get an error for existing groups even though the users have been added Solution When import groups using a CSV file, now the user can be informed with a simple, but useful descriptive message of the actions taken: A message may consist of 1 or more of the following phrases: Group '' created. Group '' already exists. Users added (). Users removed (). --- groups.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/groups.py b/groups.py index f7a477ae7..30d62e188 100644 --- a/groups.py +++ b/groups.py @@ -523,11 +523,11 @@ def api_group_process_csv(ctx, csv_header_and_data, allow_update, delete_users): return api.Error('errors', validation_errors) # Step 3: Create / update groups. 
- error = apply_data(ctx, data, allow_update, delete_users) - if len(error): - return api.Error('errors', [error]) + status_msg = apply_data(ctx, data, allow_update, delete_users) + if status_msg['status'] == 'error': + return api.Error('errors', [status_msg['message']]) - return api.Result.ok() + return api.Result.ok(info=[status_msg['message']]) def validate_data(ctx, data, allow_update): @@ -547,7 +547,7 @@ def validate_data(ctx, data, allow_update): for (category, subcategory, groupname, managers, members, viewers, _, _) in data: if group.exists(ctx, groupname) and not allow_update: - errors.append('Group "{}" already exists'.format(groupname)) + errors.append('Group "{}" already exists. It has not been updated.'.format(groupname)) # Is user admin or has category add privileges? if not (is_admin or can_add_category): @@ -569,11 +569,13 @@ def apply_data(ctx, data, allow_update, delete_users): :param allow_update: Allow updates in groups :param delete_users: Allow for deleting of users from groups - :returns: Errors if found any + :returns: Errors if found any, or message with actions if everything is successful """ for (category, subcategory, group_name, managers, members, viewers, schema_id, expiration_date) in data: new_group = False + users_added, users_removed = 0, 0 + message = '' log.write(ctx, 'CSV import - Adding and updating group: {}'.format(group_name)) @@ -584,10 +586,12 @@ def apply_data(ctx, data, allow_update, delete_users): if response: new_group = True + message += "Group '{}' created.".format(group_name) elif response.status == "error_group_exists" and allow_update: log.write(ctx, 'CSV import - WARNING: group "{}" not created, it already exists'.format(group_name)) + message += "Group '{}' already exists.".format(group_name) else: - return "Error while attempting to create group {}. Status/message: {} / {}".format(group_name, response.status, response.status_info) + return {status: 'error', message: "Error while attempting to create group {}. Status/message: {} / {}".format(group_name, response.status, response.status_info)} # Now add the users and set their role if other than member allusers = managers + members + viewers @@ -598,6 +602,7 @@ def apply_data(ctx, data, allow_update, delete_users): if response: currentrole = "normal" log.write(ctx, "CSV import - Notice: added user {} to group {}".format(username, group_name)) + users_added += 1 else: log.write(ctx, "CSV import - Warning: error occurred while attempting to add user {} to group {}".format(username, group_name)) log.write(ctx, "CSV import - Status: {} , Message: {}".format(response.status, response.status_info)) @@ -663,11 +668,21 @@ def apply_data(ctx, data, allow_update, delete_users): response = group_remove_user_from_group(ctx, username, usergroupname) if response: log.write(ctx, "CSV import - Removing user {} from group {}".format(username, usergroupname)) + users_removed += 1 else: log.write(ctx, "CSV import - Warning: error while attempting to remove user {} from group {}".format(username, usergroupname)) log.write(ctx, "CSV import - Status: {} , Message: {}".format(response.status, response.status_info)) - return '' + if users_added > 0: + message += ' Users added ({}).'.format(users_added) + if users_removed > 0: + message += ' Users removed ({}).'.format(users_removed) + + # If no users added, no users removed and not new group created. + if not users_added and not users_removed and not new_group: + message += ' No changes made.' 
+ + return {"status": "ok", "message": message} def _are_roles_equivalent(a, b): @@ -967,12 +982,15 @@ def group_create(ctx, group_name, category, subcategory, schema_id, expiration_d if not sram.sram_connect_service_collaboration(ctx, short_name): return api.Error('sram_error', 'Something went wrong connecting service to group "{}" in SRAM'.format(group_name)) + if group.exists(ctx, group_name): + return api.Error('group_exists', "Group {} not created, it already exists".format(group_name)) + response = ctx.uuGroupAdd(group_name, category, subcategory, schema_id, expiration_date, description, data_classification, co_identifier, '', '')['arguments'] status = response[8] message = response[9] if status == '0': return api.Result.ok() - elif status == '-1089000' or status == '-809000': + elif status == '-1089000' or status == '-809000' or status == '-806000': return api.Error('group_exists', "Group {} not created, it already exists".format(group_name)) else: return api.Error('policy_error', message) From 7125337d29a17c033bc4dc4cd8597b19c52fe78a Mon Sep 17 00:00:00 2001 From: Lazlo Westerhof Date: Tue, 1 Oct 2024 13:16:03 +0200 Subject: [PATCH 43/57] Schema transformation utils: add missing description and copyright notice --- schema_transformations_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/schema_transformations_utils.py b/schema_transformations_utils.py index 34904a39a..d5cf58f68 100644 --- a/schema_transformations_utils.py +++ b/schema_transformations_utils.py @@ -1,3 +1,9 @@ +# -*- coding: utf-8 -*- +"""JSON schema transformation utility functions.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + import re From e2d6acb6c8e3a45e29b7fafea35f2803d572570b Mon Sep 17 00:00:00 2001 From: Dylan Hsin Date: Wed, 23 Oct 2024 13:50:16 +0200 Subject: [PATCH 44/57] YDA-5829: troubleshooting tool for published data packages (Backport to Yoda 1.9) Co-authored-by: claravox Co-authored-by: Sirjan --- __init__.py | 49 +-- integration_tests.py | 27 ++ meta.py | 50 ++- publication.py | 27 +- publication_troubleshoot.py | 440 +++++++++++++++++++++++++++ schema_transformation.py | 41 +-- tests/features/api/api_vault.feature | 11 + tests/step_defs/api/common_vault.py | 15 + tools/troubleshoot-published-data.py | 46 +++ tools/troubleshoot_data.r | 11 + unit-tests/test_util_misc.py | 169 +++++++++- util/avu.py | 65 ++++ util/log.py | 28 +- util/misc.py | 82 +++++ 14 files changed, 982 insertions(+), 79 deletions(-) create mode 100644 publication_troubleshoot.py create mode 100644 tools/troubleshoot-published-data.py create mode 100644 tools/troubleshoot_data.r diff --git a/__init__.py b/__init__.py index f6a4e1f09..e7dae4bfa 100644 --- a/__init__.py +++ b/__init__.py @@ -24,30 +24,31 @@ # Import all modules containing rules into the package namespace, # so that they become visible to iRODS. 
-from browse import * -from folder import * -from groups import * -from json_datacite import * -from json_landing_page import * -from mail import * -from meta import * -from meta_form import * -from provenance import * -from research import * -from resources import * -from schema import * -from schema_transformation import * -from schema_transformations import * -from vault import * -from datacite import * -from epic import * -from publication import * -from policies import * -from replication import * -from revisions import * -from settings import * -from notifications import * -from integration_tests import * +from browse import * +from folder import * +from groups import * +from json_datacite import * +from json_landing_page import * +from mail import * +from meta import * +from meta_form import * +from provenance import * +from research import * +from resources import * +from schema import * +from schema_transformation import * +from schema_transformations import * +from publication_troubleshoot import * +from vault import * +from datacite import * +from epic import * +from publication import * +from policies import * +from replication import * +from revisions import * +from settings import * +from notifications import * +from integration_tests import * # Import certain modules only when enabled. from .util.config import config diff --git a/integration_tests.py b/integration_tests.py index 0e4f8edc6..e7f59fa88 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -116,6 +116,27 @@ def _test_avu_rmw_collection(ctx, rmw_attributes): return result +def _test_avu_get_attr_val_of_coll(ctx, attr, value): + # Test getting the value of an attribute on a collection + tmp_coll = _create_tmp_collection(ctx) + ctx.msi_add_avu('-c', tmp_coll, attr, value, "baz") + result = avu.get_attr_val_of_coll(ctx, tmp_coll, attr) + collection.remove(ctx, tmp_coll) + return result + + +def _test_avu_get_attr_val_of_coll_exception(ctx): + # Test that getting a non existing attribute on a collection raises an exception (True for exception raised) + tmp_coll = _create_tmp_collection(ctx) + result = False + try: + result = avu.get_attr_val_of_coll(ctx, tmp_coll, "foo") + except Exception: + result = True + collection.remove(ctx, tmp_coll) + return result + + def _test_folder_set_retry_avus(ctx): tmp_coll = _create_tmp_collection(ctx) folder.folder_secure_set_retry_avus(ctx, tmp_coll, 2) @@ -413,6 +434,12 @@ def _test_folder_secure_func(ctx, func): "check": lambda x: (("aap", "noot", "mies") in x and len([a for a in x if a[0] not in ["org_replication_scheduled"]]) == 1 )}, + {"name": "avu.get_attr_val_of_coll.exists.yes", + "test": lambda ctx: _test_avu_get_attr_val_of_coll(ctx, "foo", "bar"), + "check": lambda x: x == "bar"}, + {"name": "avu.get_attr_val_of_coll.exists.no", + "test": lambda ctx: _test_avu_get_attr_val_of_coll_exception(ctx), + "check": lambda x: x}, {"name": "avu.apply_atomic_operations.collection", "test": lambda ctx: _test_msvc_apply_atomic_operations_collection(ctx), "check": lambda x: (("foo", "bar", "baz") in x and len(x) == 1)}, diff --git a/meta.py b/meta.py index eb329a03a..a91f367e4 100644 --- a/meta.py +++ b/meta.py @@ -14,6 +14,7 @@ from deepdiff import DeepDiff import avu_json +import meta_form import provenance import publication import schema as schema_ @@ -709,4 +710,51 @@ def copy_user_metadata(ctx, source, target): log.write(ctx, "rule_copy_user_metadata: copied user metadata from <{}> to <{}>".format(source, target)) except Exception: - log.write(ctx, 
"rule_copy_user_metadata: failed to copy user metadata from <{}> to <{}>".format(source, target)) + log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}/original>".format(source, target)) + + +def vault_metadata_matches_schema(ctx, coll_name, schema_cache, report_name, write_stdout): + """Process a single data package to retrieve and validate that its metadata conforms to the schema. + + :param ctx: Combined type of a callback and rei struct + :param coll_name: String representing the data package collection path. + :param schema_cache: Dictionary storing schema blueprints, can be empty. + :param report_name: Name of report script (for logging) + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A dictionary result containing if schema matches and the schema short name. + """ + metadata_path = get_latest_vault_metadata_path(ctx, coll_name) + + if not metadata_path: + log.write(ctx, "{} skips {}, because metadata could not be found.".format(report_name, coll_name), write_stdout) + return None + + try: + metadata = jsonutil.read(ctx, metadata_path) + except Exception as exc: + log.write(ctx, "{} skips {}, because of exception while reading metadata file {}: {}".format(report_name, coll_name, metadata_path, str(exc)), write_stdout) + log.write(ctx, "vault_metadata_matches_schema: Error while reading metadata file {} of data package {}: {}".format(metadata_path, coll_name, str(exc)), write_stdout) + return None + + # Determine schema + schema_id = schema_.get_schema_id(ctx, metadata_path) + schema_shortname = schema_id.split("/")[-2] + + # Retrieve schema and cache it for future use + schema_path = schema_.get_schema_path_by_id(ctx, metadata_path, schema_id) + if schema_shortname in schema_cache: + schema_contents = schema_cache[schema_shortname] + else: + schema_contents = jsonutil.read(ctx, schema_path) + schema_cache[schema_shortname] = schema_contents + + # Check whether metadata matches schema and log any errors + error_list = get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents) + match_schema = len(error_list) == 0 + if not match_schema: + errors_formatted = [meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list] + log.write(ctx, "{}: metadata {} did not match schema {}: {}".format(report_name, metadata_path, schema_shortname, str(errors_formatted)), write_stdout) + log.write(ctx, "vault_metadata_matches_schema: Metadata {} of data package {} did not match the schema {}. 
Error list: {}".format(metadata_path, coll_name, schema_shortname, str(errors_formatted)), write_stdout) + + return {"schema": schema_shortname, "match_schema": match_schema} diff --git a/publication.py b/publication.py index 1e6b70bde..33ed10b0a 100644 --- a/publication.py +++ b/publication.py @@ -1327,7 +1327,32 @@ def rule_update_publication(ctx, vault_package, update_datacite, update_landingp :returns: "OK" if all went ok """ - return update_publication(ctx, vault_package, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes') + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is no rodsadmin", True) + return + + log.write(ctx, "[UPDATE PUBLICATIONS] Start for {}".format(vault_package), True) + collections = genquery.row_iterator( + "COLL_NAME", + "COLL_NAME like '%%/home/vault-%%' " + "AND META_COLL_ATTR_NAME = '" + constants.UUORGMETADATAPREFIX + "vault_status' " + "AND META_COLL_ATTR_VALUE = '{}'".format(str(constants.vault_package_state.PUBLISHED)), + genquery.AS_LIST, + ctx + ) + + packages_found = False + for collection in collections: + coll_name = collection[0] + if ((vault_package == '*' and re.match(r'/[^/]+/home/vault-.*', coll_name)) or (vault_package != '*' and re.match(r'/[^/]+/home/vault-.*', coll_name) and coll_name == vault_package)): + packages_found = True + output = update_publication(ctx, coll_name, update_datacite == 'Yes', update_landingpage == 'Yes', update_moai == 'Yes') + log.write(ctx, coll_name + ': ' + output, True) + + if not packages_found: + log.write(ctx, "[UPDATE PUBLICATIONS] No packages found for {}".format(vault_package), True) + else: + log.write(ctx, "[UPDATE PUBLICATIONS] Finished for {}".format(vault_package), True) def update_publication(ctx, vault_package, update_datacite=False, update_landingpage=False, update_moai=False): diff --git a/publication_troubleshoot.py b/publication_troubleshoot.py new file mode 100644 index 000000000..8f948fcbb --- /dev/null +++ b/publication_troubleshoot.py @@ -0,0 +1,440 @@ +# -*- coding: utf-8 -*- +"""Functions and rules for troubleshooting published data packages.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +__all__ = [ + 'api_batch_troubleshoot_published_data_packages', + 'rule_batch_troubleshoot_published_data_packages' +] + +import json +from datetime import datetime + +import genquery +import requests +import urllib3 + +import datacite +from meta import vault_metadata_matches_schema +from publication import get_publication_config +from util import * + + +def find_full_package_path(ctx, package_name, write_stdout): + """ + Find the full path of a data package based on its short name. + + :param ctx: Combined type of a callback and rei struct + :param package_name: The short name of the data package to find. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: The full path of the data package if found, otherwise None. 
+ """ + try: + query_condition = ( + "COLL_NAME like '%{}%'".format(package_name) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Return full package path if exists + for row in iter: + return row[0] + except Exception as e: + log.write(ctx, "find_full_package_path: An error occurred while executing the query: {}".format(e), write_stdout) + return None + + +def find_data_packages(ctx, write_stdout): + """ + Find all data packages in Retry, Unrecoverable and Unknown status by matching its AVU. + + :param ctx: Combined type of a callback and rei struct + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A list of collection names that have not been processed successfully + """ + user_zone = user.zone(ctx) + + try: + # Get all the vault packages that have org_publication_status in metadata + query_condition = ( + "COLL_NAME like '/{}/home/vault-%' AND " + "META_COLL_ATTR_NAME = '{}publication_status'".format(user_zone, constants.UUORGMETADATAPREFIX) + ) + query_attributes = "COLL_NAME" + iter = genquery.row_iterator(query_attributes, query_condition, genquery.AS_LIST, ctx) + + # Collecting only the collection names + return [row[0] for row in iter] + + except Exception as e: + log.write(ctx, "find_data_packages: An error occurred while executing the query: {}".format(e), write_stdout) + return [] + + +def check_print_data_package_system_avus(ctx, data_package, write_stdout): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e, 'org_'). + This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. + This also prints if there are any missing or unexpected results. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A 2-tuple containing boolean results of checking results + """ + extracted_avus = avu.of_coll(ctx, data_package) + results = misc.check_data_package_system_avus(extracted_avus) + + if not results["no_missing_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some missing AVUs in data package <{}> - {}".format(data_package, list(results["missing_avus"])), write_stdout) + + if not results["no_unexpected_avus"]: + log.write(ctx, "check_data_package_system_avus: There are some unexpected AVUs in data package <{}> - {}".format(data_package, list(results["unexpected_avus"])), write_stdout) + + return (results["no_missing_avus"], results["no_unexpected_avus"]) + + +def check_one_datacite_doi_reg(ctx, data_package, doi_name, write_stdout): + try: + doi = get_val_for_attr_with_pub_prefix(ctx, data_package, doi_name) + except ValueError as e: + log.write(ctx, "check_datacite_doi_registration: Error while trying to get {} - {}".format(doi_name, e), write_stdout) + return False + + status_code = datacite.metadata_get(ctx, doi) + return status_code == 200 + + +def check_datacite_doi_registration(ctx, data_package, write_stdout): + """ + Check the registration status of both versionDOI and baseDOI with the DataCite API, + ensuring that both DOIs return a 200 status code, which indicates successful registration. 
+ + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A tuple of booleans indicating check success or not (base doi check may be None if not relevant). + """ + version_doi_check = check_one_datacite_doi_reg(ctx, data_package, "versionDOI", write_stdout) + + previous_version = '' + try: + previous_version = get_val_for_attr_with_pub_prefix(ctx, data_package, "previous_version") + except Exception: + pass + + if previous_version: + base_doi_check = check_one_datacite_doi_reg(ctx, data_package, "baseDOI", write_stdout) + return version_doi_check, base_doi_check + + return (version_doi_check, None) + + +def get_val_for_attr_with_pub_prefix(ctx, data_package, attribute_suffix): + """ + Retrieves the value given the suffix of the attribute from a data package. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param attribute_suffix: Suffix of the attribute before adding prefix such as "org_publication_" + + :returns: Value of the attribute. + """ + attr = constants.UUORGMETADATAPREFIX + "publication_" + attribute_suffix + return avu.get_attr_val_of_coll(ctx, data_package, attr) + + +def get_landingpage_paths(ctx, data_package, write_stdout): + """Given a data package get what the path and remote url should be""" + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPagePath") + url = get_val_for_attr_with_pub_prefix(ctx, data_package, "landingPageUrl") + return file_path, url + + except Exception: + log.write(ctx, "get_landingpage_paths: Could not find landing page for data package: {}".format(data_package), write_stdout) + return '', '' + + +def compare_local_remote_landingpage(ctx, file_path, url, offline, api_call): + """ + Compares file contents between a file in irods and its remote version to verify their integrity. 
+ + :param ctx: Combined type of a callback and rei struct + :param file_path: Path to file in irods + :param url: URL of file on remote + :param offline: Whether to skip requests.get call + :param api_call: Boolean representing whether was called by api and not a script + + :returns: True if the file contents match, False otherwise + """ + write_stdout = not api_call + # Local/irods file + if api_call: + # If called by technicaladmin, only check that the file exists since we don't have access to the contents + return data_object.exists(ctx, file_path) + else: + try: + local_data = data_object.read(ctx, file_path) + except Exception: + log.write(ctx, "compare_local_remote_landingpage: Local file not found at path {}.".format(file_path), write_stdout) + return False + + if offline: + return len(local_data) > 0 + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "compare_local_remote_landingpage: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "compare_local_remote_landingpage: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "compare_local_remote_landingpage: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Set encoding to utf-8 for the response text (otherwise will not match local_data) + response.encoding = 'utf-8' + + if local_data == response.text: + return True + + log.write(ctx, "compare_local_remote_landingpage: File contents at irods path <{}> and remote landing page <{}> do not match.".format(file_path, url), write_stdout) + return False + + +def check_landingpage(ctx, data_package, offline, api_call): + """ + Checks the integrity of landing page by comparing the contents + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. + :param offline: Whether to skip any checks that require external server access + :param api_call: Boolean of whether this is for an api call version of the troubleshooting script + + :returns: A tuple containing boolean results of checking + """ + irods_file_path, landing_page_url = get_landingpage_paths(ctx, data_package, not api_call) + if len(irods_file_path) == 0 or len(landing_page_url) == 0: + return False + + return compare_local_remote_landingpage(ctx, irods_file_path, landing_page_url, offline, api_call) + + +def check_combi_json(ctx, data_package, publication_config, offline, write_stdout): + """ + Checks the integrity of combi JSON by checking URL and existence of file. + + :param ctx: Combined type of a callback and rei struct + :param data_package: String representing the data package collection path. 
+ :param publication_config: Dictionary of publication config + :param offline: Whether to skip any checks that require external server access + :param write_stdout: A boolean representing whether to write to stdout or rodsLog + + :returns: A tuple containing boolean results of checking + """ + # Check that the combi json in irods exists + file_path = '' + try: + file_path = get_val_for_attr_with_pub_prefix(ctx, data_package, "combiJsonPath") + except Exception: + pass + exists = data_object.exists(ctx, file_path) + if not exists: + log.write(ctx, "check_combi_json: combi JSON file in irods does not exist: {}".format(file_path), write_stdout) + return False + + if offline: + return True + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + # Get the version doi + version_doi = '' + try: + version_doi = get_val_for_attr_with_pub_prefix(ctx, data_package, "versionDOI") + except Exception: + pass + url = "https://{}/oai/oai?verb=GetRecord&metadataPrefix=oai_datacite&identifier=oai:{}".format(publication_config["publicVHost"], version_doi) + try: + response = requests.get(url, verify=False) + except requests.exceptions.ConnectionError as e: + log.write(ctx, "check_combi_json: Failed to connect to {}".format(url), write_stdout) + log.write(ctx, "check_combi_json: Error: {}".format(e), write_stdout) + return False + + if response.status_code != 200: + log.write(ctx, "check_combi_json: Error {} when connecting to <{}>.".format(response.status_code, url), write_stdout) + return False + + # Look at the first few parts of the response for signs of error. + if "idDoesNotExist" in response.text[:5000]: + log.write(ctx, "check_combi_json: combiJson not found in oai for data package <{}>".format(data_package), write_stdout) + return False + + return True + + +def print_troubleshoot_result(ctx, data_package, result, datacite_check): + """Print the result of troubleshooting one package in human-friendly format""" + pass_all_tests = all(result.values()) + + log.write(ctx, "Results for: {}".format(data_package), True) + if pass_all_tests: + log.write(ctx, "Package passed all tests.", True) + else: + log.write(ctx, "Package FAILED one or more tests:", True) + log.write(ctx, "Schema matches: {}".format(result['schema_check']), True) + log.write(ctx, "All expected AVUs exist: {}".format(result['no_missing_AVUs_check']), True) + log.write(ctx, "No unexpected AVUs: {}".format(result['no_unexpected_AVUs_check']), True) + + if datacite_check: + log.write(ctx, "Version DOI matches: {}".format(result['versionDOI_check']), True) + if 'baseDOI_check' in result: + log.write(ctx, "Base DOI matches: {}".format(result['baseDOI_check']), True) + + log.write(ctx, "Landing page matches: {}".format(result['landingPage_check']), True) + log.write(ctx, "Combined JSON matches: {}".format(result['combiJson_check']), True) + + log.write(ctx, "", True) + + +def collect_troubleshoot_data_packages(ctx, requested_package, write_stdout): + data_packages = [] + + if requested_package == 'None': + # Retrieve all data packages + all_packages = find_data_packages(ctx, write_stdout) + if not all_packages: + log.write(ctx, "collect_troubleshoot_data_packages: No packages found.", write_stdout) + return None + + data_packages = all_packages + else: + # Get full path of the given package + full_package_path = find_full_package_path(ctx, requested_package, write_stdout) + + if not full_package_path: + log.write(ctx, "collect_troubleshoot_data_packages: Data package '{}' cannot be found.".format(requested_package), 
write_stdout) + return None + + data_packages.append(full_package_path) + + return data_packages + + +def batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, api_call, check_datacite): + """ + Troubleshoots published data packages. + + :param ctx: Context that combines a callback and rei struct. + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A boolean representing to write results in log. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. + :param api_call: Boolean of whether this is run by a script or api test. + :param check_datacite: Boolean representing whether to do the datacite checks + + :returns: A dictionary of dictionaries providing the results of the job. + """ + write_stdout = not api_call + # Check permissions - rodsadmin only + if user.user_type(ctx) != 'rodsadmin': + log.write(ctx, "User is not rodsadmin", write_stdout) + return {} + + data_packages = collect_troubleshoot_data_packages(ctx, requested_package, write_stdout) + if not data_packages: + return {} + schema_cache = {} + results = {} + + # Troubleshooting + for data_package in data_packages: + log.write(ctx, "Troubleshooting data package: {}".format(data_package), write_stdout) + result = {} + # Cannot check the metadata as technicaladmin + if not api_call: + schema_check_dict = vault_metadata_matches_schema(ctx, data_package, schema_cache, "troubleshoot-publications", write_stdout) + result['schema_check'] = schema_check_dict['match_schema'] if schema_check_dict else False + + result['no_missing_AVUs_check'], result['no_unexpected_AVUs_check'] = check_print_data_package_system_avus(ctx, data_package, write_stdout) + + # Only check datacite if enabled + if check_datacite: + result['versionDOI_check'], base_doi_check = check_datacite_doi_registration(ctx, data_package, write_stdout) + if base_doi_check is not None: + result['baseDOI_check'] = base_doi_check + + result['landingPage_check'] = check_landingpage(ctx, data_package, offline, api_call) + publication_config = get_publication_config(ctx) + result['combiJson_check'] = check_combi_json(ctx, data_package, publication_config, offline, write_stdout) + + results[data_package] = result + + if not api_call: + print_troubleshoot_result(ctx, data_package, result, check_datacite) + + if log_file: + log_loc = "/var/lib/irods/log/troubleshoot_publications.log" + with open(log_loc, "a") as writer: + writer.writelines("Batch run date and time: {}".format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))) + writer.writelines('\n') + writer.writelines("Troubleshooting data package: {}".format(data_package)) + writer.writelines('\n') + json.dump(result, writer) + writer.writelines('\n') + + return results + + +@api.make() +def api_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline): + """ + Wrapper for the batch script for troubleshooting published data packages. + Runs a subset of the tests since "technicaladmin" is usually more restricted than "rods". + + :param ctx: Combined type of a callback and rei struct + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A boolean representing to write results in log. + :param offline: A boolean representing whether to perform all checks without connecting to external servers. 
+ + :returns: A dictionary of dictionaries providing the results of the job. + """ + return batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, True, False) + + +@rule.make(inputs=[0, 1, 2, 3], outputs=[]) +def rule_batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, no_datacite): + """ + Troubleshoots published data packages. + + Prints results of the following checks: + 1. Metadata schema compliance. + 2. Presence and correctness of expected AVUs. + 3. Registration with Data Cite. + 4. File integrity of landing page and combi JSON files. + + Operates on either a single specified package or all published packages, depending on the input. + + :param ctx: Context that combines a callback and rei struct. + :param requested_package: A string representing a specific data package path or all packages with failed publications. + :param log_file: A string boolean representing to write results in log. + :param offline: A string boolean representing whether to perform all checks without connecting to external servers. + :param no_datacite: A string boolean representing whether to skip the datacite checks + """ + offline = offline == "True" + log_file = log_file == "True" + check_datacite = no_datacite == "False" + + batch_troubleshoot_published_data_packages(ctx, requested_package, log_file, offline, False, check_datacite) diff --git a/schema_transformation.py b/schema_transformation.py index 35bc35dd9..e4ef569b7 100644 --- a/schema_transformation.py +++ b/schema_transformation.py @@ -19,7 +19,6 @@ import session_vars import meta -import meta_form import schema import schema_transformations from util import * @@ -405,41 +404,13 @@ def rule_batch_vault_metadata_schema_report(ctx): genquery.AS_LIST, ctx) for row in iter: - coll_name = row[0] - metadata_path = meta.get_latest_vault_metadata_path(ctx, coll_name) - - if metadata_path == '' or metadata_path is None: - log.write(ctx, "Vault metadata schema report skips %s, because metadata could not be found." - % (coll_name)) - continue - try: - metadata = jsonutil.read(ctx, metadata_path) - except Exception as exc: - log.write(ctx, "Vault metadata report skips %s, because of exception while reading metadata file %s: %s." 
-                      % (coll_name, metadata_path, str(exc)))
+        try:
+            coll_name = row[0]
+            result = meta.vault_metadata_matches_schema(ctx, coll_name, schema_cache, "Vault metadata schema report", True)
+            if result:
+                results[coll_name] = result
+        except Exception as e:
+            log.write(ctx, "Error processing collection {}: {}".format(coll_name, str(e)))
             continue
 
-        # Determine schema
-        schema_id = schema.get_schema_id(ctx, metadata_path)
-        schema_shortname = schema_id.split("/")[-2]
-
-        # Retrieve schema and cache it for future use
-        schema_path = schema.get_schema_path_by_id(ctx, metadata_path, schema_id)
-        if schema_shortname in schema_cache:
-            schema_contents = schema_cache[schema_shortname]
-        else:
-            schema_contents = jsonutil.read(ctx, schema_path)
-            schema_cache[schema_shortname] = schema_contents
-
-        # Check whether metadata matches schema and log any errors
-        error_list = meta.get_json_metadata_errors(ctx, metadata_path, metadata=metadata, schema=schema_contents)
-        match_schema = len(error_list) == 0
-        if not match_schema:
-            log.write(ctx, "Vault metadata schema report: metadata %s did not match schema %s: %s" %
-                      (metadata_path, schema_shortname, str([meta_form.humanize_validation_error(e).encode('utf-8') for e in error_list])))
-
-        # Update results
-        results[coll_name] = {"schema": schema_shortname, "match_schema": match_schema}
-
     return json.dumps(results)
diff --git a/tests/features/api/api_vault.feature b/tests/features/api/api_vault.feature
index 0039a709b..4ed3d018b 100644
--- a/tests/features/api/api_vault.feature
+++ b/tests/features/api/api_vault.feature
@@ -98,6 +98,17 @@ Feature: Vault API
             | /tempZone/home/vault-default-2 |
             | /tempZone/home/vault-core-2    |
             | /tempZone/home/vault-default-3 |
+
+
+    Scenario Outline: Published vault package passes troubleshooting script checks
+        Given user technicaladmin is authenticated
+        And data package exists in <vault>
+        Then data package in <vault> passes troubleshooting script checks
+
+        Examples:
+            | vault                           |
+            | /tempZone/home/vault-default-2  |
+            | /tempZone/home/vault-default-3  |
 
 
     Scenario Outline: Vault preservable formats lists
diff --git a/tests/step_defs/api/common_vault.py b/tests/step_defs/api/common_vault.py
index 2cfa8fa55..9b2706221 100644
--- a/tests/step_defs/api/common_vault.py
+++ b/tests/step_defs/api/common_vault.py
@@ -174,6 +174,21 @@ def data_package_status(user, vault, data_package, status):
     raise AssertionError()
 
 
+@then(parsers.parse('data package in {vault} passes troubleshooting script checks'))
+def api_vault_batch_troubleshoot(user, vault, data_package):
+    http_status, result = api_request(
+        user,
+        "batch_troubleshoot_published_data_packages",
+        {"requested_package": data_package, "log_file": True, "offline": True}
+    )
+    assert http_status == 200
+    data = result['data']
+    assert len(data) == 1
+    # Confirm that all checks passed for this data package
+    for checks in data.values():
+        assert all(checks.values())
+
+
 @then('preservable formats lists are returned')
 def preservable_formats_lists(api_response):
     http_status, body = api_response
diff --git a/tools/troubleshoot-published-data.py b/tools/troubleshoot-published-data.py
new file mode 100644
index 000000000..bba14bc72
--- /dev/null
+++ b/tools/troubleshoot-published-data.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""This script collects all published packages and checks that they have all the required info.
+ +Example: +To check all published packages: +python3 troubleshoot-published-data.py + +To check one specific package by name: +python3 troubleshoot-published-data.py -p research-initial[1725262507] + +To put results into a log file and complete the checks offline: +python3 troubleshoot-published-data.py -l -o +""" +import argparse +import subprocess + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="troubleshoot-published-data.py", + description=__doc__, + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("-l", "--log-file", action='store_true', + help="If log file parameter is true then write to log at: /var/lib/irods/log/troubleshoot_publications.log") + parser.add_argument("-o", "--offline", action='store_true', + help="If actions should be performed without connecting to external servers (needed for the Yoda team's development setup).") + parser.add_argument("-n", "--no-datacite", action='store_true', + help="If datacite check should be skipped (needed for the Yoda team's development environment in some cases).") + parser.add_argument("-p", "--package", type=str, required=False, + help="Troubleshoot a specific data package by name (default: troubleshoot all packages)") + return parser.parse_args() + + +def main(): + args = parse_args() + rule_name = "/etc/irods/yoda-ruleset/tools/troubleshoot_data.r" + data_package = f"*data_package={args.package}" + log_loc = f"*log_loc={args.log_file if args.log_file else ''}" + offline = f"*offline={args.offline}" + no_datacite = f"*no_datacite={args.no_datacite}" + subprocess.call(['irule', '-r', 'irods_rule_engine_plugin-python-instance', '-F', + rule_name, data_package, log_loc, offline, no_datacite]) + + +if __name__ == '__main__': + main() diff --git a/tools/troubleshoot_data.r b/tools/troubleshoot_data.r new file mode 100644 index 000000000..3caac4671 --- /dev/null +++ b/tools/troubleshoot_data.r @@ -0,0 +1,11 @@ +#!/usr/bin/irule -r irods_rule_engine_plugin-python-instance -F + +def main(rule_args, callback, rei): + data_package = global_vars["*data_package"].strip('"') + log_loc = global_vars["*log_loc"].strip('"') + offline = global_vars["*offline"].strip('"') + no_datacite = global_vars["*no_datacite"].strip('"') + callback.rule_batch_troubleshoot_published_data_packages(data_package, log_loc, offline, no_datacite) + +INPUT *data_package="", *log_loc="", *offline="", *no_datacite="" +OUTPUT ruleExecOut diff --git a/unit-tests/test_util_misc.py b/unit-tests/test_util_misc.py index cddbe5fcd..5962f2164 100644 --- a/unit-tests/test_util_misc.py +++ b/unit-tests/test_util_misc.py @@ -6,16 +6,181 @@ import sys import time -from collections import OrderedDict +from collections import namedtuple, OrderedDict from unittest import TestCase sys.path.append('../util') -from misc import human_readable_size, last_run_time_acceptable, remove_empty_objects +from misc import check_data_package_system_avus, human_readable_size, last_run_time_acceptable, remove_empty_objects + +# AVs of a successfully published data package, that is the first version of the package +avs_success_data_package = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/ICGVFV-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/ICGVFV-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": 
"/tempZone/yoda/publication/ICGVFV.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/ICGVFV.html", + "org_publication_lastModifiedDateTime": "2024-10-04T15:32:46.000000", + "org_publication_license": "Creative Commons Attribution 4.0 International Public License", + "org_publication_licenseUri": "https://creativecommons.org/licenses/by/4.0/legalcode", + "org_publication_oaiUploaded": "yes", + "org_publication_publicationDate": "2024-10-04T15:33:17.853806", + "org_publication_randomId": "ICGVFV", + "org_publication_status": "OK", + "org_publication_submission_actor": "researcher#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-default-3/research-default-3[1728048679]", + "org_publication_versionDOI": "10.00012/UU01-ICGVFV", + "org_publication_versionDOIMinted": "yes", +} + +avs_success_data_package_multiversion = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_baseDOI": "10.00012/UU01-X0GU3S", + "org_publication_baseDOIMinted": "yes", + "org_publication_baseRandomId": "X0GU3S", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/YU0JDH-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/YU0JDH-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/YU0JDH.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/YU0JDH.html", + "org_publication_lastModifiedDateTime": "2024-10-11T08:49:17.000000", + "org_publication_license": "Custom", + "org_publication_oaiUploaded": "yes", + "org_publication_previous_version": "/tempZone/home/vault-initial1/new-group01[1728550839]", + "org_publication_publicationDate": "2024-10-11T08:50:01.812220", + "org_publication_randomId": "YU0JDH", + "org_publication_status": "OK", + "org_publication_submission_actor": "datamanager#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728629336]", + "org_publication_versionDOI": "10.00012/UU01-YU0JDH", + "org_publication_versionDOIMinted": "yes" +} + +avs_success_data_package_multiversion_first = { + "org_publication_accessRestriction": "Open - freely retrievable", + "org_publication_anonymousAccess": "yes", + "org_publication_approval_actor": "datamanager#tempZone", + "org_publication_baseDOI": "10.00012/UU01-X0GU3S", + "org_publication_baseRandomId": "X0GU3S", + "org_publication_combiJsonPath": "/tempZone/yoda/publication/T8D8QU-combi.json", + "org_publication_dataCiteJsonPath": "/tempZone/yoda/publication/T8D8QU-dataCite.json", + "org_publication_dataCiteMetadataPosted": "yes", + "org_publication_landingPagePath": "/tempZone/yoda/publication/T8D8QU.html", + "org_publication_landingPageUploaded": "yes", + "org_publication_landingPageUrl": "https://public.yoda.test/allinone/UU01/T8D8QU.html", + "org_publication_lastModifiedDateTime": "2024-10-10T09:06:05.000000", + "org_publication_license": "Creative Commons Attribution 4.0 International Public License", + "org_publication_licenseUri": "https://creativecommons.org/licenses/by/4.0/legalcode", + "org_publication_next_version": "/tempZone/home/vault-initial1/new-group01[1728545387]", + "org_publication_oaiUploaded": "yes", + "org_publication_publicationDate": "2024-10-10T09:06:02.177810", + 
"org_publication_randomId": "T8D8QU", + "org_publication_status": "OK", + "org_publication_submission_actor": "datamanager#tempZone", + "org_publication_vaultPackage": "/tempZone/home/vault-initial1/new-group01[1728543897]", + "org_publication_versionDOI": "10.00012/UU01-T8D8QU", + "org_publication_versionDOIMinted": "yes", +} + +# From avu.py +Avu = namedtuple('Avu', list('avu')) +Avu.attr = Avu.a +Avu.value = Avu.v +Avu.unit = Avu.u class UtilMiscTest(TestCase): + def test_check_data_package_system_avus(self): + # Success + avs = avs_success_data_package + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, extra optional avu + avs['org_publication_baseDOIAvailable'] = 'yes' + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + del avs['org_publication_baseDOIAvailable'] + + # Missing license Uri for non-custom license + del avs['org_publication_licenseUri'] + avus_missing_license_uri = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing_license_uri) + self.assertFalse(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Custom license, no license Uri (happy flow) + avs['org_publication_license'] = "Custom" + avus_custom_license = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_custom_license) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Unexpected + avs['org_publication_userAddedSomethingWeird'] = "yodayoda:)" + avus_unexpected = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_unexpected) + self.assertTrue(result['no_missing_avus']) + self.assertFalse(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 1) + + # Missing and unexpected + del avs['org_publication_landingPagePath'] + avus_missing_unexpected = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing_unexpected) + self.assertFalse(result['no_missing_avus']) + self.assertFalse(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 1) + + # Missing + del avs['org_publication_userAddedSomethingWeird'] + avus_missing = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_missing) + self.assertFalse(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 1) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, latest version of a publication + avs = avs_success_data_package_multiversion + avus_success = [Avu(attr, 
val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + + # Success, first version of a publication that has had other versions + avs = avs_success_data_package_multiversion_first + avus_success = [Avu(attr, val, "") for attr, val in avs.items()] + result = check_data_package_system_avus(avus_success) + self.assertTrue(result['no_missing_avus']) + self.assertTrue(result['no_unexpected_avus']) + self.assertTrue(len(result['missing_avus']) == 0) + self.assertTrue(len(result['unexpected_avus']) == 0) + def test_last_run_time_acceptable(self): """Test the last run time for copy to vault""" # No last run time (job hasn't be tried before) diff --git a/util/avu.py b/util/avu.py index e92876d9b..44836722a 100644 --- a/util/avu.py +++ b/util/avu.py @@ -35,6 +35,71 @@ def of_data(ctx, path): "COLL_NAME = '{}' AND DATA_NAME = '{}'".format(*pathutil.chop(path)))) +def get_attr_val_of_coll(ctx, coll, attr): + """Get the value corresponding to an attr for a given collection.""" + iter = genquery.Query( + ctx, + "META_COLL_ATTR_VALUE", + "META_COLL_ATTR_NAME = '{}' AND COLL_NAME = '{}'".format(attr, coll)) + + for row in iter: + return row + raise ValueError("Attribute {} not found in AVUs of collection {}".format(attr, coll)) + + +def inside_coll(ctx, path, recursive=False): + """Get a list of all AVUs inside a collection with corresponding paths. + + Note: the returned value is a generator / lazy list, so that large + collections can be handled without keeping everything in memory. + use list(...) on the result to get an actual list if necessary. + + The returned paths are absolute paths (e.g. '/tempZone/home/x'). 
+ + :param ctx: Combined type of a callback and rei struct + :param path: Path of collection + :param recursive: List AVUs recursively + + :returns: List of all AVUs inside a collection with corresponding paths + """ + # coll+name -> path + def to_absolute(row, type): + if type == "collection": + return (row[1], type, row[2], row[3], row[4]) + else: + return ('{}/{}'.format(row[0], row[1]), type, row[2], row[3], row[4]) + + collection_root = genquery.row_iterator( + "COLL_PARENT_NAME, COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE, META_COLL_ATTR_UNITS", + "COLL_PARENT_NAME = '{}'".format(path), + genquery.AS_LIST, ctx) + collection_root = itertools.imap(lambda x: to_absolute(x, "collection"), collection_root) + + data_objects_root = genquery.row_iterator( + "COLL_NAME, DATA_NAME, META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, META_DATA_ATTR_UNITS", + "COLL_NAME = '{}'".format(path), + genquery.AS_LIST, ctx) + data_objects_root = itertools.imap(lambda x: to_absolute(x, "data_object"), data_objects_root) + + if not recursive: + return itertools.chain(collection_root, data_objects_root) + + collection_sub = genquery.row_iterator( + "COLL_PARENT_NAME, COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE, META_COLL_ATTR_UNITS", + "COLL_PARENT_NAME like '{}/%'".format(path), + genquery.AS_LIST, ctx) + collection_sub = itertools.imap(lambda x: to_absolute(x, "collection"), collection_sub) + + data_objects_sub = genquery.row_iterator( + "COLL_NAME, DATA_NAME, META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, META_DATA_ATTR_UNITS", + "COLL_NAME like '{}/%'".format(path), + genquery.AS_LIST, ctx) + data_objects_sub = itertools.imap(lambda x: to_absolute(x, "data_object"), data_objects_sub) + + return itertools.chain(collection_root, data_objects_root, collection_sub, data_objects_sub) +>>>>>>> 076ca8f0 (YDA-5829: troubleshooting tool for published data packages) + + def of_group(ctx, group): """Get (a,v,u) triplets for a given group.""" return itertools.imap(lambda x: Avu(*x), diff --git a/util/log.py b/util/log.py index ab96e96f9..729f8fdd5 100644 --- a/util/log.py +++ b/util/log.py @@ -17,15 +17,20 @@ import user -def write(ctx, message): - """Write a message to the log, including client name and originating module. +def write(ctx, message, write_stdout=False): + """Write a message to the log or stdout. + Includes client name and originating module if writing to log. - :param ctx: Combined type of a callback and rei struct - :param message: Message to write to log + :param ctx: Combined type of a callback and rei struct + :param message: Message to write to log + :param write_stdout: Whether to write to stdout (used for a few of our scripts) """ - stack = inspect.stack()[1] - module = inspect.getmodule(stack[0]) - _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) + if write_stdout: + ctx.writeLine("stdout", message) + else: + stack = inspect.stack()[1] + module = inspect.getmodule(stack[0]) + _write(ctx, '[{}] {}'.format(module.__name__.replace("rules_uu.", ""), message)) def _write(ctx, message): @@ -40,15 +45,6 @@ def _write(ctx, message): ctx.writeLine('serverLog', message) -def write_stdout(ctx, message): - """Write a message to stdout. Used for some of our scripts. - - :param ctx: Combined type of a callback and rei struct - :param message: Message to write to log - """ - ctx.writeLine("stdout", message) - - def debug(ctx, message): """"Write a message to the log, if in a development environment. 
diff --git a/util/misc.py b/util/misc.py index 12df2a0af..6c1e54623 100644 --- a/util/misc.py +++ b/util/misc.py @@ -8,6 +8,88 @@ import time from collections import OrderedDict +import constants + + +def check_data_package_system_avus(extracted_avus): + """ + Checks whether a data package has the expected system AVUs that start with constants.UUORGMETADATAPREFIX (i.e, 'org_'). + This function compares the AVUs of the provided data package against a set of ground truth AVUs derived from + a successfully published data package. + + :param extracted_avus: AVUs of the data package in AVU form + + :returns: Dictionary of the results of the check + """ + # Filter those starting with 'org_publication' + extracted_avs = {} + for m in extracted_avus: + if m.attr.startswith(constants.UUORGMETADATAPREFIX + 'publication_'): + extracted_avs[m.attr] = m.value + extracted_attrs = set(extracted_avs.keys()) + + # Define the set of ground truth AVUs + avu_names_suffix = { + 'approval_actor', 'randomId', + 'versionDOI', 'dataCiteJsonPath', 'license', + 'anonymousAccess', 'versionDOIMinted', + 'accessRestriction', 'landingPagePath', + 'publicationDate', + 'vaultPackage', 'submission_actor', 'status', + 'lastModifiedDateTime', 'combiJsonPath', + 'landingPageUploaded', 'oaiUploaded', + 'landingPageUrl', 'dataCiteMetadataPosted' + } + + # If the license is not Custom, it must have a licenseUri + if constants.UUORGMETADATAPREFIX + 'publication_license' in extracted_attrs: + if extracted_avs[constants.UUORGMETADATAPREFIX + 'publication_license'] != "Custom": + avu_names_suffix.add('licenseUri') + + # Define additional set of AVUs with more than one version of publication + avu_names_version_suffix = { + 'previous_version', 'baseDOI', 'baseRandomId', + 'baseDOIMinted' + } + + # Define additional set of AVUs expected for the first version of a publication, when there are multiple versions + avu_names_first_version_suffix = { + 'baseRandomId', 'baseDOI', 'next_version' + } + + # for the second version, all we need is next_version in addition to avu_names_version_suffix + avu_names_previous_version_suffix = {'next_version'} + + # optional avus + avu_names_optional_suffix = { + 'versionDOIAvailable', 'baseDOIAvailable' + } + + combined_avu_names_suffix = avu_names_suffix + + if constants.UUORGMETADATAPREFIX + 'publication_previous_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_version_suffix) + if constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_previous_version_suffix) + elif constants.UUORGMETADATAPREFIX + 'publication_next_version' in extracted_attrs: + combined_avu_names_suffix.update(avu_names_first_version_suffix) + + ground_truth_avus = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + combined_avu_names_suffix.update(avu_names_optional_suffix) + ground_truth_avus_with_optional = {"{}publication_{}".format(constants.UUORGMETADATAPREFIX, name) for name in combined_avu_names_suffix} + # Find missing and unexpected AVUs + missing_avus = ground_truth_avus - extracted_attrs + unexpected_avus = extracted_attrs - ground_truth_avus_with_optional + + results = { + 'no_missing_avus': not bool(missing_avus), + 'missing_avus': list(missing_avus), + 'no_unexpected_avus': not bool(unexpected_avus), + 'unexpected_avus': list(unexpected_avus) + } + + return results + def last_run_time_acceptable(coll, found, last_run, config_backoff_time): """Return whether the last run 
time is acceptable to continue with task.""" From f5deb3ec92cc11731e91ff2a5605436c178f5777 Mon Sep 17 00:00:00 2001 From: claravox Date: Mon, 28 Oct 2024 14:43:53 +0100 Subject: [PATCH 45/57] YDA-5829: Unicode landing page comparison (Backport to Yoda 1.9) --- publication_troubleshoot.py | 4 +++- util/avu.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/publication_troubleshoot.py b/publication_troubleshoot.py index 8f948fcbb..6ceafe737 100644 --- a/publication_troubleshoot.py +++ b/publication_troubleshoot.py @@ -205,9 +205,11 @@ def compare_local_remote_landingpage(ctx, file_path, url, offline, api_call): return False # Set encoding to utf-8 for the response text (otherwise will not match local_data) + # response.text is then returned as unicode response.encoding = 'utf-8' + local_data_uni = local_data.decode("utf-8") - if local_data == response.text: + if local_data_uni == response.text: return True log.write(ctx, "compare_local_remote_landingpage: File contents at irods path <{}> and remote landing page <{}> do not match.".format(file_path, url), write_stdout) diff --git a/util/avu.py b/util/avu.py index 44836722a..7d4bfcfb4 100644 --- a/util/avu.py +++ b/util/avu.py @@ -97,7 +97,6 @@ def to_absolute(row, type): data_objects_sub = itertools.imap(lambda x: to_absolute(x, "data_object"), data_objects_sub) return itertools.chain(collection_root, data_objects_root, collection_sub, data_objects_sub) ->>>>>>> 076ca8f0 (YDA-5829: troubleshooting tool for published data packages) def of_group(ctx, group): From 0e2ec77948b1b621e8a100c53a9fff39bcd640e6 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Tue, 3 Dec 2024 17:09:17 +0100 Subject: [PATCH 46/57] CI: fix integration/API tests for 1.9 RC versions Configure API/integration tests to run on the Yoda 1.9 container images for Yoda 1.9.x RC versions. --- .github/workflows/api-and-integration-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/api-and-integration-tests.yml b/.github/workflows/api-and-integration-tests.yml index 436be2c6a..55d02ba0e 100644 --- a/.github/workflows/api-and-integration-tests.yml +++ b/.github/workflows/api-and-integration-tests.yml @@ -34,7 +34,7 @@ jobs: # For other branches, we use the Docker setup of the development branch. 
- name: Determine Yoda repository branch run: | - if [ "${{ steps.extract_branch.outputs.branch }}" = "release-1.9" ]; then + if [ "${{ steps.extract_branch.outputs.branch }}" = "release-1.9" ] || [[ "${{ steps.extract_branch.outputs.branch }}" == rc-1.9.* ]]; then echo "branch=release-1.9" >> $GITHUB_OUTPUT else echo "branch=development" >> $GITHUB_OUTPUT From 9bb667eafe43a0ca51f9bdf8a2a7ebba6f3634fe Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Fri, 12 Apr 2024 14:36:48 +0200 Subject: [PATCH 47/57] YDA-5241 Add vault stat microservice --- util/msi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/util/msi.py b/util/msi.py index 68393f95c..fa8410977 100644 --- a/util/msi.py +++ b/util/msi.py @@ -111,6 +111,7 @@ def _make_exception(name, message): get_icat_time, GetIcatTimeError = make('GetIcatTime', 'Could not get Icat time') get_obj_type, GetObjTypeError = make('GetObjType', 'Could not get object type') mod_avu_metadata, ModAVUMetadataError = make('ModAVUMetadata', 'Could not modify AVU metadata') +stat_vault, MSIStatVaultError = make("_stat_vault", 'Could not stat file system object in vault.') archive_create, ArchiveCreateError = make('ArchiveCreate', 'Could not create archive') archive_index, ArchiveIndexError = make('ArchiveIndex', 'Could not index archive') From ab484ba44d1ea91b690dfb8eca9973864fb94651 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Thu, 4 Jul 2024 14:58:35 +0200 Subject: [PATCH 48/57] Add file checksum and dir list integration tests Co-authored-by: Lazlo Westerhof --- integration_tests.py | 72 ++++++++++++++++++++++++++++++++++++++++++++ util/msi.py | 2 ++ 2 files changed, 74 insertions(+) diff --git a/integration_tests.py b/integration_tests.py index e7f59fa88..ea7fb6848 100644 --- a/integration_tests.py +++ b/integration_tests.py @@ -381,6 +381,36 @@ def _test_folder_secure_func(ctx, func): {"name": "msvc.msi_vault_stat.outsidevault2", "test": lambda ctx: _call_msvc_stat_vault_check_exc(ctx, "dev001_1", "/var/lib/irods/Vault1_2/yoda/licenses/GNU General Public License v3.0.uri"), "check": lambda x: x}, + {"name": "msvc.msi_file_checksum.file", + "test": lambda ctx: _call_file_checksum_either_resc(ctx, "/var/lib/irods/VaultX/yoda/licenses/GNU General Public License v3.0.txt"), + "check": lambda x: x == "sha2:OXLcl0T2SZ8Pmy2/dmlvKuetivmyPd5m1q+Gyd+zaYY="}, + {"name": "msvc.msi_file_checksum.file_not_exist", + "test": lambda ctx: _call_file_checksum_check_exc(ctx, '/var/lib/irods/Vault1_2/yoda/licenses/doesnotexist.txt', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_file_checksum.resc_not_exist", + "test": lambda ctx: _call_file_checksum_check_exc(ctx, '/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public License v3.0.txt', 'non-existent-resource'), + "check": lambda x: x}, + {"name": "msvc.msi_file_checksum.outside_vault", + "test": lambda ctx: _call_file_checksum_check_exc(ctx, '/etc/passwd', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.dir", + "test": lambda ctx: _call_dir_list(ctx, "/var/lib/irods/Vault1_1/yoda", "dev001_1"), + "check": lambda x: len(x) == len([entry for entry in os.listdir("/var/lib/irods/Vault1_1/yoda") if os.path.isdir("/var/lib/irods/Vault1_1/yoda/" + entry)])}, + {"name": "msvc.msi_dir_list.dir_not_exist", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_2/yoda/doesnotexist', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.file_resc_1", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_1/yoda/licenses/GNU General Public 
License v3.0.txt', 'dev001_1'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.file_resc_2", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_2/yoda/licenses/GNU General Public License v3.0.txt', 'dev001_2'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.resc_not_exist", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/var/lib/irods/Vault1_1/yoda', 'non-existent-resource'), + "check": lambda x: x}, + {"name": "msvc.msi_dir_list.outside_vault", + "test": lambda ctx: _call_dir_list_check_exc(ctx, '/etc/passwd', 'dev001_2'), + "check": lambda x: x}, {"name": "msvc.rmw_avu_collection_literal", "test": lambda ctx: _test_msvc_rmw_avu_collection(ctx, ("foo", "bar", "baz")), "check": lambda x: (("aap", "noot", "mies") in x @@ -706,3 +736,45 @@ def rule_run_integration_tests(ctx, tests): return_value += name + " " + verdict + "\n" return return_value + + +def _call_file_checksum_either_resc(ctx, filename): + """Returns result of file checksum microservice for either of the + two main UFS resources (dev001_1, dev001_2). If one returns an + exception, we try the other. + + :param ctx: combined type of a callback and rei struct + :param filename: name of file to checksum + + :returns: output of file checksum microservice + """ + try: + vault_filename = filename.replace("VaultX", "Vault1_1") + ret = msi.file_checksum(ctx, vault_filename, 'dev001_1', '') + except Exception: + vault_filename = filename.replace("VaultX", "Vault1_2") + ret = msi.file_checksum(ctx, vault_filename, 'dev001_2', '') + return ret['arguments'][2] + + +def _call_file_checksum_check_exc(ctx, filename, resc_name): + """Verifies whether a call to the file checksum microservice raises an exception""" + try: + msi.file_checksum(ctx, filename, resc_name, '') + return False + except Exception: + return True + + +def _call_dir_list(ctx, dirname, resc_name): + ret = msi.dir_list(ctx, dirname, resc_name, "") + print(ret['arguments'][2]) + return json.loads(ret['arguments'][2]) + + +def _call_dir_list_check_exc(ctx, dirname, resc_name): + try: + msi.dir_list(ctx, dirname, resc_name, "") + return False + except Exception: + return True diff --git a/util/msi.py b/util/msi.py index fa8410977..74c5431da 100644 --- a/util/msi.py +++ b/util/msi.py @@ -112,6 +112,8 @@ def _make_exception(name, message): get_obj_type, GetObjTypeError = make('GetObjType', 'Could not get object type') mod_avu_metadata, ModAVUMetadataError = make('ModAVUMetadata', 'Could not modify AVU metadata') stat_vault, MSIStatVaultError = make("_stat_vault", 'Could not stat file system object in vault.') +file_checksum, FileChecksumError = make("_file_checksum", 'Could not calculate non-persistent checksum of vault file.') +dir_list, DirListError = make("_dir_list", 'Could not list vault directory contents.') archive_create, ArchiveCreateError = make('ArchiveCreate', 'Could not create archive') archive_index, ArchiveIndexError = make('ArchiveIndex', 'Could not index archive') From 86d4c1168f9a1e5ac591de18e3c55fad70bff96c Mon Sep 17 00:00:00 2001 From: Leonidas Triantafyllou Date: Tue, 25 Jun 2024 08:51:54 +0200 Subject: [PATCH 49/57] YDA-5737: Add data transfer page UI tests Co-authored-by: Lazlo Westerhof --- tests/features/ui/ui_data_transfer.feature | 87 +++++++++++++ tests/requirements.txt | 1 + tests/step_defs/ui/test_ui_data_transfer.py | 136 ++++++++++++++++++++ 3 files changed, 224 insertions(+) create mode 100644 tests/features/ui/ui_data_transfer.feature create mode 100644 tests/step_defs/ui/test_ui_data_transfer.py diff 
--git a/tests/features/ui/ui_data_transfer.feature b/tests/features/ui/ui_data_transfer.feature new file mode 100644 index 000000000..502a70c88 --- /dev/null +++ b/tests/features/ui/ui_data_transfer.feature @@ -0,0 +1,87 @@ +@ui +Feature: Data Transfer UI + + Scenario Outline: Data Transfer page + Given user is logged in + When user opens the Data Transfer page + Then Data Transfer is shown + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User clicks on the iCommands docs page + Given user is logged in + When user opens the Data Transfer page + And user clicks on the iCommands docs page + Then iCommands docs page is displayed + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User copies iCommands configuration + Given user is logged in + When user opens the Data Transfer page + And user clicks on iCommands copy button + Then iCommands configuration is copied + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User downloads iCommands configuration file + Given user is logged in + When user opens the Data Transfer page + And user clicks on iCommands download button + Then iCommands configuration file is downloaded as + + Examples: + | user | format | + | researcher | json | + | technicaladmin | json | + + + Scenario Outline: User clicks on the Gocommands docs page + Given user is logged in + When user opens the Data Transfer page + And user clicks on Gocommands tab + And user clicks on the Gocommands docs page + Then Gocommands docs page is displayed + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User copies Gocommands configuration + Given user is logged in + When user opens the Data Transfer page + And user clicks on Gocommands tab + And user clicks on Gocommands copy button + Then Gocommands configuration is copied + + Examples: + | user | + | researcher | + | technicaladmin | + + + Scenario Outline: User downloads Gocommands configuration file + Given user is logged in + When user opens the Data Transfer page + And user clicks on Gocommands tab + And user clicks on Gocommands download button + Then Gocommands configuration file is downloaded as + + Examples: + | user | format | + | researcher | yml | + | technicaladmin | yml | diff --git a/tests/requirements.txt b/tests/requirements.txt index 83db9541c..9df72093d 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -5,3 +5,4 @@ pytest-splinter==3.3.2 pytest_bdd==7.0.1 pytest==7.4.4 deepdiff==6.6.1 +pyperclip==1.9.0 diff --git a/tests/step_defs/ui/test_ui_data_transfer.py b/tests/step_defs/ui/test_ui_data_transfer.py new file mode 100644 index 000000000..5d324a27a --- /dev/null +++ b/tests/step_defs/ui/test_ui_data_transfer.py @@ -0,0 +1,136 @@ +# coding=utf-8 +"""Data Transfer UI feature tests.""" + +__copyright__ = 'Copyright (c) 2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import os +import time +from pathlib import Path +from urllib.parse import urlparse + +import pyperclip +from pytest_bdd import parsers, scenarios, then, when + +from conftest import portal_url + +scenarios('../../features/ui/ui_data_transfer.feature') + +icommands_url = "https://docs.irods.org/4.2.12/icommands/user/" +gocommands_url = "https://github.com/cyverse/gocommands/blob/main/README.md" + + +@when("user opens the Data Transfer page") +def ui_data_transfer_page(browser): + url = "{}/user/data_transfer".format(portal_url) + browser.visit(url) + + +@then(parsers.parse("{title} is 
shown")) +def ui_data_transfer_page_content(browser, title): + assert browser.is_text_present(title) + + +@when("user clicks on the iCommands docs page") +def ui_data_transfer_icommands_page(browser): + browser.links.find_by_href(icommands_url).first.click() + time.sleep(2) + + # change to the new tab + browser.windows.current = browser.windows[-1] + + +@then("iCommands docs page is displayed") +def ui_data_transfer_icommands_page_content(browser): + assert browser.url == icommands_url + assert urlparse(browser.url).path == urlparse(icommands_url).path + + +@when('user clicks on iCommands copy button') +def ui_data_transfer_icommands_configuration_copy_button(browser): + browser.find_by_id('button1').click() + + +@then('iCommands configuration is copied') +def ui_data_transfer_icommands_configuration_copied(): + clipboard_content = pyperclip.paste() + assert clipboard_content is not None + + +@when("user clicks on iCommands download button") +def ui_data_transfer_icommands_configuration_download_button(browser): + browser.find_by_id('download-button1').click() + + +@then(parsers.parse("iCommands configuration file is downloaded as {format}")) +def ui_data_transfer_icommands_configuration_file_downloaded(browser, tmpdir, format): + if os.name == "nt": + assert True + return + + root_dir = Path(tmpdir).parent + if os.name == "nt": + download_dir = root_dir.joinpath("pytest-splinter0/splinter/download/") + else: + download_dir = root_dir.joinpath("pytest-splintercurrent/splinter/download/") + + for child in download_dir.iterdir(): + if os.path.basename(str(child)) == "irods_environment.{}".format(format): + assert True + return + raise AssertionError() + + +@when('user clicks on Gocommands tab') +def ui_data_transfer_gocommands_tab(browser): + browser.find_by_text('Gocommands').click() + + +@when("user clicks on the Gocommands docs page") +def ui_data_transfer_gocommands_page(browser): + browser.links.find_by_href(gocommands_url).first.click() + time.sleep(2) + + # change to the new tab + browser.windows.current = browser.windows[-1] + + +@then("Gocommands docs page is displayed") +def ui_data_transfer_gocommands_page_content(browser): + assert browser.url == gocommands_url + assert urlparse(browser.url).path == urlparse(gocommands_url).path + + +@when('user clicks on Gocommands copy button') +def ui_data_transfer_gocommands_configuration_copy_button(browser): + browser.find_by_id('button2').click() + + +@then("Gocommands configuration is copied") +def ui_data_transfer_gocommands_configuration_is_copied(): + clipboard_content = pyperclip.paste() + assert clipboard_content is not None + + +@when("user clicks on Gocommands download button") +def ui_data_transfer_gocommands_configuration_download_button(browser): + browser.find_by_id('download-button2').click() + + +@then(parsers.parse("Gocommands configuration file is downloaded as {format}")) +def ui_data_transfer_gocommands_configuration_downloaded(browser, tmpdir, format): + if os.name == "nt": + assert True + return + + root_dir = Path(tmpdir).parent + if os.name == "nt": + download_dir = root_dir.joinpath("pytest-splinter0/splinter/download/") + else: + download_dir = root_dir.joinpath("pytest-splintercurrent/splinter/download/") + + for child in download_dir.iterdir(): + if os.path.basename(str(child)) == "config.{}".format(format): + assert True + return + raise AssertionError() From db3e2f82ee4299c7ebcce903ca89978acd471b77 Mon Sep 17 00:00:00 2001 From: Leonidas Triantafyllou Date: Tue, 9 Jul 2024 08:21:54 +0200 Subject: [PATCH 50/57] 
Fix Data Transfer UI tests after latest changes --- tests/step_defs/ui/test_ui_data_transfer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/step_defs/ui/test_ui_data_transfer.py b/tests/step_defs/ui/test_ui_data_transfer.py index 5d324a27a..abcaada4a 100644 --- a/tests/step_defs/ui/test_ui_data_transfer.py +++ b/tests/step_defs/ui/test_ui_data_transfer.py @@ -83,7 +83,7 @@ def ui_data_transfer_icommands_configuration_file_downloaded(browser, tmpdir, fo @when('user clicks on Gocommands tab') def ui_data_transfer_gocommands_tab(browser): - browser.find_by_text('Gocommands').click() + browser.find_by_text('GoCommands').click() @when("user clicks on the Gocommands docs page") From 6982c180ee426e6605975eaa3f5f2cc0b7d42242 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Thu, 5 Dec 2024 12:22:36 +0100 Subject: [PATCH 51/57] CI: upgrade UU MSVC for rc-1.9.5 Upgrade UU microservices to v1.2.0 for testing the release candidate version of Yoda 1.9.5, since it works with a different microservices version than we currently have in the Docker image. --- .github/workflows/api-and-integration-tests.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/api-and-integration-tests.yml b/.github/workflows/api-and-integration-tests.yml index 55d02ba0e..2d4538e17 100644 --- a/.github/workflows/api-and-integration-tests.yml +++ b/.github/workflows/api-and-integration-tests.yml @@ -74,6 +74,12 @@ jobs: docker exec provider.yoda sh -c "set -x ; cd /etc/irods/yoda-ruleset && sudo -u irods git checkout ${{ steps.extract_branch.outputs.branch }} && sudo -u irods python -m pip --no-cache-dir install --user -r /etc/irods/yoda-ruleset/requirements.txt && sudo -u irods make && sudo -u irods make install" docker exec provider.yoda sh -c "set -x ; sudo -u irods /var/lib/irods/irodsctl restart" + - name: Upgrade UU microservices for testing the RC version of Yoda 1.9.5 + if: github.event.pull_request.head.ref == 'rc-1.9.5' + shell: bash + run: | + docker exec provider.yoda sh -c 'set -x ; sudo yum remove -y irods-uu-microservices ; sudo wget https://github.com/UtrechtUniversity/irods-uu-microservices/releases/download/v1.2.0/irods-uu-microservices-4.2.12_1.2.0-0.rpm ; sudo rpm -ivh irods-uu-microservices-4.2.12_1.2.0-0.rpm' + - name: Pull and install latest version of portal shell: bash run: | From 7aba7b51e0f5892fa1e0643451d40e70875e7a12 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Thu, 5 Dec 2024 16:18:03 +0100 Subject: [PATCH 52/57] YDA-6040: add portal info API tests On API test failures, print more information for troubleshooting purposes: - CI: On failures, print web server error logs and portal access log for troubleshooting on portal level. - API tests: if tests are unable to get a CSRF token, print the user that the error occurred with and the response from the web server for troubleshooting purposes. 
(version for Yoda 1.9) --- .github/workflows/api-and-integration-tests.yml | 6 ++++++ tests/conftest.py | 15 +++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/api-and-integration-tests.yml b/.github/workflows/api-and-integration-tests.yml index 2d4538e17..aeff50cfe 100644 --- a/.github/workflows/api-and-integration-tests.yml +++ b/.github/workflows/api-and-integration-tests.yml @@ -124,6 +124,12 @@ jobs: run: | docker exec provider.yoda sh -c 'set -x ; cat /var/lib/irods/log/rodsLog*' + + - name: Output web server logs + if: failure() + run: | + docker exec portal.yoda sh -c 'set -x ; for log in error_log portal_access.log ; do echo "${log}:" ; cat "/var/log/httpd/$log" ; echo; done' + # Uncomment section below when needed for debugging. # # - name: Setup tmate session for debugging diff --git a/tests/conftest.py b/tests/conftest.py index 4ab47e948..14f59d0bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,7 @@ import json import re +import sys import pytest import requests @@ -201,7 +202,12 @@ def login(user, password): # Retrieve the login CSRF token. content = client.get(url, verify=False).content.decode() p = re.compile("tokenValue: '([a-zA-Z0-9._-]*)'") - csrf = p.findall(content)[0] + found_csrf_tokens = p.findall(content) + if len(found_csrf_tokens) == 0: + print(f"Error: could not find login CSRF token in response from server for login of user {user}. Response was:") + print(content) + sys.exit(1) + csrf = found_csrf_tokens[0] # Login as user. if verbose_test: @@ -214,7 +220,12 @@ def login(user, password): # Retrieve the authenticated CSRF token. content = response.content.decode() p = re.compile("tokenValue: '([a-zA-Z0-9._-]*)'") - csrf = p.findall(content)[0] + found_csrf_tokens = p.findall(content) + if len(found_csrf_tokens) == 0: + print(f"Error: could not find authenticated CSRF token in response from server for login of user {user}. Response was:") + print(content) + sys.exit(1) + csrf = found_csrf_tokens[0] # Return CSRF and session cookies. if verbose_test: From 52e5570b472d05c6c3fe7d23f937390f72d29829 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Thu, 5 Dec 2024 16:46:10 +0100 Subject: [PATCH 53/57] CI: install any new portal dependencies if needed Install any new portal dependencies in CI. These may be needed for the API tests. --- .github/workflows/api-and-integration-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/api-and-integration-tests.yml b/.github/workflows/api-and-integration-tests.yml index aeff50cfe..26d3244ea 100644 --- a/.github/workflows/api-and-integration-tests.yml +++ b/.github/workflows/api-and-integration-tests.yml @@ -86,6 +86,7 @@ jobs: cd yoda/docker/compose docker exec portal.yoda sh -c 'set -x ; cd /var/www/yoda && git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && git pull' docker exec portal.yoda sh -c 'set -x ; cd /var/www/yoda && git checkout ${{ steps.extract_branch.outputs.branch }} || git checkout development' + docker exec portal.yoda sh -c 'set -x ; cd /var/www/yoda && . 
venv/bin/activate && venv/bin/pip3 install -r requirements.txt' docker exec portal.yoda sh -c 'set -x ; cd /var/www/yoda && git status' docker exec portal.yoda sh -c 'set -x ; touch /var/www/yoda/*.wsgi' From 9fedc09a60acc96a93dd9f17f5619f80951b6ef8 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Thu, 5 Dec 2024 16:49:34 +0100 Subject: [PATCH 54/57] CI: Ignore all API/UI test code for Python 2 lint We always run them with Python 3 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 1bac72b55..ba0e5a08e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,5 +4,5 @@ import-order-style=smarkets strictness=short docstring_style=sphinx max-line-length=127 -exclude=__init__.py,tools,tests/env/ +exclude=__init__.py,tools,tests application-import-names=avu_json,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,intake,intake_dataset,intake_lock,intake_scan,intake_utils,intake_vault,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils From 0a8c09aafd8085cb80ef70e22e3d713ad946cfc8 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Mon, 16 Dec 2024 19:20:53 +0100 Subject: [PATCH 55/57] Backport fix for edit-vault-metadata.py Use Python 3.8 instead of distribution default Python 3.6, because Python 3.6 has different (incompatible) arguments for the subprocess module. --- tools/edit-vault-metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/edit-vault-metadata.py b/tools/edit-vault-metadata.py index d099f7ebb..d6c99c1e4 100755 --- a/tools/edit-vault-metadata.py +++ b/tools/edit-vault-metadata.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/local/bin/python3 """ edit-vault-metadata : script for manually editing metadata of a data package From fc9bd90d9588029343da63e302cdb35876ca07d9 Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Mon, 6 Jan 2025 15:44:27 +0100 Subject: [PATCH 56/57] YDA-6082: specify vault resource on copy-to-vault If a vault resource has been configured, use it when copying data from a research collection to the vault collection. Also use configuration for number of threads when copying data. 
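For reference, a minimal sketch of the irsync command forms the new helper is expected to build. The paths and the resource name "vaultResc" are the fixture values from the unit tests added in this patch (unit-tests/test_vault.py, below), not production settings:

# Sketch only: mirrors the expectations in unit-tests/test_vault.py.
from vault_utils import get_copy_folder_to_vault_irsync_command

src = "/zoneName/home/research-foo/abc"
dst = "/zoneName/home/vault-foo/abc"

# Vault resource configured, multithreading left at the server default:
get_copy_folder_to_vault_irsync_command(src, dst, "vaultResc", True)
# ["irsync", "-rK", "-R", "vaultResc", "i:/zoneName/home/research-foo/abc/",
#  "i:/zoneName/home/vault-foo/abc/original"]

# No vault resource configured:
get_copy_folder_to_vault_irsync_command(src, dst, None, True)
# ["irsync", "-rK", "i:/zoneName/home/research-foo/abc/",
#  "i:/zoneName/home/vault-foo/abc/original"]

# Multithreading disabled ("-N 0" means no multi-threading):
get_copy_folder_to_vault_irsync_command(src, dst, "vaultResc", False)
# ["irsync", "-rK", "-R", "vaultResc", "-N", "0",
#  "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]
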
--- setup.cfg | 2 +- unit-tests/test_vault.py | 26 ++++++++++++++++++++++++++ unit-tests/unit_tests.py | 2 ++ vault.py | 8 +++++++- vault_utils.py | 28 ++++++++++++++++++++++++++++ 5 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 unit-tests/test_vault.py create mode 100644 vault_utils.py diff --git a/setup.cfg b/setup.cfg index ba0e5a08e..ca6511c23 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,4 +5,4 @@ strictness=short docstring_style=sphinx max-line-length=127 exclude=__init__.py,tools,tests -application-import-names=avu_json,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,intake,intake_dataset,intake_lock,intake_scan,intake_utils,intake_vault,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils +application-import-names=avu_json,conftest,util,api,config,constants,data_access_token,datacite,datarequest,data_object,epic,error,folder,groups,groups_import,intake,intake_dataset,intake_lock,intake_scan,intake_utils,intake_vault,json_datacite,json_landing_page,jsonutil,log,mail,meta,meta_form,msi,notifications,schema,schema_transformation,schema_transformations,settings,pathutil,provenance,policies_intake,policies_datamanager,policies_datapackage_status,policies_folder_status,policies_datarequest_status,publication,query,replication,revisions,revision_strategies,revision_utils,rule,user,vault,sram,arb_data_manager,cached_data_manager,resource,yoda_names,policies_utils,vault_utils diff --git a/unit-tests/test_vault.py b/unit-tests/test_vault.py new file mode 100644 index 000000000..5b430217d --- /dev/null +++ b/unit-tests/test_vault.py @@ -0,0 +1,26 @@ +"""Unit tests for the vault functions""" + +__copyright__ = 'Copyright (c) 2023-2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + +import sys +from unittest import TestCase + +sys.path.append('..') + +from vault_utils import get_copy_folder_to_vault_irsync_command + + +class VaultTest(TestCase): + + def test_get_copy_folder_to_vault_irsync_command_with_vault_resc(self): + output = get_copy_folder_to_vault_irsync_command("/zoneName/home/research-foo/abc", "/zoneName/home/vault-foo/abc", "vaultResc", True) + self.assertEqual(output, ["irsync", "-rK", "-R", "vaultResc", "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]) + + def test_get_copy_folder_to_vault_irsync_command_without_vault_resc(self): + output = get_copy_folder_to_vault_irsync_command("/zoneName/home/research-foo/abc", "/zoneName/home/vault-foo/abc", None, True) + self.assertEqual(output, ["irsync", "-rK", "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]) + + def test_get_copy_folder_to_vault_irsync_command_no_multithreading(self): + output = get_copy_folder_to_vault_irsync_command("/zoneName/home/research-foo/abc", "/zoneName/home/vault-foo/abc", "vaultResc", False) + self.assertEqual(output, ["irsync", "-rK", "-R", "vaultResc", "-N", "0", "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]) diff --git a/unit-tests/unit_tests.py b/unit-tests/unit_tests.py index 3bd9d873e..58b0e6f87 100644 --- 
a/unit-tests/unit_tests.py +++ b/unit-tests/unit_tests.py @@ -13,6 +13,7 @@ from test_util_misc import UtilMiscTest from test_util_pathutil import UtilPathutilTest from test_util_yoda_names import UtilYodaNamesTest +from test_vault import VaultTest def suite(): @@ -27,4 +28,5 @@ def suite(): test_suite.addTest(makeSuite(UtilMiscTest)) test_suite.addTest(makeSuite(UtilPathutilTest)) test_suite.addTest(makeSuite(UtilYodaNamesTest)) + test_suite.addTest(makeSuite(VaultTest)) return test_suite diff --git a/vault.py b/vault.py index 18ae49d34..7dec783d6 100644 --- a/vault.py +++ b/vault.py @@ -22,6 +22,7 @@ import policies_datamanager import policies_datapackage_status from util import * +from vault_utils import get_copy_folder_to_vault_irsync_command __all__ = ['api_vault_submit', 'api_vault_approve', @@ -954,8 +955,13 @@ def copy_folder_to_vault(ctx, coll, target): :returns: True for successful copy """ returncode = 0 + irsync_command = get_copy_folder_to_vault_irsync_command(coll, + target, + config.resource_vault, + config.vault_copy_multithread_enabled) + try: - returncode = subprocess.call(["irsync", "-rK", "i:{}/".format(coll), "i:{}/original".format(target)]) + returncode = subprocess.call(irsync_command) except Exception as e: log.write(ctx, "irsync failure: " + e) log.write(ctx, "irsync failure for coll <{}> and target <{}>".format(coll, target)) diff --git a/vault_utils.py b/vault_utils.py new file mode 100644 index 000000000..50cf36257 --- /dev/null +++ b/vault_utils.py @@ -0,0 +1,28 @@ +"""Utility functions for vault module.""" + +__copyright__ = 'Copyright (c) 2019-2024, Utrecht University' +__license__ = 'GPLv3, see LICENSE' + + +def get_copy_folder_to_vault_irsync_command(coll, target, vault_resource, multi_threading): + """Internal function to determine rsync command for copy-to-vault + + :param coll: source collection + :param target: target collection + :param vault_resource: resource to store vault data on (can be None) + :param multi_threading: if set to false, disable multi threading, + otherwise use server default + + :returns: irsync command with parameters in list format + """ + + irsync_command = ["irsync", "-rK"] + + if vault_resource is not None: + irsync_command.extend(["-R", vault_resource]) + + if not multi_threading: + irsync_command.extend(["-N", "0"]) # 0 means no multi threading + + irsync_command.extend(["i:{}/".format(coll), "i:{}/original".format(target)]) + return irsync_command From 6fa27292b44b5989735f8efcf397afff61ad9b9d Mon Sep 17 00:00:00 2001 From: Sietse Snel Date: Tue, 7 Jan 2025 21:33:04 +0100 Subject: [PATCH 57/57] YDA-6084: add sanity checks copy-to-vault Add sanity checks for source and destination paths when copying data to the vault so that users can't make the vault module copy data in an unintended way by manipulating vault metadata. This also makes this part of the system more robust against unexpected inputs. 
(Backport to Yoda 1.9.x) --- folder.py | 11 ++++++++++ unit-tests/test_vault.py | 34 ++++++++++++++++++++++++++++- vault.py | 8 ++++++- vault_utils.py | 46 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 2 deletions(-) diff --git a/folder.py b/folder.py index 6931067f5..c35afe982 100644 --- a/folder.py +++ b/folder.py @@ -17,6 +17,7 @@ import provenance import vault from util import * +from vault_utils import get_sanity_checks_results_copy_to_vault_paths __all__ = ['rule_collection_group_name', 'api_folder_get_locks', @@ -568,6 +569,16 @@ def determine_and_set_vault_target(ctx, coll): """Determine and set target on coll""" found, target = get_existing_vault_target(ctx, coll) + # Overwrite vault target if it does not pass sanity checks. This should usually + # fix any wrong vault target. There's a second check in the copy_folder_to_vault + # function to prevent TOCTOU issues. + sanity_check_results = get_sanity_checks_results_copy_to_vault_paths(coll, target) + if len(sanity_check_results) > 0: + log.write(ctx, "folder_secure: overwriting previous vault target for " + coll + + "(" + target + "), because it did not meet sanity checks: " + + str(sanity_check_results)) + found = False + # Determine vault target if it does not exist. if not found: target = determine_new_vault_target(ctx, coll) diff --git a/unit-tests/test_vault.py b/unit-tests/test_vault.py index 5b430217d..c0bd7c5fe 100644 --- a/unit-tests/test_vault.py +++ b/unit-tests/test_vault.py @@ -8,7 +8,7 @@ sys.path.append('..') -from vault_utils import get_copy_folder_to_vault_irsync_command +from vault_utils import get_copy_folder_to_vault_irsync_command, get_sanity_checks_results_copy_to_vault_paths class VaultTest(TestCase): @@ -24,3 +24,35 @@ def test_get_copy_folder_to_vault_irsync_command_without_vault_resc(self): def test_get_copy_folder_to_vault_irsync_command_no_multithreading(self): output = get_copy_folder_to_vault_irsync_command("/zoneName/home/research-foo/abc", "/zoneName/home/vault-foo/abc", "vaultResc", False) self.assertEqual(output, ["irsync", "-rK", "-R", "vaultResc", "-N", "0", "i:/zoneName/home/research-foo/abc/", "i:/zoneName/home/vault-foo/abc/original"]) + + def test_get_sanity_check_results_copy_to_vault_paths_ok(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/vault-foo") + self.assertEqual(output, []) + + def test_get_sanity_check_results_copy_to_vault_paths_relative_source(self): + output = get_sanity_checks_results_copy_to_vault_paths("research-foo", "/tempZone/home/vault-foo") + self.assertEqual(output, ["Source path is not absolute."]) + + def test_get_sanity_check_results_copy_to_vault_paths_relative_target(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "vault-foo") + self.assertEqual(output, ["Target path is not absolute."]) + + def test_get_sanity_check_results_copy_to_vault_paths_dotdot_source(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo/..", "/tempZone/home/vault-foo") + self.assertEqual(output, ["Source path contains parent references (..)"]) + + def test_get_sanity_check_results_copy_to_vault_paths_dotdot_target(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/../vault-foo") + self.assertEqual(output, ["Target path contains parent references (..)"]) + + def test_get_sanity_check_results_copy_to_vault_paths_wrong_source_space(self): + output = 
get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/vault-foo", "/tempZone/home/vault-foo") + self.assertEqual(output, ["Source path not in research or deposit group."]) + + def test_get_sanity_check_results_copy_to_vault_paths_wrong_target_space(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/deposit-foo") + self.assertEqual(output, ["Target path not in vault group."]) + + def test_get_sanity_check_results_copy_to_vault_paths_source_target_mismatch(self): + output = get_sanity_checks_results_copy_to_vault_paths("/tempZone/home/research-foo", "/tempZone/home/vault-bar") + self.assertEqual(output, ["Source and target group are not in same compartment."]) diff --git a/vault.py b/vault.py index 7dec783d6..1e8d9f653 100644 --- a/vault.py +++ b/vault.py @@ -22,7 +22,7 @@ import policies_datamanager import policies_datapackage_status from util import * -from vault_utils import get_copy_folder_to_vault_irsync_command +from vault_utils import get_copy_folder_to_vault_irsync_command, get_sanity_checks_results_copy_to_vault_paths __all__ = ['api_vault_submit', 'api_vault_approve', @@ -954,6 +954,12 @@ def copy_folder_to_vault(ctx, coll, target): :returns: True for successful copy """ + sanity_check_results = get_sanity_checks_results_copy_to_vault_paths(coll, target) + if len(sanity_check_results) > 0: + log.write(ctx, "Not copying folder to vault because of sanity check failures: " + + str(sanity_check_results)) + return False + returncode = 0 irsync_command = get_copy_folder_to_vault_irsync_command(coll, target, diff --git a/vault_utils.py b/vault_utils.py index 50cf36257..5875b52f4 100644 --- a/vault_utils.py +++ b/vault_utils.py @@ -3,6 +3,8 @@ __copyright__ = 'Copyright (c) 2019-2024, Utrecht University' __license__ = 'GPLv3, see LICENSE' +from util import pathutil + def get_copy_folder_to_vault_irsync_command(coll, target, vault_resource, multi_threading): """Internal function to determine rsync command for copy-to-vault @@ -26,3 +28,47 @@ def get_copy_folder_to_vault_irsync_command(coll, target, vault_resource, multi_ irsync_command.extend(["i:{}/".format(coll), "i:{}/original".format(target)]) return irsync_command + + +def get_sanity_checks_results_copy_to_vault_paths(source, target): + """Internal function to determine whether a source and destination path for + archiving data in the vault pass sanity checks. + + :param source: source collection + :param target: target collection + + :returns: list of sanity check fails (empty list means all tests passed) + """ + failed = [] + + if not source.startswith("/"): + failed.append("Source path is not absolute.") + + if not target.startswith("/"): + failed.append("Target path is not absolute.") + + if ".." in source.split("/"): + failed.append("Source path contains parent references (..)") + + if ".." in target.split("/"): + failed.append("Target path contains parent references (..)") + + if len(failed) > 0: + # The remaining tests assume absolute paths without parent references, + # so skip these tests if previous tests did not pass. 
+ return failed + + (source_space, source_zone, source_group, _) = pathutil.info(source) + (target_space, target_zone, target_group, _) = pathutil.info(target) + + if source_space not in (pathutil.Space.DEPOSIT, pathutil.Space.RESEARCH): + failed.append("Source path not in research or deposit group.") + + if target_space != pathutil.Space.VAULT: + failed.append("Target path not in vault group.") + + if (source_zone != target_zone + or "-".join(source_group.split("-")[1:]) != "-".join(target_group.split("-")[1:])): + failed.append("Source and target group are not in same compartment.") + + return failed
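Taken together, the two helpers in vault_utils.py gate and build the copy-to-vault call. A minimal sketch of how a caller combines them, mirroring the guard added to vault.copy_folder_to_vault in this patch; the paths are the test values from unit-tests/test_vault.py, and actually executing the command assumes an iRODS environment with irsync available:

import subprocess

from vault_utils import (get_copy_folder_to_vault_irsync_command,
                         get_sanity_checks_results_copy_to_vault_paths)

source = "/tempZone/home/research-foo"   # example research collection (test value)
target = "/tempZone/home/vault-foo"      # example vault collection (test value)

failures = get_sanity_checks_results_copy_to_vault_paths(source, target)
if failures:
    # In the ruleset this is written to the log and the copy is aborted.
    print("Not copying folder to vault: " + str(failures))
else:
    # None: no vault resource configured; True: leave multithreading at the default.
    cmd = get_copy_folder_to_vault_irsync_command(source, target, None, True)
    # cmd == ["irsync", "-rK", "i:/tempZone/home/research-foo/",
    #         "i:/tempZone/home/vault-foo/original"]
    subprocess.call(cmd)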