From 11a1eed98fc1d09c91b7cb416c52a4907c75ab08 Mon Sep 17 00:00:00 2001 From: Stefanie Taepke Date: Sat, 19 Aug 2023 11:27:12 +0200 Subject: [PATCH 1/4] fix: call auth-function before resource_show otherwise we get a NotFound for every resource, even though they exist O_o --- ckanext/ogdchcommands/logic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/ogdchcommands/logic.py b/ckanext/ogdchcommands/logic.py index d3300fd..4fd1223 100644 --- a/ckanext/ogdchcommands/logic.py +++ b/ckanext/ogdchcommands/logic.py @@ -226,7 +226,7 @@ def ogdch_cleanup_filestore(context, data_dict): relpath = os.path.relpath(fullpath, storage_path) resource_id = get_resource_id(relpath) - # check if associated resource exists + tk.check_access('resource_show', context, {'id': resource_id}) try: tk.get_action('resource_show')( context, From 33be45384ba01dfcd7b40231a15325af162af745 Mon Sep 17 00:00:00 2001 From: Stefanie Taepke Date: Tue, 22 Aug 2023 14:13:28 +0200 Subject: [PATCH 2/4] fix: resource_id-regex The path in a productive environment looks like this `resources/bfb/f4c/75-1efd-474c-a347-6b2690e6344b` so we must strip that part of the path as well to get the resource_id --- ckanext/ogdchcommands/logic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ckanext/ogdchcommands/logic.py b/ckanext/ogdchcommands/logic.py index 4fd1223..02dc6ae 100644 --- a/ckanext/ogdchcommands/logic.py +++ b/ckanext/ogdchcommands/logic.py @@ -208,8 +208,9 @@ def ogdch_cleanup_resources(context, data_dict): } def get_resource_id(filepath): - # filepath: bfb/f4c/75-1efd-474c-a347-6b2690e6344b + # filepath: resources/bfb/f4c/75-1efd-474c-a347-6b2690e6344b # resource id: bfbf4c75-1efd-474c-a347-6b2690e6344b + filepath = re.sub(r'resources', '', filepath) return re.sub(r'\/', '', filepath) def ogdch_cleanup_filestore(context, data_dict): From 138269a01df3f09bd9bb6973249a9477c300eb94 Mon Sep 17 00:00:00 2001 From: Stefanie Taepke Date: Tue, 22 Aug 2023 16:21:09 
+0200 Subject: [PATCH 3/4] fix: only check for orphaned filestores in resources-directory --- ckanext/ogdchcommands/logic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/ogdchcommands/logic.py b/ckanext/ogdchcommands/logic.py index 02dc6ae..83852d0 100644 --- a/ckanext/ogdchcommands/logic.py +++ b/ckanext/ogdchcommands/logic.py @@ -208,9 +208,8 @@ def ogdch_cleanup_resources(context, data_dict): } def get_resource_id(filepath): - # filepath: resources/bfb/f4c/75-1efd-474c-a347-6b2690e6344b + # filepath: bfb/f4c/75-1efd-474c-a347-6b2690e6344b # resource id: bfbf4c75-1efd-474c-a347-6b2690e6344b - filepath = re.sub(r'resources', '', filepath) return re.sub(r'\/', '', filepath) def ogdch_cleanup_filestore(context, data_dict): @@ -218,13 +217,14 @@ def ogdch_cleanup_filestore(context, data_dict): cleans up the filestore files that are no longer associated to any resources. """ dryrun = data_dict.get('dryrun') + resource_path = storage_path + "/resources/" filepaths = [] errors = [] - for subdir, dirs, files in os.walk(storage_path): + for subdir, dirs, files in os.walk(resource_path): for file in files: fullpath = os.path.join(subdir, file) - relpath = os.path.relpath(fullpath, storage_path) + relpath = os.path.relpath(fullpath, resource_path) resource_id = get_resource_id(relpath) tk.check_access('resource_show', context, {'id': resource_id}) From 4bdb3f70b511b668efeb9d955ff0acfe145fb409 Mon Sep 17 00:00:00 2001 From: Stefanie Taepke Date: Fri, 25 Aug 2023 14:29:04 +0200 Subject: [PATCH 4/4] chore: add cleanup-filestore to documentation --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 26ea6a5..84553e7 100755 --- a/README.md +++ b/README.md @@ -36,6 +36,16 @@ It also comes with a dryrun option. paster --plugin=ckanext-ogdchcommands ogdch cleanup_resources -c /var/www/ckan/development.ini ``` +### Command to cleanup resource-files from the filestore. 
+When a resource gets deleted, it is only marked as deleted in the database, and its associated file in the CKAN-FileStore is not deleted. +This command finds these orphaned files by checking whether their corresponding resource still exists. +It is meant to be run regularly by a cronjob. +It also comes with a dryrun option. + +```bash +paster --plugin=ckanext-ogdchcommands ogdch cleanup_filestore -c /var/www/ckan/development.ini +``` + ## Command to cleanup the package extra table. When a key is no longer needed in the package_extra table, since it is no longer part of the dataset, then after the data have been migrated that old key can be removed from the package_extra table