From e1d04c145f51de19706c2eb54c5f24217a69bab0 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Mon, 21 Oct 2024 09:23:52 -0400
Subject: [PATCH] osf:storageByteCount supplementary metadata [ENG-6187]

---
Reviewer notes (text between "---" and the first "diff --git" is ignored
by git-am, so these notes do not change what the patch applies):

* update_storage_usage_cache: now passes `Guid.load(target_guid).referent`
  to compute_storage_usage_total instead of the Guid row itself. The helper
  filters files on `target_obj.pk` and on
  `ContentType.objects.get_for_model(target_obj)`, so it needs the object
  the guid points at; given a Guid it would filter on the Guid's own pk and
  content type. `per_page` is forwarded as well -- the task still accepts
  it, but the original added line dropped it. `target_id` stays in the
  signature for existing callers and is no longer read.
  NOTE(review): `.referent` is assumed to be the Guid model's accessor for
  the target object -- confirm against osf.models before merging.
* compute_storage_usage_total(target_obj, per_page): sums
  `osf_fileversion.size` over non-deleted 'osfstorage' files whose
  target_object_id / target_content_type_id match `target_obj`, reading
  `per_page` rows per query (ordered by version id) until a page comes
  back empty, and returns the total as an int.
* get_storage_usage_total(target_obj): when ENABLE_STORAGE_USAGE_CACHE is
  set, returns the cached total if one exists; otherwise computes it and
  (when caching is enabled) stores it under the STORAGE_USAGE_KEY for
  `target_obj._id`.
* gather_storage_byte_count: yields (OSF.storageByteCount, total) for
  project, component, registration and preprint focuses.
* Layout: line breaks and indentation were restored from a
  whitespace-collapsed copy. Blank context lines are placed to satisfy the
  hunk line counts. Text that had been inside angle brackets (author
  e-mail, turtle IRIs) was already missing from the copy and has NOT been
  reconstructed. The "index" blob ids are left as found.

 api/caching/tasks.py                          | 67 +++++++++++++------
 osf/metadata/osf_gathering.py                 | 16 +++++
 .../preprint_supplement.turtle                |  3 +-
 .../project_supplement.turtle                 |  1 +
 .../registration_supplement.turtle            |  3 +-
 osf_tests/metadata/test_osf_gathering.py      | 11 +++
 6 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/api/caching/tasks.py b/api/caching/tasks.py
index 0b7a4b6670f2..3163fcf180e2 100644
--- a/api/caching/tasks.py
+++ b/api/caching/tasks.py
@@ -1,11 +1,12 @@
+import logging
 from urllib.parse import urlparse
+
+from django.apps import apps
+from django.contrib.contenttypes.models import ContentType
 from django.db import connection
 from django.db.models import Sum
-
 import requests
-import logging
 
-from django.apps import apps
 from api.caching.utils import storage_usage_cache
 from framework.postcommit_tasks.handlers import enqueue_postcommit_task
 
@@ -114,32 +115,54 @@ def ban_url(instance):
 def update_storage_usage_cache(target_id, target_guid, per_page=500000):
     if not settings.ENABLE_STORAGE_USAGE_CACHE:
         return
-    sql = """
-        SELECT count(size), sum(size) from
-        (SELECT size FROM osf_basefileversionsthrough AS obfnv
-        LEFT JOIN osf_basefilenode file ON obfnv.basefilenode_id = file.id
-        LEFT JOIN osf_fileversion version ON obfnv.fileversion_id = version.id
-        LEFT JOIN django_content_type type on file.target_content_type_id = type.id
+    from osf.models import Guid
+    storage_usage_total = compute_storage_usage_total(Guid.load(target_guid).referent, per_page=per_page)
+    key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid)
+    storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT)
+
+
+def compute_storage_usage_total(target_obj, per_page=500000):
+    sql = """SELECT count(file_page.size), sum(file_page.size) from (
+        SELECT version.size AS size
+        FROM osf_basefileversionsthrough AS obfnv
+        LEFT JOIN osf_basefilenode AS file ON obfnv.basefilenode_id = file.id
+        LEFT JOIN osf_fileversion AS version ON obfnv.fileversion_id = version.id
         WHERE file.provider = 'osfstorage'
-        AND type.model = 'abstractnode'
         AND file.deleted_on IS NULL
-        AND file.target_object_id=%s
+        AND file.target_object_id=%(target_id)s
+        AND file.target_content_type_id = %(target_content_type_id)s
         ORDER BY version.id
-        LIMIT %s OFFSET %s) file_page
+        LIMIT %(per_page)s OFFSET %(offset)s
+    ) file_page
     """
-    count = per_page
+    last_count = 1  # initialize non-zero
     offset = 0
     storage_usage_total = 0
     with connection.cursor() as cursor:
-        while count:
-            cursor.execute(sql, [target_id, per_page, offset])
-            result = cursor.fetchall()
-            storage_usage_total += int(result[0][1]) if result[0][1] else 0
-            count = int(result[0][0]) if result[0][0] else 0
-            offset += count
-
-    key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid)
-    storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT)
+        while last_count:
+            cursor.execute(sql, {
+                'target_id': target_obj.pk,
+                'target_content_type_id': ContentType.objects.get_for_model(target_obj).pk,
+                'per_page': per_page,
+                'offset': offset,
+            })
+            page_count, size_sum = cursor.fetchall()[0]
+            storage_usage_total += int(size_sum or 0)
+            last_count = (page_count or 0)
+            offset += last_count
+    return storage_usage_total
+
+
+def get_storage_usage_total(target_obj):
+    _storage_usage_total = None
+    if settings.ENABLE_STORAGE_USAGE_CACHE:
+        _cache_key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_obj._id)
+        _storage_usage_total = storage_usage_cache.get(_cache_key)
+    if _storage_usage_total is None:
+        _storage_usage_total = compute_storage_usage_total(target_obj)
+        if settings.ENABLE_STORAGE_USAGE_CACHE:
+            storage_usage_cache.set(_cache_key, _storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT)
+    return _storage_usage_total
 
 
 def update_storage_usage(target):
diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py
index 97723e2e235a..2456b412c687 100644
--- a/osf/metadata/osf_gathering.py
+++ b/osf/metadata/osf_gathering.py
@@ -8,6 +8,7 @@ from django import db
 
 import rdflib
 
+from api.caching.tasks import get_storage_usage_total
 from osf import models as osfdb
 from osf.metadata import gather
 from osf.metadata.rdfutils import (
@@ -213,19 +214,24 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket:
 OSFMAP_SUPPLEMENT = {
     OSF.Project: {
         OSF.hasOsfAddon: None,
+        OSF.storageByteCount: None,
         OSF.storageRegion: None,
     },
     OSF.ProjectComponent: {
         OSF.hasOsfAddon: None,
+        OSF.storageByteCount: None,
         OSF.storageRegion: None,
     },
     OSF.Registration: {
+        OSF.storageByteCount: None,
         OSF.storageRegion: None,
     },
     OSF.RegistrationComponent: {
+        OSF.storageByteCount: None,
         OSF.storageRegion: None,
     },
     OSF.Preprint: {
+        OSF.storageByteCount: None,
         OSF.storageRegion: None,
     },
     OSF.File: {
@@ -1149,3 +1155,13 @@ def gather_storage_region(focus):
         _region_ref = rdflib.URIRef(_region.absolute_api_v2_url)
         yield (OSF.storageRegion, _region_ref)
         yield (_region_ref, SKOS.prefLabel, rdflib.Literal(_region.name, lang='en'))
+
+
+@gather.er(
+    OSF.storageByteCount,
+    focustype_iris=[OSF.Project, OSF.ProjectComponent, OSF.Registration, OSF.RegistrationComponent, OSF.Preprint]
+)
+def gather_storage_byte_count(focus):
+    _storage_usage_total = get_storage_usage_total(focus.dbmodel)
+    if _storage_usage_total is not None:
+        yield (OSF.storageByteCount, _storage_usage_total)
diff --git a/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle
index c30205f27c9d..dd941d6092cf 100644
--- a/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle
+++ b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle
@@ -1,7 +1,8 @@
 @prefix osf: .
 @prefix skos: .
 
- osf:storageRegion .
+ osf:storageByteCount 1337 ;
+    osf:storageRegion .
 
  a osf:Region ;
     skos:prefLabel "United States"@en .
diff --git a/osf_tests/metadata/expected_metadata_files/project_supplement.turtle b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle
index 41341cca00f1..53ece58a0491 100644
--- a/osf_tests/metadata/expected_metadata_files/project_supplement.turtle
+++ b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle
@@ -3,6 +3,7 @@
 @prefix skos: .
 
  osf:hasOsfAddon ;
+    osf:storageByteCount 7 ;
     osf:storageRegion .
 
  a osf:AddonImplementation ;
diff --git a/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle
index ac9d0d08673c..bc3d320771db 100644
--- a/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle
+++ b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle
@@ -1,7 +1,8 @@
 @prefix osf: .
 @prefix skos: .
 
- osf:storageRegion .
+ osf:storageByteCount 17 ;
+    osf:storageRegion .
 
  a osf:Region ;
     skos:prefLabel "United States"@en .
diff --git a/osf_tests/metadata/test_osf_gathering.py b/osf_tests/metadata/test_osf_gathering.py
index 790be8679db7..afc73c179476 100644
--- a/osf_tests/metadata/test_osf_gathering.py
+++ b/osf_tests/metadata/test_osf_gathering.py
@@ -821,3 +821,14 @@ def test_gather_storage_region(self):
             (self.preprintfocus.iri, OSF.storageRegion, _default_region_ref),
             (_default_region_ref, SKOS.prefLabel, Literal('United States', lang='en')),
         })
+
+    def test_gather_storage_byte_count(self):
+        assert_triples(osf_gathering.gather_storage_byte_count(self.projectfocus), {
+            (self.projectfocus.iri, OSF.storageByteCount, Literal(123456)),
+        })
+        assert_triples(osf_gathering.gather_storage_byte_count(self.registrationfocus), {
+            (self.registrationfocus.iri, OSF.storageByteCount, Literal(0)),
+        })
+        assert_triples(osf_gathering.gather_storage_byte_count(self.preprintfocus), {
+            (self.preprintfocus.iri, OSF.storageByteCount, Literal(1337)),
+        })