From 462fae41a023896a6479a63fa63a50a7d3a1062b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 21 Oct 2024 09:23:52 -0400 Subject: [PATCH] osf:storageByteCount supplementary metadata [ENG-6187] --- api/caching/tasks.py | 64 +++++++++++++------ osf/metadata/osf_gathering.py | 16 +++++ .../preprint_supplement.turtle | 3 +- .../project_supplement.turtle | 1 + .../registration_supplement.turtle | 3 +- osf_tests/metadata/test_osf_gathering.py | 11 ++++ 6 files changed, 78 insertions(+), 20 deletions(-) diff --git a/api/caching/tasks.py b/api/caching/tasks.py index 0b7a4b6670f..fa1b87be843 100644 --- a/api/caching/tasks.py +++ b/api/caching/tasks.py @@ -1,11 +1,12 @@ +import logging from urllib.parse import urlparse + +from django.apps import apps +from django.contrib.contenttypes.models import ContentType from django.db import connection from django.db.models import Sum - import requests -import logging -from django.apps import apps from api.caching.utils import storage_usage_cache from framework.postcommit_tasks.handlers import enqueue_postcommit_task @@ -16,6 +17,9 @@ logger = logging.getLogger(__name__) +_DEFAULT_FILEVERSION_PAGE_SIZE = 500000 + + def get_varnish_servers(): # TODO: this should get the varnish servers from HAProxy or a setting return settings.VARNISH_SERVERS @@ -111,35 +115,59 @@ def ban_url(instance): @app.task(max_retries=5, default_retry_delay=10) -def update_storage_usage_cache(target_id, target_guid, per_page=500000): +def update_storage_usage_cache(target_id, target_guid, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE): if not settings.ENABLE_STORAGE_USAGE_CACHE: return + from osf.models import Guid + storage_usage_total = compute_storage_usage_total(Guid.load(target_guid).referent, per_page=per_page) + key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) + storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + + +def compute_storage_usage_total(target_obj, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE): sql = """ SELECT count(size), sum(size) from (SELECT size FROM osf_basefileversionsthrough AS obfnv LEFT JOIN osf_basefilenode file ON obfnv.basefilenode_id = file.id LEFT JOIN osf_fileversion version ON obfnv.fileversion_id = version.id - LEFT JOIN django_content_type type on file.target_content_type_id = type.id WHERE file.provider = 'osfstorage' - AND type.model = 'abstractnode' AND file.deleted_on IS NULL - AND file.target_object_id=%s + AND file.target_object_id=%(target_pk)s + AND file.target_content_type_id=%(target_content_type_pk)s ORDER BY version.id - LIMIT %s OFFSET %s) file_page + LIMIT %(per_page)s OFFSET %(offset)s + ) file_page """ - count = per_page + last_count = 1 # initialize non-zero offset = 0 storage_usage_total = 0 + content_type_pk = ContentType.objects.get_for_model(target_obj).pk with connection.cursor() as cursor: - while count: - cursor.execute(sql, [target_id, per_page, offset]) - result = cursor.fetchall() - storage_usage_total += int(result[0][1]) if result[0][1] else 0 - count = int(result[0][0]) if result[0][0] else 0 - offset += count - - key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) - storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + while last_count: + cursor.execute( + sql, { + 'target_pk': target_obj.pk, + 'target_content_type_pk': content_type_pk, + 'per_page': per_page, + 'offset': offset, + }, + ) + this_count, size_sum = cursor.fetchall()[0] + storage_usage_total += int(size_sum or 0) + last_count = (this_count or 0) + offset += last_count + return storage_usage_total + + +def get_storage_usage_total(target_obj): + if not settings.ENABLE_STORAGE_USAGE_CACHE: + return compute_storage_usage_total(target_obj) + _cache_key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_obj._id) + _storage_usage_total = storage_usage_cache.get(_cache_key) + if _storage_usage_total is None: + _storage_usage_total = compute_storage_usage_total(target_obj) + storage_usage_cache.set(_cache_key, _storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + return _storage_usage_total def update_storage_usage(target): diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index ba0da33fec5..617e22b237d 100644 --- a/osf/metadata/osf_gathering.py +++ b/osf/metadata/osf_gathering.py @@ -8,6 +8,7 @@ from django import db import rdflib +from api.caching.tasks import get_storage_usage_total from osf import models as osfdb from osf.metadata import gather from osf.metadata.rdfutils import ( @@ -218,19 +219,24 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket: OSFMAP_SUPPLEMENT = { OSF.Project: { OSF.hasOsfAddon: None, + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.ProjectComponent: { OSF.hasOsfAddon: None, + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.Registration: { + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.RegistrationComponent: { + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.Preprint: { + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.File: { @@ -1172,3 +1178,13 @@ def gather_storage_region(focus): _region_ref = rdflib.URIRef(_region.absolute_api_v2_url) yield (OSF.storageRegion, _region_ref) yield (_region_ref, SKOS.prefLabel, rdflib.Literal(_region.name, lang='en')) + + +@gather.er( + OSF.storageByteCount, + focustype_iris=[OSF.Project, OSF.ProjectComponent, OSF.Registration, OSF.RegistrationComponent, OSF.Preprint] +) +def gather_storage_byte_count(focus): + _storage_usage_total = get_storage_usage_total(focus.dbmodel) + if _storage_usage_total is not None: + yield (OSF.storageByteCount, _storage_usage_total) diff --git a/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle index 8ac4aa1b988..9ff0732a509 100644 --- a/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle +++ b/osf_tests/metadata/expected_metadata_files/preprint_supplement.turtle @@ -1,6 +1,7 @@ @prefix osf: . @prefix skos: . - osf:storageRegion . + osf:storageByteCount 1337 ; + osf:storageRegion . skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/expected_metadata_files/project_supplement.turtle b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle index 70363ed33a3..d055e97554f 100644 --- a/osf_tests/metadata/expected_metadata_files/project_supplement.turtle +++ b/osf_tests/metadata/expected_metadata_files/project_supplement.turtle @@ -3,6 +3,7 @@ @prefix skos: . osf:hasOsfAddon ; + osf:storageByteCount 7 ; osf:storageRegion . a osf:AddonImplementation ; diff --git a/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle index 9c2599245e7..9e8201b7915 100644 --- a/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle +++ b/osf_tests/metadata/expected_metadata_files/registration_supplement.turtle @@ -1,6 +1,7 @@ @prefix osf: . @prefix skos: . - osf:storageRegion . + osf:storageByteCount 17 ; + osf:storageRegion . skos:prefLabel "United States"@en . diff --git a/osf_tests/metadata/test_osf_gathering.py b/osf_tests/metadata/test_osf_gathering.py index 016b2578295..73355562e68 100644 --- a/osf_tests/metadata/test_osf_gathering.py +++ b/osf_tests/metadata/test_osf_gathering.py @@ -871,3 +871,14 @@ def test_gather_qualified_attributions(self): (_attribution_readonly, PROV.agent, self.userfocus__readonly), (_attribution_readonly, DCAT.hadRole, OSF['readonly-contributor']), }) + + def test_gather_storage_byte_count(self): + assert_triples(osf_gathering.gather_storage_byte_count(self.projectfocus), { + (self.projectfocus.iri, OSF.storageByteCount, Literal(123456)), + }) + assert_triples(osf_gathering.gather_storage_byte_count(self.registrationfocus), { + (self.registrationfocus.iri, OSF.storageByteCount, Literal(0)), + }) + assert_triples(osf_gathering.gather_storage_byte_count(self.preprintfocus), { + (self.preprintfocus.iri, OSF.storageByteCount, Literal(1337)), + })