From bf2a490cf9c9c89901d2b990532ab520659014e8 Mon Sep 17 00:00:00 2001 From: Jitesh Kamble Date: Tue, 13 Mar 2018 09:20:56 -0700 Subject: [PATCH 1/7] [JARS-82] Reduce resources list page load times Don't display resource tags to reduce resource list page load times. --- cookies/filters.py | 11 +++++++++-- cookies/templates/fragment_resource_list.html | 2 +- cookies/views/resource.py | 11 +++++++---- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/cookies/filters.py b/cookies/filters.py index 76308e1..46166d6 100644 --- a/cookies/filters.py +++ b/cookies/filters.py @@ -78,10 +78,17 @@ class ResourceContainerFilter(django_filters.FilterSet): content = django_filters.CharFilter(name='primary__indexable_content', lookup_expr='icontains') part_of = django_filters.ModelChoiceFilter(name='part_of', queryset=Collection.objects.all()) + + # FIXME: The following statement results in a very expensive Postgres query. + # entity_type = django_filters.ModelChoiceFilter( + # name='primary__entity_type', + # queryset=Type.objects.annotate(num_instances=Count('resource'))\ + # .filter(num_instances__gt=0) + # ) + # As a temporary workaround, use a static list for choices. entity_type = django_filters.ModelChoiceFilter( name='primary__entity_type', - queryset=Type.objects.annotate(num_instances=Count('resource'))\ - .filter(num_instances__gt=0) + queryset=Type.objects.all(), ) # FIXME: The following statement results in a very expensive Postgres query. 
diff --git a/cookies/templates/fragment_resource_list.html b/cookies/templates/fragment_resource_list.html index 76feba4..c7893d1 100644 --- a/cookies/templates/fragment_resource_list.html +++ b/cookies/templates/fragment_resource_list.html @@ -50,7 +50,7 @@ }); }); - + {# #} {% endif %} diff --git a/cookies/views/resource.py b/cookies/views/resource.py index 2205170..810956b 100644 --- a/cookies/views/resource.py +++ b/cookies/views/resource.py @@ -140,13 +140,16 @@ def resource_list(request): del params['page'] filter_parameters = urlquote_plus(params.urlencode()) filtered_resources = ResourceContainerFilter(request.GET, queryset=resources) - tags = filtered_resources.qs.order_by('primary__tags__tag__id')\ - .values_list('primary__tags__tag__id', 'primary__tags__tag__name')\ - .distinct('primary__tags__tag__id') + + # The following statement results in an expensive Postgres Query. + # Disable tags in this view for now. + # tags = filtered_resources.qs.order_by('primary__tags__tag__id')\ + # .values_list('primary__tags__tag__id', 'primary__tags__tag__name')\ + # .distinct('primary__tags__tag__id') context = { 'filtered_objects': filtered_resources, - 'tags': filter(lambda tag: tag[0] is not None, tags), + # 'tags': filter(lambda tag: tag[0] is not None, tags), 'q': request.GET.get('name'), 'filter_parameters': filter_parameters, 'resource_count': filtered_resources.qs.count() From f78bdd4356d3d6c7216b3dbc637c03892e3f6fdd Mon Sep 17 00:00:00 2001 From: Jitesh Kamble Date: Thu, 15 Mar 2018 09:47:46 -0700 Subject: [PATCH 2/7] [JARS-106] Fix error in giles log view --- cookies/views/giles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cookies/views/giles.py b/cookies/views/giles.py index 8aff20a..ae7c8ca 100644 --- a/cookies/views/giles.py +++ b/cookies/views/giles.py @@ -278,9 +278,9 @@ def _change_priority(request, form_data, filtered_objects): def log(request): # FIXME: We don't want upload enabled for Enqueued/Assigned/Sent state. 
- upload_enabled = lambda f: True if filtered_objects.data['state'] not in (GilesUpload.PENDING, GilesUpload.DONE) else False + upload_enabled = lambda f: True if filtered_objects.data.get('state') not in (GilesUpload.PENDING, GilesUpload.DONE) else False - priority_changeable = lambda f: True if filtered_objects.data['state'] == GilesUpload.PENDING else False + priority_changeable = lambda f: True if filtered_objects.data.get('state') == GilesUpload.PENDING else False if request.method == 'GET': qs = auth.apply_filter(ResourceAuthorization.VIEW, request.user, GilesUpload.objects.all()) From 3eccd96c659d3a1b71528e078f6ec7c3c1f11929 Mon Sep 17 00:00:00 2001 From: Jitesh Date: Thu, 29 Mar 2018 12:26:54 -0700 Subject: [PATCH 3/7] [JARS-124] Handle additional files from Giles (#118) * [JARS-124] Handle additional files from Giles Create resources for additional files reported by Giles. * [JARS-124] Address review comments * [JARS-124] Store & display source of content Store the source of each content resource reported by Giles. Display the source along with the content type. 
* [JARS-124] Move creator map to settings --- cookies/giles.py | 118 +++++++++++++++++++++-- cookies/templates/resource.html | 7 ++ cookies/templatetags/resource_creator.py | 9 ++ cookies/tests/test_giles.py | 4 + jars/settings.py | 12 +++ 5 files changed, 142 insertions(+), 8 deletions(-) create mode 100644 cookies/templatetags/resource_creator.py diff --git a/cookies/giles.py b/cookies/giles.py index c65089b..5ec9e55 100644 --- a/cookies/giles.py +++ b/cookies/giles.py @@ -15,6 +15,7 @@ from datetime import datetime, timedelta from django.utils import timezone from collections import defaultdict +from jars.settings import GILES_RESPONSE_CREATOR_MAP _fix_url = lambda url: url#url.replace('http://', 'https://') if url is not None else None @@ -556,6 +557,52 @@ def process_upload(upload_id, username): upload.save() +def _create_additional_files_resources(additional_files, parent_resource, creator, + processor_predicate, resource_type_fn, + name_fn=lambda x:x['url']): + """ + Helper function for creating content resources for 'additionalFiles' in + processed Giles upload response. + + Parameters + ---------- + additional_files : list + List of dicts containing details of each additional file. + parent_resource : :class:`.Resource` instance + Represents the document/object of which the content resource (to be + created) is a digital surrogate. + creator : :class:`.User` + The person responsible for adding the content to Giles. + processor_predicate : :class:`.Field` instance + The predicate to use for defining 'processor' relation. + resource_type_fn : callable + A function accepting additional file info dict and returning the + resource type. Must return an instance of :class:`.Type`. + name_fn : callable + A function accepting dict and returning name for each additional file + content resource. Must return a str. 
+ """ + for additional_file in additional_files: + content_type = additional_file.get('content-type') + uri = additional_file.get('url') + resource_type = resource_type_fn(additional_file) + content_resource = _create_content_resource( + parent_resource, + resource_type, + creator, + uri, uri, + content_type=content_type, + name=name_fn(additional_file), + ) + + if additional_file.get('processor'): + Relation.objects.create( + source=content_resource, + predicate=processor_predicate, + target=Value.objects.create(name=additional_file.get('processor')), + container=content_resource.container, + ) + def process_details(data, upload_id, username): """ Process document data from Giles. @@ -577,14 +624,30 @@ def process_details(data, upload_id, username): raise ValueError('data is empty') data = data[0] + __creator__ = Field.objects.get(uri='http://purl.org/dc/elements/1.1/creator') + __text__ = Type.objects.get(uri='http://purl.org/dc/dcmitype/Text') + __image__ = Type.objects.get(uri='http://purl.org/dc/dcmitype/Image') + __document__ = Type.objects.get(uri='http://xmlns.com/foaf/0.1/Document') + __dataset__ = Type.objects.get(uri='http://purl.org/dc/dcmitype/Dataset') + CONTENT_RESOURCE_TYPE_MAP = { + 'text/plain': __text__, + 'text/csv': __dataset__, + } + + def _get_resource_type(data): + content_type = data.get('content-type') + try: + return CONTENT_RESOURCE_TYPE_MAP[content_type] + except KeyError: + if 'image' in content_type: + return __image__ + return __document__ + upload = GilesUpload.objects.get(upload_id=upload_id) resource = upload.resource # This is the master resource. 
creator = User.objects.get(username=username) giles = settings.GILES - __text__ = Type.objects.get(uri='http://purl.org/dc/dcmitype/Text') - __image__ = Type.objects.get(uri='http://purl.org/dc/dcmitype/Image') - __document__ = Type.objects.get(uri='http://xmlns.com/foaf/0.1/Document') document_id = data.get('documentId') @@ -621,8 +684,23 @@ def process_details(data, upload_id, username): text_content_type = text_data.get('content-type') # text_uri = '%s/files/%s' % (giles, text_data.get('id')) text_uri = text_data.get('url') - _create_content_resource(resource, __text__, creator, text_uri, - text_uri, content_type=text_content_type) + content_resource = _create_content_resource(resource, __text__, creator, + text_uri, text_uri, + content_type=text_content_type) + Relation.objects.create( + source=content_resource, + predicate=__creator__, + target=Value.objects.create(name=GILES_RESPONSE_CREATOR_MAP['extractedText']), + container=content_resource.container, + ) + + # Content resource for each additional file, if available. + _create_additional_files_resources(data.get('additionalFiles', []), + resource, + creator, + processor_predicate=__creator__, + resource_type_fn=_get_resource_type, + ) # Keep track of page resources so that we can populate ``next_page``. pages = defaultdict(dict) @@ -639,14 +717,14 @@ def process_details(data, upload_id, username): pages[page_nr]['resource'] = page_resource # Each page resource can have several content resources. - for fmt in ['image', 'text']: + for fmt in ['image', 'text', 'ocr',]: # We may not have both formats for each page. 
fmt_data = page_data.get(fmt, None) if fmt_data is None: continue page_fmt_uri = '%s/files/%s' % (giles, fmt_data.get('id')) - pages[page_nr][fmt] = _create_content_resource(page_resource, + content_resource = _create_content_resource(page_resource, __image__ if fmt == 'image' else __text__, creator, page_fmt_uri, _fix_url(fmt_data.get('url')), @@ -654,10 +732,34 @@ def process_details(data, upload_id, username): content_type=fmt_data.get('content-type'), name='%s (%s)' % (page_resource.name, fmt)) + try: + Relation.objects.create( + source=content_resource, + predicate=__creator__, + target=Value.objects.create(name=GILES_RESPONSE_CREATOR_MAP[fmt]), + container=content_resource.container, + ) + except KeyError: + # Creator not defined for `fmt` in GILES_RESPONSE_CREATOR_MAP + pass + + pages[page_nr][fmt] = content_resource + + name_fn = lambda d: '%s - %s (%s)' % (page_resource.name, d.get('id'), + d.get('content_type')) + # Content resource for each additional file, if available. + _create_additional_files_resources(page_data.get('additionalFiles', []), + page_resource, + creator, + processor_predicate=__creator__, + resource_type_fn=_get_resource_type, + name_fn=name_fn, + ) + # Populate the ``next_page`` field for pages, and for their content # resources. for i in sorted(pages.keys())[:-1]: - for fmt in ['resource', 'image', 'text']: + for fmt in ['resource', 'image', 'text', 'ocr',]: if fmt not in pages[i]: continue pages[i][fmt].next_page = pages[i + 1][fmt] diff --git a/cookies/templates/resource.html b/cookies/templates/resource.html index 41f4b94..94b6043 100644 --- a/cookies/templates/resource.html +++ b/cookies/templates/resource.html @@ -4,6 +4,7 @@ {% load breadcrumbs %} {% load preview %} {% load authorization %} +{% load resource_creator %} {% block extrastyle %}