From afc3ced39db91339578e22c16d17501edc01637d Mon Sep 17 00:00:00 2001 From: Josh Humphries Date: Thu, 12 Sep 2024 19:20:31 +0100 Subject: [PATCH] feat: compatibility with vds vNext BREAKING CHANGE: compatibility with vds vNext The main change here is that resources and versions are now purely an internal concept: they are just stored in the database, not passed to the user or expected from the user. This is because with the new vds changes we don't actually need to store this information to recreate a search across multiple resources. In the past, vds allowed new resource versions to be created as long as they were greater than the previous version of a resource, which forced us to store the resources and versions to avoid creating DOIs which could reference data created after the DOI is minted but with a version before it. Because this is no longer possible, we don't need to worry about this any more. Currently, the database still stores these resource and version dicts but it's not strictly necessary (though it does make matching queries easier). This might get changed in the future (we could achieve the same by storing a requested version and a single rounded version for all resources) but for now it's not a problem. 
--- ckanext/query_dois/lib/doi.py | 177 +++------- ckanext/query_dois/lib/query.py | 314 +++++++++--------- ckanext/query_dois/lib/utils.py | 77 ----- ckanext/query_dois/logic/action.py | 42 +-- ckanext/query_dois/logic/schema.py | 3 +- ckanext/query_dois/logic/utils.py | 114 ------- ckanext/query_dois/model.py | 2 +- ckanext/query_dois/plugin.py | 91 ++--- ckanext/query_dois/routes/_helpers.py | 40 +-- ckanext/query_dois/routes/query_doi.py | 4 +- .../assets/scripts/multisearch_download.js | 6 +- 11 files changed, 265 insertions(+), 605 deletions(-) delete mode 100644 ckanext/query_dois/logic/utils.py diff --git a/ckanext/query_dois/lib/doi.py b/ckanext/query_dois/lib/doi.py index 203c9cb..fd4ba31 100644 --- a/ckanext/query_dois/lib/doi.py +++ b/ckanext/query_dois/lib/doi.py @@ -7,15 +7,17 @@ import logging import random import string -from ckan import model -from ckan.common import asbool -from ckan.plugins import toolkit +from datetime import datetime +from typing import Tuple, Optional + from datacite import DataCiteMDSClient, schema41 from datacite.errors import DataCiteError, DataCiteNotFoundError -from datetime import datetime -from .utils import get_resource_and_package, get_authors, get_resource_counts -from ..model import QueryDOI +from ckan import model +from ckan.common import asbool +from ckan.plugins import toolkit +from ckanext.query_dois.lib.query import Query +from ckanext.query_dois.model import QueryDOI log = logging.getLogger(__name__) @@ -118,37 +120,35 @@ def generate_doi(client): raise Exception('Failed to generate a DOI') -def find_existing_doi(resources_and_versions, query_hash, query_version=None): +def find_existing_doi(query: Query) -> Optional[QueryDOI]: """ - Returns a QueryDOI object representing the same search, or returns None if one - doesn't exist. + Returns a QueryDOI object representing the query, or returns None if one doesn't + exist. 
- :param resources_and_versions: the resource ids being queried mapped to the versions they're - being queried at - :param query_hash: the hash of the query - :param query_version: the query version + :param query: a Query object :return: a QueryDOI object or None """ return ( model.Session.query(QueryDOI) .filter( - QueryDOI.resources_and_versions == resources_and_versions, - QueryDOI.query_hash == query_hash, - QueryDOI.query_version == query_version, + QueryDOI.query_hash == query.query_hash, + QueryDOI.query_version == query.query_version, + QueryDOI.resources_and_versions == query.resources_and_versions, ) .first() ) -def create_doi_on_datacite(client, doi, authors, timestamp, count): +def create_doi_on_datacite( + client: DataCiteMDSClient, doi: str, timestamp: datetime, query: Query +): """ Mints the given DOI on datacite using the client. :param client: the MDS datacite client :param doi: the doi (full, prefix and suffix) - :param authors: the authors to associate with the DOI :param timestamp: the datetime when the DOI was created - :param count: the number of records contained in the DOI's data + :param query: a Query object """ # create the data for datacite data = { @@ -189,96 +189,34 @@ def create_doi_on_datacite(client, doi, authors, timestamp, count): def create_database_entry( - doi, - query, - query_hash, - resources_and_versions, - timestamp, - record_count, - requested_version=None, - query_version=None, - resource_counts=None, + doi: str, + query: Query, + timestamp: datetime, ): """ Inserts the database row for the query DOI. 
:param doi: the doi (full, prefix and suffix) - :param query: the query dict - :param query_hash: the query hash - :param resources_and_versions: the resource ids mapped to their rounded versions (as a dict) + :param query: the query :param timestamp: the datetime the DOI was created - :param record_count: the number of records contained in the DOI's data - :param requested_version: the version requested by the user, if provided - :param query_version: the query version, if provided - :param resource_counts: the resource counts, if provided :return: the QueryDOI object """ query_doi = QueryDOI( doi=doi, - resources_and_versions=resources_and_versions, timestamp=timestamp, - query=query, - query_hash=query_hash, - requested_version=requested_version, - count=record_count, - query_version=query_version, - resource_counts=resource_counts, + resources_and_versions=query.resources_and_versions, + requested_version=query.version, + query=query.query, + query_version=query.query_version, + query_hash=query.query_hash, + count=query.count, + resource_counts=query.counts, ) query_doi.save() return query_doi -def mint_doi(resource_ids, datastore_query): - """ - Mint a DOI on datacite using their API and create a new QueryDOI object, saving it - to the database. If we already have a query which would produce identical data to - the one passed then we return the existing QueryDOI object and don't mint or insert - anything. - - :param resource_ids: a list or resource ids against which the query should be run. Note that as - this extension doesn't support multi-resource search yet an exception will - be thrown if the length of this list isn't 1. 
- :param datastore_query: the DatastoreQuery object containing the query information - :return: a boolean indicating whether a new DOI was minted and the QueryDOI object representing - the query's DOI - """ - # currently we only deal with single resource searches - if len(resource_ids) != 1: - raise NotImplemented( - "This plugin currently doesn't support multi-resource searches" - ) - resource_id = resource_ids[0] - rounded_version = datastore_query.get_rounded_version(resource_id) - resources_and_versions = {resource_id: rounded_version} - - existing_doi = find_existing_doi(resources_and_versions, datastore_query.query_hash) - if existing_doi is not None: - return False, existing_doi - - # collect up some details we're going to need to mint the DOI - timestamp = datetime.now() - resource, package = get_resource_and_package(resource_id) - record_count = datastore_query.get_count(resource_id) - client = get_client() - - # generate a new DOI to store this query against - doi = generate_doi(client) - create_doi_on_datacite(client, doi, [package['author']], timestamp, record_count) - query_doi = create_database_entry( - doi, - datastore_query.query, - datastore_query.query_hash, - resources_and_versions, - timestamp, - record_count, - requested_version=datastore_query.requested_version, - query_version='v0', - resource_counts={resource_id: record_count}, - ) - return True, query_doi - - -def mint_multisearch_doi(query, query_version, resource_ids_and_versions): +def mint_multisearch_doi(query: Query) -> Tuple[bool, QueryDOI]: """ Mint a DOI on datacite using their API and create a new QueryDOI object, saving it to the database. If we already have a query which would produce identical data to @@ -287,56 +225,19 @@ def mint_multisearch_doi(query, query_version, resource_ids_and_versions): This function handles DOIs created for the versioned datastore's multisearch action. 
- :param query: the query dict - :param query_version: the query schema version - :param resource_ids_and_versions: a dict of resource ids -> versions - :return: a boolean indicating whether a new DOI was minted and the QueryDOI object representing - the query's DOI + :param query: the query + :return: a boolean indicating whether a new DOI was minted and the QueryDOI object + representing the query's DOI """ - # first off, ask the versioned datastore extension to create a hash of the query - hash_data_dict = dict(query=query, query_version=query_version) - query_hash = toolkit.get_action('datastore_hash_query')({}, hash_data_dict) - - # now check if there are any dois already for this query - existing_doi = find_existing_doi( - resource_ids_and_versions, query_hash, query_version - ) + # check if there are any dois already for this query + existing_doi = find_existing_doi(query) if existing_doi is not None: return False, existing_doi - # collect up some details we're going to need to mint the DOI - timestamp = datetime.now() - authors = get_authors(resource_ids_and_versions.keys()) - # find out how many records come from each resource - resource_counts = get_resource_counts( - query, query_version, resource_ids_and_versions - ) - # sum the count from each resource to work out the total hits - record_count = sum(resource_counts.values()) - # the list call is used here so that we can modify the resource_ids_and_versions - # dict as we iterate over it without causing an error - for resource_id in list(resource_ids_and_versions.keys()): - if resource_counts[resource_id] == 0: - del resource_ids_and_versions[resource_id] - del resource_counts[resource_id] - - if not resource_ids_and_versions: - raise toolkit.ValidationError( - 'The DOI must be associated with at least one record' - ) - # generate a new DOI to store this query against + timestamp = datetime.now() client = get_client() doi = generate_doi(client) - create_doi_on_datacite(client, doi, authors, timestamp, 
record_count) - query_doi = create_database_entry( - doi, - query, - query_hash, - resource_ids_and_versions, - timestamp, - record_count, - query_version=query_version, - resource_counts=resource_counts, - ) + create_doi_on_datacite(client, doi, timestamp, query) + query_doi = create_database_entry(doi, query, timestamp) return True, query_doi diff --git a/ckanext/query_dois/lib/query.py b/ckanext/query_dois/lib/query.py index 77aed16..47aa411 100644 --- a/ckanext/query_dois/lib/query.py +++ b/ckanext/query_dois/lib/query.py @@ -4,183 +4,183 @@ # This file is part of ckanext-query-dois # Created by the Natural History Museum in London, UK -import copy -import hashlib -import json +from dataclasses import dataclass +from functools import partial, cached_property +from typing import List, Optional, Dict + +import itertools import time -from collections import defaultdict +from sqlalchemy import false +from ckan import model from ckan.plugins import toolkit -class DatastoreQuery(object): +def find_invalid_resources(resource_ids: List[str]) -> List[str]: + """ + Given a list of resource IDs, return a list of resource IDs which are invalid. + Resources are invalid if they are any of the following: + + - not datastore active resources (checked with vds_resource_check) + - not active + - not in an active package + - not in a public package + + :param resource_ids: the resource IDs to check + :return: a list of resource IDs which failed the tests """ - This models datastore queries passed to datastore_search, not the DOIs created from - them. 
+ invalid_resource_ids = set() + + # cache this action (with context) so that we don't have to retrieve it over and + # over again + is_datastore_resource = partial(toolkit.get_action("vds_resource_check"), {}) + + # retrieve all resource ids passed to this function that are also active, in an + # active package and in a public package + query = ( + model.Session.query(model.Resource) + .join(model.Package) + .filter(model.Resource.id.in_(list(resource_ids))) + .filter(model.Resource.state == "active") + .filter(model.Package.state == "active") + .filter(model.Package.private == false()) + .with_entities(model.Resource.id) + ) + # go through each resource ID we found and check if they are datastore resources + for row in query: + if not is_datastore_resource(dict(resource_id=row.id)): + invalid_resource_ids.add(row.id) + + return sorted(invalid_resource_ids) + + +@dataclass(frozen=True) +class Query: + """ + Class representing a query against the versioned datastore. """ - @staticmethod - def _parse_from_query_dict(query_dict): - ''' - Parse a dict of query string parameters which represents the data dict for the - datastore_search action in the URL format used by CKAN. The query_dict parameter is expected - to look something like this (for example): - - { - "q": "banana", - "filters": "colour:yellow|length:200|colour:brown|type:tasty", - etc - } - - If a version is present, either as the version parameter or as the __version__ filter, it - is extracted with preference given to the version parameter if both are provided. 
- - :param query_dict: the query string dict - :return: the query dict (defaults to {} if nothing can be extracted from the query_dict) and - the requested version (defaults to None, if not provided in the query_dict) - ''' - query = {} - requested_version = None - for param, param_value in query_dict.items(): - if param == 'version': - requested_version = int(param_value) - elif param == 'filters': - filters = defaultdict(list) - for filter_pair in param_value.split('|'): - filter_field, filter_value = filter_pair.split(':', 1) - filters[filter_field].append(filter_value) - if requested_version is None: - popped_version = filters.pop('__version__', None) - if popped_version: - requested_version = int(popped_version[0]) - if filters: - query[param] = filters - else: - query[param] = param_value - return query, requested_version - - @staticmethod - def _parse_from_data_dict(data_dict): - ''' - Parse a dict of query string parameters which represents the data dict for the - datastore_search action in data dict form it expects. The data_dict parameter is expected to - look something like this (for example): - - { - "q": "banana", - "filters": { - "colour": ["yellow", "brown"], - "length": "200", - "type": ["tasty"], - } - etc - } - - If a version is present, either as the version parameter or as the __version__ filter, it - is extracted with preference given to the version parameter if both are provided. 
- - :param data_dict: the query string dict - :return: the query dict (defaults to {} if nothing can be extracted from the query_dict) and - the requested version (defaults to None, if not provided in the query_dict) - ''' - query = {} - requested_version = None - for param, param_value in data_dict.items(): - if param == 'version': - requested_version = int(param_value) - elif param == 'filters': - filters = {} - for filter_field, filter_value in param_value.items(): - if not isinstance(filter_value, list): - filter_value = [filter_value] - filters[filter_field] = filter_value - if requested_version is None: - popped_version = filters.pop('__version__', None) - if popped_version: - requested_version = int(popped_version[0]) - if filters: - query[param] = filters - else: - query[param] = param_value - return query, requested_version - - def __init__(self, query_dict=None, data_dict=None): - """ - Provide one of the 3 parameters depending on the format you have the query in. + resource_ids: List[str] + version: int + query: dict + query_version: str - :param query_dict: a dict of query string parameters in the CKAN URL format - i.e. the - filters are split with colons and pipes etc - :param data_dict: a dict of data dict parameters - i.e. 
the typical action data_dict format + @cached_property + def query_hash(self) -> str: """ - if query_dict is not None: - self.query, self.requested_version = self._parse_from_query_dict(query_dict) - elif data_dict is not None: - self.query, self.requested_version = self._parse_from_data_dict(data_dict) - else: - self.query = {} - self.requested_version = None - - if self.requested_version is None: - # default the requested time to now - self.requested_version = int(time.time() * 1000) - self.query_hash = self._generate_query_hash() - - def _generate_query_hash(self): + :return: a unique hash made from the query and query version + """ + return toolkit.get_action("vds_multi_hash")( + {}, {"query": self.query, "query_version": self.query_version} + ) + + @cached_property + def authors(self) -> List[str]: """ - Create a unique hash for this query. To do this we have to ensure that the - features like the order of filters is ignored to ensure that the meaning of the - query is what we're capturing. + Given some resource ids, return a list of unique authors from the packages + associated with them. 
- :return: a unique hash of the query + :return: a list of authors """ - query = {} - for key, value in self.query.items(): - if key == 'filters': - filters = {} - for filter_field, filter_value in value.items(): - # to ensure the order doesn't matter we have to convert everything to unicode - # and then sort it - filters[str(filter_field)] = sorted(map(str, filter_value)) - query['filters'] = filters - else: - query[str(key)] = str(value) - - # sort_keys=True is used otherwise the key ordering would change between python versions - # and the hash wouldn't match even if the query was the same - dumped_query = json.dumps(query, ensure_ascii=False, sort_keys=True).encode( - 'utf8' + query = ( + model.Session.query(model.Resource) + .join(model.Package) + .filter(model.Resource.id.in_(self.resource_ids)) + .with_entities(model.Package.author) ) - return hashlib.sha1(dumped_query).hexdigest() + return list(set(itertools.chain.from_iterable(query))) + + @cached_property + def resources_and_versions(self) -> Dict[str, int]: + """ + Returns a dict containing the resource IDs as keys and their rounded versions as + values. The rounded versions are acquired via the vds_version_round action. - def get_rounded_version(self, resource_id): + :return: a dict of resource IDs to rounded versions + """ + action = toolkit.get_action("vds_version_round") + return { + resource_id: action( + {}, {"resource_id": resource_id, "version": self.version} + ) + for resource_id in sorted(self.resource_ids) + } + + @cached_property + def counts(self) -> Dict[str, int]: """ - Round the requested version of this query down to the nearest actual version of - the resource. See the versioned-search plugin for more details. + Returns a dict containing the resource IDs as keys and the number of records + which match this query in the resource as the values. 
- :param resource_id: the id of the resource being searched - :return: the rounded version or None if no versions are available for the given resource id + :return: a dict of resource ids to counts """ - # first retrieve the rounded version to use - data_dict = {'resource_id': resource_id, 'version': self.requested_version} - return toolkit.get_action('datastore_get_rounded_version')({}, data_dict) + data_dict = { + "query": self.query, + "query_version": self.query_version, + "resource_ids": self.resource_ids, + "version": self.version, + } + return toolkit.get_action("vds_multi_count")({}, data_dict)["counts"] + + @cached_property + def count(self) -> int: + """ + The total number of records matching this query. - def get_count(self, resource_id): + :return: an integer + """ + return sum(self.counts.values()) + + @classmethod + def create( + cls, + resource_ids: List[str], + version: Optional[int] = None, + query: Optional[dict] = None, + query_version: Optional[str] = None, + ) -> "Query": + """ + Creates a Query object using the given parameters. The resource_ids are the only + required parameters, everything else is optional and will be defaulted to + sensible values if needed. + + :param resource_ids: the resource IDs + :param version: the version to query at (if missing, defaults to now) + :param query: the query to run (if missing, defaults to any empty query) + :param query_version: the version of the query (if missing, defaults to the + latest query schema version) + :return: a Query object + """ + invalid_resource_ids = find_invalid_resources(resource_ids) + if invalid_resource_ids: + # not all of them were public/active + raise toolkit.ValidationError( + f"Some of the resources requested are private or not active, DOIs can " + f"only be created using public, active resources. 
Invalid resources: " + f"{', '.format(invalid_resource_ids)}" + ) + + # sort them to ensure comparisons work consistently + resource_ids = sorted(resource_ids) + # default the version to now if not provided + version = version if version is not None else int(time.time() * 1000) + query = query or {} + query_version = query_version or toolkit.get_action("vds_schema_latest")({}, {}) + + return cls(resource_ids, version, query, query_version) + + @classmethod + def create_from_download_request(cls, download_request): """ - Retrieve the number of records matched by this query, resource id and version - combination. + Given a download request from the vds, turn it into our representation of a + query. - :param resource_id: the resource id - :return: an integer value + :param download_request: a DownloadRequest object from vds + :return: """ - data_dict = copy.deepcopy(self.query) - data_dict.update( - { - 'resource_id': resource_id, - # use the version parameter cause it's nicer than having to go in and modify the filters - 'version': self.get_rounded_version(resource_id), - # we don't need the results, just the total - 'limit': 0, - } + return Query.create( + download_request.core_record.resource_ids_and_versions, + download_request.core_record.get_version(), + download_request.core_record.query, + download_request.core_record.query_version, ) - result = toolkit.get_action('datastore_search')({}, data_dict) - return result['total'] diff --git a/ckanext/query_dois/lib/utils.py b/ckanext/query_dois/lib/utils.py index 4899096..2494a93 100644 --- a/ckanext/query_dois/lib/utils.py +++ b/ckanext/query_dois/lib/utils.py @@ -4,13 +4,7 @@ # This file is part of ckanext-query-dois # Created by the Natural History Museum in London, UK -import itertools -from functools import partial -from typing import Dict - -from ckan import model from ckan.plugins import toolkit -from sqlalchemy import false def get_resource_and_package(resource_id): @@ -23,74 +17,3 @@ def 
get_resource_and_package(resource_id): resource = toolkit.get_action('resource_show')({}, {'id': resource_id}) package = toolkit.get_action('package_show')({}, {'id': resource['package_id']}) return resource, package - - -def get_public_datastore_resources(only=None): - """ - Retrieve all the public resource ids from the database that are also in the - datastore. If the only parameter is provided, it is used to filter the return so - that it only includes those in the only list. - - :param only: a list/set/whatever of resource ids to include in the returned set - :return: a set of public resource ids - """ - # retrieve all resource ids that are active, in an active package and in a public package - query = ( - model.Session.query(model.Resource) - .join(model.Package) - .filter(model.Resource.state == 'active') - .filter(model.Package.state == 'active') - .filter(model.Package.private == false()) - .with_entities(model.Resource.id) - ) - if only: - query = query.filter(model.Resource.id.in_(list(only))) - - public_resource_ids = set() - - # cache this action (with context) so that we don't have to retrieve it over and over again - is_datastore_resource = partial( - toolkit.get_action('datastore_is_datastore_resource'), {} - ) - for resource_id in query: - if is_datastore_resource(dict(resource_id=resource_id)): - public_resource_ids.add(resource_id) - - return public_resource_ids - - -def get_authors(resource_ids): - """ - Given some resource ids, return a list of unique authors from the packages - associated with them. 
- - :param resource_ids: the resource ids - :return: a list of authors - """ - query = ( - model.Session.query(model.Resource) - .join(model.Package) - .filter(model.Resource.id.in_(list(resource_ids))) - .with_entities(model.Package.author) - ) - return list(set(itertools.chain.from_iterable(query))) - - -def get_resource_counts( - query, query_version, resource_ids_and_versions -) -> Dict[str, int]: - """ - Given a set of query parameters, figure out how many records are found on each - resource. - - :param query: the query dict - :param query_version: the query version - :param resource_ids_and_versions: the resource ids and their specific versions - :return: a dict of resource ids to counts - """ - data_dict = { - "query": query, - "query_version": query_version, - "resource_ids_and_versions": resource_ids_and_versions, - } - return toolkit.get_action("datastore_multisearch_counts")({}, data_dict) diff --git a/ckanext/query_dois/logic/action.py b/ckanext/query_dois/logic/action.py index 783cac4..a2a69e8 100644 --- a/ckanext/query_dois/logic/action.py +++ b/ckanext/query_dois/logic/action.py @@ -5,12 +5,11 @@ # Created by the Natural History Museum in London, UK from ckan.plugins import toolkit - -from . import schema as schema_lib -from .utils import extract_resource_ids_and_versions -from ..lib.doi import mint_multisearch_doi -from ..lib.emails import send_saved_search_email -from ..lib.stats import SAVE_ACTION, record_stat +from ckanext.query_dois.lib.doi import mint_multisearch_doi +from ckanext.query_dois.lib.emails import send_saved_search_email +from ckanext.query_dois.lib.query import Query +from ckanext.query_dois.lib.stats import SAVE_ACTION, record_stat +from ckanext.query_dois.logic import schema as schema_lib def create_doi(context, data_dict): @@ -29,9 +28,6 @@ def create_doi(context, data_dict): :type version: int, number of milliseconds (not seconds!) 
since UNIX epoch :param resource_ids: the resource ids to search :type resource_ids: list of strings - :param resource_ids_and_versions: a dict of resource ids -> versions defining specific versions - that the resources should be searched at - :type resource_ids_and_versions: a dict of strings -> version ints Returns: @@ -50,31 +46,19 @@ def create_doi(context, data_dict): if errors: raise toolkit.ValidationError(errors) - # then extract the parameters from the data dict, defaulting some things as we go - email_address = data_dict['email_address'] - query = data_dict.get('query', {}) - query_version = data_dict.get( - 'query_version', - toolkit.get_action('datastore_get_latest_query_schema_version')({}, {}), - ) - version = data_dict.get('version', None) - resource_ids = data_dict.get('resource_ids', None) - resource_ids_and_versions = data_dict.get('resource_ids_and_versions', None) - - # figure out which resources and which versions we're going to be creating a DOI for - resource_ids_and_versions = extract_resource_ids_and_versions( - version, resource_ids, resource_ids_and_versions + email_address = data_dict["email_address"] + query = Query.create( + data_dict["resource_ids"], + data_dict.get("version"), + data_dict.get("query"), + data_dict.get("query_version"), ) # create a new DOI or retrieve an existing one - created, doi = mint_multisearch_doi(query, query_version, resource_ids_and_versions) + created, doi = mint_multisearch_doi(query) # record a stat for this action record_stat(doi, SAVE_ACTION, email_address) # send the email to the requesting user email_sent = send_saved_search_email(email_address, doi) - return { - 'is_new': created, - 'doi': doi.doi, - 'email_sent': email_sent, - } + return {"is_new": created, "doi": doi.doi, "email_sent": email_sent} diff --git a/ckanext/query_dois/logic/schema.py b/ckanext/query_dois/logic/schema.py index 3f02f5e..e927603 100644 --- a/ckanext/query_dois/logic/schema.py +++ b/ckanext/query_dois/logic/schema.py @@ 
-38,10 +38,9 @@ def validator(value): def create_doi(): return { + 'resource_ids': [list_of_strings()], 'email_address': [email_validator], 'query': [ignore_missing, json_validator], 'query_version': [ignore_missing, str], 'version': [ignore_missing, int_validator], - 'resource_ids': [ignore_missing, list_of_strings()], - 'resource_ids_and_versions': [ignore_missing, json_validator], } diff --git a/ckanext/query_dois/logic/utils.py b/ckanext/query_dois/logic/utils.py deleted file mode 100644 index de4254e..0000000 --- a/ckanext/query_dois/logic/utils.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -# -# This file is part of ckanext-query-dois -# Created by the Natural History Museum in London, UK - -import itertools -from datetime import datetime -from functools import partial - -from ckan import model -from ckan.plugins import toolkit -from sqlalchemy import false - - -def get_public_datastore_resources(only=None): - """ - Retrieve all the public resource ids from the database that are also in the - datastore. If the only parameter is provided, it is used to filter the return so - that it only includes those in the only list. 
- - :param only: a list/set/whatever of resource ids to include in the returned set - :return: a set of public resource ids - """ - # retrieve all resource ids that are active, in an active package and in a public package - query = ( - model.Session.query(model.Resource) - .join(model.Package) - .filter(model.Resource.state == 'active') - .filter(model.Package.state == 'active') - .filter(model.Package.private == false()) - .with_entities(model.Resource.id) - ) - if only: - query = query.filter(model.Resource.id.in_(list(only))) - - public_resource_ids = set() - - # cache this action (with context) so that we don't have to retrieve it over and over again - is_datastore_resource = partial( - toolkit.get_action('datastore_is_datastore_resource'), {} - ) - for row in query: - if is_datastore_resource(dict(resource_id=row.id)): - public_resource_ids.add(row.id) - - return public_resource_ids - - -def get_invalid_resources(resource_ids): - ''' - - :param resource_ids: - :return: - ''' - resource_ids = set(resource_ids) - public_resource_ids = get_public_datastore_resources(only=resource_ids) - return resource_ids - public_resource_ids - - -def extract_resource_ids_and_versions( - req_version=None, req_resource_ids=None, req_resource_ids_and_versions=None -): - if req_resource_ids_and_versions is not None: - req_resource_ids = set(req_resource_ids_and_versions.keys()) - else: - req_resource_ids = ( - set(req_resource_ids) if req_resource_ids is not None else set() - ) - - resource_ids = get_public_datastore_resources(only=req_resource_ids) - bad_resources = req_resource_ids - resource_ids - if bad_resources: - # resources were requested, but not all of them were public/active - raise toolkit.ValidationError( - f'Some of the resources requested are private or not active, ' - f'DOIs can only be created using public, active resources. 
' - f'Invalid resources: {", ".format(bad_resources)}' - ) - elif len(resource_ids) == 0: - # no resources available - raise toolkit.ValidationError('No public resources are available') - - version = req_version if req_version is not None else to_timestamp(datetime.now()) - # round all the versions down for each resource - if req_resource_ids_and_versions is not None: - iterator = req_resource_ids_and_versions.items() - else: - iterator = zip(resource_ids, itertools.repeat(version)) - - round_version_action = partial( - toolkit.get_action('datastore_get_rounded_version'), {} - ) - resource_ids_and_versions = {} - for resource_id, resource_version in iterator: - data_dict = {'resource_id': resource_id, 'version': resource_version} - rounded_version = round_version_action(data_dict) - # this isn't really something that should happen, but if it does it just means there's no - # data in the resource's datastore index, leave it out of the return dict - if rounded_version is not None: - resource_ids_and_versions[resource_id] = rounded_version - return resource_ids_and_versions - - -def to_timestamp(moment: datetime) -> int: - """ - Converts the given moment to a UNIX epoch in milliseconds. 
- - :param moment: a datetime object - :return: integer UNIX epoch in milliseconds - """ - ts = moment.timestamp() - # multiply by 1000 to get the time in milliseconds and use int to remove any decimal places - return int(ts * 1000) diff --git a/ckanext/query_dois/model.py b/ckanext/query_dois/model.py index 48fdf8c..7d3e319 100644 --- a/ckanext/query_dois/model.py +++ b/ckanext/query_dois/model.py @@ -95,7 +95,7 @@ def to_dict(self): 'action': self.action, 'domain': self.domain, 'identifier': self.identifier, - 'timestamp': unicode(self.timestamp), + 'timestamp': str(self.timestamp), } diff --git a/ckanext/query_dois/plugin.py b/ckanext/query_dois/plugin.py index 09c178a..3e063bb 100644 --- a/ckanext/query_dois/plugin.py +++ b/ckanext/query_dois/plugin.py @@ -4,19 +4,15 @@ # This file is part of ckanext-query-dois # Created by the Natural History Museum in London, UK -import json import logging -from contextlib import suppress from ckan import plugins - +from ckan.plugins import toolkit from . 
import helpers, routes, cli -from .lib.doi import mint_doi, mint_multisearch_doi, find_existing_doi -from .lib.query import DatastoreQuery +from .lib.doi import mint_multisearch_doi, find_existing_doi +from .lib.query import Query from .lib.stats import DOWNLOAD_ACTION, record_stat from .logic import auth, action -from .logic.utils import extract_resource_ids_and_versions - log = logging.getLogger(__name__) @@ -67,90 +63,75 @@ def update_config(self, config): # IVersionedDatastoreDownloads def download_after_init(self, request): try: - # check to see if the download is something we can stick a DOI on (this will - # throw a validation error if any of the resources aren't valid for DOI-ing - extract_resource_ids_and_versions( - req_resource_ids_and_versions=request.core_record.resource_ids_and_versions - ) - + query = Query.create_from_download_request(request) # mint the DOI on datacite if necessary - created, doi = mint_multisearch_doi( - request.core_record.query, - request.core_record.query_version, - request.core_record.resource_ids_and_versions, + mint_multisearch_doi(query) + except toolkit.ValidationError: + log.warning( + "Could not create DOI for download, it contains private resources" ) except: - # if anything goes wrong we don't want to stop the download from completing; - # just log the error and move on - log.error('Failed to mint/retrieve DOI', exc_info=True) + # if anything unexpected goes wrong we don't want to stop the download from + # completing; just log the error and move on + log.error("Failed to mint/retrieve DOI", exc_info=True) def download_modify_notifier_template_context(self, request, context): try: - # if a DOI can be created it should already have been created in download_after_init - doi = find_existing_doi( - request.core_record.resource_ids_and_versions, - request.core_record.query_hash, - request.core_record.query_version, - ) - + query = Query.create_from_download_request(request) + # if a DOI can be created it should already have 
been created in + # download_after_init + doi = find_existing_doi(query) if doi: # update the context with the doi - context['doi'] = doi.doi + context["doi"] = doi.doi except: # if anything goes wrong we don't want to stop the download; just log the # error and move on - log.error('Failed to retrieve DOI', exc_info=True) + log.error("Failed to retrieve DOI", exc_info=True) # always return the context return context def download_modify_manifest(self, manifest, request): try: - # if a DOI can be created it should already have been created in download_after_init - doi = find_existing_doi( - request.core_record.resource_ids_and_versions, - request.core_record.query_hash, - request.core_record.query_version, - ) - + query = Query.create_from_download_request(request) + # if a DOI can be created it should already have been created in + # download_after_init + doi = find_existing_doi(query) if doi: # add the doi to the manifest - manifest['query-doi'] = doi.doi + manifest["query-doi"] = doi.doi except: # if anything goes wrong we don't want to stop the download from completing; # just log the error and move on - log.error('Failed to retrieve DOI', exc_info=True) + log.error("Failed to retrieve DOI", exc_info=True) # always return the manifest return manifest def download_after_run(self, request): try: + query = Query.create_from_download_request(request) # if a DOI can be created it should already have been created in # download_modify_manifest - doi = find_existing_doi( - request.core_record.resource_ids_and_versions, - request.core_record.query_hash, - request.core_record.query_version, - ) - - if doi and request.state == 'complete': + doi = find_existing_doi(query) + if doi and request.state == "complete": # record a download stat against the DOI record_stat(doi, DOWNLOAD_ACTION, identifier=request.id) except: # just log the error and move on - log.error('Failed to retrieve DOI and/or create stats', exc_info=True) + log.error("Failed to retrieve DOI and/or create 
stats", exc_info=True) # ITemplateHelpers def get_helpers(self): return { - 'render_filter_value': helpers.render_filter_value, - 'get_most_recent_dois': helpers.get_most_recent_dois, - 'get_time_ago_description': helpers.get_time_ago_description, - 'get_landing_page_url': helpers.get_landing_page_url, - 'create_citation_text': helpers.create_citation_text, - 'create_multisearch_citation_text': helpers.create_multisearch_citation_text, - 'pretty_print_query': helpers.pretty_print_query, - 'get_doi_count': helpers.get_doi_count, - 'versioned_datastore_available': self.versioned_datastore_available, + "render_filter_value": helpers.render_filter_value, + "get_most_recent_dois": helpers.get_most_recent_dois, + "get_time_ago_description": helpers.get_time_ago_description, + "get_landing_page_url": helpers.get_landing_page_url, + "create_citation_text": helpers.create_citation_text, + "create_multisearch_citation_text": helpers.create_multisearch_citation_text, + "pretty_print_query": helpers.pretty_print_query, + "get_doi_count": helpers.get_doi_count, + "versioned_datastore_available": self.versioned_datastore_available, } diff --git a/ckanext/query_dois/routes/_helpers.py b/ckanext/query_dois/routes/_helpers.py index 5508e1f..896ef79 100644 --- a/ckanext/query_dois/routes/_helpers.py +++ b/ckanext/query_dois/routes/_helpers.py @@ -260,35 +260,25 @@ def get_package_and_resource_info(resource_ids): return packages, resources -def create_slugs(query_doi): +def create_current_slug(query_doi: QueryDOI) -> str: """ - Create two slugs, one for the original query and one for the query at the current - version (to achieve this we just leave out any version information from the slug). + Creates a slug for the given query DOI at the current version, this is done with a + nav slug which has no version. 
- :param query_doi: a query doi object - :return: a slug for the original query and a slug for the current query + :param query_doi: the QueryDOI + :return: a slug """ - original_slug_data_dict = { - 'query': query_doi.query, - 'query_version': query_doi.query_version, - 'resource_ids_and_versions': query_doi.resources_and_versions, + slug_data_dict = { + "query": query_doi.query, + "query_version": query_doi.query_version, + "resource_ids": query_doi.get_resource_ids(), + "nav_slug": True, } - original_slug = toolkit.get_action('datastore_create_slug')( - {}, original_slug_data_dict - ) - - current_slug_data_dict = { - 'query': query_doi.query, - 'query_version': query_doi.query_version, - 'resource_ids': query_doi.get_resource_ids(), - } - current_slug = toolkit.get_action('datastore_create_slug')( - {}, current_slug_data_dict - ) - return original_slug['slug'], current_slug['slug'] + current_slug = toolkit.get_action("vds_slug_create")({}, slug_data_dict) + return current_slug["slug"] -def render_multisearch_doi_page(query_doi): +def render_multisearch_doi_page(query_doi: QueryDOI): """ Renders a DOI landing page for a datastore_multisearch based query DOI. 
@@ -301,7 +291,7 @@ def render_multisearch_doi_page(query_doi): sorted_resource_counts = sorted( query_doi.resource_counts.items(), key=operator.itemgetter(1), reverse=True ) - original_slug, current_slug = create_slugs(query_doi) + current_slug = create_current_slug(query_doi) context = { 'query_doi': query_doi, @@ -313,7 +303,7 @@ def render_multisearch_doi_page(query_doi): 'saves': saves, 'last_download_timestamp': last_download_timestamp, 'sorted_resource_counts': sorted_resource_counts, - 'original_slug': original_slug, + 'original_slug': query_doi.doi, 'current_slug': current_slug, } return toolkit.render('query_dois/multisearch_landing_page.html', context) diff --git a/ckanext/query_dois/routes/query_doi.py b/ckanext/query_dois/routes/query_doi.py index ff1869d..0f2b8f4 100644 --- a/ckanext/query_dois/routes/query_doi.py +++ b/ckanext/query_dois/routes/query_doi.py @@ -5,10 +5,10 @@ # Created by the Natural History Museum in London, UK -from ckan import model -from ckan.plugins import toolkit from flask import Blueprint, jsonify +from ckan import model +from ckan.plugins import toolkit from . 
import _helpers from ..model import QueryDOI, QueryDOIStat diff --git a/ckanext/query_dois/theme/assets/scripts/multisearch_download.js b/ckanext/query_dois/theme/assets/scripts/multisearch_download.js index fd7857d..361814f 100644 --- a/ckanext/query_dois/theme/assets/scripts/multisearch_download.js +++ b/ckanext/query_dois/theme/assets/scripts/multisearch_download.js @@ -7,9 +7,6 @@ $(document).ready(function () { // this data is added in the template, extract it and parse the JSON data const query = JSON.parse(downloadButton.attr('data-query')); const queryVersion = downloadButton.attr('data-query-version'); - const resourceIdsAndVersions = JSON.parse( - downloadButton.attr('data-resources-and-versions'), - ); downloadButton.on('click', function () { // pull out the form data @@ -26,13 +23,12 @@ $(document).ready(function () { query: { query: query, query_version: queryVersion, - resource_ids_and_versions: resourceIdsAndVersions, }, notifier: { type: 'none', }, }; - fetch('/api/3/action/datastore_queue_download', { + fetch('/api/3/action/vds_download_queue', { method: 'POST', body: JSON.stringify(payload), headers: {