From a28fc2e07e6ead415c8af84667ca4ed8e6dd00b9 Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar
Date: Fri, 16 Aug 2024 09:12:42 +0200
Subject: [PATCH] subjects: added euroscivoc datastream

---
 .../contrib/subjects/datastreams.py           |  14 +-
 .../subjects/euroscivoc/datastreams.py        | 190 ++++++++++++++++++
 2 files changed, 202 insertions(+), 2 deletions(-)
 create mode 100644 invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py

diff --git a/invenio_vocabularies/contrib/subjects/datastreams.py b/invenio_vocabularies/contrib/subjects/datastreams.py
index 04925290..0d03efb4 100644
--- a/invenio_vocabularies/contrib/subjects/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/datastreams.py
@@ -12,6 +12,15 @@
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
+from .euroscivoc.datastreams import (
+    VOCABULARIES_DATASTREAM_READERS as euroscivoc_readers,
+)
+from .euroscivoc.datastreams import (
+    VOCABULARIES_DATASTREAM_TRANSFORMERS as euroscivoc_transformers,
+)
+from .euroscivoc.datastreams import (
+    VOCABULARIES_DATASTREAM_WRITERS as euroscivoc_writers,
+)
 from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers
 from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers
 from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
@@ -30,15 +39,16 @@ def _entry_id(self, entry):
         return entry["id"]
 
 
-VOCABULARIES_DATASTREAM_READERS = {**mesh_readers}
+VOCABULARIES_DATASTREAM_READERS = {**mesh_readers, **euroscivoc_readers}
 """Subjects Data Streams readers."""
 
-VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers}
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers, **euroscivoc_transformers}
 """Subjects Data Streams transformers."""
 
 VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
     **mesh_writers,
+    **euroscivoc_writers,
 }
 """Subjects Data Streams writers."""
 
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
new file mode 100644
index 00000000..b093aca8
--- /dev/null
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -0,0 +1,190 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2022-2024 CERN.
+# Copyright (C) 2024 California Institute of Technology.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""
+
+import io
+from collections import namedtuple
+
+import requests
+from rdflib import OWL, RDF, Graph, Namespace
+
+from invenio_vocabularies.datastreams.readers import BaseReader
+from invenio_vocabularies.datastreams.transformers import BaseTransformer
+
+# SKOS core namespace: the vocabulary the reader queries for concepts and labels.
+SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+
+# Intermediate record yielded by the reader and consumed by the transformer.
+Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"])
+
+# Seconds to wait on the HTTP server; requests applies no timeout on its own.
+DEFAULT_TIMEOUT = 120
+
+
+class EuroSciVocSubjectsHTTPReader(BaseReader):
+    """Reader class to fetch and process EuroSciVoc RDF data."""
+
+    def __init__(
+        self,
+        origin=None,
+        mode="r",
+        since=None,
+        *args,
+        timeout=DEFAULT_TIMEOUT,
+        **kwargs,
+    ):
+        """Initialize the reader with the data source.
+
+        :param origin: The URL from which to fetch the RDF data.
+        :param mode: Mode of operation (default is 'r' for reading).
+        :param since: Accepted but not used by this reader.
+        :param timeout: Seconds to wait for the HTTP server (keyword-only).
+        """
+        self.origin = origin
+        self.timeout = timeout
+        super().__init__(origin=origin, mode=mode, *args, **kwargs)
+
+    @staticmethod
+    def _first_object(rdf_graph, subject, predicate):
+        """Return the object of the first matching triple, or ``None``."""
+        for _, _, obj in rdf_graph.triples((subject, predicate, None)):
+            return obj
+        return None
+
+    @staticmethod
+    def _labels(rdf_graph, subject):
+        """Collect the labels of a concept.
+
+        :param rdf_graph: The RDF graph to query.
+        :param subject: The concept whose labels are collected.
+        :return: A ``(title, pref_label)`` tuple: ``title`` maps language tags
+            to labels and ``pref_label`` is the English label, or ``None``.
+        """
+        title = {}
+        pref_label = None
+
+        for _, _, label in rdf_graph.triples((subject, SKOS_CORE.prefLabel, None)):
+            if label.language is None:
+                # An untagged literal would end up under a ``None`` key.
+                continue
+            title[label.language] = label.value
+            if label.language == "en":
+                pref_label = label.value
+
+        # Fallback to alternative labels if no preferred label in English
+        if not pref_label:
+            for _, _, label in rdf_graph.triples((subject, SKOS_CORE.altLabel, None)):
+                if label.language is None:
+                    continue
+                # Preferred labels win over alternative ones for the same language.
+                title.setdefault(label.language, label.value)
+                if label.language == "en":
+                    pref_label = label.value
+                    break
+
+        return title, pref_label
+
+    def _iter(self, rdf_graph):
+        """Iterate over the RDF graph, yielding one subject at a time.
+
+        :param rdf_graph: The RDF graph to process.
+        :yield: Processed EuroSciVoc subject data.
+        """
+        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
+            title, pref_label = self._labels(rdf_graph, subject)
+
+            # ``is not None`` keeps the first match even when the term is empty.
+            dep = self._first_object(rdf_graph, subject, OWL.deprecated)
+            deprecated = dep.value if dep is not None else False
+
+            broader = self._first_object(rdf_graph, subject, SKOS_CORE.broader)
+            parent = str(broader) if broader is not None else None
+
+            props = {
+                "deprecated": str(deprecated),
+                "parent": parent,
+            }
+            yield Entry(str(subject), "EuroSciVoc", pref_label, title, props)
+
+    def read(self, item=None, *args, **kwargs):
+        """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.
+
+        :param item: Must be empty; this reader cannot be chained after another.
+        :yield: Processed EuroSciVoc subject data.
+        :raises NotImplementedError: If ``item`` is provided.
+        :raises requests.RequestException: If the download fails or times out.
+        """
+        if item:
+            raise NotImplementedError(
+                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
+            )
+        # Fetch the RDF data from the specified origin URL
+        response = requests.get(self.origin, timeout=self.timeout)
+        response.raise_for_status()
+
+        # Parse the downloaded RDF/XML bytes into a graph
+        rdf_graph = Graph()
+        rdf_graph.parse(io.BytesIO(response.content), format="xml")
+
+        # Yield each processed subject from the RDF graph
+        yield from self._iter(rdf_graph)
+
+
+class EuroSciVocSubjectsTransformer(BaseTransformer):
+    """Transformer class to convert EuroSciVoc RDF data to a dictionary format."""
+
+    def _as_dict(self, entry):
+        """Convert an entry to a dictionary.
+
+        Only ``id``, ``scheme`` and ``subject`` are kept; ``title`` and ``props``
+        are not part of the output.
+        """
+        return {
+            "id": entry.id,
+            "scheme": entry.scheme,
+            "subject": entry.subject,
+        }
+
+    def apply(self, stream_entry, *args, **kwargs):
+        """Transform a stream entry to the required dictionary format.
+
+        :param stream_entry: The entry to be transformed.
+        :return: The transformed stream entry.
+        """
+        stream_entry.entry = self._as_dict(stream_entry.entry)
+        return stream_entry
+
+
+# Configuration for datastream readers, transformers, and writers
+VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
+
+VOCABULARIES_DATASTREAM_WRITERS = {}
+
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    "euroscivoc-transformer": EuroSciVocSubjectsTransformer
+}
+
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "euroscivoc-reader",
+            "args": {
+                "origin": "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf"
+            },
+        }
+    ],
+    "transformers": [{"type": "euroscivoc-transformer"}],
+    "writers": [
+        {
+            "args": {"writer": {"args": {"update": True}, "type": "subjects-service"}},
+            "type": "async",
+        }
+    ],
+}