Skip to content

Commit

Permalink
subjects: added euroscivoc datastream
Browse files Browse the repository at this point in the history
  • Loading branch information
0einstein0 committed Aug 16, 2024
1 parent aaf4e15 commit a28fc2e
Show file tree
Hide file tree
Showing 2 changed files with 170 additions and 2 deletions.
14 changes: 12 additions & 2 deletions invenio_vocabularies/contrib/subjects/datastreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@
from invenio_i18n import lazy_gettext as _

from ...datastreams.writers import ServiceWriter
from .euroscivoc.datastreams import (
VOCABULARIES_DATASTREAM_READERS as euroscivoc_readers,
)
from .euroscivoc.datastreams import (
VOCABULARIES_DATASTREAM_TRANSFORMERS as euroscivoc_transformers,
)
from .euroscivoc.datastreams import (
VOCABULARIES_DATASTREAM_WRITERS as euroscivoc_writers,
)
from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers
from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers
from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
Expand All @@ -30,15 +39,16 @@ def _entry_id(self, entry):
return entry["id"]


# Each registry merges the per-vocabulary mappings; when two mappings share a
# key, the one unpacked last wins.
VOCABULARIES_DATASTREAM_READERS = {**mesh_readers, **euroscivoc_readers}
"""Subjects Data Streams readers."""

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
    **mesh_transformers,
    **euroscivoc_transformers,
}
"""Subjects Data Streams transformers."""

VOCABULARIES_DATASTREAM_WRITERS = {
    "subjects-service": SubjectsServiceWriter,
    **mesh_writers,
    **euroscivoc_writers,
}
"""Subjects Data Streams writers."""

Expand Down
158 changes: 158 additions & 0 deletions invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022-2024 CERN.
# Copyright (C) 2024 California Institute of Technology.
#
# Invenio-Vocabularies is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.

"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""

import io
from collections import namedtuple

import requests
from rdflib import OWL, RDF, Graph, Namespace

from invenio_vocabularies.datastreams.readers import BaseReader
from invenio_vocabularies.datastreams.transformers import BaseTransformer


class EuroSciVocSubjectsHTTPReader(BaseReader):
    """Reader class to fetch and process EuroSciVoc RDF data.

    The RDF/XML document is downloaded from ``origin`` over HTTP, parsed into
    an rdflib graph, and every ``skos:Concept`` found in it is yielded as an
    ``Entry(id, scheme, subject, title, props)`` namedtuple.
    """

    def __init__(
        self, origin=None, mode="r", since=None, *args, timeout=60, **kwargs
    ):
        """Initialize the reader with the data source.

        :param origin: The URL from which to fetch the RDF data.
        :param mode: Mode of operation (default is 'r' for reading).
        :param since: Accepted but not used by this reader.
        :param timeout: Timeout in seconds passed to ``requests.get``. Pass
            ``None`` to wait indefinitely.
        """
        self.origin = origin
        self.timeout = timeout
        super().__init__(origin=origin, mode=mode, *args, **kwargs)

    def _iter(self, rdf_graph):
        """Iterate over the RDF graph, yielding one subject at a time.

        :param rdf_graph: The RDF graph to process.
        :yield: One ``Entry`` namedtuple per ``skos:Concept`` in the graph.
        """
        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
        Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"])

        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
            # Collect one label per language; the English one doubles as the
            # subject text.
            titles = {}
            pref_label = None
            for label in rdf_graph.objects(subject, SKOS_CORE.prefLabel):
                titles[label.language] = label.value
                if label.language == "en":
                    pref_label = label.value

            # Fall back to alternative labels only when no English preferred
            # label exists. Preferred labels already collected are never
            # overwritten, and the scan stops at the first English alt label.
            if not pref_label:
                for label in rdf_graph.objects(subject, SKOS_CORE.altLabel):
                    titles.setdefault(label.language, label.value)
                    if label.language == "en":
                        pref_label = label.value
                        break

            # Only the first owl:deprecated / skos:broader statement is used.
            deprecation = next(rdf_graph.objects(subject, OWL.deprecated), None)
            deprecated = False if deprecation is None else deprecation.value

            broader = next(rdf_graph.objects(subject, SKOS_CORE.broader), None)
            parent = None if broader is None else str(broader)

            props = {
                "deprecated": str(deprecated),
                "parent": parent,
            }
            yield Entry(str(subject), "EuroSciVoc", pref_label, titles, props)

    def read(self, item=None, *args, **kwargs):
        """Fetch and process the EuroSciVoc RDF data, yielding it one subject at a time.

        This is a generator, so nothing (including the ``item`` check and the
        HTTP request) happens until the first value is requested.

        :param item: Must be empty; this reader cannot be chained after
            another reader.
        :yield: Processed EuroSciVoc subject data.
        :raises NotImplementedError: If ``item`` is provided.
        :raises requests.HTTPError: If the server answers with an error status.
        """
        if item:
            raise NotImplementedError(
                "EuroSciVocSubjectsHTTPReader does not support being chained "
                "after another reader"
            )

        # Without a timeout a stalled server would block the datastream forever.
        response = requests.get(self.origin, timeout=self.timeout)
        response.raise_for_status()

        # rdflib parses from a file-like object, so wrap the downloaded bytes.
        rdf_graph = Graph()
        rdf_graph.parse(io.BytesIO(response.content), format="xml")

        yield from self._iter(rdf_graph)


class EuroSciVocSubjectsTransformer(BaseTransformer):
    """Transformer turning EuroSciVoc reader entries into plain dictionaries."""

    def _as_dict(self, entry):
        """Build a dictionary from the entry's ``id``, ``scheme`` and ``subject``.

        Any other attribute of the entry (e.g. ``title`` or ``props``) is not
        carried over.
        """
        kept_fields = ("id", "scheme", "subject")
        return {field: getattr(entry, field) for field in kept_fields}

    def apply(self, stream_entry, *args, **kwargs):
        """Replace the payload of a stream entry with its dictionary form.

        :param stream_entry: The entry to be transformed; modified in place.
        :return: The same stream entry, with ``entry`` now a dictionary.
        """
        stream_entry.entry = self._as_dict(stream_entry.entry)
        return stream_entry


# Registries mapping datastream type names to the EuroSciVoc implementations.
VOCABULARIES_DATASTREAM_READERS = {
    "euroscivoc-reader": EuroSciVocSubjectsHTTPReader,
}
"""EuroSciVoc Data Streams readers."""

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
    "euroscivoc-transformer": EuroSciVocSubjectsTransformer,
}
"""EuroSciVoc Data Streams transformers."""

VOCABULARIES_DATASTREAM_WRITERS = {}
"""EuroSciVoc Data Streams writers (none defined in this module)."""

# Datastream definition for EuroSciVoc: one reader, one transformer and an
# "async" writer wrapping the "subjects-service" writer with update=True.
DATASTREAM_CONFIG = {
    "readers": [
        {
            "type": "euroscivoc-reader",
            "args": {
                # Download URL of the EuroSciVoc SKOS-AP-EU RDF distribution
                # (the version is pinned inside the cellarURI parameter).
                "origin": (
                    "https://op.europa.eu/o/opportal-service/euvoc-download-handler"
                    "?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource"
                    "%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu"
                    "%2FEuroSciVoc-skos-ap-eu.rdf"
                    "&fileName=EuroSciVoc-skos-ap-eu.rdf"
                ),
            },
        },
    ],
    "transformers": [
        {"type": "euroscivoc-transformer"},
    ],
    "writers": [
        {
            "args": {
                "writer": {
                    "args": {"update": True},
                    "type": "subjects-service",
                },
            },
            "type": "async",
        },
    ],
}

0 comments on commit a28fc2e

Please sign in to comment.