-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
subjects: added euroscivoc datastream
- Loading branch information
1 parent
aaf4e15
commit a28fc2e
Showing
2 changed files
with
170 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
158 changes: 158 additions & 0 deletions
158
invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022-2024 CERN. | ||
# Copyright (C) 2024 California Institute of Technology. | ||
# | ||
# Invenio-Vocabularies is free software; you can redistribute it and/or | ||
# modify it under the terms of the MIT License; see LICENSE file for more | ||
# details. | ||
|
||
"""EuroSciVoc subjects datastreams, readers, transformers, and writers.""" | ||
|
||
import io | ||
from collections import namedtuple | ||
|
||
import requests | ||
from rdflib import OWL, RDF, Graph, Namespace | ||
|
||
from invenio_vocabularies.datastreams.readers import BaseReader | ||
from invenio_vocabularies.datastreams.transformers import BaseTransformer | ||
|
||
|
||
class EuroSciVocSubjectsHTTPReader(BaseReader):
    """Reader class to fetch and process EuroSciVoc RDF data.

    The reader downloads an RDF/XML document from ``origin`` over HTTP,
    parses it into an ``rdflib`` graph and yields one ``Entry`` named tuple
    per ``skos:Concept`` found in that graph.
    """

    def __init__(
        self,
        origin=None,
        mode="r",
        since=None,
        *args,
        timeout=60,
        **kwargs,
    ):
        """Initialize the reader with the data source.

        :param origin: The URL from which to fetch the RDF data.
        :param mode: Mode of operation (default is 'r' for reading).
        :param since: Accepted in the signature but not used by this reader;
            it is not forwarded to the base class.
        :param timeout: Keyword-only. Number of seconds passed to
            ``requests.get`` as its ``timeout`` so that an unresponsive
            server cannot block the datastream indefinitely.
        """
        self.origin = origin
        self.timeout = timeout
        super().__init__(origin=origin, mode=mode, *args, **kwargs)

    def _iter(self, rdf_graph):
        """Iterate over the RDF graph, yielding one subject at a time.

        :param rdf_graph: The RDF graph to process.
        :yield: ``Entry(id, scheme, subject, title, props)`` named tuples.
            ``subject`` is the English label, or ``None`` when neither a
            preferred nor an alternative English label exists. ``title``
            maps language codes to labels. ``props`` holds the stringified
            deprecation flag and the first broader concept (or ``None``).
        """
        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
        Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"])

        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
            # Handle multilingual labels
            languages = {}
            pref_label = None

            for label in rdf_graph.objects(subject, SKOS_CORE.prefLabel):
                languages[label.language] = label.value
                if label.language == "en":
                    pref_label = label.value

            # Fallback to alternative labels if no preferred label in English
            if not pref_label:
                for label in rdf_graph.objects(subject, SKOS_CORE.altLabel):
                    # Preferred labels win over alternative ones per language.
                    if label.language not in languages:
                        languages[label.language] = label.value
                    if label.language == "en":
                        pref_label = label.value
                        break

            # Handle deprecation status: only the first statement is used.
            dep = next(rdf_graph.objects(subject, OWL.deprecated), None)
            deprecated = dep.value if dep is not None else False

            # Handle parent relationship: only the first broader concept is
            # kept.
            broader = next(rdf_graph.objects(subject, SKOS_CORE.broader), None)
            parent = str(broader) if broader is not None else None

            # Build the entry
            props = {
                "deprecated": str(deprecated),
                "parent": parent,
            }
            yield Entry(str(subject), "EuroSciVoc", pref_label, languages, props)

    def read(self, item=None, *args, **kwargs):
        """Fetch and process the EuroSciVoc RDF data, one subject at a time.

        :param item: Must be empty; this reader cannot consume the output of
            a previous reader.
        :raises NotImplementedError: If a truthy ``item`` is passed.
        :raises requests.HTTPError: If the server answers with an error
            status (raised by ``response.raise_for_status()``).
        :yield: Processed EuroSciVoc subject data.
        """
        if item:
            raise NotImplementedError(
                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
            )

        # Fetch the RDF data from the specified origin URL. The timeout keeps
        # a stalled connection from hanging the whole datastream.
        response = requests.get(self.origin, timeout=self.timeout)
        response.raise_for_status()

        # Treat the response content as a file-like object
        rdf_data = io.BytesIO(response.content)

        # Parse the RDF data into a graph
        rdf_graph = Graph()
        rdf_graph.parse(rdf_data, format="xml")

        # Yield each processed subject from the RDF graph
        yield from self._iter(rdf_graph)
|
||
|
||
class EuroSciVocSubjectsTransformer(BaseTransformer):
    """Transformer turning EuroSciVoc reader entries into plain dictionaries."""

    def _as_dict(self, entry):
        """Return the ``id``, ``scheme`` and ``subject`` of an entry as a dict.

        NOTE(review): any other attribute of ``entry`` (for example ``title``
        or ``props``) is not copied over -- confirm this is intended.
        """
        exported_fields = ("id", "scheme", "subject")
        return {field: getattr(entry, field) for field in exported_fields}

    def apply(self, stream_entry, *args, **kwargs):
        """Replace the payload of a stream entry with its dictionary form.

        :param stream_entry: The entry to be transformed; its ``entry``
            attribute is overwritten in place.
        :return: The same stream entry, now carrying the dictionary.
        """
        stream_entry.entry = self._as_dict(stream_entry.entry)
        return stream_entry
|
||
|
||
# Datastream component registries: each maps a type name to its class.
VOCABULARIES_DATASTREAM_READERS = {
    "euroscivoc-reader": EuroSciVocSubjectsHTTPReader,
}

# This module contributes no writers of its own.
VOCABULARIES_DATASTREAM_WRITERS = {}

VOCABULARIES_DATASTREAM_TRANSFORMERS = {
    "euroscivoc-transformer": EuroSciVocSubjectsTransformer,
}

# Pipeline definition: read the EuroSciVoc RDF file, transform every entry
# and write it through the "async" writer wrapping "subjects-service".
DATASTREAM_CONFIG = {
    "readers": [
        {
            "type": "euroscivoc-reader",
            "args": {
                "origin": "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf",
            },
        },
    ],
    "transformers": [
        {"type": "euroscivoc-transformer"},
    ],
    "writers": [
        {
            "args": {
                "writer": {
                    "args": {"update": True},
                    "type": "subjects-service",
                },
            },
            "type": "async",
        },
    ],
}