From a28fc2e07e6ead415c8af84667ca4ed8e6dd00b9 Mon Sep 17 00:00:00 2001
From: Fatimah Zulfiqar <fatimah0zulfiqar@gmail.com>
Date: Fri, 16 Aug 2024 09:12:42 +0200
Subject: [PATCH] subjects: added euroscivoc datastream

---
 .../contrib/subjects/datastreams.py           |  14 +-
 .../subjects/euroscivoc/datastreams.py        | 158 ++++++++++++++++++
 2 files changed, 170 insertions(+), 2 deletions(-)
 create mode 100644 invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py

diff --git a/invenio_vocabularies/contrib/subjects/datastreams.py b/invenio_vocabularies/contrib/subjects/datastreams.py
index 04925290..0d03efb4 100644
--- a/invenio_vocabularies/contrib/subjects/datastreams.py
+++ b/invenio_vocabularies/contrib/subjects/datastreams.py
@@ -12,6 +12,15 @@
 from invenio_i18n import lazy_gettext as _
 
 from ...datastreams.writers import ServiceWriter
+from .euroscivoc.datastreams import (
+    VOCABULARIES_DATASTREAM_READERS as euroscivoc_readers,
+)
+from .euroscivoc.datastreams import (
+    VOCABULARIES_DATASTREAM_TRANSFORMERS as euroscivoc_transformers,
+)
+from .euroscivoc.datastreams import (
+    VOCABULARIES_DATASTREAM_WRITERS as euroscivoc_writers,
+)
 from .mesh.datastreams import VOCABULARIES_DATASTREAM_READERS as mesh_readers
 from .mesh.datastreams import VOCABULARIES_DATASTREAM_TRANSFORMERS as mesh_transformers
 from .mesh.datastreams import VOCABULARIES_DATASTREAM_WRITERS as mesh_writers
@@ -30,15 +39,16 @@ def _entry_id(self, entry):
         return entry["id"]
 
 
-VOCABULARIES_DATASTREAM_READERS = {**mesh_readers}
+VOCABULARIES_DATASTREAM_READERS = {**mesh_readers, **euroscivoc_readers}
 """Subjects Data Streams readers."""
 
-VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers}
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {**mesh_transformers, **euroscivoc_transformers}
 """Subjects Data Streams transformers."""
 
 VOCABULARIES_DATASTREAM_WRITERS = {
     "subjects-service": SubjectsServiceWriter,
     **mesh_writers,
+    **euroscivoc_writers,
 }
 """Subjects Data Streams writers."""
 
diff --git a/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
new file mode 100644
index 00000000..b093aca8
--- /dev/null
+++ b/invenio_vocabularies/contrib/subjects/euroscivoc/datastreams.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2022-2024 CERN.
+# Copyright (C) 2024 California Institute of Technology.
+#
+# Invenio-Vocabularies is free software; you can redistribute it and/or
+# modify it under the terms of the MIT License; see LICENSE file for more
+# details.
+
+"""EuroSciVoc subjects datastreams, readers, transformers, and writers."""
+
+import io
+from collections import namedtuple
+
+import requests
+from rdflib import OWL, RDF, Graph, Namespace
+
+from invenio_vocabularies.datastreams.readers import BaseReader
+from invenio_vocabularies.datastreams.transformers import BaseTransformer
+
+
+class EuroSciVocSubjectsHTTPReader(BaseReader):
+    """Reader that fetches EuroSciVoc RDF data over HTTP and yields its concepts."""
+
+    def __init__(self, origin=None, mode="r", since=None, *args, **kwargs):
+        """Initialize the reader with the data source.
+
+        :param origin: The URL from which to fetch the RDF data.
+        :param mode: Mode of operation (default is 'r' for reading).
+        :param since: Accepted but not used by this reader.
+        """
+        self.origin = origin
+        super().__init__(origin=origin, mode=mode, *args, **kwargs)
+
+    def _iter(self, rdf_graph):
+        """Iterate over the RDF graph, yielding one subject at a time.
+
+        :param rdf_graph: The RDF graph to process.
+        :yield: ``Entry`` tuples: concept URI, scheme name, English label (or
+            ``None``), labels keyed by language, and a deprecated/parent dict.
+        """
+        SKOS_CORE = Namespace("http://www.w3.org/2004/02/skos/core#")
+        Entry = namedtuple("Entry", ["id", "scheme", "subject", "title", "props"])
+
+        for subject, _, _ in rdf_graph.triples((None, RDF.type, SKOS_CORE.Concept)):
+            # Handle multilingual labels
+            languages = {}
+            pref_label = None
+
+            for _, _, label in rdf_graph.triples((subject, SKOS_CORE.prefLabel, None)):
+                languages[label.language] = label.value
+                if label.language == "en":
+                    pref_label = label.value
+
+            # Fallback to alternative labels if no preferred label in English
+            if not pref_label:
+                for _, _, label in rdf_graph.triples(
+                    (subject, SKOS_CORE.altLabel, None)
+                ):
+                    if label.language not in languages:
+                        languages[label.language] = label.value
+                    if label.language == "en":
+                        pref_label = label.value
+                        break
+
+            # Handle deprecation status (first owl:deprecated value wins)
+            deprecated = False
+            for _, _, dep in rdf_graph.triples((subject, OWL.deprecated, None)):
+                deprecated = dep.value
+                break
+
+            # Handle parent relationship (only the first skos:broader is kept)
+            parent = None
+            for _, _, broader in rdf_graph.triples((subject, SKOS_CORE.broader, None)):
+                parent = str(broader)
+                break
+
+            props = {
+                "deprecated": str(deprecated),
+                "parent": parent,
+            }
+            yield Entry(str(subject), "EuroSciVoc", pref_label, languages, props)
+
+    def read(self, item=None, *args, **kwargs):
+        """Download the RDF data from ``origin`` and yield one subject at a time.
+
+        :param item: Must be empty; this reader cannot be chained after another one.
+        :yield: Processed EuroSciVoc subject data.
+        :raises NotImplementedError: If ``item`` is given.
+        :raises requests.RequestException: On timeout or HTTP error status.
+        """
+        if item:
+            raise NotImplementedError(
+                "EuroSciVocSubjectsHTTPReader does not support being chained after another reader"
+            )
+        # Fetch the RDF data from the specified origin URL. The timeout keeps a
+        # stalled server from blocking the datastream run indefinitely.
+        response = requests.get(self.origin, timeout=60)
+        response.raise_for_status()
+
+        # Parse the downloaded bytes as RDF/XML into a graph
+        rdf_data = io.BytesIO(response.content)
+        rdf_graph = Graph()
+        rdf_graph.parse(rdf_data, format="xml")
+
+        # Yield each processed subject from the RDF graph
+        yield from self._iter(rdf_graph)
+
+
+class EuroSciVocSubjectsTransformer(BaseTransformer):
+    """Turn EuroSciVoc reader entries into plain subject dictionaries."""
+
+    def _as_dict(self, entry):
+        """Map the ``id``, ``scheme`` and ``subject`` of ``entry`` to a dict."""
+        wanted = ("id", "scheme", "subject")
+        return {name: getattr(entry, name) for name in wanted}
+
+    def apply(self, stream_entry, *args, **kwargs):
+        """Replace the payload of a stream entry with its dictionary form.
+
+        The stream entry is updated in place and the same object is returned.
+
+        :param stream_entry: Object whose ``entry`` attribute exposes ``id``,
+            ``scheme`` and ``subject``.
+        :return: ``stream_entry`` with ``entry`` replaced by the dictionary.
+        """
+        # Any other attribute of the original entry is not carried over
+        converted = self._as_dict(stream_entry.entry)
+        stream_entry.entry = converted
+        return stream_entry
+
+
+# Registries (name -> class) and default pipeline for the EuroSciVoc datastream
+VOCABULARIES_DATASTREAM_READERS = {"euroscivoc-reader": EuroSciVocSubjectsHTTPReader}
+
+VOCABULARIES_DATASTREAM_WRITERS = {}  # no EuroSciVoc-specific writers
+
+VOCABULARIES_DATASTREAM_TRANSFORMERS = {
+    "euroscivoc-transformer": EuroSciVocSubjectsTransformer
+}
+
+DATASTREAM_CONFIG = {
+    "readers": [
+        {
+            "type": "euroscivoc-reader",
+            "args": {  # URL points at the 20231115-0 EuroSciVoc RDF distribution
+                "origin": "https://op.europa.eu/o/opportal-service/euvoc-download-handler?cellarURI=http%3A%2F%2Fpublications.europa.eu%2Fresource%2Fdistribution%2Feuroscivoc%2F20231115-0%2Frdf%2Fskos_ap_eu%2FEuroSciVoc-skos-ap-eu.rdf&fileName=EuroSciVoc-skos-ap-eu.rdf"
+            },
+        }
+    ],
+    "transformers": [{"type": "euroscivoc-transformer"}],
+    "writers": [
+        {
+            "args": {"writer": {"args": {"update": True}, "type": "subjects-service"}},
+            "type": "async",  # outer writer; inner one is "subjects-service"
+        }
+    ],
+}