Bugfixes for connectors #191

Merged 9 commits on Nov 15, 2023 (changes shown from 7 commits).
9 changes: 8 additions & 1 deletion README.md
@@ -67,7 +67,7 @@ Information on how to install Docker is found in [their documentation](https://d

### Using docker compose
```bash
docker compose up
docker compose --profile examples up
```
starts the MySQL server, the REST API, Keycloak for identity and access management, and Nginx for reverse proxying. \
Once started, you should be able to visit the REST API server at: http://localhost and Keycloak at http://localhost/aiod-auth \
@@ -93,6 +93,13 @@ mysql> SHOW DATABASES;

Now, you can visit the server from your browser at `localhost:8000/docs`.

#### Using connectors
You can run specific connectors by enabling their Docker Compose profiles:

```bash
docker compose --profile examples --profile huggingface-datasets --profile openml-datasets up -d
docker compose --profile examples --profile huggingface-datasets --profile openml-datasets down
```

#### Local Installation

1 change: 1 addition & 0 deletions connectors/huggingface/datasets.sh
@@ -1,6 +1,7 @@
#!/bin/bash

WORK_DIR=/opt/connectors/data/huggingface/dataset
mkdir -p $WORK_DIR

python3 connectors/synchronization.py \
-c connectors.huggingface.huggingface_dataset_connector.HuggingFaceDatasetConnector \
4 changes: 2 additions & 2 deletions connectors/openml/datasets.sh
@@ -15,6 +15,6 @@ echo $(date -u) "Starting synchronization..."
PYTHONPATH=/app /usr/local/bin/python3 /app/connectors/synchronization.py \
-c $CONNECTOR \
-w $WORK_DIR \
--from-identifier 4500 \
--save-every 100 > ${WORK_DIR}/connector.log 2>&1
--from-identifier 1 \
--save-every 100 >> ${WORK_DIR}/connector.log 2>&1
echo $(date -u) "Synchronization Done."
2 changes: 1 addition & 1 deletion connectors/zenodo/datasets.sh
@@ -16,5 +16,5 @@ PYTHONPATH=/app /usr/local/bin/python3 /app/connectors/synchronization.py \
-c $CONNECTOR \
-w $WORK_DIR \
--from-date "2023-08-01" \
--save-every 100 > ${WORK_DIR}/connector.log 2>&1
--save-every 100 >> ${WORK_DIR}/connector.log 2>&1
echo $(date -u) "Synchronization Done."
22 changes: 13 additions & 9 deletions docker-compose.yaml
@@ -13,7 +13,7 @@ services:
ports:
- 8000:8000
volumes:
- ./src:/app
- ./src:/app:ro
command: >
python main.py
--rebuild-db only-if-empty
@@ -30,15 +30,16 @@
condition: service_healthy

fill-db-with-examples:
profiles: ["examples"]
image: ai4eu_server
container_name: fill-db-with-examples
env_file: .env
environment:
- KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET
volumes:
- ./src:/app
- ./src:/app:ro
- ./data/connectors:/opt/connectors/data
- ./connectors:/opt/connectors/script
- ./connectors:/opt/connectors/script:ro
command: >
/bin/bash -c "/opt/connectors/script/fill-examples.sh"
depends_on:
@@ -63,22 +64,24 @@
condition: service_healthy

huggingface-dataset-connector:
profiles: ["huggingface-datasets"]
image: ai4eu_server
container_name: huggingface-dataset-connector
env_file: .env
environment:
- KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET
volumes:
- ./src:/app
- ./src:/app:ro
- ./data/connectors:/opt/connectors/data
- ./connectors/huggingface/:/opt/connectors/script
- ./connectors/huggingface/:/opt/connectors/script:ro
command: >
/bin/bash -c "/opt/connectors/script/datasets.sh"
depends_on:
app:
condition: service_healthy

openml-dataset-connector:
profiles: ["openml-datasets"]
build:
context: connectors/openml
dockerfile: Dockerfile
@@ -88,16 +91,17 @@
environment:
- KEYCLOAK_CLIENT_SECRET=$KEYCLOAK_CLIENT_SECRET
volumes:
- ./src:/app
- ./src:/app:ro
- ./data/connectors:/opt/connectors/data
- ./connectors/openml/:/opt/connectors/script
- ./connectors/openml/:/opt/connectors/script:ro
command: >
/bin/bash -c "/opt/connectors/script/entry.sh"
depends_on:
app:
condition: service_healthy

zenodo-dataset-connector:
profiles: ["zenodo-datasets"]
build:
context: connectors/zenodo
dockerfile: Dockerfile
@@ -142,7 +146,7 @@ services:
ports:
- 8080:8080
volumes:
- ./quay-keycloak:/opt/keycloak/data/import
- ./quay-keycloak:/opt/keycloak/data/import:ro
command: >
start-dev
--hostname-url http://${HOSTNAME}/aiod-auth
@@ -157,7 +161,7 @@
container_name: nginx
restart: unless-stopped
volumes:
- ./nginx:/etc/nginx/conf.d
- ./nginx:/etc/nginx/conf.d:ro
ports:
- 80:80
depends_on:
30 changes: 16 additions & 14 deletions src/connectors/abstract/resource_connector_by_date.py
@@ -1,11 +1,10 @@
import abc
import logging
from datetime import datetime, date
from datetime import datetime, timedelta
from typing import Generic, Iterator, Tuple

from connectors.abstract.resource_connector import ResourceConnector
from connectors.record_error import RecordError


from connectors.resource_with_relations import ResourceWithRelations
from routers.resource_router import RESOURCE

@@ -27,9 +26,10 @@ def fetch(
def run(
self,
state: dict,
from_date: date | None = None,
limit: int | None = None,
from_incl: datetime | None = None,
to_excl: datetime | None = None,
time_per_loop: timedelta = timedelta(days=1),
**kwargs,
) -> Iterator[RESOURCE | ResourceWithRelations[RESOURCE] | RecordError]:
if limit is not None:
@@ -44,17 +44,19 @@

first_run = not state
if first_run:
if from_date is None:
raise ValueError("In the first run, the from-date needs to be set")
from_incl = datetime.combine(from_date, datetime.min.time())
if from_incl is None:
raise ValueError("In the first run, from_incl needs to be set")
else:
from_incl = datetime.fromtimestamp(state["last"] + 0.001)

logging.info(f"Starting synchronisation {from_incl=}, {to_excl=}.")
state["from_incl"] = from_incl.timestamp()
state["to_excl"] = to_excl.timestamp()
for datetime_, result in self.fetch(from_incl=from_incl, to_excl=to_excl):
yield result
if datetime_:
state["last"] = datetime_.timestamp()
while from_incl < to_excl:
to_excl_current = min(from_incl + time_per_loop, to_excl)
logging.info(f"Starting synchronisation {from_incl=}, {to_excl_current=}.")
state["from_incl"] = from_incl.timestamp()
state["to_excl"] = to_excl_current.timestamp()
for datetime_, result in self.fetch(from_incl=from_incl, to_excl=to_excl_current):
yield result
if datetime_:
state["last"] = datetime_.timestamp()
from_incl = to_excl_current
state["result"] = "Complete run done (although there might be errors)."
Changes to an example resource JSON (file header not captured)
@@ -12,6 +12,7 @@
"editor": [],
"status": "draft"
},
"access_mode": ["textual"],
"alternate_name": [
"alias 1",
"alias 2"
@@ -23,7 +24,7 @@
],
"citation": [],
"contact": [],
"content": [{"plain": "An alternative to using .distribution.content_url"}],
"content": {"plain": "An alternative to using .distribution.content_url"},
"creator": [],
"distribution": [
{
@@ -39,7 +40,9 @@
"technology_readiness_level": 1
}
],
"educational_level": ["primary school", "secondary school", "university"],
"has_part": [],
"in_language": ["eng", "fra", "spa"],
"industrial_sector": [
"Finance",
"eCommerce",
Expand All @@ -50,6 +53,22 @@
"keyword1",
"keyword2"
],
"location": [{
"address": {
"region": "California",
"locality": "Paris",
"street": "Wetstraat 170",
"postal_code": "1040 AA",
"address": "Wetstraat 170, 1040 Brussel",
"country": "BEL"
},
"geo": {
"latitude": 37.42242,
"longitude": -122.08585,
"elevation_millimeters": 0
}
}],
"prerequisite": ["undergraduate knowledge of statistics", "graduate knowledge of linear algebra"],
"relevant_link": ["https://www.example.com/a_relevant_link", "https://www.example.com/another_relevant_link"],
"license": "https://creativecommons.org/share-your-work/public-domain/cc0/",
"media": [
@@ -80,6 +99,11 @@
"Computer Vision."
],
"level": "EQF level 3",
"target_audience": [
"professionals",
"students in higher education",
"teachers in secondary school"
],
"type": "presentation"
}
]
75 changes: 49 additions & 26 deletions src/connectors/huggingface/huggingface_dataset_connector.py
@@ -1,6 +1,5 @@
import logging
import typing

import bibtexparser
import requests
from huggingface_hub import list_datasets
@@ -58,27 +57,7 @@ def fetch(
yield RecordError(identifier=dataset.id, error=e)

def fetch_dataset(self, dataset: DatasetInfo, pydantic_class, pydantic_class_publication):
citations = []
if hasattr(dataset, "citation") and dataset.citation:
parsed_citations = bibtexparser.loads(dataset.citation).entries
if len(parsed_citations) == 0:
if dataset.citation:
citations = [
pydantic_class_publication(
name=dataset.citation,
)
]
else:
citations = [
pydantic_class_publication(
platform=self.platform_name,
platform_resource_identifier=citation["ID"],
name=citation["title"],
same_as=citation["link"] if "link" in citation else None,
type=citation["ENTRYTYPE"],
)
for citation in parsed_citations
]
citations = self._parse_citations(dataset, pydantic_class_publication)

parquet_info = HuggingFaceDatasetConnector._get(
url="https://datasets-server.huggingface.co/parquet",
@@ -96,11 +75,18 @@ def fetch_dataset(self, dataset: DatasetInfo, pydantic_class, pydantic_class_pub
]
size = None
ds_license = None
if dataset.card_data is not None and "license" in dataset.card_data:
if (
dataset.card_data is not None
and "license" in dataset.card_data
and dataset.card_data["license"]
):
if isinstance(dataset.card_data["license"], str):
ds_license = dataset.card_data["license"]
else:
(ds_license,) = dataset.card_data["license"]
# There can be more than one license in HF, e.g., ['cc-by-sa-3.0', 'gfdl']. This
# seems weird, what does it mean to have two different licenses? That's why we're
# only saving the first.
ds_license = dataset.card_data["license"][0]

# TODO(issue 8): implement
# if "dataset_info" in dataset.cardData:
@@ -129,10 +115,47 @@ def fetch_dataset(self, dataset: DatasetInfo, pydantic_class, pydantic_class_pub
description=description,
date_published=dataset.createdAt if hasattr(dataset, "createdAt") else None,
license=ds_license,
distributions=distributions,
distribution=distributions,
is_accessible_for_free=True,
size=size,
keywords=dataset.tags,
keyword=dataset.tags,
),
related_resources=related_resources,
)

def _parse_citations(self, dataset, pydantic_class_publication) -> list:
"""Best effort parsing of the citations. There are many"""
raw_citation = getattr(dataset, "citation", None)
if raw_citation is None:
return []

try:
parsed_citations = bibtexparser.loads(raw_citation).entries
if len(parsed_citations) == 0 and raw_citation.startswith("@"):
# Ugly fix: many HF datasets have a wrong citation (see testcase)
parsed_citations = bibtexparser.loads(raw_citation + "}").entries
elif len(parsed_citations) == 0 and len(raw_citation) <= field_length.NORMAL:
# Sometimes dataset.citation is not a bibtex field, but just the title of an article
return [
pydantic_class_publication(
name=raw_citation, aiod_entry=AIoDEntryCreate(status="published")
)
]
return [
pydantic_class_publication(
platform=self.platform_name,
platform_resource_identifier=citation["ID"],
name=citation["title"],
same_as=citation["link"] if "link" in citation else None,
type=citation["ENTRYTYPE"],
description=Text(plain=f"By {citation['author']}")
if "author" in citation
else None,
aiod_entry=AIoDEntryCreate(status="published"),
)
for citation in parsed_citations
]
except Exception:
# Probably an incorrect bibtex. There are many mistakes in the HF citations. E.g.,
# @Inproceedings(Conference) instead of @inproceedings (note the capitals).
return []
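
As a rough illustration of the retry above, here is a small sketch assuming bibtexparser's v1 API (`loads(...).entries`); the citation string is invented, mimicking the truncated entries found on HF:

```python
import bibtexparser

# Invented example of a malformed HF citation: the closing brace is missing.
raw_citation = "@inproceedings{lin-2004, title={ROUGE: A Package}, author={Chin-Yew Lin}"

entries = bibtexparser.loads(raw_citation).entries
if not entries and raw_citation.startswith("@"):
    # Retry with an appended brace, mirroring the "ugly fix" above.
    entries = bibtexparser.loads(raw_citation + "}").entries

for entry in entries:
    print(entry["ID"], entry["ENTRYTYPE"], entry["title"])
```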