From 358038da833d16aeffd62ad921e3a89dae7258a5 Mon Sep 17 00:00:00 2001 From: roll Date: Mon, 6 May 2024 10:13:39 +0100 Subject: [PATCH] serializers: add datapackage serializer --- .../resources/serializers/__init__.py | 2 + .../serializers/datapackage/__init__.py | 26 +++++ .../serializers/datapackage/schema.py | 84 ++++++++++++++ .../test_datapackage_serializer.py | 108 ++++++++++++++++++ 4 files changed, 220 insertions(+) create mode 100644 invenio_rdm_records/resources/serializers/datapackage/__init__.py create mode 100644 invenio_rdm_records/resources/serializers/datapackage/schema.py create mode 100644 tests/resources/serializers/test_datapackage_serializer.py diff --git a/invenio_rdm_records/resources/serializers/__init__.py b/invenio_rdm_records/resources/serializers/__init__.py index f150cfa14..3b67a347a 100644 --- a/invenio_rdm_records/resources/serializers/__init__.py +++ b/invenio_rdm_records/resources/serializers/__init__.py @@ -22,6 +22,7 @@ from .csl import CSLJSONSerializer, StringCitationSerializer from .csv import CSVRecordSerializer from .datacite import DataCite43JSONSerializer, DataCite43XMLSerializer +from .datapackage import DataPackageSerializer from .dcat import DCATSerializer from .dublincore import DublinCoreJSONSerializer, DublinCoreXMLSerializer from .geojson import GeoJSONSerializer @@ -43,6 +44,7 @@ "CSVRecordSerializer", "DataCite43JSONSerializer", "DataCite43XMLSerializer", + "DataPackageSerializer", "DublinCoreJSONSerializer", "DublinCoreXMLSerializer", "FAIRSignpostingProfileLvl2Serializer", diff --git a/invenio_rdm_records/resources/serializers/datapackage/__init__.py b/invenio_rdm_records/resources/serializers/datapackage/__init__.py new file mode 100644 index 000000000..e55d5e8d8 --- /dev/null +++ b/invenio_rdm_records/resources/serializers/datapackage/__init__.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2024 Open Knowledge Foundation +# +# Invenio-RDM-Records is free software; you can redistribute it and/or 
# modify it under the terms of the MIT License; see LICENSE file for more details.

"""Data Package Serializers for Invenio RDM Records."""

from flask_resources import BaseListSchema, MarshmallowSerializer
from flask_resources.serializers import JSONSerializer

from .schema import DataPackageSchema


class DataPackageSerializer(MarshmallowSerializer):
    """Serialize records as Data Package descriptors.

    The serializer wires three collaborators into ``MarshmallowSerializer``:
    ``JSONSerializer`` as the output format, ``DataPackageSchema`` for single
    records and ``BaseListSchema`` for lists of records.
    """

    def __init__(self, **options):
        """Initialise the serializer.

        :param options: Extra keyword arguments handed over unchanged to the
            ``MarshmallowSerializer`` constructor.
        """
        serializer_config = {
            "format_serializer_cls": JSONSerializer,
            "object_schema_cls": DataPackageSchema,
            "list_schema_cls": BaseListSchema,
        }
        # Unpacking both mappings in the call keeps the duplicate-keyword
        # ``TypeError`` if a caller passes one of the preset names.
        super().__init__(**serializer_config, **options)


# ---------------------------------------------------------------------------
# New file: invenio_rdm_records/resources/serializers/datapackage/schema.py
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 Open Knowledge Foundation
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Data Package based Schema for Invenio RDM Records."""

from marshmallow import Schema, fields, missing

PROFILE_URL = "https://datapackage.org/profiles/2.0/datapackage.json"


def _without_none(mapping):
    """Return a copy of ``mapping`` without the keys whose value is ``None``."""
    return {key: value for key, value in mapping.items() if value is not None}


class DataPackageSchema(Schema):
    """Schema for Data Package in JSON.

    Scalar properties are read from the record dictionary through dotted
    attribute paths. List properties are assembled by the ``get_*`` methods,
    which return ``missing`` when they have nothing to report so the key is
    left out of the output. ``resources`` is the exception: it is always
    emitted, as an empty list when no file can be described.
    """

    profile = fields.Constant(PROFILE_URL, data_key="$schema")

    id = fields.Str(attribute="links.doi")
    name = fields.Str(attribute="id")
    title = fields.Str(attribute="metadata.title")
    description = fields.Str(attribute="metadata.description")
    version = fields.Str(attribute="metadata.version")
    created = fields.Str(attribute="created")
    homepage = fields.Str(attribute="links.self_html")
    keywords = fields.Method("get_keywords")
    resources = fields.Method("get_resources")
    licenses = fields.Method("get_licenses")
    contributors = fields.Method("get_contributors")

    def get_keywords(self, obj):
        """Collect the non-empty ``subject`` strings of ``metadata.subjects``.

        :param obj: Record dictionary being serialized.
        :returns: List of keyword strings, or ``missing`` when there are none.
        """
        # ``or`` (rather than a ``.get`` default) also covers keys that are
        # present but explicitly set to ``None``.
        metadata = obj.get("metadata") or {}
        subjects = metadata.get("subjects") or []
        keywords = [
            subject["subject"] for subject in subjects if subject.get("subject")
        ]
        return keywords or missing

    def get_resources(self, obj):
        """Describe every entry of ``files.entries`` as a Data Package resource.

        Resource paths are built as ``<links.self_html>/files/<key>``, so no
        resource is produced when the record has no ``self_html`` link, and
        entries without a ``key`` are skipped.

        :param obj: Record dictionary being serialized.
        :returns: List of resource dictionaries (possibly empty).
        """
        basepath = (obj.get("links") or {}).get("self_html")
        if not basepath:
            return []

        entries = (obj.get("files") or {}).get("entries") or {}
        resources = []
        for entry in entries.values():
            key = entry.get("key")
            if not key:
                # Both "name" and "path" derive from the key.
                continue
            resources.append(
                _without_none(
                    {
                        "name": key,
                        "path": f"{basepath}/files/{key}",
                        "format": entry.get("ext"),
                        "mimetype": entry.get("mimetype"),
                        "bytes": entry.get("size"),
                        "hash": entry.get("checksum"),
                    }
                )
            )
        return resources

    def get_licenses(self, obj):
        """Turn ``metadata.rights`` items into Data Package licenses.

        Only rights carrying an ``id`` are kept. The path comes from ``link``
        or, failing that, ``props.url``; the title is the English (``en``)
        entry of ``title``.

        :param obj: Record dictionary being serialized.
        :returns: List of license dictionaries, or ``missing`` when empty.
        """
        metadata = obj.get("metadata") or {}
        licenses = []
        for right in metadata.get("rights") or []:
            identifier = right.get("id")
            if not identifier:
                continue
            licenses.append(
                _without_none(
                    {
                        "name": identifier,
                        "path": right.get("link")
                        or (right.get("props") or {}).get("url"),
                        "title": (right.get("title") or {}).get("en"),
                    }
                )
            )
        return licenses or missing

    def get_contributors(self, obj):
        """Merge ``metadata.creators`` and ``metadata.contributors``.

        Creators come first. The role is ``role.id`` when available and
        otherwise the name of the source list (``"creator"`` or
        ``"contributor"``). Only the first affiliation is used, as the
        ``organization``.

        :param obj: Record dictionary being serialized.
        :returns: List of contributor dictionaries, or ``missing`` when empty.
        """
        metadata = obj.get("metadata") or {}
        contributors = []
        for default_role in ("creator", "contributor"):
            for item in metadata.get(f"{default_role}s") or []:
                person_or_org = item.get("person_or_org") or {}
                affiliation = (item.get("affiliations") or [{}])[0] or {}
                role = (item.get("role") or {}).get("id") or default_role
                contributor = _without_none(
                    {
                        "title": person_or_org.get("name"),
                        "givenName": person_or_org.get("given_name"),
                        "familyName": person_or_org.get("family_name"),
                        "roles": [role],
                        "organization": affiliation.get("name"),
                    }
                )
                # "roles" is always set, so a plain truthiness check can never
                # reject an entry; require at least one identifying property.
                if contributor.keys() - {"roles"}:
                    contributors.append(contributor)
        return contributors or missing


# ---------------------------------------------------------------------------
# New file: tests/resources/serializers/test_datapackage_serializer.py
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 Open Knowledge Foundation
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Resources serializers tests."""

from invenio_rdm_records.resources.serializers.datapackage import DataPackageSerializer


def test_data_package_serializer_empty_record():
    """An empty record dumps to the profile URL plus an empty resource list."""
    expected = {
        "$schema": "https://datapackage.org/profiles/2.0/datapackage.json",
        "resources": [],
    }
    dumped = DataPackageSerializer().dump_obj({})
    assert dumped == expected


def test_data_package_serializer_minimal_record(minimal_record_to_dict):
    """A minimal record dumps identifiers, title, dates and its creators."""
    expected_contributors = [
        {
            "familyName": "Brown",
            "givenName": "Troy",
            "roles": ["creator"],
        },
        {
            "roles": ["creator"],
            "title": "Troy Inc.",
        },
    ]
    expected = {
        "$schema": "https://datapackage.org/profiles/2.0/datapackage.json",
        "id": "https://handle.stage.datacite.org/10.1234/67890-fghij",
        "name": "67890-fghij",
        "title": "A Romans story",
        "created": "2023-11-14T19:33:09.837080+00:00",
        "homepage": "https://127.0.0.1:5000/records/67890-fghij",
        "resources": [],
        "contributors": expected_contributors,
    }
    dumped = DataPackageSerializer().dump_obj(minimal_record_to_dict)
    assert dumped == expected


def test_data_package_serializer_full_record(full_record_to_dict):
    """A full record dumps keywords, resources, licenses and contributors."""
    expected_resources = [
        {
            "name": "test.txt",
            "path": "https://127.0.0.1:5000/records/12345-abcde/files/test.txt",
            "format": "txt",
            "mimetype": "text/plain",
            "bytes": 9,
            "hash": "md5:e795abeef2c38de2b064be9f6364ceae",
        },
    ]
    expected_licenses = [
        {
            "name": "cc-by-4.0",
            "path": "https://creativecommons.org/licenses/by/4.0/legalcode",
            "title": "Creative Commons Attribution 4.0 International",
        },
    ]
    expected_contributors = [
        {
            "familyName": "Nielsen",
            "givenName": "Lars Holm",
            "organization": "CERN",
            "roles": ["creator"],
            "title": "Nielsen, Lars Holm",
        },
        {
            "familyName": "Tom",
            "givenName": "Blabin",
            "roles": ["creator"],
            "title": "Tom, Blabin",
        },
        {
            "familyName": "Nielsen",
            "givenName": "Lars Holm",
            "organization": "CERN",
            "roles": ["other"],
            "title": "Nielsen, Lars Holm",
        },
        {
            "familyName": "Dirk",
            "givenName": "Dirkin",
            "roles": ["other"],
            "title": "Dirk, Dirkin",
        },
    ]
    expected = {
        "$schema": "https://datapackage.org/profiles/2.0/datapackage.json",
        "id": "https://handle.stage.datacite.org/10.1234/inveniordm.1234",
        "name": "12345-abcde",
        "title": "InvenioRDM",
        # NOTE(review): this literal is reproduced exactly as it appears in the
        # extracted patch; its HTML markup looks like it was stripped in
        # transit - confirm against the ``full_record_to_dict`` fixture.
        "description": "\n\nA description\n\nwith HTML tags\n\n",
        "version": "v1.0",
        "created": "2023-11-14T18:30:55.738898+00:00",
        "homepage": "https://127.0.0.1:5000/records/12345-abcde",
        "keywords": [
            "Abdominal Injuries",
            "custom",
        ],
        "resources": expected_resources,
        "licenses": expected_licenses,
        "contributors": expected_contributors,
    }
    dumped = DataPackageSerializer().dump_obj(full_record_to_dict)
    assert dumped == expected