Skip to content

Commit

Permalink
awards: added subjects and orgs, updated mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
0einstein0 committed Sep 30, 2024
1 parent b61f971 commit 83b7979
Show file tree
Hide file tree
Showing 11 changed files with 777 additions and 72 deletions.
18 changes: 14 additions & 4 deletions invenio_rdm_records/records/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
EDTFListDumperExt,
GrantTokensDumperExt,
StatisticsDumperExt,
SubjectHierarchyDumperExt,
)
from .systemfields import (
HasDraftCheckField,
Expand Down Expand Up @@ -122,6 +123,7 @@ class CommonFieldsMixin:
CombinedSubjectsDumperExt(),
CustomFieldsDumperExt(fields_var="RDM_CUSTOM_FIELDS"),
StatisticsDumperExt("stats"),
SubjectHierarchyDumperExt(),
]
)

Expand Down Expand Up @@ -150,14 +152,22 @@ class CommonFieldsMixin:
funding_award=PIDListRelation(
"metadata.funding",
relation_field="award",
keys=["title", "number", "identifiers", "acronym", "program", "subjects", "organizations"],
keys=[
"title",
"number",
"identifiers",
"acronym",
"program",
"subjects",
"organizations",
],
pid_field=Award.pid,
cache_key="awards",
),
funding_award_subjects=PIDListRelation(
funding_award_subjects=PIDNestedListRelation(
"metadata.funding",
relation_field="award.subjects",
keys=["subject", "scheme", "identifiers", "props"],
keys=["subject", "scheme", "props"],
pid_field=Subject.pid,
cache_key="subjects",
),
Expand All @@ -176,7 +186,7 @@ class CommonFieldsMixin:
),
subjects=PIDListRelation(
"metadata.subjects",
keys=["subject", "scheme"],
keys=["subject", "scheme", "props"],
pid_field=Subject.pid,
cache_key="subjects",
),
Expand Down
2 changes: 2 additions & 0 deletions invenio_rdm_records/records/dumpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .locations import LocationsDumper
from .pids import PIDsDumperExt
from .statistics import StatisticsDumperExt
from .subject_hierarchy import SubjectHierarchyDumperExt

__all__ = (
"CombinedSubjectsDumperExt",
Expand All @@ -22,4 +23,5 @@
"GrantTokensDumperExt",
"LocationsDumper",
"StatisticsDumperExt",
"SubjectHierarchyDumperExt",
)
68 changes: 68 additions & 0 deletions invenio_rdm_records/records/dumpers/subject_hierarchy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# Invenio-RDM-Records is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""Search dumpers for subject hierarchy support."""

from invenio_records.dumpers import SearchDumperExt


class SubjectHierarchyDumperExt(SearchDumperExt):
    """Search dumper extension for subject hierarchy support.

    It walks the subjects attached to each award under ``metadata.funding``
    in the dumped document, builds hierarchical parent notations from each
    subject's ``props.parents`` string, and stores the result under
    ``props.hierarchy`` of that subject.

    This dumper needs to be placed after the RelationDumper for subjects as it
    relies on dereferenced subjects with scheme, subject, and props.
    """

    def __init__(self, splitchar=","):
        """Constructor.

        :param splitchar: string used both to split ``props.parents`` and to
            combine subject ids in the generated hierarchy entries.
        """
        super().__init__()
        self._splitchar = splitchar

    def _build_hierarchy(self, parents_str, current_subject_id):
        """Build the hierarchy by progressively combining parent notations.

        For ``parents_str="a,b"`` and ``current_subject_id="c"`` (with the
        default split character) the result is ``["a", "a,b", "a,b,c"]``.

        :param parents_str: parent notations joined by the split character,
            ordered from the top-level parent downwards.
        :param current_subject_id: id of the subject itself; it forms the last
            (deepest) entry of the hierarchy.
        :returns: list of cumulative paths, one per level.
        """
        levels = parents_str.split(self._splitchar)
        levels.append(str(current_subject_id))
        # Every prefix of ``levels`` is one entry: top-level parent first,
        # the full path ending in the subject itself last.
        return [
            self._splitchar.join(levels[:depth])
            for depth in range(1, len(levels) + 1)
        ]

    def dump(self, record, data):
        """Dump the data to secondary storage (OpenSearch-like).

        Mutates the award subjects found in ``data`` in place, adding
        ``props.hierarchy`` to every subject that has both an ``id`` and a
        non-empty ``props.parents``. Documents without funding information
        are left untouched.

        :param record: the record being dumped (not used here).
        :param data: the dictionary that will be sent to the search index.
        """
        # ``or`` fallbacks (instead of ``.get(key, default)``) also cover keys
        # that are present but explicitly set to ``None``.
        metadata = data.get("metadata") or {}
        for funding in metadata.get("funding") or []:
            award = funding.get("award") or {}
            for subject in award.get("subjects") or []:
                props = subject.get("props") or {}
                parents = props.get("parents", "")
                subject_id = subject.get("id", "")
                if parents and subject_id:
                    # ``props`` is the subject's own dict here (it is truthy
                    # because it holds ``parents``), so this updates ``data``.
                    props["hierarchy"] = self._build_hierarchy(
                        parents, subject_id
                    )

    def load(self, data, record_cls):
        """Load the data from secondary storage (OpenSearch-like).

        Intentionally a no-op. This is run against the parent too, so presence
        of any field cannot be assumed.
        """
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
},
"affiliations": {
"type": "array",
"items": {"$ref": "#/affiliation"}
"items": { "$ref": "#/affiliation" }
},
"agent": {
"description": "An agent (user, software process, community, ...).",
Expand Down Expand Up @@ -72,16 +72,11 @@
{
"title": "GeoJSON Point",
"type": "object",
"required": [
"type",
"coordinates"
],
"required": ["type", "coordinates"],
"properties": {
"type": {
"type": "string",
"enum": [
"Point"
]
"enum": ["Point"]
},
"coordinates": {
"type": "array",
Expand All @@ -102,16 +97,11 @@
{
"title": "GeoJSON LineString",
"type": "object",
"required": [
"type",
"coordinates"
],
"required": ["type", "coordinates"],
"properties": {
"type": {
"type": "string",
"enum": [
"LineString"
]
"enum": ["LineString"]
},
"coordinates": {
"type": "array",
Expand All @@ -136,16 +126,11 @@
{
"title": "GeoJSON Polygon",
"type": "object",
"required": [
"type",
"coordinates"
],
"required": ["type", "coordinates"],
"properties": {
"type": {
"type": "string",
"enum": [
"Polygon"
]
"enum": ["Polygon"]
},
"coordinates": {
"type": "array",
Expand Down Expand Up @@ -173,16 +158,11 @@
{
"title": "GeoJSON MultiPoint",
"type": "object",
"required": [
"type",
"coordinates"
],
"required": ["type", "coordinates"],
"properties": {
"type": {
"type": "string",
"enum": [
"MultiPoint"
]
"enum": ["MultiPoint"]
},
"coordinates": {
"type": "array",
Expand All @@ -206,16 +186,11 @@
{
"title": "GeoJSON MultiLineString",
"type": "object",
"required": [
"type",
"coordinates"
],
"required": ["type", "coordinates"],
"properties": {
"type": {
"type": "string",
"enum": [
"MultiLineString"
]
"enum": ["MultiLineString"]
},
"coordinates": {
"type": "array",
Expand Down Expand Up @@ -243,16 +218,11 @@
{
"title": "GeoJSON MultiPolygon",
"type": "object",
"required": [
"type",
"coordinates"
],
"required": ["type", "coordinates"],
"properties": {
"type": {
"type": "string",
"enum": [
"MultiPolygon"
]
"enum": ["MultiPolygon"]
},
"coordinates": {
"type": "array",
Expand Down Expand Up @@ -311,10 +281,7 @@
"nameType": {
"description": "Type of name.",
"type": "string",
"enum": [
"personal",
"organizational"
]
"enum": ["personal", "organizational"]
},
"person_or_org": {
"type": "object",
Expand Down Expand Up @@ -381,12 +348,55 @@
},
"subject": {
"type": "string"
},
"scheme": {
"description": "Identifier of the subject scheme.",
"$ref": "local://definitions-v1.0.0.json#/identifier"
},
"props": {
"type": "object",
"patternProperties": {
"^.*$": {
"type": "string"
}
}
},
"identifiers": {
"description": "Alternate identifiers for the subject.",
"type": "array",
"items": {
"$ref": "local://definitions-v2.0.0.json#/identifiers_with_scheme"
},
"uniqueItems": true
}
}
},
"subjects": {
"type": "array",
"items": {"$ref": "#/subject"}
"items": { "$ref": "#/subject" }
},
"organization": {
"description": "Award's organizations.",
"type": "object",
"additionalProperties": false,
"properties": {
"scheme": {
"description": "Identifier of the organization scheme.",
"$ref": "local://definitions-v1.0.0.json#/identifier"
},
"id": {
"description": "Identifier of the organization for the given scheme.",
"$ref": "local://definitions-v1.0.0.json#/identifier"
},
"organization": {
"description": "Human readable label.",
"type": "string"
}
}
},
"organizations": {
"type": "array",
"items": { "$ref": "#/organization" }
},
"title_type": {
"description": "Type of title.",
Expand Down
31 changes: 11 additions & 20 deletions invenio_rdm_records/records/jsonschemas/records/record-v6.0.0.json
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,12 @@
"items": {
"$ref": "local://definitions-v1.0.0.json#/identifiers_with_scheme"
}
},
"subjects": {
"$ref": "local://records/definitions-v2.0.0.json#/subjects"
},
"organizations": {
"$ref": "local://records/definitions-v2.0.0.json#/organizations"
}
}
}
Expand Down Expand Up @@ -400,18 +406,12 @@
"record": {
"description": "Record visibility (public or restricted)",
"type": "string",
"enum": [
"public",
"restricted"
]
"enum": ["public", "restricted"]
},
"files": {
"description": "Files visibility (public or restricted)",
"type": "string",
"enum": [
"public",
"restricted"
]
"enum": ["public", "restricted"]
},
"embargo": {
"description": "Description of the embargo on the record.",
Expand All @@ -420,25 +420,16 @@
"properties": {
"active": {
"description": "Whether or not the embargo is (still) active.",
"type": [
"boolean",
"null"
]
"type": ["boolean", "null"]
},
"until": {
"description": "Embargo date of record (ISO8601 formatted date time in UTC). At this time both metadata and files will be made public.",
"type": [
"string",
"null"
],
"type": ["string", "null"],
"format": "date"
},
"reason": {
"description": "The reason why the record is under embargo.",
"type": [
"string",
"null"
]
"type": ["string", "null"]
}
}
}
Expand Down
Loading

0 comments on commit 83b7979

Please sign in to comment.