Skip to content

Commit

Permalink
Merge pull request #7 from odissei-data/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
FjodorvRijsselberg authored Jan 30, 2023
2 parents d10a497 + df24325 commit 16bdd08
Show file tree
Hide file tree
Showing 28 changed files with 486 additions and 48 deletions.
63 changes: 60 additions & 3 deletions dot_env_example
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
HOSTNAME="localhost"
PROJECT_CONTAINER_NAME="prefect"
# This python path is added to avoid having to do relative imports.
PYTHONPATH="/scripts/"
DATAVERSE_URL="https://portal.staging.odissei.nl"
DATAVERSE_API_TOKEN="example1-ab12-12ab-abcd-a1b2c3d4e5g6"
XML2JSON_API_TOKEN="Bearer @km1-10122004-lamA"
GITHUB_USERNAME="odissei-data"
DOCKERHUB_USERNAME="fjodorvr"
VERSION="v0.2.0-alpha"

#CBS
CBS_DATAVERSE_ALIAS="cbs"
CBS_TEMPLATE_FILE_PATH="/resources/templates/cbs_dataverse_template.json"
CBS_MAPPING_FILE_PATH="/resources/mappings/cbs-mapping.json"
CBS_METADATA_DIRECTORY="/local-metadata/cbs-metadata"
CBS_METADATA_DIRECTORY="/local-metadata/all"

#EASY
EASY_DATAVERSE_ALIAS="dans-easy"
EASY_DATAVERSE_ALIAS="easy"
EASY_TEMPLATE_FILE_PATH="/resources/templates/easy_dataverse_template.json"
EASY_MAPPING_FILE_PATH="/resources/mappings/easy-mapping.json"
EASY_METADATA_DIRECTORY="/local-metadata/easy-metadata"
Expand All @@ -25,4 +30,56 @@ LISS_METADATA_DIRECTORY="/local-metadata/liss-metadata"
#DataverseNL
DATAVERSE_NL_DATAVERSE_ALIAS="dataverse_nl"
DATAVERSE_NL_METADATA_DIRECTORY="/local-metadata/dataverse_nl-metadata"
DATAVERSE_NL_SOURCE_DATAVERSE_URL="https://dataverse.nl"
DATAVERSE_NL_SOURCE_DATAVERSE_URL="https://dataverse.nl"

#4tu
RESEARCH_DATA_DATAVERSE_ALIAS="4tu"
RESEARCH_DATA_METADATA_DIRECTORY="/local-metadata/dataverse_nl/4tu"

#avans
AVANS_DATAVERSE_ALIAS="AvansHogeschool"
AVANS_METADATA_DIRECTORY="/local-metadata/dataverse_nl/avans"

#fontys
FONTYS_DATAVERSE_ALIAS="fontys"
FONTYS_METADATA_DIRECTORY="/local-metadata/dataverse_nl/fontys"

#groningen
GRONINGEN_DATAVERSE_ALIAS="groningen"
GRONINGEN_METADATA_DIRECTORY="/local-metadata/dataverse_nl/groningen"

#hanze
HANZE_DATAVERSE_ALIAS="hanze"
HANZE_METADATA_DIRECTORY="/local-metadata/dataverse_nl/hanze"

#Hogeschool Rotterdam
HR_DATAVERSE_ALIAS="hr"
HR_METADATA_DIRECTORY="/local-metadata/dataverse_nl/hr"

#leiden
LEIDEN_DATAVERSE_ALIAS="leidenuniversity"
LEIDEN_METADATA_DIRECTORY="/local-metadata/dataverse_nl/leiden"

#maastricht
MAASTRICHT_DATAVERSE_ALIAS="maastricht"
MAASTRICHT_METADATA_DIRECTORY="/local-metadata/dataverse_nl/maastricht"

#tilburg
TILBURG_DATAVERSE_ALIAS="tiu"
TILBURG_METADATA_DIRECTORY="/local-metadata/dataverse_nl/tilburg"

#trimbos
TRIMBOS_DATAVERSE_ALIAS="trimbos"
TRIMBOS_METADATA_DIRECTORY="/local-metadata/dataverse_nl/trimbos"

#UMC
UMC_DATAVERSE_ALIAS="UMCU"
UMC_METADATA_DIRECTORY="/local-metadata/dataverse_nl/umc"

#utrecht
UTRECHT_DATAVERSE_ALIAS="UU"
UTRECHT_METADATA_DIRECTORY="/local-metadata/dataverse_nl/utrecht"

#vu
VU_DATAVERSE_ALIAS="vuamsterdam"
VU_METADATA_DIRECTORY="/local-metadata/dataverse_nl/vu"
Empty file.
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import os

from prefect import flow
from prefect import flow, get_run_logger
from prefect.orion.schemas.states import Completed, Failed
from tasks.base_tasks import xml2json, dataverse_mapper, \
dataverse_import, update_publication_date, add_workflow_versioning_url

CBS_MAPPING_FILE_PATH = os.getenv('CBS_MAPPING_FILE_PATH')
CBS_TEMPLATE_FILE_PATH = os.getenv('CBS_TEMPLATE_FILE_PATH')
CBS_DATAVERSE_ALIAS = os.getenv('CBS_DATAVERSE_ALIAS')


@flow
def cbs_metadata_ingestion(file_path, version):
def cbs_metadata_ingestion(file_path, alias, version):
json_metadata = xml2json(file_path)
if not json_metadata:
return Failed(message='Unable to transform from xml to json.')
Expand All @@ -25,13 +24,15 @@ def cbs_metadata_ingestion(file_path, version):
if not mapped_metadata:
return Failed(message='Unable to store workflow version.')

import_response = dataverse_import(mapped_metadata, CBS_DATAVERSE_ALIAS)
fields = mapped_metadata['datasetVersion']['metadataBlocks']['citation'][
'fields']
split_path = file_path.split('/')[3]
doi = 'doi:10.57934/' + split_path.split('_')[0]
import_response = dataverse_import(mapped_metadata, alias,
doi)
if not import_response:
return Failed(message='Unable to import dataset into Dataverse.')

doi = import_response.json()['data']['persistentId']
fields = mapped_metadata['datasetVersion']['metadataBlocks']['citation'][
'fields']
publication_date = next((field for field in fields if
field.get('typeName') == 'distributionDate'),
{})
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
import copy
import os

from prefect import flow
from prefect import flow, get_run_logger
from prefect.orion.schemas.states import Failed, Completed

from tasks.base_tasks import xml2json, get_doi_from_header, \
dataverse_metadata_fetcher, \
dataverse_import, add_contact_email, update_publication_date, \
add_workflow_versioning_url
format_license, add_workflow_versioning_url

DATAVERSE_NL_DATAVERSE_ALIAS = os.getenv('DATAVERSE_NL_DATAVERSE_ALIAS')
DATAVERSE_NL_SOURCE_DATAVERSE_URL = os.getenv(
'DATAVERSE_NL_SOURCE_DATAVERSE_URL')


@flow
def dataverse_nl_metadata_ingestion(file_path, version):
def dataverse_nl_metadata_ingestion(file_path, alias, version):
metadata_format = "dataverse_json"
json_metadata = xml2json(file_path)
if not json_metadata:
Expand All @@ -37,18 +36,35 @@ def dataverse_nl_metadata_ingestion(file_path, version):
if not dataverse_json:
return Failed(message='Unable to add contact email')


metadata_blocks = copy.deepcopy(
dataverse_json["datasetVersion"]['metadataBlocks'])
dataverse_json['datasetVersion'] = {}
dataverse_json['datasetVersion']['metadataBlocks'] = metadata_blocks

terms_of_use = None
if 'termsOfUse' in dataverse_json['datasetVersion']:
terms_of_use = copy.deepcopy(
dataverse_json['datasetVersion']['termsOfUse'])

ds_license = None
if 'license' in dataverse_json['datasetVersion']:
ds_license = copy.deepcopy(dataverse_json['datasetVersion']['license'])

dataverse_json['datasetVersion'] = {
'metadataBlocks': metadata_blocks
}

if terms_of_use:
dataverse_json['datasetVersion']['termsOfUse'] = terms_of_use

if ds_license and ds_license != 'NONE':
dataverse_json['datasetVersion']['license'] = format_license(
ds_license)

dataverse_json = add_workflow_versioning_url(dataverse_json, version)
if not dataverse_json:
return Failed(message='Unable to store workflow version.')

import_response = dataverse_import(dataverse_json,
DATAVERSE_NL_DATAVERSE_ALIAS, doi)
alias, doi)
if not import_response:
return Failed(message='Unable to import dataset into Dataverse')

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,10 @@

EASY_MAPPING_FILE_PATH = os.getenv('EASY_MAPPING_FILE_PATH')
EASY_TEMPLATE_FILE_PATH = os.getenv('EASY_TEMPLATE_FILE_PATH')
EASY_DATAVERSE_ALIAS = os.getenv('EASY_DATAVERSE_ALIAS')


@flow
def easy_metadata_ingestion(file_path, version):
def easy_metadata_ingestion(file_path, alias, version):
json_metadata = xml2json(file_path)
if not json_metadata:
return Failed(message='Unable to transform from xml to json')
Expand All @@ -34,7 +33,7 @@ def easy_metadata_ingestion(file_path, version):
if not mapped_metadata:
return Failed(message='Unable to store workflow version.')

import_response = dataverse_import(mapped_metadata, EASY_DATAVERSE_ALIAS,
import_response = dataverse_import(mapped_metadata, alias,
doi)
if not import_response:
return Failed(message='Unable to import dataset into Dataverse')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@

LISS_MAPPING_FILE_PATH = os.getenv('LISS_MAPPING_FILE_PATH')
LISS_TEMPLATE_FILE_PATH = os.getenv('LISS_TEMPLATE_FILE_PATH')
LISS_DATAVERSE_ALIAS = os.getenv('LISS_DATAVERSE_ALIAS')


@flow
def liss_metadata_ingestion(file_path, version):
def liss_metadata_ingestion(file_path, alias, version):
json_metadata = xml2json(file_path)
if not json_metadata:
return Failed(message='Unable to transform from xml to json')
Expand All @@ -35,7 +34,7 @@ def liss_metadata_ingestion(file_path, version):
if not mapped_metadata:
return Failed(message='Unable to store workflow version.')

import_response = dataverse_import(mapped_metadata, LISS_DATAVERSE_ALIAS,
import_response = dataverse_import(mapped_metadata, alias,
doi)
if not import_response:
return Failed(message='Unable to import dataset into Dataverse')
Expand Down
Empty file.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import utils
from prefect import flow
from flows.dataset_workflows.dataverse_nl_ingestion import \
dataverse_nl_metadata_ingestion
from flows.workflow_versioning.workflow_versioner import \
create_ingestion_workflow_versioning

RESEARCH_DATA_METADATA_DIRECTORY = os.getenv(
    'RESEARCH_DATA_METADATA_DIRECTORY')
RESEARCH_DATA_DATAVERSE_ALIAS = os.getenv('RESEARCH_DATA_DATAVERSE_ALIAS')


@flow
def dataverse_nl_ingestion_pipeline():
    """Run the 4TU.ResearchData ingestion through the dataverse.nl flow.

    Creates a workflow-versioning record covering every stage of the
    ingestion chain, then hands the shared executor the dataverse.nl
    metadata flow together with the 4TU metadata directory and alias
    (both taken from the environment).
    """
    # All four stages of the chain are versioned for this pipeline.
    stage_flags = dict(transformer=True, fetcher=True,
                       importer=True, updater=True)
    workflow_version = create_ingestion_workflow_versioning(**stage_flags)

    utils.workflow_executor(dataverse_nl_metadata_ingestion,
                            RESEARCH_DATA_METADATA_DIRECTORY,
                            workflow_version,
                            RESEARCH_DATA_DATAVERSE_ALIAS)


if __name__ == "__main__":
    dataverse_nl_ingestion_pipeline()
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import utils
from prefect import flow
from flows.dataset_workflows.dataverse_nl_ingestion import \
dataverse_nl_metadata_ingestion
from flows.workflow_versioning.workflow_versioner import \
create_ingestion_workflow_versioning

AVANS_METADATA_DIRECTORY = os.getenv('AVANS_METADATA_DIRECTORY')
AVANS_DATAVERSE_ALIAS = os.getenv('AVANS_DATAVERSE_ALIAS')


@flow
def dataverse_nl_ingestion_pipeline():
    """Run the Avans ingestion through the dataverse.nl flow.

    Builds a workflow-versioning record for the full ingestion chain,
    then runs the shared executor over the Avans metadata directory
    with the Avans Dataverse alias (both read from the environment).
    """
    # Every stage of the chain participates in the version record.
    stage_flags = dict(transformer=True, fetcher=True,
                       importer=True, updater=True)
    workflow_version = create_ingestion_workflow_versioning(**stage_flags)

    utils.workflow_executor(dataverse_nl_metadata_ingestion,
                            AVANS_METADATA_DIRECTORY,
                            workflow_version,
                            AVANS_DATAVERSE_ALIAS)


if __name__ == "__main__":
    dataverse_nl_ingestion_pipeline()
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import utils
from prefect import flow
from flows.dataset_workflows.dataverse_nl_ingestion import \
dataverse_nl_metadata_ingestion
from flows.workflow_versioning.workflow_versioner import \
create_ingestion_workflow_versioning

FONTYS_METADATA_DIRECTORY = os.getenv('FONTYS_METADATA_DIRECTORY')
FONTYS_DATAVERSE_ALIAS = os.getenv('FONTYS_DATAVERSE_ALIAS')


@flow
def dataverse_nl_ingestion_pipeline():
    """Run the Fontys ingestion through the dataverse.nl flow.

    Builds a workflow-versioning record for the full ingestion chain,
    then runs the shared executor over the Fontys metadata directory
    with the Fontys Dataverse alias (both read from the environment).
    """
    # Every stage of the chain participates in the version record.
    stage_flags = dict(transformer=True, fetcher=True,
                       importer=True, updater=True)
    workflow_version = create_ingestion_workflow_versioning(**stage_flags)

    utils.workflow_executor(dataverse_nl_metadata_ingestion,
                            FONTYS_METADATA_DIRECTORY,
                            workflow_version,
                            FONTYS_DATAVERSE_ALIAS)


if __name__ == "__main__":
    dataverse_nl_ingestion_pipeline()
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import utils
from prefect import flow
from flows.dataset_workflows.dataverse_nl_ingestion import \
dataverse_nl_metadata_ingestion
from flows.workflow_versioning.workflow_versioner import \
create_ingestion_workflow_versioning

GRONINGEN_METADATA_DIRECTORY = os.getenv('GRONINGEN_METADATA_DIRECTORY')
GRONINGEN_DATAVERSE_ALIAS = os.getenv('GRONINGEN_DATAVERSE_ALIAS')


@flow
def dataverse_nl_ingestion_pipeline():
    """Run the Groningen ingestion through the dataverse.nl flow.

    Builds a workflow-versioning record for the full ingestion chain,
    then runs the shared executor over the Groningen metadata directory
    with the Groningen Dataverse alias (both read from the environment).
    """
    # Every stage of the chain participates in the version record.
    stage_flags = dict(transformer=True, fetcher=True,
                       importer=True, updater=True)
    workflow_version = create_ingestion_workflow_versioning(**stage_flags)

    utils.workflow_executor(dataverse_nl_metadata_ingestion,
                            GRONINGEN_METADATA_DIRECTORY,
                            workflow_version,
                            GRONINGEN_DATAVERSE_ALIAS)


if __name__ == "__main__":
    dataverse_nl_ingestion_pipeline()
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import utils
from prefect import flow
from flows.dataset_workflows.dataverse_nl_ingestion import \
dataverse_nl_metadata_ingestion
from flows.workflow_versioning.workflow_versioner import \
create_ingestion_workflow_versioning

HANZE_METADATA_DIRECTORY = os.getenv('HANZE_METADATA_DIRECTORY')
HANZE_DATAVERSE_ALIAS = os.getenv('HANZE_DATAVERSE_ALIAS')


@flow
def dataverse_nl_ingestion_pipeline():
    """Run the Hanze ingestion through the dataverse.nl flow.

    Builds a workflow-versioning record for the full ingestion chain,
    then runs the shared executor over the Hanze metadata directory
    with the Hanze Dataverse alias (both read from the environment).
    """
    # Every stage of the chain participates in the version record.
    stage_flags = dict(transformer=True, fetcher=True,
                       importer=True, updater=True)
    workflow_version = create_ingestion_workflow_versioning(**stage_flags)

    utils.workflow_executor(dataverse_nl_metadata_ingestion,
                            HANZE_METADATA_DIRECTORY,
                            workflow_version,
                            HANZE_DATAVERSE_ALIAS)


if __name__ == "__main__":
    dataverse_nl_ingestion_pipeline()
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
import utils
from prefect import flow
from flows.dataset_workflows.dataverse_nl_ingestion import dataverse_nl_metadata_ingestion
from flows.workflow_versioning.workflow_versioner import \
create_ingestion_workflow_versioning

HR_METADATA_DIRECTORY = os.getenv('HR_METADATA_DIRECTORY')
HR_DATAVERSE_ALIAS = os.getenv('HR_DATAVERSE_ALIAS')


@flow
def dataverse_nl_ingestion_pipeline():
    """Run the Hogeschool Rotterdam ingestion through the dataverse.nl flow.

    Builds a workflow-versioning record for the full ingestion chain,
    then runs the shared executor over the HR metadata directory with
    the HR Dataverse alias (both read from the environment).
    """
    # Every stage of the chain participates in the version record.
    stage_flags = dict(transformer=True, fetcher=True,
                       importer=True, updater=True)
    workflow_version = create_ingestion_workflow_versioning(**stage_flags)

    utils.workflow_executor(dataverse_nl_metadata_ingestion,
                            HR_METADATA_DIRECTORY,
                            workflow_version,
                            HR_DATAVERSE_ALIAS)


if __name__ == "__main__":
    dataverse_nl_ingestion_pipeline()
Loading

0 comments on commit 16bdd08

Please sign in to comment.