Initial support for optional Dataset dataset_type attribute create, update, read, and index. TODO markups for future work when dataset_type is required and data_types is deprecated/removed.
Karl Burke committed Dec 4, 2023
1 parent ebfecad commit af0cca2
Showing 6 changed files with 200 additions and 27 deletions.
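
For orientation (not part of the diff): a minimal sketch of how a client might supply the new optional attribute on create, assuming the existing POST /entities/<entity_type> route served by create_entity() in src/app.py, a valid Globus token, and illustrative field values.

    import requests

    # Hypothetical base URL and token; the route shape follows create_entity()
    # in src/app.py. All field values below are illustrative only.
    ENTITY_API = "https://entity.api.hubmapconsortium.org"
    TOKEN = "<globus-token>"

    payload = {
        # Legacy attribute, still required on create until dataset_type replaces it
        "data_types": ["AF"],
        # New optional attribute from this commit, checked on create/update
        # by validate_recognized_dataset_type against the UBKG valueset
        "dataset_type": "Auto-fluorescence",
        # ...other required Dataset properties (group_uuid, etc.) omitted...
    }

    response = requests.post(f"{ENTITY_API}/entities/dataset",
                             headers={"Authorization": f"Bearer {TOKEN}"},
                             json=payload)
    response.raise_for_status()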
56 changes: 41 additions & 15 deletions src/app.py
@@ -191,6 +191,12 @@ def http_internal_server_error(e):
####################################################################################################

try:
try:
_schema_yaml_file = app.config['SCHEMA_YAML_FILE']
except KeyError as ke:
logger.error("Expected configuration failed to load %s from app_config=%s.", ke, app.config)
raise Exception("Expected configuration failed to load. See the logs.")

# The schema_manager is a singleton module
# Pass in auth_helper_instance, neo4j_driver instance, and memcached_client_instance
schema_manager.initialize(_schema_yaml_file,
@@ -931,6 +937,8 @@ def create_entity(entity_type):
# Currently only ValueError
except ValueError as e:
bad_request_error(e)
except schema_errors.UnimplementedValidatorException as uve:
internal_server_error(uve)

# Additional validation for Sample entities
if normalized_entity_type == 'Sample':
@@ -2666,7 +2674,8 @@ def get_prov_info():
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
HEADER_FIRST_SAMPLE_SUBMISSION_ID = 'first_sample_submission_id'
@@ -2693,11 +2702,12 @@ def get_prov_info():
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -2790,6 +2800,8 @@ def get_prov_info():

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
@@ -2805,6 +2817,8 @@ def get_prov_info():
# If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', dataset['uuid'])

@@ -3041,7 +3055,8 @@ def get_prov_info_for_dataset(id):
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_DATASET_SAMPLES = 'dataset_samples'
HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
@@ -3068,11 +3083,12 @@ def get_prov_info_for_dataset(id):
HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status'
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -3111,6 +3127,8 @@ def get_prov_info_for_dataset(id):

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
@@ -3124,6 +3142,11 @@ def get_prov_info_for_dataset(id):
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type']

internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace(
'<identifier>', dataset['uuid'])
if dataset['first_sample'] is not None:
@@ -3295,15 +3318,16 @@ def get_prov_info_for_dataset(id):
-------
json
a json array. Each item in the array corresponds to a dataset. Each dataset has the values: dataset_group_name,
organ_type, dataset_data_types, and dataset_status, each of which is a string.
organ_type, dataset_data_types, and dataset_status, each of which is a string. # TODO-integrate dataset_dataset_type into the documentation.
"""
@app.route('/datasets/sankey_data', methods=['GET'])
def sankey_data():
# String constants
HEADER_DATASET_GROUP_NAME = 'dataset_group_name'
HEADER_ORGAN_TYPE = 'organ_type'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_STATUS = 'dataset_status'

with open('sankey_mapping.json') as f:
@@ -3335,10 +3359,8 @@ def sankey_data():
internal_dict = collections.OrderedDict()
internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME]

organ_code = dataset[HEADER_ORGAN_TYPE].upper()
validate_organ_code(organ_code)

internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[organ_code].lower()
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
# Data type codes are replaced with data type descriptions
assay_description = ""
try:
@@ -3357,6 +3379,8 @@ def sankey_data():
internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]
if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys():
internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]]
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

# Each dataset's dictionary is added to the list to be returned
dataset_sankey_list.append(internal_dict)
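
For reference, one illustrative entry of dataset_sankey_list after the mapping step (all values invented; a dataset_dataset_type key would accompany these once integrated):

    {
        "dataset_group_name": "Some TMC",
        "organ_type": "kidney",
        "dataset_data_types": "Autofluorescence Microscopy",
        "dataset_status": "Published"
    }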
@@ -3534,16 +3558,17 @@ def get_sample_prov_info():
json
an array of each unpublished dataset.
fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization",
"provider_experiment_id", "uuid")
"provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation.
tsv
a text/tab-separated-value document including each unpublished dataset.
fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization",
"provider_experiment_id", "uuid")
"provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation.
"""
@app.route('/datasets/unpublished', methods=['GET'])
def unpublished():
# String constants
HEADER_DATA_TYPES = "data_types"
HEADER_DATA_TYPES = "data_types" # TODO-eliminate when HEADER_DATASET_TYPE is required
HEADER_DATASET_TYPE = 'dataset_type'
HEADER_ORGANIZATION = "organization"
HEADER_UUID = "uuid"
HEADER_HUBMAP_ID = "hubmap_id"
Expand All @@ -3552,8 +3577,9 @@ def unpublished():
HEADER_SUBMISSION_ID = "donor_submission_id"
HEADER_PROVIDER_EXPERIMENT_ID = "provider_experiment_id"

# TODO-Eliminate HEADER_DATA_TYPES once HEADER_DATASET_TYPE is required.
headers = [
HEADER_DATA_TYPES, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID,
HEADER_DATA_TYPES, HEADER_DATASET_TYPE, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID,
HEADER_SUBMISSION_ID, HEADER_PROVIDER_EXPERIMENT_ID
]

@@ -4772,7 +4798,7 @@ def access_level_prefix_dir(dir_name):


"""
Ensures that a given organ code is 2-letter alphabetic and can be found int the UBKG ontology-api
Ensures that a given organ code is 2-letter alphabetic and can be found in the UBKG ontology-api
Parameters
----------
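The new UnimplementedValidatorException branch in create_entity() maps a validator that is named in the YAML but missing from the codebase to an HTTP 500, since that is a schema defect rather than bad client input. A minimal sketch (assumed helper and module names, matching the property-level validator signature used in src/schema) of the dispatch that would raise it:

    from schema import schema_errors, schema_validators

    def run_property_validators(validator_names, property_key, normalized_entity_type,
                                request, existing_data_dict, new_data_dict):
        for name in validator_names:
            # Resolve each validator named in provenance_schema.yaml
            validator = getattr(schema_validators, name, None)
            if validator is None:
                # Schema/config defect -> internal_server_error() (500)
                raise schema_errors.UnimplementedValidatorException(
                    f"Validator '{name}' referenced in provenance_schema.yaml "
                    f"is not implemented")
            # Implemented validators reject bad input with ValueError,
            # which create_entity() converts to bad_request_error() (400)
            validator(property_key, normalized_entity_type, request,
                      existing_data_dict, new_data_dict)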
5 changes: 3 additions & 2 deletions src/app_neo4j_queries.py
@@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS")
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required

logger.info("======get_prov_info() query======")
logger.info(query)
@@ -834,7 +834,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET")
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type")
logger.info("======get_prov_info() query======")
logger.info(query)

@@ -891,6 +891,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
node_dict = schema_neo4j_queries.node_to_dict(entry)
content_sixteen.append(node_dict)
record_dict['processed_dataset'] = content_sixteen
record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else ''
return record_dict


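Because ds.dataset_type is appended to the RETURN clause, it surfaces positionally as record_contents[17], which must track the RETURN order exactly. A sketch of a less fragile alternative using aliases and name-based access (standard neo4j Python driver; the trimmed-down query is hypothetical):

    # Aliased RETURN terms let consumers read fields by name, not position
    query = ("MATCH (ds:Dataset {uuid: $uuid}) "
             "RETURN ds.uuid AS uuid, ds.dataset_type AS dataset_type")

    with neo4j_driver.session() as session:
        record = session.run(query, uuid=dataset_uuid).single()
        # Mirrors the `else ''` fallback applied to record_contents[17]
        dataset_type = record["dataset_type"] or ''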
8 changes: 8 additions & 0 deletions src/schema/provenance_schema.yaml
@@ -332,6 +332,14 @@ ENTITIES:
type: list
required_on_create: true # Only required for create via POST, not update via PUT
description: "The data or assay types contained in this dataset as a json array of strings. Each is an assay code from [assay types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/assay_types.yaml)."
dataset_type:
before_property_create_validators:
- validate_recognized_dataset_type
before_property_update_validators:
- validate_recognized_dataset_type
type: string
required_on_create: false # Once this replaces data_types, it will be required on create via POST, not update via PUT
description: "The assay types of this Dataset. Valid values are from UBKG are queried by schema_manager.get_valueset_dataset_type() using the Ontology API."
collections:
type: list
transient: true
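The YAML wires validate_recognized_dataset_type into both create and update. A minimal sketch of what that validator might look like, assuming the property-level validator signature used elsewhere in src/schema/schema_validators.py and that schema_manager.get_valueset_dataset_type() returns a list of valid UBKG term strings (both the signature and the call shape are assumptions):

    def validate_recognized_dataset_type(property_key, normalized_entity_type,
                                         request, existing_data_dict, new_data_dict):
        # Return shape of get_valueset_dataset_type() is an assumption here
        valid_terms = schema_manager.get_valueset_dataset_type(request)
        proposed = new_data_dict[property_key]
        if proposed not in valid_terms:
            # ValueError surfaces to the caller as a 400 bad_request_error()
            raise ValueError(f"Proposed Dataset {property_key} '{proposed}' "
                             f"is not in the UBKG dataset_type valueset")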
5 changes: 4 additions & 1 deletion src/schema/schema_errors.py
@@ -1,4 +1,7 @@

class UnimplementedValidatorException(Exception):
pass

class SchemaValidationException(Exception):
pass

@@ -36,4 +39,4 @@ class MissingApplicationHeaderException(Exception):
pass

class InvalidApplicationHeaderException(Exception):
pass
pass