Initial support for optional Dataset dataset_type attribute create, update, read, and index. TODO markups for future work when dataset_type is required and data_types is deprecated/removed.
Karl Burke committed Dec 4, 2023
1 parent ebfecad commit af0cca2
Showing 6 changed files with 200 additions and 27 deletions.
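
For orientation (not part of the diff): a minimal sketch of how a client might supply the new optional attribute on create, assuming the existing POST /entities/<entity_type> route served by create_entity() in src/app.py, a valid Globus token, and illustrative field values.

    import requests

    # Hypothetical base URL and token; the route shape follows create_entity()
    # in src/app.py. All field values below are illustrative only.
    ENTITY_API = "https://entity.api.hubmapconsortium.org"
    TOKEN = "<globus-token>"

    payload = {
        # Legacy attribute, still required on create until dataset_type replaces it
        "data_types": ["AF"],
        # New optional attribute from this commit, checked on create/update
        # by validate_recognized_dataset_type against the UBKG valueset
        "dataset_type": "Auto-fluorescence",
        # ...other required Dataset properties (group_uuid, etc.) omitted...
    }

    response = requests.post(f"{ENTITY_API}/entities/dataset",
                             headers={"Authorization": f"Bearer {TOKEN}"},
                             json=payload)
    response.raise_for_status()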
56 changes: 41 additions & 15 deletions src/app.py
@@ -191,6 +191,12 @@ def http_internal_server_error(e):
####################################################################################################

try:
try:
_schema_yaml_file = app.config['SCHEMA_YAML_FILE']
except KeyError as ke:
logger.error("Expected configuration failed to load %s from app_config=%s.", ke, app.config)
raise Exception("Expected configuration failed to load. See the logs.")

# The schema_manager is a singleton module
# Pass in auth_helper_instance, neo4j_driver instance, and memcached_client_instance
schema_manager.initialize(_schema_yaml_file,
@@ -931,6 +937,8 @@ def create_entity(entity_type):
# Currently only ValueError
except ValueError as e:
bad_request_error(e)
except schema_errors.UnimplementedValidatorException as uve:
internal_server_error(uve)

# Additional validation for Sample entities
if normalized_entity_type == 'Sample':
@@ -2666,7 +2674,8 @@ def get_prov_info():
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
HEADER_FIRST_SAMPLE_SUBMISSION_ID = 'first_sample_submission_id'
@@ -2693,11 +2702,12 @@ def get_prov_info():
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -2790,6 +2800,8 @@ def get_prov_info():

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
@@ -2805,6 +2817,8 @@ def get_prov_info():
# If return_format was not equal to json, json arrays must be converted into comma separated lists for the tsv
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', dataset['uuid'])

@@ -3041,7 +3055,8 @@ def get_prov_info_for_dataset(id):
HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
HEADER_DATASET_LAB_ID = 'lab_id_or_name'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
HEADER_DATASET_SAMPLES = 'dataset_samples'
HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
@@ -3068,11 +3083,12 @@ def get_prov_info_for_dataset(id):
HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status'
HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'

# TODO-Eliminate HEADER_DATASET_DATA_TYPES once HEADER_DATASET_DATASET_TYPE is required.
headers = [
HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_DATASET_DATA_TYPES, HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
@@ -3111,6 +3127,8 @@ def get_prov_info_for_dataset(id):

# Data type codes are replaced with data type descriptions
assay_description_list = []
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
for item in dataset['data_types']:
try:
assay_description_list.append(assay_types_dict[item]['description'])
@@ -3124,6 +3142,11 @@ def get_prov_info_for_dataset(id):
internal_dict[HEADER_DATASET_DATA_TYPES] = dataset['data_types']
if return_json is False:
internal_dict[HEADER_DATASET_DATA_TYPES] = ",".join(dataset['data_types'])
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_type']

internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace(
'<identifier>', dataset['uuid'])
if dataset['first_sample'] is not None:
@@ -3295,15 +3318,16 @@ def get_prov_info_for_dataset(id):
-------
json
a json array. Each item in the array corresponds to a dataset. Each dataset has the values: dataset_group_name,
organ_type, dataset_data_types, and dataset_status, each of which is a string.
organ_type, dataset_data_types, and dataset_status, each of which is a string. # TODO-integrate dataset_dataset_type into the documentation.
"""
@app.route('/datasets/sankey_data', methods=['GET'])
def sankey_data():
# String constants
HEADER_DATASET_GROUP_NAME = 'dataset_group_name'
HEADER_ORGAN_TYPE = 'organ_type'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types'
HEADER_DATASET_DATA_TYPES = 'dataset_data_types' # TODO-eliminate when HEADER_DATASET_DATASET_TYPE is required
HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
HEADER_DATASET_STATUS = 'dataset_status'

with open('sankey_mapping.json') as f:
@@ -3335,10 +3359,8 @@ def sankey_data():
internal_dict = collections.OrderedDict()
internal_dict[HEADER_DATASET_GROUP_NAME] = dataset[HEADER_DATASET_GROUP_NAME]

organ_code = dataset[HEADER_ORGAN_TYPE].upper()
validate_organ_code(organ_code)

internal_dict[HEADER_ORGAN_TYPE] = organ_types_dict[organ_code].lower()
# TODO BEGIN evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.
# Data type codes are replaced with data type descriptions
assay_description = ""
try:
@@ -3357,6 +3379,8 @@ def sankey_data():
internal_dict[HEADER_DATASET_GROUP_NAME] = mapping_dict[internal_dict[HEADER_DATASET_GROUP_NAME]]
if internal_dict[HEADER_DATASET_DATA_TYPES] in mapping_dict.keys():
internal_dict[HEADER_DATASET_DATA_TYPES] = mapping_dict[internal_dict[HEADER_DATASET_DATA_TYPES]]
# TODO END evaluate elimination of this block, if it is still in place following the YAML-to-UBKG effort on https://github.com/hubmapconsortium/entity-api/issues/494,
# and once dataset['dataset_type'] is required and dataset['data_types'] removed.

# Each dataset's dictionary is added to the list to be returned
dataset_sankey_list.append(internal_dict)
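
For reference, one illustrative entry of dataset_sankey_list after the mapping step (all values invented; a dataset_dataset_type key would accompany these once integrated):

    {
        "dataset_group_name": "Some TMC",
        "organ_type": "kidney",
        "dataset_data_types": "Autofluorescence Microscopy",
        "dataset_status": "Published"
    }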
@@ -3534,16 +3558,17 @@ def get_sample_prov_info():
json
an array of each unpublished dataset.
fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization",
"provider_experiment_id", "uuid")
"provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation.
tsv
a text/tab-separated-value document including each unpublished dataset.
fields: ("data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization",
"provider_experiment_id", "uuid")
"provider_experiment_id", "uuid") # TODO-integrate dataset_dataset_type to documentation.
"""
@app.route('/datasets/unpublished', methods=['GET'])
def unpublished():
# String constants
HEADER_DATA_TYPES = "data_types"
HEADER_DATA_TYPES = "data_types" # TODO-eliminate when HEADER_DATASET_TYPE is required
HEADER_DATASET_TYPE = 'dataset_type'
HEADER_ORGANIZATION = "organization"
HEADER_UUID = "uuid"
HEADER_HUBMAP_ID = "hubmap_id"
Expand All @@ -3552,8 +3577,9 @@ def unpublished():
HEADER_SUBMISSION_ID = "donor_submission_id"
HEADER_PROVIDER_EXPERIMENT_ID = "provider_experiment_id"

# TODO-Eliminate HEADER_DATA_TYPES once HEADER_DATASET_TYPE is required.
headers = [
HEADER_DATA_TYPES, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID,
HEADER_DATA_TYPES, HEADER_DATASET_TYPE, HEADER_ORGANIZATION, HEADER_UUID, HEADER_HUBMAP_ID, HEADER_ORGAN, HEADER_DONOR_HUBMAP_ID,
HEADER_SUBMISSION_ID, HEADER_PROVIDER_EXPERIMENT_ID
]

@@ -4772,7 +4798,7 @@ def access_level_prefix_dir(dir_name):


"""
Ensures that a given organ code is 2-letter alphabetic and can be found int the UBKG ontology-api
Ensures that a given organ code is 2-letter alphabetic and can be found in the UBKG ontology-api
Parameters
----------
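The new UnimplementedValidatorException branch in create_entity() maps a validator that is named in the YAML but missing from the codebase to an HTTP 500, since that is a schema defect rather than bad client input. A minimal sketch (assumed helper and module names, matching the property-level validator signature used in src/schema) of the dispatch that would raise it:

    from schema import schema_errors, schema_validators

    def run_property_validators(validator_names, property_key, normalized_entity_type,
                                request, existing_data_dict, new_data_dict):
        for name in validator_names:
            # Resolve each validator named in provenance_schema.yaml
            validator = getattr(schema_validators, name, None)
            if validator is None:
                # Schema/config defect -> internal_server_error() (500)
                raise schema_errors.UnimplementedValidatorException(
                    f"Validator '{name}' referenced in provenance_schema.yaml "
                    f"is not implemented")
            # Implemented validators reject bad input with ValueError,
            # which create_entity() converts to bad_request_error() (400)
            validator(property_key, normalized_entity_type, request,
                      existing_data_dict, new_data_dict)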
5 changes: 3 additions & 2 deletions src/app_neo4j_queries.py
@@ -736,7 +736,7 @@ def get_prov_info(neo4j_driver, param_dict, published_only):
f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS")
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, REVISIONS") # TODO replace ds.data_types with ds.dataset_type when required

logger.info("======get_prov_info() query======")
logger.info(query)
@@ -834,7 +834,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
f" WITH ds, FIRSTSAMPLE, DONOR, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp, "
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET")
f" ds.last_modified_user_email, ds.lab_dataset_id, ds.data_types, METASAMPLE, PROCESSED_DATASET, ds.dataset_type")
logger.info("======get_prov_info() query======")
logger.info(query)

@@ -891,6 +891,7 @@ def get_individual_prov_info(neo4j_driver, dataset_uuid):
node_dict = schema_neo4j_queries.node_to_dict(entry)
content_sixteen.append(node_dict)
record_dict['processed_dataset'] = content_sixteen
record_dict['dataset_type'] = record_contents[17] if record_contents[17] is not None else ''
return record_dict


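Because ds.dataset_type is appended to the RETURN clause, it surfaces positionally as record_contents[17], which must track the RETURN order exactly. A sketch of a less fragile alternative using aliases and name-based access (standard neo4j Python driver; the trimmed-down query is hypothetical):

    # Aliased RETURN terms let consumers read fields by name, not position
    query = ("MATCH (ds:Dataset {uuid: $uuid}) "
             "RETURN ds.uuid AS uuid, ds.dataset_type AS dataset_type")

    with neo4j_driver.session() as session:
        record = session.run(query, uuid=dataset_uuid).single()
        # Mirrors the `else ''` fallback applied to record_contents[17]
        dataset_type = record["dataset_type"] or ''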
8 changes: 8 additions & 0 deletions src/schema/provenance_schema.yaml
@@ -332,6 +332,14 @@ ENTITIES:
type: list
required_on_create: true # Only required for create via POST, not update via PUT
description: "The data or assay types contained in this dataset as a json array of strings. Each is an assay code from [assay types](https://github.com/hubmapconsortium/search-api/blob/main/src/search-schema/data/definitions/enums/assay_types.yaml)."
dataset_type:
before_property_create_validators:
- validate_recognized_dataset_type
before_property_update_validators:
- validate_recognized_dataset_type
type: string
required_on_create: false # Once this replaces data_types, it will be required on create via POST, not update via PUT
description: "The assay types of this Dataset. Valid values are from UBKG are queried by schema_manager.get_valueset_dataset_type() using the Ontology API."
collections:
type: list
transient: true
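The YAML wires validate_recognized_dataset_type into both create and update. A minimal sketch of what that validator might look like, assuming the property-level validator signature used elsewhere in src/schema/schema_validators.py and that schema_manager.get_valueset_dataset_type() returns a list of valid UBKG term strings (both the signature and the call shape are assumptions):

    def validate_recognized_dataset_type(property_key, normalized_entity_type,
                                         request, existing_data_dict, new_data_dict):
        # Return shape of get_valueset_dataset_type() is an assumption here
        valid_terms = schema_manager.get_valueset_dataset_type(request)
        proposed = new_data_dict[property_key]
        if proposed not in valid_terms:
            # ValueError surfaces to the caller as a 400 bad_request_error()
            raise ValueError(f"Proposed Dataset {property_key} '{proposed}' "
                             f"is not in the UBKG dataset_type valueset")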
5 changes: 4 additions & 1 deletion src/schema/schema_errors.py
@@ -1,4 +1,7 @@

class UnimplementedValidatorException(Exception):
pass

class SchemaValidationException(Exception):
pass

@@ -36,4 +39,4 @@ class MissingApplicationHeaderException(Exception):
pass

class InvalidApplicationHeaderException(Exception):
pass
pass