diff --git a/CHANGELOG.md b/CHANGELOG.md
index 841d1f890..5f6882cf2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
 # Changelog

-## v0.0.25 (in progress)
+## v0.0.25
 - Update GeoMx NGS directory schema
+- Add EPIC dataset field derived_dataset_type to UNIQUE_FIELDS_MAP

 ## v0.0.24
 - Release MERFISH
@@ -21,6 +22,7 @@
 - Update Visium with probes directory schema
 - Update Visium no probes directory schema
 - Change to EntityTypeInfo constraint format to support constraints endpoint
+- Add support for EPIC's new plugin (pass metadata TSV, Globus token, and app context to plugin validators)

 ## v0.0.23
 - Add token to validation_utils.get_assaytype_data, replace URL string concatenation with urllib
diff --git a/examples/dataset-examples/bad-no-assay-type/README.md b/examples/dataset-examples/bad-no-assay-type/README.md
index 00f25a7d7..05410254e 100644
--- a/examples/dataset-examples/bad-no-assay-type/README.md
+++ b/examples/dataset-examples/bad-no-assay-type/README.md
@@ -1,4 +1,5 @@
 ```
 Preflight Errors:
-- No assay_type or dataset_type in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
-```
\ No newline at end of file
+- 'Required dataset field not present in examples/dataset-examples/bad-no-assay-type/upload/bad-metadata.tsv.
+  One of the following is required: assay_type, dataset_type, derived_dataset_type.'
+```
diff --git a/src/ingest_validation_tools/enums.py b/src/ingest_validation_tools/enums.py
index 178aeea60..d3909b488 100644
--- a/src/ingest_validation_tools/enums.py
+++ b/src/ingest_validation_tools/enums.py
@@ -208,3 +208,18 @@ class Sample(EntityTypes):
     @classmethod
     def with_parent_type(cls):
         return [*[entity_type for entity_type in cls], OtherTypes.SAMPLE]
+
+
+# These fields should all be considered mutually exclusive,
+# even within the same type
+UNIQUE_FIELDS_MAP = {
+    OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"},
+    OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"},
+    DatasetType.DATASET: {"assay_type", "dataset_type", "derived_dataset_type"},
+    OtherTypes.SOURCE: {"strain_rrid"},
+    OtherTypes.ORGAN: {"organ_id"},  # Deprecated
+    OtherTypes.SAMPLE: {"sample_id"},
+}
+OTHER_FIELDS_UNIQUE_FIELDS_MAP = {
+    k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET
+}
diff --git a/src/ingest_validation_tools/plugin_validator.py b/src/ingest_validation_tools/plugin_validator.py
index 5f390ee33..9e2b5f49c 100644
--- a/src/ingest_validation_tools/plugin_validator.py
+++ b/src/ingest_validation_tools/plugin_validator.py
@@ -3,7 +3,7 @@
 from collections.abc import Iterator
 from importlib import util
 from pathlib import Path
-from typing import List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union

 from ingest_validation_tools.schema_loader import SchemaVersion

@@ -53,6 +53,9 @@ def __init__(
         assay_type: str,
         contains: List = [],
         verbose: bool = False,
+        metadata_tsv: Optional[SchemaVersion] = None,
+        globus_token: Optional[str] = None,
+        app_context: Dict[str, str] = {},
         **kwargs,
     ):
         """
@@ -72,6 +75,9 @@
         self.assay_type = assay_type
         self.contains = contains
         self.verbose = verbose
+        self.metadata_tsv = metadata_tsv
+        self.token = globus_token
+        self.app_context = app_context

     def _log(self, message):
         if self.verbose:
@@ -99,6 +105,8 @@ def run_plugin_validators_iter(
     plugin_dir: PathOrStr,
     is_shared_upload: bool,
     verbose: bool = True,
+    globus_token: Optional[str] = None,
+    app_context: Dict[str, str] = {},
     **kwargs,
 ) -> Iterator[KeyValuePair]:
     """
@@ -134,7 +142,15 @@
                 raise ValidatorError(f"{data_path} should be the base directory of a dataset")
dataset") data_paths.append(data_path) for k, v in validation_error_iter( - data_paths, sv.dataset_type, plugin_dir, sv.contains, verbose=verbose, **kwargs + data_paths, + sv.dataset_type, + plugin_dir, + sv.contains, + verbose=verbose, + metadata_tsv=sv, + globus_token=globus_token, + app_context=app_context, + **kwargs, ): yield k, v else: @@ -179,6 +195,9 @@ def validation_error_iter( plugin_dir: PathOrStr, contains: List, verbose: bool = False, + metadata_tsv: SchemaVersion = None, + globus_token: str = None, + app_context: Dict[str, str] = {}, **kwargs, ) -> Iterator[KeyValuePair]: """ @@ -195,6 +214,8 @@ def validation_error_iter( error messages """ for cls in validation_class_iter(plugin_dir): - validator = cls(paths, assay_type, contains, verbose) + validator = cls( + paths, assay_type, contains, verbose, metadata_tsv, globus_token, app_context + ) for err in validator.collect_errors(**kwargs): yield cls, err diff --git a/src/ingest_validation_tools/schema_loader.py b/src/ingest_validation_tools/schema_loader.py index 8018826d6..fb98d53f8 100644 --- a/src/ingest_validation_tools/schema_loader.py +++ b/src/ingest_validation_tools/schema_loader.py @@ -8,6 +8,7 @@ from typing import Dict, List, Optional, Sequence, Set, Union from ingest_validation_tools.enums import ( + UNIQUE_FIELDS_MAP, DatasetType, EntityTypes, OtherTypes, @@ -91,13 +92,8 @@ def get_row_data(self): self.is_cedar = True else: self.is_cedar = False + self.get_dataset_type_value() self.version = self.rows[0].get("version") - assay_type = self.rows[0].get("assay_type") - dataset_type = self.rows[0].get("dataset_type") - if assay_type is not None and dataset_type is not None: - raise PreflightError(f"Found both assay_type and dataset_type for path {self.path}!") - else: - self.dataset_type = assay_type if assay_type else dataset_type def get_assayclassifier_data(self): self.vitessce_hints = self.soft_assay_data.get("vitessce-hints", []) @@ -109,6 +105,19 @@ def get_assayclassifier_data(self): contains = self.soft_assay_data.get("must-contain", []) self.contains = [schema.lower() for schema in contains] + def get_dataset_type_value(self): + dataset_fields = { + k: v for k, v in self.rows[0].items() if k in UNIQUE_FIELDS_MAP[DatasetType.DATASET] + } + values_found = list(dataset_fields.values()) + if len(values_found) == 0: + return + elif len(values_found) > 1: + raise PreflightError( + f"Found multiple dataset fields for path {self.path}: {dataset_fields}" + ) + self.dataset_type = values_found[0] + @dataclass class EntityTypeInfo: diff --git a/src/ingest_validation_tools/upload.py b/src/ingest_validation_tools/upload.py index 998da5188..aa082c297 100644 --- a/src/ingest_validation_tools/upload.py +++ b/src/ingest_validation_tools/upload.py @@ -181,7 +181,7 @@ def get_app_context(self, submitted_app_context: Dict): Ensure that all default values are present, but privilege any submitted values (after making a basic validity check). 
""" - for url_type in ["entities_url", "ingest_url", "constraints_url"]: + for url_type in ["entities_url", "ingest_url", "constraints_url", "uuid_url"]: if submitted_app_context.get(url_type): split_url = urlsplit(submitted_app_context[url_type]) assert ( @@ -193,6 +193,7 @@ def get_app_context(self, submitted_app_context: Dict): "request_header": {"X-Hubmap-Application": "ingest-pipeline"}, # TODO: does not work in HuBMAP currently "constraints_url": None, + "uuid_url": "https://uuid.api.hubmapconsortium.org/uuid/", } | submitted_app_context def validation_routine( @@ -444,6 +445,8 @@ def _get_plugin_errors(self, **kwargs) -> dict: plugin_path, self.is_shared_upload, verbose=self.verbose, + globus_token=self.globus_token, + app_context=self.app_context, **kwargs, ): if v is None: diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py index b50f29556..d85095d21 100644 --- a/src/ingest_validation_tools/validation_utils.py +++ b/src/ingest_validation_tools/validation_utils.py @@ -12,7 +12,14 @@ DirectoryValidationErrors, validate_directory, ) -from ingest_validation_tools.enums import DatasetType, EntityTypes, OtherTypes, Sample +from ingest_validation_tools.enums import ( + OTHER_FIELDS_UNIQUE_FIELDS_MAP, + UNIQUE_FIELDS_MAP, + DatasetType, + EntityTypes, + OtherTypes, + Sample, +) from ingest_validation_tools.schema_loader import ( EntityTypeInfo, PreflightError, @@ -21,18 +28,6 @@ ) from ingest_validation_tools.table_validator import ReportType -UNIQUE_FIELDS_MAP = { - OtherTypes.ANTIBODIES: {"antibody_rrid", "antibody_name"}, - OtherTypes.CONTRIBUTORS: {"orcid", "orcid_id"}, - DatasetType.DATASET: {"assay_type", "dataset_type"}, - OtherTypes.SOURCE: {"strain_rrid"}, - OtherTypes.ORGAN: {"organ_id"}, # Deprecated? - OtherTypes.SAMPLE: {"sample_id"}, -} -OTHER_FIELDS_UNIQUE_FIELDS_MAP = { - k: v for k, v in UNIQUE_FIELDS_MAP.items() if not k == DatasetType.DATASET -} - def match_field_in_unique_fields( match_fields: list, path: str, dataset=True @@ -86,7 +81,9 @@ def get_schema_version( return other_type message = [] if not [field for field in UNIQUE_FIELDS_MAP[DatasetType.DATASET] if field in rows[0].keys()]: - message.append(f"No assay_type or dataset_type in {path}.") + message.append( + f"Required dataset field not present in {path}. One of the following is required: {', '.join(sorted(UNIQUE_FIELDS_MAP[DatasetType.DATASET]))}" + ) if "channel_id" in rows[0]: message.append('Has "channel_id": Antibodies TSV found where metadata TSV expected.') elif "orcid_id" in rows[0]: