From 6939f11a82990961365bc76ec7fbcb9121d1a790 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 19 Aug 2024 12:13:09 -0400 Subject: [PATCH 1/2] updated metadata model --- bedboss/models.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/bedboss/models.py b/bedboss/models.py index f037295..db3a130 100644 --- a/bedboss/models.py +++ b/bedboss/models.py @@ -24,23 +24,42 @@ class FILE_TYPE(str, Enum): class BedMetadata(BaseModel): sample_name: str genome: str - organism: str = "" + + species_name: str = Field( + default="", description="Name of species. e.g. Homo sapiens.", alias="organism" + ) species_id: str = "" - cell_type: str = "" - cell_line: str = "" - exp_protocol: str = Field("", description="Experimental protocol (e.g. ChIP-seq)") + genotype: str = Field("", description="Genotype of the sample") + phenotype: str = Field("", description="Phenotype of the sample") + + cell_type: str = Field( + "", + description="specific kind of cell with distinct characteristics found in an organism. e.g. Neurons, Hepatocytes, Adipocytes", + ) + cell_line: str = Field( + "", + description="population of cells derived from a single cell and cultured in the lab for extended use, e.g. HeLa, HepG2, k562", + ) + tissue: str = Field("", description="Tissue type") + library_source: str = Field( "", description="Library source (e.g. genomic, transcriptomic)" ) - genotype: str = Field("", description="Genotype of the sample") - target: str = Field("", description="Target of the assay (e.g. H3K4me3)") + assay: str = Field( + "", description="Experimental protocol (e.g. ChIP-seq)", alias="exp_protocol" + ) antibody: str = Field("", description="Antibody used in the assay") + target: str = Field("", description="Target of the assay (e.g. H3K4me3)") treatment: str = Field( "", description="Treatment of the sample (e.g. 
drug treatment)" ) - tissue: str = Field("", description="Tissue type") - global_sample_id: str = Field("", description="Global sample identifier") - global_experiment_id: str = Field("", description="Global experiment identifier") + + global_sample_id: str = Field( + "", description="Global sample identifier. e.g. GSM000" + ) # excluded in training + global_experiment_id: str = Field( + "", description="Global experiment identifier. e.g. GSE000" + ) # excluded in training description: str = Field("", description="Description of the sample") model_config = ConfigDict( From 413392c58fe1f1e4e92385b1e438996eff8d11b9 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 12 Sep 2024 13:38:03 -0400 Subject: [PATCH 2/2] added bedmess to bedboss --- bedboss/bedboss.py | 6 ++++ bedboss/cli.py | 2 ++ bedboss/utils.py | 46 +++++++++++++++++++++++++++++++ requirements/requirements-all.txt | 3 +- test/test_standardizer.py | 24 ++++++++++++++++ 5 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 test/test_standardizer.py diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 298501e..be1b710 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -30,6 +30,7 @@ from bedboss.utils import ( standardize_genome_name, get_genome_digest, + standardize_pep as pep_standardizer, ) from bedboss.exceptions import BedBossException from bedboss._version import __version__ @@ -241,6 +242,7 @@ def insert_pep( upload_pephub: bool = False, upload_qdrant: bool = False, no_fail: bool = False, + standardize_pep: bool = False, pm: pypiper.PipelineManager = None, ) -> None: """ @@ -266,6 +268,7 @@ def insert_pep( :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param bool upload_qdrant: whether to execute qdrant indexing :param bool no_fail: whether to raise an error if bedset was not added to the database + :param bool standardize_pep: whether to standardize the pep file before processing by using bedms. 
def standardize_pep(
    pep: peppy.Project, standard_columns: list = None, model: str = "BEDBASE"
) -> peppy.Project:
    """
    Standardize PEP sample attribute names by using a bedMS standardization model.

    The standardizer proposes, for each original sample attribute, a set of
    standard attribute names with a confidence score. For every allowed standard
    column, the original attribute with the highest score above the confidence
    threshold is renamed to that standard column, unless the standard column
    already exists in the sample table.

    :param pep: peppy project
    :param standard_columns: list of standard column names that may be applied.
        Default: ["library_source", "assay", "genome", "species_name"]
    :param model: name of the bedMS model (schema) passed to AttrStandardizer,
        e.g. "BEDBASE" or "ENCODE"

    :return: peppy project with renamed columns. The input project itself is
        returned when the standardizer gives no applicable suggestions.
    """
    if standard_columns is None:
        standard_columns = ["library_source", "assay", "genome", "species_name"]

    # Only suggestions scored strictly above this value are applied.
    min_confidence = 0.9

    # Use the requested model. Previously "BEDBASE" was hard-coded here and the
    # `model` argument was shadowed, so callers could never select another model.
    standardizer = AttrStandardizer(model)
    suggestions = standardizer.standardize(pep)
    if not suggestions:
        return pep

    # standard column -> (original column, confidence) of the best match so far
    best_match = {}
    for original, suggestion_dict in suggestions.items():
        for suggestion, confidence in suggestion_dict.items():
            if confidence <= min_confidence or suggestion not in standard_columns:
                continue
            current = best_match.get(suggestion)
            # Strict comparison: on a tie the first original column seen wins.
            if current is None or current[1] < confidence:
                best_match[suggestion] = (original, confidence)

    if not best_match:
        # Nothing to rename: skip the to_dict / from_dict round trip.
        return pep

    raw_pep = pep.to_dict(extended=True)
    # NOTE(review): assumes the raw sample table is keyed by column name - confirm
    # against the peppy version in use.
    samples = raw_pep[SAMPLE_RAW_DICT_KEY]
    for suggestion, (original_key, _) in best_match.items():
        # Never overwrite an existing standard column; skip originals that were
        # already renamed to a different standard column earlier in this loop.
        if suggestion not in samples and original_key in samples:
            samples[suggestion] = samples.pop(original_key)

    return peppy.Project.from_dict(raw_pep)


# Test from test/test_standardizer.py (second file added by this patch).
# NOTE(review): loads projects via peppy.Project.from_pephub, so it presumably
# needs network access; re-enable the skip marker below if it must not run in CI.
# @pytest.mark.skip(reason="Not for automatic testing")
@pytest.mark.parametrize(
    "registry_path",
    [
        "bedbase/gse274894:samples",
        "bedbase/gse275349:samples",
        "bedbase/gse262920:samples",
        "bedbase/gse236101:samples",
        "bedbase/gse254365:samples",
    ],
)
@pytest.mark.parametrize(
    "model",
    ["BEDBASE", "ENCODE"],
)
def test_standardize_pep(registry_path, model):
    """Check that standardization with each model yields a peppy project."""
    pep = peppy.Project.from_pephub(registry_path)
    standardized = standardize_pep(pep, model=model)
    # Assert on the result instead of a vacuous `assert True`.
    assert isinstance(standardized, peppy.Project)