diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py
index 41cf0cb..50581e7 100644
--- a/bedboss/bedboss.py
+++ b/bedboss/bedboss.py
@@ -26,7 +26,15 @@
     PlotsUpload,
     StatsUpload,
 )
-from bedboss.utils import get_genome_digest, standardize_genome_name
+
+from bedboss.utils import (
+    standardize_genome_name,
+    get_genome_digest,
+    standardize_pep as pep_standardizer,
+)
+from bedboss.exceptions import BedBossException
+from bedboss._version import __version__
+
 
 _LOGGER = logging.getLogger(PKG_NAME)
 
@@ -235,6 +243,7 @@ def insert_pep(
     upload_pephub: bool = False,
     upload_qdrant: bool = False,
     no_fail: bool = False,
+    standardize_pep: bool = False,
     pm: pypiper.PipelineManager = None,
 ) -> None:
     """
@@ -260,6 +269,7 @@ def insert_pep(
     :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
     :param bool upload_qdrant: whether to execute qdrant indexing
     :param bool no_fail: whether to raise an error if bedset was not added to the database
+    :param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False)
     :param pypiper.PipelineManager pm: pypiper object
     :return: None
     """
@@ -276,6 +286,9 @@ def insert_pep(
     else:
         raise BedBossException("Incorrect pep type. Exiting...")
 
+    if standardize_pep:
+        pep = pep_standardizer(pep)
+
     bbagent = BedBaseAgent(bedbase_config)
 
     validate_project(pep, BEDBOSS_PEP_SCHEMA_PATH)
diff --git a/bedboss/cli.py b/bedboss/cli.py
index 82a07b6..c0ad3ed 100644
--- a/bedboss/cli.py
+++ b/bedboss/cli.py
@@ -165,6 +165,7 @@ def run_pep(
     upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"),
     no_fail: bool = typer.Option(False, help="Do not fail on error"),
     license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"),
+    standardize_pep: bool = typer.Option(False, help="Standardize the PEP using bedMS"),
     # PipelineManager
     multi: bool = typer.Option(False, help="Run multiple samples"),
     recover: bool = typer.Option(True, help="Recover from previous run"),
@@ -192,6 +193,7 @@ def run_pep(
         upload_pephub=upload_pephub,
         upload_qdrant=upload_qdrant,
         no_fail=no_fail,
+        standardize_pep=standardize_pep,
         pm=create_pm(
             outfolder=outfolder,
             multi=multi,
diff --git a/bedboss/models.py b/bedboss/models.py
index ddbae26..636bef3 100644
--- a/bedboss/models.py
+++ b/bedboss/models.py
@@ -23,23 +23,42 @@ class FILE_TYPE(str, Enum):
 class BedMetadata(BaseModel):
     sample_name: str
     genome: str
-    organism: str = ""
+
+    species_name: str = Field(
+        default="", description="Name of species. e.g. Homo sapiens.", alias="organism"
+    )
     species_id: str = ""
-    cell_type: str = ""
-    cell_line: str = ""
-    exp_protocol: str = Field("", description="Experimental protocol (e.g. ChIP-seq)")
+    genotype: str = Field("", description="Genotype of the sample")
+    phenotype: str = Field("", description="Phenotype of the sample")
+
+    cell_type: str = Field(
+        "",
+        description="specific kind of cell with distinct characteristics found in an organism. e.g. Neurons, Hepatocytes, Adipocytes",
+    )
+    cell_line: str = Field(
+        "",
+        description="population of cells derived from a single cell and cultured in the lab for extended use, e.g. HeLa, HepG2, k562",
+    )
+    tissue: str = Field("", description="Tissue type")
+
     library_source: str = Field(
         "", description="Library source (e.g. genomic, transcriptomic)"
     )
-    genotype: str = Field("", description="Genotype of the sample")
-    target: str = Field("", description="Target of the assay (e.g. H3K4me3)")
+    assay: str = Field(
+        "", description="Experimental protocol (e.g. ChIP-seq)", alias="exp_protocol"
+    )
     antibody: str = Field("", description="Antibody used in the assay")
+    target: str = Field("", description="Target of the assay (e.g. H3K4me3)")
     treatment: str = Field(
         "", description="Treatment of the sample (e.g. drug treatment)"
     )
-    tissue: str = Field("", description="Tissue type")
-    global_sample_id: str = Field("", description="Global sample identifier")
-    global_experiment_id: str = Field("", description="Global experiment identifier")
+
+    global_sample_id: str = Field(
+        "", description="Global sample identifier. e.g. GSM000"
+    )  # excluded in training
+    global_experiment_id: str = Field(
+        "", description="Global experiment identifier. e.g. GSE000"
+    )  # excluded in training
     description: str = Field("", description="Description of the sample")
 
     model_config = ConfigDict(
diff --git a/bedboss/utils.py b/bedboss/utils.py
index f281b01..feea63a 100644
--- a/bedboss/utils.py
+++ b/bedboss/utils.py
@@ -4,6 +4,9 @@
 
 import requests
 from pephubclient.files_manager import FilesManager
+import peppy
+from peppy.const import SAMPLE_RAW_DICT_KEY
+from attribute_standardizer import AttrStandardizer
 
 _LOGGER = logging.getLogger("bedboss")
 
@@ -119,3 +122,43 @@ def save_example_bedbase_config(path: str) -> None:
     file_path = os.path.abspath(os.path.join(path, "bedbase_config.yaml"))
     FilesManager.save_yaml(example_bedbase_config(), file_path)
     _LOGGER.info(f"Example BedBase configuration saved to: {file_path}")
+
+
+def standardize_pep(
+    pep: peppy.Project, standard_columns: list = None, model: str = "BEDBASE"
+) -> peppy.Project:
+    """
+    Standardize PEP file by using bedMS standardization model
+
+    :param pep: peppy project
+    :param standard_columns: list of columns to standardize
+        (default: library_source, assay, genome, species_name)
+    :param model: name of the bedMS model passed to AttrStandardizer,
+        e.g. "BEDBASE" or "ENCODE" (default: "BEDBASE")
+    :return: peppy project
+    """
+    if standard_columns is None:
+        standard_columns = ["library_source", "assay", "genome", "species_name"]
+    # Bind the standardizer to its own name so the requested `model` is honoured.
+    standardizer = AttrStandardizer(model)
+    suggestions = standardizer.standardize(pep)
+    if suggestions is None:
+        return pep
+
+    # standard column -> (original column, confidence) of the best match so far
+    changes = {}
+    for original, suggestion_dict in suggestions.items():
+        for suggestion, value in suggestion_dict.items():
+            if value > 0.9 and suggestion in standard_columns:
+                if suggestion not in changes or changes[suggestion][1] < value:
+                    changes[suggestion] = (original, value)
+
+    raw_pep = pep.to_dict(extended=True)
+    samples = raw_pep[SAMPLE_RAW_DICT_KEY]
+    for suggestion, (original_key, _) in changes.items():
+        # Rename only if the standard name is unused and the source column exists.
+        if suggestion not in samples and original_key in samples:
+            samples[suggestion] = samples[original_key]
+            del samples[original_key]
+
+    return peppy.Project.from_dict(raw_pep)
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 2fb0151..9d6aab5 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -9,5 +9,6 @@ bbconf>=0.6.1
 refgenconf>=0.12.2
 pandas>=2.0.0
 ubiquerg>=0.6.2
+bedmess>=0.1.0
 pephubclient>=0.4.4
 geniml>=0.4.0
\ No newline at end of file
diff --git a/test/test_standardizer.py b/test/test_standardizer.py
new file mode 100644
index 0000000..b63b6af
--- /dev/null
+++ b/test/test_standardizer.py
@@ -0,0 +1,24 @@
+from bedboss.utils import standardize_pep
+import peppy
+import pytest
+
+
+# @pytest.mark.skip(reason="Not for automatic testing")
+@pytest.mark.parametrize(
+    "registry_path",
+    [
+        "bedbase/gse274894:samples",
+        "bedbase/gse275349:samples",
+        "bedbase/gse262920:samples",
+        "bedbase/gse236101:samples",
+        "bedbase/gse254365:samples",
+    ],
+)
+@pytest.mark.parametrize(
+    "model",
+    ["BEDBASE", "ENCODE"],
+)
+def test_standardize_pep(registry_path, model):
+    pep = peppy.Project.from_pephub(registry_path)
+    standardized = standardize_pep(pep, model=model)
+    assert isinstance(standardized, peppy.Project)