diff --git a/bedboss/bbuploader/cli.py b/bedboss/bbuploader/cli.py index fb79413..55c6ac2 100644 --- a/bedboss/bbuploader/cli.py +++ b/bedboss/bbuploader/cli.py @@ -41,6 +41,9 @@ def upload_all( True, help="Run skipped projects. [Default: False]" ), run_failed: bool = typer.Option(True, help="Run failed projects. [Default: False]"), + standardize_pep: bool = typer.Option( + False, help="Standardize pep with BEDMESS. [Default: False]" + ), ): from .main import upload_all as upload_all_function @@ -57,6 +60,7 @@ def upload_all( rerun=rerun, run_skipped=run_skipped, run_failed=run_failed, + standardize_pep=standardize_pep, ) @@ -79,6 +83,9 @@ def upload_gse( True, help="Run skipped projects. [Default: False]" ), run_failed: bool = typer.Option(True, help="Run failed projects. [Default: False]"), + standardize_pep: bool = typer.Option( + False, help="Standardize pep with BEDMESS. [Default: False]" + ), ): from .main import upload_gse as upload_gse_function @@ -91,6 +98,7 @@ def upload_gse( rerun=rerun, run_skipped=run_skipped, run_failed=run_failed, + standardize_pep=standardize_pep, ) diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index 5a7bfc0..9f3797e 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -19,7 +19,7 @@ from bedboss.bedboss import run_all from bedboss.bedbuncher.bedbuncher import run_bedbuncher from bedboss.exceptions import BedBossException -from bedboss.utils import standardize_genome_name +from bedboss.utils import standardize_genome_name, standardize_pep as pep_standardizer _LOGGER = logging.getLogger(PKG_NAME) _LOGGER.setLevel(logging.DEBUG) @@ -36,8 +36,9 @@ def upload_all( genome: str = None, create_bedset: bool = True, rerun: bool = False, - run_skipped=False, - run_failed=True, + run_skipped: bool = False, + run_failed: bool = True, + standardize_pep: bool = False, ): """ This is main function that is responsible for processing bed files from PEPHub. 
@@ -54,6 +55,7 @@ def upload_all( :param rerun: rerun processing of the series :param run_skipped: rerun files that were skipped :param run_failed: rerun failed files + :param standardize_pep: standardize pep metadata using BEDMS """ phc = PEPHubClient() @@ -121,6 +123,7 @@ def upload_all( genome=genome, sa_session=session, gse_status_sa_model=gse_status, + standardize_pep=standardize_pep, ) except Exception as err: _LOGGER.error( @@ -243,8 +246,9 @@ def upload_gse( create_bedset: bool = True, genome: str = None, rerun: bool = False, - run_skipped=False, - run_failed=True, + run_skipped: bool = False, + run_failed: bool = True, + standardize_pep: bool = False, ): """ Upload bed files from GEO series to BedBase @@ -257,6 +261,7 @@ def upload_gse( :param rerun: rerun processing of the series :param run_skipped: rerun files that were skipped :param run_failed: rerun failed files + :param standardize_pep: standardize pep metadata using BEDMS :return: None """ @@ -301,6 +306,7 @@ def upload_gse( genome=genome, sa_session=session, gse_status_sa_model=gse_status, + standardize_pep=standardize_pep, ) except Exception as e: _LOGGER.error(f"Processing of '{gse}' failed with error: {e}") @@ -346,6 +352,7 @@ def _upload_gse( genome: str = None, sa_session: Session = None, gse_status_sa_model: GeoGseStatus = None, + standardize_pep: bool = False, ) -> ProjectProcessingStatus: """ Upload bed files from GEO series to BedBase @@ -357,6 +364,7 @@ def _upload_gse( :param genome: reference genome to upload to database. 
If None, all genomes will be processed :param sa_session: opened session to the database :param gse_status_sa_model: sqlalchemy model for project status + :param standardize_pep: standardize pep metadata using BEDMS :return: None """ @@ -370,6 +378,9 @@ def _upload_gse( project = phc.load_project(f"bedbase/{gse}:{DEFAULT_GEO_TAG}") + if standardize_pep: + project = pep_standardizer(project) + project_status = ProjectProcessingStatus(number_of_samples=len(project.samples)) uploaded_files = [] gse_status_sa_model.number_of_files = len(project.samples) diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 41cf0cb..50581e7 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -26,7 +26,15 @@ PlotsUpload, StatsUpload, ) -from bedboss.utils import get_genome_digest, standardize_genome_name + +from bedboss.utils import ( + standardize_genome_name, + get_genome_digest, + standardize_pep as pep_standardizer, +) +from bedboss.exceptions import BedBossException +from bedboss._version import __version__ + _LOGGER = logging.getLogger(PKG_NAME) @@ -235,6 +243,7 @@ def insert_pep( upload_pephub: bool = False, upload_qdrant: bool = False, no_fail: bool = False, + standardize_pep: bool = False, pm: pypiper.PipelineManager = None, ) -> None: """ @@ -260,6 +269,7 @@ def insert_pep( :param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False) :param bool upload_qdrant: whether to execute qdrant indexing :param bool no_fail: whether to raise an error if bedset was not added to the database + :param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False) :param pypiper.PipelineManager pm: pypiper object :return: None """ @@ -276,6 +286,9 @@ def insert_pep( else: raise BedBossException("Incorrect pep type. 
Exiting...") + if standardize_pep: + pep = pep_standardizer(pep) + bbagent = BedBaseAgent(bedbase_config) validate_project(pep, BEDBOSS_PEP_SCHEMA_PATH) diff --git a/bedboss/cli.py b/bedboss/cli.py index 3cf2843..76a70b5 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -165,6 +165,7 @@ def run_pep( upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"), no_fail: bool = typer.Option(False, help="Do not fail on error"), license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"), + standardize_pep: bool = typer.Option(False, help="Standardize the PEP using bedMS"), # PipelineManager multi: bool = typer.Option(False, help="Run multiple samples"), recover: bool = typer.Option(True, help="Recover from previous run"), @@ -192,6 +193,7 @@ def run_pep( upload_pephub=upload_pephub, upload_qdrant=upload_qdrant, no_fail=no_fail, + standardize_pep=standardize_pep, pm=create_pm( outfolder=outfolder, multi=multi, diff --git a/bedboss/models.py b/bedboss/models.py index ddbae26..636bef3 100644 --- a/bedboss/models.py +++ b/bedboss/models.py @@ -23,23 +23,42 @@ class FILE_TYPE(str, Enum): class BedMetadata(BaseModel): sample_name: str genome: str - organism: str = "" + + species_name: str = Field( + default="", description="Name of species. e.g. Homo sapiens.", alias="organism" + ) species_id: str = "" - cell_type: str = "" - cell_line: str = "" - exp_protocol: str = Field("", description="Experimental protocol (e.g. ChIP-seq)") + genotype: str = Field("", description="Genotype of the sample") + phenotype: str = Field("", description="Phenotype of the sample") + + cell_type: str = Field( + "", + description="specific kind of cell with distinct characteristics found in an organism. e.g. Neurons, Hepatocytes, Adipocytes", + ) + cell_line: str = Field( + "", + description="population of cells derived from a single cell and cultured in the lab for extended use, e.g. 
HeLa, HepG2, k562", + ) + tissue: str = Field("", description="Tissue type") + library_source: str = Field( "", description="Library source (e.g. genomic, transcriptomic)" ) - genotype: str = Field("", description="Genotype of the sample") - target: str = Field("", description="Target of the assay (e.g. H3K4me3)") + assay: str = Field( + "", description="Experimental protocol (e.g. ChIP-seq)", alias="exp_protocol" + ) antibody: str = Field("", description="Antibody used in the assay") + target: str = Field("", description="Target of the assay (e.g. H3K4me3)") treatment: str = Field( "", description="Treatment of the sample (e.g. drug treatment)" ) - tissue: str = Field("", description="Tissue type") - global_sample_id: str = Field("", description="Global sample identifier") - global_experiment_id: str = Field("", description="Global experiment identifier") + + global_sample_id: str = Field( + "", description="Global sample identifier. e.g. GSM000" + ) # excluded in training + global_experiment_id: str = Field( + "", description="Global experiment identifier. e.g. 
GSE000"
+    )  # excluded in training
     description: str = Field("", description="Description of the sample")
 
     model_config = ConfigDict(
diff --git a/bedboss/utils.py b/bedboss/utils.py
index f281b01..feea63a 100644
--- a/bedboss/utils.py
+++ b/bedboss/utils.py
@@ -4,6 +4,9 @@
 
 import requests
 from pephubclient.files_manager import FilesManager
+import peppy
+from peppy.const import SAMPLE_RAW_DICT_KEY
+from attribute_standardizer import AttrStandardizer
 
 _LOGGER = logging.getLogger("bedboss")
 
@@ -119,3 +122,46 @@ def save_example_bedbase_config(path: str) -> None:
     file_path = os.path.abspath(os.path.join(path, "bedbase_config.yaml"))
     FilesManager.save_yaml(example_bedbase_config(), file_path)
     _LOGGER.info(f"Example BedBase configuration saved to: {file_path}")
+
+
+def standardize_pep(
+    pep: peppy.Project, standard_columns: list = None, model: str = "BEDBASE"
+) -> peppy.Project:
+    """
+    Standardize PEP file by using bedMS standardization model
+
+    :param pep: peppy project
+    :param standard_columns: list of columns to standardize
+        (default: library_source, assay, genome, species_name)
+    :param model: name of the bedMS schema to standardize against,
+        e.g. "BEDBASE" or "ENCODE"
+
+    :return: peppy project; the input project itself if the standardizer
+        returns no suggestions, otherwise a project rebuilt from the samples
+    """
+    if standard_columns is None:
+        standard_columns = ["library_source", "assay", "genome", "species_name"]
+    # Build the standardizer for the schema requested through `model`, so that
+    # non-default schemas are actually applied.
+    standardizer = AttrStandardizer(model)
+    suggestions = standardizer.standardize(pep)
+    if suggestions is None:
+        return pep
+
+    # For each standard column keep only the best-scoring original column:
+    # {standard column: (original column, confidence)}
+    changes = {}
+    for original, suggestion_dict in suggestions.items():
+        for suggestion, value in suggestion_dict.items():
+            if value > 0.9 and suggestion in standard_columns:
+                if suggestion not in changes or changes[suggestion][1] < value:
+                    changes[suggestion] = (original, value)
+
+    raw_pep = pep.to_dict(extended=True)
+    samples = raw_pep[SAMPLE_RAW_DICT_KEY]
+    for suggestion, (original_key, _) in changes.items():
+        # Rename only when it cannot overwrite an existing standard column
+        if suggestion not in samples and original_key in samples:
+            samples[suggestion] = samples.pop(original_key)
+
+    return peppy.Project.from_dict(raw_pep)
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 521a177..f13255c 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -4,10 +4,11 @@ peppy>=0.40.6
 yacman>=0.8.4
 requests>=2.28.2
 piper>=v0.14.0
-bbconf>=0.6.1
+bbconf>=0.7.0
 # bbconf @ git+https://github.com/databio/bbconf.git@dev#egg=bbconf
 refgenconf>=0.12.2
 pandas>=2.0.0
 ubiquerg>=0.6.2
+bedmess>=0.1.0
 pephubclient>=0.4.4
 geniml>=0.4.1
\ No newline at end of file
diff --git a/test/test_standardizer.py b/test/test_standardizer.py
new file mode 100644
index 0000000..b63b6af
--- /dev/null
+++ b/test/test_standardizer.py
@@ -0,0 +1,24 @@
+from bedboss.utils import standardize_pep
+import peppy
+import pytest
+
+
+# @pytest.mark.skip(reason="Not for automatic testing")
+@pytest.mark.parametrize(
+    "registry_path",
+    [
+        "bedbase/gse274894:samples",
+        "bedbase/gse275349:samples",
+        "bedbase/gse262920:samples",
+        "bedbase/gse236101:samples",
+        "bedbase/gse254365:samples",
+    ],
+)
+@pytest.mark.parametrize(
+    "model",
+    ["BEDBASE", "ENCODE"],
+)
+def test_standardize_pep(registry_path, model):
+    pep = peppy.Project.from_pephub(registry_path)
+    result = standardize_pep(pep, model=model)
+    assert isinstance(result, peppy.Project)