Skip to content

Commit

Permalink
Merge branch 'dev' into dev_ref_genome_validator
Browse files Browse the repository at this point in the history
  • Loading branch information
khoroshevskyi authored Sep 19, 2024
2 parents e34e4af + ed04ce3 commit 8c1c2d3
Show file tree
Hide file tree
Showing 8 changed files with 140 additions and 16 deletions.
8 changes: 8 additions & 0 deletions bedboss/bbuploader/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def upload_all(
True, help="Run skipped projects. [Default: False]"
),
run_failed: bool = typer.Option(True, help="Run failed projects. [Default: False]"),
standardize_pep: bool = typer.Option(
False, help="Standardize pep with BEDMESS. [Default: False]"
),
):
from .main import upload_all as upload_all_function

Expand All @@ -57,6 +60,7 @@ def upload_all(
rerun=rerun,
run_skipped=run_skipped,
run_failed=run_failed,
standardize_pep=standardize_pep,
)


Expand All @@ -79,6 +83,9 @@ def upload_gse(
True, help="Run skipped projects. [Default: False]"
),
run_failed: bool = typer.Option(True, help="Run failed projects. [Default: False]"),
standardize_pep: bool = typer.Option(
False, help="Standardize pep with BEDMESS. [Default: False]"
),
):
from .main import upload_gse as upload_gse_function

Expand All @@ -91,6 +98,7 @@ def upload_gse(
rerun=rerun,
run_skipped=run_skipped,
run_failed=run_failed,
standardize_pep=standardize_pep,
)


Expand Down
21 changes: 16 additions & 5 deletions bedboss/bbuploader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from bedboss.bedboss import run_all
from bedboss.bedbuncher.bedbuncher import run_bedbuncher
from bedboss.exceptions import BedBossException
from bedboss.utils import standardize_genome_name
from bedboss.utils import standardize_genome_name, standardize_pep as pep_standardizer

_LOGGER = logging.getLogger(PKG_NAME)
_LOGGER.setLevel(logging.DEBUG)
Expand All @@ -36,8 +36,9 @@ def upload_all(
genome: str = None,
create_bedset: bool = True,
rerun: bool = False,
run_skipped=False,
run_failed=True,
run_skipped: bool = False,
run_failed: bool = True,
standardize_pep: bool = False,
):
"""
This is the main function responsible for processing bed files from PEPHub.
Expand All @@ -54,6 +55,7 @@ def upload_all(
:param rerun: rerun processing of the series
:param run_skipped: rerun files that were skipped
:param run_failed: rerun failed files
:param standardize_pep: standardize pep metadata using BEDMS
"""

phc = PEPHubClient()
Expand Down Expand Up @@ -121,6 +123,7 @@ def upload_all(
genome=genome,
sa_session=session,
gse_status_sa_model=gse_status,
standardize_pep=standardize_pep,
)
except Exception as err:
_LOGGER.error(
Expand Down Expand Up @@ -243,8 +246,9 @@ def upload_gse(
create_bedset: bool = True,
genome: str = None,
rerun: bool = False,
run_skipped=False,
run_failed=True,
run_skipped: bool = False,
run_failed: bool = True,
standardize_pep: bool = False,
):
"""
Upload bed files from GEO series to BedBase
Expand All @@ -257,6 +261,7 @@ def upload_gse(
:param rerun: rerun processing of the series
:param run_skipped: rerun files that were skipped
:param run_failed: rerun failed files
:param standardize_pep: standardize pep metadata using BEDMS
:return: None
"""
Expand Down Expand Up @@ -301,6 +306,7 @@ def upload_gse(
genome=genome,
sa_session=session,
gse_status_sa_model=gse_status,
standardize_pep=standardize_pep,
)
except Exception as e:
_LOGGER.error(f"Processing of '{gse}' failed with error: {e}")
Expand Down Expand Up @@ -346,6 +352,7 @@ def _upload_gse(
genome: str = None,
sa_session: Session = None,
gse_status_sa_model: GeoGseStatus = None,
standardize_pep: bool = False,
) -> ProjectProcessingStatus:
"""
Upload bed files from GEO series to BedBase
Expand All @@ -357,6 +364,7 @@ def _upload_gse(
:param genome: reference genome to upload to database. If None, all genomes will be processed
:param sa_session: opened session to the database
:param gse_status_sa_model: sqlalchemy model for project status
:param standardize_pep: standardize pep metadata using BEDMS
:return: None
"""
Expand All @@ -370,6 +378,9 @@ def _upload_gse(

project = phc.load_project(f"bedbase/{gse}:{DEFAULT_GEO_TAG}")

if standardize_pep:
project = pep_standardizer(project)

project_status = ProjectProcessingStatus(number_of_samples=len(project.samples))
uploaded_files = []
gse_status_sa_model.number_of_files = len(project.samples)
Expand Down
15 changes: 14 additions & 1 deletion bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,15 @@
PlotsUpload,
StatsUpload,
)
from bedboss.utils import get_genome_digest, standardize_genome_name

from bedboss.utils import (
standardize_genome_name,
get_genome_digest,
standardize_pep as pep_standardizer,
)
from bedboss.exceptions import BedBossException
from bedboss._version import __version__


_LOGGER = logging.getLogger(PKG_NAME)

Expand Down Expand Up @@ -235,6 +243,7 @@ def insert_pep(
upload_pephub: bool = False,
upload_qdrant: bool = False,
no_fail: bool = False,
standardize_pep: bool = False,
pm: pypiper.PipelineManager = None,
) -> None:
"""
Expand All @@ -260,6 +269,7 @@ def insert_pep(
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param bool upload_qdrant: whether to execute qdrant indexing
:param bool no_fail: whether to raise an error if bedset was not added to the database
:param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False)
:param pypiper.PipelineManager pm: pypiper object
:return: None
"""
Expand All @@ -276,6 +286,9 @@ def insert_pep(
else:
raise BedBossException("Incorrect pep type. Exiting...")

if standardize_pep:
pep = pep_standardizer(pep)

bbagent = BedBaseAgent(bedbase_config)

validate_project(pep, BEDBOSS_PEP_SCHEMA_PATH)
Expand Down
2 changes: 2 additions & 0 deletions bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def run_pep(
upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"),
no_fail: bool = typer.Option(False, help="Do not fail on error"),
license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"),
standardize_pep: bool = typer.Option(False, help="Standardize the PEP using bedMS"),
# PipelineManager
multi: bool = typer.Option(False, help="Run multiple samples"),
recover: bool = typer.Option(True, help="Recover from previous run"),
Expand Down Expand Up @@ -192,6 +193,7 @@ def run_pep(
upload_pephub=upload_pephub,
upload_qdrant=upload_qdrant,
no_fail=no_fail,
standardize_pep=standardize_pep,
pm=create_pm(
outfolder=outfolder,
multi=multi,
Expand Down
37 changes: 28 additions & 9 deletions bedboss/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,42 @@ class FILE_TYPE(str, Enum):
class BedMetadata(BaseModel):
sample_name: str
genome: str
organism: str = ""

species_name: str = Field(
default="", description="Name of species. e.g. Homo sapiens.", alias="organism"
)
species_id: str = ""
cell_type: str = ""
cell_line: str = ""
exp_protocol: str = Field("", description="Experimental protocol (e.g. ChIP-seq)")
genotype: str = Field("", description="Genotype of the sample")
phenotype: str = Field("", description="Phenotype of the sample")

cell_type: str = Field(
"",
description="specific kind of cell with distinct characteristics found in an organism. e.g. Neurons, Hepatocytes, Adipocytes",
)
cell_line: str = Field(
"",
description="population of cells derived from a single cell and cultured in the lab for extended use, e.g. HeLa, HepG2, k562",
)
tissue: str = Field("", description="Tissue type")

library_source: str = Field(
"", description="Library source (e.g. genomic, transcriptomic)"
)
genotype: str = Field("", description="Genotype of the sample")
target: str = Field("", description="Target of the assay (e.g. H3K4me3)")
assay: str = Field(
"", description="Experimental protocol (e.g. ChIP-seq)", alias="exp_protocol"
)
antibody: str = Field("", description="Antibody used in the assay")
target: str = Field("", description="Target of the assay (e.g. H3K4me3)")
treatment: str = Field(
"", description="Treatment of the sample (e.g. drug treatment)"
)
tissue: str = Field("", description="Tissue type")
global_sample_id: str = Field("", description="Global sample identifier")
global_experiment_id: str = Field("", description="Global experiment identifier")

global_sample_id: str = Field(
"", description="Global sample identifier. e.g. GSM000"
) # excluded in training
global_experiment_id: str = Field(
"", description="Global experiment identifier. e.g. GSE000"
) # excluded in training
description: str = Field("", description="Description of the sample")

model_config = ConfigDict(
Expand Down
46 changes: 46 additions & 0 deletions bedboss/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

import requests
from pephubclient.files_manager import FilesManager
import peppy
from peppy.const import SAMPLE_RAW_DICT_KEY
from attribute_standardizer import AttrStandardizer

_LOGGER = logging.getLogger("bedboss")

Expand Down Expand Up @@ -119,3 +122,46 @@ def save_example_bedbase_config(path: str) -> None:
file_path = os.path.abspath(os.path.join(path, "bedbase_config.yaml"))
FilesManager.save_yaml(example_bedbase_config(), file_path)
_LOGGER.info(f"Example BedBase configuration saved to: {file_path}")


def standardize_pep(
    pep: peppy.Project, standard_columns: list = None, model: str = "BEDBASE"
) -> peppy.Project:
    """
    Standardize PEP sample attribute names by using a bedMS standardization model

    The standardizer is asked for suggestions for the project's sample attributes.
    For every standard column, the original attribute with the highest confidence
    above 0.9 is renamed to that standard column, unless the standard column is
    already present in the sample table.

    :param pep: peppy project
    :param standard_columns: list of standard column names that attributes may be renamed to
        (default: ["library_source", "assay", "genome", "species_name"])
    :param model: name of the standardization schema passed to AttrStandardizer (default: "BEDBASE")
    :return: peppy project. The input project itself is returned when there is nothing to rename;
        otherwise a new project is built from the modified sample table.
    """
    if standard_columns is None:
        standard_columns = ["library_source", "assay", "genome", "species_name"]

    # Honour the requested schema. Previously "BEDBASE" was hard-coded here and the
    # `model` argument was overwritten by the standardizer instance.
    standardizer = AttrStandardizer(model)
    suggestions = standardizer.standardize(pep)
    if not suggestions:
        return pep

    min_confidence = 0.9

    # standard column -> (original attribute, confidence) of the best candidate so far.
    # Strict comparison keeps the first candidate on ties.
    best_match = {}
    for original, suggestion_dict in suggestions.items():
        for suggestion, value in suggestion_dict.items():
            if value <= min_confidence or suggestion not in standard_columns:
                continue
            current = best_match.get(suggestion)
            if current is None or current[1] < value:
                best_match[suggestion] = (original, value)

    if not best_match:
        # Nothing to rename: skip the dict round trip and hand back the input project.
        return pep

    raw_pep = pep.to_dict(extended=True)
    # NOTE(review): assumes the raw sample table is keyed by attribute (column) name,
    # as the original rename logic did -- confirm against peppy's `to_dict` orient default.
    samples = raw_pep[SAMPLE_RAW_DICT_KEY]
    for suggestion, (original_key, _) in best_match.items():
        # Never overwrite an existing standard column, and skip attributes that were
        # already renamed (and therefore removed) by an earlier iteration.
        if suggestion not in samples and original_key in samples:
            samples[suggestion] = samples.pop(original_key)

    return peppy.Project.from_dict(raw_pep)
3 changes: 2 additions & 1 deletion requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ peppy>=0.40.6
yacman>=0.8.4
requests>=2.28.2
piper>=v0.14.0
bbconf>=0.6.1
bbconf>=0.7.0
# bbconf @ git+https://github.com/databio/bbconf.git@dev#egg=bbconf
refgenconf>=0.12.2
pandas>=2.0.0
ubiquerg>=0.6.2
bedmess>=0.1.0
pephubclient>=0.4.4
geniml>=0.4.1
24 changes: 24 additions & 0 deletions test/test_standardizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from bedboss.utils import standardize_pep
import peppy
import pytest


# @pytest.mark.skip(reason="Not for automatic testing")
@pytest.mark.parametrize(
    "registry_path",
    [
        "bedbase/gse274894:samples",
        "bedbase/gse275349:samples",
        "bedbase/gse262920:samples",
        "bedbase/gse236101:samples",
        "bedbase/gse254365:samples",
    ],
)
@pytest.mark.parametrize(
    "model",
    ["BEDBASE", "ENCODE"],
)
def test_standardize_pep(registry_path, model):
    """
    Check that standardize_pep returns a peppy project for each registry path and model.

    The project is loaded with `peppy.Project.from_pephub`, so the test depends on
    the listed registry paths being reachable.
    """
    pep = peppy.Project.from_pephub(registry_path)
    standardized = standardize_pep(pep, model=model)
    # Replaces the former `assert True`, which could not fail on a wrong return value.
    assert isinstance(standardized, peppy.Project)

0 comments on commit 8c1c2d3

Please sign in to comment.