Skip to content

Commit

Permalink
Merge pull request #75 from databio/dev_metadata_model
Browse files Browse the repository at this point in the history
Added new metadata model
  • Loading branch information
khoroshevskyi authored Sep 12, 2024
2 parents 341affe + 2e46d10 commit 0d28e99
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 10 deletions.
15 changes: 14 additions & 1 deletion bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,15 @@
PlotsUpload,
StatsUpload,
)
from bedboss.utils import get_genome_digest, standardize_genome_name

from bedboss.utils import (
standardize_genome_name,
get_genome_digest,
standardize_pep as pep_standardizer,
)
from bedboss.exceptions import BedBossException
from bedboss._version import __version__


_LOGGER = logging.getLogger(PKG_NAME)

Expand Down Expand Up @@ -235,6 +243,7 @@ def insert_pep(
upload_pephub: bool = False,
upload_qdrant: bool = False,
no_fail: bool = False,
standardize_pep: bool = False,
pm: pypiper.PipelineManager = None,
) -> None:
"""
Expand All @@ -260,6 +269,7 @@ def insert_pep(
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param bool upload_qdrant: whether to execute qdrant indexing
:param bool no_fail: whether to raise an error if bedset was not added to the database
:param bool standardize_pep: whether to standardize the pep file before processing by using bedms. (default: False)
:param pypiper.PipelineManager pm: pypiper object
:return: None
"""
Expand All @@ -276,6 +286,9 @@ def insert_pep(
else:
raise BedBossException("Incorrect pep type. Exiting...")

if standardize_pep:
pep = pep_standardizer(pep)

bbagent = BedBaseAgent(bedbase_config)

validate_project(pep, BEDBOSS_PEP_SCHEMA_PATH)
Expand Down
2 changes: 2 additions & 0 deletions bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ def run_pep(
upload_pephub: bool = typer.Option(False, help="Upload to PEPHub"),
no_fail: bool = typer.Option(False, help="Do not fail on error"),
license_id: str = typer.Option(DEFAULT_LICENSE, help="License ID"),
standardize_pep: bool = typer.Option(False, help="Standardize the PEP using bedMS"),
# PipelineManager
multi: bool = typer.Option(False, help="Run multiple samples"),
recover: bool = typer.Option(True, help="Recover from previous run"),
Expand Down Expand Up @@ -192,6 +193,7 @@ def run_pep(
upload_pephub=upload_pephub,
upload_qdrant=upload_qdrant,
no_fail=no_fail,
standardize_pep=standardize_pep,
pm=create_pm(
outfolder=outfolder,
multi=multi,
Expand Down
37 changes: 28 additions & 9 deletions bedboss/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,23 +23,42 @@ class FILE_TYPE(str, Enum):
class BedMetadata(BaseModel):
sample_name: str
genome: str
organism: str = ""

species_name: str = Field(
default="", description="Name of species. e.g. Homo sapiens.", alias="organism"
)
species_id: str = ""
cell_type: str = ""
cell_line: str = ""
exp_protocol: str = Field("", description="Experimental protocol (e.g. ChIP-seq)")
genotype: str = Field("", description="Genotype of the sample")
phenotype: str = Field("", description="Phenotype of the sample")

cell_type: str = Field(
"",
description="specific kind of cell with distinct characteristics found in an organism. e.g. Neurons, Hepatocytes, Adipocytes",
)
cell_line: str = Field(
"",
description="population of cells derived from a single cell and cultured in the lab for extended use, e.g. HeLa, HepG2, k562",
)
tissue: str = Field("", description="Tissue type")

library_source: str = Field(
"", description="Library source (e.g. genomic, transcriptomic)"
)
genotype: str = Field("", description="Genotype of the sample")
target: str = Field("", description="Target of the assay (e.g. H3K4me3)")
assay: str = Field(
"", description="Experimental protocol (e.g. ChIP-seq)", alias="exp_protocol"
)
antibody: str = Field("", description="Antibody used in the assay")
target: str = Field("", description="Target of the assay (e.g. H3K4me3)")
treatment: str = Field(
"", description="Treatment of the sample (e.g. drug treatment)"
)
tissue: str = Field("", description="Tissue type")
global_sample_id: str = Field("", description="Global sample identifier")
global_experiment_id: str = Field("", description="Global experiment identifier")

global_sample_id: str = Field(
"", description="Global sample identifier. e.g. GSM000"
) # excluded in training
global_experiment_id: str = Field(
"", description="Global experiment identifier. e.g. GSE000"
) # excluded in training
description: str = Field("", description="Description of the sample")

model_config = ConfigDict(
Expand Down
46 changes: 46 additions & 0 deletions bedboss/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

import requests
from pephubclient.files_manager import FilesManager
import peppy
from peppy.const import SAMPLE_RAW_DICT_KEY
from attribute_standardizer import AttrStandardizer

_LOGGER = logging.getLogger("bedboss")

Expand Down Expand Up @@ -119,3 +122,46 @@ def save_example_bedbase_config(path: str) -> None:
file_path = os.path.abspath(os.path.join(path, "bedbase_config.yaml"))
FilesManager.save_yaml(example_bedbase_config(), file_path)
_LOGGER.info(f"Example BedBase configuration saved to: {file_path}")


def standardize_pep(
    pep: peppy.Project, standard_columns: list = None, model: str = "BEDBASE"
) -> peppy.Project:
    """
    Standardize PEP sample attribute names by using a bedMS standardization model.

    For every standard column, the original attribute with the highest
    suggestion score (strictly above 0.9) is chosen and renamed to the
    standard column name. A rename is skipped when the standard column is
    already present in the raw sample table, or when the original attribute
    is not present in it.

    :param pep: peppy project
    :param standard_columns: list of columns to standardize
        (default: ["library_source", "assay", "genome", "species_name"])
    :param model: name of the bedMS model handed to AttrStandardizer
        (default: "BEDBASE")
    :return: peppy project. The input object itself is returned when the
        standardizer yields no suggestions (None); otherwise a new project
        is built from the modified project dictionary.
    """
    if standard_columns is None:
        standard_columns = ["library_source", "assay", "genome", "species_name"]

    # Honour the requested model. The argument must not be shadowed by the
    # standardizer instance, otherwise the caller's choice is silently lost.
    standardizer = AttrStandardizer(model)
    suggestions = standardizer.standardize(pep)
    if suggestions is None:
        return pep

    # standard column -> (original attribute, score) of the best candidate
    best_matches = {}
    for original, suggestion_dict in suggestions.items():
        for suggestion, value in suggestion_dict.items():
            if value > 0.9 and suggestion in standard_columns:
                current = best_matches.get(suggestion)
                # Strict comparison: on equal scores the first candidate wins.
                if current is None or current[1] < value:
                    best_matches[suggestion] = (original, value)

    raw_pep = pep.to_dict(extended=True)
    # NOTE(review): assumes the raw sample table is a dict keyed by
    # attribute (column) name -- confirm against peppy's to_dict output.
    raw_samples = raw_pep[SAMPLE_RAW_DICT_KEY]
    for suggestion, (original_key, _score) in best_matches.items():
        if suggestion not in raw_samples and original_key in raw_samples:
            # Move the column under its standardized name.
            raw_samples[suggestion] = raw_samples[original_key]
            del raw_samples[original_key]

    return peppy.Project.from_dict(raw_pep)
1 change: 1 addition & 0 deletions requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ bbconf>=0.6.1
refgenconf>=0.12.2
pandas>=2.0.0
ubiquerg>=0.6.2
bedmess>=0.1.0
pephubclient>=0.4.4
geniml>=0.4.0
24 changes: 24 additions & 0 deletions test/test_standardizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from bedboss.utils import standardize_pep
import peppy
import pytest


# @pytest.mark.skip(reason="Not for automatic testing")
@pytest.mark.parametrize(
    "registry_path",
    [
        "bedbase/gse274894:samples",
        "bedbase/gse275349:samples",
        "bedbase/gse262920:samples",
        "bedbase/gse236101:samples",
        "bedbase/gse254365:samples",
    ],
)
@pytest.mark.parametrize(
    "model",
    ["BEDBASE", "ENCODE"],
)
def test_standardize_pep(registry_path, model):
    """
    Run standardize_pep on a project fetched from PEPhub for each model.

    Requires network access: the project is loaded with
    peppy.Project.from_pephub.
    """
    pep = peppy.Project.from_pephub(registry_path)
    standardized = standardize_pep(pep, model=model)
    # Check the returned object instead of a vacuous `assert True`:
    # standardization must hand back a usable peppy project.
    assert isinstance(standardized, peppy.Project)

0 comments on commit 0d28e99

Please sign in to comment.