Skip to content

Commit

Permalink
Merge pull request #74 from databio/dev_ref_genome_validator
Browse files Browse the repository at this point in the history
Dev ref genome validator
  • Loading branch information
khoroshevskyi authored Sep 20, 2024
2 parents ed04ce3 + b58571f commit f40682d
Show file tree
Hide file tree
Showing 46 changed files with 9,906 additions and 60 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,8 @@ data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e27
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
test/Untitled.ipynb
/scripts/ref_genome_validating/genome_folder/
/scripts/ref_genome_validating/data/
/scripts/ref_genome_validating/results/
/scripts/ref_genome_validating/stats_results/results.yaml
/scripts/ref_genome_validating/stats_results/results_backup_11sep2024.yaml
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
repos:
# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.1.1
rev: 24.8.0
hooks:
- id: black
# It is recommended to specify the latest version of Python
Expand Down
5 changes: 4 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,8 @@ include bedboss/qdrant_index/*
include bedboss/bedbuncher/*
include bedboss/bedbuncher/tools/*
include bedboss/bedclassifier/*
include bedboss/refgenome_validator/*
include bedboss/tokens/*
include bedboss/bbuploader/*
include bedboss/tokens/*
include bedboss/bbuploader/*
include bedboss/refgenome_validator/chrom_sizes/*
2 changes: 1 addition & 1 deletion bedboss/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.1"
__version__ = "0.5.0"
46 changes: 4 additions & 42 deletions bedboss/bbuploader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def upload_all(

count = 0
for gse_pep in pep_annotation_list.results:

with Session(bbagent.config.db_engine.engine) as session:
_LOGGER.info(f"Processing: '{gse_pep.name}'")

Expand Down Expand Up @@ -308,6 +307,7 @@ def upload_gse(
sa_session=session,
gse_status_sa_model=gse_status,
standardize_pep=standardize_pep,
rerun=rerun,
)
except Exception as e:
_LOGGER.error(f"Processing of '{gse}' failed with error: {e}")
Expand Down Expand Up @@ -354,6 +354,7 @@ def _upload_gse(
sa_session: Session = None,
gse_status_sa_model: GeoGseStatus = None,
standardize_pep: bool = False,
rerun: bool = False,
) -> ProjectProcessingStatus:
"""
Upload bed files from GEO series to BedBase
Expand All @@ -366,6 +367,7 @@ def _upload_gse(
:param sa_session: opened session to the database
:param gse_status_sa_model: sqlalchemy model for project status
:param standardize_pep: standardize pep metadata using BEDMS
:param rerun: force overwrite data in the database
:return: None
"""
Expand All @@ -387,7 +389,6 @@ def _upload_gse(
gse_status_sa_model.number_of_files = len(project.samples)
sa_session.commit()
for project_sample in project.samples:

sample_gsm = project_sample.get("sample_geo_accession", "").lower()

required_metadata = process_pep_sample(
Expand Down Expand Up @@ -446,7 +447,7 @@ def _upload_gse(
upload_pephub=True,
upload_s3=True,
upload_qdrant=True,
force_overwrite=False,
force_overwrite=rerun,
)
uploaded_files.append(file_digest)
sample_status.status = STATUS.SUCCESS
Expand All @@ -460,7 +461,6 @@ def _upload_gse(
sa_session.commit()

if create_bedset and uploaded_files:

_LOGGER.info(f"Creating bedset for: '{gse}'")
run_bedbuncher(
bedbase_config=bedbase_config,
Expand All @@ -481,41 +481,3 @@ def _upload_gse(

_LOGGER.info(f"Processing of '{gse}' is finished with success!")
return project_status


#
# if __name__ == "__main__":
# # upload_gse(
# # # gse="gse246900",
# # # gse="gse247593",
# # # gse="gse241222",
# # #gse="gse266130",
# # gse="gse99178",
# # # gse="gse240325", # TODO: check if qc works
# # # gse="gse229592", # mice
# # bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
# # outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
# # # genome="HG38",
# # # rerun=True,
# # run_failed=True,
# # run_skipped=True,
# # )
# upload_all(
# bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
# outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
# start_date="2024/01/21",
# end_date="2024/08/28",
# search_limit=2,
# search_offset=0,
# genome="GRCh38",
# rerun=True,
# )
# # upload_all(
# # bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
# # outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
# # start_date="2024/01/01",
# # # end_date="2024/03/28",
# # search_limit=200,
# # search_offset=0,
# # genome="GRCh38",
# # )
4 changes: 2 additions & 2 deletions bedboss/bbuploader/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

class BedBossMetadata(BaseModel):
genome: str = Field(None, alias="ref_genome")
organism: Optional[str] = Field("", alias="sample_organism_ch1")
species_name: Optional[str] = Field("", alias="sample_organism_ch1")
species_id: Optional[str] = Field("", alias="sample_taxid_ch1")
cell_type: Optional[str] = ""
cell_line: Optional[str] = ""
genotype: Optional[str] = ""
exp_protocol: Optional[str] = Field("", alias="sample_library_strategy")
assay: Optional[str] = Field("", alias="sample_library_strategy")
library_source: Optional[str] = Field("", alias="sample_library_source")
target: Optional[str] = Field("")
antibody: Optional[str] = Field("", alias="chip_antibody")
Expand Down
14 changes: 12 additions & 2 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,17 @@
from pephubclient.helpers import MessageHandler as m
from pephubclient.helpers import is_registry_path

from bedboss._version import __version__
from bedboss.bedbuncher import run_bedbuncher
from bedboss.bedmaker.bedmaker import make_all
from bedboss.bedstat.bedstat import bedstat
from bedboss.const import BEDBOSS_PEP_SCHEMA_PATH, PKG_NAME
from bedboss.exceptions import BedBossException
from bedboss.models import (
BedClassificationUpload,
FilesUpload,
PlotsUpload,
StatsUpload,
)
from bedboss.refgenome_validator.main import ReferenceValidator

from bedboss.utils import (
standardize_genome_name,
Expand Down Expand Up @@ -62,6 +61,7 @@ def run_all(
rfg_config: str = None,
narrowpeak: bool = False,
check_qc: bool = True,
validate_reference: bool = True,
chrom_sizes: str = None,
open_signal_matrix: str = None,
ensdb: str = None,
Expand Down Expand Up @@ -91,6 +91,7 @@ def run_all(
:param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig
(transcription factor implies narrow, histone mark implies broad peaks) [optional]
:param bool check_qc: set True to run quality control during badmaking [optional] (default: True)
:param bool validate_reference: set True to run genome reference validator
:param str chrom_sizes: a full path to the chrom.sizes required for the bedtobigbed conversion [optional]
:param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
:param dict other_metadata: a dict containing all attributes from the sample
Expand Down Expand Up @@ -195,13 +196,22 @@ def run_all(
bed_format=bed_metadata.bed_format.value,
)

if validate_reference:
_LOGGER.info("Validating reference genome")
ref_valid_stats = ReferenceValidator().determine_compatibility(
bedfile=bed_metadata.bed_file, concise=True
)
else:
ref_valid_stats = None

bbagent.bed.add(
identifier=bed_metadata.bed_digest,
stats=stats.model_dump(exclude_unset=True),
metadata=other_metadata,
plots=plots.model_dump(exclude_unset=True),
files=files.model_dump(exclude_unset=True),
classification=classification.model_dump(exclude_unset=True),
ref_validation=ref_valid_stats,
license_id=license_id,
upload_qdrant=upload_qdrant,
upload_pephub=upload_pephub,
Expand Down
2 changes: 1 addition & 1 deletion bedboss/bedbuncher/tools/bedsetStat.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ if (is.null(opt$json)) {
#'
#' Calculates how many regionsets (bedfiles) overlap at least said percentage
#' of regions included in the universe. The universe is considered a union of
#' all regionsets (bedfiles) in the colection of
#' all regionsets (bedfiles) in the collection of
#' regionsets (bedset, or set of bedfiles)
#'
#' @param queryList GRangesList object with regionsets to be considered
Expand Down
1 change: 0 additions & 1 deletion bedboss/bedclassifier/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
from bedboss.bedclassifier.bedclassifier import get_bed_type
2 changes: 1 addition & 1 deletion bedboss/bedmaker/bedmaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from refgenconf.exceptions import MissingGenomeError
from ubiquerg import is_command_callable

from bedboss.bedclassifier import get_bed_type
from bedboss.bedclassifier.bedclassifier import get_bed_type
from bedboss.bedmaker.const import (
BED_TO_BIGBED_PROGRAM,
BEDGRAPH_TEMPLATE,
Expand Down
2 changes: 1 addition & 1 deletion bedboss/bedmaker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_chrom_sizes(genome: str, rfg_config: Union[str, Path]) -> str:
"""

_LOGGER.info("Determining path to chrom.sizes asset via Refgenie.")
# initalizing refginie config file
# initializing refginie config file
rgc = get_rgc(rfg_config=rfg_config)

try:
Expand Down
2 changes: 1 addition & 1 deletion bedboss/bedstat/tools/regionstat.R
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ doItAall <- function(query, fileId, genome, cellMatrix) {
)
}

# QThist plot
# QThis plot
if (exists("bedmeta")){
if ("mean_region_width" %in% names(bedmeta)){
run_plot = FALSE
Expand Down
2 changes: 1 addition & 1 deletion bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def convert_universe(
def check_requirements():
from bedboss.bedboss import requirements_check

print("Checking piplines requirements...")
print("Checking pipelines requirements...")
requirements_check()


Expand Down
12 changes: 12 additions & 0 deletions bedboss/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,15 @@ def __init__(self, reason: str = ""):
using Open Signal Matrix
"""
super(BedTypeException, self).__init__(reason)


class ValidatorException(BedBossException):
"""Exception when there is an exception during refgenome validation"""

def __init__(self, reason: str = ""):
"""
Optionally provide explanation for exceptional condition.
:param str reason: some context why error occurred
"""
super(BedTypeException, self).__init__(reason)
63 changes: 63 additions & 0 deletions bedboss/refgenome_validator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Reference Genome Predictor and Validator

-----
### Background
Original Lab Blog Post: https://lab.databio.org/docs/guide/Reference-genome-predictor-tool/


## Research Project Outline

#### Question
Original: Can we give a BED file as an input and predict the most likely reference genome assembly associated with the BED file?

Modified: Can we give a BED file as an input and then determine a level of compatibility for different reference genome assemblies?


#### High Level Execution

1. Ingest query bed file (without any annotations or metadata, for now).

2. Compare regions with chromosome lengths and chrom names.

- Assumption: if regions exist beyond defined ref genome chromosomes,the bed file is less likely to be associated with the ref genome.
- Use size files to get chrom lengths and names
- How many chrom names in BED file that are _not_ in the size file?

3. Compare regions in bed files with excluded ranges/black list regions for each of reference genome.

- Assumption: during bed file creation, excluded ranges were filtered out. Therefore, if a bed file has more regions in excluded ranges from hg19 vs less from hg38, then the bed file is more likely associated with hg38.
- Two Tiers:
1. Gaps/Centromeres/Telomeres
2. All other Excluded Ranges

4. Once the above are quantified, we can exclude some reference genomes altogether and give probability of compatibility with the remaining.


#### Detailed Execution Steps

1. Retrieve chrom size files for ref genome assemblies (can use seq collections API: https://refget.databio.org/).
2. Cache relevant BED files which contain excluded ranges using [BBClient](https://docs.bedbase.org/geniml/tutorials/bbclient/)
3. Build database for each refgenome assembly excluded ranges, gaps centromeres telomeres using [IGD](https://github.com/databio/gtars/tree/dev_igd) (Use rust implementation if finished, else use C++ implementation).
4. Query "unknown" BED File against chrom size files and the IGDs (using `igd search`).
5. Obtain overlap stats for each of the IGDs, rank them based on _least overlaps_.
6. Run on BED files whose ref genomes are _known_ and calculate accuracy of highest probability compatible ref genome.


#### Additional Notes
- Begin with human bed files and reference genomes for now.
- predicting between hg38 and hg19 should be the first (simple) task
- Prediction happens at a high level for mutually exclusive options,e.g. predicting hg38 vs hg19 as the coordinate systems are different
- this mutual exclusivity would also apply to different types of hg38 (UCSC, ensembl, ncbi).
- Validation occurs at the next level after basic prediction is done.
- different/levels of tiers based on cutoffs wrt specificity and sensitivity
- These tiers are based on a variety of parameters of increasing complexity:
- name matching of chromosomes
- overlaps (chr.sizes, excluded ranges)
- ML (BED embeddings)
- Bed annotations (text similarity)
- Future work could involve machine learning for ref genome prediction. However, we will begin with a simple classifier (which may be sufficient).
- Future work could add annotations/metadata for making the prediction.

#### Software Notes

- Create a validator class that can ingest a BED file as well as a genome_model object
Empty file.
Loading

0 comments on commit f40682d

Please sign in to comment.