Merge pull request #74 from databio/dev_ref_genome_validator

Dev ref genome validator
databio · Sep 20, 2024 · f40682d · f40682d
2 parents ed04ce3 + b58571f
commit f40682d
Show file tree

Hide file tree

Showing 46 changed files with 9,906 additions and 60 deletions.
diff --git a/.gitignore b/.gitignore
@@ -151,3 +151,8 @@ data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e27
 data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
 data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
 test/Untitled.ipynb
+/scripts/ref_genome_validating/genome_folder/
+/scripts/ref_genome_validating/data/
+/scripts/ref_genome_validating/results/
+/scripts/ref_genome_validating/stats_results/results.yaml
+/scripts/ref_genome_validating/stats_results/results_backup_11sep2024.yaml
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 repos:
   # Using this mirror lets us use mypyc-compiled black, which is about 2x faster
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.1.1
+    rev: 24.8.0
     hooks:
       - id: black
         # It is recommended to specify the latest version of Python

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -9,5 +9,8 @@ include bedboss/qdrant_index/*
 include bedboss/bedbuncher/*
 include bedboss/bedbuncher/tools/*
 include bedboss/bedclassifier/*
+include bedboss/refgenome_validator/*
 include bedboss/tokens/*
-include bedboss/bbuploader/*
+include bedboss/tokens/*
+include bedboss/bbuploader/*
+include bedboss/refgenome_validator/chrom_sizes/*
diff --git a/bedboss/_version.py b/bedboss/_version.py
@@ -1 +1 @@
-__version__ = "0.4.1"
+__version__ = "0.5.0"
diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py
@@ -79,7 +79,6 @@ def upload_all(
 
     count = 0
     for gse_pep in pep_annotation_list.results:
-
         with Session(bbagent.config.db_engine.engine) as session:
             _LOGGER.info(f"Processing: '{gse_pep.name}'")
 
@@ -308,6 +307,7 @@ def upload_gse(
                 sa_session=session,
                 gse_status_sa_model=gse_status,
                 standardize_pep=standardize_pep,
+                rerun=rerun,
             )
         except Exception as e:
             _LOGGER.error(f"Processing of '{gse}' failed with error: {e}")
@@ -354,6 +354,7 @@ def _upload_gse(
     sa_session: Session = None,
     gse_status_sa_model: GeoGseStatus = None,
     standardize_pep: bool = False,
+    rerun: bool = False,
 ) -> ProjectProcessingStatus:
     """
     Upload bed files from GEO series to BedBase
@@ -366,6 +367,7 @@ def _upload_gse(
     :param sa_session: opened session to the database
     :param gse_status_sa_model: sqlalchemy model for project status
     :param standardize_pep: standardize pep metadata using BEDMS
+    :param rerun: force overwrite data in the database
 
     :return: None
     """
@@ -387,7 +389,6 @@ def _upload_gse(
     gse_status_sa_model.number_of_files = len(project.samples)
     sa_session.commit()
     for project_sample in project.samples:
-
         sample_gsm = project_sample.get("sample_geo_accession", "").lower()
 
         required_metadata = process_pep_sample(
@@ -446,7 +447,7 @@ def _upload_gse(
                 upload_pephub=True,
                 upload_s3=True,
                 upload_qdrant=True,
-                force_overwrite=False,
+                force_overwrite=rerun,
             )
             uploaded_files.append(file_digest)
             sample_status.status = STATUS.SUCCESS
@@ -460,7 +461,6 @@ def _upload_gse(
         sa_session.commit()
 
     if create_bedset and uploaded_files:
-
         _LOGGER.info(f"Creating bedset for: '{gse}'")
         run_bedbuncher(
             bedbase_config=bedbase_config,
@@ -481,41 +481,3 @@ def _upload_gse(
 
     _LOGGER.info(f"Processing of '{gse}' is finished with success!")
     return project_status
-
-
-#
-# if __name__ == "__main__":
-#     # upload_gse(
-#     #     # gse="gse246900",
-#     #     # gse="gse247593",
-#     #     # gse="gse241222",
-#     #     #gse="gse266130",
-#     #     gse="gse99178",
-#     #     # gse="gse240325", # TODO: check if qc works
-#     #     # gse="gse229592", # mice
-#     #     bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
-#     #     outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
-#     #     # genome="HG38",
-#     #     # rerun=True,
-#     #     run_failed=True,
-#     #     run_skipped=True,
-#     # )
-#     upload_all(
-#         bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
-#         outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
-#         start_date="2024/01/21",
-#         end_date="2024/08/28",
-#         search_limit=2,
-#         search_offset=0,
-#         genome="GRCh38",
-#         rerun=True,
-#     )
-# # upload_all(
-# #     bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
-# #     outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
-# #     start_date="2024/01/01",
-# #     # end_date="2024/03/28",
-# #     search_limit=200,
-# #     search_offset=0,
-# #     genome="GRCh38",
-# # )
diff --git a/bedboss/bbuploader/models.py b/bedboss/bbuploader/models.py
@@ -5,12 +5,12 @@
 
 class BedBossMetadata(BaseModel):
     genome: str = Field(None, alias="ref_genome")
-    organism: Optional[str] = Field("", alias="sample_organism_ch1")
+    species_name: Optional[str] = Field("", alias="sample_organism_ch1")
     species_id: Optional[str] = Field("", alias="sample_taxid_ch1")
     cell_type: Optional[str] = ""
     cell_line: Optional[str] = ""
     genotype: Optional[str] = ""
-    exp_protocol: Optional[str] = Field("", alias="sample_library_strategy")
+    assay: Optional[str] = Field("", alias="sample_library_strategy")
     library_source: Optional[str] = Field("", alias="sample_library_source")
     target: Optional[str] = Field("")
     antibody: Optional[str] = Field("", alias="chip_antibody")

diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py
@@ -14,18 +14,17 @@
 from pephubclient.helpers import MessageHandler as m
 from pephubclient.helpers import is_registry_path
 
-from bedboss._version import __version__
 from bedboss.bedbuncher import run_bedbuncher
 from bedboss.bedmaker.bedmaker import make_all
 from bedboss.bedstat.bedstat import bedstat
 from bedboss.const import BEDBOSS_PEP_SCHEMA_PATH, PKG_NAME
-from bedboss.exceptions import BedBossException
 from bedboss.models import (
     BedClassificationUpload,
     FilesUpload,
     PlotsUpload,
     StatsUpload,
 )
+from bedboss.refgenome_validator.main import ReferenceValidator
 
 from bedboss.utils import (
     standardize_genome_name,
@@ -62,6 +61,7 @@ def run_all(
     rfg_config: str = None,
     narrowpeak: bool = False,
     check_qc: bool = True,
+    validate_reference: bool = True,
     chrom_sizes: str = None,
     open_signal_matrix: str = None,
     ensdb: str = None,
@@ -91,6 +91,7 @@ def run_all(
     :param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig
         (transcription factor implies narrow, histone mark implies broad peaks) [optional]
     :param bool check_qc: set True to run quality control during badmaking [optional] (default: True)
+    :param bool validate_reference: set True to run genome reference validator
     :param str chrom_sizes: a full path to the chrom.sizes required for the bedtobigbed conversion [optional]
     :param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
     :param dict other_metadata: a dict containing all attributes from the sample
@@ -195,13 +196,22 @@ def run_all(
         bed_format=bed_metadata.bed_format.value,
     )
 
+    if validate_reference:
+        _LOGGER.info("Validating reference genome")
+        ref_valid_stats = ReferenceValidator().determine_compatibility(
+            bedfile=bed_metadata.bed_file, concise=True
+        )
+    else:
+        ref_valid_stats = None
+
     bbagent.bed.add(
         identifier=bed_metadata.bed_digest,
         stats=stats.model_dump(exclude_unset=True),
         metadata=other_metadata,
         plots=plots.model_dump(exclude_unset=True),
         files=files.model_dump(exclude_unset=True),
         classification=classification.model_dump(exclude_unset=True),
+        ref_validation=ref_valid_stats,
         license_id=license_id,
         upload_qdrant=upload_qdrant,
         upload_pephub=upload_pephub,

diff --git a/bedboss/bedbuncher/tools/bedsetStat.R b/bedboss/bedbuncher/tools/bedsetStat.R
@@ -67,7 +67,7 @@ if (is.null(opt$json)) {
 #'
 #' Calculates how many regionsets (bedfiles) overlap at least said percentage 
 #' of regions included in the universe. The universe is considered a union of 
-#' all regionsets (bedfiles) in the colection of 
+#' all regionsets (bedfiles) in the collection of 
 #' regionsets (bedset, or set of bedfiles)
 #'
 #' @param queryList GRangesList object with regionsets to be considered

diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py
@@ -1 +0,0 @@
-from bedboss.bedclassifier.bedclassifier import get_bed_type

diff --git a/bedboss/bedmaker/bedmaker.py b/bedboss/bedmaker/bedmaker.py
@@ -12,7 +12,7 @@
 from refgenconf.exceptions import MissingGenomeError
 from ubiquerg import is_command_callable
 
-from bedboss.bedclassifier import get_bed_type
+from bedboss.bedclassifier.bedclassifier import get_bed_type
 from bedboss.bedmaker.const import (
     BED_TO_BIGBED_PROGRAM,
     BEDGRAPH_TEMPLATE,

diff --git a/bedboss/bedmaker/utils.py b/bedboss/bedmaker/utils.py
@@ -22,7 +22,7 @@ def get_chrom_sizes(genome: str, rfg_config: Union[str, Path]) -> str:
     """
 
     _LOGGER.info("Determining path to chrom.sizes asset via Refgenie.")
-    # initalizing refginie config file
+    # initializing refginie config file
     rgc = get_rgc(rfg_config=rfg_config)
 
     try:

diff --git a/bedboss/bedstat/tools/regionstat.R b/bedboss/bedstat/tools/regionstat.R
@@ -314,7 +314,7 @@ doItAall <- function(query, fileId, genome, cellMatrix) {
     ) 
   }
 
-  # QThist plot
+  # QThis plot
   if (exists("bedmeta")){
     if ("mean_region_width" %in% names(bedmeta)){
       run_plot = FALSE

diff --git a/bedboss/cli.py b/bedboss/cli.py
@@ -554,7 +554,7 @@ def convert_universe(
 def check_requirements():
     from bedboss.bedboss import requirements_check
 
-    print("Checking piplines requirements...")
+    print("Checking pipelines requirements...")
     requirements_check()
 
 

diff --git a/bedboss/exceptions.py b/bedboss/exceptions.py
@@ -60,3 +60,15 @@ def __init__(self, reason: str = ""):
         using Open Signal Matrix
         """
         super(BedTypeException, self).__init__(reason)
+
+
+class ValidatorException(BedBossException):
+    """Exception when there is an exception during refgenome validation"""
+
+    def __init__(self, reason: str = ""):
+        """
+        Optionally provide explanation for exceptional condition.
+
+        :param str reason: some context why error occurred
+        """
+        super(BedTypeException, self).__init__(reason)
diff --git a/bedboss/refgenome_validator/README.md b/bedboss/refgenome_validator/README.md
@@ -0,0 +1,63 @@
+# Reference Genome Predictor and Validator
+
+-----
+### Background
+Original Lab Blog Post: https://lab.databio.org/docs/guide/Reference-genome-predictor-tool/
+
+
+## Research Project Outline
+
+#### Question
+Original: Can we give a BED file as an input and predict the most likely reference genome assembly associated with the BED file?
+
+Modified: Can we give a BED file as an input and then determine a level of compatibility for different reference genome assemblies?
+
+
+#### High Level Execution 
+
+1. Ingest query bed file (without any annotations or metadata, for now).
+
+2. Compare regions with chromosome lengths and chrom names.
+
+   - Assumption: if regions exist beyond defined ref genome chromosomes,the bed file is less likely to be associated with the ref genome.
+   - Use size files to get chrom lengths and names
+   - How many chrom names in BED file that are _not_ in the size file?
+
+3. Compare regions in bed files with excluded ranges/black list regions for each of reference genome.
+
+   - Assumption: during bed file creation, excluded ranges were filtered out. Therefore, if a bed file has more regions in excluded ranges from hg19 vs less from hg38, then the bed file is more likely associated with hg38.
+   - Two Tiers: 
+     1. Gaps/Centromeres/Telomeres
+     2. All other Excluded Ranges
+
+4. Once the above are quantified, we can exclude some reference genomes altogether and give probability of compatibility with the remaining.
+
+
+#### Detailed Execution Steps
+
+1. Retrieve chrom size files for ref genome assemblies (can use seq collections API: https://refget.databio.org/).
+2. Cache relevant BED files which contain excluded ranges using [BBClient](https://docs.bedbase.org/geniml/tutorials/bbclient/)
+3. Build database for each refgenome assembly excluded ranges, gaps centromeres telomeres using [IGD](https://github.com/databio/gtars/tree/dev_igd) (Use rust implementation if finished, else use C++ implementation).
+4. Query "unknown" BED File against chrom size files and the IGDs (using `igd search`).
+5. Obtain overlap stats for each of the IGDs, rank them based on _least overlaps_.
+6. Run on BED files whose ref genomes are _known_ and calculate accuracy of highest probability compatible ref genome.
+
+
+#### Additional Notes
+- Begin with human bed files and reference genomes for now.
+  - predicting between hg38 and hg19 should be the first (simple) task
+- Prediction happens at a high level for mutually exclusive options,e.g. predicting hg38 vs hg19 as the coordinate systems are different
+  - this mutual exclusivity would also apply to different types of hg38 (UCSC, ensembl, ncbi).
+- Validation occurs at the next level after basic prediction is done.
+  - different/levels of tiers based on cutoffs wrt specificity and sensitivity
+  - These tiers are based on a variety of parameters of increasing complexity:
+    - name matching of chromosomes
+    - overlaps (chr.sizes, excluded ranges)
+    - ML (BED embeddings)
+    - Bed annotations (text similarity)
+- Future work could involve machine learning for ref genome prediction. However, we will begin with a simple classifier (which may be sufficient).
+- Future work could add annotations/metadata for making the prediction.
+
+#### Software Notes
+
+- Create a validator class that can ingest a BED file as well as a genome_model object
diff --git a/bedboss/refgenome_validator/__init__.py b/bedboss/refgenome_validator/__init__.py
Original file line number	Diff line number	Diff line change
		@@ -1 +0,0 @@
		from bedboss.bedclassifier.bedclassifier import get_bed_type