From ebc07dd64afb3de78ee3b5660c0b5a20db37e194 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 19 Sep 2024 14:58:12 -0400 Subject: [PATCH] Added ref validator to main bbpipeline and cleaning --- MANIFEST.in | 1 + bedboss/bbuploader/main.py | 5 ++- bedboss/bedboss.py | 14 ++++++-- bedboss/bedclassifier/__init__.py | 1 - bedboss/refgenome_validator/__init__.py | 2 -- .../chrom_sizes/ucsc_mm9.chrom.sizes | 35 +++++++++++++++++++ ...o6.chrom.sizes => ucsc_panTo6.chrom.sizes} | 0 bedboss/refgenome_validator/genome_model.py | 7 +--- bedboss/refgenome_validator/main.py | 5 +-- bedboss/utils.py | 4 +-- requirements/requirements-all.txt | 2 +- .../ref_genome_validating/grab_chrom_sizes.py | 1 - .../ref_genome_validating/pull1000bedfiles.py | 2 +- .../stats_compat_testing.py | 3 -- test/test_ref_validator.py | 26 +++++++++++++- 15 files changed, 85 insertions(+), 23 deletions(-) create mode 100644 bedboss/refgenome_validator/chrom_sizes/ucsc_mm9.chrom.sizes rename bedboss/refgenome_validator/chrom_sizes/{ucsc_panTro6.chrom.sizes => ucsc_panTo6.chrom.sizes} (100%) diff --git a/MANIFEST.in b/MANIFEST.in index f2c6358..91bf8af 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -13,3 +13,4 @@ include bedboss/refgenome_validator/* include bedboss/tokens/* include bedboss/tokens/* include bedboss/bbuploader/* +include bedboss/refgenome_validator/chrom_sizes/* diff --git a/bedboss/bbuploader/main.py b/bedboss/bbuploader/main.py index 9f3797e..ac0912c 100644 --- a/bedboss/bbuploader/main.py +++ b/bedboss/bbuploader/main.py @@ -307,6 +307,7 @@ def upload_gse( sa_session=session, gse_status_sa_model=gse_status, standardize_pep=standardize_pep, + rerun=rerun, ) except Exception as e: _LOGGER.error(f"Processing of '{gse}' failed with error: {e}") @@ -353,6 +354,7 @@ def _upload_gse( sa_session: Session = None, gse_status_sa_model: GeoGseStatus = None, standardize_pep: bool = False, + rerun: bool = False, ) -> ProjectProcessingStatus: """ Upload bed files from GEO series to BedBase @@ -365,6 +367,7 @@ def _upload_gse( :param sa_session: opened session to the database :param gse_status_sa_model: sqlalchemy model for project status :param standardize_pep: standardize pep metadata using BEDMS + :param rerun: force overwrite data in the database :return: None """ @@ -444,7 +447,7 @@ def _upload_gse( upload_pephub=True, upload_s3=True, upload_qdrant=True, - force_overwrite=False, + force_overwrite=rerun, ) uploaded_files.append(file_digest) sample_status.status = STATUS.SUCCESS diff --git a/bedboss/bedboss.py b/bedboss/bedboss.py index 50581e7..3320dd4 100644 --- a/bedboss/bedboss.py +++ b/bedboss/bedboss.py @@ -14,18 +14,17 @@ from pephubclient.helpers import MessageHandler as m from pephubclient.helpers import is_registry_path -from bedboss._version import __version__ from bedboss.bedbuncher import run_bedbuncher from bedboss.bedmaker.bedmaker import make_all from bedboss.bedstat.bedstat import bedstat from bedboss.const import BEDBOSS_PEP_SCHEMA_PATH, PKG_NAME -from bedboss.exceptions import BedBossException from bedboss.models import ( BedClassificationUpload, FilesUpload, PlotsUpload, StatsUpload, ) +from bedboss.refgenome_validator.main import ReferenceValidator from bedboss.utils import ( standardize_genome_name, @@ -62,6 +61,7 @@ def run_all( rfg_config: str = None, narrowpeak: bool = False, check_qc: bool = True, + validate_reference: bool = True, chrom_sizes: str = None, open_signal_matrix: str = None, ensdb: str = None, @@ -91,6 +91,7 @@ def run_all( :param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig (transcription factor implies narrow, histone mark implies broad peaks) [optional] :param bool check_qc: set True to run quality control during badmaking [optional] (default: True) + :param bool validate_reference: set True to run genome reference validator :param str chrom_sizes: a full path to the chrom.sizes required for the bedtobigbed conversion [optional] :param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional] :param dict other_metadata: a dict containing all attributes from the sample @@ -195,6 +196,14 @@ def run_all( bed_format=bed_metadata.bed_format.value, ) + if validate_reference: + _LOGGER.info("Validating reference genome") + ref_valid_stats = ReferenceValidator().determine_compatibility( + bedfile=bed_metadata.bed_file, concise=True + ) + else: + ref_valid_stats = None + bbagent.bed.add( identifier=bed_metadata.bed_digest, stats=stats.model_dump(exclude_unset=True), @@ -202,6 +211,7 @@ def run_all( plots=plots.model_dump(exclude_unset=True), files=files.model_dump(exclude_unset=True), classification=classification.model_dump(exclude_unset=True), + ref_validation=ref_valid_stats, license_id=license_id, upload_qdrant=upload_qdrant, upload_pephub=upload_pephub, diff --git a/bedboss/bedclassifier/__init__.py b/bedboss/bedclassifier/__init__.py index 5dcc56d..e69de29 100644 --- a/bedboss/bedclassifier/__init__.py +++ b/bedboss/bedclassifier/__init__.py @@ -1 +0,0 @@ -from bedboss.bedclassifier.bedclassifier import get_bed_type diff --git a/bedboss/refgenome_validator/__init__.py b/bedboss/refgenome_validator/__init__.py index 12d8d5c..e69de29 100644 --- a/bedboss/refgenome_validator/__init__.py +++ b/bedboss/refgenome_validator/__init__.py @@ -1,2 +0,0 @@ -from bedboss.refgenome_validator.genome_model import GenomeModel -from bedboss.refgenome_validator.main import ReferenceValidator diff --git a/bedboss/refgenome_validator/chrom_sizes/ucsc_mm9.chrom.sizes b/bedboss/refgenome_validator/chrom_sizes/ucsc_mm9.chrom.sizes new file mode 100644 index 0000000..877c62d --- /dev/null +++ b/bedboss/refgenome_validator/chrom_sizes/ucsc_mm9.chrom.sizes @@ -0,0 +1,35 @@ +chr1 197195432 +chr2 181748087 +chr3 159599783 +chr4 155630120 +chr5 152537259 +chr6 149517037 +chr7 152524553 +chr8 131738871 +chr9 124076172 +chr10 129993255 +chr11 121843856 +chr12 121257530 +chr13 120284312 +chr14 125194864 +chr15 103494974 +chr16 98319150 +chr17 95272651 +chr18 90772031 +chr19 61342430 +chrX 166650296 +chrY 15902555 +chrM 16299 +chr13_random 400311 +chr16_random 3994 +chr17_random 628739 +chr1_random 1231697 +chr3_random 41899 +chr4_random 160594 +chr5_random 357350 +chr7_random 362490 +chr8_random 849593 +chr9_random 449403 +chrUn_random 5900358 +chrX_random 1785075 +chrY_random 58682461 diff --git a/bedboss/refgenome_validator/chrom_sizes/ucsc_panTro6.chrom.sizes b/bedboss/refgenome_validator/chrom_sizes/ucsc_panTo6.chrom.sizes similarity index 100% rename from bedboss/refgenome_validator/chrom_sizes/ucsc_panTro6.chrom.sizes rename to bedboss/refgenome_validator/chrom_sizes/ucsc_panTo6.chrom.sizes diff --git a/bedboss/refgenome_validator/genome_model.py b/bedboss/refgenome_validator/genome_model.py index 4758f80..c415756 100644 --- a/bedboss/refgenome_validator/genome_model.py +++ b/bedboss/refgenome_validator/genome_model.py @@ -1,8 +1,3 @@ -from typing import Optional, List - -import refgenconf - - class GenomeModel: """ Initialize genome model @@ -72,4 +67,4 @@ def filter_excluded_ranges(self, bed_list, igd_hit_matrix): # We will probably have a singular .igd database that we will simply compare the bed file to, so this should probably # just filter results in a way to determine if there were any hits/not hits for this particular genome - raise NotImplemented() + raise NotImplementedError() diff --git a/bedboss/refgenome_validator/main.py b/bedboss/refgenome_validator/main.py index 35d9ce1..756351b 100644 --- a/bedboss/refgenome_validator/main.py +++ b/bedboss/refgenome_validator/main.py @@ -194,7 +194,7 @@ def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, Non IGD_LOCATION = os.environ["IGD_LOCATION"] except: # Local installation of C version of IGD - IGD_LOCATION = f"/home/drc/GITHUB/igd/IGD/bin/igd" + IGD_LOCATION = "/home/drc/GITHUB/igd/IGD/bin/igd" # Construct an IGD command to run as subprocess igd_command = IGD_LOCATION + f" search {self.igd_path} -q {bedfile}" @@ -233,6 +233,8 @@ def determine_compatibility( :return: a dict with CompatibilityStats, or CompatibilityConcise model (depends if concise is set to True) """ + _LOGGER.info(f"Calculating reference genome stats for {bedfile}...") + if ref_filter: # Filter out unwanted reference genomes to assess for genome_model in self.genome_models: @@ -269,7 +271,6 @@ def determine_compatibility( model_compat_stats[genome_model.genome_alias].compatibility = ( self.calculate_rating(model_compat_stats[genome_model.genome_alias]) ) - if concise: concise_dict = {} for name, stats in model_compat_stats.items(): diff --git a/bedboss/utils.py b/bedboss/utils.py index feea63a..f237392 100644 --- a/bedboss/utils.py +++ b/bedboss/utils.py @@ -6,7 +6,7 @@ from pephubclient.files_manager import FilesManager import peppy from peppy.const import SAMPLE_RAW_DICT_KEY -from attribute_standardizer import AttrStandardizer +from bedms import AttrStandardizer _LOGGER = logging.getLogger("bedboss") @@ -137,7 +137,7 @@ def standardize_pep( """ if standard_columns is None: standard_columns = ["library_source", "assay", "genome", "species_name"] - model = AttrStandardizer("BEDBASE") + model = AttrStandardizer(model) suggestions = model.standardize(pep) changes = {} diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index f13255c..266f16e 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -9,6 +9,6 @@ bbconf>=0.7.0 refgenconf>=0.12.2 pandas>=2.0.0 ubiquerg>=0.6.2 -bedmess>=0.1.0 +bedms>=0.1.0 pephubclient>=0.4.4 geniml>=0.4.1 \ No newline at end of file diff --git a/scripts/ref_genome_validating/grab_chrom_sizes.py b/scripts/ref_genome_validating/grab_chrom_sizes.py index 3440d7f..46914ff 100644 --- a/scripts/ref_genome_validating/grab_chrom_sizes.py +++ b/scripts/ref_genome_validating/grab_chrom_sizes.py @@ -1,7 +1,6 @@ # This script exists because some chrom.sizes files have been hard to track down. # However, one can extract their own from the sequence files. Requires having downloaded a reference genome locally -import os from Bio import SeqIO diff --git a/scripts/ref_genome_validating/pull1000bedfiles.py b/scripts/ref_genome_validating/pull1000bedfiles.py index 82c0702..8e697fb 100644 --- a/scripts/ref_genome_validating/pull1000bedfiles.py +++ b/scripts/ref_genome_validating/pull1000bedfiles.py @@ -47,7 +47,7 @@ def pull_1000_bedfiles(filter, data_output_path): try: os.makedirs(data_output_path, exist_ok=True) except OSError: - print(f"Directory already exists, skipping...") + print("Directory already exists, skipping...") print(filter) print(data_output_path) diff --git a/scripts/ref_genome_validating/stats_compat_testing.py b/scripts/ref_genome_validating/stats_compat_testing.py index 1345a26..1e1fabc 100644 --- a/scripts/ref_genome_validating/stats_compat_testing.py +++ b/scripts/ref_genome_validating/stats_compat_testing.py @@ -2,14 +2,11 @@ # PEP : donaldcampbelljr/refgenome_compat_testing:default import copy import os -import numpy as np -import requests import pephubclient import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import matplotlib.colors as mcolors -from scipy.cluster.hierarchy import linkage try: PEP_URL = os.environ["PEP_URL"] diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py index bc5bd4f..557a35c 100644 --- a/test/test_ref_validator.py +++ b/test/test_ref_validator.py @@ -9,9 +9,33 @@ def test_main(): + # dict_result = ReferenceValidator().determine_compatibility( + # FILE_PATH, + # concise=True, + # ) dict_result = ReferenceValidator().determine_compatibility( - FILE_PATH, + "/home/bnt4me/.bbcache/bedfiles/0/7/0740332b148a613342603e2e483f53e5.bed.gz", concise=True, ) assert dict_result + + +def test_another_test(): + from bedboss.bbuploader.main import upload_gse + + upload_gse( + # gse="gse246900", + # gse="gse247593", + # gse="gse241222", + # gse="gse266130", + gse="gse99178", + # gse="gse240325", # TODO: check if qc works + # gse="gse229592", # mice + bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml", + outfolder="/home/bnt4me/virginia/repos/bbuploader/data", + # genome="HG38", + rerun=True, + run_failed=True, + run_skipped=True, + )