Skip to content

Commit

Permalink
Added ref validator to main bbpipeline and cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
khoroshevskyi committed Sep 19, 2024
1 parent 8c1c2d3 commit ebc07dd
Show file tree
Hide file tree
Showing 15 changed files with 85 additions and 23 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ include bedboss/refgenome_validator/*
include bedboss/tokens/*
include bedboss/tokens/*
include bedboss/bbuploader/*
include bedboss/refgenome_validator/chrom_sizes/*
5 changes: 4 additions & 1 deletion bedboss/bbuploader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ def upload_gse(
sa_session=session,
gse_status_sa_model=gse_status,
standardize_pep=standardize_pep,
rerun=rerun,
)
except Exception as e:
_LOGGER.error(f"Processing of '{gse}' failed with error: {e}")
Expand Down Expand Up @@ -353,6 +354,7 @@ def _upload_gse(
sa_session: Session = None,
gse_status_sa_model: GeoGseStatus = None,
standardize_pep: bool = False,
rerun: bool = False,
) -> ProjectProcessingStatus:
"""
Upload bed files from GEO series to BedBase
Expand All @@ -365,6 +367,7 @@ def _upload_gse(
:param sa_session: opened session to the database
:param gse_status_sa_model: sqlalchemy model for project status
:param standardize_pep: standardize pep metadata using BEDMS
:param rerun: force overwrite data in the database
:return: processing status of the project (ProjectProcessingStatus)
"""
Expand Down Expand Up @@ -444,7 +447,7 @@ def _upload_gse(
upload_pephub=True,
upload_s3=True,
upload_qdrant=True,
force_overwrite=False,
force_overwrite=rerun,
)
uploaded_files.append(file_digest)
sample_status.status = STATUS.SUCCESS
Expand Down
14 changes: 12 additions & 2 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,17 @@
from pephubclient.helpers import MessageHandler as m
from pephubclient.helpers import is_registry_path

from bedboss._version import __version__
from bedboss.bedbuncher import run_bedbuncher
from bedboss.bedmaker.bedmaker import make_all
from bedboss.bedstat.bedstat import bedstat
from bedboss.const import BEDBOSS_PEP_SCHEMA_PATH, PKG_NAME
from bedboss.exceptions import BedBossException
from bedboss.models import (
BedClassificationUpload,
FilesUpload,
PlotsUpload,
StatsUpload,
)
from bedboss.refgenome_validator.main import ReferenceValidator

from bedboss.utils import (
standardize_genome_name,
Expand Down Expand Up @@ -62,6 +61,7 @@ def run_all(
rfg_config: str = None,
narrowpeak: bool = False,
check_qc: bool = True,
validate_reference: bool = True,
chrom_sizes: str = None,
open_signal_matrix: str = None,
ensdb: str = None,
Expand Down Expand Up @@ -91,6 +91,7 @@ def run_all(
:param bool narrowpeak: whether the regions are narrow. Used to create bed file from bedgraph or bigwig
(transcription factor implies narrow, histone mark implies broad peaks) [optional]
:param bool check_qc: set True to run quality control during bedmaking [optional] (default: True)
:param bool validate_reference: set True to run the reference genome validator [optional] (default: True)
:param str chrom_sizes: a full path to the chrom.sizes required for the bedtobigbed conversion [optional]
:param str open_signal_matrix: a full path to the openSignalMatrix required for the tissue [optional]
:param dict other_metadata: a dict containing all attributes from the sample
Expand Down Expand Up @@ -195,13 +196,22 @@ def run_all(
bed_format=bed_metadata.bed_format.value,
)

if validate_reference:
_LOGGER.info("Validating reference genome")
ref_valid_stats = ReferenceValidator().determine_compatibility(
bedfile=bed_metadata.bed_file, concise=True
)
else:
ref_valid_stats = None

bbagent.bed.add(
identifier=bed_metadata.bed_digest,
stats=stats.model_dump(exclude_unset=True),
metadata=other_metadata,
plots=plots.model_dump(exclude_unset=True),
files=files.model_dump(exclude_unset=True),
classification=classification.model_dump(exclude_unset=True),
ref_validation=ref_valid_stats,
license_id=license_id,
upload_qdrant=upload_qdrant,
upload_pephub=upload_pephub,
Expand Down
1 change: 0 additions & 1 deletion bedboss/bedclassifier/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
from bedboss.bedclassifier.bedclassifier import get_bed_type
2 changes: 0 additions & 2 deletions bedboss/refgenome_validator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +0,0 @@
from bedboss.refgenome_validator.genome_model import GenomeModel
from bedboss.refgenome_validator.main import ReferenceValidator
35 changes: 35 additions & 0 deletions bedboss/refgenome_validator/chrom_sizes/ucsc_mm9.chrom.sizes
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
chr1 197195432
chr2 181748087
chr3 159599783
chr4 155630120
chr5 152537259
chr6 149517037
chr7 152524553
chr8 131738871
chr9 124076172
chr10 129993255
chr11 121843856
chr12 121257530
chr13 120284312
chr14 125194864
chr15 103494974
chr16 98319150
chr17 95272651
chr18 90772031
chr19 61342430
chrX 166650296
chrY 15902555
chrM 16299
chr13_random 400311
chr16_random 3994
chr17_random 628739
chr1_random 1231697
chr3_random 41899
chr4_random 160594
chr5_random 357350
chr7_random 362490
chr8_random 849593
chr9_random 449403
chrUn_random 5900358
chrX_random 1785075
chrY_random 58682461
7 changes: 1 addition & 6 deletions bedboss/refgenome_validator/genome_model.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
from typing import Optional, List

import refgenconf


class GenomeModel:
"""
Initialize genome model
Expand Down Expand Up @@ -72,4 +67,4 @@ def filter_excluded_ranges(self, bed_list, igd_hit_matrix):
# We will probably have a singular .igd database that we will simply compare the bed file to, so this should probably
# just filter results in a way to determine if there were any hits/not hits for this particular genome

raise NotImplemented()
raise NotImplementedError()
5 changes: 3 additions & 2 deletions bedboss/refgenome_validator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, Non
IGD_LOCATION = os.environ["IGD_LOCATION"]
except:
# Local installation of C version of IGD
IGD_LOCATION = f"/home/drc/GITHUB/igd/IGD/bin/igd"
IGD_LOCATION = "/home/drc/GITHUB/igd/IGD/bin/igd"

# Construct an IGD command to run as subprocess
igd_command = IGD_LOCATION + f" search {self.igd_path} -q {bedfile}"
Expand Down Expand Up @@ -233,6 +233,8 @@ def determine_compatibility(
:return: a dict with CompatibilityStats models, or with CompatibilityConcise models if concise is set to True
"""

_LOGGER.info(f"Calculating reference genome stats for {bedfile}...")

if ref_filter:
# Filter out unwanted reference genomes to assess
for genome_model in self.genome_models:
Expand Down Expand Up @@ -269,7 +271,6 @@ def determine_compatibility(
model_compat_stats[genome_model.genome_alias].compatibility = (
self.calculate_rating(model_compat_stats[genome_model.genome_alias])
)

if concise:
concise_dict = {}
for name, stats in model_compat_stats.items():
Expand Down
4 changes: 2 additions & 2 deletions bedboss/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pephubclient.files_manager import FilesManager
import peppy
from peppy.const import SAMPLE_RAW_DICT_KEY
from attribute_standardizer import AttrStandardizer
from bedms import AttrStandardizer

_LOGGER = logging.getLogger("bedboss")

Expand Down Expand Up @@ -137,7 +137,7 @@ def standardize_pep(
"""
if standard_columns is None:
standard_columns = ["library_source", "assay", "genome", "species_name"]
model = AttrStandardizer("BEDBASE")
model = AttrStandardizer(model)
suggestions = model.standardize(pep)

changes = {}
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ bbconf>=0.7.0
refgenconf>=0.12.2
pandas>=2.0.0
ubiquerg>=0.6.2
bedmess>=0.1.0
bedms>=0.1.0
pephubclient>=0.4.4
geniml>=0.4.1
1 change: 0 additions & 1 deletion scripts/ref_genome_validating/grab_chrom_sizes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# This script exists because some chrom.sizes files have been hard to track down.
# However, one can extract their own from the sequence files. Requires having downloaded a reference genome locally

import os
from Bio import SeqIO


Expand Down
2 changes: 1 addition & 1 deletion scripts/ref_genome_validating/pull1000bedfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def pull_1000_bedfiles(filter, data_output_path):
try:
os.makedirs(data_output_path, exist_ok=True)
except OSError:
print(f"Directory already exists, skipping...")
print("Directory already exists, skipping...")

print(filter)
print(data_output_path)
Expand Down
3 changes: 0 additions & 3 deletions scripts/ref_genome_validating/stats_compat_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@
# PEP : donaldcampbelljr/refgenome_compat_testing:default
import copy
import os
import numpy as np
import requests
import pephubclient
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from scipy.cluster.hierarchy import linkage

try:
PEP_URL = os.environ["PEP_URL"]
Expand Down
26 changes: 25 additions & 1 deletion test/test_ref_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,33 @@


def test_main():
# dict_result = ReferenceValidator().determine_compatibility(
# FILE_PATH,
# concise=True,
# )
dict_result = ReferenceValidator().determine_compatibility(
FILE_PATH,
"/home/bnt4me/.bbcache/bedfiles/0/7/0740332b148a613342603e2e483f53e5.bed.gz",
concise=True,
)

assert dict_result


def test_another_test():
from bedboss.bbuploader.main import upload_gse

upload_gse(
# gse="gse246900",
# gse="gse247593",
# gse="gse241222",
# gse="gse266130",
gse="gse99178",
# gse="gse240325", # TODO: check if qc works
# gse="gse229592", # mice
bedbase_config="/home/bnt4me/virginia/repos/bbuploader/config_db_local.yaml",
outfolder="/home/bnt4me/virginia/repos/bbuploader/data",
# genome="HG38",
rerun=True,
run_failed=True,
run_skipped=True,
)

0 comments on commit ebc07dd

Please sign in to comment.