Skip to content

Commit

Permalink
refactor to allow option to fetch bedfiles using GEOFETCH
Browse files Browse the repository at this point in the history
  • Loading branch information
donaldcampbelljr committed Sep 9, 2024
1 parent d2697d1 commit 178719e
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 46 deletions.
6 changes: 2 additions & 4 deletions bedboss/refgenome_validator/refgenomevalidator.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict:
)
final_compat_rating["tier_ranking"] = 4

processed_stats[genome_alias].update({"Compatibility": final_compat_rating})
processed_stats[genome_alias].update({"compatibility": final_compat_rating})

return processed_stats

Expand All @@ -379,13 +379,11 @@ def get_bed_chrom_info(bed_file_path: str) -> Union[dict, None]:
Given a path to a Bedfile. Attempt to open it and read it to find all of the chromosomes and the max length of each.
"""

# TODO In bed classifier we skip a few rows just in case there is header information there...

# Right now this assumes this is atleast a 3 column bedfile
# This also assumes the bed file has been unzipped

try:
df = pd.read_csv(bed_file_path, sep="\t", header=None)
df = pd.read_csv(bed_file_path, sep="\t", header=None, skiprows=5)

max_end_for_each_chrom = df.groupby(0)[2].max()

Expand Down
151 changes: 109 additions & 42 deletions scripts/ref_genome_validating/validate_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

from bedboss.refgenome_validator import *

# helper utils
from process_exclude_ranges import unzip_bedfile, get_samples, MAX_SAMPLES

try:
IGD_DB_PATH = os.environ["IGD_DB_PATH"]
except:
Expand All @@ -26,6 +29,10 @@
# if you wish to report results to pephub
PEP_URL = "donaldcampbelljr/refgenome_compat_testing:default"

# Where to get Bedfiles?
LOCAL = False
GEOFETCH = True


def main():
# Simple script to testing that the Validator objects is working correctly.
Expand Down Expand Up @@ -55,50 +62,110 @@ def main():
new_genome_model = GenomeModel(genome_alias=reference_genome, refgenomeconf=rgc)
all_genome_models.append(new_genome_model)

# Get BED files
# for now, hard code a couple

# all_bed_files = [
# "/home/drc/GITHUB/bedboss/bedboss/test/data/bed/hg19/correct/hg19_example1.bed"
# ]
all_bed_files = []
for root, dirs, files in os.walk(BEDFILE_DIRECTORY):
for file in files:
if file.endswith(".bed"):
# print(os.path.join(root, file))
all_bed_files.append(os.path.join(root, file))

# validate each Bed file
# Create Validator Object
validator = Validator(genome_models=all_genome_models, igd_path=IGD_DB_PATH)

# use pipestat to report to pephub and file backend
psm = pipestat.PipestatManager(
pephub_path=PEP_URL,
)
psm2 = pipestat.PipestatManager(results_file_path="stats_results/results.yaml")

for bedfile in all_bed_files[:10]:
compat_vector = validator.determine_compatibility(bedfile)
# Debug printing
# import pprint
# pprint.pprint(compat_vector, depth=5)

# Report to pephub
rid = os.path.basename(bedfile)
all_vals = {}
if compat_vector:
for i in compat_vector:
if i is not None:
for key, values in i.items():
all_vals.update({key: values})

# Report to file backend
psm2.report(record_identifier=rid, values=all_vals)

# Convert to json string before reporting to pephub
for key, value in all_vals.items():
all_vals[key] = str(json.dumps(value))
psm.report(record_identifier=rid, values=all_vals)
# Get BED files
# LOCAL
if LOCAL:
print("Obtaining Bed files locally")
all_bed_files = []
for root, dirs, files in os.walk(BEDFILE_DIRECTORY):
for file in files:
if file.endswith(".bed"):
# print(os.path.join(root, file))
all_bed_files.append(os.path.join(root, file))

for bedfile in all_bed_files[:10]:
compat_vector = validator.determine_compatibility(bedfile)
rid = os.path.basename(bedfile)
all_vals = {}
if compat_vector:
for i in compat_vector:
if i is not None:
for key, values in i.items():
all_vals.update({key: values})

# use pipestat to report to pephub and file backend
psm = pipestat.PipestatManager(
pephub_path=PEP_URL,
)
psm2 = pipestat.PipestatManager(
results_file_path="stats_results/results.yaml"
)

# Report to file backend
psm2.report(record_identifier=rid, values=all_vals)

# Convert to json string before reporting to pephub
for key, value in all_vals.items():
all_vals[key] = str(json.dumps(value))
psm.report(record_identifier=rid, values=all_vals)

# # USE GEOFETCH TO OBTAIN BED FILES
if GEOFETCH:
print("Obtaining Bed files using Geofetch")
species = "homosapiens"
data_output_path = os.path.abspath("data")
species_output_path = os.path.join(data_output_path, species)
bedfileslist = os.path.join(species_output_path, "bedfileslist.txt")
results_path = os.path.abspath("results")

with open(bedfileslist, "r") as file:
lines = file.readlines()
lines = [
line.strip() for line in lines
] # Remove leading/trailing whitespace

for line in lines:
samples = get_samples(
data_output_path=species_output_path, gse_number=line
)

if samples:
for sample in samples[:MAX_SAMPLES]:
all_vals = {}
if isinstance(sample.output_file_path, list):
bedfile = sample.output_file_path[0]
else:
bedfile = sample.output_file_path
reported_ref_genome = getattr(sample, "ref_genome", None)
all_vals.update({"reported_ref_genome": reported_ref_genome})

file = unzip_bedfile(bedfile, results_path)

if file:
compat_vector = validator.determine_compatibility(bedfile)
rid = os.path.basename(bedfile)
tier = {
"tier_rating": {}
} # add this to a column to make comparisons easier for human eyes on pephub
if compat_vector:
for i in compat_vector:
if i is not None:
for key, values in i.items():
all_vals.update({key: values})
if "compatibility" in values:
tier["tier_rating"].update(
{key: values["compatibility"]}
)
all_vals.update(tier)

# use pipestat to report to pephub and file backend
psm = pipestat.PipestatManager(
pephub_path=PEP_URL,
)
psm2 = pipestat.PipestatManager(
results_file_path="stats_results/results.yaml"
)

# Report to file backend
psm2.report(record_identifier=rid, values=all_vals)

# Convert to json string before reporting to pephub
for key, value in all_vals.items():
all_vals[key] = str(json.dumps(value))
psm.report(record_identifier=rid, values=all_vals)

return

Expand Down

0 comments on commit 178719e

Please sign in to comment.