Skip to content

Commit

Permalink
add more logic to compare_chrom_names
Browse files Browse the repository at this point in the history
  • Loading branch information
donaldcampbelljr committed Sep 4, 2024
1 parent e4ce058 commit 633e4d6
Showing 1 changed file with 43 additions and 61 deletions.
104 changes: 43 additions & 61 deletions bedboss/refgenome_validator/refgenomevalidator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,49 +20,9 @@ def __init__(self, genome_models):
[]
) # this will be a list of dictionary info with length of genome_models

def predict_parent(self):
"""
Predicts from a mutually exclusive list of reference genomes
"""

# For mutually exclusive items, we can quickly check, chrom sizes and chrom names and see if there are incompatible
# genome models and truncate the list at the very beginning

predicted_parents = []

for bed in self.bedfiles:
# We may be doing this somewhere else and should just pass this in...
bed_chrom_info = get_bed_chrom_info(bed)

for genome in self.genome_models:
exclude_result = self.compare_chrom_size(
bed_chrom_info, genome.chrom_sizes
)
if not exclude_result:
# keep the genome
predicted_parents.append(genome.alias)
else:
# Can we for sure exclude this? if so delete it
# Do we need to deep copy this or will it cause issues?
self.genome_models.remove(genome)

predicted_parent = "hg38"

return predicted_parent

def modify_compatibility_matrix(self):
"""
modifys the data frame storing compatibility matrix of entries
"""

pass

def compare_chrom_names_lengths(
self, bed_chrom_sizes: dict, genome_chrom_sizes: dict
):
) -> dict:
"""
Given two dicts of chroms (key) and their sizes (values)
determine overlap
Expand All @@ -73,32 +33,54 @@ def compare_chrom_names_lengths(
return dict: returns a dictionary with information on Query vs Model, e.g. chrom names QueryvsModel
"""

# Check names
# Layer 1: Check names
# Define Three separate counts
# Q = Query, M = Model
name_stats = {}
q_and_m = None
q_and_not_m = None
not_q_and_m = None

name_stats = {}

extra_chroms = False
chroms_beyond_range = False
exclude = False
q_and_m = 0
q_and_not_m = 0
not_q_and_m = 0
passed_chrom_names = True

for key in list(bed_chrom_sizes.keys()):
if key not in genome_chrom_sizes:
extra_chroms = True
elif key in genome_chrom_sizes:
if genome_chrom_sizes[key] < bed_chrom_sizes[key]:
chroms_beyond_range = True

if extra_chroms or chroms_beyond_range:
exclude = True
return exclude

return exclude
q_and_not_m += 1
if key in genome_chrom_sizes:
q_and_m += 1
for key in list(genome_chrom_sizes.keys()):
if key not in bed_chrom_sizes:
not_q_and_m += 1

# What is our threshold for passing layer 1?
if q_and_not_m > 1:
passed_chrom_names = False

name_stats["q_and_m"] = q_and_m
name_stats["q_and_not_m"] = q_and_not_m
name_stats["not_q_and_m"] = not_q_and_m
name_stats["passed_chrom_names"] = passed_chrom_names

# Layer 2: Check Lengths, but only if layer 1 is passing
if passed_chrom_names:
length_stats = {}

chroms_beyond_range = False
num_of_chrm_beyond = 0

for key in list(bed_chrom_sizes.keys()):
if key in genome_chrom_sizes:
if bed_chrom_sizes[key] > genome_chrom_sizes[key]:
num_of_chrm_beyond += 1
chroms_beyond_range = True

length_stats["beyond_range"] = chroms_beyond_range
length_stats["num_of_chrm_beyond"] = num_of_chrm_beyond
else:
length_stats = {}
length_stats["beyond_range"] = None
length_stats["num_of_chrm_beyond"] = None

return {"chrom_name_stats": name_stats, "chrom_length_stats": length_stats}

def determine_compatibility(
self, bedfile: str, ref_filter: list[str]
Expand Down

0 comments on commit 633e4d6

Please sign in to comment.