Skip to content

Commit

Permalink
add tier rating based on calculated stats
Browse files Browse the repository at this point in the history
  • Loading branch information
donaldcampbelljr committed Sep 9, 2024
1 parent 70e8328 commit ad98d8e
Showing 1 changed file with 51 additions and 84 deletions.
135 changes: 51 additions & 84 deletions bedboss/refgenome_validator/refgenomevalidator.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,7 @@ def calculate_chrom_stats(
length_stats["beyond_range"] = None
length_stats["num_of_chrm_beyond"] = None

# Layer 3 Calculate Sequence Fit

# Layer 3 Calculate Sequence Fit if any query chrom names were present
if len(query_keys_present) > 0:
seq_fit_stats = {}
bed_sum = 0
Expand Down Expand Up @@ -246,7 +245,7 @@ def determine_compatibility(
model_compat_stats[genome_model.genome_alias] = self.calculate_chrom_stats(
bed_chrom_info, genome_model.chrom_sizes
)
# Third Layer is to run IGD but only if layer 1 and layer 2 have passed
# Fourth Layer is to run IGD but only if layer 1 and layer 2 have passed
if (
model_compat_stats[genome_model.genome_alias]["chrom_name_stats"][
"passed_chrom_names"
Expand Down Expand Up @@ -280,7 +279,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict:
Tier1: Excellent compatibility, 0 pts
Tier2: Good compatibility, may need some processing, 1-3 pts
Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 3-6 pts
Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts
Tier4: Poor compatibility, 7-9 pts
:param dict compat_stats: dicitionary containing unprocessed compat stats
Expand All @@ -295,7 +294,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict:
processed_stats[genome_alias].update(compat_stats)

# Currently will proceed with discrete buckets, however, we could do a point system in the future
final_compat_rating = "Tier 1"
final_compat_rating = {"tier_ranking": 1, "assigned_points": 0}
points_rating = 0 # we will add points increasing the level and then sort into various tiers at the end

# Check extra sequences sensitivity and assign points based on how sensitive the outcome is
Expand All @@ -309,91 +308,59 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict:
if compat_stats["chrom_name_stats"]["XS"] < 0.3:
points_rating += 1

# Check OOBR and assign points based on sensitivity
if compat_stats["chrom_name_stats"]["OOBR"] < 1:
points_rating += 3
if compat_stats["chrom_name_stats"]["OOBR"] < 0.7:
points_rating += 1
if compat_stats["chrom_name_stats"]["OOBR"] < 0.5:
points_rating += 1
if compat_stats["chrom_name_stats"]["OOBR"] < 0.3:
points_rating += 1
if compat_stats["chrom_name_stats"]["passed_chrom_names"]:
# Check OOBR and assign points based on sensitivity
# only assessed if no extra chroms in query bed file
if compat_stats["chrom_length_stats"]["OOBR"] < 1:
points_rating += 3
if compat_stats["chrom_length_stats"]["OOBR"] < 0.7:
points_rating += 1
if compat_stats["chrom_length_stats"]["OOBR"] < 0.5:
points_rating += 1
if compat_stats["chrom_length_stats"]["OOBR"] < 0.3:
points_rating += 1
else:
# Do nothing here, points have already been added when Assessing XS if it is not == 1
pass

# Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome
# TODO

# if not compat_stats["chrom_name_stats"]["passed_chrom_names"]:
# # if the file did not pass the first layer, immediately bump it down a Tier
# # this pass_chrom_names bool is determined via if q_and_not_m > 1
#
# final_compat_rating = "Tier 2"
# # Ok, there were some chroms in the bed file NOT in the chromsizes file
# # How many? If it's a lot, we should penalize and knock it down the tier list
# # use jaccard index to determine this
#
# if compat_stats["chrom_name_stats"]["jaccard_index"] >= 0.90:
# # Keep at current tier
# pass
# elif compat_stats["chrom_name_stats"]["jaccard_index"] >= 0.60:
# final_compat_rating = "Tier 3"
# elif compat_stats["chrom_name_stats"]["jaccard_index"] < 0.60:
# final_compat_rating = "Tier 4"
# else:
# # if the bed file has passed the first layer, we should check the chrom lengths
# # careful using jaccard index here for comparison as you could have a small bed file with one chr
# # that is in both the ref and query but would still have a low jaccard index
# if not compat_stats["chrom_length_stats"]["beyond_range"] == 0:
# final_compat_rating = "Tier 2"
# # Some chroms are beyond range. Determine how this should affect tier rating.
# # We can assess what percentage of the chroms in bed file extended beyond the chromsizes file
# # and also compare to how many extended chroms are in the genome.
# if (
# compat_stats["chrom_length_stats"]["percentage_bed_chrm_beyond"]
# >= 0.50
# ):
# if (
# compat_stats["chrom_length_stats"][
# "percentage_genome_chrm_beyond"
# ]
# >= 0.50
# ):
# final_compat_rating = "Tier 4"
# if (
# compat_stats["chrom_length_stats"][
# "percentage_genome_chrm_beyond"
# ]
# < 0.50
# ):
# final_compat_rating = "Tier 3"
# if (
# compat_stats["chrom_length_stats"]["percentage_bed_chrm_beyond"]
# <= 0.50
# ):
# if (
# compat_stats["chrom_length_stats"][
# "percentage_genome_chrm_beyond"
# ]
# >= 0.30
# ):
# final_compat_rating = "Tier 4"
# if (
# compat_stats["chrom_length_stats"][
# "percentage_genome_chrm_beyond"
# ]
# < 0.30
# ):
# final_compat_rating = "Tier 3"
#
# else:
# # Keep at Tier 1 and run analysis igd_stats for layer 3 analysis
# if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}:
# self.process_igd_stats(compat_stats["igd_stats"])

# Run analysis igd_stats for layer 3 analysis
if compat_stats["seq_fit_stats"]["sequence_fit"]:
# since this is only on keys present in both, ratio should always be less than 1
# Should files be penalized here or actually awarded but only if the fit is really good?
if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.90:
points_rating += 1
if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60:
points_rating += 1
if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60:
points_rating += 1

else:
# if no chrom names were found during assessment
points_rating += 4

# Run analysis on igd_stats
# WIP, currently only showing IGD stats for informational purposes
if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}:
self.process_igd_stats(compat_stats["igd_stats"])

final_compat_rating["assigned_points"] = points_rating

# Now Assign Tier Based on Points

if points_rating == 0:
final_compat_rating["tier_ranking"] = 1
elif 1 <= points_rating <= 3:
final_compat_rating["tier_ranking"] = 2
elif 4 <= points_rating <= 6:
final_compat_rating["tier_ranking"] = 3
elif 7 <= points_rating:
final_compat_rating["tier_ranking"] = 4
else:
print(
f"Catching points discrepancy,points = {points_rating}, assigning to Tier 4"
)
final_compat_rating["tier_ranking"] = 4

processed_stats[genome_alias].update({"Compatibility": final_compat_rating})

return processed_stats
Expand Down

0 comments on commit ad98d8e

Please sign in to comment.