diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py index 530dcb4..efcac28 100644 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ b/bedboss/refgenome_validator/refgenomevalidator.py @@ -151,8 +151,7 @@ def calculate_chrom_stats( length_stats["beyond_range"] = None length_stats["num_of_chrm_beyond"] = None - # Layer 3 Calculate Sequence Fit - + # Layer 3 Calculate Sequence Fit if any query chrom names were present if len(query_keys_present) > 0: seq_fit_stats = {} bed_sum = 0 @@ -246,7 +245,7 @@ def determine_compatibility( model_compat_stats[genome_model.genome_alias] = self.calculate_chrom_stats( bed_chrom_info, genome_model.chrom_sizes ) - # Third Layer is to run IGD but only if layer 1 and layer 2 have passed + # Fourth Layer is to run IGD but only if layer 1 and layer 2 have passed if ( model_compat_stats[genome_model.genome_alias]["chrom_name_stats"][ "passed_chrom_names" @@ -280,7 +279,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: Tier1: Excellent compatibility, 0 pts Tier2: Good compatibility, may need some processing, 1-3 pts - Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 3-6 pts + Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts Tier4: Poor compatibility, 7-9 pts :param dict compat_stats: dicitionary containing unprocessed compat stats @@ -295,7 +294,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: processed_stats[genome_alias].update(compat_stats) # Currently will proceed with discrete buckets, however, we could do a point system in the future - final_compat_rating = "Tier 1" + final_compat_rating = {"tier_ranking": 1, "assigned_points": 0} points_rating = 0 # we will add points increasing the level and then sort into various tiers at the end # Check extra sequences sensitivity and assign points based on how sensitive the outcome is @@ -309,91 +308,59 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: if compat_stats["chrom_name_stats"]["XS"] < 0.3: points_rating += 1 - # Check OOBR and assign points based on sensitivity - if compat_stats["chrom_name_stats"]["OOBR"] < 1: - points_rating += 3 - if compat_stats["chrom_name_stats"]["OOBR"] < 0.7: - points_rating += 1 - if compat_stats["chrom_name_stats"]["OOBR"] < 0.5: - points_rating += 1 - if compat_stats["chrom_name_stats"]["OOBR"] < 0.3: - points_rating += 1 + if compat_stats["chrom_name_stats"]["passed_chrom_names"]: + # Check OOBR and assign points based on sensitivity + # only assessed if no extra chroms in query bed file + if compat_stats["chrom_length_stats"]["OOBR"] < 1: + points_rating += 3 + if compat_stats["chrom_length_stats"]["OOBR"] < 0.7: + points_rating += 1 + if compat_stats["chrom_length_stats"]["OOBR"] < 0.5: + points_rating += 1 + if compat_stats["chrom_length_stats"]["OOBR"] < 0.3: + points_rating += 1 + else: + # Do nothing here, points have already been added when Assessing XS if it is not == 1 + pass # Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome - # TODO - - # if not compat_stats["chrom_name_stats"]["passed_chrom_names"]: - # # if the file did not pass the first layer, immediately bump it down a Tier - # # this pass_chrom_names bool is determined via if q_and_not_m > 1 - # - # final_compat_rating = "Tier 2" - # # Ok, there were some chroms in the bed file NOT in the chromsizes file - # # How many? If it's a lot, we should penalize and knock it down the tier list - # # use jaccard index to determine this - # - # if compat_stats["chrom_name_stats"]["jaccard_index"] >= 0.90: - # # Keep at current tier - # pass - # elif compat_stats["chrom_name_stats"]["jaccard_index"] >= 0.60: - # final_compat_rating = "Tier 3" - # elif compat_stats["chrom_name_stats"]["jaccard_index"] < 0.60: - # final_compat_rating = "Tier 4" - # else: - # # if the bed file has passed the first layer, we should check the chrom lengths - # # careful using jaccard index here for comparison as you could have a small bed file with one chr - # # that is in both the ref and query but would still have a low jaccard index - # if not compat_stats["chrom_length_stats"]["beyond_range"] == 0: - # final_compat_rating = "Tier 2" - # # Some chroms are beyond range. Determine how this should affect tier rating. - # # We can assess what percentage of the chroms in bed file extended beyond the chromsizes file - # # and also compare to how many extended chroms are in the genome. - # if ( - # compat_stats["chrom_length_stats"]["percentage_bed_chrm_beyond"] - # >= 0.50 - # ): - # if ( - # compat_stats["chrom_length_stats"][ - # "percentage_genome_chrm_beyond" - # ] - # >= 0.50 - # ): - # final_compat_rating = "Tier 4" - # if ( - # compat_stats["chrom_length_stats"][ - # "percentage_genome_chrm_beyond" - # ] - # < 0.50 - # ): - # final_compat_rating = "Tier 3" - # if ( - # compat_stats["chrom_length_stats"]["percentage_bed_chrm_beyond"] - # <= 0.50 - # ): - # if ( - # compat_stats["chrom_length_stats"][ - # "percentage_genome_chrm_beyond" - # ] - # >= 0.30 - # ): - # final_compat_rating = "Tier 4" - # if ( - # compat_stats["chrom_length_stats"][ - # "percentage_genome_chrm_beyond" - # ] - # < 0.30 - # ): - # final_compat_rating = "Tier 3" - # - # else: - # # Keep at Tier 1 and run analysis igd_stats for layer 3 analysis - # if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}: - # self.process_igd_stats(compat_stats["igd_stats"]) - - # Run analysis igd_stats for layer 3 analysis + if compat_stats["seq_fit_stats"]["sequence_fit"]: + # since this is only on keys present in both, ratio should always be less than 1 + # Should files be penalized here or actually awarded but only if the fit is really good? + if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.90: + points_rating += 1 + if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60: + points_rating += 1 + if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60: + points_rating += 1 + + else: + # if no chrom names were found during assessment + points_rating += 4 + + # Run analysis on igd_stats # WIP, currently only showing IGD stats for informational purposes if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}: self.process_igd_stats(compat_stats["igd_stats"]) + final_compat_rating["assigned_points"] = points_rating + + # Now Assign Tier Based on Points + + if points_rating == 0: + final_compat_rating["tier_ranking"] = 1 + elif 1 <= points_rating <= 3: + final_compat_rating["tier_ranking"] = 2 + elif 4 <= points_rating <= 6: + final_compat_rating["tier_ranking"] = 3 + elif 7 <= points_rating: + final_compat_rating["tier_ranking"] = 4 + else: + print( + f"Catching points discrepancy,points = {points_rating}, assigning to Tier 4" + ) + final_compat_rating["tier_ranking"] = 4 + processed_stats[genome_alias].update({"Compatibility": final_compat_rating}) return processed_stats