add tier rating based on calculated stats

databio · Sep 9, 2024 · ad98d8e · ad98d8e
1 parent 70e8328
commit ad98d8e
Showing 1 changed file with 51 additions and 84 deletions.
diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py
@@ -151,8 +151,7 @@ def calculate_chrom_stats(
             length_stats["beyond_range"] = None
             length_stats["num_of_chrm_beyond"] = None
 
-        # Layer 3 Calculate Sequence Fit
-
+        # Layer 3 Calculate Sequence Fit if any query chrom names were present
         if len(query_keys_present) > 0:
             seq_fit_stats = {}
             bed_sum = 0
@@ -246,7 +245,7 @@ def determine_compatibility(
             model_compat_stats[genome_model.genome_alias] = self.calculate_chrom_stats(
                 bed_chrom_info, genome_model.chrom_sizes
             )
-            # Third Layer is to run IGD but only if layer 1 and layer 2 have passed
+            # Fourth Layer is to run IGD but only if layer 1 and layer 2 have passed
             if (
                 model_compat_stats[genome_model.genome_alias]["chrom_name_stats"][
                     "passed_chrom_names"
@@ -280,7 +279,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict:
 
         Tier1: Excellent compatibility, 0 pts
         Tier2: Good compatibility, may need some processing, 1-3 pts
-        Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 3-6 pts
+        Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts
         Tier4: Poor compatibility, 7-9 pts
 
         :param dict compat_stats: dicitionary containing unprocessed compat stats
@@ -295,7 +294,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict:
         processed_stats[genome_alias].update(compat_stats)
 
         # Currently will proceed with discrete buckets, however, we could do a point system in the future
-        final_compat_rating = "Tier 1"
+        final_compat_rating = {"tier_ranking": 1, "assigned_points": 0}
         points_rating = 0  # we will add points increasing the level and then sort into various tiers at the end
 
         # Check extra sequences sensitivity and assign points based on how sensitive the outcome is
@@ -309,91 +308,59 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict:
         if compat_stats["chrom_name_stats"]["XS"] < 0.3:
             points_rating += 1
 
-        # Check OOBR and assign points based on sensitivity
-        if compat_stats["chrom_name_stats"]["OOBR"] < 1:
-            points_rating += 3
-        if compat_stats["chrom_name_stats"]["OOBR"] < 0.7:
-            points_rating += 1
-        if compat_stats["chrom_name_stats"]["OOBR"] < 0.5:
-            points_rating += 1
-        if compat_stats["chrom_name_stats"]["OOBR"] < 0.3:
-            points_rating += 1
+        if compat_stats["chrom_name_stats"]["passed_chrom_names"]:
+            # Check OOBR and assign points based on sensitivity
+            # only assessed if no extra chroms in query bed file
+            if compat_stats["chrom_length_stats"]["OOBR"] < 1:
+                points_rating += 3
+            if compat_stats["chrom_length_stats"]["OOBR"] < 0.7:
+                points_rating += 1
+            if compat_stats["chrom_length_stats"]["OOBR"] < 0.5:
+                points_rating += 1
+            if compat_stats["chrom_length_stats"]["OOBR"] < 0.3:
+                points_rating += 1
+        else:
+            # Do nothing here, points have already been added when Assessing XS if it is not == 1
+            pass
 
         # Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome
-        # TODO
-
-        # if not compat_stats["chrom_name_stats"]["passed_chrom_names"]:
-        #     # if the file did not pass the first layer, immediately bump it down a Tier
-        #     # this pass_chrom_names bool is determined via if q_and_not_m > 1
-        #
-        #     final_compat_rating = "Tier 2"
-        #     # Ok, there were some chroms in the bed file NOT in the chromsizes file
-        #     # How many? If it's a lot, we should penalize and knock it down the tier list
-        #     # use jaccard index to determine this
-        #
-        #     if compat_stats["chrom_name_stats"]["jaccard_index"] >= 0.90:
-        #         # Keep at current tier
-        #         pass
-        #     elif compat_stats["chrom_name_stats"]["jaccard_index"] >= 0.60:
-        #         final_compat_rating = "Tier 3"
-        #     elif compat_stats["chrom_name_stats"]["jaccard_index"] < 0.60:
-        #         final_compat_rating = "Tier 4"
-        # else:
-        #     # if the bed file has passed the first layer, we should check the chrom lengths
-        #     # careful using jaccard index here for comparison as you could have a small bed file with one chr
-        #     # that is in both the ref and query but would still have a low jaccard index
-        #     if not compat_stats["chrom_length_stats"]["beyond_range"] == 0:
-        #         final_compat_rating = "Tier 2"
-        #         # Some chroms are beyond range. Determine how this should affect tier rating.
-        #         # We can assess  what percentage of the chroms in bed file extended beyond the chromsizes file
-        #         # and also compare to how many extended chroms are in the genome.
-        #         if (
-        #             compat_stats["chrom_length_stats"]["percentage_bed_chrm_beyond"]
-        #             >= 0.50
-        #         ):
-        #             if (
-        #                 compat_stats["chrom_length_stats"][
-        #                     "percentage_genome_chrm_beyond"
-        #                 ]
-        #                 >= 0.50
-        #             ):
-        #                 final_compat_rating = "Tier 4"
-        #             if (
-        #                 compat_stats["chrom_length_stats"][
-        #                     "percentage_genome_chrm_beyond"
-        #                 ]
-        #                 < 0.50
-        #             ):
-        #                 final_compat_rating = "Tier 3"
-        #         if (
-        #             compat_stats["chrom_length_stats"]["percentage_bed_chrm_beyond"]
-        #             <= 0.50
-        #         ):
-        #             if (
-        #                 compat_stats["chrom_length_stats"][
-        #                     "percentage_genome_chrm_beyond"
-        #                 ]
-        #                 >= 0.30
-        #             ):
-        #                 final_compat_rating = "Tier 4"
-        #             if (
-        #                 compat_stats["chrom_length_stats"][
-        #                     "percentage_genome_chrm_beyond"
-        #                 ]
-        #                 < 0.30
-        #             ):
-        #                 final_compat_rating = "Tier 3"
-        #
-        #     else:
-        #         # Keep at Tier 1 and run analysis igd_stats for layer 3 analysis
-        #         if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}:
-        #             self.process_igd_stats(compat_stats["igd_stats"])
-
-        # Run analysis igd_stats for layer 3 analysis
+        if compat_stats["seq_fit_stats"]["sequence_fit"]:
+            # since this is only on keys present in both, ratio should always be less than 1
+            # Should files be penalized here or actually awarded but only if the fit is really good?
+            if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.90:
+                points_rating += 1
+            if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60:
+                points_rating += 1
+            if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60:
+                points_rating += 1
+
+        else:
+            # if no chrom names were found during assessment
+            points_rating += 4
+
+        # Run analysis on igd_stats
         # WIP, currently only showing IGD stats for informational purposes
         if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}:
             self.process_igd_stats(compat_stats["igd_stats"])
 
+        final_compat_rating["assigned_points"] = points_rating
+
+        # Now Assign Tier Based on Points
+
+        if points_rating == 0:
+            final_compat_rating["tier_ranking"] = 1
+        elif 1 <= points_rating <= 3:
+            final_compat_rating["tier_ranking"] = 2
+        elif 4 <= points_rating <= 6:
+            final_compat_rating["tier_ranking"] = 3
+        elif 7 <= points_rating:
+            final_compat_rating["tier_ranking"] = 4
+        else:
+            print(
+                f"Catching points discrepancy,points = {points_rating}, assigning to Tier 4"
+            )
+            final_compat_rating["tier_ranking"] = 4
+
         processed_stats[genome_alias].update({"Compatibility": final_compat_rating})
 
         return processed_stats