From 156376be78c135bdea653aec00531be2f2bdfc40 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Sep 2024 16:19:03 -0400 Subject: [PATCH 01/14] refactoring --- bedboss/refgenome_validator/__init__.py | 4 +- bedboss/refgenome_validator/models.py | 47 +++ .../refgenome_validator/refgenomevalidator.py | 272 +++++++++--------- test/test_ref_validator.py | 13 + 4 files changed, 189 insertions(+), 147 deletions(-) create mode 100644 bedboss/refgenome_validator/models.py create mode 100644 test/test_ref_validator.py diff --git a/bedboss/refgenome_validator/__init__.py b/bedboss/refgenome_validator/__init__.py index 24f8c10..3bcafd3 100644 --- a/bedboss/refgenome_validator/__init__.py +++ b/bedboss/refgenome_validator/__init__.py @@ -1,4 +1,2 @@ from bedboss.refgenome_validator.genome_model import GenomeModel -from bedboss.refgenome_validator.refgenomevalidator import RefValidator - -# __all__ = ["GenomeModel"] +from bedboss.refgenome_validator.refgenomevalidator import ReferenceValidator diff --git a/bedboss/refgenome_validator/models.py b/bedboss/refgenome_validator/models.py new file mode 100644 index 0000000..25261f5 --- /dev/null +++ b/bedboss/refgenome_validator/models.py @@ -0,0 +1,47 @@ +from pydantic import BaseModel, ConfigDict +from typing import Union + + +class ChromNameStats(BaseModel): + xs: float = 0.0 + q_and_m: float = 0.0 + q_and_not_m: float = 0.0 + not_q_and_m: float = 0.0 + jaccard_index: float = 0.0 + jaccard_index_binary: float = 0.0 + passed_chrom_names: bool = False + + +class ChromLengthStats(BaseModel): + oobr: Union[float, None] = None + beyond_range: bool = False + num_of_chrom_beyond: int = 0 + percentage_bed_chrom_beyond: float = 0.0 + percentage_genome_chrom_beyond: float = 0.0 + + +class SequenceFitStats(BaseModel): + sequence_fit: Union[float, None] = None + + +class RatingModel(BaseModel): + assigned_points: int + tier_ranking: int + # model_config = ConfigDict(extra="forbid") + + +class CompatibilityStats(BaseModel): + chrom_name_stats: ChromNameStats + chrom_length_stats: ChromLengthStats + chrom_sequence_fit_stats: SequenceFitStats + igd_stats: Union[dict, None] = None + compatibility: Union[RatingModel, None] = None + + model_config = ConfigDict(extra="forbid") + + +class CompatibilityConcise(BaseModel): + xs: float = 0.0 + oobr: Union[float, None] = None + sequence_fit: Union[float, None] = None + compatibility: Union[RatingModel, None] = None diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py index 55db9df..6974d65 100644 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ b/bedboss/refgenome_validator/refgenomevalidator.py @@ -4,7 +4,15 @@ import subprocess from bedboss.exceptions import ValidatorException -from bedboss.refgenome_validator import GenomeModel +from bedboss.refgenome_validator.genome_model import GenomeModel +from bedboss.refgenome_validator.models import ( + ChromNameStats, + ChromLengthStats, + SequenceFitStats, + CompatibilityStats, + RatingModel, + CompatibilityConcise, +) try: IGD_LOCATION = os.environ["IGD_LOCATION"] @@ -13,12 +21,10 @@ IGD_LOCATION = f"/home/drc/GITHUB/igd/IGD/bin/igd" -class RefValidator: +class ReferenceValidator: """ - This is primary class for creating a compatibility vector An object of this class is to be created once and then used for the entirety of a pipeline,e.g. Bedboss. - """ def __init__( @@ -29,30 +35,26 @@ def __init__( """ Initialization method - :param list[GenomeModels] genome_models: this is a list of GenomeModels that will be checked against a bed file - :param str igd_path: path to a local IGD file containing ALL excluded ranges intervals for IGD overlap assessment, if not provided these metrics are not computed. - + :param genome_models: this is a list of GenomeModels that will be checked against a bed file + :param igd_path: path to a local IGD file containing ALL excluded ranges intervals for IGD overlap assessment, if not provided these metrics are not computed. """ - if not genome_models: - genome_models = self.build_default_models() - if isinstance(genome_models, str): + if not genome_models: + genome_models = self._build_default_models() + elif isinstance(genome_models, str): genome_models = list(genome_models) - if not isinstance(genome_models, list): + elif not isinstance(genome_models, list): raise ValidatorException( reason="A list of GenomeModels must be provided to initialize the Validator class" ) - self.genome_models = genome_models - + self.genome_models: List[GenomeModel] = genome_models self.igd_path = igd_path - # this will be a list of dictionary info with length of genome_models - self.compatibility_list = [] - + @staticmethod def calculate_chrom_stats( - self, bed_chrom_sizes: dict, genome_chrom_sizes: dict - ) -> dict: + bed_chrom_sizes: dict, genome_chrom_sizes: dict + ) -> CompatibilityStats: """ Given two dicts of chroms (key) and their sizes (values) determine overlap and sequence fit @@ -67,7 +69,6 @@ def calculate_chrom_stats( # Layer 1: Check names and Determine XS (Extra Sequences) via Calculation of Recall/Sensitivity # Q = Query, M = Model - name_stats = {} q_and_m = 0 # how many chrom names are in the query and the genome model? q_and_not_m = ( 0 # how many chrom names are in the query but not in the genome model? @@ -108,76 +109,69 @@ def calculate_chrom_stats( sensitivity = q_and_m / (q_and_m + q_and_not_m) # Assign Stats - # TODO maybe use pydantic in the future - name_stats["XS"] = sensitivity - name_stats["q_and_m"] = q_and_m - name_stats["q_and_not_m"] = q_and_not_m - name_stats["not_q_and_m"] = not_q_and_m - name_stats["jaccard_index"] = chrom_jaccard_index - name_stats["jaccard_index_binary"] = jaccard_binary - name_stats["passed_chrom_names"] = passed_chrom_names + name_stats = ChromNameStats( + xs=sensitivity, + q_and_m=q_and_m, + q_and_not_m=q_and_not_m, + not_q_and_m=not_q_and_m, + jaccard_index=chrom_jaccard_index, + jaccard_index_binary=jaccard_binary, + passed_chrom_names=passed_chrom_names, + ) # Layer 2: Check Lengths, but only if layer 1 is passing if passed_chrom_names: - length_stats = {} - chroms_beyond_range = False - num_of_chrm_beyond = 0 - num_chrm_within_bounds = 0 + num_of_chrom_beyond = 0 + num_chrom_within_bounds = 0 for key in list(bed_chrom_sizes.keys()): if key in genome_chrom_sizes: if bed_chrom_sizes[key] > genome_chrom_sizes[key]: - num_of_chrm_beyond += 1 + num_of_chrom_beyond += 1 chroms_beyond_range = True else: - num_chrm_within_bounds += 1 + num_chrom_within_bounds += 1 # Calculate recall/sensitivity for chrom lengths # defined as OOBR -> Out of Bounds Range - sensitivity = num_chrm_within_bounds / ( - num_chrm_within_bounds + num_of_chrm_beyond - ) - length_stats["OOBR"] = sensitivity - - length_stats["beyond_range"] = chroms_beyond_range - length_stats["num_of_chrm_beyond"] = num_of_chrm_beyond - - # Naive calculation, seq fit (below) may be better metric - length_stats["percentage_bed_chrm_beyond"] = ( - 100 * num_of_chrm_beyond / len(bed_chrom_set) + sensitivity = num_chrom_within_bounds / ( + num_chrom_within_bounds + num_of_chrom_beyond ) - length_stats["percentage_genome_chrm_beyond"] = ( - 100 * num_of_chrm_beyond / len(genome_chrom_set) + length_stats = ChromLengthStats( + oobr=sensitivity, + beyond_range=chroms_beyond_range, + num_of_chrm_beyond=num_of_chrom_beyond, + percentage_bed_chrom_beyond=( + 100 * num_of_chrom_beyond / len(bed_chrom_set) + ), + percentage_genome_chrom_beyond=( + 100 * num_of_chrom_beyond / len(genome_chrom_set) + ), ) else: - length_stats = {} - length_stats["OOBR"] = None - length_stats["beyond_range"] = None - length_stats["num_of_chrm_beyond"] = None + length_stats = ChromLengthStats() # Layer 3 Calculate Sequence Fit if any query chrom names were present if len(query_keys_present) > 0: - seq_fit_stats = {} bed_sum = 0 ref_genome_sum = 0 - for chr in query_keys_present: - bed_sum += int(genome_chrom_sizes[chr]) - for chr in genome_chrom_sizes: - ref_genome_sum += int(genome_chrom_sizes[chr]) + for q_chr in query_keys_present: + bed_sum += int(genome_chrom_sizes[q_chr]) + for g_chr in genome_chrom_sizes: + ref_genome_sum += int(genome_chrom_sizes[g_chr]) + + seq_fit_stats = SequenceFitStats(sequence_fit=bed_sum / ref_genome_sum) - sequence_fit = bed_sum / ref_genome_sum - seq_fit_stats["sequence_fit"] = sequence_fit else: - seq_fit_stats = {} - seq_fit_stats["sequence_fit"] = None + seq_fit_stats = SequenceFitStats(sequence_fit=None) - return { - "chrom_name_stats": name_stats, - "chrom_length_stats": length_stats, - "seq_fit_stats": seq_fit_stats, - } + return CompatibilityStats( + chrom_name_stats=name_stats, + chrom_length_stats=length_stats, + chrom_sequence_fit_stats=seq_fit_stats, + ) def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, None]]: """ @@ -217,19 +211,20 @@ def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, Non if "file_name" in datum and "number_of_hits" in datum: overlaps_dict.update({datum["file_name"]: datum["number_of_hits"]}) - return {"igd_stats": overlaps_dict} + return overlaps_dict def determine_compatibility( self, bedfile: str, ref_filter: Optional[List[str]] = None, concise: Optional[bool] = False, - ) -> Union[List[dict], None]: + ) -> Union[List[CompatibilityStats], List[CompatibilityConcise]]: """ Given a bedfile, determine compatibility with reference genomes (GenomeModels) created at Validator initialization. :param str bedfile: path to bedfile on disk, .bed :param list[str] ref_filter: list of ref genome aliases to filter on. + :param bool concise: if True, only return a concise list of compatibility stats :return list[dict]: a list of dictionaries where each element of the array represents a compatibility dictionary for each refgenome model. """ @@ -252,40 +247,36 @@ def determine_compatibility( final_compatibility_list = [] for genome_model in self.genome_models: # First and Second Layer of Compatibility - model_compat_stats[genome_model.genome_alias] = self.calculate_chrom_stats( - bed_chrom_info, genome_model.chrom_sizes + model_compat_stats[genome_model.genome_alias]: CompatibilityStats = ( + self.calculate_chrom_stats(bed_chrom_info, genome_model.chrom_sizes) ) # Fourth Layer is to run IGD but only if layer 1 and layer 2 have passed if ( - model_compat_stats[genome_model.genome_alias]["chrom_name_stats"][ - "passed_chrom_names" - ] - and not model_compat_stats[genome_model.genome_alias][ - "chrom_length_stats" - ]["beyond_range"] + model_compat_stats[ + genome_model.genome_alias + ].chrom_name_stats.passed_chrom_names + and not model_compat_stats[ + genome_model.genome_alias + ].chrom_length_stats.beyond_range ): - model_compat_stats[genome_model.genome_alias].update( + model_compat_stats[genome_model.genome_alias].igd_stats = ( self.get_igd_overlaps(bedfile) ) - else: - model_compat_stats[genome_model.genome_alias].update( - {"igd_stats": None} - ) # Once all stats are collected, process them and add compatibility rating - processed_stats = self.process_compat_stats( - model_compat_stats[genome_model.genome_alias], genome_model.genome_alias - ) + model_compat_stats[genome_model.genome_alias].compatibility = self.calculate_rating(model_compat_stats[genome_model.genome_alias]) - final_compatibility_list.append(processed_stats) + if concise: + ... - if concise: - # TODO just return XS, OOBR, SEQ FIT, COMPAT TIER - return final_compatibility_list + return model_compat_stats + # if concise: + # # TODO just return XS, OOBR, SEQ FIT, COMPAT TIER + # return final_compatibility_list + # + # return final_compatibility_list - return final_compatibility_list - - def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: + def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: """ Given compatibility stats for a specific ref genome determine the compatibility tier @@ -296,56 +287,53 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts Tier4: Poor compatibility, 7-9 pts - :param dict compat_stats: dicitionary containing unprocessed compat stats - :param str genome_alias: + :param CompatibilityStats compat_stats: dicitionary containing unprocessed compat stats :return dict : containing both the original stats and the final compatibility rating for the reference genome """ - # Set up processed stats dict, it will add an extra key to the original dict - # with the final rating - processed_stats = {} - processed_stats[genome_alias] = {} - processed_stats[genome_alias].update(compat_stats) - - # Currently will proceed with discrete buckets, however, we could do a point system in the future - final_compat_rating = {"tier_ranking": 1, "assigned_points": 0} - points_rating = 0 # we will add points increasing the level and then sort into various tiers at the end + points_rating = 0 - # Check extra sequences sensitivity and assign points based on how sensitive the outcome is + # 1. Check extra sequences sensitivity and assign points based on how sensitive the outcome is # sensitivity = 1 is considered great and no points should be assigned - if compat_stats["chrom_name_stats"]["XS"] < 1: + xs = compat_stats.chrom_name_stats.xs + if xs < 0.3: + points_rating += 6 # 3 + 1 + 1 + 1 + elif xs < 0.5: + points_rating += 5 # 3 + 1 + 1 + elif xs < 0.7: + points_rating += 4 # 3 + 1 + elif xs < 1: points_rating += 3 - if compat_stats["chrom_name_stats"]["XS"] < 0.7: - points_rating += 1 - if compat_stats["chrom_name_stats"]["XS"] < 0.5: - points_rating += 1 - if compat_stats["chrom_name_stats"]["XS"] < 0.3: - points_rating += 1 - - if compat_stats["chrom_name_stats"]["passed_chrom_names"]: - # Check OOBR and assign points based on sensitivity - # only assessed if no extra chroms in query bed file - if compat_stats["chrom_length_stats"]["OOBR"] < 1: + else: + pass + + # 2. Check OOBR and assign points based on sensitivity + # only assessed if no extra chroms in query bed file + if compat_stats.chrom_name_stats.passed_chrom_names: + oobr = compat_stats.chrom_length_stats.oobr + + if oobr < 0.3: + points_rating += 6 # 3 + 1 + 1 + 1 + elif oobr < 0.5: + points_rating += 5 # 3 + 1 + 1 + elif oobr < 0.7: + points_rating += 4 # 3 + 1 + elif oobr < 1: points_rating += 3 - if compat_stats["chrom_length_stats"]["OOBR"] < 0.7: - points_rating += 1 - if compat_stats["chrom_length_stats"]["OOBR"] < 0.5: - points_rating += 1 - if compat_stats["chrom_length_stats"]["OOBR"] < 0.3: - points_rating += 1 else: # Do nothing here, points have already been added when Assessing XS if it is not == 1 pass # Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome - if compat_stats["seq_fit_stats"]["sequence_fit"]: + sequence_fit = compat_stats.chrom_sequence_fit_stats.sequence_fit + if sequence_fit: # since this is only on keys present in both, ratio should always be less than 1 # Should files be penalized here or actually awarded but only if the fit is really good? - if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.90: + if sequence_fit < 0.90: points_rating += 1 - if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60: + if sequence_fit < 0.60: points_rating += 1 - if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60: + if sequence_fit < 0.60: points_rating += 1 else: @@ -354,30 +342,29 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: # Run analysis on igd_stats # WIP, currently only showing IGD stats for informational purposes - if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}: - self.process_igd_stats(compat_stats["igd_stats"]) + if compat_stats.igd_stats and compat_stats.igd_stats != {}: + self.process_igd_stats(compat_stats.igd_stats) - final_compat_rating["assigned_points"] = points_rating - - # Now Assign Tier Based on Points + tier_ranking = 0 if points_rating == 0: - final_compat_rating["tier_ranking"] = 1 + tier_ranking = 1 elif 1 <= points_rating <= 3: - final_compat_rating["tier_ranking"] = 2 + tier_ranking = 2 elif 4 <= points_rating <= 6: - final_compat_rating["tier_ranking"] = 3 + tier_ranking = 3 elif 7 <= points_rating: - final_compat_rating["tier_ranking"] = 4 + tier_ranking = 4 else: print( f"Catching points discrepancy,points = {points_rating}, assigning to Tier 4" ) - final_compat_rating["tier_ranking"] = 4 + tier_ranking = 4 - processed_stats[genome_alias].update({"compatibility": final_compat_rating}) + return RatingModel( + assigned_points=points_rating, tier_ranking=tier_ranking + ) - return processed_stats def process_igd_stats(self, igd_stats: dict): """ @@ -385,27 +372,24 @@ def process_igd_stats(self, igd_stats: dict): """ pass - def build_default_models(self): + @staticmethod + def _build_default_models() -> list[GenomeModel]: """ Builds a default list of GenomeModels from the chrom.sizes folder. Uses file names as genome alias. return list[GenomeModel] """ + dir_path = os.path.dirname(os.path.realpath(__file__)) + + chrm_sizes_directory = os.path.join(dir_path, "chrom_sizes") - chrm_sizes_directory = os.path.join( - os.path.curdir, os.path.abspath("./chrom_sizes") - ) all_genome_models = [] for root, dirs, files in os.walk(chrm_sizes_directory): for file in files: if file.endswith(".sizes"): - # print(os.path.join(root, file)) - # Get file name - name = os.path.basename(file) - curr_genome_model = GenomeModel( - genome_alias=name, chrom_sizes_file=file + genome_alias=file, chrom_sizes_file=os.path.join(root, file) ) all_genome_models.append(curr_genome_model) diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py new file mode 100644 index 0000000..fd7fa93 --- /dev/null +++ b/test/test_ref_validator.py @@ -0,0 +1,13 @@ +import os + +from bedboss.refgenome_validator import ReferenceValidator + + +FILE_DIR = os.path.dirname(os.path.realpath(__file__)) +HG19_CORRECT_DIR = os.path.join(FILE_DIR, "test_data", "bed", "hg19", "correct") +FILE_PATH = f"{HG19_CORRECT_DIR}/sample1.bed.gz" + + +def test_main(): + ff = ReferenceValidator().determine_compatibility(FILE_PATH) + ff From 85f449e70807a8532c7561075114b0864cf96e8b Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Sep 2024 22:27:57 -0400 Subject: [PATCH 02/14] added concise to ref gen validator return --- bedboss/refgenome_validator/models.py | 3 +- .../refgenome_validator/refgenomevalidator.py | 47 +++++++++++-------- test/test_ref_validator.py | 9 +++- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/bedboss/refgenome_validator/models.py b/bedboss/refgenome_validator/models.py index 25261f5..f852860 100644 --- a/bedboss/refgenome_validator/models.py +++ b/bedboss/refgenome_validator/models.py @@ -44,4 +44,5 @@ class CompatibilityConcise(BaseModel): xs: float = 0.0 oobr: Union[float, None] = None sequence_fit: Union[float, None] = None - compatibility: Union[RatingModel, None] = None + assigned_points: int + tier_ranking: int diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py index 6974d65..b41b6a1 100644 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ b/bedboss/refgenome_validator/refgenomevalidator.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, List +from typing import Optional, Union, List, Dict import os import pandas as pd import subprocess @@ -136,17 +136,17 @@ def calculate_chrom_stats( # Calculate recall/sensitivity for chrom lengths # defined as OOBR -> Out of Bounds Range sensitivity = num_chrom_within_bounds / ( - num_chrom_within_bounds + num_of_chrom_beyond + num_chrom_within_bounds + num_of_chrom_beyond ) length_stats = ChromLengthStats( oobr=sensitivity, beyond_range=chroms_beyond_range, - num_of_chrm_beyond=num_of_chrom_beyond, + num_of_chrom_beyond=num_of_chrom_beyond, percentage_bed_chrom_beyond=( - 100 * num_of_chrom_beyond / len(bed_chrom_set) + 100 * num_of_chrom_beyond / len(bed_chrom_set) ), percentage_genome_chrom_beyond=( - 100 * num_of_chrom_beyond / len(genome_chrom_set) + 100 * num_of_chrom_beyond / len(genome_chrom_set) ), ) @@ -218,7 +218,7 @@ def determine_compatibility( bedfile: str, ref_filter: Optional[List[str]] = None, concise: Optional[bool] = False, - ) -> Union[List[CompatibilityStats], List[CompatibilityConcise]]: + ) -> Union[Dict[str, CompatibilityStats], Dict[str, CompatibilityConcise]]: """ Given a bedfile, determine compatibility with reference genomes (GenomeModels) created at Validator initialization. @@ -241,7 +241,7 @@ def determine_compatibility( if not bed_chrom_info: # if there is trouble parsing the bed file, return None - return None + raise ValidatorException model_compat_stats = {} final_compatibility_list = [] @@ -264,17 +264,18 @@ def determine_compatibility( ) # Once all stats are collected, process them and add compatibility rating - model_compat_stats[genome_model.genome_alias].compatibility = self.calculate_rating(model_compat_stats[genome_model.genome_alias]) + model_compat_stats[genome_model.genome_alias].compatibility = ( + self.calculate_rating(model_compat_stats[genome_model.genome_alias]) + ) + + if concise: + concise_dict = {} + for name, stats in model_compat_stats.items(): + concise_dict[name] = self._create_concise_output(stats) - if concise: - ... + return concise_dict return model_compat_stats - # if concise: - # # TODO just return XS, OOBR, SEQ FIT, COMPAT TIER - # return final_compatibility_list - # - # return final_compatibility_list def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: """ @@ -345,7 +346,6 @@ def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: if compat_stats.igd_stats and compat_stats.igd_stats != {}: self.process_igd_stats(compat_stats.igd_stats) - tier_ranking = 0 if points_rating == 0: tier_ranking = 1 @@ -361,10 +361,7 @@ def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: ) tier_ranking = 4 - return RatingModel( - assigned_points=points_rating, tier_ranking=tier_ranking - ) - + return RatingModel(assigned_points=points_rating, tier_ranking=tier_ranking) def process_igd_stats(self, igd_stats: dict): """ @@ -395,6 +392,16 @@ def _build_default_models() -> list[GenomeModel]: return all_genome_models + @staticmethod + def _create_concise_output(output: CompatibilityStats) -> CompatibilityConcise: + return CompatibilityConcise( + xs=output.chrom_name_stats.xs, + oobr=output.chrom_length_stats.oobr, + sequence_fit=output.chrom_sequence_fit_stats.sequence_fit, + assigned_points=output.compatibility.assigned_points, + tier_ranking=output.compatibility.tier_ranking, + ) + # ---------------------------- # Helper Functions diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py index fd7fa93..e68d81f 100644 --- a/test/test_ref_validator.py +++ b/test/test_ref_validator.py @@ -9,5 +9,12 @@ def test_main(): - ff = ReferenceValidator().determine_compatibility(FILE_PATH) + ff = ReferenceValidator().determine_compatibility( + FILE_PATH, + concise=True, + ) + # ff = ReferenceValidator().determine_compatibility( + # "/home/bnt4me/.bbcache/bedfiles/3/2/GSE244926_mm39_LPx6_oligofile.bed.gz", + # concise=True, + # ) ff From 1ada5c9ad9f6d94f6d93ed45f3c40472e4eea9ee Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Tue, 17 Sep 2024 10:27:58 -0400 Subject: [PATCH 03/14] added better opening of the bed file --- .../refgenome_validator/refgenomevalidator.py | 29 +-------- bedboss/refgenome_validator/utils.py | 65 +++++++++++++++++++ test/test_ref_validator.py | 10 +-- 3 files changed, 72 insertions(+), 32 deletions(-) create mode 100644 bedboss/refgenome_validator/utils.py diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py index b41b6a1..ee8f0f0 100644 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ b/bedboss/refgenome_validator/refgenomevalidator.py @@ -1,6 +1,5 @@ from typing import Optional, Union, List, Dict import os -import pandas as pd import subprocess from bedboss.exceptions import ValidatorException @@ -13,6 +12,7 @@ RatingModel, CompatibilityConcise, ) +from bedboss.refgenome_validator.utils import get_bed_chrom_info try: IGD_LOCATION = os.environ["IGD_LOCATION"] @@ -244,7 +244,7 @@ def determine_compatibility( raise ValidatorException model_compat_stats = {} - final_compatibility_list = [] + for genome_model in self.genome_models: # First and Second Layer of Compatibility model_compat_stats[genome_model.genome_alias]: CompatibilityStats = ( @@ -403,31 +403,6 @@ def _create_concise_output(output: CompatibilityStats) -> CompatibilityConcise: ) -# ---------------------------- -# Helper Functions -def get_bed_chrom_info(bed_file_path: str) -> Union[dict, None]: - """ - Given a path to a Bedfile. Attempt to open it and read it to find all of the chromosomes and the max length of each. - - returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome. - """ - - # Right now this assumes this is atleast a 3 column bedfile - # This also assumes the bed file has been unzipped - - # TODO This code is overlapping with geniml io code and should be refactored in the future, ie move opening bed files to helper utils. - - try: - df = pd.read_csv(bed_file_path, sep="\t", header=None, skiprows=5) - - max_end_for_each_chrom = df.groupby(0)[2].max() - - # return max end position for each chromosome - return max_end_for_each_chrom.to_dict() - except Exception: - return None - - def run_igd_command(command): """Run IGD via a subprocess, this is a temp implementation until Rust IGD python bindings are finished.""" diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py new file mode 100644 index 0000000..038dccd --- /dev/null +++ b/bedboss/refgenome_validator/utils.py @@ -0,0 +1,65 @@ +from typing import Union +import pandas as pd +from geniml.io.utils import is_gzipped +import logging + + +_LOGGER = logging.getLogger("bedboss") + + +def _read_gzipped_file(file_path: str) -> pd.DataFrame: + """ + !! Copy from geniml! + Read a gzipped file into a pandas dataframe + + :param file_path: path to gzipped file + :return: pandas dataframe + """ + return _read_file_pd( + file_path, + sep="\t", + compression="gzip", + header=None, + engine="pyarrow", + ) + + +def _read_file_pd(*args, **kwargs) -> pd.DataFrame: + """ + !! Copy from geniml! + Read bed file into a pandas DataFrame, and skip header rows if needed + + :return: pandas dataframe + """ + max_rows = 5 + row_count = 0 + while row_count <= max_rows: + try: + df = pd.read_csv(*args, **kwargs, skiprows=row_count) + if row_count > 0: + _LOGGER.info( + f"Skipped {row_count} rows while standardization. File: '{args}'" + ) + df = df.dropna(axis=1) + return df + except (pd.errors.ParserError, pd.errors.EmptyDataError) as _: + if row_count <= max_rows: + row_count += 1 + # if can't open file after 5 attempts try to open it with gzip + return _read_gzipped_file(*args) + + +def get_bed_chrom_info(bedfile: str) -> dict: + """ + Attempt to open it and read it to find all of the chromosomes and the max length of each. + + :param bedfile: bedfilepath + returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome. + """ + if is_gzipped(bedfile): + df = _read_gzipped_file(bedfile) + else: + df = _read_file_pd(bedfile, sep="\t", header=None, engine="pyarrow") + + max_end_for_each_chrom = df.groupby(0)[2].max() + return max_end_for_each_chrom.to_dict() diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py index e68d81f..9293534 100644 --- a/test/test_ref_validator.py +++ b/test/test_ref_validator.py @@ -9,12 +9,12 @@ def test_main(): - ff = ReferenceValidator().determine_compatibility( - FILE_PATH, - concise=True, - ) # ff = ReferenceValidator().determine_compatibility( - # "/home/bnt4me/.bbcache/bedfiles/3/2/GSE244926_mm39_LPx6_oligofile.bed.gz", + # FILE_PATH, # concise=True, # ) + ff = ReferenceValidator().determine_compatibility( + "/home/bnt4me/.bbcache/bedfiles/3/2/GSE244926_mm39_LPx6_oligofile.bed.gz", + concise=True, + ) ff From e8be1fe3a24214e3e6579886b6b9e86131dd30e8 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:51:57 -0400 Subject: [PATCH 04/14] fix stats scripts after refactor --- .../stats_compat_testing.py | 11 ++-- .../ref_genome_validating/validate_genome.py | 51 ++++++++++++------- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/scripts/ref_genome_validating/stats_compat_testing.py b/scripts/ref_genome_validating/stats_compat_testing.py index ed3a870..86e2048 100644 --- a/scripts/ref_genome_validating/stats_compat_testing.py +++ b/scripts/ref_genome_validating/stats_compat_testing.py @@ -15,7 +15,8 @@ PEP_URL = os.environ["PEP_URL"] except: # pep url - PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" + # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" + PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_refactor:default" def main(): @@ -166,9 +167,9 @@ def extract_values(dictionary): dictionary = json.loads(dictionary) - mm10_tier = dictionary["mm10"]["tier_ranking"] + mm10_tier = dictionary["ucsc_mm10"]["tier_ranking"] ncbi_hg38_tier = dictionary["ncbi_hg38"]["tier_ranking"] - hg19_tier = dictionary["hg19"]["tier_ranking"] + hg19_tier = dictionary["ucsc_hg19"]["tier_ranking"] ucsc_dm6_tier = dictionary["ucsc_dm6"]["tier_ranking"] ucsc_hg38_tier = dictionary["ucsc_hg38"]["tier_ranking"] ensembl_hg38_tier = dictionary["ensembl_hg38"]["tier_ranking"] @@ -230,8 +231,8 @@ def create_heatmap(df): colorbar.set_ticklabels(custom_labels) # plt.colorbar(ticks=4, label='Custom Label', ticklabels=custom_labels) plt.title("Tier Rating: Bed File vs Ref Genome") - plt.xlabel("Reference Genomes", fontsize=12) - plt.ylabel("Query Bed Files", fontsize=12) + plt.xlabel("Reference Genomes", fontsize=8) + plt.ylabel("Query Bed Files", fontsize=8) # plt.grid(True) plt.show() diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index c65ba39..0ee7dea 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -4,7 +4,10 @@ import refgenconf from pipestat import pipestat -from bedboss.refgenome_validator.refgenomevalidator import RefValidator, GenomeModel +from bedboss.refgenome_validator.refgenomevalidator import ( + ReferenceValidator, + GenomeModel, +) # helper utils from process_exclude_ranges import unzip_bedfile, get_samples, MAX_SAMPLES @@ -29,8 +32,9 @@ except: # if you wish to report results to pephub # PEP_URL = "donaldcampbelljr/refgenome_compat_testing:default" - PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" + # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" # PEP_URL ="donaldcampbelljr/ref_genome_dros_only:default" + PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_refactor:default" # Where to get Bedfiles? LOCAL = True @@ -77,36 +81,48 @@ def main(): # Manually create more genome models not found in ref genie ucsc_hg38 = GenomeModel( genome_alias="ucsc_hg38", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ucsc_hg38.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_hg38.chrom.sizes", ) ncbi_hg38 = GenomeModel( genome_alias="ncbi_hg38", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ncbi_hg38.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ncbi_hg38.chrom.sizes", ) ensembl_hg38 = GenomeModel( genome_alias="ensembl_hg38", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ensembl_hg38.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ensembl_hg38.chrom.sizes", ) - ucsc_pantro6 = GenomeModel( - genome_alias="ucsc_pantro6", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ucsc_panTro6.chrom.sizes", + ucsc_hg19 = GenomeModel( + genome_alias="ucsc_hg19", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_hg19.chrom.sizes", ) ucsc_dm6 = GenomeModel( genome_alias="ucsc_dm6", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ucsc_dm6.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_dm6.chrom.sizes", + ) + + ucsc_mm10 = GenomeModel( + genome_alias="ucsc_mm10", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm10.chrom.sizes", + ) + + ucsc_pantro6 = GenomeModel( + genome_alias="ucsc_pantro6", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_panTro6.chrom.sizes", ) all_genome_models.append(ucsc_hg38) all_genome_models.append(ncbi_hg38) + all_genome_models.append(ucsc_mm10) + all_genome_models.append(ucsc_hg19) all_genome_models.append(ensembl_hg38) all_genome_models.append(ucsc_pantro6) all_genome_models.append(ucsc_dm6) # Create Validator Object - validator = RefValidator(genome_models=all_genome_models) + validator = ReferenceValidator(genome_models=all_genome_models) # Get BED files # LOCAL @@ -127,14 +143,15 @@ def main(): } # add this to a column to make comparisons easier for human eyes on pephub all_vals = {} if compat_vector: - for i in compat_vector: + for i in compat_vector.keys(): if i is not None: - for key, values in i.items(): - all_vals.update({key: values}) - if "compatibility" in values: - tier["tier_rating"].update( - {key: values["compatibility"]} - ) + all_vals.update({i: compat_vector[i].model_dump()}) + dict_to_check = compat_vector[i].model_dump() + if "compatibility" in dict_to_check: + tier["tier_rating"].update( + {i: dict_to_check["compatibility"]} + ) + all_vals.update(tier) # use pipestat to report to pephub and file backend From ccf0c5c848dd0222b1355d3df612b2183bd49021 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:11:44 -0400 Subject: [PATCH 05/14] fix row skipping during file opening --- bedboss/refgenome_validator/utils.py | 25 +++++++++++++++++-- .../ref_genome_validating/validate_genome.py | 6 +++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py index 038dccd..68cb66e 100644 --- a/bedboss/refgenome_validator/utils.py +++ b/bedboss/refgenome_validator/utils.py @@ -41,11 +41,32 @@ def _read_file_pd(*args, **kwargs) -> pd.DataFrame: f"Skipped {row_count} rows while standardization. File: '{args}'" ) df = df.dropna(axis=1) - return df + for index, row in df.iterrows(): + if ( + isinstance(row[0], str) + and isinstance(row[1], int) + and isinstance(row[2], int) + ): + return df + else: + if isinstance(row[1], str): + try: + _ = int(row[1]) + df[1] = pd.to_numeric(df[1]) + except Exception: + row_count += 1 + break + if isinstance(row[2], str): + try: + _ = int(row[2]) + df[2] = pd.to_numeric(df[2]) + except Exception: + row_count += 1 + break + return df except (pd.errors.ParserError, pd.errors.EmptyDataError) as _: if row_count <= max_rows: row_count += 1 - # if can't open file after 5 attempts try to open it with gzip return _read_gzipped_file(*args) diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index 0ee7dea..4e1ea6b 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -109,6 +109,11 @@ def main(): chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm10.chrom.sizes", ) + ucsc_mm39 = GenomeModel( + genome_alias="ucsc_mm10", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm39.chrom.sizes", + ) + ucsc_pantro6 = GenomeModel( genome_alias="ucsc_pantro6", chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_panTro6.chrom.sizes", @@ -117,6 +122,7 @@ def main(): all_genome_models.append(ucsc_hg38) all_genome_models.append(ncbi_hg38) all_genome_models.append(ucsc_mm10) + all_genome_models.append(ucsc_mm39) all_genome_models.append(ucsc_hg19) all_genome_models.append(ensembl_hg38) all_genome_models.append(ucsc_pantro6) From cce5dcce9c801c02a8a8a6dc26d0d68895ab4e8d Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:29:19 -0400 Subject: [PATCH 06/14] fix logic for q_and_not_m > 0: --- .../refgenome_validator/refgenomevalidator.py | 2 +- .../ref_genome_validating/validate_genome.py | 46 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py index ee8f0f0..4a253ba 100644 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ b/bedboss/refgenome_validator/refgenomevalidator.py @@ -101,7 +101,7 @@ def calculate_chrom_stats( jaccard_binary = q_and_m / (q_and_m + not_q_and_m + q_and_not_m) # What is our threshold for passing layer 1? - if q_and_not_m > 1: + if q_and_not_m > 0: passed_chrom_names = False # Calculate sensitivity for chrom names diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index 4e1ea6b..0eead1e 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -46,28 +46,28 @@ def main(): # Simple script to testing that the Validator objects is working correctly. # Set up Ref Genie - try: - ref_genie_config_path = os.environ["REFGENIE"] - except: - print("Ref genie environment variable not found") - # hard code for testing for now - ref_genie_config_path = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/genome_folder/genome_config.yaml" - - # Ensure refgenie assets are built based on the provided config module - rgc = refgenconf.RefGenConf(filepath=ref_genie_config_path) - rgc.pull( - genome="hg38", asset="fasta", tag="default", force=False - ) # GCA_000001405.15 GRCh38_no_alt_analysis_set from NCBI - # rgc.pull(genome="hg38_primary", asset="fasta", tag="default", force=False) # UCSC primary chromosomes only - rgc.pull( - genome="hg19", asset="fasta", tag="default", force=False - ) # GRCh37 reference sequence from UCSC - rgc.pull(genome="mm10", asset="fasta", tag="default", force=False) - # rgc.pull(genome="dm6", asset="fasta", tag="default", force=False) #the ncbi chromosomes - - genome_list = rgc.list() - - print(genome_list) + # try: + # ref_genie_config_path = os.environ["REFGENIE"] + # except: + # print("Ref genie environment variable not found") + # # hard code for testing for now + # ref_genie_config_path = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/genome_folder/genome_config.yaml" + # + # # Ensure refgenie assets are built based on the provided config module + # rgc = refgenconf.RefGenConf(filepath=ref_genie_config_path) + # rgc.pull( + # genome="hg38", asset="fasta", tag="default", force=False + # ) # GCA_000001405.15 GRCh38_no_alt_analysis_set from NCBI + # # rgc.pull(genome="hg38_primary", asset="fasta", tag="default", force=False) # UCSC primary chromosomes only + # rgc.pull( + # genome="hg19", asset="fasta", tag="default", force=False + # ) # GRCh37 reference sequence from UCSC + # rgc.pull(genome="mm10", asset="fasta", tag="default", force=False) + # # rgc.pull(genome="dm6", asset="fasta", tag="default", force=False) #the ncbi chromosomes + # + # genome_list = rgc.list() + # + # print(genome_list) # build genome models # for each reference genome in the user's config file, build a genome model @@ -110,7 +110,7 @@ def main(): ) ucsc_mm39 = GenomeModel( - genome_alias="ucsc_mm10", + genome_alias="ucsc_mm39", chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm39.chrom.sizes", ) From 867762985fa8945ed02ff8652a55336611fbe723 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:42:49 -0400 Subject: [PATCH 07/14] add mm39 to stats comparison --- scripts/ref_genome_validating/stats_compat_testing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/ref_genome_validating/stats_compat_testing.py b/scripts/ref_genome_validating/stats_compat_testing.py index 86e2048..1345a26 100644 --- a/scripts/ref_genome_validating/stats_compat_testing.py +++ b/scripts/ref_genome_validating/stats_compat_testing.py @@ -116,6 +116,7 @@ def extract_tier_ratings(df): ucsc_hg38_values = [] ensembl_hg38_values = [] ucsc_pantro6_values = [] + ucsc_mm39_values = [] for index, row in df.iterrows(): ( mm10_tier, @@ -125,6 +126,7 @@ def extract_tier_ratings(df): ucsc_hg38_tier, ucsc_pantro6_tier, ensembl_hg38_tier, + ucsc_mm39_tier, ) = extract_values(row["tier_rating"]) mm10_values.append(mm10_tier) ncbi_hg38_values.append(ncbi_hg38_tier) @@ -133,6 +135,7 @@ def extract_tier_ratings(df): ucsc_hg38_values.append(ucsc_hg38_tier) ensembl_hg38_values.append(ensembl_hg38_tier) ucsc_pantro6_values.append(ucsc_pantro6_tier) + ucsc_mm39_values.append(ucsc_mm39_tier) # df_tiers.rename(columns={0: 'tier_rating'}, inplace=True) # df_tiers['mm10_tier'] = None @@ -151,6 +154,7 @@ def extract_tier_ratings(df): df2["panTro6 UCSC"] = ucsc_pantro6_values df2["mm10 NCBI"] = mm10_values df2["dm6_UCSC"] = dm6_values + df2["ucsc_mm39"] = ucsc_mm39_values print(df2) df2.to_csv("/home/drc/Downloads/export_test.csv") @@ -174,6 +178,7 @@ def extract_values(dictionary): ucsc_hg38_tier = dictionary["ucsc_hg38"]["tier_ranking"] ensembl_hg38_tier = dictionary["ensembl_hg38"]["tier_ranking"] ucsc_pantro6_tier = dictionary["ucsc_pantro6"]["tier_ranking"] + ucsc_mm39_tier = dictionary["ucsc_mm39"]["tier_ranking"] return ( mm10_tier, @@ -183,6 +188,7 @@ def extract_values(dictionary): ucsc_hg38_tier, ucsc_pantro6_tier, ensembl_hg38_tier, + ucsc_mm39_tier, ) @@ -207,6 +213,7 @@ def create_heatmap(df): "Pan troglodytes (panTro6)", "Mus Musculus (mm10)", "Drosophila melanogaster (dm6)", + "Mus Musculus (mm39)", ] df.set_index(["sample_name", df.index], inplace=True) From 1b0c4efa884687fc8cdc0f3606f5c727718637b9 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 17 Sep 2024 16:05:48 -0400 Subject: [PATCH 08/14] fix geotfetch portion of validator script --- .../ref_genome_validating/validate_genome.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index 0eead1e..ad207f6 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -31,16 +31,16 @@ PEP_URL = os.environ["PEP_URL"] except: # if you wish to report results to pephub - # PEP_URL = "donaldcampbelljr/refgenome_compat_testing:default" + PEP_URL = "donaldcampbelljr/refgenome_compat_testing:default" # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" # PEP_URL ="donaldcampbelljr/ref_genome_dros_only:default" - PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_refactor:default" + # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_refactor:default" # Where to get Bedfiles? -LOCAL = True -GEOFETCH = False -# SPECIES = "homosapiens" -SPECIES = "fly" +LOCAL = False +GEOFETCH = True +SPECIES = "homosapiens" +# SPECIES = "fly" def main(): @@ -213,15 +213,19 @@ def main(): tier = { "tier_rating": {} } # add this to a column to make comparisons easier for human eyes on pephub + all_vals = {} if compat_vector: - for i in compat_vector: + for i in compat_vector.keys(): if i is not None: - for key, values in i.items(): - all_vals.update({key: values}) - if "compatibility" in values: - tier["tier_rating"].update( - {key: values["compatibility"]} - ) + all_vals.update( + {i: compat_vector[i].model_dump()} + ) + dict_to_check = compat_vector[i].model_dump() + if "compatibility" in dict_to_check: + tier["tier_rating"].update( + {i: dict_to_check["compatibility"]} + ) + all_vals.update(tier) # use pipestat to report to pephub and file backend From 171a685f5d32c7f94b04ac43d603d3e7c95d8dca Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 17 Sep 2024 16:29:09 -0400 Subject: [PATCH 09/14] add bedfile read exception --- bedboss/refgenome_validator/utils.py | 15 ++++++++++++++- scripts/ref_genome_validating/validate_genome.py | 9 +++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py index 68cb66e..e0c9d29 100644 --- a/bedboss/refgenome_validator/utils.py +++ b/bedboss/refgenome_validator/utils.py @@ -3,6 +3,7 @@ from geniml.io.utils import is_gzipped import logging +from bedboss.exceptions import BedBossException _LOGGER = logging.getLogger("bedboss") @@ -67,7 +68,7 @@ def _read_file_pd(*args, **kwargs) -> pd.DataFrame: except (pd.errors.ParserError, pd.errors.EmptyDataError) as _: if row_count <= max_rows: row_count += 1 - return _read_gzipped_file(*args) + raise BedfileReadException(reason="Cannot read bed file.") def get_bed_chrom_info(bedfile: str) -> dict: @@ -84,3 +85,15 @@ def get_bed_chrom_info(bedfile: str) -> dict: max_end_for_each_chrom = df.groupby(0)[2].max() return max_end_for_each_chrom.to_dict() + + +class BedfileReadException(BedBossException): + """Exception when there is an exception during refgenome validation""" + + def __init__(self, reason: str = ""): + """ + Optionally provide explanation for exceptional condition. + + :param str reason: some context why error occurred + """ + super(BedfileReadException, self).__init__(reason) diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index ad207f6..4e04517 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -25,7 +25,8 @@ # BEDFILE_DIRECTORY = ( # "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/results" # ) - BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/bed_small_subset" + # BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/bed_small_subset" + BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/test_singles" try: PEP_URL = os.environ["PEP_URL"] @@ -72,6 +73,11 @@ def main(): # build genome models # for each reference genome in the user's config file, build a genome model + # from geniml.io import RegionSet + # + # ff =RegionSet("/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/test_singles/GSM8196564_435_RUNX3_KO_H3K27AC_v_435_ctrl_IgG_seacr.relaxed.bed") + # ff + all_genome_models = [] # for reference_genome in rgc.list(): @@ -213,7 +219,6 @@ def main(): tier = { "tier_rating": {} } # add this to a column to make comparisons easier for human eyes on pephub - all_vals = {} if compat_vector: for i in compat_vector.keys(): if i is not None: From 1591178ffece0624e7a33d7ff52e1029d0e2b91c Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Tue, 17 Sep 2024 16:34:08 -0400 Subject: [PATCH 10/14] add valueerror exception --- bedboss/refgenome_validator/utils.py | 4 ++-- scripts/ref_genome_validating/validate_genome.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py index e0c9d29..502093f 100644 --- a/bedboss/refgenome_validator/utils.py +++ b/bedboss/refgenome_validator/utils.py @@ -54,14 +54,14 @@ def _read_file_pd(*args, **kwargs) -> pd.DataFrame: try: _ = int(row[1]) df[1] = pd.to_numeric(df[1]) - except Exception: + except ValueError: row_count += 1 break if isinstance(row[2], str): try: _ = int(row[2]) df[2] = pd.to_numeric(df[2]) - except Exception: + except ValueError: row_count += 1 break return df diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index 4e04517..65720b1 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -25,8 +25,8 @@ # BEDFILE_DIRECTORY = ( # "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/results" # ) - # BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/bed_small_subset" - BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/test_singles" + BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/bed_small_subset" + # BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/test_singles" try: PEP_URL = os.environ["PEP_URL"] @@ -38,8 +38,8 @@ # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_refactor:default" # Where to get Bedfiles? -LOCAL = False -GEOFETCH = True +LOCAL = True +GEOFETCH = False SPECIES = "homosapiens" # SPECIES = "fly" From 0490dd1671f1b938d67f922ee5018f7f5d2f25df Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 18 Sep 2024 15:34:33 -0400 Subject: [PATCH 11/14] moved opening bed file to geniml --- .../{refgenomevalidator.py => main.py} | 0 bedboss/refgenome_validator/utils.py | 92 ++----------------- requirements/requirements-all.txt | 4 +- test/test_ref_validator.py | 11 +-- 4 files changed, 13 insertions(+), 94 deletions(-) rename bedboss/refgenome_validator/{refgenomevalidator.py => main.py} (100%) diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/main.py similarity index 100% rename from bedboss/refgenome_validator/refgenomevalidator.py rename to bedboss/refgenome_validator/main.py diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py index 502093f..0eb4d2b 100644 --- a/bedboss/refgenome_validator/utils.py +++ b/bedboss/refgenome_validator/utils.py @@ -1,99 +1,21 @@ from typing import Union -import pandas as pd -from geniml.io.utils import is_gzipped +from geniml.io import RegionSet import logging -from bedboss.exceptions import BedBossException - _LOGGER = logging.getLogger("bedboss") -def _read_gzipped_file(file_path: str) -> pd.DataFrame: - """ - !! Copy from geniml! - Read a gzipped file into a pandas dataframe - - :param file_path: path to gzipped file - :return: pandas dataframe - """ - return _read_file_pd( - file_path, - sep="\t", - compression="gzip", - header=None, - engine="pyarrow", - ) - - -def _read_file_pd(*args, **kwargs) -> pd.DataFrame: - """ - !! Copy from geniml! - Read bed file into a pandas DataFrame, and skip header rows if needed - - :return: pandas dataframe +def get_bed_chrom_info(bedfile: Union[str, RegionSet]) -> dict: """ - max_rows = 5 - row_count = 0 - while row_count <= max_rows: - try: - df = pd.read_csv(*args, **kwargs, skiprows=row_count) - if row_count > 0: - _LOGGER.info( - f"Skipped {row_count} rows while standardization. File: '{args}'" - ) - df = df.dropna(axis=1) - for index, row in df.iterrows(): - if ( - isinstance(row[0], str) - and isinstance(row[1], int) - and isinstance(row[2], int) - ): - return df - else: - if isinstance(row[1], str): - try: - _ = int(row[1]) - df[1] = pd.to_numeric(df[1]) - except ValueError: - row_count += 1 - break - if isinstance(row[2], str): - try: - _ = int(row[2]) - df[2] = pd.to_numeric(df[2]) - except ValueError: - row_count += 1 - break - return df - except (pd.errors.ParserError, pd.errors.EmptyDataError) as _: - if row_count <= max_rows: - row_count += 1 - raise BedfileReadException(reason="Cannot read bed file.") + Open bed file and find all of the chromosomes and the max length of each. - -def get_bed_chrom_info(bedfile: str) -> dict: - """ - Attempt to open it and read it to find all of the chromosomes and the max length of each. - - :param bedfile: bedfilepath + :param bedfile: RegionSet object or path to bed file returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome. """ - if is_gzipped(bedfile): - df = _read_gzipped_file(bedfile) + if isinstance(bedfile, RegionSet): + df = bedfile.to_pandas() else: - df = _read_file_pd(bedfile, sep="\t", header=None, engine="pyarrow") + df = RegionSet(bedfile).to_pandas() max_end_for_each_chrom = df.groupby(0)[2].max() return max_end_for_each_chrom.to_dict() - - -class BedfileReadException(BedBossException): - """Exception when there is an exception during refgenome validation""" - - def __init__(self, reason: str = ""): - """ - Optionally provide explanation for exceptional condition. - - :param str reason: some context why error occurred - """ - super(BedfileReadException, self).__init__(reason) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 2fb0151..521a177 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,6 +1,6 @@ logmuse>=0.2.7 coloredlogs>=15.0.1 -peppy>=0.40.5 +peppy>=0.40.6 yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.0 @@ -10,4 +10,4 @@ refgenconf>=0.12.2 pandas>=2.0.0 ubiquerg>=0.6.2 pephubclient>=0.4.4 -geniml>=0.4.0 \ No newline at end of file +geniml>=0.4.1 \ No newline at end of file diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py index 9293534..bc5bd4f 100644 --- a/test/test_ref_validator.py +++ b/test/test_ref_validator.py @@ -9,12 +9,9 @@ def test_main(): - # ff = ReferenceValidator().determine_compatibility( - # FILE_PATH, - # concise=True, - # ) - ff = ReferenceValidator().determine_compatibility( - "/home/bnt4me/.bbcache/bedfiles/3/2/GSE244926_mm39_LPx6_oligofile.bed.gz", + dict_result = ReferenceValidator().determine_compatibility( + FILE_PATH, concise=True, ) - ff + + assert dict_result From 36ddbef53b01e71b96f8690cc65c2fa4e33724c2 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Wed, 18 Sep 2024 16:24:08 -0400 Subject: [PATCH 12/14] fix refactoring to main from refgenomevalidator.py --- bedboss/refgenome_validator/__init__.py | 2 +- scripts/ref_genome_validating/validate_genome.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bedboss/refgenome_validator/__init__.py b/bedboss/refgenome_validator/__init__.py index 3bcafd3..12d8d5c 100644 --- a/bedboss/refgenome_validator/__init__.py +++ b/bedboss/refgenome_validator/__init__.py @@ -1,2 +1,2 @@ from bedboss.refgenome_validator.genome_model import GenomeModel -from bedboss.refgenome_validator.refgenomevalidator import ReferenceValidator +from bedboss.refgenome_validator.main import ReferenceValidator diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index 65720b1..cec06be 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -4,7 +4,7 @@ import refgenconf from pipestat import pipestat -from bedboss.refgenome_validator.refgenomevalidator import ( +from bedboss.refgenome_validator.main import ( ReferenceValidator, GenomeModel, ) From cbe02f4ce8d909be3dd91f96b8a7dccb4210bc81 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 19 Sep 2024 12:01:15 -0400 Subject: [PATCH 13/14] cleaning docstrings --- bedboss/refgenome_validator/main.py | 169 +++++++++++---------------- bedboss/refgenome_validator/utils.py | 50 +++++++- 2 files changed, 116 insertions(+), 103 deletions(-) diff --git a/bedboss/refgenome_validator/main.py b/bedboss/refgenome_validator/main.py index 4a253ba..35d9ce1 100644 --- a/bedboss/refgenome_validator/main.py +++ b/bedboss/refgenome_validator/main.py @@ -1,6 +1,5 @@ from typing import Optional, Union, List, Dict import os -import subprocess from bedboss.exceptions import ValidatorException from bedboss.refgenome_validator.genome_model import GenomeModel @@ -12,19 +11,19 @@ RatingModel, CompatibilityConcise, ) -from bedboss.refgenome_validator.utils import get_bed_chrom_info +from bedboss.refgenome_validator.utils import ( + get_bed_chrom_info, + run_igd_command, + parse_IGD_output, +) +import logging -try: - IGD_LOCATION = os.environ["IGD_LOCATION"] -except: - # Local installation of C version of IGD - IGD_LOCATION = f"/home/drc/GITHUB/igd/IGD/bin/igd" +_LOGGER = logging.getLogger("bedboss") class ReferenceValidator: """ - This is primary class for creating a compatibility vector - An object of this class is to be created once and then used for the entirety of a pipeline,e.g. Bedboss. + Primary class for creating a compatibility dict """ def __init__( @@ -35,8 +34,9 @@ def __init__( """ Initialization method - :param genome_models: this is a list of GenomeModels that will be checked against a bed file - :param igd_path: path to a local IGD file containing ALL excluded ranges intervals for IGD overlap assessment, if not provided these metrics are not computed. + :param genome_models: this is a list of GenomeModels that will be checked against a bed file. Default: None + :param igd_path: path to a local IGD file containing ALL excluded ranges intervals for IGD overlap assessment, + if not provided these metrics are not computed. Default: None """ if not genome_models: @@ -56,15 +56,13 @@ def calculate_chrom_stats( bed_chrom_sizes: dict, genome_chrom_sizes: dict ) -> CompatibilityStats: """ - Given two dicts of chroms (key) and their sizes (values) - determine overlap and sequence fit - - Calculates Stats associated with comparison of chrom names, chrom lengths, and sequence fits + Calculate overlap and sequence fit. + Stats are associated with comparison of chrom names, chrom lengths, and sequence fits. - :param dict bed_chrom_sizes: dict of a bedfile's chrom size - :param dict genome_chrom_sizes: dict of a GenomeModel's chrom sizes + :param bed_chrom_sizes: dict of a bedfile's chrom size + :param genome_chrom_sizes: dict of a GenomeModel's chrom sizes - return dict: returns a dictionary with information on Query vs Model, e.g. chrom names QueryvsModel + :return: dictionary with information on Query vs Model, e.g. chrom names QueryvsModel """ # Layer 1: Check names and Determine XS (Extra Sequences) via Calculation of Recall/Sensitivity @@ -105,10 +103,9 @@ def calculate_chrom_stats( passed_chrom_names = False # Calculate sensitivity for chrom names - # defined as XS -> Extra Sequences + # XS -> Extra Sequences sensitivity = q_and_m / (q_and_m + q_and_not_m) - # Assign Stats name_stats = ChromNameStats( xs=sensitivity, q_and_m=q_and_m, @@ -134,7 +131,7 @@ def calculate_chrom_stats( num_chrom_within_bounds += 1 # Calculate recall/sensitivity for chrom lengths - # defined as OOBR -> Out of Bounds Range + # OOBR -> Out of Bounds Range sensitivity = num_chrom_within_bounds / ( num_chrom_within_bounds + num_of_chrom_beyond ) @@ -175,12 +172,15 @@ def calculate_chrom_stats( def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, None]]: """ - This is the third layer compatibility check. - It runs helper functions to execute an igd search query across an Integrated Genome Database. + Third layer compatibility check. + Run helper functions and execute an igd search query across an Integrated Genome Database. - It returns a dict of dicts containing keys (file names) and values (number of overlaps). Or if no overlaps are found, + :param bedfile: path to the bedfile + :return: dict of dicts containing keys (file names) and values (number of overlaps). Or if no overlaps are found, it returns an empty dict. + # TODO: should be a pydantic model + Currently for this function to work, the user must install the C version of IGD and have created a local igd file for the Excluded Ranges Bedset: https://github.com/databio/IGD @@ -189,13 +189,18 @@ def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, Non """ if not self.igd_path: return {"igd_stats": None} + + try: + IGD_LOCATION = os.environ["IGD_LOCATION"] + except: + # Local installation of C version of IGD + IGD_LOCATION = f"/home/drc/GITHUB/igd/IGD/bin/igd" + # Construct an IGD command to run as subprocess igd_command = IGD_LOCATION + f" search {self.igd_path} -q {bedfile}" returned_stdout = run_igd_command(igd_command) - # print(f"DEBUG: {returned_stdout}") - if not returned_stdout: return {"igd_stats": None} @@ -220,28 +225,24 @@ def determine_compatibility( concise: Optional[bool] = False, ) -> Union[Dict[str, CompatibilityStats], Dict[str, CompatibilityConcise]]: """ - Given a bedfile, determine compatibility with reference genomes (GenomeModels) created at Validator initialization. + Determine compatibility of the bed file. - :param str bedfile: path to bedfile on disk, .bed - :param list[str] ref_filter: list of ref genome aliases to filter on. - :param bool concise: if True, only return a concise list of compatibility stats - :return list[dict]: a list of dictionaries where each element of the array represents a compatibility dictionary - for each refgenome model. + :param bedfile: path to bedfile + :param ref_filter: list of ref genome aliases to filter on. + :param concise: if True, only return a concise list of compatibility stats. Default: False + :return: a dict with CompatibilityStats, or CompatibilityConcise model (depends if concise is set to True) """ if ref_filter: - # Before proceeding filter out unwanted reference genomes to assess + # Filter out unwanted reference genomes to assess for genome_model in self.genome_models: if genome_model.genome_alias in ref_filter: self.genome_models.remove(genome_model) - bed_chrom_info = get_bed_chrom_info( - bedfile - ) # for this bed file determine the chromosome lengths + bed_chrom_info = get_bed_chrom_info(bedfile) if not bed_chrom_info: - # if there is trouble parsing the bed file, return None - raise ValidatorException + raise ValidatorException("Incorrect bed file provided") model_compat_stats = {} @@ -250,7 +251,8 @@ def determine_compatibility( model_compat_stats[genome_model.genome_alias]: CompatibilityStats = ( self.calculate_chrom_stats(bed_chrom_info, genome_model.chrom_sizes) ) - # Fourth Layer is to run IGD but only if layer 1 and layer 2 have passed + + # Third layer - IGD, only if layer 1 and layer 2 have passed if ( model_compat_stats[ genome_model.genome_alias @@ -263,7 +265,7 @@ def determine_compatibility( self.get_igd_overlaps(bedfile) ) - # Once all stats are collected, process them and add compatibility rating + # Calculate compatibility rating model_compat_stats[genome_model.genome_alias].compatibility = ( self.calculate_rating(model_compat_stats[genome_model.genome_alias]) ) @@ -279,22 +281,24 @@ def determine_compatibility( def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: """ - Given compatibility stats for a specific ref genome determine the compatibility tier - - Tiers Definition ---- - - Tier1: Excellent compatibility, 0 pts - Tier2: Good compatibility, may need some processing, 1-3 pts - Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts - Tier4: Poor compatibility, 7-9 pts - - :param CompatibilityStats compat_stats: dicitionary containing unprocessed compat stats - :return dict : containing both the original stats and the final compatibility rating for the reference genome + Determine the compatibility tier + + Tiers: + - Tier1: Excellent compatibility, 0 pts + - Tier2: Good compatibility, may need some processing, 1-3 pts + - Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts + - Tier4: Poor compatibility, 7-9 pts + + :param compat_stats: CompatibilityStats with unprocess compatibility statistics + :return: RatingModel - { + assigned_points: int + tier_ranking: int + } """ points_rating = 0 - # 1. Check extra sequences sensitivity and assign points based on how sensitive the outcome is + # 1. Check extra sequences sensitivity. # sensitivity = 1 is considered great and no points should be assigned xs = compat_stats.chrom_name_stats.xs if xs < 0.3: @@ -325,7 +329,7 @@ def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: # Do nothing here, points have already been added when Assessing XS if it is not == 1 pass - # Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome + # 3. Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome sequence_fit = compat_stats.chrom_sequence_fit_stats.sequence_fit if sequence_fit: # since this is only on keys present in both, ratio should always be less than 1 @@ -344,7 +348,7 @@ def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: # Run analysis on igd_stats # WIP, currently only showing IGD stats for informational purposes if compat_stats.igd_stats and compat_stats.igd_stats != {}: - self.process_igd_stats(compat_stats.igd_stats) + self._process_igd_stats(compat_stats.igd_stats) tier_ranking = 0 if points_rating == 0: @@ -356,23 +360,23 @@ def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: elif 7 <= points_rating: tier_ranking = 4 else: - print( + _LOGGER.info( f"Catching points discrepancy,points = {points_rating}, assigning to Tier 4" ) tier_ranking = 4 return RatingModel(assigned_points=points_rating, tier_ranking=tier_ranking) - def process_igd_stats(self, igd_stats: dict): + def _process_igd_stats(self, igd_stats: dict): """ Placeholder to process IGD Stats and determine if it should impact tier rating """ - pass + ... @staticmethod def _build_default_models() -> list[GenomeModel]: """ - Builds a default list of GenomeModels from the chrom.sizes folder. + Build a default list of GenomeModels from the chrom.sizes folder. Uses file names as genome alias. return list[GenomeModel] @@ -394,6 +398,12 @@ def _build_default_models() -> list[GenomeModel]: @staticmethod def _create_concise_output(output: CompatibilityStats) -> CompatibilityConcise: + """ + Convert extended CompatibilityStats to concise output + + :param output: full compatibility stats + :return: concise compatibility stats + """ return CompatibilityConcise( xs=output.chrom_name_stats.xs, oobr=output.chrom_length_stats.oobr, @@ -401,46 +411,3 @@ def _create_concise_output(output: CompatibilityStats) -> CompatibilityConcise: assigned_points=output.compatibility.assigned_points, tier_ranking=output.compatibility.tier_ranking, ) - - -def run_igd_command(command): - """Run IGD via a subprocess, this is a temp implementation until Rust IGD python bindings are finished.""" - - result = subprocess.run(command, shell=True, capture_output=True, text=True) - if result.returncode == 0: - return result.stdout - else: - print(f"Error running command: {result.stderr}") - return None - - -def parse_IGD_output(output_str) -> Union[None, List[dict]]: - """ - Parses IGD terminal output into a list of dicts - Args: - output_str: The output string from IGD - - Returns: - A list of dictionaries, where each dictionary represents a record. - """ - - try: - lines = output_str.splitlines() - data = [] - for line in lines: - if line.startswith("index"): - continue # Skip the header line - elif line.startswith("Total"): - break # Stop parsing after the "Total" line - else: - fields = line.split() - record = { - "index": int(fields[0]), - "number_of_regions": int(fields[1]), - "number_of_hits": int(fields[2]), - "file_name": fields[3], - } - data.append(record) - return data - except Exception: - return None diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py index 0eb4d2b..2a84934 100644 --- a/bedboss/refgenome_validator/utils.py +++ b/bedboss/refgenome_validator/utils.py @@ -1,5 +1,6 @@ -from typing import Union +from typing import Union, List from geniml.io import RegionSet +import subprocess import logging _LOGGER = logging.getLogger("bedboss") @@ -7,7 +8,7 @@ def get_bed_chrom_info(bedfile: Union[str, RegionSet]) -> dict: """ - Open bed file and find all of the chromosomes and the max length of each. + Determine chrom lengths for bed file :param bedfile: RegionSet object or path to bed file returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome. @@ -19,3 +20,48 @@ def get_bed_chrom_info(bedfile: Union[str, RegionSet]) -> dict: max_end_for_each_chrom = df.groupby(0)[2].max() return max_end_for_each_chrom.to_dict() + + +def run_igd_command(command): + """ + Run IGD via a subprocess, this is a temp implementation until Rust IGD python bindings are finished. + """ + + result = subprocess.run(command, shell=True, capture_output=True, text=True) + if result.returncode == 0: + return result.stdout + else: + _LOGGER.error(f"Error running command: {result.stderr}") + return None + + +def parse_IGD_output(output_str) -> Union[None, List[dict]]: + """ + Parses IGD terminal output into a list of dicts + Args: + output_str: The output string from IGD + + Returns: + A list of dictionaries, where each dictionary represents a record. + """ + + try: + lines = output_str.splitlines() + data = [] + for line in lines: + if line.startswith("index"): + continue # Skip the header line + elif line.startswith("Total"): + break # Stop parsing after the "Total" line + else: + fields = line.split() + record = { + "index": int(fields[0]), + "number_of_regions": int(fields[1]), + "number_of_hits": int(fields[2]), + "file_name": fields[3], + } + data.append(record) + return data + except Exception: + return None From 50f65e0263842346f0afef5fdbd00d35eb72fec9 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 19 Sep 2024 12:06:26 -0400 Subject: [PATCH 14/14] version and typos --- bedboss/_version.py | 2 +- bedboss/bedbuncher/tools/bedsetStat.R | 2 +- bedboss/bedmaker/utils.py | 2 +- bedboss/bedstat/tools/regionstat.R | 2 +- bedboss/cli.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bedboss/_version.py b/bedboss/_version.py index 6a9beea..3d18726 100644 --- a/bedboss/_version.py +++ b/bedboss/_version.py @@ -1 +1 @@ -__version__ = "0.4.0" +__version__ = "0.5.0" diff --git a/bedboss/bedbuncher/tools/bedsetStat.R b/bedboss/bedbuncher/tools/bedsetStat.R index db99061..c37b913 100755 --- a/bedboss/bedbuncher/tools/bedsetStat.R +++ b/bedboss/bedbuncher/tools/bedsetStat.R @@ -67,7 +67,7 @@ if (is.null(opt$json)) { #' #' Calculates how many regionsets (bedfiles) overlap at least said percentage #' of regions included in the universe. The universe is considered a union of -#' all regionsets (bedfiles) in the colection of +#' all regionsets (bedfiles) in the collection of #' regionsets (bedset, or set of bedfiles) #' #' @param queryList GRangesList object with regionsets to be considered diff --git a/bedboss/bedmaker/utils.py b/bedboss/bedmaker/utils.py index 2cb8469..6a4a538 100644 --- a/bedboss/bedmaker/utils.py +++ b/bedboss/bedmaker/utils.py @@ -22,7 +22,7 @@ def get_chrom_sizes(genome: str, rfg_config: Union[str, Path]) -> str: """ _LOGGER.info("Determining path to chrom.sizes asset via Refgenie.") - # initalizing refginie config file + # initializing refginie config file rgc = get_rgc(rfg_config=rfg_config) try: diff --git a/bedboss/bedstat/tools/regionstat.R b/bedboss/bedstat/tools/regionstat.R index c42c6bf..26de673 100644 --- a/bedboss/bedstat/tools/regionstat.R +++ b/bedboss/bedstat/tools/regionstat.R @@ -314,7 +314,7 @@ doItAall <- function(query, fileId, genome, cellMatrix) { ) } - # QThist plot + # QThis plot if (exists("bedmeta")){ if ("mean_region_width" %in% names(bedmeta)){ run_plot = FALSE diff --git a/bedboss/cli.py b/bedboss/cli.py index 82a07b6..3cf2843 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -552,7 +552,7 @@ def convert_universe( def check_requirements(): from bedboss.bedboss import requirements_check - print("Checking piplines requirements...") + print("Checking pipelines requirements...") requirements_check()