diff --git a/bedboss/_version.py b/bedboss/_version.py index 6a9beea..3d18726 100644 --- a/bedboss/_version.py +++ b/bedboss/_version.py @@ -1 +1 @@ -__version__ = "0.4.0" +__version__ = "0.5.0" diff --git a/bedboss/bedbuncher/tools/bedsetStat.R b/bedboss/bedbuncher/tools/bedsetStat.R index db99061..c37b913 100755 --- a/bedboss/bedbuncher/tools/bedsetStat.R +++ b/bedboss/bedbuncher/tools/bedsetStat.R @@ -67,7 +67,7 @@ if (is.null(opt$json)) { #' #' Calculates how many regionsets (bedfiles) overlap at least said percentage #' of regions included in the universe. The universe is considered a union of -#' all regionsets (bedfiles) in the colection of +#' all regionsets (bedfiles) in the collection of #' regionsets (bedset, or set of bedfiles) #' #' @param queryList GRangesList object with regionsets to be considered diff --git a/bedboss/bedmaker/utils.py b/bedboss/bedmaker/utils.py index 2cb8469..6a4a538 100644 --- a/bedboss/bedmaker/utils.py +++ b/bedboss/bedmaker/utils.py @@ -22,7 +22,7 @@ def get_chrom_sizes(genome: str, rfg_config: Union[str, Path]) -> str: """ _LOGGER.info("Determining path to chrom.sizes asset via Refgenie.") - # initalizing refginie config file + # initializing refginie config file rgc = get_rgc(rfg_config=rfg_config) try: diff --git a/bedboss/bedstat/tools/regionstat.R b/bedboss/bedstat/tools/regionstat.R index c42c6bf..26de673 100644 --- a/bedboss/bedstat/tools/regionstat.R +++ b/bedboss/bedstat/tools/regionstat.R @@ -314,7 +314,7 @@ doItAall <- function(query, fileId, genome, cellMatrix) { ) } - # QThist plot + # QThis plot if (exists("bedmeta")){ if ("mean_region_width" %in% names(bedmeta)){ run_plot = FALSE diff --git a/bedboss/cli.py b/bedboss/cli.py index 82a07b6..3cf2843 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -552,7 +552,7 @@ def convert_universe( def check_requirements(): from bedboss.bedboss import requirements_check - print("Checking piplines requirements...") + print("Checking pipelines requirements...") requirements_check() diff --git a/bedboss/refgenome_validator/__init__.py b/bedboss/refgenome_validator/__init__.py index 24f8c10..12d8d5c 100644 --- a/bedboss/refgenome_validator/__init__.py +++ b/bedboss/refgenome_validator/__init__.py @@ -1,4 +1,2 @@ from bedboss.refgenome_validator.genome_model import GenomeModel -from bedboss.refgenome_validator.refgenomevalidator import RefValidator - -# __all__ = ["GenomeModel"] +from bedboss.refgenome_validator.main import ReferenceValidator diff --git a/bedboss/refgenome_validator/main.py b/bedboss/refgenome_validator/main.py new file mode 100644 index 0000000..35d9ce1 --- /dev/null +++ b/bedboss/refgenome_validator/main.py @@ -0,0 +1,413 @@ +from typing import Optional, Union, List, Dict +import os + +from bedboss.exceptions import ValidatorException +from bedboss.refgenome_validator.genome_model import GenomeModel +from bedboss.refgenome_validator.models import ( + ChromNameStats, + ChromLengthStats, + SequenceFitStats, + CompatibilityStats, + RatingModel, + CompatibilityConcise, +) +from bedboss.refgenome_validator.utils import ( + get_bed_chrom_info, + run_igd_command, + parse_IGD_output, +) +import logging + +_LOGGER = logging.getLogger("bedboss") + + +class ReferenceValidator: + """ + Primary class for creating a compatibility dict + """ + + def __init__( + self, + genome_models: Optional[List[GenomeModel]] = None, + igd_path: Optional[str] = None, + ): + """ + Initialization method + + :param genome_models: this is a list of GenomeModels that will be checked against a bed file. Default: None + :param igd_path: path to a local IGD file containing ALL excluded ranges intervals for IGD overlap assessment, + if not provided these metrics are not computed. Default: None + """ + + if not genome_models: + genome_models = self._build_default_models() + elif isinstance(genome_models, str): + genome_models = list(genome_models) + elif not isinstance(genome_models, list): + raise ValidatorException( + reason="A list of GenomeModels must be provided to initialize the Validator class" + ) + + self.genome_models: List[GenomeModel] = genome_models + self.igd_path = igd_path + + @staticmethod + def calculate_chrom_stats( + bed_chrom_sizes: dict, genome_chrom_sizes: dict + ) -> CompatibilityStats: + """ + Calculate overlap and sequence fit. + Stats are associated with comparison of chrom names, chrom lengths, and sequence fits. + + :param bed_chrom_sizes: dict of a bedfile's chrom size + :param genome_chrom_sizes: dict of a GenomeModel's chrom sizes + + :return: dictionary with information on Query vs Model, e.g. chrom names QueryvsModel + """ + + # Layer 1: Check names and Determine XS (Extra Sequences) via Calculation of Recall/Sensitivity + # Q = Query, M = Model + q_and_m = 0 # how many chrom names are in the query and the genome model? + q_and_not_m = ( + 0 # how many chrom names are in the query but not in the genome model? + ) + not_q_and_m = ( + 0 # how many chrom names are not in the query but are in the genome model? + ) + passed_chrom_names = True # does this bed file pass the first layer of testing? + + query_keys_present = [] # These keys are used for seq fit calculation + for key in list(bed_chrom_sizes.keys()): + if key not in genome_chrom_sizes: + q_and_not_m += 1 + if key in genome_chrom_sizes: + q_and_m += 1 + query_keys_present.append(key) + for key in list(genome_chrom_sizes.keys()): + if key not in bed_chrom_sizes: + not_q_and_m += 1 + + # Calculate the Jaccard Index for Chrom Names + bed_chrom_set = set(list(bed_chrom_sizes.keys())) + genome_chrom_set = set(list(genome_chrom_sizes.keys())) + chrom_intersection = bed_chrom_set.intersection(genome_chrom_set) + chrom_union = bed_chrom_set.union(chrom_intersection) + chrom_jaccard_index = len(chrom_intersection) / len(chrom_union) + + # Alternative Method for Calculating Jaccard_index for binary classification + # JI = TP/(TP+FP+FN) + jaccard_binary = q_and_m / (q_and_m + not_q_and_m + q_and_not_m) + + # What is our threshold for passing layer 1? + if q_and_not_m > 0: + passed_chrom_names = False + + # Calculate sensitivity for chrom names + # XS -> Extra Sequences + sensitivity = q_and_m / (q_and_m + q_and_not_m) + + name_stats = ChromNameStats( + xs=sensitivity, + q_and_m=q_and_m, + q_and_not_m=q_and_not_m, + not_q_and_m=not_q_and_m, + jaccard_index=chrom_jaccard_index, + jaccard_index_binary=jaccard_binary, + passed_chrom_names=passed_chrom_names, + ) + + # Layer 2: Check Lengths, but only if layer 1 is passing + if passed_chrom_names: + chroms_beyond_range = False + num_of_chrom_beyond = 0 + num_chrom_within_bounds = 0 + + for key in list(bed_chrom_sizes.keys()): + if key in genome_chrom_sizes: + if bed_chrom_sizes[key] > genome_chrom_sizes[key]: + num_of_chrom_beyond += 1 + chroms_beyond_range = True + else: + num_chrom_within_bounds += 1 + + # Calculate recall/sensitivity for chrom lengths + # OOBR -> Out of Bounds Range + sensitivity = num_chrom_within_bounds / ( + num_chrom_within_bounds + num_of_chrom_beyond + ) + length_stats = ChromLengthStats( + oobr=sensitivity, + beyond_range=chroms_beyond_range, + num_of_chrom_beyond=num_of_chrom_beyond, + percentage_bed_chrom_beyond=( + 100 * num_of_chrom_beyond / len(bed_chrom_set) + ), + percentage_genome_chrom_beyond=( + 100 * num_of_chrom_beyond / len(genome_chrom_set) + ), + ) + + else: + length_stats = ChromLengthStats() + + # Layer 3 Calculate Sequence Fit if any query chrom names were present + if len(query_keys_present) > 0: + bed_sum = 0 + ref_genome_sum = 0 + for q_chr in query_keys_present: + bed_sum += int(genome_chrom_sizes[q_chr]) + for g_chr in genome_chrom_sizes: + ref_genome_sum += int(genome_chrom_sizes[g_chr]) + + seq_fit_stats = SequenceFitStats(sequence_fit=bed_sum / ref_genome_sum) + + else: + seq_fit_stats = SequenceFitStats(sequence_fit=None) + + return CompatibilityStats( + chrom_name_stats=name_stats, + chrom_length_stats=length_stats, + chrom_sequence_fit_stats=seq_fit_stats, + ) + + def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, None]]: + """ + Third layer compatibility check. + Run helper functions and execute an igd search query across an Integrated Genome Database. + + :param bedfile: path to the bedfile + :return: dict of dicts containing keys (file names) and values (number of overlaps). Or if no overlaps are found, + it returns an empty dict. + + # TODO: should be a pydantic model + + Currently for this function to work, the user must install the C version of IGD and have created a local igd file + for the Excluded Ranges Bedset: + https://github.com/databio/IGD + https://bedbase.org/bedset/excluderanges + + """ + if not self.igd_path: + return {"igd_stats": None} + + try: + IGD_LOCATION = os.environ["IGD_LOCATION"] + except: + # Local installation of C version of IGD + IGD_LOCATION = f"/home/drc/GITHUB/igd/IGD/bin/igd" + + # Construct an IGD command to run as subprocess + igd_command = IGD_LOCATION + f" search {self.igd_path} -q {bedfile}" + + returned_stdout = run_igd_command(igd_command) + + if not returned_stdout: + return {"igd_stats": None} + + igd_overlap_data = parse_IGD_output(returned_stdout) + + if not igd_overlap_data: + return { + "igd_stats": {} + } # None tells us if the bed file never made it to layer 3 or perhaps igd errord, empty dict tells us that there were no overlaps found + else: + overlaps_dict = {} + for datum in igd_overlap_data: + if "file_name" in datum and "number_of_hits" in datum: + overlaps_dict.update({datum["file_name"]: datum["number_of_hits"]}) + + return overlaps_dict + + def determine_compatibility( + self, + bedfile: str, + ref_filter: Optional[List[str]] = None, + concise: Optional[bool] = False, + ) -> Union[Dict[str, CompatibilityStats], Dict[str, CompatibilityConcise]]: + """ + Determine compatibility of the bed file. + + :param bedfile: path to bedfile + :param ref_filter: list of ref genome aliases to filter on. + :param concise: if True, only return a concise list of compatibility stats. Default: False + :return: a dict with CompatibilityStats, or CompatibilityConcise model (depends if concise is set to True) + """ + + if ref_filter: + # Filter out unwanted reference genomes to assess + for genome_model in self.genome_models: + if genome_model.genome_alias in ref_filter: + self.genome_models.remove(genome_model) + + bed_chrom_info = get_bed_chrom_info(bedfile) + + if not bed_chrom_info: + raise ValidatorException("Incorrect bed file provided") + + model_compat_stats = {} + + for genome_model in self.genome_models: + # First and Second Layer of Compatibility + model_compat_stats[genome_model.genome_alias]: CompatibilityStats = ( + self.calculate_chrom_stats(bed_chrom_info, genome_model.chrom_sizes) + ) + + # Third layer - IGD, only if layer 1 and layer 2 have passed + if ( + model_compat_stats[ + genome_model.genome_alias + ].chrom_name_stats.passed_chrom_names + and not model_compat_stats[ + genome_model.genome_alias + ].chrom_length_stats.beyond_range + ): + model_compat_stats[genome_model.genome_alias].igd_stats = ( + self.get_igd_overlaps(bedfile) + ) + + # Calculate compatibility rating + model_compat_stats[genome_model.genome_alias].compatibility = ( + self.calculate_rating(model_compat_stats[genome_model.genome_alias]) + ) + + if concise: + concise_dict = {} + for name, stats in model_compat_stats.items(): + concise_dict[name] = self._create_concise_output(stats) + + return concise_dict + + return model_compat_stats + + def calculate_rating(self, compat_stats: CompatibilityStats) -> RatingModel: + """ + Determine the compatibility tier + + Tiers: + - Tier1: Excellent compatibility, 0 pts + - Tier2: Good compatibility, may need some processing, 1-3 pts + - Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts + - Tier4: Poor compatibility, 7-9 pts + + :param compat_stats: CompatibilityStats with unprocess compatibility statistics + :return: RatingModel - { + assigned_points: int + tier_ranking: int + } + """ + + points_rating = 0 + + # 1. Check extra sequences sensitivity. + # sensitivity = 1 is considered great and no points should be assigned + xs = compat_stats.chrom_name_stats.xs + if xs < 0.3: + points_rating += 6 # 3 + 1 + 1 + 1 + elif xs < 0.5: + points_rating += 5 # 3 + 1 + 1 + elif xs < 0.7: + points_rating += 4 # 3 + 1 + elif xs < 1: + points_rating += 3 + else: + pass + + # 2. Check OOBR and assign points based on sensitivity + # only assessed if no extra chroms in query bed file + if compat_stats.chrom_name_stats.passed_chrom_names: + oobr = compat_stats.chrom_length_stats.oobr + + if oobr < 0.3: + points_rating += 6 # 3 + 1 + 1 + 1 + elif oobr < 0.5: + points_rating += 5 # 3 + 1 + 1 + elif oobr < 0.7: + points_rating += 4 # 3 + 1 + elif oobr < 1: + points_rating += 3 + else: + # Do nothing here, points have already been added when Assessing XS if it is not == 1 + pass + + # 3. Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome + sequence_fit = compat_stats.chrom_sequence_fit_stats.sequence_fit + if sequence_fit: + # since this is only on keys present in both, ratio should always be less than 1 + # Should files be penalized here or actually awarded but only if the fit is really good? + if sequence_fit < 0.90: + points_rating += 1 + if sequence_fit < 0.60: + points_rating += 1 + if sequence_fit < 0.60: + points_rating += 1 + + else: + # if no chrom names were found during assessment + points_rating += 4 + + # Run analysis on igd_stats + # WIP, currently only showing IGD stats for informational purposes + if compat_stats.igd_stats and compat_stats.igd_stats != {}: + self._process_igd_stats(compat_stats.igd_stats) + + tier_ranking = 0 + if points_rating == 0: + tier_ranking = 1 + elif 1 <= points_rating <= 3: + tier_ranking = 2 + elif 4 <= points_rating <= 6: + tier_ranking = 3 + elif 7 <= points_rating: + tier_ranking = 4 + else: + _LOGGER.info( + f"Catching points discrepancy,points = {points_rating}, assigning to Tier 4" + ) + tier_ranking = 4 + + return RatingModel(assigned_points=points_rating, tier_ranking=tier_ranking) + + def _process_igd_stats(self, igd_stats: dict): + """ + Placeholder to process IGD Stats and determine if it should impact tier rating + """ + ... + + @staticmethod + def _build_default_models() -> list[GenomeModel]: + """ + Build a default list of GenomeModels from the chrom.sizes folder. + Uses file names as genome alias. + + return list[GenomeModel] + """ + dir_path = os.path.dirname(os.path.realpath(__file__)) + + chrm_sizes_directory = os.path.join(dir_path, "chrom_sizes") + + all_genome_models = [] + for root, dirs, files in os.walk(chrm_sizes_directory): + for file in files: + if file.endswith(".sizes"): + curr_genome_model = GenomeModel( + genome_alias=file, chrom_sizes_file=os.path.join(root, file) + ) + all_genome_models.append(curr_genome_model) + + return all_genome_models + + @staticmethod + def _create_concise_output(output: CompatibilityStats) -> CompatibilityConcise: + """ + Convert extended CompatibilityStats to concise output + + :param output: full compatibility stats + :return: concise compatibility stats + """ + return CompatibilityConcise( + xs=output.chrom_name_stats.xs, + oobr=output.chrom_length_stats.oobr, + sequence_fit=output.chrom_sequence_fit_stats.sequence_fit, + assigned_points=output.compatibility.assigned_points, + tier_ranking=output.compatibility.tier_ranking, + ) diff --git a/bedboss/refgenome_validator/models.py b/bedboss/refgenome_validator/models.py new file mode 100644 index 0000000..f852860 --- /dev/null +++ b/bedboss/refgenome_validator/models.py @@ -0,0 +1,48 @@ +from pydantic import BaseModel, ConfigDict +from typing import Union + + +class ChromNameStats(BaseModel): + xs: float = 0.0 + q_and_m: float = 0.0 + q_and_not_m: float = 0.0 + not_q_and_m: float = 0.0 + jaccard_index: float = 0.0 + jaccard_index_binary: float = 0.0 + passed_chrom_names: bool = False + + +class ChromLengthStats(BaseModel): + oobr: Union[float, None] = None + beyond_range: bool = False + num_of_chrom_beyond: int = 0 + percentage_bed_chrom_beyond: float = 0.0 + percentage_genome_chrom_beyond: float = 0.0 + + +class SequenceFitStats(BaseModel): + sequence_fit: Union[float, None] = None + + +class RatingModel(BaseModel): + assigned_points: int + tier_ranking: int + # model_config = ConfigDict(extra="forbid") + + +class CompatibilityStats(BaseModel): + chrom_name_stats: ChromNameStats + chrom_length_stats: ChromLengthStats + chrom_sequence_fit_stats: SequenceFitStats + igd_stats: Union[dict, None] = None + compatibility: Union[RatingModel, None] = None + + model_config = ConfigDict(extra="forbid") + + +class CompatibilityConcise(BaseModel): + xs: float = 0.0 + oobr: Union[float, None] = None + sequence_fit: Union[float, None] = None + assigned_points: int + tier_ranking: int diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py deleted file mode 100644 index 55db9df..0000000 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ /dev/null @@ -1,480 +0,0 @@ -from typing import Optional, Union, List -import os -import pandas as pd -import subprocess - -from bedboss.exceptions import ValidatorException -from bedboss.refgenome_validator import GenomeModel - -try: - IGD_LOCATION = os.environ["IGD_LOCATION"] -except: - # Local installation of C version of IGD - IGD_LOCATION = f"/home/drc/GITHUB/igd/IGD/bin/igd" - - -class RefValidator: - """ - - This is primary class for creating a compatibility vector - An object of this class is to be created once and then used for the entirety of a pipeline,e.g. Bedboss. - - """ - - def __init__( - self, - genome_models: Optional[List[GenomeModel]] = None, - igd_path: Optional[str] = None, - ): - """ - Initialization method - - :param list[GenomeModels] genome_models: this is a list of GenomeModels that will be checked against a bed file - :param str igd_path: path to a local IGD file containing ALL excluded ranges intervals for IGD overlap assessment, if not provided these metrics are not computed. - - """ - if not genome_models: - genome_models = self.build_default_models() - - if isinstance(genome_models, str): - genome_models = list(genome_models) - if not isinstance(genome_models, list): - raise ValidatorException( - reason="A list of GenomeModels must be provided to initialize the Validator class" - ) - - self.genome_models = genome_models - - self.igd_path = igd_path - - # this will be a list of dictionary info with length of genome_models - self.compatibility_list = [] - - def calculate_chrom_stats( - self, bed_chrom_sizes: dict, genome_chrom_sizes: dict - ) -> dict: - """ - Given two dicts of chroms (key) and their sizes (values) - determine overlap and sequence fit - - Calculates Stats associated with comparison of chrom names, chrom lengths, and sequence fits - - :param dict bed_chrom_sizes: dict of a bedfile's chrom size - :param dict genome_chrom_sizes: dict of a GenomeModel's chrom sizes - - return dict: returns a dictionary with information on Query vs Model, e.g. chrom names QueryvsModel - """ - - # Layer 1: Check names and Determine XS (Extra Sequences) via Calculation of Recall/Sensitivity - # Q = Query, M = Model - name_stats = {} - q_and_m = 0 # how many chrom names are in the query and the genome model? - q_and_not_m = ( - 0 # how many chrom names are in the query but not in the genome model? - ) - not_q_and_m = ( - 0 # how many chrom names are not in the query but are in the genome model? - ) - passed_chrom_names = True # does this bed file pass the first layer of testing? - - query_keys_present = [] # These keys are used for seq fit calculation - for key in list(bed_chrom_sizes.keys()): - if key not in genome_chrom_sizes: - q_and_not_m += 1 - if key in genome_chrom_sizes: - q_and_m += 1 - query_keys_present.append(key) - for key in list(genome_chrom_sizes.keys()): - if key not in bed_chrom_sizes: - not_q_and_m += 1 - - # Calculate the Jaccard Index for Chrom Names - bed_chrom_set = set(list(bed_chrom_sizes.keys())) - genome_chrom_set = set(list(genome_chrom_sizes.keys())) - chrom_intersection = bed_chrom_set.intersection(genome_chrom_set) - chrom_union = bed_chrom_set.union(chrom_intersection) - chrom_jaccard_index = len(chrom_intersection) / len(chrom_union) - - # Alternative Method for Calculating Jaccard_index for binary classification - # JI = TP/(TP+FP+FN) - jaccard_binary = q_and_m / (q_and_m + not_q_and_m + q_and_not_m) - - # What is our threshold for passing layer 1? - if q_and_not_m > 1: - passed_chrom_names = False - - # Calculate sensitivity for chrom names - # defined as XS -> Extra Sequences - sensitivity = q_and_m / (q_and_m + q_and_not_m) - - # Assign Stats - # TODO maybe use pydantic in the future - name_stats["XS"] = sensitivity - name_stats["q_and_m"] = q_and_m - name_stats["q_and_not_m"] = q_and_not_m - name_stats["not_q_and_m"] = not_q_and_m - name_stats["jaccard_index"] = chrom_jaccard_index - name_stats["jaccard_index_binary"] = jaccard_binary - name_stats["passed_chrom_names"] = passed_chrom_names - - # Layer 2: Check Lengths, but only if layer 1 is passing - if passed_chrom_names: - length_stats = {} - - chroms_beyond_range = False - num_of_chrm_beyond = 0 - num_chrm_within_bounds = 0 - - for key in list(bed_chrom_sizes.keys()): - if key in genome_chrom_sizes: - if bed_chrom_sizes[key] > genome_chrom_sizes[key]: - num_of_chrm_beyond += 1 - chroms_beyond_range = True - else: - num_chrm_within_bounds += 1 - - # Calculate recall/sensitivity for chrom lengths - # defined as OOBR -> Out of Bounds Range - sensitivity = num_chrm_within_bounds / ( - num_chrm_within_bounds + num_of_chrm_beyond - ) - length_stats["OOBR"] = sensitivity - - length_stats["beyond_range"] = chroms_beyond_range - length_stats["num_of_chrm_beyond"] = num_of_chrm_beyond - - # Naive calculation, seq fit (below) may be better metric - length_stats["percentage_bed_chrm_beyond"] = ( - 100 * num_of_chrm_beyond / len(bed_chrom_set) - ) - length_stats["percentage_genome_chrm_beyond"] = ( - 100 * num_of_chrm_beyond / len(genome_chrom_set) - ) - - else: - length_stats = {} - length_stats["OOBR"] = None - length_stats["beyond_range"] = None - length_stats["num_of_chrm_beyond"] = None - - # Layer 3 Calculate Sequence Fit if any query chrom names were present - if len(query_keys_present) > 0: - seq_fit_stats = {} - bed_sum = 0 - ref_genome_sum = 0 - for chr in query_keys_present: - bed_sum += int(genome_chrom_sizes[chr]) - for chr in genome_chrom_sizes: - ref_genome_sum += int(genome_chrom_sizes[chr]) - - sequence_fit = bed_sum / ref_genome_sum - seq_fit_stats["sequence_fit"] = sequence_fit - else: - seq_fit_stats = {} - seq_fit_stats["sequence_fit"] = None - - return { - "chrom_name_stats": name_stats, - "chrom_length_stats": length_stats, - "seq_fit_stats": seq_fit_stats, - } - - def get_igd_overlaps(self, bedfile: str) -> Union[dict[str, dict], dict[str, None]]: - """ - This is the third layer compatibility check. - It runs helper functions to execute an igd search query across an Integrated Genome Database. - - It returns a dict of dicts containing keys (file names) and values (number of overlaps). Or if no overlaps are found, - it returns an empty dict. - - Currently for this function to work, the user must install the C version of IGD and have created a local igd file - for the Excluded Ranges Bedset: - https://github.com/databio/IGD - https://bedbase.org/bedset/excluderanges - - """ - if not self.igd_path: - return {"igd_stats": None} - # Construct an IGD command to run as subprocess - igd_command = IGD_LOCATION + f" search {self.igd_path} -q {bedfile}" - - returned_stdout = run_igd_command(igd_command) - - # print(f"DEBUG: {returned_stdout}") - - if not returned_stdout: - return {"igd_stats": None} - - igd_overlap_data = parse_IGD_output(returned_stdout) - - if not igd_overlap_data: - return { - "igd_stats": {} - } # None tells us if the bed file never made it to layer 3 or perhaps igd errord, empty dict tells us that there were no overlaps found - else: - overlaps_dict = {} - for datum in igd_overlap_data: - if "file_name" in datum and "number_of_hits" in datum: - overlaps_dict.update({datum["file_name"]: datum["number_of_hits"]}) - - return {"igd_stats": overlaps_dict} - - def determine_compatibility( - self, - bedfile: str, - ref_filter: Optional[List[str]] = None, - concise: Optional[bool] = False, - ) -> Union[List[dict], None]: - """ - Given a bedfile, determine compatibility with reference genomes (GenomeModels) created at Validator initialization. - - :param str bedfile: path to bedfile on disk, .bed - :param list[str] ref_filter: list of ref genome aliases to filter on. - :return list[dict]: a list of dictionaries where each element of the array represents a compatibility dictionary - for each refgenome model. - """ - - if ref_filter: - # Before proceeding filter out unwanted reference genomes to assess - for genome_model in self.genome_models: - if genome_model.genome_alias in ref_filter: - self.genome_models.remove(genome_model) - - bed_chrom_info = get_bed_chrom_info( - bedfile - ) # for this bed file determine the chromosome lengths - - if not bed_chrom_info: - # if there is trouble parsing the bed file, return None - return None - - model_compat_stats = {} - final_compatibility_list = [] - for genome_model in self.genome_models: - # First and Second Layer of Compatibility - model_compat_stats[genome_model.genome_alias] = self.calculate_chrom_stats( - bed_chrom_info, genome_model.chrom_sizes - ) - # Fourth Layer is to run IGD but only if layer 1 and layer 2 have passed - if ( - model_compat_stats[genome_model.genome_alias]["chrom_name_stats"][ - "passed_chrom_names" - ] - and not model_compat_stats[genome_model.genome_alias][ - "chrom_length_stats" - ]["beyond_range"] - ): - model_compat_stats[genome_model.genome_alias].update( - self.get_igd_overlaps(bedfile) - ) - else: - model_compat_stats[genome_model.genome_alias].update( - {"igd_stats": None} - ) - - # Once all stats are collected, process them and add compatibility rating - processed_stats = self.process_compat_stats( - model_compat_stats[genome_model.genome_alias], genome_model.genome_alias - ) - - final_compatibility_list.append(processed_stats) - - if concise: - # TODO just return XS, OOBR, SEQ FIT, COMPAT TIER - return final_compatibility_list - - return final_compatibility_list - - def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: - """ - Given compatibility stats for a specific ref genome determine the compatibility tier - - Tiers Definition ---- - - Tier1: Excellent compatibility, 0 pts - Tier2: Good compatibility, may need some processing, 1-3 pts - Tier3: Bed file needs processing to work (shifted hg38 to hg19?), 4-6 pts - Tier4: Poor compatibility, 7-9 pts - - :param dict compat_stats: dicitionary containing unprocessed compat stats - :param str genome_alias: - :return dict : containing both the original stats and the final compatibility rating for the reference genome - """ - - # Set up processed stats dict, it will add an extra key to the original dict - # with the final rating - processed_stats = {} - processed_stats[genome_alias] = {} - processed_stats[genome_alias].update(compat_stats) - - # Currently will proceed with discrete buckets, however, we could do a point system in the future - final_compat_rating = {"tier_ranking": 1, "assigned_points": 0} - points_rating = 0 # we will add points increasing the level and then sort into various tiers at the end - - # Check extra sequences sensitivity and assign points based on how sensitive the outcome is - # sensitivity = 1 is considered great and no points should be assigned - if compat_stats["chrom_name_stats"]["XS"] < 1: - points_rating += 3 - if compat_stats["chrom_name_stats"]["XS"] < 0.7: - points_rating += 1 - if compat_stats["chrom_name_stats"]["XS"] < 0.5: - points_rating += 1 - if compat_stats["chrom_name_stats"]["XS"] < 0.3: - points_rating += 1 - - if compat_stats["chrom_name_stats"]["passed_chrom_names"]: - # Check OOBR and assign points based on sensitivity - # only assessed if no extra chroms in query bed file - if compat_stats["chrom_length_stats"]["OOBR"] < 1: - points_rating += 3 - if compat_stats["chrom_length_stats"]["OOBR"] < 0.7: - points_rating += 1 - if compat_stats["chrom_length_stats"]["OOBR"] < 0.5: - points_rating += 1 - if compat_stats["chrom_length_stats"]["OOBR"] < 0.3: - points_rating += 1 - else: - # Do nothing here, points have already been added when Assessing XS if it is not == 1 - pass - - # Check Sequence Fit - comparing lengths in queries vs lengths of queries in ref genome vs not in ref genome - if compat_stats["seq_fit_stats"]["sequence_fit"]: - # since this is only on keys present in both, ratio should always be less than 1 - # Should files be penalized here or actually awarded but only if the fit is really good? - if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.90: - points_rating += 1 - if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60: - points_rating += 1 - if compat_stats["seq_fit_stats"]["sequence_fit"] < 0.60: - points_rating += 1 - - else: - # if no chrom names were found during assessment - points_rating += 4 - - # Run analysis on igd_stats - # WIP, currently only showing IGD stats for informational purposes - if compat_stats["igd_stats"] and compat_stats["igd_stats"] != {}: - self.process_igd_stats(compat_stats["igd_stats"]) - - final_compat_rating["assigned_points"] = points_rating - - # Now Assign Tier Based on Points - - if points_rating == 0: - final_compat_rating["tier_ranking"] = 1 - elif 1 <= points_rating <= 3: - final_compat_rating["tier_ranking"] = 2 - elif 4 <= points_rating <= 6: - final_compat_rating["tier_ranking"] = 3 - elif 7 <= points_rating: - final_compat_rating["tier_ranking"] = 4 - else: - print( - f"Catching points discrepancy,points = {points_rating}, assigning to Tier 4" - ) - final_compat_rating["tier_ranking"] = 4 - - processed_stats[genome_alias].update({"compatibility": final_compat_rating}) - - return processed_stats - - def process_igd_stats(self, igd_stats: dict): - """ - Placeholder to process IGD Stats and determine if it should impact tier rating - """ - pass - - def build_default_models(self): - """ - Builds a default list of GenomeModels from the chrom.sizes folder. - Uses file names as genome alias. - - return list[GenomeModel] - """ - - chrm_sizes_directory = os.path.join( - os.path.curdir, os.path.abspath("./chrom_sizes") - ) - all_genome_models = [] - for root, dirs, files in os.walk(chrm_sizes_directory): - for file in files: - if file.endswith(".sizes"): - # print(os.path.join(root, file)) - # Get file name - name = os.path.basename(file) - - curr_genome_model = GenomeModel( - genome_alias=name, chrom_sizes_file=file - ) - all_genome_models.append(curr_genome_model) - - return all_genome_models - - -# ---------------------------- -# Helper Functions -def get_bed_chrom_info(bed_file_path: str) -> Union[dict, None]: - """ - Given a path to a Bedfile. Attempt to open it and read it to find all of the chromosomes and the max length of each. - - returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome. - """ - - # Right now this assumes this is atleast a 3 column bedfile - # This also assumes the bed file has been unzipped - - # TODO This code is overlapping with geniml io code and should be refactored in the future, ie move opening bed files to helper utils. - - try: - df = pd.read_csv(bed_file_path, sep="\t", header=None, skiprows=5) - - max_end_for_each_chrom = df.groupby(0)[2].max() - - # return max end position for each chromosome - return max_end_for_each_chrom.to_dict() - except Exception: - return None - - -def run_igd_command(command): - """Run IGD via a subprocess, this is a temp implementation until Rust IGD python bindings are finished.""" - - result = subprocess.run(command, shell=True, capture_output=True, text=True) - if result.returncode == 0: - return result.stdout - else: - print(f"Error running command: {result.stderr}") - return None - - -def parse_IGD_output(output_str) -> Union[None, List[dict]]: - """ - Parses IGD terminal output into a list of dicts - Args: - output_str: The output string from IGD - - Returns: - A list of dictionaries, where each dictionary represents a record. - """ - - try: - lines = output_str.splitlines() - data = [] - for line in lines: - if line.startswith("index"): - continue # Skip the header line - elif line.startswith("Total"): - break # Stop parsing after the "Total" line - else: - fields = line.split() - record = { - "index": int(fields[0]), - "number_of_regions": int(fields[1]), - "number_of_hits": int(fields[2]), - "file_name": fields[3], - } - data.append(record) - return data - except Exception: - return None diff --git a/bedboss/refgenome_validator/utils.py b/bedboss/refgenome_validator/utils.py new file mode 100644 index 0000000..2a84934 --- /dev/null +++ b/bedboss/refgenome_validator/utils.py @@ -0,0 +1,67 @@ +from typing import Union, List +from geniml.io import RegionSet +import subprocess +import logging + +_LOGGER = logging.getLogger("bedboss") + + +def get_bed_chrom_info(bedfile: Union[str, RegionSet]) -> dict: + """ + Determine chrom lengths for bed file + + :param bedfile: RegionSet object or path to bed file + returns dict: returns dictionary where keys are chrom names and values are the max end position of that chromosome. + """ + if isinstance(bedfile, RegionSet): + df = bedfile.to_pandas() + else: + df = RegionSet(bedfile).to_pandas() + + max_end_for_each_chrom = df.groupby(0)[2].max() + return max_end_for_each_chrom.to_dict() + + +def run_igd_command(command): + """ + Run IGD via a subprocess, this is a temp implementation until Rust IGD python bindings are finished. + """ + + result = subprocess.run(command, shell=True, capture_output=True, text=True) + if result.returncode == 0: + return result.stdout + else: + _LOGGER.error(f"Error running command: {result.stderr}") + return None + + +def parse_IGD_output(output_str) -> Union[None, List[dict]]: + """ + Parses IGD terminal output into a list of dicts + Args: + output_str: The output string from IGD + + Returns: + A list of dictionaries, where each dictionary represents a record. + """ + + try: + lines = output_str.splitlines() + data = [] + for line in lines: + if line.startswith("index"): + continue # Skip the header line + elif line.startswith("Total"): + break # Stop parsing after the "Total" line + else: + fields = line.split() + record = { + "index": int(fields[0]), + "number_of_regions": int(fields[1]), + "number_of_hits": int(fields[2]), + "file_name": fields[3], + } + data.append(record) + return data + except Exception: + return None diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 2fb0151..521a177 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,6 +1,6 @@ logmuse>=0.2.7 coloredlogs>=15.0.1 -peppy>=0.40.5 +peppy>=0.40.6 yacman>=0.8.4 requests>=2.28.2 piper>=v0.14.0 @@ -10,4 +10,4 @@ refgenconf>=0.12.2 pandas>=2.0.0 ubiquerg>=0.6.2 pephubclient>=0.4.4 -geniml>=0.4.0 \ No newline at end of file +geniml>=0.4.1 \ No newline at end of file diff --git a/scripts/ref_genome_validating/stats_compat_testing.py b/scripts/ref_genome_validating/stats_compat_testing.py index ed3a870..1345a26 100644 --- a/scripts/ref_genome_validating/stats_compat_testing.py +++ b/scripts/ref_genome_validating/stats_compat_testing.py @@ -15,7 +15,8 @@ PEP_URL = os.environ["PEP_URL"] except: # pep url - PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" + # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" + PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_refactor:default" def main(): @@ -115,6 +116,7 @@ def extract_tier_ratings(df): ucsc_hg38_values = [] ensembl_hg38_values = [] ucsc_pantro6_values = [] + ucsc_mm39_values = [] for index, row in df.iterrows(): ( mm10_tier, @@ -124,6 +126,7 @@ def extract_tier_ratings(df): ucsc_hg38_tier, ucsc_pantro6_tier, ensembl_hg38_tier, + ucsc_mm39_tier, ) = extract_values(row["tier_rating"]) mm10_values.append(mm10_tier) ncbi_hg38_values.append(ncbi_hg38_tier) @@ -132,6 +135,7 @@ def extract_tier_ratings(df): ucsc_hg38_values.append(ucsc_hg38_tier) ensembl_hg38_values.append(ensembl_hg38_tier) ucsc_pantro6_values.append(ucsc_pantro6_tier) + ucsc_mm39_values.append(ucsc_mm39_tier) # df_tiers.rename(columns={0: 'tier_rating'}, inplace=True) # df_tiers['mm10_tier'] = None @@ -150,6 +154,7 @@ def extract_tier_ratings(df): df2["panTro6 UCSC"] = ucsc_pantro6_values df2["mm10 NCBI"] = mm10_values df2["dm6_UCSC"] = dm6_values + df2["ucsc_mm39"] = ucsc_mm39_values print(df2) df2.to_csv("/home/drc/Downloads/export_test.csv") @@ -166,13 +171,14 @@ def extract_values(dictionary): dictionary = json.loads(dictionary) - mm10_tier = dictionary["mm10"]["tier_ranking"] + mm10_tier = dictionary["ucsc_mm10"]["tier_ranking"] ncbi_hg38_tier = dictionary["ncbi_hg38"]["tier_ranking"] - hg19_tier = dictionary["hg19"]["tier_ranking"] + hg19_tier = dictionary["ucsc_hg19"]["tier_ranking"] ucsc_dm6_tier = dictionary["ucsc_dm6"]["tier_ranking"] ucsc_hg38_tier = dictionary["ucsc_hg38"]["tier_ranking"] ensembl_hg38_tier = dictionary["ensembl_hg38"]["tier_ranking"] ucsc_pantro6_tier = dictionary["ucsc_pantro6"]["tier_ranking"] + ucsc_mm39_tier = dictionary["ucsc_mm39"]["tier_ranking"] return ( mm10_tier, @@ -182,6 +188,7 @@ def extract_values(dictionary): ucsc_hg38_tier, ucsc_pantro6_tier, ensembl_hg38_tier, + ucsc_mm39_tier, ) @@ -206,6 +213,7 @@ def create_heatmap(df): "Pan troglodytes (panTro6)", "Mus Musculus (mm10)", "Drosophila melanogaster (dm6)", + "Mus Musculus (mm39)", ] df.set_index(["sample_name", df.index], inplace=True) @@ -230,8 +238,8 @@ def create_heatmap(df): colorbar.set_ticklabels(custom_labels) # plt.colorbar(ticks=4, label='Custom Label', ticklabels=custom_labels) plt.title("Tier Rating: Bed File vs Ref Genome") - plt.xlabel("Reference Genomes", fontsize=12) - plt.ylabel("Query Bed Files", fontsize=12) + plt.xlabel("Reference Genomes", fontsize=8) + plt.ylabel("Query Bed Files", fontsize=8) # plt.grid(True) plt.show() diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index c65ba39..cec06be 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -4,7 +4,10 @@ import refgenconf from pipestat import pipestat -from bedboss.refgenome_validator.refgenomevalidator import RefValidator, GenomeModel +from bedboss.refgenome_validator.main import ( + ReferenceValidator, + GenomeModel, +) # helper utils from process_exclude_ranges import unzip_bedfile, get_samples, MAX_SAMPLES @@ -23,51 +26,58 @@ # "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/results" # ) BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/bed_small_subset" + # BEDFILE_DIRECTORY = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/test_singles" try: PEP_URL = os.environ["PEP_URL"] except: # if you wish to report results to pephub - # PEP_URL = "donaldcampbelljr/refgenome_compat_testing:default" - PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" + PEP_URL = "donaldcampbelljr/refgenome_compat_testing:default" + # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_small:default" # PEP_URL ="donaldcampbelljr/ref_genome_dros_only:default" + # PEP_URL = "donaldcampbelljr/ref_genome_compat_testing_refactor:default" # Where to get Bedfiles? LOCAL = True GEOFETCH = False -# SPECIES = "homosapiens" -SPECIES = "fly" +SPECIES = "homosapiens" +# SPECIES = "fly" def main(): # Simple script to testing that the Validator objects is working correctly. # Set up Ref Genie - try: - ref_genie_config_path = os.environ["REFGENIE"] - except: - print("Ref genie environment variable not found") - # hard code for testing for now - ref_genie_config_path = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/genome_folder/genome_config.yaml" - - # Ensure refgenie assets are built based on the provided config module - rgc = refgenconf.RefGenConf(filepath=ref_genie_config_path) - rgc.pull( - genome="hg38", asset="fasta", tag="default", force=False - ) # GCA_000001405.15 GRCh38_no_alt_analysis_set from NCBI - # rgc.pull(genome="hg38_primary", asset="fasta", tag="default", force=False) # UCSC primary chromosomes only - rgc.pull( - genome="hg19", asset="fasta", tag="default", force=False - ) # GRCh37 reference sequence from UCSC - rgc.pull(genome="mm10", asset="fasta", tag="default", force=False) - # rgc.pull(genome="dm6", asset="fasta", tag="default", force=False) #the ncbi chromosomes - - genome_list = rgc.list() - - print(genome_list) + # try: + # ref_genie_config_path = os.environ["REFGENIE"] + # except: + # print("Ref genie environment variable not found") + # # hard code for testing for now + # ref_genie_config_path = "/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/genome_folder/genome_config.yaml" + # + # # Ensure refgenie assets are built based on the provided config module + # rgc = refgenconf.RefGenConf(filepath=ref_genie_config_path) + # rgc.pull( + # genome="hg38", asset="fasta", tag="default", force=False + # ) # GCA_000001405.15 GRCh38_no_alt_analysis_set from NCBI + # # rgc.pull(genome="hg38_primary", asset="fasta", tag="default", force=False) # UCSC primary chromosomes only + # rgc.pull( + # genome="hg19", asset="fasta", tag="default", force=False + # ) # GRCh37 reference sequence from UCSC + # rgc.pull(genome="mm10", asset="fasta", tag="default", force=False) + # # rgc.pull(genome="dm6", asset="fasta", tag="default", force=False) #the ncbi chromosomes + # + # genome_list = rgc.list() + # + # print(genome_list) # build genome models # for each reference genome in the user's config file, build a genome model + # from geniml.io import RegionSet + # + # ff =RegionSet("/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/data/test_singles/GSM8196564_435_RUNX3_KO_H3K27AC_v_435_ctrl_IgG_seacr.relaxed.bed") + # ff + all_genome_models = [] # for reference_genome in rgc.list(): @@ -77,36 +87,54 @@ def main(): # Manually create more genome models not found in ref genie ucsc_hg38 = GenomeModel( genome_alias="ucsc_hg38", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ucsc_hg38.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_hg38.chrom.sizes", ) ncbi_hg38 = GenomeModel( genome_alias="ncbi_hg38", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ncbi_hg38.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ncbi_hg38.chrom.sizes", ) ensembl_hg38 = GenomeModel( genome_alias="ensembl_hg38", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ensembl_hg38.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ensembl_hg38.chrom.sizes", ) - ucsc_pantro6 = GenomeModel( - genome_alias="ucsc_pantro6", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ucsc_panTro6.chrom.sizes", + ucsc_hg19 = GenomeModel( + genome_alias="ucsc_hg19", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_hg19.chrom.sizes", ) ucsc_dm6 = GenomeModel( genome_alias="ucsc_dm6", - chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/scripts/ref_genome_validating/chrom_sizes/ucsc_dm6.chrom.sizes", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_dm6.chrom.sizes", + ) + + ucsc_mm10 = GenomeModel( + genome_alias="ucsc_mm10", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm10.chrom.sizes", + ) + + ucsc_mm39 = GenomeModel( + genome_alias="ucsc_mm39", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm39.chrom.sizes", + ) + + ucsc_pantro6 = GenomeModel( + genome_alias="ucsc_pantro6", + chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_panTro6.chrom.sizes", ) all_genome_models.append(ucsc_hg38) all_genome_models.append(ncbi_hg38) + all_genome_models.append(ucsc_mm10) + all_genome_models.append(ucsc_mm39) + all_genome_models.append(ucsc_hg19) all_genome_models.append(ensembl_hg38) all_genome_models.append(ucsc_pantro6) all_genome_models.append(ucsc_dm6) # Create Validator Object - validator = RefValidator(genome_models=all_genome_models) + validator = ReferenceValidator(genome_models=all_genome_models) # Get BED files # LOCAL @@ -127,14 +155,15 @@ def main(): } # add this to a column to make comparisons easier for human eyes on pephub all_vals = {} if compat_vector: - for i in compat_vector: + for i in compat_vector.keys(): if i is not None: - for key, values in i.items(): - all_vals.update({key: values}) - if "compatibility" in values: - tier["tier_rating"].update( - {key: values["compatibility"]} - ) + all_vals.update({i: compat_vector[i].model_dump()}) + dict_to_check = compat_vector[i].model_dump() + if "compatibility" in dict_to_check: + tier["tier_rating"].update( + {i: dict_to_check["compatibility"]} + ) + all_vals.update(tier) # use pipestat to report to pephub and file backend @@ -191,14 +220,17 @@ def main(): "tier_rating": {} } # add this to a column to make comparisons easier for human eyes on pephub if compat_vector: - for i in compat_vector: + for i in compat_vector.keys(): if i is not None: - for key, values in i.items(): - all_vals.update({key: values}) - if "compatibility" in values: - tier["tier_rating"].update( - {key: values["compatibility"]} - ) + all_vals.update( + {i: compat_vector[i].model_dump()} + ) + dict_to_check = compat_vector[i].model_dump() + if "compatibility" in dict_to_check: + tier["tier_rating"].update( + {i: dict_to_check["compatibility"]} + ) + all_vals.update(tier) # use pipestat to report to pephub and file backend diff --git a/test/test_ref_validator.py b/test/test_ref_validator.py new file mode 100644 index 0000000..bc5bd4f --- /dev/null +++ b/test/test_ref_validator.py @@ -0,0 +1,17 @@ +import os + +from bedboss.refgenome_validator import ReferenceValidator + + +FILE_DIR = os.path.dirname(os.path.realpath(__file__)) +HG19_CORRECT_DIR = os.path.join(FILE_DIR, "test_data", "bed", "hg19", "correct") +FILE_PATH = f"{HG19_CORRECT_DIR}/sample1.bed.gz" + + +def test_main(): + dict_result = ReferenceValidator().determine_compatibility( + FILE_PATH, + concise=True, + ) + + assert dict_result