diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py index b865c6d..855181b 100644 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ b/bedboss/refgenome_validator/refgenomevalidator.py @@ -20,49 +20,9 @@ def __init__(self, genome_models): [] ) # this will be a list of dictionary info with length of genome_models - def predict_parent(self): - """ - Predicts from a mutually exclusive list of reference genomes - - """ - - # For mutually exclusive items, we can quickly check, chrom sizes and chrom names and see if there are incompatible - # genome models and truncate the list at the very beginning - - predicted_parents = [] - - for bed in self.bedfiles: - # We may be doing this somewhere else and should just pass this in... - bed_chrom_info = get_bed_chrom_info(bed) - - for genome in self.genome_models: - exclude_result = self.compare_chrom_size( - bed_chrom_info, genome.chrom_sizes - ) - if not exclude_result: - # keep the genome - predicted_parents.append(genome.alias) - else: - # Can we for sure exclude this? if so delete it - # Do we need to deep copy this or will it cause issues? - self.genome_models.remove(genome) - - predicted_parent = "hg38" - - return predicted_parent - - def modify_compatibility_matrix(self): - """ - - modifys the data frame storing compatibility matrix of entries - - """ - - pass - def compare_chrom_names_lengths( self, bed_chrom_sizes: dict, genome_chrom_sizes: dict - ): + ) -> dict: """ Given two dicts of chroms (key) and their sizes (values) determine overlap @@ -73,32 +33,54 @@ def compare_chrom_names_lengths( return dict: returns a dictionary with information on Query vs Model, e.g. chrom names QueryvsModel """ - # Check names + # Layer 1: Check names # Define Three separate counts # Q = Query, M = Model name_stats = {} - q_and_m = None - q_and_not_m = None - not_q_and_m = None - - name_stats = {} - - extra_chroms = False - chroms_beyond_range = False - exclude = False + q_and_m = 0 + q_and_not_m = 0 + not_q_and_m = 0 + passed_chrom_names = True for key in list(bed_chrom_sizes.keys()): if key not in genome_chrom_sizes: - extra_chroms = True - elif key in genome_chrom_sizes: - if genome_chrom_sizes[key] < bed_chrom_sizes[key]: - chroms_beyond_range = True - - if extra_chroms or chroms_beyond_range: - exclude = True - return exclude - - return exclude + q_and_not_m += 1 + if key in genome_chrom_sizes: + q_and_m += 1 + for key in list(genome_chrom_sizes.keys()): + if key not in bed_chrom_sizes: + not_q_and_m += 1 + + # What is our threshold for passing layer 1? + if q_and_not_m > 1: + passed_chrom_names = False + + name_stats["q_and_m"] = q_and_m + name_stats["q_and_not_m"] = q_and_not_m + name_stats["not_q_and_m"] = not_q_and_m + name_stats["passed_chrom_names"] = passed_chrom_names + + # Layer 2: Check Lengths, but only if layer 1 is passing + if passed_chrom_names: + length_stats = {} + + chroms_beyond_range = False + num_of_chrm_beyond = 0 + + for key in list(bed_chrom_sizes.keys()): + if key in genome_chrom_sizes: + if bed_chrom_sizes[key] > genome_chrom_sizes[key]: + num_of_chrm_beyond += 1 + chroms_beyond_range = True + + length_stats["beyond_range"] = chroms_beyond_range + length_stats["num_of_chrm_beyond"] = num_of_chrm_beyond + else: + length_stats = {} + length_stats["beyond_range"] = None + length_stats["num_of_chrm_beyond"] = None + + return {"chrom_name_stats": name_stats, "chrom_length_stats": length_stats} def determine_compatibility( self, bedfile: str, ref_filter: list[str]