Skip to content

Commit

Permalink
rework compatibility to take single bed and return a list of dicts fo…
Browse files Browse the repository at this point in the history
…r compatibility vector
  • Loading branch information
donaldcampbelljr committed Sep 4, 2024
1 parent 74310e5 commit e4ce058
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 14 deletions.
12 changes: 8 additions & 4 deletions bedboss/refgenome_validator/genome_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
class GenomeModel:
"""
Initialize genome model
A class representing a reference genome. You feed it a reference genome. It retrieves chrom sizes (from refgenie), and then it provides some helper functions, intended for use with reference genome validation.
A class representing a reference genome. You feed it a reference genome. It retrieves chrom sizes (from refgenie),
and then it provides some helper functions, intended for use with reference genome validation.
"""

Expand All @@ -23,16 +24,19 @@ def __init__(
self.chrom_sizes = self.get_chrom_sizes()
self.excluded_ranges_names = exclude_ranges_names # Which bed file digests from the excluded ranges are associated with this reference genome?

def get_chrom_sizes(self):
# read chromsizes file
def get_chrom_sizes(self) -> dict:
"""
Obtains chrom sizes via refgenie (using a refgenconf.refgenconf.RefGenConf object)
:return dict: dictionary containing chroms(keys) and lengths(values)
"""

chrom_sizes_path = self.rgc.seek(
genome_name=self.genome_alias,
asset_name="fasta",
tag_name="default",
seek_key="chrom_sizes",
)
print(chrom_sizes_path)

chrom_sizes = {}

Expand Down
64 changes: 55 additions & 9 deletions bedboss/refgenome_validator/refgenomevalidator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,14 @@ class Validator:
"""

def __init__(self, bedfiles, genome_models):
def __init__(self, genome_models):
# TODO ensure these are lists

self.bedfiles = bedfiles
self.genome_models = genome_models

self.compatibility_matrix = pd.DataFrame(
index=self.bedfiles, columns=self.genome_models
) # non mutually exclusive validation, i.e.

self.predicted_parent = self.predict_parent()
self.compatibility_list = (
[]
) # this will be a list of dictionary info with length of genome_models

def predict_parent(self):
"""
Expand Down Expand Up @@ -63,12 +60,29 @@ def modify_compatibility_matrix(self):

pass

def compare_chrom_size(self, bed_chrom_sizes, genome_chrom_sizes):
def compare_chrom_names_lengths(
self, bed_chrom_sizes: dict, genome_chrom_sizes: dict
):
"""
Given two dicts of chroms (key) and their sizes (values)
determine overlap
:param dict bed_chrom_sizes: dict of a bedfile's chrom size
:param dict genome_chrom_sizes: dict of a GenomeModel's chrom sizes
:return dict: a dictionary with information on Query vs Model, e.g. chrom names Query vs Model
"""

# Check names
# Define Three separate counts
# Q = Query, M = Model
name_stats = {}
q_and_m = None
q_and_not_m = None
not_q_and_m = None

name_stats = {}

extra_chroms = False
chroms_beyond_range = False
exclude = False
Expand All @@ -86,13 +100,45 @@ def compare_chrom_size(self, bed_chrom_sizes, genome_chrom_sizes):

return exclude

def determine_compatibility(
    self, bedfile: str, ref_filter: "list[str] | None" = None
) -> list[dict]:
    """
    Given a bedfile, determine compatibility with the reference genomes (GenomeModels) held by this Validator.

    :param str bedfile: path to bedfile on disk
    :param list[str] ref_filter: optional list of ref genome aliases to exclude from the assessment.
        When omitted or empty, every genome model is assessed.
    :return list[dict]: a list with one dictionary per assessed genome model; each dictionary maps the
        model's alias to the result of compare_chrom_names_lengths for that model.
    """

    excluded_aliases = set(ref_filter) if ref_filter else set()

    # Build a fresh list instead of calling self.genome_models.remove() inside a loop over
    # self.genome_models: removing while iterating skips elements, and it would also shrink the
    # Validator's model list permanently, affecting every later bedfile.
    # NOTE(review): GenomeModel exposes `genome_alias` (used by get_chrom_sizes); the previous
    # `.alias` lookup is assumed to have been a typo -- confirm against GenomeModel.__init__.
    models_to_assess = [
        genome_model
        for genome_model in self.genome_models
        if genome_model.genome_alias not in excluded_aliases
    ]

    # for this bed file determine the chromosome lengths
    bed_chrom_info = get_bed_chrom_info(bedfile)

    compatibility_list = []
    for genome_model in models_to_assess:
        compatibility_list.append(
            {
                genome_model.genome_alias: self.compare_chrom_names_lengths(
                    bed_chrom_info, genome_model.chrom_sizes
                )
            }
        )

    return compatibility_list


def get_bed_chrom_info(bed_file_path: str):
"""
Given a path to a bedfile, attempt to open and read it to find all of the chromosomes and the max length of each.
"""

# In bed classifier we skip a few rows just in case there is header information there...
# TODO In bed classifier we skip a few rows just in case there is header information there...

# Right now this assumes this is at least a 3 column bedfile
df = pd.read_csv(bed_file_path, sep="\t", header=None)
Expand Down
6 changes: 5 additions & 1 deletion scripts/ref_genome_validating/validate_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@ def main():

# validate each Bed file

validator = Validator(bedfiles=all_bed_files, genome_models=all_genome_models)
validator = Validator(genome_models=all_genome_models)

for bedfile in all_bed_files:
validator.determine_compatibility(bedfile)

print(validator.compatibility_matrix.head())

# output the results (dataframe -> csv?).
Expand Down

0 comments on commit e4ce058

Please sign in to comment.