From 178719e94ee64d91b6f52933b60aa63b97d94673 Mon Sep 17 00:00:00 2001 From: Donald Campbell <125581724+donaldcampbelljr@users.noreply.github.com> Date: Mon, 9 Sep 2024 12:33:28 -0400 Subject: [PATCH] refactor to allow option to fetch bedfiles using GEOFETCH --- .../refgenome_validator/refgenomevalidator.py | 6 +- .../ref_genome_validating/validate_genome.py | 151 +++++++++++++----- 2 files changed, 111 insertions(+), 46 deletions(-) diff --git a/bedboss/refgenome_validator/refgenomevalidator.py b/bedboss/refgenome_validator/refgenomevalidator.py index efcac28..00b9ba2 100644 --- a/bedboss/refgenome_validator/refgenomevalidator.py +++ b/bedboss/refgenome_validator/refgenomevalidator.py @@ -361,7 +361,7 @@ def process_compat_stats(self, compat_stats: dict, genome_alias: str) -> dict: ) final_compat_rating["tier_ranking"] = 4 - processed_stats[genome_alias].update({"Compatibility": final_compat_rating}) + processed_stats[genome_alias].update({"compatibility": final_compat_rating}) return processed_stats @@ -379,13 +379,11 @@ def get_bed_chrom_info(bed_file_path: str) -> Union[dict, None]: Given a path to a Bedfile. Attempt to open it and read it to find all of the chromosomes and the max length of each. """ - # TODO In bed classifier we skip a few rows just in case there is header information there... - # Right now this assumes this is atleast a 3 column bedfile # This also assumes the bed file has been unzipped try: - df = pd.read_csv(bed_file_path, sep="\t", header=None) + df = pd.read_csv(bed_file_path, sep="\t", header=None, skiprows=5) max_end_for_each_chrom = df.groupby(0)[2].max() diff --git a/scripts/ref_genome_validating/validate_genome.py b/scripts/ref_genome_validating/validate_genome.py index 40416be..89d86fd 100644 --- a/scripts/ref_genome_validating/validate_genome.py +++ b/scripts/ref_genome_validating/validate_genome.py @@ -6,6 +6,9 @@ from bedboss.refgenome_validator import * +# helper utils +from process_exclude_ranges import unzip_bedfile, get_samples, MAX_SAMPLES + try: IGD_DB_PATH = os.environ["IGD_DB_PATH"] except: @@ -26,6 +29,10 @@ # if you wish to report results to pephub PEP_URL = "donaldcampbelljr/refgenome_compat_testing:default" +# Where to get Bedfiles? +LOCAL = False +GEOFETCH = True + def main(): # Simple script to testing that the Validator objects is working correctly. @@ -55,50 +62,110 @@ def main(): new_genome_model = GenomeModel(genome_alias=reference_genome, refgenomeconf=rgc) all_genome_models.append(new_genome_model) - # Get BED files - # for now, hard code a couple - - # all_bed_files = [ - # "/home/drc/GITHUB/bedboss/bedboss/test/data/bed/hg19/correct/hg19_example1.bed" - # ] - all_bed_files = [] - for root, dirs, files in os.walk(BEDFILE_DIRECTORY): - for file in files: - if file.endswith(".bed"): - # print(os.path.join(root, file)) - all_bed_files.append(os.path.join(root, file)) - - # validate each Bed file + # Create Validator Object validator = Validator(genome_models=all_genome_models, igd_path=IGD_DB_PATH) - # use pipestat to report to pephub and file backend - psm = pipestat.PipestatManager( - pephub_path=PEP_URL, - ) - psm2 = pipestat.PipestatManager(results_file_path="stats_results/results.yaml") - - for bedfile in all_bed_files[:10]: - compat_vector = validator.determine_compatibility(bedfile) - # Debug printing - # import pprint - # pprint.pprint(compat_vector, depth=5) - - # Report to pephub - rid = os.path.basename(bedfile) - all_vals = {} - if compat_vector: - for i in compat_vector: - if i is not None: - for key, values in i.items(): - all_vals.update({key: values}) - - # Report to file backend - psm2.report(record_identifier=rid, values=all_vals) - - # Convert to json string before reporting to pephub - for key, value in all_vals.items(): - all_vals[key] = str(json.dumps(value)) - psm.report(record_identifier=rid, values=all_vals) + # Get BED files + # LOCAL + if LOCAL: + print("Obtaining Bed files locally") + all_bed_files = [] + for root, dirs, files in os.walk(BEDFILE_DIRECTORY): + for file in files: + if file.endswith(".bed"): + # print(os.path.join(root, file)) + all_bed_files.append(os.path.join(root, file)) + + for bedfile in all_bed_files[:10]: + compat_vector = validator.determine_compatibility(bedfile) + rid = os.path.basename(bedfile) + all_vals = {} + if compat_vector: + for i in compat_vector: + if i is not None: + for key, values in i.items(): + all_vals.update({key: values}) + + # use pipestat to report to pephub and file backend + psm = pipestat.PipestatManager( + pephub_path=PEP_URL, + ) + psm2 = pipestat.PipestatManager( + results_file_path="stats_results/results.yaml" + ) + + # Report to file backend + psm2.report(record_identifier=rid, values=all_vals) + + # Convert to json string before reporting to pephub + for key, value in all_vals.items(): + all_vals[key] = str(json.dumps(value)) + psm.report(record_identifier=rid, values=all_vals) + + # # USE GEOFETCH TO OBTAIN BED FILES + if GEOFETCH: + print("Obtaining Bed files using Geofetch") + species = "homosapiens" + data_output_path = os.path.abspath("data") + species_output_path = os.path.join(data_output_path, species) + bedfileslist = os.path.join(species_output_path, "bedfileslist.txt") + results_path = os.path.abspath("results") + + with open(bedfileslist, "r") as file: + lines = file.readlines() + lines = [ + line.strip() for line in lines + ] # Remove leading/trailing whitespace + + for line in lines: + samples = get_samples( + data_output_path=species_output_path, gse_number=line + ) + + if samples: + for sample in samples[:MAX_SAMPLES]: + all_vals = {} + if isinstance(sample.output_file_path, list): + bedfile = sample.output_file_path[0] + else: + bedfile = sample.output_file_path + reported_ref_genome = getattr(sample, "ref_genome", None) + all_vals.update({"reported_ref_genome": reported_ref_genome}) + + file = unzip_bedfile(bedfile, results_path) + + if file: + compat_vector = validator.determine_compatibility(bedfile) + rid = os.path.basename(bedfile) + tier = { + "tier_rating": {} + } # add this to a column to make comparisons easier for human eyes on pephub + if compat_vector: + for i in compat_vector: + if i is not None: + for key, values in i.items(): + all_vals.update({key: values}) + if "compatibility" in values: + tier["tier_rating"].update( + {key: values["compatibility"]} + ) + all_vals.update(tier) + + # use pipestat to report to pephub and file backend + psm = pipestat.PipestatManager( + pephub_path=PEP_URL, + ) + psm2 = pipestat.PipestatManager( + results_file_path="stats_results/results.yaml" + ) + + # Report to file backend + psm2.report(record_identifier=rid, values=all_vals) + + # Convert to json string before reporting to pephub + for key, value in all_vals.items(): + all_vals[key] = str(json.dumps(value)) + psm.report(record_identifier=rid, values=all_vals) return