From 1ec72d61d11fbdd5b7aa2d0872d2d2e4ecea51ca Mon Sep 17 00:00:00 2001 From: dparks1134 Date: Tue, 26 Sep 2023 00:24:14 +1000 Subject: [PATCH] chore: updated script for translating GTDB-Tk classifications to NCBI classification via majority vote; script now properly handles genomes classified via ANI prescreening --- scripts/gtdb_to_ncbi_majority_vote.py | 43 ++++++++++++++++----------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/scripts/gtdb_to_ncbi_majority_vote.py b/scripts/gtdb_to_ncbi_majority_vote.py index 326695d7..27e805ef 100755 --- a/scripts/gtdb_to_ncbi_majority_vote.py +++ b/scripts/gtdb_to_ncbi_majority_vote.py @@ -25,7 +25,7 @@ __copyright__ = 'Copyright 2019' __credits__ = ['Donovan Parks'] __license__ = 'GPL3' -__version__ = '0.2.0' +__version__ = '0.2.1' __maintainer__ = 'Donovan Parks' __email__ = 'donovan.parks@gmail.com' __status__ = 'Development' @@ -52,7 +52,8 @@ FAMILY_IDX = 4 SPECIES_IDX = 6 -class Translate(object): + +class GtdbNcbiTranslate(object): """Translate GTDB to NCBI classification via majority vote.""" def __init__(self): @@ -100,10 +101,10 @@ def get_gtdbtk_classifications(self, gtdbtk_ar_assignments = self.parse_gtdbtk_classifications( ar_summary) else: - logger.warning( - f'Archaeal GTDB-Tk classification file does not exist.') - logger.warning( - f'Assuming there are no archaeal genomes to reclassify.') + self.logger.warning( + 'Archaeal GTDB-Tk classification file does not exist.') + self.logger.warning( + 'Assuming there are no archaeal genomes to reclassify.') gtdbtk_bac_assignments = {} if bac120_metadata_file: @@ -114,10 +115,10 @@ def get_gtdbtk_classifications(self, gtdbtk_bac_assignments = self.parse_gtdbtk_classifications( bac_summary) else: - logger.warning( - f'Bacterial GTDB-Tk classification file does not exist.') - logger.warning( - f'Assuming there are no bacterial genomes to reclassify.') + self.logger.warning( + 'Bacterial GTDB-Tk classification file does not exist.') + self.logger.warning( + 'Assuming there are no bacterial genomes to reclassify.') return gtdbtk_ar_assignments, gtdbtk_bac_assignments @@ -243,7 +244,8 @@ def parse_gtdb_metadata(self, ar53_metadata_file, bac120_metadata_file): if gid == rep_id: # genome is a GTDB representative gtdb_taxonomy = tokens[gtdb_taxonomy_index] - gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')] + gtdb_taxa = [t.strip() + for t in gtdb_taxonomy.split(';')] gtdb_family = gtdb_taxa[FAMILY_IDX] gtdb_family_to_rids[gtdb_family].add(gid) @@ -315,7 +317,7 @@ def resolve_majority_vote(self, taxon_counter, num_votes): return None self.logger.error('Unexpected case while resolving majority vote.') - assert(False) + assert False def ncbi_sp_majority_vote(self, gtdb_sp_clusters, ncbi_taxa, ncbi_lineages): """Get NCBI majority vote classification for each GTDB species cluster.""" @@ -411,6 +413,11 @@ def ncbi_majority_vote(self, # placed in species-level trees processed_gids = set() for tree_file in sp_trees: + if not os.path.exists(tree_file): + # can occur since all genomes might be classified + # using ANI prescreening + continue + self.logger.info(f' - parsing {tree_file}') tree = dendropy.Tree.get_from_path(tree_file, schema='newick', @@ -513,9 +520,9 @@ def ncbi_majority_vote(self, ncbi_mv = ncbi_sp_classification[gtdb_sp_rid] fout.write('{}\t{}\t{}\n'.format( - gid, - ';'.join(gtdb_taxa), - ';'.join(ncbi_mv))) + gid, + ';'.join(gtdb_taxa), + ';'.join(ncbi_mv))) def run(self, gtdbtk_output_dir, @@ -632,8 +639,8 @@ def print_help(): GTDB bacterial metadata file (if processing bacterial genomes). NOTE: GTDB metadata files are available for download at: - https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tar.gz - https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tar.gz + https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz + https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz {colour('Optional arguments:', ['underscore'])} {colour('--gtdbtk_prefix', ['bright'])} @@ -683,7 +690,7 @@ def print_help(): sys.exit(1) try: - p = Translate() + p = GtdbNcbiTranslate() p.run(args.gtdbtk_output_dir, args.ar53_metadata_file, args.bac120_metadata_file,