signaturescience · mtisza1 · Oct 12, 2022
diff --git a/scripts/query_taxon_clades/main_DBs_con_dict_hierarchical_taxonomy.tsv.gz b/scripts/query_taxon_clades/main_DBs_con_dict_hierarchical_taxonomy.tsv.gz
diff --git a/scripts/query_taxon_clades/make_hierarchy_table.py b/scripts/query_taxon_clades/make_hierarchy_table.py
@@ -0,0 +1,80 @@
+#! usr/bin/python3
+### load packages
+import pickle, copy, datetime, shutil, argparse, configparser
+import os, logging, sys
+import glob
+import json
+import pandas as pd
+import pytaxonkit
+import itertools
+#from _utils import util_filter_out_main_dbnames
+
+### function grabbed from _utils.py
+def util_filter_out_main_dbnames(db_iterable):
+    '''
+    takes an iterable of db names and returns a list containing a subset that includes only the
+    latest version of refseq.
+    '''
+    out = []
+    latest_refseq_ver = -1
+    latest_refseq_name = ''
+
+    for nm in db_iterable:
+        if nm[:6].lower()=='refseq':
+            v = int(nm[8:])
+            if v > latest_refseq_ver:
+                latest_refseq_ver = v
+                latest_refseq_name = nm
+        else:
+            out.append(nm)
+    out.append(latest_refseq_name)
+    return out
+
+### set directories and files. replace with variable
+pathname = os.path.dirname( __file__ )  
+script_path = os.path.abspath(pathname)  
+up_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))
+#dqt_dir = "/Users/tiszamj/Documents/mike_tisza/database_taxa_query/metscale/scripts"
+db_taxids = "containment_dict.json" 
+
+### add if is file
+if os.path.isfile(up_path + "/" + db_taxids):
+    f = open(up_path + "/" + db_taxids)
+    taxid_dict = json.load(f)
+else:
+    print ("file not found: " + up_path + "/" + db_taxids)
+    exit()
+
+main_db_list = util_filter_out_main_dbnames(taxid_dict["taxid_lists"])
+if main_db_list:
+    print ("DBs for table construction: ")
+    for i in main_db_list:
+        print (i)
+else:
+    print ("list of main DBs not found.")
+    exit()
+
+### make empty df
+large_tax_hi_df = pd.DataFrame(columns=['TaxID','kingdom','phylum','class','order','family','genus','species','DB'])
+#for item in itertools.islice(mylist, n):
+### Loops through lists of taxids from each of the main DBs 
+print ("Looping through DBs to get hierarchy")
+for i in main_db_list :
+    print (i)
+    i_IDs = taxid_dict["taxid_lists"][i]
+    result = pytaxonkit.lineage(i_IDs, data_dir = "/Users/tiszamj/Documents/mike_tisza/database_taxa_query/taxdmp/", threads=16, formatstr = "{K};{p};{c};{o};{f};{g};{s}")
+    result_df = pd.DataFrame(result[['TaxID','LineageTaxIDs']])
+    result_df[['kingdom','phylum','class','order','family','genus','species']] = result_df['LineageTaxIDs'].str.split(';',expand=True)
+    result_df = result_df.drop(['LineageTaxIDs'], axis=1)
+    result_df['DB'] = i
+    large_tax_hi_df = pd.concat([large_tax_hi_df, result_df])
+
+if large_tax_hi_df.empty:
+    print ("output table not found. ")
+    exit()
+else:
+    print ("table rows = " + str(len(large_tax_hi_df)))
+### write as tsv file
+large_tax_hi_df.to_csv(script_path + "/main_DBs_con_dict_hierarchical_taxonomy.tsv", sep = "\t", index_label = False)
+
+
diff --git a/scripts/query_taxon_clades/query_taxon_clades.py b/scripts/query_taxon_clades/query_taxon_clades.py
@@ -0,0 +1,117 @@
+#! usr/bin/python3
+### load packages
+import pickle, copy, datetime, shutil, argparse, configparser
+import os, logging, sys
+import glob
+import json
+import pandas as pd
+import pytaxonkit
+import itertools
+import subprocess
+
+
+### str2bool not currently used, but could help with future options
+def str2bool(v):
+    if isinstance(v, bool):
+       return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+### set directories and files. replace with variable
+pathname = os.path.dirname( __file__ )  
+script_path = os.path.abspath(pathname)  
+up_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))
+
+parser = argparse.ArgumentParser(description='Query DBs for all records at or below taxonomical level for given taxid.' \
+    'For example, if taxid for genus Bacteroides is given, # sequence records within Bacteroides are returned. ')
+
+required_args = parser.add_argument_group(' REQUIRED ARGUMENTS for query hierarchical taxon')
+
+help_taxid_list = 'The NCBI taxonomy ID(s) to query against the containment_dict metadatabase. Can be in one of ' \
+                    'three forms: a) a single integer, which will be queried and the results output, b) a valid path ' \
+                    'to a text file containing one taxon ID per line, all of which will be queried and the results ' \
+                    'output as tab-delimited text, or c) \'stdin\' in which case a list of taxon_ids is pulled from ' \
+                    'standard input and processed as with a file.'
+
+required_args.add_argument('-t', '--taxids', dest='taxid_list', type=str, default=None, help=help_taxid_list)
+
+help_db_table = 'Database hierarchical table. Can be generated from script make_hierarchy_table.py'
+required_args.add_argument('-d', '--db_table', dest='db_table', type=str, default= script_path + "/main_DBs_con_dict_hierarchical_taxonomy.tsv", help=help_db_table)
+
+help_output = 'output file name. TSV format.'
+required_args.add_argument('-o', '--output', dest='output_file', type=str, default="output_hierarchical_taxonomy.tsv", help=help_output)
+
+args = parser.parse_args()
+
+
+
+taxids = []
+if args.taxid_list is None:
+    #options.parser_store.print_help()
+    sys.exit(1)
+elif args.taxid_list == 'stdin':
+    # while True:
+    #     foo = input()
+    for foo in sys.stdin:
+        if len(foo.strip())==0:
+            break
+        taxids.append(int(foo.strip()))
+elif os.path.isfile(args.taxid_list):
+    tf = open(args.taxid_list, 'r')
+    tfl = tf.readlines()
+    taxids = list(map(lambda x: int(x.strip()), tfl))
+else:
+    try:
+        taxids.append(int(args.taxid_list))
+    except:
+        logging.error('The taxonID list argument \'-t\' must be either a single integer string, or \'stdin\', or'
+                        'a valid file path')
+        #options.parser_store.print_help()
+        sys.exit(1)
+
+#db_table
+if os.path.isfile(args.db_table):
+    large_tax_hi_df = pd.read_csv(args.db_table, sep = "\t", header=0)
+elif os.path.isfile(args.db_table + ".gz"):
+    subprocess.run(['gzip', '-d', args.db_table + ".gz"])
+    if os.path.isfile(args.db_table):
+        large_tax_hi_df = pd.read_csv(args.db_table, sep = "\t", header=0)
+    else:
+        print(str(args.db_table) + " not found")
+        exit()
+else:
+    print(str(args.db_table) + " not found")
+    exit()
+
+### have to rename because "class" is a no-no column name
+large_tax_hi_df = large_tax_hi_df.rename(columns={"class": "tax_class" })
+
+### one column DF
+DBs_df = pd.DataFrame({"DB": large_tax_hi_df.DB.unique()})
+
+### actual querying
+for taxid in taxids:
+    tmp = large_tax_hi_df.query('(TaxID == @taxid) or (kingdom == @taxid) or (phylum == @taxid) or (order == @taxid) or (family == @taxid) or (genus == @taxid) or (species == @taxid)') \
+    .groupby('DB').count().reset_index()[['DB', 'TaxID']].rename(columns={"TaxID": "ID_" + str(taxid)})
+    DBs_df = pd.merge(DBs_df, tmp, on="DB", how='left')
+
+### remove decimals
+pd.options.display.float_format = '{:,.0f}'.format
+
+### current output print to terminal
+print(DBs_df.fillna(0))
+
+DBs_df.fillna(0).to_csv(args.output_file, sep="\t", index=False)
+'''
+NOTES
+On my macbook pro, it took 3 seconds to query 50 taxids and 42 seconds to query 1000 taxids
+
+SUGGESTIONS/TO-DO
+1. Change output file format and/or print the result in a more compact format ala the other tool
+2. Query the taxdump file to get taxon level "species, genus, family, etc" for each query taxid, similar to other script and set the column names to, for example Genus_816 instead of ID_816 (line 96)
+ -- Mike Tisza, 2022-10-12
+'''