diff --git a/README.md b/README.md index 7b7a8e6..a749cc4 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Supported database build programs * krakenuniq * centrifuge -The flextaxd (flextaxd) script allows customization of databases from NCBI, QIIME or CanSNPer source formats and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation). +The flextaxd (flextaxd) script allows customization of databases from several sources (See supported source formats above) and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation). ### Create database * Create your database from source files --taxonomy_file * and select --taxonomy_type [supported formats] diff --git a/flextaxd/create_databases.py b/flextaxd/create_databases.py index cd6792f..2fb99dd 100644 --- a/flextaxd/create_databases.py +++ b/flextaxd/create_databases.py @@ -91,6 +91,7 @@ def __init__(self, message,expression=""): download_opts.add_argument('-p', '--processes',metavar="",type=int, default = 8, help="Use multiple cores for downloading genomes and kraken if -kp is not set") download_opts.add_argument('--download', action='store_true', help="Download additional sequences") download_opts.add_argument('-r', '--representative', action='store_true', help="Download GTDB representative genomes") + download_opts.add_argument('--download_file', default=False, help="Download genomes from file (path to file)") download_opts.add_argument('--rep_path', metavar="URL", default=latest_genome_reps, help="Specify GTDB representative version URL full path") download_opts.add_argument('--force_download', action='store_true', help="Download sequences from genbank if not in refseq (WARNING: might include genome withdrawals)") download_opts.add_argument('--genomes_path', metavar="",default=None, help='path to genomes') @@ -125,6 +126,8 @@ def __init__(self, message,expression=""): if not os.path.exists(args.database): raise FileNotFoundError("No database file could be found, please provide a FlexTaxD database to run FlexTaxD!") + if args.create_db and not args.genomes_path: + raise InputError("genomes_path parameter was not given") if not os.path.exists(args.genomes_path): raise FileNotFoundError("Directory {path} does not exist".format(path=args.genomes_path)) @@ -186,16 +189,19 @@ def __init__(self, message,expression=""): process_directory_obj = process_directory(args.database) genomes, missing = process_directory_obj.process_folder(args.genomes_path) ''' 2. Download missing files''' - if args.download or args.representative: + if args.download or args.representative or args.download_file: download = dynamic_import("modules", "DownloadGenomes") download_obj = download(args.processes,outdir=args.outdir,force=args.force_download,download_path=args.genomes_path) - new_genome_path, missing = download_obj.run(missing,representative=args.representative,url=args.rep_path) - if not new_genome_path: - still_missing = missing - if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing))) + if args.download_file: + download_obj.download_from_file(args.download_file) else: - new_genomes, missing = process_directory_obj.process_folder(new_genome_path) - genomes += new_genomes + new_genome_path, missing = download_obj.run(missing,args.rep_path) + if not new_genome_path: + still_missing = missing + if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing))) + else: + new_genomes, missing = process_directory_obj.process_folder(new_genome_path) + genomes += new_genomes else: if len(missing) > 0: logger.info("Genome annotations with no matching source: {nr}".format(nr=len(missing))) diff --git a/flextaxd/custom_taxonomy_databases.py b/flextaxd/custom_taxonomy_databases.py index 6feaabe..a83d06b 100644 --- a/flextaxd/custom_taxonomy_databases.py +++ b/flextaxd/custom_taxonomy_databases.py @@ -20,7 +20,7 @@ each column (not |). ''' -__version__ = "0.4.2" +__version__ = "0.4.3" __author__ = "David Sundell" __credits__ = ["David Sundell"] __license__ = "GPLv3" @@ -121,10 +121,12 @@ def __init__(self, message,expression=""): mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications") mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to") mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source') + #mod_opts.add_argument('-un', '--update_names', metavar="",default=None, help='Update node names using old to new name file.') mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch") mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!") mod_opts.add_argument('--clean_database', action='store_true', help="Clean up database from unannotated nodes") mod_opts.add_argument('--skip_annotation', action='store_true', help="Do not automatically add annotation when creating GTDB database") + mod_opts.add_argument('--refdatabase', metavar="", default=False, help="For download command, give value of expected source, default (refseq)") out_opts = parser.add_argument_group('output_opts', "Output options") @@ -132,6 +134,8 @@ def __init__(self, message,expression=""): out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)") out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken") out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers") + out_opts.add_argument('--dump_genomes', action='store_true', default=False, help="Print list of genomes (and source) to file") + out_opts.add_argument('--dump_genome_annotations', action='store_true', default=False, help="Add genome taxid annotation to genomes dump") vis_opts = parser.add_argument_group('vis_opts', "Visualisation options") vis_opts.add_argument('--visualise_node', metavar='', default=False, help="Visualise tree from selected node") @@ -261,6 +265,16 @@ def __init__(self, message,expression=""): modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base) modify_obj.clean_database(ncbi=ncbi) + '''Dump option, export list of genomes, added in flextaxd version 0.4.2''' + if args.dump_genomes: + logger.info("Dump list of genomes") + write_module = dynamic_import("modules", "WriteTaxonomy") + write_obj = write_module(args.outdir, database=args.database,prefix=args.dump_prefix,separator=args.dump_sep,minimal=args.dump_mini,desc=args.dump_descriptions,dbprogram=args.dbprogram,dump_genomes=True) + if args.dump_genome_annotations: + write_obj.dump_genome_annotations() + else: + write_obj.dump_genomes() + ''' 0. Create taxonomy database (if it does not exist)''' if args.taxonomy_file: if not os.path.exists(args.database) or force: @@ -304,6 +318,11 @@ def __init__(self, message,expression=""): modify_obj = modify_module(database=args.database, update_genomes=True,taxid_base=args.taxid_base) modify_obj.update_annotations(genomeid2taxid=args.genomeid2taxid) + # if args.update_names: + # modify_module = dynamic_import("modules", "ModifyTree") + # modify_obj = modify_module(database=args.database, update_node_names=True,taxid_base=args.taxid_base) + # modify_obj.update_node_names(args.update_names) + if (args.mod_file or args.mod_database) and args.clean_database: modify_module = dynamic_import("modules", "ModifyTree") modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base) diff --git a/flextaxd/modules/DownloadGenomes.py b/flextaxd/modules/DownloadGenomes.py index 470424c..eb67b97 100644 --- a/flextaxd/modules/DownloadGenomes.py +++ b/flextaxd/modules/DownloadGenomes.py @@ -172,6 +172,16 @@ def download_files(self,files): logger.error("Could not stop all subprocesses check your process manager and end them manually!") return self.not_downloaded + def download_from_file(self,inputfile,exclude="",all=False): + '''Download files with ncbis download script using an accession input file''' + if not all: + exclude = " ".join([exclude,"--exclude-gff3 --exclude-protein --exclude-rna --exclude-genomic-cds"]) + args = ["genome", "accession" ,"--inputfile", input_file, exclude"] + p = Popen(args, stdout=PIPE,cwd=self.location) + (output, err) = p.communicate() + p_status = p.wait() + return self.location + def run(self, files, representative=False,url=""): '''Download list of GCF and or GCA files from NCBI or download represenative genomes diff --git a/flextaxd/modules/ModifyTree.py b/flextaxd/modules/ModifyTree.py index 2c9c571..758319e 100644 --- a/flextaxd/modules/ModifyTree.py +++ b/flextaxd/modules/ModifyTree.py @@ -55,7 +55,7 @@ def __init__(self, message): class ModifyTree(object): """docstring for ModifyTree.""" - def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, separator="\t",verbose=False,parent=False,replace=False,**kwargs): + def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, update_node_names=False,separator="\t",verbose=False,parent=False,replace=False,**kwargs): super(ModifyTree, self).__init__() self.verbose = verbose logger.info("Modify Tree") @@ -63,6 +63,8 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, c self.taxonomy = {} self.names = {} + self.keep_rank = {} ## Place holder for parent rank + self.parent=parent self.replace = replace self.ncbi_order = True @@ -92,7 +94,7 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, c self.modsource = self.parse_modification(self.moddb,"database") elif mod_file: self.modsource = self.parse_modification(mod_file,"file") - elif update_genomes or clean_database: + elif update_genomes or clean_database or update_node_names: pass else: raise InputError("No modification source could be found both mod_database and mod_file file are empty!") @@ -163,8 +165,16 @@ def _parse_new_links(self, parent=None,child=None,rank="no rank"): self.new_nodes.add(parent_i) child_i = self.get_id(child) self.new_nodes.add(child_i) - - rank_i = self.add_rank(rank) + '''If node has no level (no rank), check if parent database had a classified level. If so default add level''' + try: + level = self.parent_levels[int(child_i)] + logger.info("Rank kept for {node}, {rank}".format(node=child, rank=level)) + except: + level = False + if not level: + rank_i = self.add_rank(rank) + else: + rank_i = level self.new_links.add((parent_i,child_i,rank_i)) return @@ -263,17 +273,15 @@ def parse_modification(self, input,modtype="database"): if modtype == "database": modparent = self.moddb.get_parent(self.moddb.get_id(self.parent)) logger.info("{n} existing links to {parent} ({parentlinks}) ({modparent})".format(n=len(self.existing_links),parent=self.parent,parentlinks=parentlinks,modparent=modparent)) - + self.parent_levels = self.keep_levels(self.existing_links | parentlinks | set(self.taxonomydb.get_links((self.existing_nodes & self.new_nodes)))) if modtype == "database": self.mod_genomes = self.database_mod(input,self.parent) elif modtype == "file": self.file_mod(input) else: raise InputError("Wrong modification input database or file must be supplied") - ### get links from current database self.old_nodes = self.existing_nodes - self.new_nodes - logger.info("nodes:") logger.info("old: {old}".format(old=len(self.old_nodes))) logger.info("new: {new}".format(new=len(self.new_nodes))) @@ -284,7 +292,6 @@ def parse_modification(self, input,modtype="database"): self.overlapping_links = self.existing_links & self.new_links ## (links existing in both networks) self.old_links = self.existing_links - self.new_links - logger.info("links:") logger.info("old: {old}".format(old=len(self.old_links))) logger.info("new: {new}".format(new=len(self.new_nodes))) @@ -295,21 +302,27 @@ def parse_modification(self, input,modtype="database"): '''Get all genomes annotated to new nodes in existing database''' return True - def update_annotations(self, genomeid2taxid): + def update_annotations(self, genomeid2taxid, reference=False): '''Function that adds annotation of genome ids to nodes''' logger.info("Update genome to taxid annotations using {genomeid2taxid}".format(genomeid2taxid=genomeid2taxid)) + _ref = reference update = { "set_column": "id", "where_column": "genome", "set_value": "", "where": "" } + #if _ref: + # update[""] updated = 0 added = 0 with open(genomeid2taxid) as f: for row in f: try: - genome,name = row.strip().split(self.sep) + if len(row.strip().split(self.sep)) > 2: + genome,name,reference = row.strip().split(self.sep) + else: + genome,name = row.strip().split(self.sep) except ValueError: genome,name = row.strip().split(" ") logger.debug("genome: {genome}, name: {name}".format(genome=genome,name=name)) @@ -386,6 +399,36 @@ def update_genomes(self): self.taxonomydb.commit() return + def update_node_names(self, refdict): + '''Function that adds annotation of genome ids to nodes''' + logger.info("Update node names in the database from {refdict}".format(refdict=refdict)) + update = { + "set_column": "name", + "where_column": "name", + "set_value": "", + "where": "" + } + updated = 0 + added = 0 + with open(refdict) as f: + for row in f: + try: + old_name,name = row.strip().split(self.sep) + except ValueError: + old_name,name = row.strip().split(" ") + logger.debug("old_name: {old_name}, name: {name}".format(old_name=old_name,name=name)) + ## If no exception occured add old_name + update["set_value"] = name + update["where"] = old_name.strip() + res = self.taxonomydb.update_table(update,table="nodes") + if self.taxonomydb.rowcount()!=0: + if res: + updated += 1 + self.taxonomydb.commit() + gid = self.taxonomydb.get_genomes() + logger.info("{updated} annotations were updated!".format(added=added, updated=updated)) + return + def clean_database(self, ncbi=False): '''Function that removes all node and node paths without annotation''' logger.info("Fetch annotated nodes") @@ -404,7 +447,7 @@ def clean_database(self, ncbi=False): logger.info("Parents added: {an}".format(an=len(self.annotated_nodes)-an)) if ncbi: logger.info("Keep main nodes of the NCBI taxonomy (parents on level 3 and above)") - self.keep = set(self.taxonomydb.get_children([1],maxdepth=1)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)]) + self.keep = set(self.taxonomydb.get_children([1],maxdepth=3)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)]) logger.info("Adding root levels {nlev}".format(nlev=len(self.keep-self.annotated_nodes))) self.annotated_nodes |= self.keep '''Get all links related to an annotated node and its parents''' @@ -434,6 +477,14 @@ def clean_database(self, ncbi=False): self.taxonomydb.query("vacuum") logger.info("Database is cleaned!") + def keep_levels(self, links): + parent_levels = {} + for c,p,l in links: + if self.rank[l] != 1: + parent_levels[p] = l + print(parent_levels) + return parent_levels + def update_database(self): '''Update the database file''' if self.replace: @@ -446,6 +497,7 @@ def update_database(self): logger.info("Replace tree, deleting all nodes downstream of selected parent!") if len(self.old_links | self.non_overlapping_old_links-set(self.parent_link)) > 0: logger.debug("Delete links no longer valid!") + '''Add function to keep taxonomy level from previous database (belongs to parent)''' self.taxonomydb.delete_links((self.old_links | self.non_overlapping_old_links)-set(self.parent_link)) if len(self.old_nodes): logger.debug("Delete nodes!") diff --git a/flextaxd/modules/ProcessDirectory.py b/flextaxd/modules/ProcessDirectory.py index c216aec..4dae1be 100644 --- a/flextaxd/modules/ProcessDirectory.py +++ b/flextaxd/modules/ProcessDirectory.py @@ -23,6 +23,7 @@ def __init__(self, database,limit=False): self.genome_names = [] self.genome_path_dict = {} self.files = [] + #logger.debug(self.genome_id_dict) def get_genome_names(self): ''' @@ -73,10 +74,6 @@ def is_gcf_gca(self,fname,debug=False): ''' try: GCX,END,REST = fname.split("_",2) ## If a name contains anything after the GCF number remove this if split by _ - if GCX in ["GB","RS"]: - return False ## GTDB stype input treat as non GCF - #GCX = END - #END,REST = REST.split("_",1) #remove GB or RS definitions if debug: logger.debug("[{} {} {}]".format(GCX,END,REST)) NUM,version = END.split(".",1) @@ -162,6 +159,7 @@ def walk_directory(self,folder_path): if not folder_path: raise IOError("Parameter --genomes_path was not set".format(folder_path)) logger.info("Process genome path ({path})".format(path=folder_path)) + logger.debug("Extensions valid: ({f})".format(f=self.ext)) for root, dirs, files in os.walk(folder_path,followlinks=True): for file in files: fname = file.rstrip(".gz") ## remove gz if present @@ -177,6 +175,8 @@ def walk_directory(self,folder_path): logger.info("Processed {count} genomes".format(count=count)) self.files = list(set(self.files)) self.genome_names = list(set(self.genome_names)) + if len(self.genome_names) == 0: + logger.info("# WARNING: No genomes from the input folder was added! Are your genomes annotated properly in the database?") return self.files, self.genome_names def process_folder(self,folder_path): diff --git a/flextaxd/modules/ReadTaxonomy.py b/flextaxd/modules/ReadTaxonomy.py index 019aa74..90e39ea 100644 --- a/flextaxd/modules/ReadTaxonomy.py +++ b/flextaxd/modules/ReadTaxonomy.py @@ -141,16 +141,23 @@ def read_nodes(self, treefile=False): self.length +=1 self.database.commit() - def parse_genomeid2taxid(self,genomeid2taxid): + def parse_genomeid2taxid(self,genomeid2taxid,reference=False): '''Parse file that annotates genome_id´s to nodes in the tree''' nodeDict = self.database.get_nodes() + _ref = reference with self.zopen(genomeid2taxid,"rt") as f: headers = f.readline().strip().split("\t") for row in f: if row.strip() != "": ## If there are trailing empty lines in the file - genomeid,taxid = row.strip().split("\t") try: - self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()]) + genomeid,taxid = row.strip().split("\t") + except: + if not _ref: ## override if there is a reference in file, and use given ref + genomeid,taxid,reference = row.strip().split("\t") + else: + genomeid,taxid,override = row.strip().split("\t") + try: + self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()],reference=reference) except KeyError: logger.warning("# WARNING: {taxid} not found in the database".format(taxid=taxid)) self.database.commit() diff --git a/flextaxd/modules/ReadTaxonomyNCBI.py b/flextaxd/modules/ReadTaxonomyNCBI.py index 7364f28..d4176c2 100644 --- a/flextaxd/modules/ReadTaxonomyNCBI.py +++ b/flextaxd/modules/ReadTaxonomyNCBI.py @@ -83,7 +83,7 @@ def parse_genebank_file(self,filepath,filename): self.refseqid_to_GCF[refseqid] = genebankid return - def parse_genomeid2taxid(self, genomes_path,annotation_file): + def parse_genomeid2taxid(self, genomes_path,annotation_file,reference="refseq"): '''To allow NCBI databases to be build from scratch the sequences names needs to be stored in the database, this function parses the accession2taxid file from NCBI to speed up the function and reduce the amount of stored datata only sequences in input genomes_path will be fetched @@ -116,7 +116,7 @@ def parse_genomeid2taxid(self, genomes_path,annotation_file): logger.info("Error on first line in annotation file, check format!") try: genebankid = self.refseqid_to_GCF[refseqid] - self.database.add_genome(genome=genebankid,_id=taxid.decode("utf-8")) + self.database.add_genome(genome=genebankid,_id=taxid.decode("utf-8"),reference=reference) annotated_genome.add(refseqid) except KeyError: pass diff --git a/flextaxd/modules/ReadTaxonomyQIIME.py b/flextaxd/modules/ReadTaxonomyQIIME.py index d331614..46a528c 100644 --- a/flextaxd/modules/ReadTaxonomyQIIME.py +++ b/flextaxd/modules/ReadTaxonomyQIIME.py @@ -4,23 +4,22 @@ Read QIIME formatted taxonomy files and holds a dictionary with taxonomy tree and name translation ''' -from .ReadTaxonomy import ReadTaxonomy,InputError +from .ReadTaxonomy import ReadTaxonomy from .database.DatabaseConnection import DatabaseFunctions import logging logger = logging.getLogger(__name__) class ReadTaxonomyQIIME(ReadTaxonomy): """docstring for ReadTaxonomyQIIME.""" - def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose=False, taxid_base=1,skip_annotation=False): - super(ReadTaxonomyQIIME, self).__init__(database=database,verbose=verbose) - #self.database = DatabaseFunctions(database,verbose=verbose) + def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose=False, taxid_base=1,**kwargs): + super(ReadTaxonomyQIIME, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose,**kwargs) + #self.database = DatabaseFunctions(database,verbose=verbose) # Not nessesary opens in parent class self.input = taxonomy_file self.names = {} self.taxid_base = taxid_base - #self.taxonomy = {} + self.taxonomy = {} self.length = 0 self.ids = 0 - self.skip_annotation = skip_annotation self.levelDict = { "n": "no rank", "sk": "superkingdom", @@ -36,7 +35,7 @@ def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose } self.set_qiime(True) ### Add root name these manual nodes are required when parsing the GTDB database! - rootid = self.root ## Allways set in ReadTaxonomy + #rootid = self.add_node("root") ## Allways set in ReadTaxonomy coid = self.add_node("cellular organisms") bac_id = self.add_node("Bacteria") Euk_id = self.add_node("Eukaryota") @@ -48,14 +47,14 @@ def __init__(self, taxonomy_file=False, names_dmp=False, database=False, verbose self.add_rank("n",qiime=True) self.add_rank("sk",qiime=True) ## Add basic links - #self.add_link(child=rootid, parent=rootid,rank="n") ## Already done in ReadTaxonomy - self.add_link(child=coid, parent=rootid,rank="n") + self.add_link(child=self.root, parent=self.root,rank="n") + self.add_link(child=coid, parent=self.root,rank="n") self.add_link(child=bac_id, parent=coid,rank="sk") self.add_link(child=Euk_id, parent=coid,rank="sk") self.add_link(child=arc_id, parent=coid,rank="sk") - self.add_link(child=vir_id, parent=rootid,rank="sk") - self.add_link(child=oth_id, parent=rootid,rank="n") - self.add_link(child=unc_id, parent=rootid, rank="n") + self.add_link(child=vir_id, parent=self.root,rank="sk") + self.add_link(child=oth_id, parent=self.root,rank="n") + self.add_link(child=unc_id, parent=self.root, rank="n") def parse_taxonomy(self): '''Parse taxonomy information''' @@ -93,59 +92,54 @@ def parse_description(self,tree,current_i): '''Retrieve node description from QIIME formatted tree''' current_level = tree[current_i] level, description = current_level.split("__") + '''Fix greengenes bug with empty end nodes''' + if description == "": + level, description = self.parse_description(tree,current_i+1) return level,description - def qiime_to_tree(self, sep="\t"): + + def qiime_to_tree(self, sep="\t",reference=False): '''Read the qiime format file and parse out the relation tree (nodes.dmp)''' self.sep = sep self.tree = set() self.missed = 0 self.errors = 0 self.added = 0 + _ref = reference + refDict = {"RS":"refseq","GB":"genbank"} ## Refdict for GTDB formatted sources taxid_start = self.taxid_base - raiseWarning = False - if self.skip_annotation: - logger.info("Skip annotation is true, annotations in QIIME input file will be ignored.") with open(self.input) as f: '''Each row defines a genome annotation file connected to a tree level''' for row in f: if row.strip() != "": ## If there are trailing empty lines in the file data = row.strip().split("\t") - if self.skip_annotation: - genome_id = False - elif len(data) != 2: ## If the QIIME file is annotated it will contain two columns - if len(data) > 2: - raise InputError("The input format is not correct for QIIME. THe QIIME source format requires one [QIIME-tree] or two columns [gene_id\\tQIIME-tree]") - raiseWarning = True - genome_id = False - else: ## Annotation is given in column one - try: - if data[0].startswith(("RS","GB")): - '''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this''' - genome_id = data[0].split("_",1)[1].strip() ## Genome ID - else: - genome_id = data[0].strip() - #print(genome_id) - except IndexError: - logger.debug("Row {row} could not be parsed".format(row=data)) - self.errors +=1 + try: + if data[0].startswith(("RS","GB")): + '''GTDB genome annotations contain one additional annotation to their genome names eg. RS_, this function removes this''' + reference,genome_id = data[0].split("_",1) ## Genome ID + genome_id = genome_id.strip() + if not _ref: + reference = refDict[reference] + else: + genome_id = data[0].strip() + '''Greengenes adaption, empty levels''' + #print(genome_id) + except IndexError: + logger.debug("Row {row} could not be parsed".format(row=data)) + self.errors +=1 ### Walk through tree and make sure all nodes back to root are annotated! taxonomy = list(reversed(data[-1].split(";"))) taxonomy_i = self.parse_tree(taxonomy) - if taxonomy_i and genome_id: - test = self.database.add_genome(genome=genome_id,_id=taxonomy_i) + if taxonomy_i: + test = self.database.add_genome(genome=genome_id,_id=taxonomy_i,reference=reference) if not str(test).startswith("UNIQUE"): self.added +=1 - elif not genome_id: ## No genome id suggest a QIIME file with no annotation - pass else: logger.debug("Warning taxonomy: {taxonomy} could not be parsed!!") self.missed +=1 self.database.commit() self.length = self.taxid_base - taxid_start - if raiseWarning: logger.warning("Warning no genomeid2taxid file given and your QIIME formatted file does not seem to contain annotations!") - if not self.skip_annotation: - logger.info("Genomes added to database: {genomes}".format(genomes=self.added)) - logger.debug("Genomes not added to database {missed} errors {errors}".format(missed=self.missed,errors=self.errors)) + logger.info("Genomes added to database: {genomes}".format(genomes=self.added)) + logger.debug("Genomes not added to database {missed} errors {errors}".format(missed=self.missed,errors=self.errors)) logger.info("New taxonomy ids assigned {taxidnr}".format(taxidnr=self.length)) diff --git a/flextaxd/modules/WriteTaxonomy.py b/flextaxd/modules/WriteTaxonomy.py index 204b648..d00e786 100644 --- a/flextaxd/modules/WriteTaxonomy.py +++ b/flextaxd/modules/WriteTaxonomy.py @@ -10,7 +10,7 @@ class WriteTaxonomy(object): """docstring for WriteTaxonomy.""" - def __init__(self, path, database=".taxonomydb",separator="\t|\t",minimal=False,prefix="names,nodes",desc=False,dbprogram=None): + def __init__(self, path, database=".taxonomydb",separator="\t|\t",minimal=False,prefix="names,nodes",desc=False,dbprogram=None,dump_genomes=False): super(WriteTaxonomy, self).__init__() self.database = DatabaseFunctions(database) logging.debug("Write settings: ") @@ -39,6 +39,26 @@ def __init__(self, path, database=".taxonomydb",separator="\t|\t",minimal=False, self.link_order = False ## Default print is NCBI structure with child in the first column logging.debug("NCBI structure (child first): {parent}".format(parent=self.link_order)) + def dump_genomes(self): + '''Write the list of annotated genomes to a file''' + with open('{}{}.dmp'.format(self.path,"genomes"),"w") as outputfile: + genomes = self.get_all('genomes', 'genome,reference', sort="reference") + for genome in genomes: + print(*genome, sep="\t", end="\n", file=outputfile) + return + + def dump_genome_annotations(self, sort="reference"): + '''Dump all genomes, including their taxonomy reference (will work as input file for genomeid2taxid)''' + select = "genome,name,reference" + QUERY = "SELECT {select} FROM genomes JOIN nodes ON nodes.id=genomes.id".format(select=select) + if sort: + QUERY += " ORDER BY {col} DESC".format(col=sort) + logging.debug(QUERY) + with open('{}{}.dmp'.format(self.path,"genomes"),"w") as outputfile: + genomes = self.database.query(QUERY).fetchall() + for genome in genomes: + print(*genome, sep="\t", end="\n", file=outputfile) + return def set_separator(self,sep): self.separator=sep @@ -63,8 +83,10 @@ def set_prefix(self,prefix): logging.debug("Update output prefix nodes:{nodes} names:{names} ".format(nodes=self.prefix[1],names=self.prefix[0])) return self.prefix - def get_all(self, table, select="*"): + def get_all(self, table, select="*", sort=False): QUERY = "SELECT {select} FROM {table}".format(select=select, table=table) + if sort: + QUERY += " ORDER BY {col} DESC".format(col=sort) logging.debug(QUERY) return self.database.query(QUERY).fetchall() diff --git a/flextaxd/modules/database/CreateDatabase.py b/flextaxd/modules/database/CreateDatabase.py index 905b324..ea22ddf 100644 --- a/flextaxd/modules/database/CreateDatabase.py +++ b/flextaxd/modules/database/CreateDatabase.py @@ -41,6 +41,7 @@ def __init__(self, verbose=False): self.sql_create_genomes_table = """ CREATE TABLE IF NOT EXISTS genomes ( id integer NOT NULL, genome VARCHAR(255) NOT NULL UNIQUE, + reference VARCHAR(255), FOREIGN KEY (id) REFERENCES nodes (id) ); """ diff --git a/flextaxd/modules/database/DatabaseConnection.py b/flextaxd/modules/database/DatabaseConnection.py index 8dfab23..4788a53 100644 --- a/flextaxd/modules/database/DatabaseConnection.py +++ b/flextaxd/modules/database/DatabaseConnection.py @@ -350,7 +350,7 @@ def check_parent(self): raise TreeError("Node: {node} has more than one parent!".format(node=name)) '''Get functions of class''' - def get_all(self, database=False, table=False): + def get_all(self, database=False, table=False,sort=False): '''Get full table from table ------ @@ -360,6 +360,8 @@ def get_all(self, database=False, table=False): if not table: raise Exception("Table was not supplied to DatabaseFuction get_all()") QUERY = '''SELECT * FROM {table}'''.format(table) + if sort: + QUERY += " ORDER BY {col} ASC".format(sort=sort) return self.query(QUERY).fetchall() def get_taxid_base(self): @@ -503,7 +505,7 @@ def add_link(self, child=None, parent=None, rank=1, table="tree"): logger.debug("link added: child {}, parent {}, rank {} ".format(child,parent,rank)) return self.insert(info, table="tree") - def add_genome(self, genome, _id=False): + def add_genome(self, genome, _id=False,reference=False): '''Add genome annotation to nodes Returns @@ -513,8 +515,12 @@ def add_genome(self, genome, _id=False): info = { "genome": genome } + if _id: info["id"] = _id + if reference: + info["reference"] = reference + logger.debug(info) return self.insert(info, table="genomes") def add_links(self,links, table="tree",hold=False): @@ -799,6 +805,15 @@ def get_id(self,name): raise NameError("Name not found in the database! {name}".format(name=name)) return res + def update_table(self,data,table): + '''Add annotation to table + + Returns + ------ + see update responses + ''' + return self.update(data, table=table) + def update_genome(self,data): '''Add genome annotation to nodes @@ -806,4 +821,4 @@ def update_genome(self,data): ------ see update responses ''' - return self.update(data, table="genomes") + return self.update_table(data, table="genomes")