Merge pull request #54 from FOI-Bioinformatics/v0.4.3

V0.4.3 * New Features ** Added function that retain defined levels (if merging database does not have levels) ** Added function to download genomes from a list of genomes ** Added function to export genomes from the database to a list (see above function) * Bug fixes ** Double root id bug resolved ** Bug reading GTDB files resolved * Other ** The database now keeps the information from which source the genome annotation is expected to be found (refseq/genbank)
FOI-Bioinformatics · Jul 13, 2022 · c6cf4ab · c6cf4ab
2 parents 423d49f + 1ad2986
commit c6cf4ab
Show file tree

Hide file tree

Showing 12 changed files with 202 additions and 76 deletions.
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ Supported database build programs
 * krakenuniq
 * centrifuge
 
-The flextaxd (flextaxd) script allows customization of databases from NCBI, QIIME or CanSNPer source formats and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation).
+The flextaxd (flextaxd) script allows customization of databases from several sources (See supported source formats above) and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation).
 ### Create database
 * Create your database from source files --taxonomy_file
 * and select --taxonomy_type [supported formats]

diff --git a/flextaxd/create_databases.py b/flextaxd/create_databases.py
@@ -91,6 +91,7 @@ def __init__(self,  message,expression=""):
 	download_opts.add_argument('-p', '--processes',metavar="",type=int, default = 8, help="Use multiple cores for downloading genomes and kraken if -kp is not set")
 	download_opts.add_argument('--download', action='store_true', help="Download additional sequences")
 	download_opts.add_argument('-r', '--representative',   action='store_true', help="Download GTDB representative genomes")
+	download_opts.add_argument('--download_file', default=False, help="Download genomes from file (path to file)")
 	download_opts.add_argument('--rep_path', metavar="URL", default=latest_genome_reps, help="Specify GTDB representative version URL full path")
 	download_opts.add_argument('--force_download', action='store_true', help="Download sequences from genbank if not in refseq (WARNING: might include genome withdrawals)")
 	download_opts.add_argument('--genomes_path', metavar="",default=None,  help='path to genomes')
@@ -125,6 +126,8 @@ def __init__(self,  message,expression=""):
 	if not os.path.exists(args.database):
 		raise FileNotFoundError("No database file could be found, please provide a FlexTaxD database to run FlexTaxD!")
 
+	if args.create_db and not args.genomes_path:
+		raise InputError("genomes_path parameter was not given")
 	if not os.path.exists(args.genomes_path):
 		raise FileNotFoundError("Directory {path} does not exist".format(path=args.genomes_path))
 
@@ -186,16 +189,19 @@ def __init__(self,  message,expression=""):
 		process_directory_obj = process_directory(args.database)
 		genomes, missing = process_directory_obj.process_folder(args.genomes_path)
 		''' 2. Download missing files'''
-		if args.download or args.representative:
+		if args.download or args.representative or args.download_file:
 			download = dynamic_import("modules", "DownloadGenomes")
 			download_obj = download(args.processes,outdir=args.outdir,force=args.force_download,download_path=args.genomes_path)
-			new_genome_path, missing = download_obj.run(missing,representative=args.representative,url=args.rep_path)
-			if not new_genome_path:
-				still_missing = missing
-				if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
+			if args.download_file:
+				download_obj.download_from_file(args.download_file)
 			else:
-				new_genomes, missing = process_directory_obj.process_folder(new_genome_path)
-				genomes += new_genomes
+				new_genome_path, missing = download_obj.run(missing,args.rep_path)
+				if not new_genome_path:
+					still_missing = missing
+					if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
+				else:
+					new_genomes, missing = process_directory_obj.process_folder(new_genome_path)
+					genomes += new_genomes
 		else:
 			if len(missing) > 0:
 				logger.info("Genome annotations with no matching source: {nr}".format(nr=len(missing)))

diff --git a/flextaxd/custom_taxonomy_databases.py b/flextaxd/custom_taxonomy_databases.py
@@ -20,7 +20,7 @@
 each column (not <tab>|<tab>).
 '''
 
-__version__ = "0.4.2"
+__version__ = "0.4.3"
 __author__ = "David Sundell"
 __credits__ = ["David Sundell"]
 __license__ = "GPLv3"
@@ -121,17 +121,21 @@ def __init__(self,  message,expression=""):
     mod_opts.add_argument('-md', '--mod_database', metavar="",default=False,            help="Database file containing modifications")
     mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False,         help="File that lists which node a genome should be assigned to")
     mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None,             help='Path to genome folder is required when using NCBI_taxonomy as source')
+    #mod_opts.add_argument('-un', '--update_names', metavar="",default=None,             help='Update node names using old to new name file.')
     mod_opts.add_argument('-p', '--parent',metavar="", default=False,                   help="Parent from which to add (replace see below) branch")
     mod_opts.add_argument('--replace', action='store_true',                             help="Add if existing children of parents should be removed!")
     mod_opts.add_argument('--clean_database',	action='store_true',                    help="Clean up database from unannotated nodes")
     mod_opts.add_argument('--skip_annotation',	action='store_true',                    help="Do not automatically add annotation when creating GTDB database")
+    mod_opts.add_argument('--refdatabase', metavar="", default=False,                   help="For download command, give value of expected source, default (refseq)")
 
 
     out_opts = parser.add_argument_group('output_opts', "Output options")
     out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__,  help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
     out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes",                       help="change dump prefix reqires two names default(names,nodes)")
     out_opts.add_argument('--dump_sep', metavar="", default="\t|\t",                                help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
     out_opts.add_argument('--dump_descriptions', action='store_true', default=False,                help="Dump description names instead of database integers")
+    out_opts.add_argument('--dump_genomes', action='store_true', default=False,                     help="Print list of genomes (and source) to file")
+    out_opts.add_argument('--dump_genome_annotations', action='store_true', default=False,          help="Add genome taxid annotation to genomes dump")
 
     vis_opts = parser.add_argument_group('vis_opts', "Visualisation options")
     vis_opts.add_argument('--visualise_node', metavar='', default=False,                            help="Visualise tree from selected node")
@@ -261,6 +265,16 @@ def __init__(self,  message,expression=""):
         modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base)
         modify_obj.clean_database(ncbi=ncbi)
 
+    '''Dump option, export list of genomes, added in flextaxd version 0.4.2'''
+    if args.dump_genomes:
+        logger.info("Dump list of genomes")
+        write_module = dynamic_import("modules", "WriteTaxonomy")
+        write_obj = write_module(args.outdir, database=args.database,prefix=args.dump_prefix,separator=args.dump_sep,minimal=args.dump_mini,desc=args.dump_descriptions,dbprogram=args.dbprogram,dump_genomes=True)
+        if args.dump_genome_annotations:
+            write_obj.dump_genome_annotations()
+        else:
+            write_obj.dump_genomes()
+
     ''' 0. Create taxonomy database (if it does not exist)'''
     if args.taxonomy_file:
         if not os.path.exists(args.database) or force:
@@ -304,6 +318,11 @@ def __init__(self,  message,expression=""):
         modify_obj = modify_module(database=args.database, update_genomes=True,taxid_base=args.taxid_base)
         modify_obj.update_annotations(genomeid2taxid=args.genomeid2taxid)
 
+    # if args.update_names:
+    #     modify_module = dynamic_import("modules", "ModifyTree")
+    #     modify_obj = modify_module(database=args.database, update_node_names=True,taxid_base=args.taxid_base)
+    #     modify_obj.update_node_names(args.update_names)
+
     if (args.mod_file or args.mod_database) and args.clean_database:
         modify_module = dynamic_import("modules", "ModifyTree")
         modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base)

diff --git a/flextaxd/modules/DownloadGenomes.py b/flextaxd/modules/DownloadGenomes.py
@@ -172,6 +172,16 @@ def download_files(self,files):
 				logger.error("Could not stop all subprocesses check your process manager and end them manually!")
 		return self.not_downloaded
 
+	def download_from_file(self,inputfile,exclude="",all=False):
+		'''Download files with ncbis download script using an accession input file'''
+		if not all:
+			exclude = " ".join([exclude,"--exclude-gff3 --exclude-protein --exclude-rna --exclude-genomic-cds"])
+		args = ["genome", "accession" ,"--inputfile", input_file, exclude"]
+		p = Popen(args, stdout=PIPE,cwd=self.location)
+		(output, err) = p.communicate()
+		p_status = p.wait()
+		return self.location
+
 	def run(self, files, representative=False,url=""):
 		'''Download list of GCF and or GCA files from NCBI or download represenative genomes
 

diff --git a/flextaxd/modules/ModifyTree.py b/flextaxd/modules/ModifyTree.py
@@ -55,14 +55,16 @@ def __init__(self, message):
 
 class ModifyTree(object):
 	"""docstring for ModifyTree."""
-	def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, separator="\t",verbose=False,parent=False,replace=False,**kwargs):
+	def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, update_node_names=False,separator="\t",verbose=False,parent=False,replace=False,**kwargs):
 		super(ModifyTree, self).__init__()
 		self.verbose = verbose
 		logger.info("Modify Tree")
 		self.sep = separator
 		self.taxonomy = {}
 		self.names = {}
 
+		self.keep_rank = {} ## Place holder for parent rank
+
 		self.parent=parent
 		self.replace = replace
 		self.ncbi_order = True
@@ -92,7 +94,7 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, c
 			self.modsource = self.parse_modification(self.moddb,"database")
 		elif mod_file:
 			self.modsource = self.parse_modification(mod_file,"file")
-		elif update_genomes or clean_database:
+		elif update_genomes or clean_database or update_node_names:
 			pass
 		else:
 			raise InputError("No modification source could be found both mod_database and mod_file file are empty!")
@@ -163,8 +165,16 @@ def _parse_new_links(self, parent=None,child=None,rank="no rank"):
 		self.new_nodes.add(parent_i)
 		child_i = self.get_id(child)
 		self.new_nodes.add(child_i)
-
-		rank_i = self.add_rank(rank)
+		'''If node has no level (no rank), check if parent database had a classified level. If so default add level'''
+		try:
+			level = self.parent_levels[int(child_i)]
+			logger.info("Rank kept for {node}, {rank}".format(node=child, rank=level))
+		except:
+			level = False
+		if not level:
+			rank_i = self.add_rank(rank)
+		else:
+			rank_i = level
 		self.new_links.add((parent_i,child_i,rank_i))
 		return
 
@@ -263,17 +273,15 @@ def parse_modification(self, input,modtype="database"):
 		if modtype == "database":
 			modparent = self.moddb.get_parent(self.moddb.get_id(self.parent))
 			logger.info("{n} existing links to {parent} ({parentlinks}) ({modparent})".format(n=len(self.existing_links),parent=self.parent,parentlinks=parentlinks,modparent=modparent))
-
+		self.parent_levels = self.keep_levels(self.existing_links | parentlinks | set(self.taxonomydb.get_links((self.existing_nodes & self.new_nodes))))
 		if modtype == "database":
 			self.mod_genomes = self.database_mod(input,self.parent)
 		elif modtype == "file":
 			self.file_mod(input)
 		else:
 			raise InputError("Wrong modification input database or file must be supplied")
-
 		### get links from current database
 		self.old_nodes = self.existing_nodes - self.new_nodes
-
 		logger.info("nodes:")
 		logger.info("old: {old}".format(old=len(self.old_nodes)))
 		logger.info("new: {new}".format(new=len(self.new_nodes)))
@@ -284,7 +292,6 @@ def parse_modification(self, input,modtype="database"):
 
 		self.overlapping_links = self.existing_links & self.new_links ## (links existing in both networks)
 		self.old_links = self.existing_links - self.new_links
-
 		logger.info("links:")
 		logger.info("old: {old}".format(old=len(self.old_links)))
 		logger.info("new: {new}".format(new=len(self.new_nodes)))
@@ -295,21 +302,27 @@ def parse_modification(self, input,modtype="database"):
 		'''Get all genomes annotated to new nodes in existing database'''
 		return True
 
-	def update_annotations(self, genomeid2taxid):
+	def update_annotations(self, genomeid2taxid, reference=False):
 		'''Function that adds annotation of genome ids to nodes'''
 		logger.info("Update genome to taxid annotations using {genomeid2taxid}".format(genomeid2taxid=genomeid2taxid))
+		_ref = reference
 		update = {
 			"set_column": "id",
 			"where_column": "genome",
 			"set_value": "",
 			"where": ""
 		}
+		#if _ref:
+		#	update[""]
 		updated = 0
 		added = 0
 		with open(genomeid2taxid) as f:
 			for row in f:
 				try:
-					genome,name = row.strip().split(self.sep)
+					if len(row.strip().split(self.sep)) > 2:
+						genome,name,reference = row.strip().split(self.sep)
+					else:
+						genome,name = row.strip().split(self.sep)
 				except ValueError:
 					genome,name = row.strip().split("    ")
 				logger.debug("genome: {genome}, name: {name}".format(genome=genome,name=name))
@@ -386,6 +399,36 @@ def update_genomes(self):
 		self.taxonomydb.commit()
 		return
 
+	def update_node_names(self, refdict):
+		'''Function that adds annotation of genome ids to nodes'''
+		logger.info("Update node names in the database from {refdict}".format(refdict=refdict))
+		update = {
+			"set_column": "name",
+			"where_column": "name",
+			"set_value": "",
+			"where": ""
+		}
+		updated = 0
+		added = 0
+		with open(refdict) as f:
+			for row in f:
+				try:
+					old_name,name = row.strip().split(self.sep)
+				except ValueError:
+					old_name,name = row.strip().split("    ")
+				logger.debug("old_name: {old_name}, name: {name}".format(old_name=old_name,name=name))
+				## If no exception occured add old_name
+				update["set_value"] = name
+				update["where"] = old_name.strip()
+				res = self.taxonomydb.update_table(update,table="nodes")
+				if self.taxonomydb.rowcount()!=0:
+					if res:
+						updated += 1
+		self.taxonomydb.commit()
+		gid = self.taxonomydb.get_genomes()
+		logger.info("{updated} annotations were updated!".format(added=added, updated=updated))
+		return
+
 	def clean_database(self, ncbi=False):
 		'''Function that removes all node and node paths without annotation'''
 		logger.info("Fetch annotated nodes")
@@ -404,7 +447,7 @@ def clean_database(self, ncbi=False):
 		logger.info("Parents added: {an}".format(an=len(self.annotated_nodes)-an))
 		if ncbi:
 			logger.info("Keep main nodes of the NCBI taxonomy (parents on level 3 and above)")
-			self.keep = set(self.taxonomydb.get_children([1],maxdepth=1))  #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)])
+			self.keep = set(self.taxonomydb.get_children([1],maxdepth=3))  #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)])
 			logger.info("Adding root levels {nlev}".format(nlev=len(self.keep-self.annotated_nodes)))
 			self.annotated_nodes |= self.keep
 		'''Get all links related to an annotated node and its parents'''
@@ -434,6 +477,14 @@ def clean_database(self, ncbi=False):
 			self.taxonomydb.query("vacuum")
 		logger.info("Database is cleaned!")
 
+	def keep_levels(self, links):
+		parent_levels = {}
+		for c,p,l in links:
+			if self.rank[l] != 1:
+				parent_levels[p] = l
+		print(parent_levels)
+		return parent_levels
+
 	def update_database(self):
 		'''Update the database file'''
 		if self.replace:
@@ -446,6 +497,7 @@ def update_database(self):
 				logger.info("Replace tree, deleting all nodes downstream of selected parent!")
 			if len(self.old_links | self.non_overlapping_old_links-set(self.parent_link)) > 0:
 				logger.debug("Delete links no longer valid!")
+				'''Add function to keep taxonomy level from previous database (belongs to parent)'''
 				self.taxonomydb.delete_links((self.old_links | self.non_overlapping_old_links)-set(self.parent_link))
 			if len(self.old_nodes):
 				logger.debug("Delete nodes!")

diff --git a/flextaxd/modules/ProcessDirectory.py b/flextaxd/modules/ProcessDirectory.py
@@ -23,6 +23,7 @@ def __init__(self, database,limit=False):
 		self.genome_names = []
 		self.genome_path_dict = {}
 		self.files = []
+		#logger.debug(self.genome_id_dict)
 
 	def get_genome_names(self):
 		'''
@@ -73,10 +74,6 @@ def is_gcf_gca(self,fname,debug=False):
 		'''
 		try:
 			GCX,END,REST = fname.split("_",2)  ## If a name contains anything after the GCF number remove this if split by _
-			if GCX in ["GB","RS"]:
-				return False ## GTDB stype input treat as non GCF
-				#GCX = END
-				#END,REST = REST.split("_",1)  #remove GB or RS definitions
 			if debug:
 				logger.debug("[{} {} {}]".format(GCX,END,REST))
 			NUM,version = END.split(".",1)
@@ -162,6 +159,7 @@ def walk_directory(self,folder_path):
 		if not folder_path:
 			raise IOError("Parameter --genomes_path was not set".format(folder_path))
 		logger.info("Process genome path ({path})".format(path=folder_path))
+		logger.debug("Extensions valid: ({f})".format(f=self.ext))
 		for root, dirs, files in os.walk(folder_path,followlinks=True):
 			for file in files:
 				fname = file.rstrip(".gz") ## remove gz if present
@@ -177,6 +175,8 @@ def walk_directory(self,folder_path):
 		logger.info("Processed {count} genomes".format(count=count))
 		self.files = list(set(self.files))
 		self.genome_names = list(set(self.genome_names))
+		if len(self.genome_names) == 0:
+			logger.info("# WARNING: No genomes from the input folder was added! Are your genomes annotated properly in the database?")
 		return self.files, self.genome_names
 
 	def process_folder(self,folder_path):

diff --git a/flextaxd/modules/ReadTaxonomy.py b/flextaxd/modules/ReadTaxonomy.py
@@ -141,16 +141,23 @@ def read_nodes(self, treefile=False):
 				self.length +=1
 			self.database.commit()
 
-	def parse_genomeid2taxid(self,genomeid2taxid):
+	def parse_genomeid2taxid(self,genomeid2taxid,reference=False):
 		'''Parse file that annotates genome_id´s to nodes in the tree'''
 		nodeDict = self.database.get_nodes()
+		_ref = reference
 		with self.zopen(genomeid2taxid,"rt") as f:
 			headers = f.readline().strip().split("\t")
 			for row in f:
 				if row.strip() != "": ## If there are trailing empty lines in the file
-					genomeid,taxid = row.strip().split("\t")
 					try:
-						self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()])
+						genomeid,taxid = row.strip().split("\t")
+					except:
+						if not _ref: ## override if there is a reference in file, and use given ref
+							genomeid,taxid,reference = row.strip().split("\t")
+						else:
+							genomeid,taxid,override = row.strip().split("\t")
+					try:
+						self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()],reference=reference)
 					except KeyError:
 						logger.warning("# WARNING: {taxid} not found in the database".format(taxid=taxid))
 			self.database.commit()