Skip to content

Commit

Permalink
Merge pull request #39 from FOI-Bioinformatics/iss33
Browse files Browse the repository at this point in the history
Iss33
  • Loading branch information
davve2 authored Feb 26, 2021
2 parents caaafaf + 93a80cc commit ccea0f9
Show file tree
Hide file tree
Showing 13 changed files with 215 additions and 95 deletions.
15 changes: 11 additions & 4 deletions flextaxd/create_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
__pkgname__="flextaxd-create"
__github__="https://github.com/FOI-Bioinformatics/flextaxd"
from flextaxd.custom_taxonomy_databases import __version__
from flextaxd.modules.functions import read_skip_file
from importlib import import_module
import shutil

Expand Down Expand Up @@ -104,7 +105,7 @@ def __init__(self, message,expression=""):
classifier_opts.add_argument('--params', metavar="", default="", help="Add extra params to create command (supports kraken*)")
classifier_opts.add_argument('--test', action='store_true', help="test database structure, only use 100 seqs")
classifier_opts.add_argument('--keep', action='store_true', help="Keep temporary files")
classifier_opts.add_argument('--skip', metavar="", default="", help="Do not include genomes within this taxonomy (child tree) in the database (works for kraken)")
classifier_opts.add_argument('--skip', metavar="", default="", help="Do not include genomes within this taxonomy (child tree) in the database (works for kraken), can be a file ending with txt and genome ids one per row")
classifier_opts.add_argument('-kp', '--build_processes',metavar="",type=int, default = None, help="Use a different number of cores for kraken classification")


Expand Down Expand Up @@ -156,14 +157,15 @@ def __init__(self, message,expression=""):
logger.info("FlexTaxD-create logging initiated!")
logger.debug("Supported formats: {formats}".format(formats=programs))


'''
Process data
'''
if args.outdir:
if not os.path.exists(args.outdir):
os.system("mkdir -p {outdir}".format(outdir = args.outdir))
skip=False
if os.path.exists("{db_path}/library/library.fna".format(db_path=args.db_name)):
if os.path.exists("{db_path}/library/library.fna".format(db_path=args.db_name)) or os.path.exists("{db_path}/.tmp0.fasta"):
ans = input("Database library file already exist, (u)se library, (o)verwrite (c)ancel? (u o,c): ")
if ans in ["o", "O"]:
logger.info("Overwrite current build progress")
Expand Down Expand Up @@ -209,6 +211,10 @@ def __init__(self, message,expression=""):
if not skip:
genomes = process_directory_obj.get_genome_path_dict()
else: genomes=False
if args.skip:
if args.skip.endswith(".txt"):
args.skip = read_skip_file(args.skip)
logger.info("File passed to skip, {n} genomes and {x} taxids added to skiplist".format(n=len(args.skip["genome_id"]),x=len(args.skip["tax_id"])))
classifierDB = classifier(args.database, args.db_name, genomes,args.outdir,
create_db=args.create_db,
limit=limit,
Expand All @@ -231,8 +237,9 @@ def __init__(self, message,expression=""):
logger.info("Create database")
try:
classifierDB.create_database(args.outdir,args.keep)
except UnboundLocalError:
logger.error("#Error: No kraken database name was given!")
except UnboundLocalError as e:
logger.error("#Error: No database name was given!")
logger.error("#UnboundLocalError "+e)
exit()

logger.debug(report_time(start_time,final=True))
Expand Down
54 changes: 28 additions & 26 deletions flextaxd/custom_taxonomy_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
each column (not <tab>|<tab>).
'''

__version__ = "0.3.5"
__version__ = "0.3.6"
__author__ = "David Sundell"
__credits__ = ["David Sundell"]
__license__ = "GPLv3"
Expand Down Expand Up @@ -103,45 +103,47 @@ def __init__(self, message,expression=""):
required.add_argument('-db', '--database',metavar="", type=str, default="FlexTaxD.db" , help="FlexTaxD taxonomy sqlite3 database file (fullpath)")

basic = parser.add_argument_group('basic', 'Basic commands')
basic.add_argument('-o', '--outdir',metavar="", default=".", help="Output directory")
basic.add_argument("--dump", action='store_true', help="Write database to names.dmp and nodes.dmp")
basic.add_argument('--dump_mini', action='store_true', help="Dump minimal file with tab as separator")
basic.add_argument("--force", action='store_true', help="use when script is implemented in pipeline to avoid security questions on overwrite!")
basic.add_argument('--validate', action='store_true', help="Validate database format")
basic.add_argument('--stats', action='store_true', help="Print some statistics from the database")
basic.add_argument('-o', '--outdir',metavar="", default=".", help="Output directory")
basic.add_argument("--dump", action='store_true', help="Write database to names.dmp and nodes.dmp")
basic.add_argument('--dump_mini', action='store_true', help="Dump minimal file with tab as separator")
basic.add_argument("--force", action='store_true', help="use when script is implemented in pipeline to avoid security questions on overwrite!")
basic.add_argument('--validate', action='store_true', help="Validate database format")
basic.add_argument('--stats', action='store_true', help="Print some statistics from the database")

rmodules = get_read_modules()
read_opts = parser.add_argument_group('read_opts', "Source options")
read_opts.add_argument('-tf', '--taxonomy_file',metavar="", default=None, help="Taxonomy source file")
read_opts.add_argument('-tt', '--taxonomy_type',metavar="", default="", choices=rmodules, help="Source format of taxonomy input file ({modules})".format(modules=",".join(rmodules)))
read_opts.add_argument('--taxid_base', metavar="", type=int, default=1, help="The base for internal taxonomy ID numbers, when using NCBI as base select base at minimum 3000000 (default = 1)")
read_opts.add_argument('-tf', '--taxonomy_file',metavar="", default=None, help="Taxonomy source file")
read_opts.add_argument('-tt', '--taxonomy_type',metavar="", default="", choices=rmodules, help="Source format of taxonomy input file ({modules})".format(modules=",".join(rmodules)))
read_opts.add_argument('--taxid_base', metavar="", type=int, default=1, help="The base for internal taxonomy ID numbers, when using NCBI as base select base at minimum 3000000 (default = 1)")

mod_opts = parser.add_argument_group('mod_opts', "Database modification options")
mod_opts.add_argument('-mf','--mod_file', metavar="", default=False, help="File contaning modifications parent,child,(taxonomy level)")
mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications")
mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to")
mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source')
mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch")
mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!")
mod_opts.add_argument('--clean_database', action='store_true', help="Clean up database from unannotated nodes")
mod_opts.add_argument('-mf','--mod_file', metavar="", default=False, help="File contaning modifications parent,child,(taxonomy level)")
mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications")
mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to")
mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source')
mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch")
mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!")
mod_opts.add_argument('--clean_database', action='store_true', help="Clean up database from unannotated nodes")
mod_opts.add_argument('--skip_annotation', action='store_true', help="Do not automatically add annotation when creating GTDB database")


out_opts = parser.add_argument_group('output_opts', "Output options")
out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)")
out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers")
out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)")
out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers")

vis_opts = parser.add_argument_group('vis_opts', "Visualisation options")
vis_opts.add_argument('--visualise_node', metavar='', default=False, help="Visualise tree from selected node")
vis_opts.add_argument('--visualise_node', metavar='', default=False, help="Visualise tree from selected node")
vis_opts.add_argument('--vis_type', metavar='', default="newick", choices=__suppored_visualizations__, help="Choices [{allowed}]".format(allowed=", ".join(__suppored_visualizations__)))
vis_opts.add_argument('--vis_depth', metavar='', type=int, default=3, help="Maximum depth from node to visualise default 3, 0 = all levels")
vis_opts.add_argument('--vis_depth', metavar='', type=int, default=3, help="Maximum depth from node to visualise default 3, 0 = all levels")

debugopts = parser.add_argument_group("Logging and debug options")
debugopts.add_argument('--logs', metavar='', default="logs/", help="Specify log directory")
debugopts.add_argument('--logs', metavar='', default="logs/", help="Specify log directory")
debugopts.add_argument('--verbose', action='store_const', const=logging.INFO, help="Verbose output")
debugopts.add_argument('--debug', action='store_const', const=logging.DEBUG, help="Debug output")
debugopts.add_argument('--supress', action='store_const', const=logging.ERROR, default=logging.WARNING, help="Supress warnings")
debugopts.add_argument('--quiet', action='store_true', default=False, help="Dont show logging messages in terminal!")
debugopts.add_argument('--quiet', action='store_true', default=False, help="Dont show logging messages in terminal!")

parser.add_argument("--version", action='store_true', help=argparse.SUPPRESS)

Expand Down Expand Up @@ -260,7 +262,7 @@ def __init__(self, message,expression=""):
'''Load taxonomy module'''
logger.info("Loading module: ReadTaxonomy{type}".format(type=args.taxonomy_type))
read_module = dynamic_import("modules", "ReadTaxonomy{type}".format(type=args.taxonomy_type))
read_obj = read_module(args.taxonomy_file, database=args.database)
read_obj = read_module(args.taxonomy_file, database=args.database,skip_annotation=args.skip_annotation)
logger.info("Parse taxonomy")
read_obj.parse_taxonomy() ## Parse taxonomy file

Expand Down
46 changes: 40 additions & 6 deletions flextaxd/modules/CreateKrakenDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,35 @@ def __init__(self, database, kraken_database, genome_names, outdir,verbose=False
self.taxidmap_debug = self.database.get_nodes()
self.seqhead_validator = {}
self.seqhead_count = 0
self.skiptax = set()
self.skipfiles = set()
if skip:
self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database
else:
self.skiptax = []
if type(skip) == type(dict()):
self.skipfiles = skip["genome_id"]

self.skiptax = self.parse_taxid_names(skip["tax_id"])
logger.info(self.skiptax)
else:
self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database

logger.info("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))
if not os.path.exists("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb)):
os.system("mkdir -p {krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))

def parse_taxid_names(self,skiptax):
	"""Resolve a mixed skip list of taxonomy names and taxid strings to taxids.

	Entries that are already numeric are kept as-is; name entries are looked
	up in the database node map (``self.database.get_nodes()``). Names not
	found in the database are logged and dropped.

	Parameters:
		skiptax -- iterable of str; a mix of numeric taxid strings and
			taxonomy node names (from the ``tax_id`` column of a skip file).

	Returns:
		set of resolved taxids; if *skiptax* is empty it is returned
		unchanged.
	"""
	taxidmap = self.database.get_nodes()
	newset = set()
	for tid in skiptax:
		if tid.isdigit():
			# Bug fix: keep numeric taxids. Previously digit entries were
			# skipped entirely, so they were lost whenever any name resolved
			# and skiptax was replaced by newset below.
			newset.add(tid)
		else:
			try:
				newset.add(taxidmap[tid])
			except KeyError:
				logger.info("# WARNING: {taxa} not in database".format(taxa=tid))
	if len(newset) > 0:
		skiptax = newset
	return skiptax

def _split(self,a, n):
k, m = divmod(len(a), n)
return list(a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
Expand Down Expand Up @@ -104,6 +125,10 @@ def kraken_fasta_header(self,genomes,added):
tmplog = "{outdir}/{rand}.log".format(outdir=self.outdir.rstrip("/"),rand=batchint)
for i in range(len(genomes)):
genome = genomes[i]
if len(set([genome]) & self.skipfiles) > 0: ## Skip file if in skipfiles
logger.debug("Skip {genome}".format(genome=genome))
logger.info("Skip {genome}".format(genome=genome))
continue
filepath = self.genome_path[genome]
kraken_header = "kraken:taxid"
tmpname = ".tmp{rand}.gz".format(rand=random.randint(10**7,10**9))
Expand All @@ -123,7 +148,7 @@ def kraken_fasta_header(self,genomes,added):

continue
if self.create_db:
if taxid not in self.skiptax:
if len(set([taxid]) & self.skiptax) == 0:
'''Open temp file for manipulated (unzipped) genome fasta files'''
tmpfile = zopen(tmppath,"w")
taxidlines = [] ## Holder for sequences to be added to the seqid2taxid map from each file
Expand All @@ -134,6 +159,13 @@ def kraken_fasta_header(self,genomes,added):
if line.startswith(">"):
taxidmap = []
row = line.strip().split(" ")
kh = line.strip().split("|")
if len(kh) > 1:
if kh[1].startswith("kraken"):
'''Assume kraken header is already present'''
kh = True
else:
kh = False
seq_header = row[0]
if self.debug:
self.seqhead_validator[seq_header] = [seq_header, genome, str(taxid),self.taxidmap_debug[taxid],filepath]
Expand All @@ -143,9 +175,11 @@ def kraken_fasta_header(self,genomes,added):
endhead = " ".join(row[1:])
if True: ## Remove end heading in all files as it may contain bad chars
endhead = "\n"
'''Format sequence header'''
if not self.krakenversion == "krakenuniq":

'''Format sequence header unless header already have kraken header'''
if not self.krakenversion == "krakenuniq" and not kh:
line = row[0] + "|" + kraken_header + "|" + str(taxid) + " " + endhead ## Nessesary to be able to add sequences outside NCBI

'''Format seqid2taxid map'''
if not self.krakenversion == "kraken2":
taxidmap = [row[0].lstrip(">"), str(taxid), endhead]
Expand Down
Loading

0 comments on commit ccea0f9

Please sign in to comment.