Skip to content

Commit

Permalink
Merge pull request #39 from FOI-Bioinformatics/iss33
Browse files Browse the repository at this point in the history
Iss33
  • Loading branch information
davve2 authored Feb 26, 2021
2 parents caaafaf + 93a80cc commit ccea0f9
Show file tree
Hide file tree
Showing 13 changed files with 215 additions and 95 deletions.
15 changes: 11 additions & 4 deletions flextaxd/create_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
__pkgname__="flextaxd-create"
__github__="https://github.com/FOI-Bioinformatics/flextaxd"
from flextaxd.custom_taxonomy_databases import __version__
from flextaxd.modules.functions import read_skip_file
from importlib import import_module
import shutil

Expand Down Expand Up @@ -104,7 +105,7 @@ def __init__(self, message,expression=""):
classifier_opts.add_argument('--params', metavar="", default="", help="Add extra params to create command (supports kraken*)")
classifier_opts.add_argument('--test', action='store_true', help="test database structure, only use 100 seqs")
classifier_opts.add_argument('--keep', action='store_true', help="Keep temporary files")
classifier_opts.add_argument('--skip', metavar="", default="", help="Do not include genomes within this taxonomy (child tree) in the database (works for kraken)")
classifier_opts.add_argument('--skip', metavar="", default="", help="Do not include genomes within this taxonomy (child tree) in the database (works for kraken), can be a file ending with txt and genome ids one per row")
classifier_opts.add_argument('-kp', '--build_processes',metavar="",type=int, default = None, help="Use a different number of cores for kraken classification")


Expand Down Expand Up @@ -156,14 +157,15 @@ def __init__(self, message,expression=""):
logger.info("FlexTaxD-create logging initiated!")
logger.debug("Supported formats: {formats}".format(formats=programs))


'''
Process data
'''
if args.outdir:
if not os.path.exists(args.outdir):
os.system("mkdir -p {outdir}".format(outdir = args.outdir))
skip=False
if os.path.exists("{db_path}/library/library.fna".format(db_path=args.db_name)):
if os.path.exists("{db_path}/library/library.fna".format(db_path=args.db_name)) or os.path.exists("{db_path}/.tmp0.fasta"):
ans = input("Database library file already exist, (u)se library, (o)verwrite (c)ancel? (u o,c): ")
if ans in ["o", "O"]:
logger.info("Overwrite current build progress")
Expand Down Expand Up @@ -209,6 +211,10 @@ def __init__(self, message,expression=""):
if not skip:
genomes = process_directory_obj.get_genome_path_dict()
else: genomes=False
if args.skip:
if args.skip.endswith(".txt"):
args.skip = read_skip_file(args.skip)
logger.info("File passed to skip, {n} genomes and {x} taxids added to skiplist".format(n=len(args.skip["genome_id"]),x=len(args.skip["tax_id"])))
classifierDB = classifier(args.database, args.db_name, genomes,args.outdir,
create_db=args.create_db,
limit=limit,
Expand All @@ -231,8 +237,9 @@ def __init__(self, message,expression=""):
logger.info("Create database")
try:
classifierDB.create_database(args.outdir,args.keep)
except UnboundLocalError:
logger.error("#Error: No kraken database name was given!")
except UnboundLocalError as e:
logger.error("#Error: No database name was given!")
logger.error("#UnboundLocalError "+e)
exit()

logger.debug(report_time(start_time,final=True))
Expand Down
54 changes: 28 additions & 26 deletions flextaxd/custom_taxonomy_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
each column (not <tab>|<tab>).
'''

__version__ = "0.3.5"
__version__ = "0.3.6"
__author__ = "David Sundell"
__credits__ = ["David Sundell"]
__license__ = "GPLv3"
Expand Down Expand Up @@ -103,45 +103,47 @@ def __init__(self, message,expression=""):
required.add_argument('-db', '--database',metavar="", type=str, default="FlexTaxD.db" , help="FlexTaxD taxonomy sqlite3 database file (fullpath)")

basic = parser.add_argument_group('basic', 'Basic commands')
basic.add_argument('-o', '--outdir',metavar="", default=".", help="Output directory")
basic.add_argument("--dump", action='store_true', help="Write database to names.dmp and nodes.dmp")
basic.add_argument('--dump_mini', action='store_true', help="Dump minimal file with tab as separator")
basic.add_argument("--force", action='store_true', help="use when script is implemented in pipeline to avoid security questions on overwrite!")
basic.add_argument('--validate', action='store_true', help="Validate database format")
basic.add_argument('--stats', action='store_true', help="Print some statistics from the database")
basic.add_argument('-o', '--outdir',metavar="", default=".", help="Output directory")
basic.add_argument("--dump", action='store_true', help="Write database to names.dmp and nodes.dmp")
basic.add_argument('--dump_mini', action='store_true', help="Dump minimal file with tab as separator")
basic.add_argument("--force", action='store_true', help="use when script is implemented in pipeline to avoid security questions on overwrite!")
basic.add_argument('--validate', action='store_true', help="Validate database format")
basic.add_argument('--stats', action='store_true', help="Print some statistics from the database")

rmodules = get_read_modules()
read_opts = parser.add_argument_group('read_opts', "Source options")
read_opts.add_argument('-tf', '--taxonomy_file',metavar="", default=None, help="Taxonomy source file")
read_opts.add_argument('-tt', '--taxonomy_type',metavar="", default="", choices=rmodules, help="Source format of taxonomy input file ({modules})".format(modules=",".join(rmodules)))
read_opts.add_argument('--taxid_base', metavar="", type=int, default=1, help="The base for internal taxonomy ID numbers, when using NCBI as base select base at minimum 3000000 (default = 1)")
read_opts.add_argument('-tf', '--taxonomy_file',metavar="", default=None, help="Taxonomy source file")
read_opts.add_argument('-tt', '--taxonomy_type',metavar="", default="", choices=rmodules, help="Source format of taxonomy input file ({modules})".format(modules=",".join(rmodules)))
read_opts.add_argument('--taxid_base', metavar="", type=int, default=1, help="The base for internal taxonomy ID numbers, when using NCBI as base select base at minimum 3000000 (default = 1)")

mod_opts = parser.add_argument_group('mod_opts', "Database modification options")
mod_opts.add_argument('-mf','--mod_file', metavar="", default=False, help="File contaning modifications parent,child,(taxonomy level)")
mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications")
mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to")
mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source')
mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch")
mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!")
mod_opts.add_argument('--clean_database', action='store_true', help="Clean up database from unannotated nodes")
mod_opts.add_argument('-mf','--mod_file', metavar="", default=False, help="File contaning modifications parent,child,(taxonomy level)")
mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications")
mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to")
mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source')
mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch")
mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!")
mod_opts.add_argument('--clean_database', action='store_true', help="Clean up database from unannotated nodes")
mod_opts.add_argument('--skip_annotation', action='store_true', help="Do not automatically add annotation when creating GTDB database")


out_opts = parser.add_argument_group('output_opts', "Output options")
out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)")
out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers")
out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)")
out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers")

vis_opts = parser.add_argument_group('vis_opts', "Visualisation options")
vis_opts.add_argument('--visualise_node', metavar='', default=False, help="Visualise tree from selected node")
vis_opts.add_argument('--visualise_node', metavar='', default=False, help="Visualise tree from selected node")
vis_opts.add_argument('--vis_type', metavar='', default="newick", choices=__suppored_visualizations__, help="Choices [{allowed}]".format(allowed=", ".join(__suppored_visualizations__)))
vis_opts.add_argument('--vis_depth', metavar='', type=int, default=3, help="Maximum depth from node to visualise default 3, 0 = all levels")
vis_opts.add_argument('--vis_depth', metavar='', type=int, default=3, help="Maximum depth from node to visualise default 3, 0 = all levels")

debugopts = parser.add_argument_group("Logging and debug options")
debugopts.add_argument('--logs', metavar='', default="logs/", help="Specify log directory")
debugopts.add_argument('--logs', metavar='', default="logs/", help="Specify log directory")
debugopts.add_argument('--verbose', action='store_const', const=logging.INFO, help="Verbose output")
debugopts.add_argument('--debug', action='store_const', const=logging.DEBUG, help="Debug output")
debugopts.add_argument('--supress', action='store_const', const=logging.ERROR, default=logging.WARNING, help="Supress warnings")
debugopts.add_argument('--quiet', action='store_true', default=False, help="Dont show logging messages in terminal!")
debugopts.add_argument('--quiet', action='store_true', default=False, help="Dont show logging messages in terminal!")

parser.add_argument("--version", action='store_true', help=argparse.SUPPRESS)

Expand Down Expand Up @@ -260,7 +262,7 @@ def __init__(self, message,expression=""):
'''Load taxonomy module'''
logger.info("Loading module: ReadTaxonomy{type}".format(type=args.taxonomy_type))
read_module = dynamic_import("modules", "ReadTaxonomy{type}".format(type=args.taxonomy_type))
read_obj = read_module(args.taxonomy_file, database=args.database)
read_obj = read_module(args.taxonomy_file, database=args.database,skip_annotation=args.skip_annotation)
logger.info("Parse taxonomy")
read_obj.parse_taxonomy() ## Parse taxonomy file

Expand Down
46 changes: 40 additions & 6 deletions flextaxd/modules/CreateKrakenDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,35 @@ def __init__(self, database, kraken_database, genome_names, outdir,verbose=False
self.taxidmap_debug = self.database.get_nodes()
self.seqhead_validator = {}
self.seqhead_count = 0
self.skiptax = set()
self.skipfiles = set()
if skip:
self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database
else:
self.skiptax = []
if type(skip) == type(dict()):
self.skipfiles = skip["genome_id"]

self.skiptax = self.parse_taxid_names(skip["tax_id"])
logger.info(self.skiptax)
else:
self.skiptax = parse_skip(skip.split(",")) ## if node should be skipd this must be true, otherwise nodes in modfile are added to existing database

logger.info("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))
if not os.path.exists("{krakendb}".format(outdir = self.outdir, krakendb=self.krakendb)):
os.system("mkdir -p {krakendb}".format(outdir = self.outdir, krakendb=self.krakendb))

def parse_taxid_names(self,skiptax):
	"""Resolve a mixed skip list of taxonomy names and taxid strings to taxids.

	Entries that are already numeric are kept as-is; name entries are looked
	up in the database node map (``self.database.get_nodes()``). Names not
	found in the database are logged and dropped.

	Parameters:
		skiptax -- iterable of str; a mix of numeric taxid strings and
			taxonomy node names (from the ``tax_id`` column of a skip file).

	Returns:
		set of resolved taxids; if *skiptax* is empty it is returned
		unchanged.
	"""
	taxidmap = self.database.get_nodes()
	newset = set()
	for tid in skiptax:
		if tid.isdigit():
			# Bug fix: keep numeric taxids. Previously digit entries were
			# skipped entirely, so they were lost whenever any name resolved
			# and skiptax was replaced by newset below.
			newset.add(tid)
		else:
			try:
				newset.add(taxidmap[tid])
			except KeyError:
				logger.info("# WARNING: {taxa} not in database".format(taxa=tid))
	if len(newset) > 0:
		skiptax = newset
	return skiptax

def _split(self,a, n):
k, m = divmod(len(a), n)
return list(a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))
Expand Down Expand Up @@ -104,6 +125,10 @@ def kraken_fasta_header(self,genomes,added):
tmplog = "{outdir}/{rand}.log".format(outdir=self.outdir.rstrip("/"),rand=batchint)
for i in range(len(genomes)):
genome = genomes[i]
if len(set([genome]) & self.skipfiles) > 0: ## Skip file if in skipfiles
logger.debug("Skip {genome}".format(genome=genome))
logger.info("Skip {genome}".format(genome=genome))
continue
filepath = self.genome_path[genome]
kraken_header = "kraken:taxid"
tmpname = ".tmp{rand}.gz".format(rand=random.randint(10**7,10**9))
Expand All @@ -123,7 +148,7 @@ def kraken_fasta_header(self,genomes,added):

continue
if self.create_db:
if taxid not in self.skiptax:
if len(set([taxid]) & self.skiptax) == 0:
'''Open temp file for manipulated (unzipped) genome fasta files'''
tmpfile = zopen(tmppath,"w")
taxidlines = [] ## Holder for sequences to be added to the seqid2taxid map from each file
Expand All @@ -134,6 +159,13 @@ def kraken_fasta_header(self,genomes,added):
if line.startswith(">"):
taxidmap = []
row = line.strip().split(" ")
kh = line.strip().split("|")
if len(kh) > 1:
if kh[1].startswith("kraken"):
'''Assume kraken header is already present'''
kh = True
else:
kh = False
seq_header = row[0]
if self.debug:
self.seqhead_validator[seq_header] = [seq_header, genome, str(taxid),self.taxidmap_debug[taxid],filepath]
Expand All @@ -143,9 +175,11 @@ def kraken_fasta_header(self,genomes,added):
endhead = " ".join(row[1:])
if True: ## Remove end heading in all files as it may contain bad chars
endhead = "\n"
'''Format sequence header'''
if not self.krakenversion == "krakenuniq":

'''Format sequence header unless header already have kraken header'''
if not self.krakenversion == "krakenuniq" and not kh:
line = row[0] + "|" + kraken_header + "|" + str(taxid) + " " + endhead ## Nessesary to be able to add sequences outside NCBI

'''Format seqid2taxid map'''
if not self.krakenversion == "kraken2":
taxidmap = [row[0].lstrip(">"), str(taxid), endhead]
Expand Down
Loading

0 comments on commit ccea0f9

Please sign in to comment.