Skip to content

Commit

Permalink
Merge pull request #54 from FOI-Bioinformatics/v0.4.3
Browse files Browse the repository at this point in the history
V0.4.3

* New Features
** Added function that retain defined levels (if merging database does not have levels)
** Added function to download genomes from a list of genomes
** Added function to export genomes from the database to a list (see above function)

* Bug fixes
** Double root id bug resolved
** Bug reading GTDB files resolved

* Other
** The database now keeps the information from which source the genome annotation is expected to be found (refseq/genbank)
  • Loading branch information
davve2 authored Jul 13, 2022
2 parents 423d49f + 1ad2986 commit c6cf4ab
Show file tree
Hide file tree
Showing 12 changed files with 202 additions and 76 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Supported database build programs
* krakenuniq
* centrifuge

The flextaxd (flextaxd) script allows customization of databases from NCBI, QIIME or CanSNPer source formats and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation).
The flextaxd (flextaxd) script allows customization of databases from several sources (See supported source formats above) and supports export functions into NCBI formatted names and nodes.dmp files as well as a standard tab separated file (or a selected separation).
### Create database
* Create your database from source files --taxonomy_file
* and select --taxonomy_type [supported formats]
Expand Down
20 changes: 13 additions & 7 deletions flextaxd/create_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def __init__(self, message,expression=""):
download_opts.add_argument('-p', '--processes',metavar="",type=int, default = 8, help="Use multiple cores for downloading genomes and kraken if -kp is not set")
download_opts.add_argument('--download', action='store_true', help="Download additional sequences")
download_opts.add_argument('-r', '--representative', action='store_true', help="Download GTDB representative genomes")
download_opts.add_argument('--download_file', default=False, help="Download genomes from file (path to file)")
download_opts.add_argument('--rep_path', metavar="URL", default=latest_genome_reps, help="Specify GTDB representative version URL full path")
download_opts.add_argument('--force_download', action='store_true', help="Download sequences from genbank if not in refseq (WARNING: might include genome withdrawals)")
download_opts.add_argument('--genomes_path', metavar="",default=None, help='path to genomes')
Expand Down Expand Up @@ -125,6 +126,8 @@ def __init__(self, message,expression=""):
if not os.path.exists(args.database):
raise FileNotFoundError("No database file could be found, please provide a FlexTaxD database to run FlexTaxD!")

if args.create_db and not args.genomes_path:
raise InputError("genomes_path parameter was not given")
if not os.path.exists(args.genomes_path):
raise FileNotFoundError("Directory {path} does not exist".format(path=args.genomes_path))

Expand Down Expand Up @@ -186,16 +189,19 @@ def __init__(self, message,expression=""):
process_directory_obj = process_directory(args.database)
genomes, missing = process_directory_obj.process_folder(args.genomes_path)
''' 2. Download missing files'''
if args.download or args.representative:
if args.download or args.representative or args.download_file:
download = dynamic_import("modules", "DownloadGenomes")
download_obj = download(args.processes,outdir=args.outdir,force=args.force_download,download_path=args.genomes_path)
new_genome_path, missing = download_obj.run(missing,representative=args.representative,url=args.rep_path)
if not new_genome_path:
still_missing = missing
if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
if args.download_file:
download_obj.download_from_file(args.download_file)
else:
new_genomes, missing = process_directory_obj.process_folder(new_genome_path)
genomes += new_genomes
new_genome_path, missing = download_obj.run(missing,args.rep_path)
if not new_genome_path:
still_missing = missing
if len(still_missing) > 0: print("Not able to download: {nr}".format(nr=len(still_missing)))
else:
new_genomes, missing = process_directory_obj.process_folder(new_genome_path)
genomes += new_genomes
else:
if len(missing) > 0:
logger.info("Genome annotations with no matching source: {nr}".format(nr=len(missing)))
Expand Down
21 changes: 20 additions & 1 deletion flextaxd/custom_taxonomy_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
each column (not <tab>|<tab>).
'''

__version__ = "0.4.2"
__version__ = "0.4.3"
__author__ = "David Sundell"
__credits__ = ["David Sundell"]
__license__ = "GPLv3"
Expand Down Expand Up @@ -121,17 +121,21 @@ def __init__(self, message,expression=""):
mod_opts.add_argument('-md', '--mod_database', metavar="",default=False, help="Database file containing modifications")
mod_opts.add_argument('-gt', '--genomeid2taxid', metavar="", default=False, help="File that lists which node a genome should be assigned to")
mod_opts.add_argument('-gp', '--genomes_path', metavar="",default=None, help='Path to genome folder is required when using NCBI_taxonomy as source')
#mod_opts.add_argument('-un', '--update_names', metavar="",default=None, help='Update node names using old to new name file.')
mod_opts.add_argument('-p', '--parent',metavar="", default=False, help="Parent from which to add (replace see below) branch")
mod_opts.add_argument('--replace', action='store_true', help="Add if existing children of parents should be removed!")
mod_opts.add_argument('--clean_database', action='store_true', help="Clean up database from unannotated nodes")
mod_opts.add_argument('--skip_annotation', action='store_true', help="Do not automatically add annotation when creating GTDB database")
mod_opts.add_argument('--refdatabase', metavar="", default=False, help="For download command, give value of expected source, default (refseq)")


out_opts = parser.add_argument_group('output_opts', "Output options")
out_opts.add_argument('--dbprogram', metavar="", default=False,choices=__programs_supported__, help="Adjust output file to certain output specifications ["+", ".join(__programs_supported__)+"]")
out_opts.add_argument("--dump_prefix", metavar="", default="names,nodes", help="change dump prefix reqires two names default(names,nodes)")
out_opts.add_argument('--dump_sep', metavar="", default="\t|\t", help="Set output separator default(NCBI) also adds extra trailing columns for kraken")
out_opts.add_argument('--dump_descriptions', action='store_true', default=False, help="Dump description names instead of database integers")
out_opts.add_argument('--dump_genomes', action='store_true', default=False, help="Print list of genomes (and source) to file")
out_opts.add_argument('--dump_genome_annotations', action='store_true', default=False, help="Add genome taxid annotation to genomes dump")

vis_opts = parser.add_argument_group('vis_opts', "Visualisation options")
vis_opts.add_argument('--visualise_node', metavar='', default=False, help="Visualise tree from selected node")
Expand Down Expand Up @@ -261,6 +265,16 @@ def __init__(self, message,expression=""):
modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base)
modify_obj.clean_database(ncbi=ncbi)

'''Dump option, export list of genomes, added in flextaxd version 0.4.2'''
if args.dump_genomes:
logger.info("Dump list of genomes")
write_module = dynamic_import("modules", "WriteTaxonomy")
write_obj = write_module(args.outdir, database=args.database,prefix=args.dump_prefix,separator=args.dump_sep,minimal=args.dump_mini,desc=args.dump_descriptions,dbprogram=args.dbprogram,dump_genomes=True)
if args.dump_genome_annotations:
write_obj.dump_genome_annotations()
else:
write_obj.dump_genomes()

''' 0. Create taxonomy database (if it does not exist)'''
if args.taxonomy_file:
if not os.path.exists(args.database) or force:
Expand Down Expand Up @@ -304,6 +318,11 @@ def __init__(self, message,expression=""):
modify_obj = modify_module(database=args.database, update_genomes=True,taxid_base=args.taxid_base)
modify_obj.update_annotations(genomeid2taxid=args.genomeid2taxid)

# if args.update_names:
# modify_module = dynamic_import("modules", "ModifyTree")
# modify_obj = modify_module(database=args.database, update_node_names=True,taxid_base=args.taxid_base)
# modify_obj.update_node_names(args.update_names)

if (args.mod_file or args.mod_database) and args.clean_database:
modify_module = dynamic_import("modules", "ModifyTree")
modify_obj = modify_module(database=args.database,clean_database=args.clean_database,taxid_base=args.taxid_base)
Expand Down
10 changes: 10 additions & 0 deletions flextaxd/modules/DownloadGenomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,16 @@ def download_files(self,files):
logger.error("Could not stop all subprocesses check your process manager and end them manually!")
return self.not_downloaded

def download_from_file(self,inputfile,exclude="",all=False):
'''Download files with ncbis download script using an accession input file'''
if not all:
exclude = " ".join([exclude,"--exclude-gff3 --exclude-protein --exclude-rna --exclude-genomic-cds"])
args = ["genome", "accession" ,"--inputfile", input_file, exclude"]
p = Popen(args, stdout=PIPE,cwd=self.location)
(output, err) = p.communicate()
p_status = p.wait()
return self.location

def run(self, files, representative=False,url=""):
'''Download list of GCF and or GCA files from NCBI or download represenative genomes
Expand Down
74 changes: 63 additions & 11 deletions flextaxd/modules/ModifyTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,16 @@ def __init__(self, message):

class ModifyTree(object):
"""docstring for ModifyTree."""
def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, separator="\t",verbose=False,parent=False,replace=False,**kwargs):
def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, clean_database=False,update_genomes=False, update_node_names=False,separator="\t",verbose=False,parent=False,replace=False,**kwargs):
super(ModifyTree, self).__init__()
self.verbose = verbose
logger.info("Modify Tree")
self.sep = separator
self.taxonomy = {}
self.names = {}

self.keep_rank = {} ## Place holder for parent rank

self.parent=parent
self.replace = replace
self.ncbi_order = True
Expand Down Expand Up @@ -92,7 +94,7 @@ def __init__(self, database=".taxonomydb", mod_database=False, mod_file=False, c
self.modsource = self.parse_modification(self.moddb,"database")
elif mod_file:
self.modsource = self.parse_modification(mod_file,"file")
elif update_genomes or clean_database:
elif update_genomes or clean_database or update_node_names:
pass
else:
raise InputError("No modification source could be found both mod_database and mod_file file are empty!")
Expand Down Expand Up @@ -163,8 +165,16 @@ def _parse_new_links(self, parent=None,child=None,rank="no rank"):
self.new_nodes.add(parent_i)
child_i = self.get_id(child)
self.new_nodes.add(child_i)

rank_i = self.add_rank(rank)
'''If node has no level (no rank), check if parent database had a classified level. If so default add level'''
try:
level = self.parent_levels[int(child_i)]
logger.info("Rank kept for {node}, {rank}".format(node=child, rank=level))
except:
level = False
if not level:
rank_i = self.add_rank(rank)
else:
rank_i = level
self.new_links.add((parent_i,child_i,rank_i))
return

Expand Down Expand Up @@ -263,17 +273,15 @@ def parse_modification(self, input,modtype="database"):
if modtype == "database":
modparent = self.moddb.get_parent(self.moddb.get_id(self.parent))
logger.info("{n} existing links to {parent} ({parentlinks}) ({modparent})".format(n=len(self.existing_links),parent=self.parent,parentlinks=parentlinks,modparent=modparent))

self.parent_levels = self.keep_levels(self.existing_links | parentlinks | set(self.taxonomydb.get_links((self.existing_nodes & self.new_nodes))))
if modtype == "database":
self.mod_genomes = self.database_mod(input,self.parent)
elif modtype == "file":
self.file_mod(input)
else:
raise InputError("Wrong modification input database or file must be supplied")

### get links from current database
self.old_nodes = self.existing_nodes - self.new_nodes

logger.info("nodes:")
logger.info("old: {old}".format(old=len(self.old_nodes)))
logger.info("new: {new}".format(new=len(self.new_nodes)))
Expand All @@ -284,7 +292,6 @@ def parse_modification(self, input,modtype="database"):

self.overlapping_links = self.existing_links & self.new_links ## (links existing in both networks)
self.old_links = self.existing_links - self.new_links

logger.info("links:")
logger.info("old: {old}".format(old=len(self.old_links)))
logger.info("new: {new}".format(new=len(self.new_nodes)))
Expand All @@ -295,21 +302,27 @@ def parse_modification(self, input,modtype="database"):
'''Get all genomes annotated to new nodes in existing database'''
return True

def update_annotations(self, genomeid2taxid):
def update_annotations(self, genomeid2taxid, reference=False):
'''Function that adds annotation of genome ids to nodes'''
logger.info("Update genome to taxid annotations using {genomeid2taxid}".format(genomeid2taxid=genomeid2taxid))
_ref = reference
update = {
"set_column": "id",
"where_column": "genome",
"set_value": "",
"where": ""
}
#if _ref:
# update[""]
updated = 0
added = 0
with open(genomeid2taxid) as f:
for row in f:
try:
genome,name = row.strip().split(self.sep)
if len(row.strip().split(self.sep)) > 2:
genome,name,reference = row.strip().split(self.sep)
else:
genome,name = row.strip().split(self.sep)
except ValueError:
genome,name = row.strip().split(" ")
logger.debug("genome: {genome}, name: {name}".format(genome=genome,name=name))
Expand Down Expand Up @@ -386,6 +399,36 @@ def update_genomes(self):
self.taxonomydb.commit()
return

def update_node_names(self, refdict):
'''Function that adds annotation of genome ids to nodes'''
logger.info("Update node names in the database from {refdict}".format(refdict=refdict))
update = {
"set_column": "name",
"where_column": "name",
"set_value": "",
"where": ""
}
updated = 0
added = 0
with open(refdict) as f:
for row in f:
try:
old_name,name = row.strip().split(self.sep)
except ValueError:
old_name,name = row.strip().split(" ")
logger.debug("old_name: {old_name}, name: {name}".format(old_name=old_name,name=name))
## If no exception occured add old_name
update["set_value"] = name
update["where"] = old_name.strip()
res = self.taxonomydb.update_table(update,table="nodes")
if self.taxonomydb.rowcount()!=0:
if res:
updated += 1
self.taxonomydb.commit()
gid = self.taxonomydb.get_genomes()
logger.info("{updated} annotations were updated!".format(added=added, updated=updated))
return

def clean_database(self, ncbi=False):
'''Function that removes all node and node paths without annotation'''
logger.info("Fetch annotated nodes")
Expand All @@ -404,7 +447,7 @@ def clean_database(self, ncbi=False):
logger.info("Parents added: {an}".format(an=len(self.annotated_nodes)-an))
if ncbi:
logger.info("Keep main nodes of the NCBI taxonomy (parents on level 3 and above)")
self.keep = set(self.taxonomydb.get_children([1],maxdepth=1)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)])
self.keep = set(self.taxonomydb.get_children([1],maxdepth=3)) #set([self.nodeDict[node] for node in self.taxonomydb.get_children([1],maxdepth=1)])
logger.info("Adding root levels {nlev}".format(nlev=len(self.keep-self.annotated_nodes)))
self.annotated_nodes |= self.keep
'''Get all links related to an annotated node and its parents'''
Expand Down Expand Up @@ -434,6 +477,14 @@ def clean_database(self, ncbi=False):
self.taxonomydb.query("vacuum")
logger.info("Database is cleaned!")

def keep_levels(self, links):
parent_levels = {}
for c,p,l in links:
if self.rank[l] != 1:
parent_levels[p] = l
print(parent_levels)
return parent_levels

def update_database(self):
'''Update the database file'''
if self.replace:
Expand All @@ -446,6 +497,7 @@ def update_database(self):
logger.info("Replace tree, deleting all nodes downstream of selected parent!")
if len(self.old_links | self.non_overlapping_old_links-set(self.parent_link)) > 0:
logger.debug("Delete links no longer valid!")
'''Add function to keep taxonomy level from previous database (belongs to parent)'''
self.taxonomydb.delete_links((self.old_links | self.non_overlapping_old_links)-set(self.parent_link))
if len(self.old_nodes):
logger.debug("Delete nodes!")
Expand Down
8 changes: 4 additions & 4 deletions flextaxd/modules/ProcessDirectory.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(self, database,limit=False):
self.genome_names = []
self.genome_path_dict = {}
self.files = []
#logger.debug(self.genome_id_dict)

def get_genome_names(self):
'''
Expand Down Expand Up @@ -73,10 +74,6 @@ def is_gcf_gca(self,fname,debug=False):
'''
try:
GCX,END,REST = fname.split("_",2) ## If a name contains anything after the GCF number remove this if split by _
if GCX in ["GB","RS"]:
return False ## GTDB stype input treat as non GCF
#GCX = END
#END,REST = REST.split("_",1) #remove GB or RS definitions
if debug:
logger.debug("[{} {} {}]".format(GCX,END,REST))
NUM,version = END.split(".",1)
Expand Down Expand Up @@ -162,6 +159,7 @@ def walk_directory(self,folder_path):
if not folder_path:
raise IOError("Parameter --genomes_path was not set".format(folder_path))
logger.info("Process genome path ({path})".format(path=folder_path))
logger.debug("Extensions valid: ({f})".format(f=self.ext))
for root, dirs, files in os.walk(folder_path,followlinks=True):
for file in files:
fname = file.rstrip(".gz") ## remove gz if present
Expand All @@ -177,6 +175,8 @@ def walk_directory(self,folder_path):
logger.info("Processed {count} genomes".format(count=count))
self.files = list(set(self.files))
self.genome_names = list(set(self.genome_names))
if len(self.genome_names) == 0:
logger.info("# WARNING: No genomes from the input folder was added! Are your genomes annotated properly in the database?")
return self.files, self.genome_names

def process_folder(self,folder_path):
Expand Down
13 changes: 10 additions & 3 deletions flextaxd/modules/ReadTaxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,16 +141,23 @@ def read_nodes(self, treefile=False):
self.length +=1
self.database.commit()

def parse_genomeid2taxid(self,genomeid2taxid):
def parse_genomeid2taxid(self,genomeid2taxid,reference=False):
'''Parse file that annotates genome_id´s to nodes in the tree'''
nodeDict = self.database.get_nodes()
_ref = reference
with self.zopen(genomeid2taxid,"rt") as f:
headers = f.readline().strip().split("\t")
for row in f:
if row.strip() != "": ## If there are trailing empty lines in the file
genomeid,taxid = row.strip().split("\t")
try:
self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()])
genomeid,taxid = row.strip().split("\t")
except:
if not _ref: ## override if there is a reference in file, and use given ref
genomeid,taxid,reference = row.strip().split("\t")
else:
genomeid,taxid,override = row.strip().split("\t")
try:
self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()],reference=reference)
except KeyError:
logger.warning("# WARNING: {taxid} not found in the database".format(taxid=taxid))
self.database.commit()
Expand Down
Loading

0 comments on commit c6cf4ab

Please sign in to comment.