Skip to content

Commit

Permalink
Merge pull request #42 from FOI-Bioinformatics/version_update_v0.4.1
Browse files Browse the repository at this point in the history
Version update v0.4.1
  • Loading branch information
davve2 authored Jun 24, 2021
2 parents afa291d + fe58d55 commit cf99c8d
Show file tree
Hide file tree
Showing 8 changed files with 331 additions and 48 deletions.
4 changes: 2 additions & 2 deletions flextaxd/custom_taxonomy_databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
each column (not <tab>|<tab>).
'''

__version__ = "0.3.7"
__version__ = "0.4.1"
__author__ = "David Sundell"
__credits__ = ["David Sundell"]
__license__ = "GPLv3"
Expand Down Expand Up @@ -100,7 +100,7 @@ def __init__(self, message,expression=""):
parser = argparse.ArgumentParser()

required = parser.add_argument_group('required', 'Required')
required.add_argument('-db', '--database',metavar="", type=str, default="FlexTaxD.db" , help="FlexTaxD taxonomy sqlite3 database file (fullpath)")
required.add_argument('-db', '--database', '--db' ,metavar="", type=str, default="FlexTaxD.db" , help="FlexTaxD taxonomy sqlite3 database file (fullpath)")

basic = parser.add_argument_group('basic', 'Basic commands')
basic.add_argument('-o', '--outdir',metavar="", default=".", help="Output directory")
Expand Down
161 changes: 146 additions & 15 deletions flextaxd/modules/NewickTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,35 @@ def print(self,type="newick"):
pylab.savefig("flextaxd.vis.png",)
return True

def double_opts_vis(self,links,taxid="name",full=False):
'''vis double opt'''
import inquirer
tn = self.database.get_nodes()
if taxid != "name":
taxid = tn[taxid]
parents = [tn[x[0]] for x in links]
questions = [
inquirer.List('parent',
message="The node {name} has multiple optional parents, select which line to visualise: ".format(name=taxid),
choices=parents,
),
]
answers = inquirer.prompt(questions)
selected = tn[answers["parent"]] ## Return back to id
taxids = []
unique = set()
for link in links:
if link[0] == selected:
rank = link[2]
taxids.append(link[1])
'''Update incoming taxid which may not be correct'''
self.taxid = link[1]
unique.add(link[1])
rank = rank +1
if len(unique) > len(taxids):
rank = False
return rank,taxids ## Parent rank is selected, child rank is plus one

def get_tree(self,table="tree",taxid=False,maxdepth=3):
'''Parameters
table - table in database (default tree)
Expand All @@ -173,13 +202,23 @@ def get_tree(self,table="tree",taxid=False,maxdepth=3):
or
list - List of links in tree, False
'''
selected = False
if taxid:
if maxdepth == 0:
maxdepth = 1000 ## It is not reasonable to expect trees with more than 1000 levels, if so bug has to be raised
nodes = self.database.get_children([taxid],maxdepth=maxdepth)
'''Check if double parent'''
links = self.database.get_links([taxid],order=True)
nodes = self.database.get_node(self.taxonomy[taxid])
find_tax = [taxid]
if len(nodes) > len(links):
links = self.database.get_links(nodes,order=True)
if len(links) > 1:
selected,find_tax = self.double_opts_vis(links,taxid)
nodes = self.database.get_children(find_tax,maxdepth=maxdepth,selected=selected)
if len(nodes) == 0:
raise VisualisationError("Given node has no children")
return self.database.get_links(nodes,order=True),nodes
links = self.database.get_links(nodes,order=True)
return links,nodes
else:
SELECT = "SELECT parent,child,rank_i FROM {table} ORDER BY child ASC".format(table=table)
return self.database.query(SELECT).fetchall(),False
Expand Down Expand Up @@ -211,29 +250,34 @@ def get_parent(self,name):
res = self.database.query(QUERY).fetchone()
return res

def get_child(self,name):
def get_child(self,name,rank_i=False):
'''return child'''
#QUERY = '''SELECT child,child,rank FROM tree LEFT JOIN rank on (tree.rank_i = rank.rank_i) WHERE child = "{node}"'''.format(node=name)
QUERY = '''SELECT parent,child,rank_i FROM tree WHERE parent = "{node}"'''.format(node=name)
if rank_i:
QUERY = '''SELECT parent,child,rank_i FROM tree WHERE parent = "{node}" and rank_i ={rank}'''.format(node=name,rank=rank_i)
logger.debug(QUERY)
res = self.database.query(QUERY).fetchone()
return res

def new_node(self,child,nodes,parent):
def new_node(self,child,nodes,parent,link=1):
'''Function that adds a new node to the newick tree'''
if self.link_exists == (child,parent): ## This has already been tried return directly
if self.link_exists == (child,parent,link): ## This has already been tried return directly
return False
try:
if len(self.c_p_set & set([(child,parent),(parent,child)])) == 0: ## If link does not exist
node = NewickNode(child, nodes[child], self.nodeDict[parent]) ## this works also for root as root has itself as child
if len(self.c_p_set & set([(child,parent,link),(parent,child,link)])) == 0: ## If link does not exist
try:
node = NewickNode(child, nodes[child], self.nodeDict[parent]) ## this works also for root as root has itself as child
except KeyError:
return False
'''Make sure link to parent was not made before'''
self.c_p_set |= set([(child,parent)])
self.c_p_set |= set([(parent,child)])
self.c_p_set |= set([(child,parent,link)])
self.c_p_set |= set([(parent,child,link)])
self.nodeDict[child] = node
self.link_exists=False
else:
logger.debug("Link {parent}-{child} already exists, retrieve node!".format(child=child,parent=parent))
self.link_exists = (child,parent)
self.link_exists = (child,parent,link)
node = self.nodeDict[child] ## add a reference to the node so that children can be added
pnode = self.nodeDict[parent] ## add child to the parent node
pnode.add_child(node)
Expand All @@ -244,16 +288,100 @@ def new_node(self,child,nodes,parent):
except KeyError:
raise Error("Something is wrong")
logger.debug("Adding parent: [{parent},{pname}] of child: [{child},{cname}]".format(parent=t_parent,pname=self.taxonomy[t_parent],cname=self.taxonomy[t_child],child=t_child))
if self.new_node(t_child,nodes,t_parent): ## Add the missing parent
self.new_node(child,nodes,parent) ## Try again to add the node
if self.new_node(t_child,nodes,t_parent,link=link-1): ## Add the missing parent
self.new_node(child,nodes,parent,link) ## Try again to add the node
else:
self.added_parent = child
return False
return True
logger.debug("NewickNode p:{parent} c: {child} added".format(parent=parent,child=child))
return True

def build_tree(self,taxid=False,maxdepth=3):
def unique_indexes(self,nodes):
'''Check duplicated indexes and give them unique IDs before print'''
QUERY = "SELECT child FROM tree WHERE child in ({nodes}) GROUP BY child HAVING count(parent) > 1 ".format(nodes=",".join(map(str,nodes))) ## Thanks to andrewjmc@github for this suggestion
child_w_dpi = self.database.query(QUERY).fetchall() ## Fetch all conflicting links and give them unique index before printing
child_w_dpi = list(*child_w_dpi)
return child_w_dpi

def fix_names(self,nodes,tn,tr):
'''Return names instead of taxid'''
nnodes = []
for x in range(len(nodes)):
fixnames = list(nodes[x])
fixnames[0] = tn[fixnames[0]]
fixnames[1] = tn[fixnames[1]]
fixnames[2] = tr[fixnames[2]]
nnodes.append(fixnames)
return nnodes


def double_vis_path(self,duplicates,taxid,nodes):
'''The range in the tree has two identical nodes, ask user to resolve which path goes where'''
import inquirer,random
tn = self.database.get_nodes()
tr = self.database.get_rank()
children = self.database.get_links(self.database.get_children([taxid],maxdepth=1))
parents = duplicates[taxid]
if len(children) > 2:
raise VisualisationError("Names occuring three times in the same tree are not taken care of at this time, export function does however!")
children_N = self.fix_names(children,tn,tr)
parents_N = self.fix_names(parents,tn,tr)
selto = parents_N[0]
default = children_N[0]
if taxid != "name":
taxid = tn[taxid]
questions = [
inquirer.List('parent',
message="The node {name} has two paths, select the correct path to ({p1})".format(name=taxid,p1=selto,children=children_N),
choices=children_N,
),
]
selected = inquirer.prompt(questions)["parent"]
import random
n = random.randint(10000000,10100000)
if selected != default:
children[0],children[1] = children[1],children[0]
## Change id for first pair
pc = list(parents[0])
pc[1] = n
parents[0] = tuple(pc)
pc = list(children[0])
pc[0] = n
children[0] = tuple(pc)
nodes[n] = taxid
return [*children,*parents],nodes

def duplicate_parents_check(self,nodes,tree,taxid):
'''Check if any node have optional parents
Parameters:
nodes - the list of nodes of interest
Returns:
list - list of nodes with optional parents
'''
duplicates = self.unique_indexes(nodes)
optional_parents = {}
for child in duplicates:
optional_parents[child] = self.database.get_parent(child,all=True)
return optional_parents

def fix_tree(self,tree,duplicates,nodes):
'''Change index for one all but one duplicate to make sure tree is printed correctly'''
update_tree = []
dup = [duplicates[k] for k in duplicates.keys()]
for link in tree:
if len(set([link]) & set(*dup)) > 0:
pass
else:
update_tree.append(link)
changes = []
for child in duplicates.keys():
changes,nodes = self.double_vis_path(duplicates,child,nodes)
update_tree += changes
tree = sorted(update_tree,key=lambda x:x[2])
return tree,nodes

def build_tree(self,taxid=False,maxdepth=3,check_parent=False):
'''Build newick tree from database
This function walks through a database of nodes and creates NewickNode objects
self-aware of their decending newick tree or their parent lineage,
Expand All @@ -265,7 +393,10 @@ def build_tree(self,taxid=False,maxdepth=3):
NewickTree nodeDict by their node name
'''
tree,nodes = self.get_tree(taxid=taxid,maxdepth=maxdepth)
nodes = self.get_nodes(nodes | set([self.taxid]))
duplicates = self.duplicate_parents_check(nodes - set([self.taxid]),tree,taxid)
nodes = self.get_nodes(nodes | set([self.taxid]),col=1)
if len(duplicates) > 0:
tree,nodes = self.fix_tree(tree,duplicates,nodes)
logger.debug("Nodes: {n} Links: {l}".format(n=len(nodes),l=len(tree)))
logger.debug([nodes,tree])
self.added_parent = False
Expand All @@ -284,7 +415,7 @@ def build_tree(self,taxid=False,maxdepth=3):
self.nodeDict["root"] = root ## Also add this reference as "root"
newickTree = root ## The master parent will contain the full tree
continue
self.new_node(child,nodes,parent)
self.new_node(child,nodes,parent,link=rank)

## The newickTree is the same as the master parent (containing the full tree, the root node is defined on row 135)
logger.debug("Tree complete, return newickTree")
Expand Down
25 changes: 19 additions & 6 deletions flextaxd/modules/ReadTaxonomy.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
#!/usr/bin/env python3 -c

'''
Read NCBI taxonomy dmp files (nodes or names) and holds a dictionary
Read SILVA taxonomy dmp files (nodes or names) and holds a dictionary
'''

from .database.DatabaseConnection import DatabaseFunctions
import logging
import gzip
import sys
logger = logging.getLogger(__name__)

class InputError(Exception):
Expand All @@ -16,7 +18,7 @@ def __init__(self, message):

class ReadTaxonomy(object):
"""docstring for ReadTaxonomy."""
def __init__(self, taxonomy_file=False, taxonomy_name=False, database=False,verbose=False,ncbi=False,**kwargs):
def __init__(self, taxonomy_file=False, taxonomy_name=False, database=False,verbose=False,**kwargs):
super(ReadTaxonomy, self).__init__()
### Connect to or create database
if database:
Expand All @@ -34,14 +36,25 @@ def __init__(self, taxonomy_file=False, taxonomy_name=False, database=False,verb
self.qiime = 0

## Add base node
self.add_node("root")
self.root = self.add_node("root")
self.add_rank("no rank")
## Add basic link
self.add_link(child=1, parent=1,rank="no rank")

def set_qiime(self,opt):
self.qiime = opt

def zopen(self, path,*args, **kwargs):
'''Redefine open to handle zipped files automatically'''
if path.endswith(".gz"):
if str(*args) in ["r","w"] and sys.version_info.major >= 3:
## Python version three and above interprets binary formats as binary, add t (rt) to get text returned
args = (str(*args)+"t",)
else:
args = ("rt")
return gzip.open(path,*args,**kwargs)
return open(path,*args,**kwargs)

#@staticmethod
def parse_taxonomy(self,treefile=False):
'''Parse taxonomy information'''
Expand Down Expand Up @@ -93,7 +106,7 @@ def read_nodes(self, treefile=False):
rank = "no rank" #Base rank if rank is not used
if not treefile:
treefile = self.taxonomy_file
with open(treefile, "r") as _treefile:
with self.zopen(treefile, "r") as _treefile:
headers = _treefile.readline().strip().split(self.sep)
if "parent" not in headers or "child" not in headers:
raise InputError("Your input tree file does not contain the headers to specify child and parent!")
Expand Down Expand Up @@ -130,13 +143,13 @@ def read_nodes(self, treefile=False):
def parse_genomeid2taxid(self,genomeid2taxid):
'''Parse file that annotates genome_id´s to nodes in the tree'''
nodeDict = self.database.get_nodes()
with open(genomeid2taxid,"rt") as f:
with self.zopen(genomeid2taxid,"rt") as f:
headers = f.readline().strip().split("\t")
for row in f:
if row.strip() != "": ## If there are trailing empty lines in the file
genomeid,taxid = row.strip().split("\t")
try:
self.database.add_genome(genome=genomeid,_id=nodeDict[taxid.strip()])
self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()])
except KeyError:
logger.warning("# WARNING: {taxid} not found in the database".format(taxid=taxid))
self.database.commit()
Expand Down
27 changes: 16 additions & 11 deletions flextaxd/modules/ReadTaxonomyCanSNPer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,30 @@ def __str__(self):
class ReadTaxonomyCanSNPer(ReadTaxonomy):
"""docstring for ReadTaxonomyCanSNPer."""
def __init__(self, taxonomy_file=False, database=".canSNPdb", taxid_base=1,root_name=False,rank="family", verbose=False,**kwargs):
super(ReadTaxonomyCanSNPer, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose)
super(ReadTaxonomyCanSNPer, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose,**kwargs)
self.input = taxonomy_file
#self.taxonomy = {}
self.taxonomy = {}
self.taxid_num = taxid_base
## Initiate database
logger.info(taxonomy_file)
logger.debug(root_name)
if not root_name:
logger.info("Fetching root name from file")
root_name = self.get_root_name(taxonomy_file)
if root_name != "root":
root_i = self.add_node(root_name)
logger.info("Adding root node {node}!".format(node=root_name))
else:
root_i = self.taxonomy[root_name]
logger.info("Adding, cellular organism node")
c_i = self.add_node("cellular organisms")
logger.info("Adding root node {node}!".format(node=root_name))
root_i = self.add_node(root_name)
self.taxid_num =taxid_base ## reset
logger.debug("Adding ranks!")
if rank != "no rank":
self.add_rank("no rank")
self.add_rank(rank)
self.add_rank("no rank")
self.add_link(root_i, root_i,rank=rank)
if root_name != "cellular organisms":
self.add_link(c_i,1)
if root_name != "root":
self.add_link(root_i,c_i,rank=rank) ## root is always added as top parent if not one force 1
self.taxonomy[root_i] = 1 ## Add local link to root
self.names = {}
self.root = root_i
self.length = 0
Expand All @@ -57,7 +61,7 @@ def __init__(self, taxonomy_file=False, database=".canSNPdb", taxid_base=1,root

def get_root_name(self,fin):
'''Get the root name of the CanSNP file unless root is specified'''
with open(fin) as f:
with self.zopen(fin) as f:
firstline = f.readline()
firstline = firstline.strip().replace("\t",";")
root = firstline.split(";") ## Root should be either the leftmost annotation in the tree or the only annotation on that row
Expand Down Expand Up @@ -87,10 +91,11 @@ def add_SNP(self,nodes,i):
def parse_taxonomy(self):
'''Retrieve node description from CanSNPer formatted tree'''
logger.info("Parse CanSNP tree file")
with open(self.taxonomy_file,"r") as f:
with self.zopen(self.taxonomy_file,"r") as f:
for row in f:
row = row.strip().replace("\t",";") ## Also accept tab separated tree files
nodes = row.strip().split(";") ## get node and all its parents in a list
logger.debug(nodes)
child = nodes[-1] ## get name of child node
'''If the tree was not properly formatted an a parent is missing make sure that function works anyway by adding any parent node above child'''
try:
Expand Down
Loading

0 comments on commit cf99c8d

Please sign in to comment.