diff --git a/flextaxd/custom_taxonomy_databases.py b/flextaxd/custom_taxonomy_databases.py index cc4cf91..723c14f 100644 --- a/flextaxd/custom_taxonomy_databases.py +++ b/flextaxd/custom_taxonomy_databases.py @@ -20,7 +20,7 @@ each column (not |). ''' -__version__ = "0.3.7" +__version__ = "0.4.1" __author__ = "David Sundell" __credits__ = ["David Sundell"] __license__ = "GPLv3" @@ -100,7 +100,7 @@ def __init__(self, message,expression=""): parser = argparse.ArgumentParser() required = parser.add_argument_group('required', 'Required') - required.add_argument('-db', '--database',metavar="", type=str, default="FlexTaxD.db" , help="FlexTaxD taxonomy sqlite3 database file (fullpath)") + required.add_argument('-db', '--database', '--db' ,metavar="", type=str, default="FlexTaxD.db" , help="FlexTaxD taxonomy sqlite3 database file (fullpath)") basic = parser.add_argument_group('basic', 'Basic commands') basic.add_argument('-o', '--outdir',metavar="", default=".", help="Output directory") diff --git a/flextaxd/modules/NewickTree.py b/flextaxd/modules/NewickTree.py index ca63b02..cfcf52b 100644 --- a/flextaxd/modules/NewickTree.py +++ b/flextaxd/modules/NewickTree.py @@ -159,6 +159,35 @@ def print(self,type="newick"): pylab.savefig("flextaxd.vis.png",) return True + def double_opts_vis(self,links,taxid="name",full=False): + '''vis double opt''' + import inquirer + tn = self.database.get_nodes() + if taxid != "name": + taxid = tn[taxid] + parents = [tn[x[0]] for x in links] + questions = [ + inquirer.List('parent', + message="The node {name} has multiple optional parents, select which line to visualise: ".format(name=taxid), + choices=parents, + ), + ] + answers = inquirer.prompt(questions) + selected = tn[answers["parent"]] ## Return back to id + taxids = [] + unique = set() + for link in links: + if link[0] == selected: + rank = link[2] + taxids.append(link[1]) + '''Update incoming taxid which may not be correct''' + self.taxid = link[1] + unique.add(link[1]) + rank = rank +1 + if len(unique) > len(taxids): + rank = False + return rank,taxids ## Parent rank is selected, child rank is plus one + def get_tree(self,table="tree",taxid=False,maxdepth=3): '''Parameters table - table in database (default tree) @@ -173,13 +202,23 @@ def get_tree(self,table="tree",taxid=False,maxdepth=3): or list - List of links in tree, False ''' + selected = False if taxid: if maxdepth == 0: maxdepth = 1000 ## It is not reasonable to expect trees with more than 1000 levels, if so bug has to be raised - nodes = self.database.get_children([taxid],maxdepth=maxdepth) + '''Check if double parent''' + links = self.database.get_links([taxid],order=True) + nodes = self.database.get_node(self.taxonomy[taxid]) + find_tax = [taxid] + if len(nodes) > len(links): + links = self.database.get_links(nodes,order=True) + if len(links) > 1: + selected,find_tax = self.double_opts_vis(links,taxid) + nodes = self.database.get_children(find_tax,maxdepth=maxdepth,selected=selected) if len(nodes) == 0: raise VisualisationError("Given node has no children") - return self.database.get_links(nodes,order=True),nodes + links = self.database.get_links(nodes,order=True) + return links,nodes else: SELECT = "SELECT parent,child,rank_i FROM {table} ORDER BY child ASC".format(table=table) return self.database.query(SELECT).fetchall(),False @@ -211,29 +250,34 @@ def get_parent(self,name): res = self.database.query(QUERY).fetchone() return res - def get_child(self,name): + def get_child(self,name,rank_i=False): '''return child''' #QUERY = '''SELECT child,child,rank FROM tree LEFT JOIN rank on (tree.rank_i = rank.rank_i) WHERE child = "{node}"'''.format(node=name) QUERY = '''SELECT parent,child,rank_i FROM tree WHERE parent = "{node}"'''.format(node=name) + if rank_i: + QUERY = '''SELECT parent,child,rank_i FROM tree WHERE parent = "{node}" and rank_i ={rank}'''.format(node=name,rank=rank_i) logger.debug(QUERY) res = self.database.query(QUERY).fetchone() return res - def new_node(self,child,nodes,parent): + def new_node(self,child,nodes,parent,link=1): '''Function that adds a new node to the newick tree''' - if self.link_exists == (child,parent): ## This has already been tried return directly + if self.link_exists == (child,parent,link): ## This has already been tried return directly return False try: - if len(self.c_p_set & set([(child,parent),(parent,child)])) == 0: ## If link does not exist - node = NewickNode(child, nodes[child], self.nodeDict[parent]) ## this works also for root as root has itself as child + if len(self.c_p_set & set([(child,parent,link),(parent,child,link)])) == 0: ## If link does not exist + try: + node = NewickNode(child, nodes[child], self.nodeDict[parent]) ## this works also for root as root has itself as child + except KeyError: + return False '''Make sure link to parent was not made before''' - self.c_p_set |= set([(child,parent)]) - self.c_p_set |= set([(parent,child)]) + self.c_p_set |= set([(child,parent,link)]) + self.c_p_set |= set([(parent,child,link)]) self.nodeDict[child] = node self.link_exists=False else: logger.debug("Link {parent}-{child} already exists, retrieve node!".format(child=child,parent=parent)) - self.link_exists = (child,parent) + self.link_exists = (child,parent,link) node = self.nodeDict[child] ## add a reference to the node so that children can be added pnode = self.nodeDict[parent] ## add child to the parent node pnode.add_child(node) @@ -244,8 +288,8 @@ def new_node(self,child,nodes,parent): except KeyError: raise Error("Something is wrong") logger.debug("Adding parent: [{parent},{pname}] of child: [{child},{cname}]".format(parent=t_parent,pname=self.taxonomy[t_parent],cname=self.taxonomy[t_child],child=t_child)) - if self.new_node(t_child,nodes,t_parent): ## Add the missing parent - self.new_node(child,nodes,parent) ## Try again to add the node + if self.new_node(t_child,nodes,t_parent,link=link-1): ## Add the missing parent + self.new_node(child,nodes,parent,link) ## Try again to add the node else: self.added_parent = child return False @@ -253,7 +297,91 @@ def new_node(self,child,nodes,parent): logger.debug("NewickNode p:{parent} c: {child} added".format(parent=parent,child=child)) return True - def build_tree(self,taxid=False,maxdepth=3): + def unique_indexes(self,nodes): + '''Check duplicated indexes and give them unique IDs before print''' + QUERY = "SELECT child FROM tree WHERE child in ({nodes}) GROUP BY child HAVING count(parent) > 1 ".format(nodes=",".join(map(str,nodes))) ## Thanks to andrewjmc@github for this suggestion + child_w_dpi = self.database.query(QUERY).fetchall() ## Fetch all conflicting links and give them unique index before printing + child_w_dpi = list(*child_w_dpi) + return child_w_dpi + + def fix_names(self,nodes,tn,tr): + '''Return names instead of taxid''' + nnodes = [] + for x in range(len(nodes)): + fixnames = list(nodes[x]) + fixnames[0] = tn[fixnames[0]] + fixnames[1] = tn[fixnames[1]] + fixnames[2] = tr[fixnames[2]] + nnodes.append(fixnames) + return nnodes + + + def double_vis_path(self,duplicates,taxid,nodes): + '''The range in the tree has two identical nodes, ask user to resolve which path goes where''' + import inquirer,random + tn = self.database.get_nodes() + tr = self.database.get_rank() + children = self.database.get_links(self.database.get_children([taxid],maxdepth=1)) + parents = duplicates[taxid] + if len(children) > 2: + raise VisualisationError("Names occuring three times in the same tree are not taken care of at this time, export function does however!") + children_N = self.fix_names(children,tn,tr) + parents_N = self.fix_names(parents,tn,tr) + selto = parents_N[0] + default = children_N[0] + if taxid != "name": + taxid = tn[taxid] + questions = [ + inquirer.List('parent', + message="The node {name} has two paths, select the correct path to ({p1})".format(name=taxid,p1=selto,children=children_N), + choices=children_N, + ), + ] + selected = inquirer.prompt(questions)["parent"] + import random + n = random.randint(10000000,10100000) + if selected != default: + children[0],children[1] = children[1],children[0] + ## Change id for first pair + pc = list(parents[0]) + pc[1] = n + parents[0] = tuple(pc) + pc = list(children[0]) + pc[0] = n + children[0] = tuple(pc) + nodes[n] = taxid + return [*children,*parents],nodes + + def duplicate_parents_check(self,nodes,tree,taxid): + '''Check if any node have optional parents + Parameters: + nodes - the list of nodes of interest + Returns: + list - list of nodes with optional parents + ''' + duplicates = self.unique_indexes(nodes) + optional_parents = {} + for child in duplicates: + optional_parents[child] = self.database.get_parent(child,all=True) + return optional_parents + + def fix_tree(self,tree,duplicates,nodes): + '''Change index for one all but one duplicate to make sure tree is printed correctly''' + update_tree = [] + dup = [duplicates[k] for k in duplicates.keys()] + for link in tree: + if len(set([link]) & set(*dup)) > 0: + pass + else: + update_tree.append(link) + changes = [] + for child in duplicates.keys(): + changes,nodes = self.double_vis_path(duplicates,child,nodes) + update_tree += changes + tree = sorted(update_tree,key=lambda x:x[2]) + return tree,nodes + + def build_tree(self,taxid=False,maxdepth=3,check_parent=False): '''Build newick tree from database This function walks through a database of nodes and creates NewickNode objects self-aware of their decending newick tree or their parent lineage, @@ -265,7 +393,10 @@ def build_tree(self,taxid=False,maxdepth=3): NewickTree nodeDict by their node name ''' tree,nodes = self.get_tree(taxid=taxid,maxdepth=maxdepth) - nodes = self.get_nodes(nodes | set([self.taxid])) + duplicates = self.duplicate_parents_check(nodes - set([self.taxid]),tree,taxid) + nodes = self.get_nodes(nodes | set([self.taxid]),col=1) + if len(duplicates) > 0: + tree,nodes = self.fix_tree(tree,duplicates,nodes) logger.debug("Nodes: {n} Links: {l}".format(n=len(nodes),l=len(tree))) logger.debug([nodes,tree]) self.added_parent = False @@ -284,7 +415,7 @@ def build_tree(self,taxid=False,maxdepth=3): self.nodeDict["root"] = root ## Also add this reference as "root" newickTree = root ## The master parent will contain the full tree continue - self.new_node(child,nodes,parent) + self.new_node(child,nodes,parent,link=rank) ## The newickTree is the same as the master parent (containing the full tree, the root node is defined on row 135) logger.debug("Tree complete, return newickTree") diff --git a/flextaxd/modules/ReadTaxonomy.py b/flextaxd/modules/ReadTaxonomy.py index 0f5c547..3bc06c9 100644 --- a/flextaxd/modules/ReadTaxonomy.py +++ b/flextaxd/modules/ReadTaxonomy.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -c ''' -Read NCBI taxonomy dmp files (nodes or names) and holds a dictionary +Read SILVA taxonomy dmp files (nodes or names) and holds a dictionary ''' from .database.DatabaseConnection import DatabaseFunctions import logging +import gzip +import sys logger = logging.getLogger(__name__) class InputError(Exception): @@ -16,7 +18,7 @@ def __init__(self, message): class ReadTaxonomy(object): """docstring for ReadTaxonomy.""" - def __init__(self, taxonomy_file=False, taxonomy_name=False, database=False,verbose=False,ncbi=False,**kwargs): + def __init__(self, taxonomy_file=False, taxonomy_name=False, database=False,verbose=False,**kwargs): super(ReadTaxonomy, self).__init__() ### Connect to or create database if database: @@ -34,7 +36,7 @@ def __init__(self, taxonomy_file=False, taxonomy_name=False, database=False,verb self.qiime = 0 ## Add base node - self.add_node("root") + self.root = self.add_node("root") self.add_rank("no rank") ## Add basic link self.add_link(child=1, parent=1,rank="no rank") @@ -42,6 +44,17 @@ def __init__(self, taxonomy_file=False, taxonomy_name=False, database=False,verb def set_qiime(self,opt): self.qiime = opt + def zopen(self, path,*args, **kwargs): + '''Redefine open to handle zipped files automatically''' + if path.endswith(".gz"): + if str(*args) in ["r","w"] and sys.version_info.major >= 3: + ## Python version three and above interprets binary formats as binary, add t (rt) to get text returned + args = (str(*args)+"t",) + else: + args = ("rt") + return gzip.open(path,*args,**kwargs) + return open(path,*args,**kwargs) + #@staticmethod def parse_taxonomy(self,treefile=False): '''Parse taxonomy information''' @@ -93,7 +106,7 @@ def read_nodes(self, treefile=False): rank = "no rank" #Base rank if rank is not used if not treefile: treefile = self.taxonomy_file - with open(treefile, "r") as _treefile: + with self.zopen(treefile, "r") as _treefile: headers = _treefile.readline().strip().split(self.sep) if "parent" not in headers or "child" not in headers: raise InputError("Your input tree file does not contain the headers to specify child and parent!") @@ -130,13 +143,13 @@ def read_nodes(self, treefile=False): def parse_genomeid2taxid(self,genomeid2taxid): '''Parse file that annotates genome_id´s to nodes in the tree''' nodeDict = self.database.get_nodes() - with open(genomeid2taxid,"rt") as f: + with self.zopen(genomeid2taxid,"rt") as f: headers = f.readline().strip().split("\t") for row in f: if row.strip() != "": ## If there are trailing empty lines in the file genomeid,taxid = row.strip().split("\t") try: - self.database.add_genome(genome=genomeid,_id=nodeDict[taxid.strip()]) + self.database.add_genome(genome=genomeid.strip(),_id=nodeDict[taxid.strip()]) except KeyError: logger.warning("# WARNING: {taxid} not found in the database".format(taxid=taxid)) self.database.commit() diff --git a/flextaxd/modules/ReadTaxonomyCanSNPer.py b/flextaxd/modules/ReadTaxonomyCanSNPer.py index 99c73ba..313d0b5 100644 --- a/flextaxd/modules/ReadTaxonomyCanSNPer.py +++ b/flextaxd/modules/ReadTaxonomyCanSNPer.py @@ -27,9 +27,9 @@ def __str__(self): class ReadTaxonomyCanSNPer(ReadTaxonomy): """docstring for ReadTaxonomyCanSNPer.""" def __init__(self, taxonomy_file=False, database=".canSNPdb", taxid_base=1,root_name=False,rank="family", verbose=False,**kwargs): - super(ReadTaxonomyCanSNPer, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose) + super(ReadTaxonomyCanSNPer, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose,**kwargs) self.input = taxonomy_file - #self.taxonomy = {} + self.taxonomy = {} self.taxid_num = taxid_base ## Initiate database logger.info(taxonomy_file) @@ -37,16 +37,20 @@ def __init__(self, taxonomy_file=False, database=".canSNPdb", taxid_base=1,root if not root_name: logger.info("Fetching root name from file") root_name = self.get_root_name(taxonomy_file) - if root_name != "root": - root_i = self.add_node(root_name) - logger.info("Adding root node {node}!".format(node=root_name)) - else: - root_i = self.taxonomy[root_name] + logger.info("Adding, cellular organism node") + c_i = self.add_node("cellular organisms") + logger.info("Adding root node {node}!".format(node=root_name)) + root_i = self.add_node(root_name) self.taxid_num =taxid_base ## reset logger.debug("Adding ranks!") + if rank != "no rank": + self.add_rank("no rank") self.add_rank(rank) - self.add_rank("no rank") - self.add_link(root_i, root_i,rank=rank) + if root_name != "cellular organisms": + self.add_link(c_i,1) + if root_name != "root": + self.add_link(root_i,c_i,rank=rank) ## root is always added as top parent if not one force 1 + self.taxonomy[root_i] = 1 ## Add local link to root self.names = {} self.root = root_i self.length = 0 @@ -57,7 +61,7 @@ def __init__(self, taxonomy_file=False, database=".canSNPdb", taxid_base=1,root def get_root_name(self,fin): '''Get the root name of the CanSNP file unless root is specified''' - with open(fin) as f: + with self.zopen(fin) as f: firstline = f.readline() firstline = firstline.strip().replace("\t",";") root = firstline.split(";") ## Root should be either the leftmost annotation in the tree or the only annotation on that row @@ -87,10 +91,11 @@ def add_SNP(self,nodes,i): def parse_taxonomy(self): '''Retrieve node description from CanSNPer formatted tree''' logger.info("Parse CanSNP tree file") - with open(self.taxonomy_file,"r") as f: + with self.zopen(self.taxonomy_file,"r") as f: for row in f: row = row.strip().replace("\t",";") ## Also accept tab separated tree files nodes = row.strip().split(";") ## get node and all its parents in a list + logger.debug(nodes) child = nodes[-1] ## get name of child node '''If the tree was not properly formatted an a parent is missing make sure that function works anyway by adding any parent node above child''' try: diff --git a/flextaxd/modules/ReadTaxonomySILVA.py b/flextaxd/modules/ReadTaxonomySILVA.py new file mode 100644 index 0000000..1b96300 --- /dev/null +++ b/flextaxd/modules/ReadTaxonomySILVA.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 -c + +''' +Read SILVA taxonomy holds a dictionary with taxonomy tree and name translation +''' + +__version__ = "1.0" +__author__ = "David Sundell" +__credits__ = ["David Sundell"] +__license__ = "GPLv3" +__maintainer__ = "FOI bioinformatics group" +__email__ = "bioinformatics@foi.se" +__date__ = "2020-01-17" +__status__ = "Production" + +from .ReadTaxonomy import ReadTaxonomy +import logging +logger = logging.getLogger(__name__) + +class ImportFormatError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return repr(self.value) + + +class ReadTaxonomySILVA(ReadTaxonomy): + """docstring for ReadTaxonomySILVA.""" + def __init__(self, taxonomy_file=False, database=".silva", taxid_base=1,root_name="root",rank="family", verbose=False,**kwargs): + super(ReadTaxonomySILVA, self).__init__(taxonomy_file=taxonomy_file, database=database,verbose=verbose,**kwargs) + self.input = taxonomy_file + self.taxid_num = taxid_base + ## Initiate database + logger.info(taxonomy_file) + self.length = 0 + self.database.commit() + logger.debug("Root: {root} link: [{base}]".format(root=self.root, base=self.taxid_num)) + + def parse_taxonomy(self): + '''Retrieve node description from SILVA formatted tree''' + logger.info("Parse SILVA tree file") + with self.zopen(self.taxonomy_file,"r") as f: + for row in f: + tree,info = row.strip().rsplit(";",1) ## separate tree from info columns + nodes = tree.strip().split(";") ## get node and all its parents in a list. + info = info.lstrip("\t") + try: + taxid,rank,_ = info.split("\t",2) + except ValueError: + taxid,rank = info.split("\t") + child = nodes[-1] ## get name of child node + self.add_rank(rank) + '''If the tree was not properly formatted an a parent is missing make sure that function works anyway by adding any parent node above child''' + try: + pname = nodes[-2].strip() + parent_i = self.taxonomy[pname] ## Check if parent of child node exists + except IndexError: ## Should be first row with only one node (parent) + child_i = self.add_node(child,id=taxid) + self.add_link(child_i,self.root,rank=rank) + continue + '''Now all parents exists, add new child node and add the link''' + child_i = self.add_node(child,id=taxid) + self.add_link(child_i,parent_i,rank=rank) + self.database.commit() + self.length = len(self.taxonomy) ## Check number of new nodes added + self.taxid_num = len(self.taxonomy) + logger.info("New taxonomy ids assigned {taxidnr}".format(taxidnr=self.length)) diff --git a/flextaxd/modules/WriteTaxonomy.py b/flextaxd/modules/WriteTaxonomy.py index f3f6a6e..204b648 100644 --- a/flextaxd/modules/WriteTaxonomy.py +++ b/flextaxd/modules/WriteTaxonomy.py @@ -23,6 +23,7 @@ def __init__(self, path, database=".taxonomydb",separator="\t|\t",minimal=False, self.dump_descriptions = desc logging.debug("Add descriptions: {desc}".format(desc=self.dump_descriptions)) ### Allows a minimal output file with only nessesary fields default is NCBI id | name | empty | scientific name + self.updated = False self.minimal = minimal if self.minimal: if dbprogram: @@ -72,18 +73,44 @@ def get_links(self, table, select="child,parent,rank"): logging.debug(QUERY) return self.database.query(QUERY).fetchall() + def unique_indexes(self): + '''Check duplicated indexes and give them unique IDs before print''' + QUERY = "SELECT child FROM tree GROUP BY child HAVING count(parent) > 1" ## Thanks to andrewjmc@github for this suggestion + child_w_dpi = self.database.query(QUERY).fetchall() ## Fetch all conflicting links and give them unique index before printing + child_w_dpi = list(*child_w_dpi) + lmax = 10000000 + if len(child_w_dpi) > 0: + self.nodeDict = self.database.get_nodes(col=1) + lmax = max(self.nodeDict) + return lmax,child_w_dpi + def nodes(self): '''Write database tree to nodes.dmp''' logging.info('Write tree to: {}{}.dmp'.format(self.path,self.prefix[1])) + child_w_dpi = [] + checklist = {} with open('{}{}.dmp'.format(self.path,self.prefix[1]),"w") as outputfile: ## Retrieve all links that exists in the database if self.dump_descriptions: self.nodeDict = self.database.get_nodes() print("child\tparent\trank", sep=self.separator, end="\n", file=outputfile) - + else: + lmax,child_w_dpi = self.unique_indexes() links = self.get_links('tree','child,parent,rank') for link in links: link = list(link) + '''If child with duplicate index, make sure link has unique index, match with node''' + if link[0] in child_w_dpi: + try: + checklist[link[0]] += 1 + name = self.nodeDict[link[0]] + '''Update node index to uniqe''' + link[0] = lmax+100+len(checklist.keys())+checklist[link[0]] + self.nodeDict[link[0]] = name + self.updated = True + except KeyError: + checklist[link[0]] = 0 + ## Do nothing if self.link_order: link[0],link[1] = link[1],link[0] if self.dump_descriptions: @@ -104,7 +131,10 @@ def names(self): end = "\t|\n" with open('{}{}.dmp'.format(self.path,self.prefix[0]),"w") as outputfile: ## Retrieve all nodes that exists in the database - nodes = self.get_all('nodes', 'id,name') + if self.updated: + nodes = self.nodeDict.items() + else: + nodes = self.get_all('nodes', 'id,name') empty = "" for node in nodes: if not self.minimal: diff --git a/flextaxd/modules/database/CreateDatabase.py b/flextaxd/modules/database/CreateDatabase.py index ddc8f44..905b324 100644 --- a/flextaxd/modules/database/CreateDatabase.py +++ b/flextaxd/modules/database/CreateDatabase.py @@ -30,7 +30,7 @@ def __init__(self, verbose=False): FOREIGN KEY (parent) REFERENCES nodes (id), FOREIGN KEY (child) REFERENCES nodes (id), FOREIGN KEY (rank_i) REFERENCES rank (rank_i), - unique (parent, child) + unique (parent, child,rank_i) );""" self.sql_create_rank_table = """CREATE TABLE IF NOT EXISTS rank ( diff --git a/flextaxd/modules/database/DatabaseConnection.py b/flextaxd/modules/database/DatabaseConnection.py index 274e3c8..142e3db 100644 --- a/flextaxd/modules/database/DatabaseConnection.py +++ b/flextaxd/modules/database/DatabaseConnection.py @@ -326,13 +326,21 @@ def check_parent(self): True: if node has one and only one parent ''' try: - QUERY = "SELECT parent FROM tree GROUP BY child HAVING count(parent) > 1" ## Thanks to andrewjmc@github for this suggestion + QUERY = "SELECT child FROM tree GROUP BY child HAVING count(parent) > 1" ## Thanks to andrewjmc@github for this suggestion logger.debug(QUERY) - return self.query(QUERY).fetchall() + child_w_dp = self.query(QUERY).fetchall() + QUERY = "SELECT parent,rank_i FROM tree WHERE child in ({children})".format(children=",".join(map(str,list(*child_w_dp)))) + p_ranks = self.query(QUERY).fetchall() + if len(p_ranks) != len(set(p_ranks)): + logger.error("Nodes with two parents have identical ranks, Fatal Error") + elif len(child_w_dp) > 0: + logger.info("Found: {n} nodes with multiple parents, however in different lineages, OK.".format(n=len(child_w_dp))) + return [] + return child_w_dp except AttributeError: logger.info("AttributeError occured") logger.info(QUERY) - logger.info(res) + logger.info(child_w_dp) raise TreeError("Node: {node} has more than one parent!".format(node=name)) '''Get functions of class''' @@ -379,6 +387,18 @@ def get_genomes(self, database=False,limit=0,table="genomes",cols="id,genome"): genomeDict[genome] = id return genomeDict + def get_node(self, name, database=False): + '''Retrieve one node by name + + ------ + Returns + list - list with node id + ''' + QUERY = '''SELECT id FROM nodes where name = "{name}"'''.format(name=name) + if not database: + database = self.database + return [n[0] for n in self.query(QUERY).fetchall()] + def get_nodes(self, database=False,col=False): '''Retrieve the whole node info table of the database to decrease the number of database calls! @@ -412,10 +432,10 @@ def get_links(self, nodes=False,database=False,swap=False,only_parents=False,ord if only_parents and nodes: QUERY = '''SELECT {order},rank_i FROM tree WHERE child in ({nodes})'''.format(nodes=",".join(map(str,nodes)),order=",".join(order)) + elif nodes and order: + QUERY = '''SELECT {order},rank_i FROM tree WHERE child in ({nodes}) ORDER BY parent ASC'''.format(nodes=",".join(map(str,nodes)),order=",".join(order)) elif nodes: QUERY = '''SELECT {order},rank_i FROM tree WHERE parent in ({nodes}) OR child in ({nodes})'''.format(nodes=",".join(map(str,nodes)),order=",".join(order)) - elif nodes and order: - QUERY = '''SELECT {order},rank_i FROM tree WHERE child in ({nodes}) ORDER BY child ASC''' else: QUERY = '''SELECT {order},rank_i FROM tree'''.format(order=",".join(order)) logger.debug(QUERY) @@ -642,15 +662,20 @@ def get_rank(self,col=1): rankDict[rank[1]] = rank[0] return rankDict - def get_children(self,parents,children=set(),level=0,maxdepth=50): + def get_children(self,parents,children=set(),level=0,maxdepth=50,selected=False): '''Get all children from a parent Returns ------ set - unique list of children from a decending tree ''' + if level >= maxdepth: + return children QUERY = '''SELECT child FROM tree WHERE parent in({nodes})'''.format(nodes=",".join(map(str,list(parents)))) + if selected: + QUERY = '''SELECT child FROM tree WHERE parent in({nodes}) and rank_i = {rank}'''.format(nodes=",".join(map(str,list(parents))), rank=selected) #logger.debug(QUERY) + #print(QUERY) res = self.query(QUERY).fetchall() if (len(res) + len(children)) != 0: children = set([child_i[0] for child_i in res]) @@ -658,7 +683,7 @@ def get_children(self,parents,children=set(),level=0,maxdepth=50): children |= self.get_children(parents=children,level=level+1,maxdepth=maxdepth) return children - def get_parent(self,name): + def get_parent(self,name,all=False): '''Get parent from node id parent Returns @@ -668,7 +693,10 @@ def get_parent(self,name): #QUERY = '''SELECT parent,child,rank FROM tree LEFT JOIN rank on (tree.rank_i = rank.rank_i) WHERE child = "{node}"'''.format(node=name) QUERY = '''SELECT parent,child,rank_i FROM tree WHERE child = "{node}"'''.format(node=name) logger.debug(QUERY) - res = self.query(QUERY).fetchone() + if all: + res = self.query(QUERY).fetchall() + else: + res = self.query(QUERY).fetchone() return res def get_parents(self,name,parents=set(),depth=0,find_all=False): @@ -679,10 +707,15 @@ def get_parents(self,name,parents=set(),depth=0,find_all=False): list - all parents of a node ''' ld = depth+1 - name = set(list(map(int,name))) ## Make sure all names are int + try: + name = set(list(map(int,name))) ## Make sure all names are int + except TypeError: + name = set(list(map(int,[x[0] for x in name]))) ## Make sure all names are int if isinstance(name, int): name = [name] - QUERY = '''SELECT parent,child FROM tree WHERE child in ({node})'''.format(node=",".join(map(str,name))) + if len(name & set([1])) > 0: + return parents + QUERY = '''SELECT parent,child,rank_i FROM tree WHERE child in ({node})'''.format(node=",".join(map(str,name))) #logger.debug(QUERY) res = self.query(QUERY).fetchall() if not find_all: @@ -691,6 +724,10 @@ def get_parents(self,name,parents=set(),depth=0,find_all=False): except IndexError: logger.warning("WARNING: parent could not be found for node {res} \n{query}".format(query=QUERY, res=name)) return set() + except TypeError: + #print(res) + logger.error("ERROR: cannot fetch parents when parents are duplicated!") + raise TreeError("cannot fetch parents when parents are duplicated!") if res[0] == res[1]: ## Special for root return set([res[0]]) else: @@ -702,7 +739,7 @@ def get_parents(self,name,parents=set(),depth=0,find_all=False): ## Add current node parents |= set([int(res[0])]) else: - parents = set([int(pc[0]) for pc in res]) + parents = set([(int(pc[0]),int(pc[-1])) for pc in res]) if len(parents-name) == 0: ## There are no more parents to fetch return parents else: