-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnewick2orthoxml.py
executable file
·191 lines (150 loc) · 7.84 KB
/
newick2orthoxml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/python
import sys
from ete2 import PhyloTree, orthoxml
import argparse
__author__ = "Jaime Huerta-Cepas"
__email__ = "[email protected]"
__DESCRIPTION__ = """
This is a python script that extracts evolutionary events
(speciation and duplication) from a newick tree and exports them as an
OrthoXML file.
It uses ETE's orthoxml API to create and populate OrthoXML documents:
http://etetoolkit.org
http://pythonhosted.org/ete2/tutorial/tutorial_etree2orthoxml.html#orthoxml-parser
"""
SPECIES_NAME_DELIMITER = ""
SPECIES_NAME_POS = 0
def extract_spname(node_name):
return node_name.split(SPECIES_NAME_DELIMITER)[SPECIES_NAME_POS]
def export_as_orthoXML(t, database, evoltype_attr="evoltype"):
""" This function takes a ETE PhyloTree instance and export all
its speciation and duplication events as OrthoXML format.
Note that this function can be enriched or customized to include
more orthoXML features using the orthoxml.py module.
"""
# Creates an empty orthoXML object
O = orthoxml.orthoXML()
#Generate the structure containing sequence information
leaf2id= {}
sp2genes = {}
for genid, leaf in enumerate(t.iter_leaves()):
spname=extract_spname(leaf.name)
if spname not in sp2genes:
sp = orthoxml.species(spname)
db = orthoxml.database(name=database)
genes = orthoxml.genes()
sp.add_database(db)
db.set_genes(genes)
sp2genes[spname] = genes
# add info to the orthoXML document
O.add_species(sp)
else:
genes = sp2genes[spname]
gn = orthoxml.gene(protId=leaf.name, id=genid)
leaf2id[leaf] = genid
genes.add_gene(gn)
# Add an ortho group container to the orthoXML document
ortho_groups = orthoxml.groups()
O.set_groups(ortho_groups)
# OrthoXML does not support duplication events to be at the root
# of the tree, so we search for the top most speciation events in
# the tree and export them as separate ortholog groups
is_speciation = lambda n: getattr(n, evoltype_attr, "") == "S" or not n.children
for speciation_root in t.iter_leaves(is_leaf_fn=is_speciation):
# Creates an orthogroup in which all events will be added
node2event = {}
node2event[speciation_root] = orthoxml.group()
ortho_groups.add_orthologGroup(node2event[speciation_root])
# if root node is a leaf, just export an orphan sequence within the group
if speciation_root.is_leaf():
node2event[speciation_root].add_geneRef(orthoxml.geneRef(leaf2id[speciation_root]))
# otherwise, descend the tree and export orthology structure
for node in speciation_root.traverse("preorder"):
if node.is_leaf():
continue
parent_event = node2event[node]
for ch in node.children:
if ch.is_leaf():
parent_event.add_geneRef(orthoxml.geneRef(leaf2id[ch]))
else:
node2event[ch] = orthoxml.group()
try:
evoltype = getattr(ch, evoltype_attr)
except AttributeError:
if not ch.is_leaf():
print ch.features
raise AttributeError("\n\nUnknown evolutionary event. Please use [S] for speciation and [D] for duplication: %s" %ch.get_ascii())
if evoltype == "D":
parent_event.add_paralogGroup(node2event[ch])
elif evoltype == "S":
parent_event.add_orthologGroup(node2event[ch])
O.export(sys.stdout, 0, namespace_="")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__DESCRIPTION__,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('tree', metavar='tree_file', type=str, nargs=1,
help='A tree file (or text string) in newick format.')
parser.add_argument("--sp_delimiter", dest="species_delimiter",
type=str, default="_",
help=("When species names are guessed from node names,"
" this argument specifies how to split node name to guess"
" the species code"))
parser.add_argument("--sp_field", dest="species_field",
type=int, default=1,
help=("When species names are guessed from node names,"
" this argument specifies the position of the species"
" name code relative to the name splitting delimiter"))
parser.add_argument("--root", dest="root",
type=str, nargs="*",
help="Roots the tree to the node grouping the list"
" of node names provided (space separated). In example:"
"'--root human rat mouse'")
parser.add_argument("--skip_ortholog_detection", dest="skip_ortholog_detection",
action="store_true",
help=("Skip automatic detection of"
" speciation and duplication events, thus relying in the"
" correct annotation of the provided tree using"
" the extended newick format (i.e. '((A, A)[&&NHX:evoltype=D], B)[&&NHX:evoltype=S];')"))
parser.add_argument("--evoltype_attr", dest="evoltype_attr",
type=str, default="evoltype",
help=("When orthology detection is disabled,"
" the attribute name provided here will be expected to exist"
" in all internal nodes and read from the extended newick format"))
parser.add_argument("--database", dest="database",
type=str, default="",
help=("Database name"))
parser.add_argument("--show", dest="show",
action="store_true", default="",
help=("Show the tree and its evolutionary events before orthoXML export"))
parser.add_argument("--ascii", dest="ascii",
action="store_true", default="",
help=("Show the tree using ASCII representation and all its evolutionary"
" events before orthoXML export"))
parser.add_argument("--newick", dest="newick",
action="store_true", default="",
help=("print the extended newick format for provided tree using"
" ASCII representation and all its evolutionary events"
" before orthoXML export"))
args = parser.parse_args()
newick = args.tree[0]
SPECIES_NAME_POS = args.species_field
SPECIES_NAME_DELIMITER = args.species_delimiter
# load a phylomeDB Tree provided as a newick file in the command line
t = PhyloTree(newick, sp_naming_function=extract_spname)
if args.root:
if len(args.root) > 1:
outgroup = t.get_common_ancestor(args.root)
else:
outgroup = t & args.root[0]
t.set_outgroup(outgroup)
if not args.skip_ortholog_detection:
# detect speciation and duplication events using the species overlap
# algorithm used in phylomeDB
t.get_descendant_evol_events()
if args.ascii:
print t.get_ascii(attributes=[args.evoltype_attr, "name"], show_internal=True)
if args.newick:
print t.write(features=[args.evoltype_attr], format_root_node=True)
if args.show:
t.show()
export_as_orthoXML(t, args.database, args.evoltype_attr)