-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_ptm_curation.py
289 lines (256 loc) · 11.5 KB
/
generate_ptm_curation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import os
import gzip
import csv
import json
import tqdm
import codecs
import pystow
from collections import defaultdict
from indra.statements import stmt_from_json, RegulateActivity, RegulateAmount,\
modtype_to_modclass, Modification
from indra.databases import hgnc_client
from indra.util import batch_iter
from indra.sources import indra_db_rest
import indra.tools.assemble_corpus as ac
from indra.assemblers.html import HtmlAssembler
processed_stmts_path = pystow.join('indra', 'db',
name='processed_statements.tsv.gz')
def enzyme_is_phosphatase(stmt):
if isinstance(stmt, Modification) and stmt.enz is not None:
if hgnc_client.is_phosphatase(stmt.enz.name):
return True
elif isinstance(stmt, (RegulateActivity, RegulateAmount)):
if hgnc_client.is_phosphatase(stmt.subj.name):
return True
return False
def get_e3_ubi_ligases():
if not os.path.exists('e3_ubi_ligases.txt'):
df = pystow.ensure_excel(
'indra_signor',
url='https://esbl.nhlbi.nih.gov/Databases/KSBP2/'
'Targets/Lists/E3-ligases/'
'Definite%20Ligase%20List.xlsx',
read_excel_kwargs={'skiprows': 12})
gene_names = []
for gene in df['Gene Symbol']:
if hgnc_client.get_hgnc_id(gene):
gene_names.append(gene)
else:
# Some outdated gene names and ones that aren't all caps
hgnc_id = hgnc_client.get_current_hgnc_id(gene.upper())
if hgnc_id:
current_name = hgnc_client.get_hgnc_name(hgnc_id)
gene_names.append(current_name)
else:
print(f'Gene {gene} not found in HGNC')
with open('e3_ubi_ligases.txt', 'w') as fh:
for gene_name in gene_names:
fh.write(f'{gene_name}\n')
else:
with open('e3_ubi_ligases.txt', 'r') as fh:
gene_names = [line.strip()
for line in fh.readlines() if line.strip()]
return gene_names
UBI_LIGASES = get_e3_ubi_ligases()
def enzyme_is_e3_ubi_ligase(stmt):
assert len(stmt.evidence) == 1
if stmt.evidence[0].text == 'UbiNet 2.0':
return False
if isinstance(stmt, Modification) and stmt.enz is not None:
if stmt.enz.name in UBI_LIGASES:
return True
elif isinstance(stmt, (RegulateActivity, RegulateAmount)):
if stmt.subj.name in UBI_LIGASES:
return True
return False
semantic_constraints = {
'dephosphorylation': enzyme_is_phosphatase,
'ubiquitination': enzyme_is_e3_ubi_ligase
}
def load_statement_json(json_str: str, attempt: int = 1, max_attempts: int = 5):
try:
return json.loads(json_str)
except json.JSONDecodeError:
if attempt < max_attempts:
json_str = codecs.escape_decode(json_str)[0].decode()
return load_statement_json(
json_str, attempt=attempt + 1, max_attempts=max_attempts
)
raise ValueError
# This function can be used to get a clean set of just
# a given type of modification statements but since SIGNOR
# requires polarity of regulation along with PTMs we need
# to use get_mod_and_reg_stmts instead
def get_mod_stmts(mod_key='dephophorylation'):
# Initial filter for a given mod
stmts_by_hash = defaultdict(list)
with gzip.open(processed_stmts_path, 'rt') as fh:
reader = csv.reader(fh, delimiter='\t')
for lines in tqdm.tqdm(batch_iter(reader, 10000), total=6041):
for stmt_hash, stmt_json in lines:
if mod_key not in stmt_json.lower():
continue
stmt_json = load_statement_json(stmt_json)
stmt = stmt_from_json(stmt_json)
if isinstance(stmt, modtype_to_modclass[mod_key]):
constraint = semantic_constraints.get(mod_key)
if constraint:
if constraint(stmt):
stmts_by_hash[stmt_hash].append(stmt)
else:
stmts_by_hash[stmt_hash].append(stmt)
return stmts_by_hash
def get_mod_and_reg_stmts(mod_key='dephophorylation', regulation_types=None):
if regulation_types is None:
regulation_types = {RegulateActivity}
stmts_by_hash = defaultdict(list)
stmt_types_allowed = \
tuple({modtype_to_modclass[mod_key]} | regulation_types)
with gzip.open(processed_stmts_path, 'rt') as fh:
reader = csv.reader(fh, delimiter='\t')
for lines in tqdm.tqdm(batch_iter(reader, 10000), total=6041):
for stmt_hash, stmt_json in lines:
stmt_json = load_statement_json(stmt_json)
stmt = stmt_from_json(stmt_json)
if isinstance(stmt, stmt_types_allowed):
agents = stmt.agent_list()
if len(agents) != 2 or agents[0] is None:
continue
controller = agents[0]
if controller.db_refs.get('HGNC'):
constraint = semantic_constraints.get(mod_key)
if constraint:
if constraint(stmt):
stmts_by_hash[stmt_hash].append(stmt)
else:
stmts_by_hash[stmt_hash].append(stmt)
return stmts_by_hash
if __name__ == '__main__':
####################
mod_key = 'ubiquitination'
regulation_types = {RegulateAmount, RegulateActivity}
####################
mod_class = modtype_to_modclass[mod_key]
stmts_by_hash = get_mod_and_reg_stmts(mod_key, regulation_types)
print(f'Got {len(stmts_by_hash)} statements of type {mod_key} and'
f' regulation types {regulation_types}')
# Filter for human genes, non-self regulations
stmts_by_hash = {k: v for k, v in stmts_by_hash.items() if
'HGNC' in v[0].real_agent_list()[1].db_refs}
print(f'Filtered to {len(stmts_by_hash)} statements with human genes')
stmts_by_hash = {k: v for k, v in stmts_by_hash.items() if
v[0].real_agent_list()[0].name != v[0].real_agent_list()[1].name}
print(f'Filtered to {len(stmts_by_hash)} statements with non-self regulations')
# Flatten list of hash-based lists
stmts = []
for ss in stmts_by_hash.values():
stmts += ss
# Filter for curated statements
curs = indra_db_rest.get_curations()
stmts = ac.filter_by_curation(stmts, curs)
# Get just modification statements of the given type
mod_stmts = [stmt for stmt in stmts if isinstance(stmt, mod_class)]
# Organize mod Statements by enzyme/substrate pair
mod_stmts_by_pair = defaultdict(list)
for stmt in mod_stmts:
agents = stmt.real_agent_list()
pair = (agents[0].name, agents[1].name)
mod_stmts_by_pair[pair].append(stmt)
# Organize SIGNOR modification Statements by whether they have a
# specific site or not
has_signors_without_site = {}
has_signors_with_site = defaultdict(set)
for k, v in mod_stmts_by_pair.items():
for stmt in v:
if stmt.evidence[0].source_api == 'signor':
if stmt.position:
has_signors_with_site[k].add(stmt.position)
else:
has_signors_without_site[k] = True
has_signors_with_site = dict(has_signors_with_site)
# Filter out Statements that are already in SIGNOR
mod_stmts_by_pair_new = defaultdict(set)
for k, v in mod_stmts_by_pair.items():
for stmt in v:
if k in has_signors_with_site:
if stmt.position is None:
continue
elif stmt.position in has_signors_with_site[k]:
continue
elif k in has_signors_without_site:
if stmt.position is None:
continue
mod_stmts_by_pair_new[k].add(stmt)
mod_stmts_by_pair_new = dict(mod_stmts_by_pair_new)
# We now organize all statements by relevant paper
stmts_by_paper = defaultdict(dict)
for stmt in stmts:
pair = (stmt.real_agent_list()[0].name, stmt.real_agent_list()[1].name)
if pair not in mod_stmts_by_pair_new:
continue
for ev in stmt.evidence:
if not ev.pmid:
continue
if pair in stmts_by_paper[ev.pmid]:
stmts_by_paper[ev.pmid][pair].add(stmt)
else:
stmts_by_paper[ev.pmid][pair] = {stmt}
stmts_by_paper = dict(stmts_by_paper)
all_relevant_stmts = []
for paper, stmts_by_pair in stmts_by_paper.items():
for pair, stmts_for_pair in stmts_by_pair.items():
stmts_by_type = defaultdict(list)
for stmt in stmts_for_pair:
if isinstance(stmt, mod_class):
if stmt in mod_stmts_by_pair_new[pair]:
stmts_by_type[type(stmt)].append(stmt)
else:
stmts_by_type[type(stmt)].append(stmt)
# This means that in this paper, for this pair, we have
# both a modification and a regulation statement
if mod_class in stmts_by_type and len(stmts_by_type) > 1:
# Now we need to save all _novel_ modifications and all
# regulation statements from this paper
for stmt in stmts_for_pair:
if isinstance(stmt, mod_class):
if stmt in mod_stmts_by_pair_new[pair]:
all_relevant_stmts.append(stmt)
else:
all_relevant_stmts.append(stmt)
# Assemble Statements and dump the results
stmts_assembled = ac.run_preassembly(all_relevant_stmts,
return_toplevel=False)
# Sort evidence by PMID
for stmt in stmts_assembled:
stmt.evidence = sorted(stmt.evidence, key=lambda x: x.pmid)
# This function sorts statements such that they are grouped according
# to phosphatase/target pairs, sorted according to the number of
# evidences for the given type of modification overall, and then sorted
# according to the number of evidences for the specific statement
# type with Modification always coming first within the group.
def get_sort_key(stmt, ev_counts):
agents = stmt.real_agent_list()
a, b = [a.name for a in agents[:2]]
ab_key = (a, b)
group_ev_key = ev_counts.get(ab_key, 0)
stmt_type_key = 0 if isinstance(stmt, mod_class) else 1
ev_key = len(stmt.evidence)
return (-group_ev_key, ab_key, stmt_type_key, -ev_key)
# We need these to sort the statements, these are evidences
# for the given type of modification specifically
ev_counts = defaultdict(int)
for stmt in stmts_assembled:
agents = stmt.real_agent_list()
a, b = [a.name for a in agents[:2]]
ab_key = (a, b)
if isinstance(stmt, mod_class):
ev_counts[ab_key] += len(stmt.evidence)
ev_counts = dict(ev_counts)
sorted_stmts = sorted(stmts_assembled, key=lambda x: get_sort_key(x, ev_counts))
# Dump Statements as pickle as HTML
ac.dump_statements(sorted_stmts, f'{mod_key}s_with_reg_sorted.pkl')
ha = HtmlAssembler(sorted_stmts,
db_rest_url='https://db.indra.bio',
title=f'INDRA {mod_key} Statements with regulation sample')
m = ha.make_model(grouping_level='agent-pair')
ha.save_model(f'indra_{mod_key}_with_reg.html')