-
Notifications
You must be signed in to change notification settings - Fork 0
/
visualize_targets.py
112 lines (81 loc) · 3.13 KB
/
visualize_targets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import argparse
import csv
import operator
import sys

import matplotlib.pyplot as plt
import pandas
import requests
import seaborn as sns
def read_csv(filename):
    """Read a comma-delimited file and return its first four columns.

    The first row is treated as a header and skipped. Completely empty rows
    (``csv.reader`` yields ``[]`` for a blank line) are ignored rather than
    triggering an ``IndexError``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A tuple ``(smiles, chembl_id, proba, ec50)`` of four equally long
        lists holding columns 0, 1, 2 and 3 of every data row. Values are
        the raw strings from the file; no numeric conversion is performed.

    Raises:
        IndexError: If a non-empty data row has fewer than four columns.
    """
    smiles = []
    chembl_id = []
    proba = []
    ec50 = []
    with open(filename) as ifile:
        data = csv.reader(ifile, delimiter=',')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(data, None)
        for line in data:
            if not line:
                # Blank line in the file: nothing to index into.
                continue
            smiles.append(line[0])
            chembl_id.append(line[1])
            proba.append(line[2])
            ec50.append(line[3])
    return smiles, chembl_id, proba, ec50
def fetch_target_information(target, timeout=30):
    """Fetch the organism and preferred name of a target from the ChEMBL API.

    Args:
        target: Target identifier; it is substituted into the ChEMBL
            ``/target/<id>.json`` URL as-is.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A tuple ``(target, organism, pref_name)`` where the last two values
        are read from the ``organism`` and ``pref_name`` keys of the JSON
        response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or an HTTP error status (raised via ``raise_for_status``).
        KeyError: If the response lacks ``organism`` or ``pref_name``.
    """
    url = 'https://www.ebi.ac.uk/chembl/api/data/target/{0}.json'.format(target)
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of an obscure JSON/KeyError later.
    response.raise_for_status()
    # Decode the body once; each .json() call re-parses the payload.
    payload = response.json()
    return (target, payload['organism'], payload['pref_name'])
def write_csv(output, filename):
    """Write the target summary rows to ``filename`` as CSV.

    A fixed header row (``CHEMBL_ID, Organism, Target Name, Target Count``)
    is written first, followed by every row in ``output``.

    Args:
        output: Iterable of rows, each an iterable of four values matching
            the header columns.
        filename: Path of the file to create; an existing file is
            overwritten.
    """
    # The csv module emits its own "\r\n" row terminator, so the file layer
    # must not translate newlines (otherwise Windows gets "\r\r\n" and blank
    # rows). Python 3 disables translation with newline=""; Python 2 needs
    # binary mode.
    if sys.version_info[0] >= 3:
        ofile = open(filename, "w", newline="")
    else:
        ofile = open(filename, "wb")
    with ofile:
        writer = csv.writer(ofile)
        writer.writerow(['CHEMBL_ID', 'Organism', 'Target Name', 'Target Count'])
        writer.writerows(output)
# Number of most frequent targets that are looked up, written out and plotted.
_TOP_TARGET_COUNT = 10


def _rank_targets(chembl_ids, limit=_TOP_TARGET_COUNT):
    """Count each id in ``chembl_ids`` and return the ``limit`` most frequent.

    Returns a DataFrame with columns ``chembl_id`` and ``count``, sorted by
    ``count`` in descending order and truncated to at most ``limit`` rows.
    """
    counts = {}
    for target_id in chembl_ids:
        counts[target_id] = counts.get(target_id, 0) + 1
    # list(...) because dict.items() is a view on Python 3, which not every
    # pandas version accepts as constructor data.
    frame = pandas.DataFrame(list(counts.items()), columns=['chembl_id', 'count'])
    # Sorting and slicing (rather than thresholding on the 11th-largest
    # count) works with fewer than 11 targets and does not drop targets that
    # tie with the cutoff value.
    return frame.sort_values(by='count', ascending=False).head(limit)


def _plot_target_counts(top_targets):
    """Show a bar plot of ``count`` per ``chembl_id`` for the given frame."""
    sns.set(style="white", context="talk")
    # x and y come from the same (sorted) frame and are passed by keyword;
    # recent seaborn releases no longer accept them positionally.
    sns.barplot(x=top_targets['chembl_id'], y=top_targets['count'], palette="GnBu_d")
    plt.xticks(rotation=35)
    plt.tick_params(labelsize=10)
    # Use matplotlib directly: the ``sns.plt`` alias was removed from seaborn.
    plt.title('Chembl ID Count')
    plt.show()


def main():
    """Summarise and plot the most frequent ChEMBL targets in a CSV file.

    Command-line options:
        --input   CSV file read with ``read_csv`` (required).
        --output  CSV file to write the summary to (required).

    Steps: read the input, count how often each ChEMBL id occurs, keep the
    most frequent ones, fetch organism and name for each via
    ``fetch_target_information``, write the summary with ``write_csv`` and
    finally display a bar plot of the counts.

    Exits through ``parser.error`` when the input contains no data rows.
    """
    parser = argparse.ArgumentParser(description='options parser for visualize_targets.py')
    parser.add_argument('--input', dest="input_filename", required=True)
    parser.add_argument('--output', dest="output_filename", required=True)
    args = parser.parse_args()

    # Only the ChEMBL id column is needed here.
    _, chembl_id, _, _ = read_csv(args.input_filename)
    if not chembl_id:
        parser.error('no target rows found in {}'.format(args.input_filename))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('')
    print("Total number of unique protein targets: {}".format(len(set(chembl_id))))

    top_targets = _rank_targets(chembl_id)
    print('')
    print('The top 5 most populated targets: ')
    print(top_targets.head(n=5))
    print('')

    print('Grabbing information for unique targets...')
    output = []
    top_ids = top_targets['chembl_id'].tolist()
    top_counts = top_targets['count'].tolist()
    for target_id, count in zip(top_ids, top_counts):
        _, organism, name = fetch_target_information(target_id)
        output.append([target_id, organism, name, count])
    print('')
    print("Writing output...")
    print('')
    write_csv(output, args.output_filename)

    print("Generating plot...")
    _plot_target_counts(top_targets)
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()