forked from mqcomplab/iSIM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
isim_div.py
213 lines (161 loc) · 6.84 KB
/
isim_div.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
from isim_comp import *
def get_new_index_n(total_data, selected_condensed, n, select_from_n, n_ary = 'RR'):
    """Pick the candidate whose addition gives the lowest set similarity.

    Part of the ECS_MeDiv selection (per the original docstring).

    Arguments
    ---------
    total_data:
        Indexable collection of fingerprints; total_data[i] is added to
        selected_condensed for every candidate i.
    selected_condensed:
        Running sum of the already selected fingerprints (the "column sum"
        in the original comments).
    n: int
        Number of objects already selected; the trial set has n + 1 objects.
    select_from_n:
        Iterable with the candidate indices that have not been selected yet.
    n_ary: str
        Key used to read the similarity value out of gen_sim_dict's result.

    Returns
    -------
    The first candidate index with the strictly lowest similarity. When no
    candidate scores below 3.08 (for instance, when select_from_n is empty)
    the placeholder len(total_data) + 1 is returned.
    """
    # size of the trial set: the current selection plus one candidate
    trial_size = n + 1
    # placeholder answer, deliberately outside the valid index range
    best_candidate = len(total_data) + 1
    # starting ceiling; presumably above any value gen_sim_dict can return - TODO confirm
    lowest_sim = 3.08
    for candidate in select_from_n:
        # column sum of the selection once the candidate is included
        trial_sum = selected_condensed + total_data[candidate]
        candidate_sim = gen_sim_dict(trial_sum, trial_size)[n_ary]
        # only a strictly lower similarity replaces the current best
        if not candidate_sim < lowest_sim:
            continue
        best_candidate, lowest_sim = candidate, candidate_sim
    return best_candidate
def get_new_index_sqrt(total_data, selected_condensed, n, select_from_n, k = 2, n_ary = 'RR'):
    """Pick the candidate that minimises the similarity computed with a given k.

    Same scan as get_new_index_n, except that k is forwarded to gen_sim_dict.

    Arguments
    ---------
    total_data:
        Indexable collection of fingerprints; total_data[i] is added to
        selected_condensed for every candidate i.
    selected_condensed:
        Running sum of the already selected fingerprints.
    n: int
        Number of objects already selected; the trial set has n + 1 objects.
    select_from_n:
        Iterable with the candidate indices that have not been selected yet.
    k:
        Passed unchanged to gen_sim_dict as its k argument.
    n_ary: str
        Key used to read the similarity value out of gen_sim_dict's result.

    Returns
    -------
    The first candidate index with the strictly lowest similarity, or the
    placeholder len(total_data) + 1 when no candidate scores below 3.08.
    """
    grown_size = n + 1
    # placeholder answer, deliberately outside the valid index range
    winner = len(total_data) + 1
    # starting ceiling; presumably above any value gen_sim_dict can return - TODO confirm
    ceiling = 3.08
    for pos in select_from_n:
        # column sum of the selection once the candidate is included
        grown_sum = selected_condensed + total_data[pos]
        sim_table = gen_sim_dict(grown_sum, n_objects = grown_size, k = k)
        value = sim_table[n_ary]
        # strict comparison: on ties the earliest candidate is kept
        if value < ceiling:
            winner, ceiling = pos, value
    return winner
def get_new_index_reverse(total_data, selected_condensed, n, select_from_n, n_ary = 'RR'):
    """Pick the object whose removal leaves the set with the lowest similarity.

    Arguments
    ---------
    total_data:
        Indexable collection of fingerprints; total_data[i] is subtracted
        from selected_condensed for every candidate i.
    selected_condensed:
        Running sum of the fingerprints currently in the set.
    n: int
        Number of objects currently in the set; the trial set has n - 1.
    select_from_n:
        Iterable with the indices that are still in the set.
    n_ary: str
        Key used to read the similarity value out of gen_sim_dict's result.

    Returns
    -------
    The first index whose removal gives the strictly lowest similarity, or
    the placeholder len(total_data) + 1 when no removal scores below 3.08.
    """
    shrunk_size = n - 1
    # placeholder answer, deliberately outside the valid index range
    to_remove = len(total_data) + 1
    # starting ceiling: every accepted value must fall strictly below it
    # (presumably above any value gen_sim_dict can return - TODO confirm)
    lowest_so_far = 3.08
    for member in select_from_n:
        # column sum of the set once this member is taken out
        shrunk_sum = selected_condensed - total_data[member]
        remaining_sim = gen_sim_dict(shrunk_sum, n_objects = shrunk_size)[n_ary]
        if remaining_sim >= lowest_so_far:
            continue
        if remaining_sim < lowest_so_far:
            # removing this member leaves a less similar set than any seen so far
            to_remove = member
            lowest_so_far = remaining_sim
    return to_remove
def get_new_indices_b_max(total_data, selected_b, select_from_b, n_ary):
    """Pick the candidate whose highest pairwise similarity to the selection is lowest.

    Arguments
    ---------
    total_data:
        Collection of fingerprints; it is indexed with a two-element list
        [candidate, selected], so it is presumably a np.ndarray - TODO confirm.
    selected_b:
        Indices that have already been selected.
    select_from_b:
        Indexable collection with the candidate indices.
    n_ary: str
        Key used to read the similarity value out of gen_sim_dict's result.

    Returns
    -------
    The entry of select_from_b with the smallest "maximum similarity to any
    selected object"; on ties the earliest candidate wins.
    """
    # for every candidate, its largest similarity against any selected object
    worst_case_sims = []
    for candidate in select_from_b:
        worst_case_sims.append(
            max(
                gen_sim_dict(total_data[[candidate, chosen]])[n_ary]
                for chosen in selected_b
            )
        )
    lowest = min(worst_case_sims)
    # positions (within select_from_b) that reach the lowest worst-case value
    tied_positions = [pos for pos, value in enumerate(worst_case_sims) if value == lowest]
    first_tied = tied_positions[0]
    return select_from_b[first_tied]
def diversity(data, percentage: int, start = 'medoid', n_ary = 'RR', method = 'isim'):
    """ diversity: function to select from a dataset the most diverse molecules
    -----------------------------------------------------------------------
    Starting from a seed (or from user supplied indices) one index is added
    per iteration until the requested percentage of the data is selected.

    Arguments
    ---------
    data: np.array
        Array of arrays containing the binary string objects

    percentage: int
        Percentage of the provided data that wants to be sampled.
        Values above 100 are treated as 100.

    start: str or list
        str: key on what is used to start the selection
            {'medoid', 'random', 'outlier'}
        list: contains the indexes of the molecules you want to start the selection
            (a tuple or np.ndarray is accepted too; the input is copied and
            never modified)

    n_ary: str
        Key with the abbreviation of the similarity index to perform the selection

    method: str or int
        'isim': every new index is chosen with get_new_index_n
        'bmax': every new index is chosen with get_new_indices_b_max
        int: every new index is chosen with get_new_index_sqrt, using k = method

    Returns
    -------
    list
        Selected indices, in the order in which they were selected.

    Raises
    ------
    ValueError
        If start is not a valid key or sequence, or if method is not
        recognised when a new index has to be selected.
    """
    # total number of objects
    n_total = len(data)

    # indices of all the objects
    total_indices = np.arange(n_total)

    # Strings are checked first so that array-like `start` values are never
    # compared element-wise against the string keys.
    if isinstance(start, str):
        if start == 'medoid':
            selected_n = [calculate_medoid(data, n_ary = n_ary)]
        elif start == 'random':
            # randint excludes the upper bound, so n_total (not n_total - 1)
            # is needed for the last object to be a possible seed
            selected_n = [int(np.random.randint(0, n_total))]
        elif start == 'outlier':
            selected_n = [calculate_outlier(data, n_ary = n_ary)]
        else:
            raise ValueError('Select a correct starting point: medoid, random or outlier')
    elif isinstance(start, (list, tuple, np.ndarray)):
        # work on a copy: the caller's sequence must not grow as a side effect
        selected_n = list(start)
    else:
        raise ValueError('Select a correct starting point: medoid, random or outlier')

    # Number of objects be selected; there is nothing left to pick beyond n_total
    n_max = min(int(n_total * percentage / 100), n_total)

    # Condensation of selected initial selection
    selected_condensed = np.sum([data[i] for i in selected_n], axis = 0)

    while len(selected_n) < n_max:
        # Number of objects selected so far
        n = len(selected_n)

        # indices from which to select the new fingerprints
        select_from_n = np.delete(total_indices, selected_n)

        # new index selected
        if method == 'isim':
            new_index_n = get_new_index_n(data, selected_condensed, n, select_from_n, n_ary = n_ary)
        elif method == 'bmax':
            new_index_n = get_new_indices_b_max(data, selected_n, select_from_n, n_ary = n_ary)
        elif isinstance(method, int):
            new_index_n = get_new_index_sqrt(data, selected_condensed, n, select_from_n, k = method, n_ary = n_ary)
        else:
            # without this branch an unknown method ends in an unbound new_index_n
            raise ValueError("Select a correct method: 'isim', 'bmax' or an int")

        # updating column sum vector
        selected_condensed += data[new_index_n]

        # updating selected indices
        selected_n.append(new_index_n)

    return selected_n
def reverse_diversity(data, percentage: int, n_ary = 'RR'):
    """ reverse_diversity: function to keep a subset of a dataset by removing objects one at a time
    -----------------------------------------------------------------------
    Starting from the complete dataset, the index returned by
    get_new_index_reverse is removed at every iteration until only the
    requested percentage of the objects is left.

    Arguments
    ---------
    data: np.array
        Array of arrays containing the binary string objects

    percentage: int
        Percentage of the provided data that wants to be kept

    n_ary: str
        Key with the abbreviation of the similarity index to perform the selection

    Returns
    -------
    np.array
        Indices of the objects that were not removed, in increasing order.
    """
    # total number of objects
    n_total = len(data)

    # indices of all the objects
    total_indices = np.arange(n_total)

    # Number of objects to be kept (never negative)
    n_max = max(int(n_total * percentage / 100), 0)

    # Deselected objects
    deselected_n = []

    # Condensation of the total fingerprints
    selected_condensed = np.sum(data, axis = 0)

    # Indices that are still part of the set
    select_from_n = total_indices

    # The condition is evaluated on the up-to-date remaining indices, so
    # exactly n_total - n_max objects are removed and no candidate-less
    # extra pass is made.
    # NOTE(review): how gen_sim_dict behaves for sets of one or zero objects
    # (reached when n_max is below 2) is not visible here - confirm.
    while len(select_from_n) > n_max:
        # Number of objects currently in the set
        n = len(select_from_n)

        # new index selected
        new_index_n = get_new_index_reverse(data, selected_condensed, n, select_from_n, n_ary = n_ary)

        # updating column sum vector
        selected_condensed = selected_condensed - data[new_index_n]

        # updating deselected indices
        deselected_n.append(new_index_n)

        # indices that remain after this removal
        select_from_n = np.delete(total_indices, deselected_n)

    return select_from_n