forked from mqcomplab/iSIM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
isim_clustering.py
158 lines (129 loc) · 4.59 KB
/
isim_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
from isim_comp import gen_sim_dict
import numpy as np
""" Proof-of-principle Hierarchical Agglomerative Clustering using iSIM
----------------------------------------------------------------------
Miranda-Quintana Group, Department of Chemistry, University of Florida
----------------------------------------------------------------------
Please, cite the original paper on iSIM:
"""
def pre_process(fingerprints):
    """
    Wrap every fingerprint as a single-member cluster record.

    Parameters:
    -----------
    fingerprints: numpy.ndarray or list
        Fingerprints to cluster, one entry per object.

    Returns:
    --------
    processed_fingerprints: list
        One ``[vector, members]`` pair per input fingerprint. ``vector`` is
        the fingerprint with a trailing 1 appended (the number of objects
        condensed into the record) and ``members`` is ``[i]``, the position
        of the fingerprint in the input.
    """
    return [
        # np.append flattens the fingerprint and adds the object counter.
        [np.array(np.append(fingerprint, 1)), [position]]
        for position, fingerprint in enumerate(fingerprints)
    ]
def max_indices(fingerprints, n_ary = 'RR'):
    """
    Find the positions of the two most similar cluster records.

    Each record is ``[vector, members]`` where the last entry of ``vector``
    is the number of objects in the cluster and the preceding entries are
    the column-wise sums of its fingerprints. The similarity of a pair is
    read from ``gen_sim_dict`` evaluated on the summed vectors and the
    summed object counts.

    Parameters:
    -----------
    fingerprints: list
        Cluster records as produced by ``pre_process`` / ``update_data``.
    n_ary: str
        Key selecting the similarity index in the ``gen_sim_dict`` result.
        Default is 'RR'. Other options are 'JT' and 'SM'.

    Returns:
    --------
    max1: int
        Position of the first record of the most similar pair.
    max2: int
        Position of the second record of the most similar pair
        (always greater than ``max1`` when a pair is found).

    Notes:
    ------
    If no pair scores above the initial threshold (for example when fewer
    than two records are given), both returned values equal
    ``len(fingerprints)``, which is not a valid position.
    """
    # Initial lower bound for the running maximum.
    # NOTE(review): the origin of -3.08 is not visible here; kept as-is.
    max_sim = -3.08
    total = len(fingerprints)
    max1, max2 = total, total
    for i, d1 in enumerate(fingerprints):
        # Loop-invariant for the whole row, so compute it once.
        fp1 = np.asarray(d1[0][:-1])
        n1 = d1[0][-1]
        # The inputs handed to gen_sim_dict are the same for (i, j) and
        # (j, i), so only j > i needs scoring. Because the comparison is
        # strict, the first best pair in row-major order always has i < j,
        # hence the result matches a scan over all ordered pairs.
        for j in range(i + 1, total):
            d2 = fingerprints[j]
            fp2 = np.asarray(d2[0][:-1])
            n = n1 + d2[0][-1]
            s = gen_sim_dict(data = fp1 + fp2, n_objects = n)[n_ary]
            if s > max_sim:
                max_sim = s
                max1 = i
                max2 = j
    return max1, max2
def update_data(data, max1, max2):
    """
    Merge two cluster records and return the updated list of records.

    Parameters:
    -----------
    data: list
        Cluster records, each of the form ``[vector, members]``.
    max1: int
        Position of the first record to merge.
    max2: int
        Position of the second record to merge.

    Returns:
    --------
    new_data: list
        Every record of ``data`` except those at ``max1`` and ``max2``, in
        their original order, followed by the merged record. The merged
        record holds the sum of the two vectors and the concatenation of
        the two member lists. ``data`` itself is not modified.
    """
    first, second = data[max1], data[max2]
    merged_record = [first[0] + second[0], first[-1] + second[-1]]
    remaining = [
        record
        for position, record in enumerate(data)
        if position != max1 and position != max2
    ]
    remaining.append(merged_record)
    return remaining
def cluster_tree(processed_fingerprints, n_ary = 'RR'):
    """
    Agglomerate the cluster records pair by pair and record every partition.

    Parameters:
    -----------
    processed_fingerprints: list
        Cluster records as produced by ``pre_process``.
    n_ary: str
        Similarity index forwarded to ``max_indices``. Default is 'RR'.

    Returns:
    --------
    tree: list
        One entry per merge step. Each entry lists the member lists of all
        clusters present just before that step's merge. The loop stops
        once a single cluster is left, so that final one-cluster partition
        is not recorded here. Empty when fewer than two records are given.
    """
    history = []
    current = processed_fingerprints
    while len(current) > 1:
        # Snapshot the partition before merging the closest pair.
        history.append([record[-1] for record in current])
        best_a, best_b = max_indices(current, n_ary=n_ary)
        current = update_data(current, best_a, best_b)
    return history
def gen_z(tree):
    """
    Build a linkage-style matrix from the partition history of a clustering.

    Parameters:
    -----------
    tree: list
        Partition history as returned by ``cluster_tree``: ``tree[i]`` lists
        the member lists of every cluster before merge step ``i``, with the
        newest cluster last. NOTE: ``tree`` is modified in place; the final
        one-cluster partition is appended to it.

    Returns:
    --------
    Z: numpy.ndarray
        Array of shape ``(n - 1, 4)`` where ``n = len(tree[0])``. Row ``i``
        describes merge step ``i``:
        columns 0 and 1 are the ids of the two merged clusters (the
        clusters of ``tree[0]`` are numbered 0..n-1 in order, and the
        cluster created at step ``k`` gets id ``n + k``), column 2 is the
        step number ``i + 1``, and column 3 is the size of the cluster
        created at that step.
        The column layout resembles a SciPy linkage matrix; presumably it
        is meant for SciPy's dendrogram tools (not verified here).
        An empty tree yields an empty ``(0, 4)`` array.
    """
    # Nothing was merged (fewer than two objects): there is no last
    # partition to close, so return an empty matrix instead of failing.
    if not tree:
        return np.zeros((0, 4))

    # Combine the last two clusters into the final, single-cluster partition.
    tree.append([tree[-1][-1] + tree[-1][-2]])

    # Get the number of original data objects.
    n = len(tree[0])

    # Clusters in id order: the originals first, then each newly created
    # cluster (always the last entry of its partition). The final cluster
    # never appears as a merge operand, so it needs no id.
    clusters = list(tree[0])
    clusters.extend(level[-1] for level in tree[1:-1])

    Z = np.zeros((n - 1, 4))
    for i in range(n - 1):
        following = tree[i + 1]
        # The two clusters merged at this step are the ones that are
        # absent from the next partition.
        merged = [element for element in tree[i] if element not in following]
        Z[i, 0] = clusters.index(merged[0])
        Z[i, 1] = clusters.index(merged[1])
        Z[i, 2] = i + 1
        Z[i, 3] = len(following[-1])
    return Z
def hierarchical_clustering(fingerprints, n_ary = 'RR'):
    """
    Run the full agglomerative clustering pipeline on a set of fingerprints.

    Parameters:
    -----------
    fingerprints: numpy.ndarray or list
        Fingerprints to cluster.
    n_ary: str
        Similarity index used to pick the pair merged at each step.
        Default is 'RR'.

    Returns:
    --------
    tree: list
        Partition history from ``cluster_tree``, after ``gen_z`` has
        processed it in place.
    Z: numpy.ndarray
        Matrix returned by ``gen_z`` for that history.
    """
    records = pre_process(fingerprints)
    merge_history = cluster_tree(records, n_ary=n_ary)
    linkage_matrix = gen_z(merge_history)
    return merge_history, linkage_matrix