Online_temporal_clustering_JSI_release.py
import copy
import heapq
import math
import operator

import numpy as np
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import proj3d


class Arrow3D(FancyArrowPatch):
    """A FancyArrowPatch whose 3-D endpoints are projected into the 2-D axes.

    Note: draw() relies on the older matplotlib behaviour where the renderer
    exposes the projection matrix M during a 3-D draw; newer releases dropped
    this attribute.
    """

    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0, 0), (0, 0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0], ys[0]), (xs[1], ys[1]))
        FancyArrowPatch.draw(self, renderer)
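
# Illustrative usage sketch (not in the original file): drawing one arrow in
# a 3-D axes with the class above, assuming a matplotlib version whose
# renderer still exposes the projection matrix M.
#
#   import matplotlib.pyplot as plt
#   fig = plt.figure()
#   ax = fig.add_subplot(111, projection='3d')
#   ax.add_artist(Arrow3D([0, 1], [0, 1], [0, 1],
#                         mutation_scale=20, arrowstyle='-|>', color='k'))
#   plt.show()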
def kernel_linear(x, y):
    return np.dot(x, y)


def kernel_poly(x, y, a=1.0, b=1.0, p=2.0):
    return (a * np.dot(x, y) + b) ** p


# bigger sigma: closer to Euclidean
def kernel_gauss(x, y, sigma=0.1):
    v = x - y
    l = math.sqrt(np.square(v).sum())
    return math.exp(-sigma * (l ** 2))


def kernel_normalise(k):
    return lambda x, y: k(x, y) / math.sqrt(k(x, x) + k(y, y))


def kernel_dist(x, y):
    # for the (normalised) Gaussian kernel:
    return 2 - 2 * kernel(x, y)
    # for other kernels:
    # return kernel(x, x) - 2 * kernel(x, y) + kernel(y, y)


kernel = kernel_normalise(kernel_gauss)
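
# Illustrative sketch (not in the original file): with this sum-based
# normalisation the self-distance is 2 - sqrt(2) ~ 0.586 rather than 0, and
# distances between well-separated points approach 2.
#
#   a, b = np.array([0.0, 0.0]), np.array([3.0, 4.0])
#   kernel_dist(a, a)   # ~0.586  (kernel(a, a) = 1/sqrt(2))
#   kernel_dist(a, b)   # ~1.884  (exp(-0.1 * 5**2)/sqrt(2) ~ 0.058)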
# the time since a cluster's last update past which an instance can no longer be added to it
deltaT = 5  # bigger number: bigger clusters; tends to combine small clusters with big ones
# if a cluster is older than memoryDelta, remove it from currentClusters and archive it in the allClusters list
memoryDelta = deltaT + 1  # bigger number: smaller clusters, lots of empty space... only "sure" clusters
# the number of current clusters... the pool of clusters
num_clusterss = 3  # bigger number: scattered clusters, lots of empty space; if you increase this, also increase the memory parameters
threshold_cluster_size = 10
class OnlineVariance(object):
    """
    Welford's algorithm computes the sample variance incrementally.
    """

    def __init__(self, iterable=None, ddof=1):
        self.ddof, self.n, self.mean, self.M2, self.std = ddof, 0, 0.0, 0.0, 0.0

    def std_calc(self, datum):
        self.n += 1
        self.delta = datum - self.mean
        self.mean += self.delta / self.n
        self.M2 += self.delta * (datum - self.mean)
        self.variance = self.M2 / (self.n - self.ddof)
        self.std = np.sqrt(self.variance)

    def merge_std(self, new):
        # pooled mean and variance of the two streams
        n_mean = (self.n * self.mean + new.n * new.mean) / (self.n + new.n)
        self.variance = (self.n * self.std * self.std + new.n * new.std * new.std
                         + self.n * (self.mean - n_mean) ** 2
                         + new.n * (new.mean - n_mean) ** 2) / (self.n + new.n)
        self.std = np.sqrt(self.variance)
        self.M2 = self.variance * (self.n + new.n - self.ddof)
        self.n += new.n
        self.mean = n_mean
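
# Illustrative sketch (not in the original file): the incremental std matches
# the batch result (population std, ddof=0).
#
#   ov = OnlineVariance(ddof=0)
#   for x in [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]:
#       ov.std_calc(x)
#   ov.std          # -> 2.0, same as np.std([2, 4, 4, 4, 5, 5, 7, 9])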
class Cluster(object):
    def __init__(self, a, timestamp):
        self.center = a
        self.size = kernel(a, a)
        self.timestampEnd = timestamp
        self.timestampStart = timestamp
        self.firstPoint = a
        self.endPoint = a
        self.num_points = 1
        self.STD = OnlineVariance(ddof=0)
        self.STD.std_calc(kernel(a, a))

    def add(self, e, timestampNew):
        self.size += kernel(self.center, e)
        self.center += (e - self.center) / self.size
        self.timestampEnd = timestampNew
        self.endPoint = e
        self.num_points += 1
        self.STD.std_calc(kernel(self.center, e))

    def merge(self, c):
        # size-weighted average of the two centers
        self.center = (self.center * self.size + c.center * c.size) / (self.size + c.size)
        self.size += c.size
        self.num_points += c.num_points
        self.num_points -= 1
        self.STD.merge_std(c.STD)
        # keep the earliest start and the latest end of the two clusters
        if self.timestampEnd < c.timestampEnd:
            self.timestampEnd = c.timestampEnd
            self.endPoint = c.endPoint
        if self.timestampStart > c.timestampStart:
            self.timestampStart = c.timestampStart
            self.firstPoint = c.firstPoint

    def resize(self, dim):
        # zero-pad the center when the dimensionality of the stream grows
        extra = np.zeros(dim - len(self.center))
        self.center = np.append(self.center, extra)

    def __str__(self):
        return "Cluster( %s, %f, %f , %f )" % (self.center, self.size, self.timestampStart, self.timestampEnd)
class Dist(object):
    """this is just a tuple,
    but we need an object so we can define an ordering for heapq"""

    def __init__(self, x, y, d):
        self.x = x
        self.y = y
        self.d = d

    def __lt__(self, o):
        # heapq orders via __lt__ (the original __cmp__ works only in Python 2)
        return self.d < o.d

    def __str__(self):
        return "Dist(%f)" % (self.d)


# multiplier for the std-based merge test in OnlineCluster.cluster();
# with p = 0 archived clusters are never merged
p = 0
class OnlineCluster(object):
    def __init__(self, N):
        """N-1 is the largest number of clusters that can be found; higher N makes clustering slower."""
        self.n = 0
        self.N = N
        self.allClusters = []
        self.currentClusters = []
        # max number of dimensions we've seen so far
        self.dim = 0
        # cache inter-cluster distances
        self.dist = []

    def resize(self, dim):
        for c in self.currentClusters:
            c.resize(dim)
        self.dim = dim
    def cluster(self, e, timestamp):
        # check for stale clusters and move them to memory;
        # iterate over a copy because clusters may be removed along the way
        possibleMerge = []
        for clusterI in list(self.currentClusters):
            # if the cluster is older than memoryDelta, remove it from currentClusters
            # and archive it in the allClusters list
            if timestamp - clusterI.timestampEnd >= memoryDelta:
                self.currentClusters.remove(clusterI)
                self.removedist(clusterI)
                # merging clusters... TODO
                if len(self.allClusters) > 0:
                    min_distance = kernel(clusterI.center, self.allClusters[-1].center)
                    closest_cluster = copy.deepcopy(self.allClusters[-1])
                    # scan the archived clusters (newest first) for the closest recent one
                    for clusterR in reversed(self.allClusters):
                        if clusterI.timestampEnd - clusterR.timestampEnd < memoryDelta:
                            possibleMerge.append(clusterR)
                            distance = kernel(clusterI.center, clusterR.center)
                            if min_distance > distance:
                                min_distance = distance
                                closest_cluster = clusterR
                        else:
                            break
                    # merge only if the distance lies within p standard deviations
                    # of either cluster's running mean
                    if (abs(min_distance - closest_cluster.STD.mean) < p * closest_cluster.STD.std) or (abs(min_distance - clusterI.STD.mean) < p * clusterI.STD.std):
                        for i in reversed(self.allClusters):
                            if i.STD.std == closest_cluster.STD.std:
                                self.allClusters.remove(i)
                                break
                        closest_cluster.merge(clusterI)
                        self.allClusters.append(closest_cluster)
                    else:
                        self.allClusters.append(clusterI)
                else:
                    self.allClusters.append(clusterI)
        if len(e) > self.dim:
            self.resize(len(e))
        if len(self.currentClusters) > 0:
            # compare the new point to each existing cluster
            c = [(i, kernel_dist(x.center, e)) for i, x in enumerate(self.currentClusters)]
            closest = self.currentClusters[min(c, key=operator.itemgetter(1))[0]]
            # if the closest cluster is older than deltaT, do not add the point to it
            if (timestamp - closest.timestampEnd) < deltaT:
                closest.add(e, timestamp)
                # invalidate dist-cache for this cluster
                self.updatedist(closest)
        if len(self.currentClusters) >= self.N and len(self.currentClusters) > 1:
            # merge the closest two clusters
            m = heapq.heappop(self.dist)
            try:
                self.currentClusters.remove(m.y)
                self.removedist(m.y)
                m.x.merge(m.y)
                self.updatedist(m.x)
            except ValueError:
                # m.y was already removed from the pool
                print(m.y)
        # make a new cluster for this point
        newc = Cluster(e, timestamp)
        self.currentClusters.append(newc)
        self.updatedist(newc)
        self.n += 1
    def removedist(self, c):
        """invalidate intercluster distance cache for c"""
        r = []
        for x in self.dist:
            if x.x == c or x.y == c:
                r.append(x)
        for x in r:
            self.dist.remove(x)
        heapq.heapify(self.dist)

    def updatedist(self, c):
        """Cluster c has changed, re-compute all intercluster distances"""
        self.removedist(c)
        for x in self.currentClusters:
            if x == c:
                continue
            d = kernel_dist(x.center, c.center)
            t = Dist(x, c, d)
            heapq.heappush(self.dist, t)

    def trimclusters(self):
        """Return only clusters over threshold"""
        clusters = self.allClusters + self.currentClusters
        # t = np.mean([x.size for x in clusters if x.size > 0]) * 0.5
        t = threshold_cluster_size
        # print("Threshold: " + str(t))
        return [x for x in clusters if x.num_points >= t]
        # return [x for x in clusters if x.num_points >= t and x.num_points / x.size < 0.73]
activity_means = [[3.0, 5.0], [7.0, 7.0], [5.0, 5.0]]
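
# Illustrative driver (not part of the original file): stream synthetic 2-D
# points around the activity_means above and cluster them online.  The noise
# level, point counts, and random seed are assumptions made for this sketch.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    clusterer = OnlineCluster(num_clusterss)
    t = 0
    for mean in activity_means:
        # emit a burst of points around each activity mean, one per timestep
        for _ in range(50):
            point = np.array(mean) + rng.normal(scale=0.3, size=2)
            clusterer.cluster(point, t)
            t += 1
    # report the clusters that exceed the size threshold
    for c in clusterer.trimclusters():
        print(c)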