-
Notifications
You must be signed in to change notification settings - Fork 1
/
kmeans.py
78 lines (58 loc) · 2.59 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import numpy as np
from data_utils import load_dataset
from distance import euclidian
import matplotlib.pyplot as plt
def plot(dataset, history_centroids, belongs_to):
    """Scatter the clustered instances and animate how the centroids moved.

    Args:
        dataset: 2-D array of instances; columns 0 and 1 are used as the
            x and y coordinates.
        history_centroids: Sequence of centroid arrays, one entry per step.
            The first entry creates the centroid markers, every later entry
            moves them.
        belongs_to: Array holding one cluster index per instance, either
            flat or shaped ``(num_instances, 1)``.
    """
    # 'b' is deliberately absent from the palette: centroids are drawn in blue.
    colors = ['r', 'g', 'c', 'm', 'y', 'k']

    _, ax = plt.subplots()

    points = np.asarray(dataset)
    labels = np.asarray(belongs_to).reshape(-1)

    # Iterate over the cluster ids that actually occur (not over every
    # instance index) and draw each cluster with a single plot call.
    for cluster in np.unique(labels):
        members = points[labels == cluster]
        # Cycle through the palette so more clusters than colors cannot
        # raise an IndexError.
        color = colors[int(cluster) % len(colors)]
        ax.plot(members[:, 0], members[:, 1], color + 'o')

    history_points = []
    for index, centroids in enumerate(history_centroids):
        for inner, item in enumerate(centroids):
            if index == 0:
                # First step: create one blue marker per centroid.
                history_points.append(ax.plot(item[0], item[1], 'bo')[0])
            else:
                # Line2D.set_data expects sequences; recent Matplotlib
                # releases reject bare scalars.
                history_points[inner].set_data([item[0]], [item[1]])
                print("centroids {} {}".format(index, item))
                # Give the GUI time to redraw after each centroid move.
                plt.pause(0.8)
def load_distance_function(distance_name):
    """Look up a distance function by its registered name.

    Args:
        distance_name: Key identifying the metric. Only ``'euclidian'`` is
            registered.

    Returns:
        The callable registered under ``distance_name``, or ``None`` when
        no function is registered under that name.
    """
    registry = dict(euclidian=euclidian)
    try:
        return registry[distance_name]
    except KeyError:
        return None
def kmeans(k, dataset, epsilon=0, distance='euclidian'):
    """Cluster ``dataset`` into ``k`` groups with Lloyd's k-means iteration.

    Args:
        k: Number of clusters; must satisfy ``1 <= k <= num_instances``.
        dataset: 2-D NumPy array of shape ``(num_instances, num_features)``.
        epsilon: Convergence threshold. Iteration stops as soon as the
            distance between two consecutive centroid matrices is no longer
            greater than this value.
        distance: Name understood by ``load_distance_function`` or,
            alternatively, a callable ``f(a, b)`` returning a scalar
            distance between two arrays of equal shape.

    Returns:
        A tuple ``(prototypes, history_centroids, belongs_to)`` where
        ``prototypes`` is the final ``(k, num_features)`` centroid array,
        ``history_centroids`` is the list of centroid arrays starting with
        the initial pick, and ``belongs_to`` is a ``(num_instances, 1)``
        float array holding the cluster index of every instance.

    Raises:
        ValueError: If ``distance`` names no known function, or if ``k`` is
            outside ``1..num_instances``.
    """
    if callable(distance):
        dist_method = distance
    else:
        dist_method = load_distance_function(distance)
        if dist_method is None:
            # Fail here rather than with "'NoneType' is not callable" later.
            raise ValueError(
                "unknown distance function: {!r}".format(distance))

    num_instances, num_features = dataset.shape
    if not 1 <= k <= num_instances:
        raise ValueError(
            "k must be between 1 and the number of instances ({}), got {}"
            .format(num_instances, k))

    # Draw k *distinct* rows as starting centroids. Every row is eligible
    # (randint's exclusive upper bound used to skip the last one), and
    # sampling without replacement avoids duplicate starting centroids.
    initial_rows = np.random.choice(num_instances, size=k, replace=False)
    prototypes = dataset[initial_rows]
    history_centroids = [prototypes]
    belongs_to = np.zeros((num_instances, 1))

    while True:
        # Assignment step: attach every instance to its nearest centroid.
        for index_instance, instance in enumerate(dataset):
            distances = [dist_method(prototype, instance)
                         for prototype in prototypes]
            belongs_to[index_instance, 0] = np.argmin(distances)

        # Update step: move every centroid to the mean of its members.
        labels = belongs_to[:, 0]
        new_prototypes = np.zeros((k, num_features))
        for index in range(k):
            members = dataset[labels == index]
            if len(members):
                new_prototypes[index] = members.mean(axis=0)
            else:
                # An empty cluster has no mean (it would become NaN); keep
                # the centroid where it was.
                new_prototypes[index] = prototypes[index]

        # Measure the movement caused by *this* update so the loop stops
        # right after convergence instead of one pass later.
        norm = dist_method(new_prototypes, prototypes)
        prototypes = new_prototypes
        history_centroids.append(prototypes)

        # "not >" (rather than "<=") also terminates on a NaN norm.
        if not norm > epsilon:
            break

    return prototypes, history_centroids, belongs_to
def execute():
    """Load 'flame.txt', split it into two clusters and plot the outcome."""
    data = load_dataset('flame.txt')
    # The final centroids are not needed for plotting; the history holds them.
    _, centroid_trail, assignments = kmeans(2, data)
    plot(data, centroid_trail, assignments)
# Run the demo as soon as this module is executed; there is no
# __name__ == "__main__" guard, so importing the module triggers it as well.
execute()