-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKMeansRexCore.cpp
234 lines (191 loc) · 6.61 KB
/
KMeansRexCore.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/* KMeansRexCore.cpp
A fast, easy-to-read implementation of the K-Means clustering algorithm.
allowing customized initialization (random samples or plus plus)
and vectorized execution via the Eigen matrix template library.
Intended to be compiled as a shared library libkmeansrex.so
which can then be utilized from high-level interactive environments,
such as Matlab or Python.
Contains:
Utility Fcns:
discrete_rand : sampling discrete r.v.
select_without_replacement : sample discrete w/out replacement
Cluster Location Mu Initialization:
sampleRowsRandom : sample at random (w/out replacement)
sampleRowsPlusPlus : sample via K-Means++ (Arthur et al)
see http://en.wikipedia.org/wiki/K-means%2B%2B
K-Means Algorithm (aka Lloyd's Algorithm)
run_lloyd : executes lloyd for spec'd number of iterations
External "C" function interfaces (for calling from Python)
NOTE: These take only pointers to float arrays, not Eigen array types
RunKMeans : compute cluster centers and assignments via lloyd
SampleRowsPlusPlus : get just a plusplus initialization
Dependencies:
mersenneTwister2002.c : random number generator
Author: Mike Hughes (www.michaelchughes.com)
Date: 2 April 2015
*/
/*
extern "C" {
void RunKMeans(double *X_IN, int N, int D, int K, int Niter, int seed, char* initname, double *Mu_OUT, double *Z_OUT);
void SampleRowsPlusPlus(double *X_IN, int N, int D, int K, int seed, double *Mu_OUT);
}
*/
#include <iostream>
#include "mersenneTwister2002.c"
#include <Eigen/Dense>
using namespace Eigen;
using namespace std;
/* DEFINE Custom Type Names to make code more readable
ExtMat : 2-dim matrix/array externally defined (e.g. in Matlab or Python)
*/
typedef Map<ArrayXXd> ExtMat;
typedef ArrayXXd Mat;
typedef ArrayXd Vec;
// ====================================================== Utility Functions
void set_seed( int seed ) {
init_genrand( seed );
}
int discrete_rand( Vec &p ) {
double total = p.sum();
int K = (int) p.size();
double r = total*genrand_double();
double cursum = p(0);
int newk = 0;
while ( r >= cursum && newk < K-1) {
newk++;
cursum += p[newk];
}
if ( newk < 0 || newk >= K ) {
cerr << "Badness. Chose illegal discrete value." << endl;
return -1;
}
return newk;
}
void select_without_replacement( int N, int K, Vec &chosenIDs) {
Vec p = Vec::Ones(N);
for (int kk =0; kk<K; kk++) {
int choice;
int doKeep = false;
while ( doKeep==false) {
doKeep=true;
choice = discrete_rand( p );
for (int previd=0; previd<kk; previd++) {
if (chosenIDs[previd] == choice ) {
doKeep = false;
break;
}
}
}
chosenIDs[kk] = choice;
}
}
// ======================================================= Init Cluster Locs Mu
void sampleRowsRandom( ExtMat &X, ExtMat &Mu ) {
int N = X.rows();
int K = Mu.rows();
Vec ChosenIDs = Vec::Zero(K);
select_without_replacement( N, K, ChosenIDs );
for (int kk=0; kk<K; kk++) {
Mu.row( kk ) = X.row( ChosenIDs[kk] );
}
}
void sampleRowsPlusPlus( ExtMat &X, ExtMat &Mu ) {
int N = X.rows();
int K = Mu.rows();
Vec ChosenIDs = Vec::Ones(K);
int choice = discrete_rand( ChosenIDs );
Mu.row(0) = X.row( choice );
ChosenIDs[0] = choice;
Vec minDist(N);
Vec curDist(N);
for (int kk=1; kk<K; kk++) {
curDist = ( X.rowwise() - Mu.row(kk-1) ).square().rowwise().sum().sqrt();
if (kk==1) {
minDist = curDist;
} else {
minDist = curDist.min( minDist );
}
choice = discrete_rand( minDist );
ChosenIDs[kk] = choice;
Mu.row(kk) = X.row( choice );
}
}
void init_Mu( ExtMat &X, ExtMat &Mu, const char* initname ) {
if ( string( initname ) == "random" ) {
sampleRowsRandom( X, Mu );
} else if ( string( initname ) == "plusplus" ) {
sampleRowsPlusPlus( X, Mu );
}
}
// ======================================================= Update Cluster Assignments Z
void pairwise_distance( ExtMat &X, ExtMat &Mu, Mat &Dist ) {
int N = X.rows();
int D = X.cols();
int K = Mu.rows();
// For small dims D, for loop is noticeably faster than fully vectorized.
// Odd but true. So we do fastest thing
if ( D <= 16 )
{
for (int kk=0; kk<K; kk++) {
Dist.col(kk) = ( X.rowwise() - Mu.row(kk) ).square().rowwise().sum();
}
}
else
{
Dist.matrix() = -2*( X.matrix() * Mu.transpose().matrix() );
Dist.rowwise() += Mu.square().rowwise().sum().transpose().row(0);
}
}
double assignClosest( ExtMat &X, ExtMat &Mu, ExtMat &Z, Mat &Dist) {
double totalDist = 0;
int minRowID;
pairwise_distance( X, Mu, Dist );
for (int nn=0; nn<X.rows(); nn++) {
totalDist += Dist.row(nn).minCoeff( &minRowID );
Z(nn,0) = minRowID;
}
return totalDist;
}
// ======================================================= Update Cluster Locations Mu
void calc_Mu( ExtMat &X, ExtMat &Mu, ExtMat &Z) {
Mu = Mat::Zero( Mu.rows(), Mu.cols() );
Vec NperCluster = Vec::Zero( Mu.rows() );
for (int nn=0; nn<X.rows(); nn++) {
Mu.row( (int) Z(nn,0) ) += X.row( nn );
NperCluster[ (int) Z(nn,0)] += 1;
}
Mu.colwise() /= NperCluster;
}
// ======================================================= Overall Lloyd Algorithm
void run_lloyd( ExtMat &X, ExtMat &Mu, ExtMat &Z, int Niter ) {
double prevDist=INT_MAX,totalDist = 0;
Mat Dist = Mat::Zero( X.rows(), Mu.rows() );
for (int iter=0; iter<Niter; iter++) {
totalDist = assignClosest( X, Mu, Z, Dist );
calc_Mu( X, Mu, Z );
if ( prevDist == totalDist ) {
break;
}
prevDist = totalDist;
}
}
// =================================================================================
// =================================================================================
// =========================== EXTERNALLY CALLABLE FUNCTIONS ======================
// =================================================================================
// =================================================================================
void RunKMeans(double *X_IN, int N, int D, int K, int Niter,
int seed, const char* initname, double *Mu_OUT, double *Z_OUT) {
set_seed( seed );
ExtMat X ( X_IN, N, D);
ExtMat Mu ( Mu_OUT, K, D);
ExtMat Z ( Z_OUT, N, 1);
init_Mu( X, Mu, initname);
run_lloyd( X, Mu, Z, Niter );
}
void SampleRowsPlusPlus(double *X_IN, int N, int D, int K, int seed, double *Mu_OUT) {
set_seed( seed );
ExtMat X ( X_IN, N, D);
ExtMat Mu ( Mu_OUT, K, D);
sampleRowsPlusPlus( X, Mu);
}