From 775eefcdc25e0f67ceee2f866edc584104cf14d7 Mon Sep 17 00:00:00 2001 From: lbollar Date: Thu, 29 Sep 2016 23:46:33 -0500 Subject: [PATCH 1/7] Issue #64: added n_init to kmeans --- src/kmeans.jl | 165 ++++++++++++++++++++++++++++--------------------- test/kmeans.jl | 6 +- 2 files changed, 97 insertions(+), 74 deletions(-) diff --git a/src/kmeans.jl b/src/kmeans.jl index 3dcaf9cb..f553b996 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -17,12 +17,14 @@ const _kmeans_default_init = :kmpp const _kmeans_default_maxiter = 100 const _kmeans_default_tol = 1.0e-6 const _kmeans_default_display = :none +const _kmeans_default_n_init = 10 function kmeans!{T<:AbstractFloat}(X::Matrix{T}, centers::Matrix{T}; weights=nothing, maxiter::Integer=_kmeans_default_maxiter, tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) + m, n = size(X) m2, k = size(centers) @@ -43,18 +45,37 @@ function kmeans(X::Matrix, k::Int; weights=nothing, init=_kmeans_default_init, maxiter::Integer=_kmeans_default_maxiter, + n_init::Integer=_kmeans_default_n_init, tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) + m, n = size(X) (2 <= k < n) || error("k must have 2 <= k < n.") - iseeds = initseeds(init, X, k) - centers = copyseeds(X, iseeds) - kmeans!(X, centers; - weights=weights, - maxiter=maxiter, - tol=tol, - display=display) + n_init > 0 || error("n_init must be greater than 0") + + lowestcost::Float64 = Inf + local bestresult::KmeansResult + + for i = 1:n_init + + iseeds = initseeds(init, X, k) + centers = copyseeds(X, iseeds) + result = kmeans!(X, centers; + weights=weights, + maxiter=maxiter, + tol=tol, + display=display) + + if result.totalcost < lowestcost + lowestcost = result.totalcost + bestresult = result + end + + end + + return bestresult + end #### Core implementation @@ -72,86 +93,88 @@ function _kmeans!{T<:AbstractFloat}( tol::Real, # in: tolerance of change at convergence displevel::Int) # in: the level of display - # initialize - - k = size(centers, 2) - to_update = Array(Bool, k) # indicators of whether a center needs to be updated - unused = Int[] - num_affected::Int = k # number of centers, to which the distances need to be recomputed - - dmat = pairwise(SqEuclidean(), centers, x) - dmat = convert(Array{T}, dmat) #Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T - update_assignments!(dmat, true, assignments, costs, counts, to_update, unused) - objv = w == nothing ? sum(costs) : dot(w, costs) - - # main loop - t = 0 - converged = false - if displevel >= 2 - @printf "%7s %18s %18s | %8s \n" "Iters" "objv" "objv-change" "affected" - println("-------------------------------------------------------------") - @printf("%7d %18.6e\n", t, objv) - end - while !converged && t < maxiter - t = t + 1 + + # initialize - # update (affected) centers + k = size(centers, 2) + to_update = Array(Bool, k) # indicators of whether a center needs to be updated + unused = Int[] + num_affected::Int = k # number of centers, to which the distances need to be recomputed - update_centers!(x, w, assignments, to_update, centers, cweights) + dmat = pairwise(SqEuclidean(), centers, x) + dmat = convert(Array{T}, dmat) #Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T + update_assignments!(dmat, true, assignments, costs, counts, to_update, unused) + objv = w == nothing ? sum(costs) : dot(w, costs) - if !isempty(unused) - repick_unused_centers(x, costs, centers, unused) - end + # main loop + t = 0 + converged = false + if displevel >= 2 + @printf "%7s %18s %18s | %8s \n" "Iters" "objv" "objv-change" "affected" + println("-------------------------------------------------------------") + @printf("%7d %18.6e\n", t, objv) + end - # update pairwise distance matrix + while !converged && t < maxiter + t = t + 1 - if !isempty(unused) - to_update[unused] = true - end + # update (affected) centers - if t == 1 || num_affected > 0.75 * k - pairwise!(dmat, SqEuclidean(), centers, x) - else - # if only a small subset is affected, only compute for that subset - affected_inds = find(to_update) - dmat_p = pairwise(SqEuclidean(), centers[:, affected_inds], x) - dmat[affected_inds, :] = dmat_p - end + update_centers!(x, w, assignments, to_update, centers, cweights) - # update assignments + if !isempty(unused) + repick_unused_centers(x, costs, centers, unused) + end - update_assignments!(dmat, false, assignments, costs, counts, to_update, unused) - num_affected = sum(to_update) + length(unused) + # update pairwise distance matrix - # compute change of objective and determine convergence + if !isempty(unused) + to_update[unused] = true + end - prev_objv = objv - objv = w == nothing ? sum(costs) : dot(w, costs) - objv_change = objv - prev_objv + if t == 1 || num_affected > 0.75 * k + pairwise!(dmat, SqEuclidean(), centers, x) + else + # if only a small subset is affected, only compute for that subset + affected_inds = find(to_update) + dmat_p = pairwise(SqEuclidean(), centers[:, affected_inds], x) + dmat[affected_inds, :] = dmat_p + end - if objv_change > tol - warn("The objective value changes towards an opposite direction") - end + # update assignments - if abs(objv_change) < tol - converged = true - end + update_assignments!(dmat, false, assignments, costs, counts, to_update, unused) + num_affected = sum(to_update) + length(unused) - # display iteration information (if asked) + # compute change of objective and determine convergence - if displevel >= 2 - @printf("%7d %18.6e %18.6e | %8d\n", t, objv, objv_change, num_affected) - end - end + prev_objv = objv + objv = w == nothing ? sum(costs) : dot(w, costs) + objv_change = objv - prev_objv - if displevel >= 1 - if converged - println("K-means converged with $t iterations (objv = $objv)") - else - println("K-means terminated without convergence after $t iterations (objv = $objv)") - end - end + if objv_change > tol + warn("The objective value changes towards an opposite direction") + end + + if abs(objv_change) < tol + converged = true + end + + # display iteration information (if asked) + + if displevel >= 2 + @printf("%7d %18.6e %18.6e | %8d\n", t, objv, objv_change, num_affected) + end + end + + if displevel >= 1 + if converged + println("K-means converged with $t iterations (objv = $objv)") + else + println("K-means terminated without convergence after $t iterations (objv = $objv)") + end + end return KmeansResult(centers, assignments, costs, counts, cweights, @compat(Float64(objv)), t, converged) diff --git a/test/kmeans.jl b/test/kmeans.jl index 7e678ca7..2c9acd79 100644 --- a/test/kmeans.jl +++ b/test/kmeans.jl @@ -12,7 +12,7 @@ k = 10 x = rand(m, n) # non-weighted -r = kmeans(x, k; maxiter=50) +r = kmeans(x, k; maxiter=50, n_init=2) @test isa(r, KmeansResult{Float64}) @test size(r.centers) == (m, k) @test length(r.assignments) == n @@ -24,7 +24,7 @@ r = kmeans(x, k; maxiter=50) @test_approx_eq sum(r.costs) r.totalcost # non-weighted (float32) -r = kmeans(@compat(map(Float32, x)), k; maxiter=50) +r = kmeans(@compat(map(Float32, x)), k; maxiter=50, n_init=2) @test isa(r, KmeansResult{Float32}) @test size(r.centers) == (m, k) @test length(r.assignments) == n @@ -37,7 +37,7 @@ r = kmeans(@compat(map(Float32, x)), k; maxiter=50) # weighted w = rand(n) -r = kmeans(x, k; maxiter=50, weights=w) +r = kmeans(x, k; maxiter=50, weights=w, n_init=2) @test isa(r, KmeansResult{Float64}) @test size(r.centers) == (m, k) @test length(r.assignments) == n From f85cce789ebca588fcef06c160429dd49619f5ff Mon Sep 17 00:00:00 2001 From: lbollar Date: Fri, 30 Sep 2016 14:26:26 -0500 Subject: [PATCH 2/7] cleaned up for PR --- src/kmeans.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/kmeans.jl b/src/kmeans.jl index f553b996..ff26a2df 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -24,7 +24,6 @@ function kmeans!{T<:AbstractFloat}(X::Matrix{T}, centers::Matrix{T}; maxiter::Integer=_kmeans_default_maxiter, tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) - m, n = size(X) m2, k = size(centers) @@ -45,14 +44,14 @@ function kmeans(X::Matrix, k::Int; weights=nothing, init=_kmeans_default_init, maxiter::Integer=_kmeans_default_maxiter, - n_init::Integer=_kmeans_default_n_init, + n_init::Integer=_kmeans_default_n_init, tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) m, n = size(X) (2 <= k < n) || error("k must have 2 <= k < n.") - n_init > 0 || error("n_init must be greater than 0") + n_init > 0 || throw(ArgumentError("n_init must be greater than 0")) lowestcost::Float64 = Inf local bestresult::KmeansResult From 3ea7c3955e807c3f1e1bb3463c41eb03ccf1eeff Mon Sep 17 00:00:00 2001 From: lbollar Date: Fri, 30 Sep 2016 16:02:18 -0500 Subject: [PATCH 3/7] removed tabs --- src/kmeans.jl | 162 +++++++++++++++++++++++++------------------------- 1 file changed, 81 insertions(+), 81 deletions(-) diff --git a/src/kmeans.jl b/src/kmeans.jl index ff26a2df..cb92a7f7 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -47,33 +47,33 @@ function kmeans(X::Matrix, k::Int; n_init::Integer=_kmeans_default_n_init, tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) - + m, n = size(X) (2 <= k < n) || error("k must have 2 <= k < n.") - n_init > 0 || throw(ArgumentError("n_init must be greater than 0")) + n_init > 0 || throw(ArgumentError("n_init must be greater than 0")) - lowestcost::Float64 = Inf - local bestresult::KmeansResult + lowestcost::Float64 = Inf + local bestresult::KmeansResult - for i = 1:n_init + for i = 1:n_init - iseeds = initseeds(init, X, k) - centers = copyseeds(X, iseeds) - result = kmeans!(X, centers; - weights=weights, - maxiter=maxiter, - tol=tol, - display=display) + iseeds = initseeds(init, X, k) + centers = copyseeds(X, iseeds) + result = kmeans!(X, centers; + weights=weights, + maxiter=maxiter, + tol=tol, + display=display) - if result.totalcost < lowestcost - lowestcost = result.totalcost - bestresult = result - end + if result.totalcost < lowestcost + lowestcost = result.totalcost + bestresult = result + end - end - - return bestresult + end + + return bestresult end @@ -93,87 +93,87 @@ function _kmeans!{T<:AbstractFloat}( displevel::Int) # in: the level of display - - # initialize + + # initialize - k = size(centers, 2) - to_update = Array(Bool, k) # indicators of whether a center needs to be updated - unused = Int[] - num_affected::Int = k # number of centers, to which the distances need to be recomputed + k = size(centers, 2) + to_update = Array(Bool, k) # indicators of whether a center needs to be updated + unused = Int[] + num_affected::Int = k # number of centers, to which the distances need to be recomputed - dmat = pairwise(SqEuclidean(), centers, x) - dmat = convert(Array{T}, dmat) #Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T - update_assignments!(dmat, true, assignments, costs, counts, to_update, unused) - objv = w == nothing ? sum(costs) : dot(w, costs) + dmat = pairwise(SqEuclidean(), centers, x) + dmat = convert(Array{T}, dmat) #Can be removed if one day Distance.result_type(SqEuclidean(), T, T) == T + update_assignments!(dmat, true, assignments, costs, counts, to_update, unused) + objv = w == nothing ? sum(costs) : dot(w, costs) - # main loop - t = 0 - converged = false - if displevel >= 2 - @printf "%7s %18s %18s | %8s \n" "Iters" "objv" "objv-change" "affected" - println("-------------------------------------------------------------") - @printf("%7d %18.6e\n", t, objv) - end + # main loop + t = 0 + converged = false + if displevel >= 2 + @printf "%7s %18s %18s | %8s \n" "Iters" "objv" "objv-change" "affected" + println("-------------------------------------------------------------") + @printf("%7d %18.6e\n", t, objv) + end - while !converged && t < maxiter - t = t + 1 + while !converged && t < maxiter + t = t + 1 - # update (affected) centers + # update (affected) centers - update_centers!(x, w, assignments, to_update, centers, cweights) + update_centers!(x, w, assignments, to_update, centers, cweights) - if !isempty(unused) - repick_unused_centers(x, costs, centers, unused) - end + if !isempty(unused) + repick_unused_centers(x, costs, centers, unused) + end - # update pairwise distance matrix + # update pairwise distance matrix - if !isempty(unused) - to_update[unused] = true - end + if !isempty(unused) + to_update[unused] = true + end - if t == 1 || num_affected > 0.75 * k - pairwise!(dmat, SqEuclidean(), centers, x) - else - # if only a small subset is affected, only compute for that subset - affected_inds = find(to_update) - dmat_p = pairwise(SqEuclidean(), centers[:, affected_inds], x) - dmat[affected_inds, :] = dmat_p - end + if t == 1 || num_affected > 0.75 * k + pairwise!(dmat, SqEuclidean(), centers, x) + else + # if only a small subset is affected, only compute for that subset + affected_inds = find(to_update) + dmat_p = pairwise(SqEuclidean(), centers[:, affected_inds], x) + dmat[affected_inds, :] = dmat_p + end - # update assignments + # update assignments - update_assignments!(dmat, false, assignments, costs, counts, to_update, unused) - num_affected = sum(to_update) + length(unused) + update_assignments!(dmat, false, assignments, costs, counts, to_update, unused) + num_affected = sum(to_update) + length(unused) - # compute change of objective and determine convergence + # compute change of objective and determine convergence - prev_objv = objv - objv = w == nothing ? sum(costs) : dot(w, costs) - objv_change = objv - prev_objv + prev_objv = objv + objv = w == nothing ? sum(costs) : dot(w, costs) + objv_change = objv - prev_objv - if objv_change > tol - warn("The objective value changes towards an opposite direction") - end + if objv_change > tol + warn("The objective value changes towards an opposite direction") + end - if abs(objv_change) < tol - converged = true - end + if abs(objv_change) < tol + converged = true + end - # display iteration information (if asked) + # display iteration information (if asked) - if displevel >= 2 - @printf("%7d %18.6e %18.6e | %8d\n", t, objv, objv_change, num_affected) - end - end + if displevel >= 2 + @printf("%7d %18.6e %18.6e | %8d\n", t, objv, objv_change, num_affected) + end + end - if displevel >= 1 - if converged - println("K-means converged with $t iterations (objv = $objv)") - else - println("K-means terminated without convergence after $t iterations (objv = $objv)") - end - end + if displevel >= 1 + if converged + println("K-means converged with $t iterations (objv = $objv)") + else + println("K-means terminated without convergence after $t iterations (objv = $objv)") + end + end return KmeansResult(centers, assignments, costs, counts, cweights, @compat(Float64(objv)), t, converged) From fd296d429df7812c05f78db549727f348777e9c9 Mon Sep 17 00:00:00 2001 From: lbollar Date: Wed, 5 Oct 2016 11:53:03 -0500 Subject: [PATCH 4/7] removed line breaks to conform with style --- src/kmeans.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/kmeans.jl b/src/kmeans.jl index cb92a7f7..6d565fe7 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -57,7 +57,6 @@ function kmeans(X::Matrix, k::Int; local bestresult::KmeansResult for i = 1:n_init - iseeds = initseeds(init, X, k) centers = copyseeds(X, iseeds) result = kmeans!(X, centers; @@ -70,11 +69,8 @@ function kmeans(X::Matrix, k::Int; lowestcost = result.totalcost bestresult = result end - end - return bestresult - end #### Core implementation From 50f0654fc0ff1d03910a4b850b33dff304d42c41 Mon Sep 17 00:00:00 2001 From: lbollar Date: Wed, 5 Oct 2016 12:04:06 -0500 Subject: [PATCH 5/7] fixed another whitespace conflict --- src/kmeans.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/kmeans.jl b/src/kmeans.jl index 6d565fe7..59ec0880 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -87,8 +87,6 @@ function _kmeans!{T<:AbstractFloat}( maxiter::Int, # in: maximum number of iterations tol::Real, # in: tolerance of change at convergence displevel::Int) # in: the level of display - - # initialize From a9cb7aba7331e4aa7c8407851002c76c99c2fdf7 Mon Sep 17 00:00:00 2001 From: lbollar Date: Wed, 5 Oct 2016 12:29:57 -0500 Subject: [PATCH 6/7] used vim command to hopefully fix trailing whitespace issues --- src/kmeans.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/kmeans.jl b/src/kmeans.jl index 59ec0880..ffcae95d 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -9,7 +9,7 @@ type KmeansResult{T<:AbstractFloat} <: ClusteringResult counts::Vector{Int} # number of samples assigned to each cluster (k) cweights::Vector{Float64} # cluster weights (k) totalcost::Float64 # total cost (i.e. objective) (k) - iterations::Int # number of elapsed iterations + iterations::Int # number of elapsed iterations converged::Bool # whether the procedure converged end @@ -21,7 +21,7 @@ const _kmeans_default_n_init = 10 function kmeans!{T<:AbstractFloat}(X::Matrix{T}, centers::Matrix{T}; weights=nothing, - maxiter::Integer=_kmeans_default_maxiter, + maxiter::Integer=_kmeans_default_maxiter, tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) @@ -35,19 +35,19 @@ function kmeans!{T<:AbstractFloat}(X::Matrix{T}, centers::Matrix{T}; counts = Array(Int, k) cweights = Array(Float64, k) - _kmeans!(X, conv_weights(T, n, weights), centers, - assignments, costs, counts, cweights, + _kmeans!(X, conv_weights(T, n, weights), centers, + assignments, costs, counts, cweights, round(Int, maxiter), tol, display_level(display)) end -function kmeans(X::Matrix, k::Int; +function kmeans(X::Matrix, k::Int; weights=nothing, init=_kmeans_default_init, - maxiter::Integer=_kmeans_default_maxiter, + maxiter::Integer=_kmeans_default_maxiter, n_init::Integer=_kmeans_default_n_init, tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) - + m, n = size(X) (2 <= k < n) || error("k must have 2 <= k < n.") @@ -59,13 +59,13 @@ function kmeans(X::Matrix, k::Int; for i = 1:n_init iseeds = initseeds(init, X, k) centers = copyseeds(X, iseeds) - result = kmeans!(X, centers; - weights=weights, + result = kmeans!(X, centers; + weights=weights, maxiter=maxiter, tol=tol, display=display) - if result.totalcost < lowestcost + if result.totalcost < lowestcost lowestcost = result.totalcost bestresult = result end @@ -84,10 +84,10 @@ function _kmeans!{T<:AbstractFloat}( costs::Vector{T}, # out: costs of the resultant assignments (n) counts::Vector{Int}, # out: the number of samples assigned to each cluster (k) cweights::Vector{Float64}, # out: the weights of each cluster - maxiter::Int, # in: maximum number of iterations - tol::Real, # in: tolerance of change at convergence + maxiter::Int, # in: maximum number of iterations + tol::Real, # in: tolerance of change at convergence displevel::Int) # in: the level of display - + # initialize k = size(centers, 2) @@ -169,7 +169,7 @@ function _kmeans!{T<:AbstractFloat}( end end - return KmeansResult(centers, assignments, costs, counts, cweights, + return KmeansResult(centers, assignments, costs, counts, cweights, @compat(Float64(objv)), t, converged) end @@ -261,7 +261,7 @@ function update_centers!{T<:AbstractFloat}( n::Int = size(x, 2) k::Int = size(centers, 2) - # initialize center weights + # initialize center weights for i = 1 : k if to_update[i] cweights[i] = 0. @@ -315,7 +315,7 @@ function update_centers!{T<:AbstractFloat}( n::Int = size(x, 2) k::Int = size(centers, 2) - # initialize center weights + # initialize center weights for i = 1 : k if to_update[i] cweights[i] = 0. @@ -330,7 +330,7 @@ function update_centers!{T<:AbstractFloat}( if wj > 0 @inbounds cj = assignments[j] 1 <= cj <= k || error("assignment out of boundary.") - + if to_update[cj] rj = view(centers, :, cj) xj = view(x, :, j) From b8b381b4df3a29129e6ccee3d3e8998810a3ac9e Mon Sep 17 00:00:00 2001 From: lbollar Date: Wed, 5 Oct 2016 12:47:55 -0500 Subject: [PATCH 7/7] extraneous newline --- src/kmeans.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kmeans.jl b/src/kmeans.jl index ffcae95d..178bcc3d 100644 --- a/src/kmeans.jl +++ b/src/kmeans.jl @@ -48,7 +48,6 @@ function kmeans(X::Matrix, k::Int; tol::Real=_kmeans_default_tol, display::Symbol=_kmeans_default_display) - m, n = size(X) (2 <= k < n) || error("k must have 2 <= k < n.") n_init > 0 || throw(ArgumentError("n_init must be greater than 0"))