From 7f42e0758c47f096be0ec1351ee9f187a70c7278 Mon Sep 17 00:00:00 2001 From: Abhinav Natarajan Date: Tue, 31 Oct 2023 18:48:30 +0000 Subject: [PATCH] refactor imports --- src/RedClust.jl | 94 ++++---- src/example_data.jl | 37 ++-- src/mcmc.jl | 512 +++++++++++++++++++++---------------------- src/pointestimate.jl | 63 +++--- src/prior.jl | 85 ++++--- src/summaries.jl | 28 ++- src/types.jl | 135 ++++++------ src/utils.jl | 52 ++--- 8 files changed, 495 insertions(+), 511 deletions(-) diff --git a/src/RedClust.jl b/src/RedClust.jl index 8554c00..c93436a 100644 --- a/src/RedClust.jl +++ b/src/RedClust.jl @@ -3,49 +3,67 @@ ## Introduction -RedClust is the main module of `RedClust.jl`, a Julia package for Bayesian clustering of high-dimensional Euclidean data using pairwise dissimilarities instead of the raw observations. +RedClust is the main module of `RedClust.jl`, a Julia package for Bayesian clustering of high-dimensional Euclidean data using pairwise dissimilarities instead of the raw observations. -Use `names(RedClust)` to get the export list of this module, and type `?name` to get help on a specific `name`. +Use `names(RedClust)` to get the export list of this module, and type `?name` to get help on a specific `name`. See https://abhinavnatarajan.github.io/RedClust.jl/ for detailed documentation. """ module RedClust -export -# fit prior -fitprior, -fitprior2, -sampledist, -sampleK, - -# MCMC sampler -runsampler, - -# utility functions -adjacencymatrix, -sortlabels, -uppertriangle, -generatemixture, -makematrix, - -# summary functions -evaluateclustering, -summarise, - -# point estimate -getpointestimate, -binderloss, -infodist, - -# types -MCMCOptionsList, -PriorHyperparamsList, -MCMCData, -MCMCResult, - -# data -example_datasets, -example_dataset +# mcmc.jl +using Clustering: kmeans, kmedoids, mutualinfo, randindex, varinfo, vmeasure +using Dates +using Distances: Euclidean, pairwise +using Distributions: Beta, Dirichlet, Distributions, Gamma, MvNormal, Normal, Uniform, + fit_mle, logpdf, pdf, rate, shape, truncated +using HDF5: close, create_group, h5open +using LinearAlgebra: Diagonal, I +using LoopVectorization: @turbo +using Printf: @sprintf +using ProgressBars: ProgressBar +using Random: AbstractRNG, TaskLocalRNG, rand, seed! +using SpecialFunctions: logbeta, loggamma +using StaticArrays +using StatsBase: + autocor, counts, entropy, levelsmap, mean, mean_and_var, sample, std, var, wsample + + +export + # fit prior + fitprior, + fitprior2, + sampledist, + sampleK, + + # MCMC sampler + runsampler, + + # utility functions + adjacencymatrix, + sortlabels, + uppertriangle, + generatemixture, + makematrix, + + # summary functions + evaluateclustering, + summarise, + + # point estimate + getpointestimate, + binderloss, + infodist, + + # types + MCMCOptionsList, + PriorHyperparamsList, + MCMCData, + MCMCResult, + + # data + example_datasets, + example_dataset include("./types.jl") include("./utils.jl") @@ -55,4 +73,4 @@ include("./pointestimate.jl") include("./summaries.jl") include("./example_data.jl") -end \ No newline at end of file +end diff --git a/src/example_data.jl b/src/example_data.jl index 3a3110d..03f7269 100644 --- a/src/example_data.jl +++ b/src/example_data.jl @@ -1,11 +1,8 @@ -using Random: seed! 
-using HDF5: h5open, close, create_group - const datafilename = joinpath(@__DIR__, "..", "data", "example_datasets.h5") # function generate_example_data(n::Int) -# seed!(44) -# K = 10 # Number of clusters +# seed!(44) +# K = 10 # Number of clusters # N = 100 # Number of points # data_σ = [0.25, 0.2, 0.18][n] # Variance of the normal kernel # data_dim = [10, 50, 10][n] # Data dimension @@ -24,44 +21,44 @@ const datafilename = joinpath(@__DIR__, "..", "data", "example_datasets.h5") # example["cluster_labels"] = clusts # example["cluster_weights"] = probs # example["oracle_coclustering_probabilities"] = oracle_coclustering -# end +# end # close(datafile) # end # save_example_data() """ -Return a read-only handle to a HDF5 file that contains example datasets from the main paper. You must remember to close the file once you are done reading from it. This function is provided for reproducibility purposes only; it is recommended to read the datasets via the convenience function [`example_dataset`](@ref). +Return a read-only handle to a HDF5 file that contains example datasets from the main paper. You must remember to close the file once you are done reading from it. This function is provided for reproducibility purposes only; it is recommended to read the datasets via the convenience function [`example_dataset`](@ref). """ function example_datasets() h5open(datafilename, "r") end @doc raw""" - example_dataset(n::Int) + example_dataset(n::Int) Returns a named tuple containing the dataset from the n``^{\mathrm{th}}`` simulated example in the main paper. This dataset was generated using the following code in Julia v1.8.1: ```julia - using RedClust - using Random: seed! - seed!(44) - K = 10 # Number of clusters - N = 100 # Number of points - data_σ = [0.25, 0.2, 0.18][n] # Variance of the normal kernel - data_dim = [10, 50, 10][n] # Data dimension - data = generatemixture(N, K; α = 10, σ = data_σ, dim = data_dim); + using RedClust + using Random: seed! + seed!(44) + K = 10 # Number of clusters + N = 100 # Number of points + data_σ = [0.25, 0.2, 0.18][n] # Variance of the normal kernel + data_dim = [10, 50, 10][n] # Data dimension + data = generatemixture(N, K; α = 10, σ = data_σ, dim = data_dim); ``` Note however that the above code may produce different results on your computer because the random number generator in Julia is not meant for reproducible results across different computers, different versions of Julia, or different versions of the Random.jl package, even with appropriate seeding. Therefore the datasets have been included with this package, and it is recommended to access them via this function. -See also [`generatemixture`](@ref). +See also [`generatemixture`](@ref). 
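As a usage sketch (not part of the original docstring; assumes only this package's exports), the fields of the returned tuple can be accessed as follows:

```julia
using RedClust
data = example_dataset(1)
data.points       # vector of 100 observations
data.distmatrix   # 100×100 matrix of pairwise Euclidean distances
data.clusts       # ground-truth cluster labels
```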
""" function example_dataset(n::Int) if n ∉ [1, 2, 3] throw(ArgumentError("n must be 1, 2, or 3.")) end datafile = example_datasets() - egdata = datafile["example" * string(n)] + egdata = datafile["example"*string(n)] x = read(egdata["points"]) points = [x[:, i] for i in 1:last(size(x))] clusts = read(egdata["cluster_labels"]) @@ -69,6 +66,6 @@ function example_dataset(n::Int) probs = read(egdata["cluster_weights"]) oracle_coclustering = read(egdata["oracle_coclustering_probabilities"]) close(datafile) - data = (points = points, distmatrix = distmatrix, clusts = clusts, probs = probs, oracle_coclustering = oracle_coclustering) + data = (points=points, distmatrix=distmatrix, clusts=clusts, probs=probs, oracle_coclustering=oracle_coclustering) return data -end \ No newline at end of file +end diff --git a/src/mcmc.jl b/src/mcmc.jl index 0c885bb..adb3c84 100644 --- a/src/mcmc.jl +++ b/src/mcmc.jl @@ -1,19 +1,10 @@ -using Clustering: kmedoids -using Distributions: Normal, Uniform, Gamma, Beta, logpdf, truncated -using ProgressBars: ProgressBar -using SpecialFunctions: loggamma -using StaticArrays -using StatsBase: sample, counts, mean_and_var, mean, var -using Random: rand - function loglik( - data::MCMCData, + data::MCMCData, state::MCMCState, params::PriorHyperparamsList)::Float64 D = data.D logD = data.logD clusts = state.clusts - n = length(clusts) clustsizes = state.clustsizes K = state.K δ1 = params.δ1 @@ -28,45 +19,45 @@ function loglik( loggammaδ1 = loggamma(δ1) loggammaδ2 = loggamma(δ2) - C = findall(clustsizes .> 0) - L1::Float64 = 0 - - # Cohesive part of likelihood + C = findall(clustsizes .> 0) + L1::Float64 = 0 + + # Cohesive part of likelihood @inbounds for k = 1:K - # for k = 1:K + # for k = 1:K clust_k = findall(clusts .== C[k]) sz_k = clustsizes[C[k]] pairs_k = binomial(sz_k, 2) a = α + δ1 * pairs_k - b = β + matsum(D, clust_k, clust_k) / 2 + b = β + matsum(D, clust_k, clust_k) / 2 L1 += (δ1 - 1) * matsum(logD, clust_k, clust_k) / 2 - pairs_k * loggammaδ1 + - αβratio + - loggamma(a) - a * log(b) + αβratio + + loggamma(a) - a * log(b) end - + # Repulsive part of likelihood L2 = 0 @inbounds for k = 1:K clust_k = findall(clusts .== C[k]) sz_k = clustsizes[C[k]] - for t = (k + 1):K + for t = (k+1):K clust_t = findall(clusts .== C[t]) - sz_t = clustsizes[C[t]] + sz_t = clustsizes[C[t]] pairs_kt = sz_k * sz_t z = ζ + δ2 * pairs_kt g = γ + matsum(D, clust_k, clust_t) L2 += (δ2 - 1) * matsum(logD, clust_k, clust_t) - pairs_kt * loggammaδ2 + - ζγratio + - loggamma(z) - z * log(g) + ζγratio + + loggamma(z) - z * log(g) end end L = (L1 + L2 * repulsion) - return L + return L end function logprior(state::MCMCState, params::PriorHyperparamsList - )::Float64 +)::Float64 clustsizes = state.clustsizes K = state.K r = state.r @@ -75,15 +66,15 @@ function logprior(state::MCMCState, σ = params.σ u = params.u v = params.v - @inbounds clustsizes = view(state.clustsizes, findall(state.clustsizes .> 0)) + @inbounds clustsizes = view(state.clustsizes, findall(state.clustsizes .> 0)) n = length(state.clusts) - # prior on partition - L = loggamma(K + 1) + (n - K) * log(p) + (r * K) * log(1 - p) - K * loggamma(r) + logpdf(Gamma(η, 1/σ), r) + logpdf(Beta(u, v), p) + # prior on partition + L = loggamma(K + 1) + (n - K) * log(p) + (r * K) * log(1 - p) - K * loggamma(r) + logpdf(Gamma(η, 1 / σ), r) + logpdf(Beta(u, v), p) for nj in clustsizes L += log(nj) + loggamma(nj + r - 1) end - return L + return L end function sample_r!(state::MCMCState, params::PriorHyperparamsList) @@ -97,51 +88,51 @@ function 
sample_r!(state::MCMCState, params::PriorHyperparamsList) proposalsd_r = params.proposalsd_r temp = sample_r(r, p, C, K, η, σ, proposalsd_r) state.r = temp.r - return (r = temp.r, accept = temp.accept) + return (r=temp.r, accept=temp.accept) end function sample_r( - r::Float64, - p::Float64, + r::Float64, + p::Float64, C::Vector{Int}, - K::Integer, + K::Integer, η::Float64, σ::Float64, proposalsd_r::Float64 - ) - # proposal +) + # proposal proposal_distribution = truncated( - Normal(r, proposalsd_r), - lower = 0, - upper = Inf + Normal(r, proposalsd_r), + lower=0, + upper=Inf ) r_candidate = rand(proposal_distribution) reverse_distribution = truncated( - Normal(r_candidate, proposalsd_r), - lower = 0, - upper = Inf + Normal(r_candidate, proposalsd_r), + lower=0, + upper=Inf ) - - # calculate transition probability - logprobcandidate = (η-1) * log(r_candidate) + K * (r_candidate * log(1-p) - loggamma(r_candidate)) - r_candidate * σ - logprobcurrent = (η-1) * log(r) + K * (r * log(1-p) - loggamma(r)) - r * σ - for nk in C - logprobcandidate = logprobcandidate + loggamma(nk - 1 + r_candidate) - logprobcurrent = logprobcurrent + loggamma(nk - 1 + r) + + # calculate transition probability + logprobcandidate = (η - 1) * log(r_candidate) + K * (r_candidate * log(1 - p) - loggamma(r_candidate)) - r_candidate * σ + logprobcurrent = (η - 1) * log(r) + K * (r * log(1 - p) - loggamma(r)) - r * σ + for nk in C + logprobcandidate = logprobcandidate + loggamma(nk - 1 + r_candidate) + logprobcurrent = logprobcurrent + loggamma(nk - 1 + r) end - # Log-ratio of proposal densities - logproposalratio = logpdf(proposal_distribution, r_candidate) - - logpdf(reverse_distribution, r) - - rnew = r - accept = false - # Metropolis acceptance step - if log(rand(Uniform())) < - minimum([0, logprobcandidate - logprobcurrent - logproposalratio]) # accept - rnew = r_candidate - accept = true + # Log-ratio of proposal densities + logproposalratio = logpdf(proposal_distribution, r_candidate) - + logpdf(reverse_distribution, r) + + rnew = r + accept = false + # Metropolis acceptance step + if log(rand(Uniform())) < + minimum([0, logprobcandidate - logprobcurrent - logproposalratio]) # accept + rnew = r_candidate + accept = true end - return (r = rnew, accept = accept) + return (r=rnew, accept=accept) end function sample_p!(state::MCMCState, params::PriorHyperparamsList)::Float64 @@ -154,12 +145,12 @@ function sample_p!(state::MCMCState, params::PriorHyperparamsList)::Float64 end function sample_p( - K::Int, + K::Int, n::Int, - r::Float64, - u::Float64, + r::Float64, + u::Float64, v::Float64 - )::Float64 +)::Float64 rand(Beta(n - K + u, r * K + v)) end @@ -168,13 +159,13 @@ function sample_labels_Gibbs!( data::MCMCData, state::MCMCState, params::PriorHyperparamsList - ) +) - D = data.D + D = data.D logD = data.logD clusts = state.clusts clustsizes = state.clustsizes - n = length(clusts) + n = length(clusts) r = state.r p = state.p δ1 = params.δ1 @@ -186,7 +177,7 @@ function sample_labels_Gibbs!( repulsion = params.repulsion maxK = params.maxK - α_i = zeros(n) + α_i = zeros(n) β_i = zeros(n) ζ_i = zeros(n) γ_i = zeros(n) @@ -197,57 +188,57 @@ function sample_labels_Gibbs!( loggammaδ1 = loggamma(δ1) loggammaδ2 = loggamma(δ2) logp = log(p) - log_oneminusp = log(1-p) - @inbounds for i in 1:n - clustsizes[clusts[i]] = clustsizes[clusts[i]] - 1 - clusts[i] = -1 - C_i = findall(clustsizes .> 0) - K_i = length(C_i) + log_oneminusp = log(1 - p) + @inbounds for i in 1:n + clustsizes[clusts[i]] = clustsizes[clusts[i]] - 1 + clusts[i] = 
-1 + C_i = findall(clustsizes .> 0) + K_i = length(C_i) if (maxK == 0 || K_i < maxK) && K_i < n - candidate_clusts = [C_i; [findfirst(clustsizes .== 0)]] + candidate_clusts = [C_i; [findfirst(clustsizes .== 0)]] else candidate_clusts = C_i end - m = length(candidate_clusts) - - # Calculate everything that we will need - for k in C_i - clust_k = findall(clusts .== k) - sz_clust_k = clustsizes[k] - α_i[k] = α + δ1 * sz_clust_k - β_i[k] = β + matsum(D, [i], clust_k) - ζ_i[k] = ζ + δ2 * sz_clust_k - γ_i[k] = γ + matsum(D, [i], clust_k) - sum_logD_i[k] = matsum(logD, [i], clust_k) + m = length(candidate_clusts) + + # Calculate everything that we will need + for k in C_i + clust_k = findall(clusts .== k) + sz_clust_k = clustsizes[k] + α_i[k] = α + δ1 * sz_clust_k + β_i[k] = β + matsum(D, [i], clust_k) + ζ_i[k] = ζ + δ2 * sz_clust_k + γ_i[k] = γ + matsum(D, [i], clust_k) + sum_logD_i[k] = matsum(logD, [i], clust_k) end - - L1 = zeros(m) - L2 = zeros(m) - log_prior_ratio = zeros(m) - logprobs = zeros(m) - - for k in 1:(m-1) # compute cohesive likelihood component for each proposed cluster + + L1 = zeros(m) + L2 = zeros(m) + log_prior_ratio = zeros(m) + logprobs = zeros(m) + + for k in 1:(m-1) # compute cohesive likelihood component for each proposed cluster sz_candidate_clust_k = clustsizes[candidate_clusts[k]] - L1[k] = loggamma(α_i[candidate_clusts[k]]) + αβratio - - α_i[candidate_clusts[k]] * log(β_i[candidate_clusts[k]]) + - (δ1 - 1) * sum_logD_i[candidate_clusts[k]] - sz_candidate_clust_k * loggammaδ1 + L1[k] = loggamma(α_i[candidate_clusts[k]]) + αβratio - + α_i[candidate_clusts[k]] * log(β_i[candidate_clusts[k]]) + + (δ1 - 1) * sum_logD_i[candidate_clusts[k]] - sz_candidate_clust_k * loggammaδ1 log_prior_ratio[k] = log(sz_candidate_clust_k + 1) + logp + log(sz_candidate_clust_k - 1 + r) - log(sz_candidate_clust_k) - end + end if clustsizes[candidate_clusts[m]] == 0 # new cluster log_prior_ratio[m] = log(K_i + 1) + r * log_oneminusp L1[m] = 0 else sz_candidate_clust_k = clustsizes[candidate_clusts[m]] - L1[m] = loggamma(α_i[candidate_clusts[m]]) + αβratio - - α_i[candidate_clusts[m]] * log(β_i[candidate_clusts[m]]) + - (δ1 - 1) * sum_logD_i[candidate_clusts[m]] - sz_candidate_clust_k * loggammaδ1 + L1[m] = loggamma(α_i[candidate_clusts[m]]) + αβratio - + α_i[candidate_clusts[m]] * log(β_i[candidate_clusts[m]]) + + (δ1 - 1) * sum_logD_i[candidate_clusts[m]] - sz_candidate_clust_k * loggammaδ1 log_prior_ratio[m] = log(sz_candidate_clust_k + 1) + logp + log(sz_candidate_clust_k - 1 + r) - log(sz_candidate_clust_k) end - for t in C_i + for t in C_i L2_ik_prime[t] = loggamma(ζ_i[t]) - ζ_i[t] * log(γ_i[t]) + - ζγratio + (δ2 - 1) * sum_logD_i[t] - clustsizes[t] * loggammaδ2 + ζγratio + (δ2 - 1) * sum_logD_i[t] - clustsizes[t] * loggammaδ2 end L2_i = vecsum(L2_ik_prime, C_i) for k = 1:m @@ -257,8 +248,8 @@ function sample_labels_Gibbs!( k = sample_logweights(logprobs) ci_new = candidate_clusts[k] - clusts[i] = ci_new - clustsizes[ci_new] = clustsizes[ci_new] + 1 + clusts[i] = ci_new + clustsizes[ci_new] = clustsizes[ci_new] + 1 end state.K = sum(clustsizes .> 0) return nothing @@ -270,14 +261,13 @@ function sample_labels_Gibbs_restricted!( state::MCMCState, params::PriorHyperparamsList, items_to_reallocate::Vector{Int}, - candidate_clusts::SVector{2, ClustLabelType}, # vector of length 2 - final_clusts::ClustLabelVector = ClustLabelVector() - ) + candidate_clusts::SVector{2,ClustLabelType}, # vector of length 2 + final_clusts::ClustLabelVector=ClustLabelVector() +) - D = data.D + D = data.D logD = 
data.logD clusts = state.clusts - n = length(clusts) clustsizes = state.clustsizes K = state.K C = findall(clustsizes .> 0) @@ -291,10 +281,10 @@ function sample_labels_Gibbs_restricted!( ζ = params.ζ γ = params.γ repulsion = params.repulsion - free_allocation = isempty(final_clusts) # force the result of the Gibbs scan (when we want to compute transition probability only) - log_transition_prob = 0 # transition probability of the Gibbs scan + free_allocation = isempty(final_clusts) # force the result of the Gibbs scan (when we want to compute transition probability only) + log_transition_prob = 0 # transition probability of the Gibbs scan - α_i = zeros(MVector{2}) + α_i = zeros(MVector{2}) β_i = zeros(MVector{2}) ζ_i = zeros(K) γ_i = zeros(K) @@ -306,59 +296,59 @@ function sample_labels_Gibbs_restricted!( loggammaδ2 = loggamma(δ2) logp = log(p) L1 = zeros(MVector{2}) - L2 = zeros(MVector{2}) + L2 = zeros(MVector{2}) log_prior_ratio = zeros(MVector{2}) - logprobs = zeros(MVector{2}) - @inbounds for i in items_to_reallocate - clustsizes[clusts[i]] = clustsizes[clusts[i]] - 1 - clusts[i] = -1 - - # Calculate everything that we will need + logprobs = zeros(MVector{2}) + @inbounds for i in items_to_reallocate + clustsizes[clusts[i]] = clustsizes[clusts[i]] - 1 + clusts[i] = -1 + + # Calculate everything that we will need @simd for k in 1:2 clust_k = findall(clusts .== candidate_clusts[k]) - sz_clust_k = clustsizes[candidate_clusts[k]] - α_i[k] = α + δ1 * sz_clust_k - β_i[k] = β + matsum(D, [i], clust_k) + sz_clust_k = clustsizes[candidate_clusts[k]] + α_i[k] = α + δ1 * sz_clust_k + β_i[k] = β + matsum(D, [i], clust_k) end @simd for k in 1:K - clust_k = findall(clusts .== C[k]) - sz_clust_k = clustsizes[C[k]] - ζ_i[k] = ζ + δ2 * sz_clust_k - γ_i[k] = γ + matsum(D, [i], clust_k) - sum_logD_i[k] = matsum(logD, [i], clust_k) + clust_k = findall(clusts .== C[k]) + sz_clust_k = clustsizes[C[k]] + ζ_i[k] = ζ + δ2 * sz_clust_k + γ_i[k] = γ + matsum(D, [i], clust_k) + sum_logD_i[k] = matsum(logD, [i], clust_k) end - - for k in 1:2 # compute cohesive likelihood component for each proposed cluster + + for k in 1:2 # compute cohesive likelihood component for each proposed cluster sz_candidate_clust_k = clustsizes[candidate_clusts[k]] L1[k] = loggamma(α_i[k]) + αβratio - α_i[k] * log(β_i[k]) + - (δ1 - 1) * sum_logD_i[candidate_clust_inds[k]] - sz_candidate_clust_k * loggammaδ1 + (δ1 - 1) * sum_logD_i[candidate_clust_inds[k]] - sz_candidate_clust_k * loggammaδ1 log_prior_ratio[k] = log(sz_candidate_clust_k + 1) + logp + log(sz_candidate_clust_k - 1 + r) - log(sz_candidate_clust_k) - end + end for t in 1:K L2_ik_prime[t] = loggamma(ζ_i[t]) - ζ_i[t] * log(γ_i[t]) + - ζγratio + (δ2 - 1) * sum_logD_i[t] - clustsizes[C[t]] * loggammaδ2 + ζγratio + (δ2 - 1) * sum_logD_i[t] - clustsizes[C[t]] * loggammaδ2 end - L2_i = L2_ik_prime[1] + L2_ik_prime[2] + L2_i = L2_ik_prime[1] + L2_ik_prime[2] for k = 1:2 L2[k] = L2_i - L2_ik_prime[candidate_clust_inds[k]] end - @. logprobs = log_prior_ratio + (L1 + L2 * repulsion) - if free_allocation - k = sample_logweights(logprobs) - ci_new = candidate_clusts[k] - else - ci_new = final_clusts[i] - k = findfirst(candidate_clusts .== ci_new) - end - - clusts[i] = ci_new - clustsizes[ci_new] = clustsizes[ci_new] + 1 - - # calculate the transition probability - logprobs .+= minimum(logprobs) - probs = exp.(logprobs) - probs ./= probs[1] + probs[2] - log_transition_prob += log(probs[k]) + @. 
logprobs = log_prior_ratio + (L1 + L2 * repulsion) + if free_allocation + k = sample_logweights(logprobs) + ci_new = candidate_clusts[k] + else + ci_new = final_clusts[i] + k = findfirst(candidate_clusts .== ci_new) + end + + clusts[i] = ci_new + clustsizes[ci_new] = clustsizes[ci_new] + 1 + + # calculate the transition probability + logprobs .+= minimum(logprobs) + probs = exp.(logprobs) + probs ./= probs[1] + probs[2] + log_transition_prob += log(probs[k]) end return log_transition_prob end @@ -368,28 +358,28 @@ function sample_labels!( state::MCMCState, params::PriorHyperparamsList, options::MCMCOptionsList - ) +) # Unpack - - n = length(state.clusts) + + n = length(state.clusts) r = state.r p = state.p numMH = options.numMH numGibbs = options.numGibbs - accept = fill(false, numMH) + accept = fill(false, numMH) split = fill(false, numMH) @inbounds for MH_counter in 1:numMH - # for MH_counter in 1:numMH + # for MH_counter in 1:numMH clusts = state.clusts clustsizes = state.clustsizes K = state.K # BEGIN MH # Pick chaperones - i, j = sample(1:n, 2, replace = false) + i, j = sample(1:n, 2, replace=false) ci = clusts[i] # labels of their current cluster allocations cj = clusts[j] - + # automatically reject if we have max number of clusters if params.maxK > 0 && ci == cj && length(findall(clustsizes .> 0)) >= params.maxK continue @@ -397,48 +387,48 @@ function sample_labels!( # choose elements to reallocate S = findall(clusts .== ci .|| clusts .== cj) - S = S[S .!= i .&& S .!= j] - + S = S[S.!=i.&&S.!=j] + # initialise random launch state claunch = copy(clusts) szlaunch = copy(clustsizes) Klaunch = K - if ci == cj + if ci == cj claunch[i] = findall(clustsizes .== 0)[1] szlaunch[ci] = szlaunch[ci] - 1 szlaunch[claunch[i]] = szlaunch[claunch[i]] + 1 Klaunch = K + 1 end candidate_clusts = SA[claunch[i], claunch[j]] - for k in S + for k in S claunch[k] = sample(candidate_clusts) szlaunch[clusts[k]] = szlaunch[clusts[k]] - 1 szlaunch[claunch[k]] = szlaunch[claunch[k]] + 1 end launchstate = MCMCState(claunch, r, p, szlaunch, Klaunch) - + # restricted Gibbs scans - for scan_counter in 1:numGibbs - sample_labels_Gibbs_restricted!(data, - launchstate, params, S, candidate_clusts) + for _ in 1:numGibbs + sample_labels_Gibbs_restricted!(data, + launchstate, params, S, candidate_clusts) end - + if ci == cj # split proposal split[MH_counter] = true # final Gibbs scan - log_transition_prob = sample_labels_Gibbs_restricted!(data, - launchstate, params, S, candidate_clusts) + log_transition_prob = sample_labels_Gibbs_restricted!(data, + launchstate, params, S, candidate_clusts) finalstate = launchstate cfinal = finalstate.clusts szfinal = finalstate.clustsizes Kfinal = finalstate.K - + # log prior ratio for MH acceptance ratio - log_prior_ratio = log(K + 1) + r * log(1 - p) - log(p) - loggamma(r) + - loggamma(szfinal[cfinal[i]] - 1 + r) + loggamma(szfinal[cfinal[j]] - 1 + r) + - log(szfinal[cfinal[i]]) + log(szfinal[cfinal[j]]) + - -(loggamma(clustsizes[ci] - 1 + r) + log(clustsizes[ci])) - + log_prior_ratio = log(K + 1) + r * log(1 - p) - log(p) - loggamma(r) + + loggamma(szfinal[cfinal[i]] - 1 + r) + loggamma(szfinal[cfinal[j]] - 1 + r) + + log(szfinal[cfinal[i]]) + log(szfinal[cfinal[j]]) + + -(loggamma(clustsizes[ci] - 1 + r) + log(clustsizes[ci])) + # proposal density ratio pr(new | old) / pr(old | new) log_proposal_ratio = log_transition_prob # note that the reverse proposal density is just 1 @@ -455,24 +445,24 @@ function sample_labels!( finalstate = MCMCState(cfinal, r, p, szfinal, Kfinal) # log 
prior ratio for MH acceptance ratio - log_prior_ratio = -(log(K) + r * log(1 - p) - log(p) - loggamma(r)) + - loggamma(szfinal[cj] - 1 + r) + log(szfinal[cj]) + - -(loggamma(clustsizes[ci] - 1 + r) + loggamma(clustsizes[cj] - 1 + r) - + log(clustsizes[ci]) + log(clustsizes[cj])) - + log_prior_ratio = -(log(K) + r * log(1 - p) - log(p) - loggamma(r)) + + loggamma(szfinal[cj] - 1 + r) + log(szfinal[cj]) + + -(loggamma(clustsizes[ci] - 1 + r) + loggamma(clustsizes[cj] - 1 + r) + + log(clustsizes[ci]) + log(clustsizes[cj])) + # proposal density ratio pr(new | old) / pr(old | new) - log_transition_prob = sample_labels_Gibbs_restricted!(data, - launchstate, params, S, candidate_clusts, clusts) - + log_transition_prob = sample_labels_Gibbs_restricted!(data, + launchstate, params, S, candidate_clusts, clusts) + log_proposal_ratio = -log_transition_prob # note that the reverse proposal density is just 1 end - + # likelihood ratio for MH acceptance ratio log_lik_ratio = loglik(data, - finalstate, params) - - loglik(data, state, params) - + finalstate, params) - + loglik(data, state, params) + # MH acceptance step log_acceptance_ratio = minimum( [0, log_prior_ratio + log_lik_ratio - log_proposal_ratio]) @@ -482,25 +472,25 @@ function sample_labels!( end # END MH end - - # Final Gibbs scan + + # Final Gibbs scan sample_labels_Gibbs!(data, state, params) - return (accept = accept, split = split) + return (accept=accept, split=split) end """ - runsampler(data, - options = MCMCOptionsList(), - params = PriorHyperparamsList(); - verbose = true) -> MCMCResult + runsampler(data, + options = MCMCOptionsList(), + params = PriorHyperparamsList(); + verbose = true) -> MCMCResult Runs the MCMC sampler on the data. # Arguments -- `data::MCMCData`: contains the distance matrix. -- `options::MCMCOptionsList`: contains the number of iterations, burnin, etc. +- `data::MCMCData`: contains the distance matrix. +- `options::MCMCOptionsList`: contains the number of iterations, burnin, etc. - `params::PriorHyperparamsList`: contains the prior hyperparameters for the model. -- `verbose::Bool`: if false, disables all info messages and progress bars. +- `verbose::Bool`: if false, disables all info messages and progress bars. # Returns A struct of type `MCMCResult` containing the MCMC samples, convergence diagnostics, and summary statistics. @@ -508,31 +498,31 @@ A struct of type `MCMCResult` containing the MCMC samples, convergence diagnosti # See also [`MCMCData`](@ref), [`MCMCOptionsList`](@ref), [`fitprior`](@ref), [`PriorHyperparamsList`](@ref), [`MCMCResult`](@ref). """ -function runsampler(data::MCMCData, - options::MCMCOptionsList = MCMCOptionsList(), - params::Union{PriorHyperparamsList, Nothing} = nothing, - init::Union{MCMCState, Nothing} = nothing; - verbose = true - )::MCMCResult - if !verbose - ostream = devnull - else - ostream = stdout - end +function runsampler(data::MCMCData, + options::MCMCOptionsList=MCMCOptionsList(), + params::Union{PriorHyperparamsList,Nothing}=nothing, + init::Union{MCMCState,Nothing}=nothing; + verbose=true +)::MCMCResult + if !verbose + ostream = devnull + else + ostream = stdout + end numiters = options.numiters burnin = options.burnin thin = options.thin numsamples = options.numsamples if isnothing(params) - params = fitprior(data.D, "k-medoids", true; verbose = verbose) + params = fitprior(data.D, "k-medoids", true; verbose=verbose) end if isnothing(init) init = MCMCState( - clusts = kmedoids(data.D, - (params.maxK > 0 ? 
minimum([params.maxK, params.K_initial]) : params.K_initial); - maxiter=1000).assignments, - r = rand(Gamma(params.η, 1 / params.σ)), - p = rand(Beta(params.u, params.v)) + clusts=kmedoids(data.D, + (params.maxK > 0 ? minimum([params.maxK, params.K_initial]) : params.K_initial); + maxiter=1000).assignments, + r=rand(Gamma(params.η, 1 / params.σ)), + p=rand(Beta(params.u, params.v)) ) end result = MCMCResult(data, options, params) @@ -540,29 +530,29 @@ function runsampler(data::MCMCData, # Start sampling j::Int = 1 - printstyled(ostream, "Run MCMC\n"; bold = true, color= :blue) - printstyled(ostream, "Setup: "; bold = true) + printstyled(ostream, "Run MCMC\n"; bold=true, color=:blue) + printstyled(ostream, "Setup: "; bold=true) println(ostream, "$numiters iterations, $numsamples samples, $(size(data.D, 1)) observations.") runtime = @elapsed ( - for i in ProgressBar(1:numiters, output_stream = ostream) - result.r_acceptances[i] = sample_r!(state, params).accept - sample_p!(state, params) - temp = sample_labels!(data, state, params, options) - temprange = ((i-1) * options.numMH + 1) : (i * options.numMH) - result.splitmerge_acceptances[temprange] .= temp.accept - result.splitmerge_splits[temprange] .= temp.split - - # Record sample if necessary - if i > burnin && (i - burnin) % thin == 0 - result.clusts[j] .= sortlabels(state.clusts) - result.K[j] = state.K - result.r[j] = state.r - result.p[j] = state.p - result.loglik[j] = loglik(data, state, params) - result.logposterior[j] = result.loglik[j] + logprior(state, params) - j = j + 1 + for i in ProgressBar(1:numiters, output_stream=ostream) + result.r_acceptances[i] = sample_r!(state, params).accept + sample_p!(state, params) + temp = sample_labels!(data, state, params, options) + temprange = ((i-1)*options.numMH+1):(i*options.numMH) + result.splitmerge_acceptances[temprange] .= temp.accept + result.splitmerge_splits[temprange] .= temp.split + + # Record sample if necessary + if i > burnin && (i - burnin) % thin == 0 + result.clusts[j] .= sortlabels(state.clusts) + result.K[j] = state.K + result.r[j] = state.r + result.p[j] = state.p + result.loglik[j] = loglik(data, state, params) + result.logposterior[j] = result.loglik[j] + logprior(state, params) + j = j + 1 + end end - end ) println(ostream, "Computing summary statistics and diagnostics.") @@ -601,20 +591,20 @@ end function sample_rp( clustsizes::Vector{Int}, - options::MCMCOptionsList = MCMCOptionsList(), - params::PriorHyperparamsList = PriorHyperparamsList(); - verbose = true - ) - if !verbose - ostream = devnull - else - ostream = stdout - end + options::MCMCOptionsList=MCMCOptionsList(), + params::PriorHyperparamsList=PriorHyperparamsList(); + verbose=true +) + if !verbose + ostream = devnull + else + ostream = stdout + end # Unpack parameters numiters = options.numiters burnin = options.burnin thin = options.thin - η = params.η + η = params.η σ = params.σ proposalsd_r = params.proposalsd_r u = params.u @@ -622,25 +612,25 @@ function sample_rp( C = clustsizes[findall(clustsizes .> 0)] n = sum(C) K = length(C) - - # Initialise result + + # Initialise result r = rand(Gamma(η, σ)) p = rand(Beta(u, v)) - numsamples = Integer(floor((numiters - burnin) / thin)) - result = (r = zeros(numsamples), p = zeros(numsamples)) - j = 1 - for i in ProgressBar(1:numiters, output_stream = ostream) - # Sample r - r = sample_r(r, p, C, K, η, σ, proposalsd_r).r - # Sample p - p = sample_p(K, n, r, u, v) - - # Record sample if necessary - if i > burnin && (i - burnin) % thin == 0 - result.r[j] = r - 
result.p[j] = p
-			j = j + 1
+            result.p[j] = p
+            j = j + 1
         end
     end
-    return result
-end
\ No newline at end of file
+    return result
+end
diff --git a/src/pointestimate.jl b/src/pointestimate.jl
index 72ade17..eead858 100644
--- a/src/pointestimate.jl
+++ b/src/pointestimate.jl
@@ -1,23 +1,20 @@
-using Clustering: randindex, mutualinfo, varinfo
-using StatsBase: entropy
-
 """
-	getpointestimate(samples::MCMCResult; method = "MAP", loss = "VI")
+    getpointestimate(samples::MCMCResult; method = "MAP", loss = "VI")

-Computes a point estimate from a vector of samples of cluster allocations by searching for a minimiser of the posterior expectation of some loss function. 
+Computes a point estimate from a vector of samples of cluster allocations by searching for a minimiser of the posterior expectation of some loss function.

-# Arguments 
+# Arguments
- `method::String`: must be one of the following -
	- `"MAP"`: maximum a posteriori.
	- `"MLE"`: maximum likelihood estimation.
	- `"MPEL"`: minimum posterior expected loss.
	`"MAP"` and `"MLE"` search among the MCMC samples for the clustering with the maximum log posterior and log likelihood respectively. `"MPEL"` searches for a clustering that minimises the posterior expected loss of some loss function specified by the `loss` argument. The search space is the set of samples in `samples`.
- `loss`: Determines the loss function used for the `"MPEL"` method. Must be either a string or a function. If specified as a string, must be one of `"binder"` (Binder loss), `"omARI"` (one minus the Adjusted Rand Index), `"VI"` (Variation of Information distance), or `"ID"` (Information Distance). If specified as a function, must have a method defined for `(x::Vector{Int}, y::Vector{Int}) -> Real`.

# Returns
Returns a tuple `(clust, i)` where `clust` is a clustering allocation in `samples` and `i` is its sample index.
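An illustrative call (hypothetical; assumes `result` is an `MCMCResult` returned by [`runsampler`](@ref)):

```julia
pointestimate, index = getpointestimate(result; method = "MPEL", loss = "VI")
```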
""" -function getpointestimate(samples::MCMCResult; method::String = "MAP", loss::Union{String, Function} = "VI")::Tuple{ClustLabelVector, Int} +function getpointestimate(samples::MCMCResult; method::String="MAP", loss::Union{String,Function}="VI")::Tuple{ClustLabelVector,Int} # input validation if method == "MPEL" && typeof(loss) == String if loss ∉ ["binder", "omARI", "VI", "ID"] @@ -39,51 +36,51 @@ function getpointestimate(samples::MCMCResult; method::String = "MAP", loss::Uni if loss isa Function lossfn = loss elseif typeof(loss) == String - if loss == "binder" + if loss == "binder" lossfn = (x::ClustLabelVector, y::ClustLabelVector) -> randindex(x, y)[3] - elseif loss == "omARI" + elseif loss == "omARI" lossfn = (x::ClustLabelVector, y::ClustLabelVector) -> 1 - randindex(x, y)[1] elseif loss == "VI" lossfn = varinfo elseif loss == "ID" - lossfn = (x::ClustLabelVector, y::ClustLabelVector) -> infodist(x, y; normalised = false) + lossfn = (x::ClustLabelVector, y::ClustLabelVector) -> infodist(x, y; normalised=false) end end clusts = samples.clusts lossmatrix = zeros(length(clusts), length(clusts)) for i in eachindex(clusts) for j in (i+1):length(clusts) - lossmatrix[i, j] = lossfn(clusts[i], clusts[j]) + lossmatrix[i, j] = lossfn(clusts[i], clusts[j]) end end lossmatrix += transpose(lossmatrix) - i = argmin(vec(sum(lossmatrix, dims = 1))) + i = argmin(vec(sum(lossmatrix, dims=1))) return (clusts[i], i) end end """ - binderloss(a::ClustLabelVector, b::ClustLabelVector; - normalised = true) -> Float64 + binderloss(a::ClustLabelVector, b::ClustLabelVector; + normalised = true) -> Float64 -Computes the Binder loss between two clusterings. If `normalised = true` then the result is equal to one minus the rand index between `a` and `b`. +Computes the Binder loss between two clusterings. If `normalised = true` then the result is equal to one minus the rand index between `a` and `b`. """ function binderloss( a::ClustLabelVector, b::ClustLabelVector; - normalised = true - )::Float64 + normalised=true +)::Float64 length(a) == length(b) || throw(ArgumentError("Length of the input vectors must be equal.")) n = length(a) return randindex(a, b)[3] * (normalised ? 1 : binomial(n, 2)) end @doc raw""" - infodist(a::ClustLabelVector, b::ClustLabelVector; - normalised = true) -> Float64 + infodist(a::ClustLabelVector, b::ClustLabelVector; + normalised = true) -> Float64 -Computes the information distance between two clusterings. The information distance is defined as -```math +Computes the information distance between two clusterings. The information distance is defined as +```math d_{\mathrm{ID}}(a, b) = \max \{H(A), H(B)\} - I(A, B) ``` where ``A`` and ``B`` are the cluster membership probability functions for ``a`` and ``b`` respectively, ``H`` denotes the entropy of a distribution, and ``I`` denotes the mutual information between two distributions. The information distance has range ``[0, \log N]`` where ``N`` is the number of observations. If normalised = true, the result is scaled to the range ``[0, 1]``. @@ -91,11 +88,11 @@ where ``A`` and ``B`` are the cluster membership probability functions for ``a`` function infodist( a::ClustLabelVector, b::ClustLabelVector; - normalised = true - )::Float64 + normalised=true +)::Float64 length(a) != length(b) && throw(ArgumentError("Length of the input vectors must be equal.")) n = length(a) - hu = entropy(counts(a)/n) - hv = entropy(counts(b)/n) - return normalised ? 
1 - mutualinfo(a, b; normed = false) / maximum([hu, hv]) : maximum([hu, hv]) - mutualinfo(a, b; normed = false)
-end
\ No newline at end of file
+    hu = entropy(counts(a) / n)
+    hv = entropy(counts(b) / n)
+    return normalised ? 1 - mutualinfo(a, b; normed=false) / maximum([hu, hv]) : maximum([hu, hv]) - mutualinfo(a, b; normed=false)
+end
diff --git a/src/prior.jl b/src/prior.jl
index e1bec9c..ee9858d 100644
--- a/src/prior.jl
+++ b/src/prior.jl
@@ -1,38 +1,31 @@
-using Clustering: kmeans, kmedoids
-using Distances: pairwise, Euclidean
-using Distributions: fit_mle, Gamma, shape, rate, Distributions, Beta
-using ProgressBars: ProgressBar
-using StatsBase: counts, std
-using SpecialFunctions: logbeta
-
 """
-    fitprior(data, algo, diss = false;
-    Kmin = 1,
-    Kmax = Int(floor(size(data)[end] / 2)),
+    fitprior(data, algo, diss = false;
+    Kmin = 1,
+    Kmax = Int(floor(size(data)[end] / 2)),
 	verbose = true)

-Determines the best prior hyperparameters from the data. A notional clustering is obtained using k-means or k-medoids and the elbow method, and the distances are split into within-cluster distances and inter-cluster distances based on the notional clustering. These distances are then used to fit the prior hyperparameters using MLE and empirical Bayes sampling. 
+Determines the best prior hyperparameters from the data. A notional clustering is obtained using k-means or k-medoids and the elbow method, and the distances are split into within-cluster distances and inter-cluster distances based on the notional clustering. These distances are then used to fit the prior hyperparameters using MLE and empirical Bayes sampling.

# Required Arguments
-- `data::Union{Vector{Vector{Float64}}, Matrix{Float64}}`: can either be a vector of (possibly multi-dimensional) observations, or a matrix with each column an observation, or a square matrix of pairwise dissimilarities. 
+- `data::Union{Vector{Vector{Float64}}, Matrix{Float64}}`: can either be a vector of (possibly multi-dimensional) observations, or a matrix with each column an observation, or a square matrix of pairwise dissimilarities.
- `algo::String`: must be one of `"k-means"` or `"k-medoids"`.

# Optional Arguments
-- `diss::Bool`: if true, `data` will be assumed to be a pairwise dissimilarity matrix. 
-- `Kmin::Integer`: minimum number of clusters to scan for the elbow method. 
+- `diss::Bool`: if true, `data` will be assumed to be a pairwise dissimilarity matrix.
+- `Kmin::Integer`: minimum number of clusters to scan for the elbow method.
- `Kmax::Integer`: maximum number of clusters to scan for the elbow method. If left unspecified, it is set to half the number of observations.
-- `verbose::Bool`: if false, disables all info messages and progress bars. 
+- `verbose::Bool`: if false, disables all info messages and progress bars.

# Returns
An object of type [`PriorHyperparamsList`](@ref).
"""
function fitprior(
	data::Union{Vector{Vector{Float64}}, Matrix{Float64}},
-	algo::String, 
-	diss::Bool = false;
-	Kmin::Integer = 1, 
+    algo::String,
+    diss::Bool = false;
+    Kmin::Integer = 1,
	Kmax::Integer = Int(floor(size(data)[end] / 2)),
-	verbose::Bool = true 
+    verbose::Bool = true
	)
	ostream = verbose ? 
stdout : devnull
	printstyled(ostream, "Fitting prior hyperparameters\n"; bold = true)
@@ -72,7 +65,7 @@ function fitprior(
 			objective[k] = temp.totalcost
 			if !temp.converged
 				@warn "Clustering did not converge at K = $k"
-			end 
+			end
 		end
 	elbow = detectknee(Kmin:Kmax, objective)[1]
 	K = elbow
@@ -112,7 +105,7 @@ function fitprior(
 		γ = 1
 	else
 		fitB = fit_mle(Gamma, B)
-		δ2 = shape(fitB) 
+		δ2 = shape(fitB)
 		ζ = length(B) * δ2
 		γ = sum(B)
 	end
@@ -123,11 +116,11 @@ function fitprior(
 		α = α,
 		β = β,
 		ζ = ζ,
-		γ = γ, 
+		γ = γ,
 		η = η,
 		σ = σ,
 		proposalsd_r = proposalsd_r,
-		u = u, 
+		u = u,
 		v = v,
 		K_initial = K
 	)
@@ -135,33 +128,33 @@ function fitprior(
 end

@doc raw"""
-    fitprior2(data, algo, diss = false;
-    Kmin = 1,
-    Kmax = Int(floor(size(data)[end] / 2)),
+    fitprior2(data, algo, diss = false;
+    Kmin = 1,
+    Kmax = Int(floor(size(data)[end] / 2)),
 	verbose = true)

Determines the best prior hyperparameters from the data. Uses the same method as [`fitprior`](@ref) to obtain values for ``\sigma``, ``\eta``, ``u``, and ``v``, but derives values for the cluster-specific parameters by considering within-cluster and cross-cluster distances over clusterings with ``K`` clusters for all values of ``K \in [K_\mathrm{min}, K_\mathrm{max}]``, weighted by the prior predictive distribution of ``K`` in that range.

# Required Arguments
-- `data::Union{Vector{Vector{Float64}}, Matrix{Float64}}`: can either be a vector of (possibly multi-dimensional) observations, or a matrix with each column an observation, or a square matrix of pairwise dissimilarities. 
+- `data::Union{Vector{Vector{Float64}}, Matrix{Float64}}`: can either be a vector of (possibly multi-dimensional) observations, or a matrix with each column an observation, or a square matrix of pairwise dissimilarities.
- `algo::String`: must be one of `"k-means"` or `"k-medoids"`.

# Optional Arguments
-- `diss::Bool`: if true, `data` will be assumed to be a pairwise dissimilarity matrix. 
-- `Kmin::Integer`: minimum number of clusters to scan for the elbow method. 
+- `diss::Bool`: if true, `data` will be assumed to be a pairwise dissimilarity matrix.
+- `Kmin::Integer`: minimum number of clusters to scan for the elbow method.
- `Kmax::Integer`: maximum number of clusters to scan for the elbow method. If left unspecified, it is set to half the number of observations.
-- `verbose::Bool`: if false, disables all info messages and progress bars. 
+- `verbose::Bool`: if false, disables all info messages and progress bars.

# Returns
An object of type [`PriorHyperparamsList`](@ref).
"""
function fitprior2(
	data::Union{Vector{Vector{Float64}}, Matrix{Float64}},
-	algo::String, 
-	diss::Bool = false;
-	Kmin::Integer = 1, 
+    algo::String,
+    diss::Bool = false;
+    Kmin::Integer = 1,
	Kmax::Integer = Int(floor(size(data)[end] / 2)),
-	verbose::Bool = true 
+    verbose::Bool = true
	)
	ostream = verbose ? 
stdout : devnull printstyled(ostream, "Fitting prior hyperparameters\n"; bold = true) @@ -231,7 +224,7 @@ function fitprior2( fitp = fit_mle(Beta, temp.p) u, v = Distributions.params(fitp) - # Generate prior on K + # Generate prior on K Kprior = pmf(sampleK(η, σ, u, v, maximum([10000, 100 * N]), N)) append!(Kprior, zeros(N - length(Kprior))) @@ -261,7 +254,7 @@ function fitprior2( γ = 1 else fitB = fit_mle(Gamma, B, wtsB) - δ2 = shape(fitB) + δ2 = shape(fitB) ζ = sum(szB[Kmin:Kmax] .* Kprior[Kmin:Kmax]) * δ2 γ = sum(B .* wtsB) end @@ -272,11 +265,11 @@ function fitprior2( α = α, β = β, ζ = ζ, - γ = γ, + γ = γ, η = η, σ = σ, proposalsd_r = proposalsd_r, - u = u, + u = u, v = v, K_initial = K ) @@ -322,7 +315,7 @@ Returns a vector of length `numsamples` containing samples of ``K`` (number of c """ function sampleK(η::Real, σ::Real, u::Real, v::Real, numsamples::Int, n::Int) # Input validation - if n < 1 + if n < 1 throw(ArgumentError("n must be a positive integer.")) end if numsamples < 1 @@ -345,7 +338,7 @@ end sampleK(params::PriorHyperparamsList, numsamples::Int, n::Int)::Vector{Int} = sampleK(params.η, params.σ, params.u, params.v, numsamples, n) function detectknee( - xvalues::AbstractVector{<:Real}, + xvalues::AbstractVector{<:Real}, yvalues::AbstractVector{<:Real} )::NTuple{2,Real} # Max values to create line @@ -367,8 +360,8 @@ function detectknee( end function pmf(X::AbstractVector{<:Integer}) - xmin = minimum(X) - p = counts(X)./length(X) - p = [zeros(xmin-1); p] - return p -end \ No newline at end of file + xmin = minimum(X) + p = counts(X)./length(X) + p = [zeros(xmin-1); p] + return p +end diff --git a/src/summaries.jl b/src/summaries.jl index 3b6671e..5f243b1 100644 --- a/src/summaries.jl +++ b/src/summaries.jl @@ -1,16 +1,14 @@ -using Clustering: randindex, varinfo, vmeasure, mutualinfo - @doc raw""" - evaluateclustering(clusts::ClustLabelVector, truth::ClustLabelVector) -Returns a named tuple of values quantifying the accuracy of the clustering assignments in `clusts` with respect to the ground truth clustering assignments in `truth`. +evaluateclustering(clusts::ClustLabelVector, truth::ClustLabelVector) +Returns a named tuple of values quantifying the accuracy of the clustering assignments in `clusts` with respect to the ground truth clustering assignments in `truth`. # Return values -- `nbloss`: Normalised Binder loss (= 1 - rand index). Lower is better. -- `ari`: Adjusted Rand index. Higher is better. +- `nbloss`: Normalised Binder loss (= 1 - rand index). Lower is better. +- `ari`: Adjusted Rand index. Higher is better. - `vi`: Variation of Information distance (``\in [0, \log N]``). Lower is better. - `nvi`: Normalised VI distance (``\in [0, 1]``). -- `id`: Information Distance (``\in [0, \log N]``). Lower is better. +- `id`: Information Distance (``\in [0, \log N]``). Lower is better. - `nid`: Normalised Information Distance (``\in [0, 1]``). -- `nmi`: Normalised Mutual Information (``\in [0, 1]``). Higher is better. +- `nmi`: Normalised Mutual Information (``\in [0, 1]``). Higher is better. 
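A brief sketch of intended use (hypothetical names: `pointestimate` is a clustering and `truth` the ground-truth labels):

```julia
metrics = evaluateclustering(pointestimate, truth)
metrics.ari      # Adjusted Rand index
metrics.nbloss   # normalised Binder loss
```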
""" function evaluateclustering(clusts::ClustLabelVector, truth::ClustLabelVector) length(clusts) != length(truth) && throw(ArgumentError("Length of inputs must be equal.")) @@ -18,21 +16,21 @@ function evaluateclustering(clusts::ClustLabelVector, truth::ClustLabelVector) ari, _, nbloss, _ = randindex(clusts, truth) vi = varinfo(clusts, truth) nvi = vi / log(n) - id = infodist(clusts, truth; normalised = false) + id = infodist(clusts, truth; normalised=false) nid = id / log(n) nmi = mutualinfo(clusts, truth) - return (nbloss = nbloss, ari = ari, vi = vi, nvi, id = id, nid = nid, nmi = nmi) + return (nbloss=nbloss, ari=ari, vi=vi, nvi, id=id, nid=nid, nmi=nmi) end """ - summarise([io::IO], clusts::ClustLabelVector, - truth::ClustLabelVector, - printoutput = true) -> String +summarise([io::IO], clusts::ClustLabelVector, +truth::ClustLabelVector, +printoutput = true) -> String Prints a summary of the clustering accuracy of `clusts` with respect to the ground truth in `truth`. The output is printed to the output stream `io`, which defaults to `stdout` if not provided. """ function summarise(io::IO, clusts::ClustLabelVector, truth::ClustLabelVector) temp = evaluateclustering(clusts, truth) - printstyled(io, "Clustering summary\n"; color = :green, bold = true) + printstyled(io, "Clustering summary\n"; color=:green, bold=true) println(io, "Number of clusters : $(length(unique(clusts)))") println(io, "Normalised Binder loss : $(temp.nbloss)") println(io, "Adjusted Rand Index : $(temp.ari)") @@ -43,4 +41,4 @@ end function summarise(clusts::ClustLabelVector, truth::ClustLabelVector) summarise(stdout, clusts, truth) -end \ No newline at end of file +end diff --git a/src/types.jl b/src/types.jl index 89e0de7..52d7b50 100644 --- a/src/types.jl +++ b/src/types.jl @@ -1,6 +1,5 @@ -using LinearAlgebra: I, Diagonal - const ClustLabelType = Int + "Alias for `Vector{Int}`" const ClustLabelVector = Vector{ClustLabelType} @@ -8,21 +7,21 @@ const _numGibbs_default = 5 const _numMH_default = 1 """ - MCMCOptionsList(; - numiters = 5000, - burnin = floor(0.2 * numiters), - thin = 1, - numGibbs = 5, - numMH = 1) + MCMCOptionsList(; + numiters = 5000, + burnin = floor(0.2 * numiters), + thin = 1, + numGibbs = 5, + numMH = 1) -List of options for running the MCMC. + List of options for running the MCMC. -# Constructor arguments -- `numiters::Integer`: number of iterations to run. -- `burnin::Integer`: number of iterations to discard as burn-in. -- `thin::Integer`: will keep every `thin` samples. -- `numGibbs:Integer`: number of intermediate Gibbs scans in the split-merge step. -- `numMH:Integer`: number of split-merge steps per MCMC iteration. + # Constructor arguments + - `numiters::Integer`: number of iterations to run. + - `burnin::Integer`: number of iterations to discard as burn-in. + - `thin::Integer`: will keep every `thin` samples. + - `numGibbs:Integer`: number of intermediate Gibbs scans in the split-merge step. + - `numMH:Integer`: number of split-merge steps per MCMC iteration. 
""" struct MCMCOptionsList numiters::Int @@ -32,11 +31,11 @@ struct MCMCOptionsList numMH::Int numsamples::Int function MCMCOptionsList(; - numiters::Int = 5000, - burnin::Int = Integer(floor(0.2 * numiters)), - thin::Int = 1, - numGibbs::Int = _numGibbs_default, - numMH::Int = _numMH_default) + numiters::Int=5000, + burnin::Int=Integer(floor(0.2 * numiters)), + thin::Int=1, + numGibbs::Int=_numGibbs_default, + numMH::Int=_numMH_default) # input validation if numiters < 1 error("numiters must be ≥ 1.") @@ -59,7 +58,7 @@ struct MCMCOptionsList end function Base.show(io::IO, ::MIME"text/plain", options::MCMCOptionsList) - printstyled(io, "MCMC Options\n"; color = :green, bold = true) + printstyled(io, "MCMC Options\n"; color=:green, bold=true) println(io, "$(options.numiters) iteration$(options.numiters != 1 ? "s" : "")") println(io, "$(options.burnin) burnin iteration$(options.burnin != 1 ? "s" : "")") println(io, "$(options.numsamples) sample$(options.numsamples != 1 ? "s" : "")") @@ -68,9 +67,9 @@ function Base.show(io::IO, ::MIME"text/plain", options::MCMCOptionsList) end @doc raw""" - PriorHyperparamsList(; [kwargs]) +PriorHyperparamsList(; [kwargs]) -Contains the prior hyperparameters for the model. It is recommended to set the values using the [`fitprior`](@ref) function. Do not set these values manually unless you know what you are doing. +Contains the prior hyperparameters for the model. It is recommended to set the values using the [`fitprior`](@ref) function. Do not set these values manually unless you know what you are doing. # Constructor arguments - `δ1::Float64 = 1`: the parameter ``\delta_1``. @@ -83,46 +82,46 @@ Contains the prior hyperparameters for the model. It is recommended to set the v - `σ::Float64 = 1`: the rate parameter ``\sigma`` in the prior for ``r``. - `u::Float64 = 1`: the parameter ``u`` in the prior for ``p``. - `v::Float64 = 1`: the parameter ``v`` in the prior for ``p``. -- `proposalsd_r::Float64 = 1`: standard deviation of the truncated Normal proposal when sampling `r` via a Metropolis-Hastings step. +- `proposalsd_r::Float64 = 1`: standard deviation of the truncated Normal proposal when sampling `r` via a Metropolis-Hastings step. - `K_initial::Int = 1`: initial number of clusters to start the MCMC. -- `repulsion::Bool = true`: whether to use the repulsive component of the likelihood. +- `repulsion::Bool = true`: whether to use the repulsive component of the likelihood. - `maxK::Int = 0`: maxmimum number of clusters to allow. If set to 0 then no maximum is imposed. 
# See also [`fitprior`](@ref) """ Base.@kwdef mutable struct PriorHyperparamsList - δ1::Float64 = 1 + δ1::Float64 = 1 δ2::Float64 = 1 - α::Float64 = 1 - β::Float64 = 1 - ζ::Float64 = 1 - γ::Float64 = 1 - η::Float64 = 1 - σ::Float64 = 1 + α::Float64 = 1 + β::Float64 = 1 + ζ::Float64 = 1 + γ::Float64 = 1 + η::Float64 = 1 + σ::Float64 = 1 proposalsd_r::Float64 = sqrt(η) / σ - u::Float64 = 1 - v::Float64 = 1 + u::Float64 = 1 + v::Float64 = 1 K_initial::Int = 1 repulsion::Bool = true maxK::Int = 0 end function Base.show(io::IO, ::MIME"text/plain", params::PriorHyperparamsList) - printstyled(io, "Model Hyperparameters\n"; color = :green, bold = true) - printstyled(io, "Likelihood Hyperparameters\n"; color = :magenta, bold = true) + printstyled(io, "Model Hyperparameters\n"; color=:green, bold=true) + printstyled(io, "Likelihood Hyperparameters\n"; color=:magenta, bold=true) println(io, "δ₁ = $(prettynumber(params.δ1))") println(io, "δ₂ = $(prettynumber(params.δ2))") println(io, "α = $(prettynumber(params.α))") println(io, "β = $(prettynumber(params.β))") println(io, "ζ = $(prettynumber(params.ζ))") println(io, "γ = $(prettynumber(params.γ))") - printstyled(io, "Partition Prior Hyperparameters\n"; color = :magenta, bold = true) + printstyled(io, "Partition Prior Hyperparameters\n"; color=:magenta, bold=true) println(io, "η = $(prettynumber(params.η))") println(io, "σ = $(prettynumber(params.σ))") println(io, "u = $(prettynumber(params.u))") println(io, "v = $(prettynumber(params.v))") - printstyled(io, "Miscellaneous Hyperparameters\n"; color = :magenta, bold = true) + printstyled(io, "Miscellaneous Hyperparameters\n"; color=:magenta, bold=true) println(io, "Proposal standard deviation for sampling r = $(prettynumber(params.proposalsd_r))") println(io, "Repulsion is enabled? $(params.repulsion)") println(io, "Maximum number of clusters = $(params.maxK > 0 ? params.maxK : "none")") @@ -130,66 +129,66 @@ function Base.show(io::IO, ::MIME"text/plain", params::PriorHyperparamsList) end Base.@kwdef mutable struct MCMCState - clusts::ClustLabelVector + clusts::ClustLabelVector r::Float64 p::Float64 - clustsizes::Vector{Int} = counts(clusts, 1:length(clusts)) + clustsizes::Vector{Int} = counts(clusts, 1:length(clusts)) K::Int = sum(clustsizes .> 0) end """ - MCMCData(points::AbstractVector{<:AbstractVector{<:Float64}}) - MCMCData(dissimilaritymatrix::AbstractMatrix{Float64}) - -Contains the pairwise dissimilarities for the MCMC sampler. +MCMCData(points::AbstractVector{<:AbstractVector{<:Float64}}) +MCMCData(dissimilaritymatrix::AbstractMatrix{Float64}) + +Contains the pairwise dissimilarities for the MCMC sampler. 
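Both constructors are sketched below (assuming `pnts` is a vector of observation vectors and `D` a symmetric dissimilarity matrix):

```julia
data = MCMCData(pnts)   # computes pairwise Euclidean dissimilarities internally
data = MCMCData(D)      # uses a precomputed dissimilarity matrix directly
```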
""" -struct MCMCData +struct MCMCData D::Matrix{Float64} logD::Matrix{Float64} - function MCMCData(D::AbstractMatrix{Float64}) + function MCMCData(D::AbstractMatrix{Float64}) if any(D .!= D') error("D must be symmetric.") end if size(D, 1) != size(D, 2) error("D must be a square matrix.") end - new(Matrix(D), Matrix(log.(D .- Diagonal(D) .+ I(size(D,1))))) + new(Matrix(D), Matrix(log.(D .- Diagonal(D) .+ I(size(D, 1))))) end end function MCMCData(pnts::AbstractVector{<:AbstractVector{<:Float64}}) - D = pairwise(Euclidean(), makematrix(pnts), dims = 2) + D = pairwise(Euclidean(), makematrix(pnts), dims=2) MCMCData(D) end function Base.show(io::IO, data::MCMCData) - printstyled(io, "MCMC data : "; color = :green, bold = true) + printstyled(io, "MCMC data : "; color=:green, bold=true) println(io, "$(size(data.D, 1))×$(size(data.D, 1)) dissimilarity matrix.") end """ -Struct containing MCMC samples. +Struct containing MCMC samples. # Fields -- `options::MCMCOptionsList`: options passed to the sampler. -- `params::PriorHyperparamsList`: prior hyperparameters used by the sampler. -- `clusts::Vector{ClustLabelVector}`: contains the clustering allocations. `clusts[i]` is a vector containing the clustering allocation from the `i`th sample. -- `posterior_coclustering::Matrix{Float64}`: the posterior coclustering matrix. +- `options::MCMCOptionsList`: options passed to the sampler. +- `params::PriorHyperparamsList`: prior hyperparameters used by the sampler. +- `clusts::Vector{ClustLabelVector}`: contains the clustering allocations. `clusts[i]` is a vector containing the clustering allocation from the `i`th sample. +- `posterior_coclustering::Matrix{Float64}`: the posterior coclustering matrix. - `K::Vector{Int}`: posterior samples of ``K``, i.e., the number of clusters. `K[i]` is the number of clusters in `clusts[i]`. - `r::Vector{Float64}`: posterior samples of the parameter ``r``. - `p::Vector{Float64}`: posterior samples of the parameter ``p``. -- `K_mean`, `r_mean`, `p_mean`: posterior mean of `K`, `r`, and `p` respectively. -- `K_variance`, `r_variance`, `p_variance`: posterior variance of `K`, `r`, and `p` respectively. -- `K_acf::Vector{Float64}`, `r_acf::Vector{Float64}`, `p_acf::Vector{Float64}`: autocorrelation function for `K`, `r`, and `p` respectively. -- `K_iac`, `r_iac`, and `p_iac`: integrated autocorrelation coefficient for `K`, `r`, and `p` respectively. +- `K_mean`, `r_mean`, `p_mean`: posterior mean of `K`, `r`, and `p` respectively. +- `K_variance`, `r_variance`, `p_variance`: posterior variance of `K`, `r`, and `p` respectively. +- `K_acf::Vector{Float64}`, `r_acf::Vector{Float64}`, `p_acf::Vector{Float64}`: autocorrelation function for `K`, `r`, and `p` respectively. +- `K_iac`, `r_iac`, and `p_iac`: integrated autocorrelation coefficient for `K`, `r`, and `p` respectively. - `K_ess::Float64`, `r_ess::Float64`, and `p_ess::Float64`: effective sample size for `K`, `r`, and `p` respectively. -- `loglik::Vector{Float64}`: log-likelihood for each sample. +- `loglik::Vector{Float64}`: log-likelihood for each sample. - `logposterior::Vector{Float64}`: a function proportional to the log-posterior for each sample, with constant of proportionality equal to the normalising constant of the partition prior. - `splitmerge_splits`: Boolean vector indicating the iterations when a split proposal was used in the split-merge step. Has length `numMH * numiters` (see [`MCMCOptionsList`](@ref)). -- `splitmerge_acceptance_rate`: acceptance rate of the split-merge proposals. 
 """
-struct MCMCData
+struct MCMCData
     D::Matrix{Float64}
     logD::Matrix{Float64}
-    function MCMCData(D::AbstractMatrix{Float64})
+    function MCMCData(D::AbstractMatrix{Float64})
         if size(D, 1) != size(D, 2)
             error("D must be a square matrix.")
         end
         if any(D .!= D')
             error("D must be symmetric.")
         end
-        new(Matrix(D), Matrix(log.(D .- Diagonal(D) .+ I(size(D,1)))))
+        new(Matrix(D), Matrix(log.(D .- Diagonal(D) .+ I(size(D, 1)))))
     end
 end

 function MCMCData(pnts::AbstractVector{<:AbstractVector{<:Float64}})
-    D = pairwise(Euclidean(), makematrix(pnts), dims = 2)
+    D = pairwise(Euclidean(), makematrix(pnts), dims=2)
     MCMCData(D)
 end

 function Base.show(io::IO, data::MCMCData)
-    printstyled(io, "MCMC data : "; color = :green, bold = true)
+    printstyled(io, "MCMC data : "; color=:green, bold=true)
     println(io, "$(size(data.D, 1))×$(size(data.D, 1)) dissimilarity matrix.")
 end

 """
-Struct containing MCMC samples.
+Struct containing MCMC samples.

 # Fields
-- `options::MCMCOptionsList`: options passed to the sampler.
-- `params::PriorHyperparamsList`: prior hyperparameters used by the sampler.
-- `clusts::Vector{ClustLabelVector}`: contains the clustering allocations. `clusts[i]` is a vector containing the clustering allocation from the `i`th sample.
-- `posterior_coclustering::Matrix{Float64}`: the posterior coclustering matrix.
+- `options::MCMCOptionsList`: options passed to the sampler.
+- `params::PriorHyperparamsList`: prior hyperparameters used by the sampler.
+- `clusts::Vector{ClustLabelVector}`: contains the clustering allocations. `clusts[i]` is a vector containing the clustering allocation from the `i`th sample.
+- `posterior_coclustering::Matrix{Float64}`: the posterior coclustering matrix.
 - `K::Vector{Int}`: posterior samples of ``K``, i.e., the number of clusters. `K[i]` is the number of clusters in `clusts[i]`.
 - `r::Vector{Float64}`: posterior samples of the parameter ``r``.
 - `p::Vector{Float64}`: posterior samples of the parameter ``p``.
-- `K_mean`, `r_mean`, `p_mean`: posterior mean of `K`, `r`, and `p` respectively.
-- `K_variance`, `r_variance`, `p_variance`: posterior variance of `K`, `r`, and `p` respectively.
-- `K_acf::Vector{Float64}`, `r_acf::Vector{Float64}`, `p_acf::Vector{Float64}`: autocorrelation function for `K`, `r`, and `p` respectively.
-- `K_iac`, `r_iac`, and `p_iac`: integrated autocorrelation coefficient for `K`, `r`, and `p` respectively.
+- `K_mean`, `r_mean`, `p_mean`: posterior mean of `K`, `r`, and `p` respectively.
+- `K_variance`, `r_variance`, `p_variance`: posterior variance of `K`, `r`, and `p` respectively.
+- `K_acf::Vector{Float64}`, `r_acf::Vector{Float64}`, `p_acf::Vector{Float64}`: autocorrelation function for `K`, `r`, and `p` respectively.
+- `K_iac`, `r_iac`, and `p_iac`: integrated autocorrelation coefficient for `K`, `r`, and `p` respectively.
 - `K_ess::Float64`, `r_ess::Float64`, and `p_ess::Float64`: effective sample size for `K`, `r`, and `p` respectively.
-- `loglik::Vector{Float64}`: log-likelihood for each sample.
+- `loglik::Vector{Float64}`: log-likelihood for each sample.
 - `logposterior::Vector{Float64}`: a function proportional to the log-posterior for each sample, with constant of proportionality equal to the normalising constant of the partition prior.
 - `splitmerge_splits`: Boolean vector indicating the iterations when a split proposal was used in the split-merge step. Has length `numMH * numiters` (see [`MCMCOptionsList`](@ref)).
-- `splitmerge_acceptance_rate`: acceptance rate of the split-merge proposals.
-- `r_acceptances`: Boolean vector indicating the iterations (including burnin and the thinned-out iterations) where the Metropolis-Hastings proposal for `r` was accepted.
+- `splitmerge_acceptance_rate`: acceptance rate of the split-merge proposals.
+- `r_acceptances`: Boolean vector indicating the iterations (including burnin and the thinned-out iterations) where the Metropolis-Hastings proposal for `r` was accepted.
 - `r_acceptance_rate`: Metropolis-Hastings acceptance rate for `r`.
 - `runtime`: total runtime for all iterations.
-- `mean_iter_time`: average time taken for each iteration.
+- `mean_iter_time`: average time taken for each iteration.
 """
 Base.@kwdef mutable struct MCMCResult
     clusts::Vector{ClustLabelVector}
@@ -231,7 +230,7 @@ Base.@kwdef mutable struct MCMCResult
         numiters = options.numiters
         numsamples = options.numsamples
         numMH = options.numMH
-        x.clusts = [zeros(ClustLabelType, numpts) for i in 1:numsamples]
+        x.clusts = [zeros(ClustLabelType, numpts) for _ in 1:numsamples]
         x.posterior_coclustering = zeros(numpts, numpts)
         x.K = Int.(zeros(numsamples))
         x.K_acf = zeros(numsamples)
@@ -249,8 +248,8 @@ Base.@kwdef mutable struct MCMCResult
 end

 function Base.show(io::IO, ::MIME"text/plain", result::MCMCResult)
-    printstyled(io, "MCMC Summary\n"; color = :green, bold = true)
-    printstyled(io, "General\n"; color = :magenta, bold = true)
+    printstyled(io, "MCMC Summary\n"; color=:green, bold=true)
+    printstyled(io, "General\n"; color=:magenta, bold=true)
     println(io, "$(result.options.numiters) iteration$(result.options.numiters != 1 ? "s" : "")")
     println(io, "$(result.options.burnin) iteration$(result.options.burnin != 1 ? "s" : "") discarded as burnin")
     println(io, "$(result.options.numsamples) sample$(result.options.numsamples != 1 ? "s" : "")")
@@ -263,21 +262,21 @@ function Base.show(io::IO, ::MIME"text/plain", result::MCMCResult)
     println(io, "Runtime = $(prettytime(result.runtime))")
     println(io, "Time per iteration : $(prettytime(result.mean_iter_time))")
     println(io, "")
-    printstyled(io, "Summary for K\n"; color = :magenta, bold = true)
+    printstyled(io, "Summary for K\n"; color=:magenta, bold=true)
     println(io, "IAC : $(prettynumber(result.K_iac))")
     println(io, "ESS : $(prettynumber(result.K_ess))")
     println(io, "ESS per sample : $(prettynumber(result.K_ess / result.options.numsamples))")
     println(io, "Posterior mean : $(prettynumber(result.K_mean))")
     println(io, "Posterior variance : $(prettynumber(result.K_variance))")
     println(io, "")
-    printstyled(io, "Summary for r\n"; color = :magenta, bold = true)
+    printstyled(io, "Summary for r\n"; color=:magenta, bold=true)
     println(io, "IAC : $(prettynumber(result.r_iac))")
     println(io, "ESS : $(prettynumber(result.r_ess))")
     println(io, "ESS per sample : $(prettynumber(result.r_ess / result.options.numsamples))")
     println(io, "Posterior mean : $(prettynumber(result.r_mean))")
     println(io, "Posterior variance : $(prettynumber(result.r_variance))")
     println(io, "")
-    printstyled(io, "Summary for p\n"; color = :magenta, bold = true)
+    printstyled(io, "Summary for p\n"; color=:magenta, bold=true)
     println(io, "IAC : $(prettynumber(result.p_iac))")
     println(io, "ESS : $(prettynumber(result.p_ess))")
     println(io, "ESS per sample : $(prettynumber(result.p_ess / result.options.numsamples))")
diff --git a/src/utils.jl b/src/utils.jl
index a92ff88..e9a2820 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,11 +1,3 @@
-using Distributions: Dirichlet, MvNormal, pdf
-using LinearAlgebra: I
-using LoopVectorization: @turbo
-using Random: rand, AbstractRNG, TaskLocalRNG, seed!
-using StatsBase: autocor, wsample, levelsmap, mean
-using Printf: @sprintf
-using Dates
-
 # use the Gumbel-max trick to sample from a vector of discrete log-probabilities
 @inline function sample_logweights(logprobs::AbstractVector{Float64})::Int
     logprobs .-= minimum(logprobs)
@@ -50,39 +42,39 @@ function iac_ess_acf(x::AbstractVector{<:Real})
     acf = autocor(x)
     iac = sum(acf) * 2
     ess = length(x) / iac
-    return (iac = iac, ess = ess, acf = acf)
+    return (iac=iac, ess=ess, acf=acf)
 end

 function uppertriangle(
     M::Matrix{T}
-    )::Vector{T} where {T}
+)::Vector{T} where {T}
     n = size(M, 1)
-    [M[i, j] for i in 1:n for j in (i + 1):n]
+    [M[i, j] for i in 1:n for j in (i+1):n]
 end

 """
-    adjacencymatrix(clusts::ClustLabelVector) -> Matrix{Bool}
-Returns the `n`×`n` adjacency matrix corresponding to the given cluster label vector `clusts`, where `n = length(clusts)`.
-"""
+    adjacencymatrix(clusts::ClustLabelVector) -> Matrix{Bool}
+Returns the `n`×`n` adjacency matrix corresponding to the given cluster label vector `clusts`, where `n = length(clusts)`.
+"""
 function adjacencymatrix(
     clusts::ClustLabelVector
-    )::Matrix{Bool}
+)::Matrix{Bool}
     clusts .== clusts'
 end

 """
-    sortlabels(x::ClustLabelVector) -> ClustLabelVector
+    sortlabels(x::ClustLabelVector) -> ClustLabelVector
 Returns a cluster label vector `y` such that `x` and `y` have the same adjacency structure and labels in `y` occur in sorted ascending order.
 """
 function sortlabels(
     x::ClustLabelVector
-    )::ClustLabelVector
+)::ClustLabelVector
     temp = levelsmap(x)
     [temp[i] for i in x]
 end

 """
-    generatemixture(N, K; [α, dim, radius, σ, rng])
+    generatemixture(N, K; [α, dim, radius, σ, rng])

 Generates a multivariate Normal mixture, with kernel weights generated from a Dirichlet prior.
 The kernels are centred at the vertices of a `dim`-dimensional simplex with edge length `radius`.
@@ -106,8 +98,8 @@ Named tuple containing the following fields-
 - `probs::Float64`: vector of `K` cluster weights generated from the Dirichlet prior, used to generate the observations.
 - `oracle_coclustering::Matrix{Float64}`: `N`×`N` matrix of co-clustering probabilities, calculated assuming full knowledge of the cluster centres and cluster weights.
 """
-function generatemixture(N::Integer, K::Integer; α::Real = K, dim::Real = K, radius::Real = 1, σ::Real = 0.1, rng::Union{AbstractRNG, <:Integer} = TaskLocalRNG())
-    # input validation
+function generatemixture(N::Integer, K::Integer; α::Real=K, dim::Real=K, radius::Real=1, σ::Real=0.1, rng::Union{AbstractRNG,<:Integer}=TaskLocalRNG())
+    # input validation
     N < 1 && throw(ArgumentError("N must be at least 1."))
     (K < 1 || K > N) && throw(ArgumentError("K must satisfy 1 ≤ K ≤ N."))
     α ≤ 0 && throw(ArgumentError("α must be positive."))
@@ -124,7 +116,7 @@ function generatemixture(N::Integer, K::Integer; α::Real = K, dim::Real = K, ra
     # generate cluster centres
     clust_centres = fill(zeros(dim), K)
     for i = 1:K
-        clust_centres[i] = cat(zeros(i - 1), radius, zeros(dim - i); dims = 1)
+        clust_centres[i] = cat(zeros(i - 1), radius, zeros(dim - i); dims=1)
     end

     # generate points
@@ -138,7 +130,7 @@ function generatemixture(N::Integer, K::Integer; α::Real = K, dim::Real = K, ra
     numiters = 5000
     oracle_posterior = zeros(K, N)
     oracle_coclustering = zeros(N, N)
-    for c in 1:numiters
+    for _ in 1:numiters
         tempprobs = rand(rng, Dirichlet(K, α))
         for i = 1:N
             for j = 1:K
@@ -149,13 +141,13 @@ function generatemixture(N::Integer, K::Integer; α::Real = K, dim::Real = K, ra
         oracle_coclustering += oracle_posterior' * oracle_posterior
     end
     oracle_coclustering ./= numiters # oracle coclustering matrix
-    distM = [pnts[i][j] for i in 1:N, j in 1:dim]' |>
-    (x -> pairwise(Euclidean(), x, dims = 2))
-    return (points = pnts, distancematrix = distM, clusts = clusts, probs = probs, oracle_coclustering = oracle_coclustering)
+    distM = [pnts[i][j] for i in 1:N, j in 1:dim]' |>
+            (x -> pairwise(Euclidean(), x, dims=2))
+    return (points=pnts, distancematrix=distM, clusts=clusts, probs=probs, oracle_coclustering=oracle_coclustering)
 end

 """
-    makematrix(x::AbstractVector{<:AbstractVector}) -> Matrix
+    makematrix(x::AbstractVector{<:AbstractVector}) -> Matrix

 Convert a vector of vectors into a matrix, where each vector becomes a column in the matrix.
 """
@@ -176,7 +168,7 @@ function prettytime(t)
     x = Dates.canonicalize(Dates.Second(Integer(floor(t))))
     out = ""
     for i in 1:length(x.periods)
-        out *= string(x.periods[i].value) * " "
+        out *= string(x.periods[i].value) * " "
         if typeof(x.periods[i]) == Second
             out *= "s"
         elseif typeof(x.periods[i]) == Minute
             out *= "min"
         elseif typeof(x.periods[i]) == Hour
             out *= "h"
         elseif typeof(x.periods[i]) == Day
             out *= "day" * (x.periods[i].value != 1 ? "s" : "")
         end
-        if i != length(x.periods)
-            out *= " "
+        if i != length(x.periods)
+            out *= " "
         end
     end
     out
 end

-prettynumber(x) = x < 1 ? @sprintf("%.3e", x) : @sprintf("%.3f", x)
\ No newline at end of file
+prettynumber(x) = x < 1 ? @sprintf("%.3e", x) : @sprintf("%.3f", x)
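+
+# Illustrative checks for the formatting helper above (the values follow directly from the format strings):
+# prettynumber(0.000123) == "1.230e-04"   # x < 1 is printed in scientific notation
+# prettynumber(12.3456) == "12.346"       # otherwise fixed-point with three decimal places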