diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..cba9134 --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 * * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml new file mode 100644 index 0000000..a9f0087 --- /dev/null +++ b/.github/workflows/Documentation.yml @@ -0,0 +1,26 @@ +name: Documentation + +on: + push: + branches: + - master # update to match your development branch (master, main, dev, trunk, ...) + tags: '*' + pull_request: + +jobs: + build: + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: julia-actions/setup-julia@v1 + with: + version: '1.7' + - name: Install dependencies + run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' + - name: Build and deploy + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # If authenticating with GitHub Actions token + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # If authenticating with SSH deploy key + run: julia --project=docs/ docs/make.jl \ No newline at end of file diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..f49313b --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,15 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 
+ with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml new file mode 100644 index 0000000..219ee76 --- /dev/null +++ b/.github/workflows/Test.yml @@ -0,0 +1,31 @@ +name: Run tests + +on: + push: + branches: + - master # update to match your development branch (master, main, dev, trunk, ...) + tags: '*' + pull_request: + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: ['1.7', '1.8'] + julia-arch: [x64] + os: [ubuntu-latest, windows-latest, macOS-latest] + + steps: + - uses: actions/checkout@v2 + - name: Setup Julia + uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.julia-version }} + arch: ${{ matrix.julia-arch }} + - name: Build package + uses: julia-actions/julia-buildpkg@v1 + - name: Run unit tests + uses: julia-actions/julia-runtest@v1 + with: + annotate: true \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..adf2697 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/Manifest.toml +/docs/build/ +ssh_keys \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4a2b808 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Abhinav Natarajan + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..e7ce00f --- /dev/null +++ b/Project.toml @@ -0,0 +1,27 @@ +name = "RedClust" +uuid = "bf1adee6-87fe-4679-8d23-51fe99940a25" +authors = ["Abhinav Natarajan "] +version = "0.1.0" + +[deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Match = "7eb4fadd-790c-5f42-8a69-bfa0b872bfbf" +ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568" +RCall = "6f49c342-dc21-5d91-9882-a32aef131414" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" + +[compat] +Clustering = "0.13.5, 0.14" +Distances = "0.10.3" +Distributions = "0.25.42" +Match = "1.2" +ProgressBars = "1.4" +RCall = "0.13.8" +SpecialFunctions = "1.8, 2" +StatsBase = "0.33.8" +julia = "1.7" diff --git a/README.md b/README.md new file mode 100644 index 0000000..aa01566 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# RedClust + +[![Stable](https://img.shields.io/badge/docs-stable-blue?style=flat-square)](https://abhinavnatarajan.github.io/RedClust.jl/stable/) +[![Dev](https://img.shields.io/badge/docs-dev-blue?style=flat-square)](https://abhinavnatarajan.github.io/RedClust.jl/dev/) +![GitHub Workflow Status 
(branch)](https://img.shields.io/github/workflow/status/abhinavnatarajan/RedClust.jl/Run%20tests/master?style=flat-square) +![GitHub](https://img.shields.io/github/license/abhinavnatarajan/RedClust.jl?style=flat-square) +![GitHub tag (latest SemVer)](https://img.shields.io/github/v/tag/abhinavnatarajan/RedClust.jl?style=flat-square) + +## Introduction + +[RedClust](https://github.com/abhinavnatarajan/RedClust.jl) is a [Julia](https://julialang.org/) package for Bayesian clustering of high-dimensional Euclidean data using pairwise dissimilarities instead of the raw observations. It uses an MCMC sampler to generate posterior samples from the space of all possible clustering structures on the data. + +## Installation +The package can be installed by typing `]add RedClust` into the Julia REPL or by the usual method: +```julia +using Pkg +Pkg.add("RedClust") +``` +RedClust also requires [`R`](https://www.r-project.org/) and the R package [`salso`](https://CRAN.R-project.org/package=salso). If R is already installed, make sure the `R_HOME` environment variable is set to the R home directory (you could run `R.home()` in R to determine the location of this directory). If R or `salso` are not found, they are automatically installed during package installation. + +## Basic example +```julia +using RedClust +# Generate data +pnts, distM, clusts, probs, oracle_coclustering = + generatemixture(N, K; α = 10, σ = data_σ, dim = data_dim) +# Let RedClust choose the best prior hyperparameters +params = fitprior(pnts, "k-means", false).params +# Set the MCMC options +options = MCMCOptionsList(numiters = 5000) +data = MCMCData(D = distM) +# Run the sampler +result = runsampler(data, options, params) +``` + +## Citing this package +If you want to use this package in your work, please cite it as: + +Natarajan, A., De Iorio, M., Heinecke, A., Mayer, E. and Glenn, S., 2022. ‘Cohesion and Repulsion in Bayesian Distance Clustering’, arXiv [2107.05414](https://arxiv.org/abs/2107.05414). 
diff --git a/deps/build.jl b/deps/build.jl new file mode 100644 index 0000000..c5eaba3 --- /dev/null +++ b/deps/build.jl @@ -0,0 +1,6 @@ +script = +`Rscript -e 'dir.create(Sys.getenv("R_LIBS_USER"), recursive = TRUE) +if (!require("salso")) { + install.packages("salso", lib = Sys.getenv("R_LIBS_USER"), repos = "http://cran.us.r-project.org") +}'` +run(script) \ No newline at end of file diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 0000000..2d40f1f --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,103 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.8.1" +manifest_format = "2.0" +project_hash = "f7d6d28d478dfcb10e5302531e2f35d5341ef21b" + +[[deps.ANSIColoredPrinters]] +git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" +uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" +version = "0.0.1" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[deps.Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[deps.DocStringExtensions]] +deps = ["LibGit2"] +git-tree-sha1 = "5158c2b41018c5f7eb1470d558127ac274eca0c9" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.9.1" + +[[deps.Documenter]] +deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "6030186b00a38e9d0434518627426570aac2ef95" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.27.23" + +[[deps.IOCapture]] +deps = ["Logging", "Random"] +git-tree-sha1 = "f7be53659ab06ddc986428d3a9dcc95f6fa6705a" +uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" +version = "0.2.2" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = 
"0.21.3" + +[[deps.LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.Parsers]] +deps = ["Dates"] +git-tree-sha1 = "3d5bf43e3e8b412656404ed9466f1dcbf7c50269" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "2.4.0" + +[[deps.Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[deps.REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[deps.Random]] +deps = ["SHA", "Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.RedClust]] +path = ".." +uuid = "bf1adee6-87fe-4679-8d23-51fe99940a25" +version = "0.1.0" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[deps.Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..4e73f63 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,3 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +RedClust = "bf1adee6-87fe-4679-8d23-51fe99940a25" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..b23ff8b --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,26 @@ +using RedClust +using Documenter + +DocMeta.setdocmeta!(RedClust, :DocTestSetup, :(using RedClust); recursive=true) + +makedocs(; + modules=[RedClust], + 
authors="Abhinav Natarajan ", + repo="https://github.com/abhinavnatarajan/RedClust.jl/blob/{commit}{path}#{line}", + sitename="RedClust.jl", + format=Documenter.HTML(; + prettyurls=get(ENV, "CI", "false") == "true", + canonical="https://abhinavnatarajan.github.io/RedClust.jl", + edit_link="main", + assets=String[], + ), + pages=[ + "Introduction" => "index.md", + "Reference" => "reference.md" + ] +) + +deploydocs(; + repo="github.com/abhinavnatarajan/RedClust.jl", + devbranch="master" +) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..db0fef0 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,91 @@ +```@meta +CurrentModule = RedClust +``` + +# Introduction + +[RedClust](https://github.com/abhinavnatarajan/RedClust.jl) is a [Julia](https://julialang.org/) package for Bayesian clustering of high-dimensional Euclidean data using pairwise dissimilarity information instead of the raw observations. It uses an MCMC sampler to generate posterior samples from the space of all possible clustering structures on the data. + +## Installation +The package can be installed by typing `]add RedClust` into the Julia REPL or by the usual method: +```julia +using Pkg +Pkg.add("RedClust") +``` +RedClust also requires [`R`](https://www.r-project.org/) and the R package [`salso`](https://CRAN.R-project.org/package=salso). If R is already installed, make sure the `R_HOME` environment variable is set to the R home directory (you could run `R.home()` in R to determine the location of this directory). If R or salso are not found, they are automatically installed during package installation. 
+ +## Basic example +```julia +using RedClust +# Generate data +pnts, distM, clusts, probs, oracle_coclustering = + generatemixture(N, K; α = 10, σ = data_σ, dim = data_dim) +# Let RedClust choose the best prior hyperparameters +params = fitprior(pnts, "k-means", false).params +# Set the MCMC options +options = MCMCOptionsList(numiters = 5000) +data = MCMCData(D = distM) +# Run the sampler +result = runsampler(data, options, params) +``` + +## Model +RedClust implements the model described in Natarajan et al. (2022). The key features are: +1. The use of a random partition model with an unknown number of clusters ``K`` which allows for posterior inference on ``K``. That is, there is a prior distribution on the space of all possible clustering structures with any number of clusters from one to the number of observations. The number of clusters is an object of inference to be determined by MCMC sampling. +2. The pairwise dissimilarities between observations are assumed to be mutually independent given the clustering assignment; that is, the clustering likelihood is a *composite* or *pseudo-likelihood*. +3. The clustering likelihood is comprised of a *cohesive* part and a *repulsive* part. + +The prior on the clustering structure can be chosen according to the application in question. The current version of RedClust implements a *microclustering* prior (Miller et al., 2015; Betancourt et al., 2022). This means that the partition is generated by drawing cluster sizes ``n_1, \ldots, n_K`` from a random distribution ``\nu`` (conditional upon ``n_1 + \ldots + n_K = n`` where ``n`` is the number of observations), and cluster labels are given by a uniform random permutation of +```math +\underbrace{1, \ldots, 1}_{n_1 \text{ times}}, \ldots, \underbrace{K, \ldots, K}_{n_K \text{ times}} +``` +RedClust implements a microclustering prior with ``\nu`` a shifted negative binomial with random parameters ``r`` and ``p``.
+```math +\begin{align*} +r &\sim \mathrm{Gamma}(\eta, \sigma)\\ +p &\sim \mathrm{Beta}(u, v)\\ +\pi(\rho_n \mid r, p) &\propto K! p^{n-K}(1-p)^{rK}\Gamma(r)^{-K}\prod_{k=1}^K n_k \Gamma(n_k+r-1) +\end{align*} +``` +where ``\rho_n`` is the partition and ``K`` is the number of clusters in the partition. The partition likelihood is given by +```math +\begin{align*} +\lambda_k &\overset{\mathrm{iid}}{\sim} \mathrm{Gamma}(\alpha, \beta), \quad 1 \leq k \leq K\\ +\theta_{kt} &\overset{\mathrm{iid}}{\sim} \mathrm{Gamma}(\zeta, \gamma), \quad 1 \leq k < t \leq K +\end{align*} +``` +```math +\pi(\boldsymbol D \mid \rho_n, \boldsymbol{\lambda}, \boldsymbol{\theta}, r, p) = \left[ \prod_{k=1}^K \prod_{\substack{i, j \in C_k \\ i \neq j}} \frac{D_{ij}^{\delta_1 - 1}\lambda_k^{\delta_1}}{\Gamma(\delta_1)} \exp(-\lambda_k D_{ij}) \right] \left[\prod_{\substack{t, k = 1 \\ t < k}}^K \prod_{\substack{i \in C_k \\ j \in C_t}} \frac{D_{ij}^{\delta_2-1}\theta_{kt}^{\delta_2}}{\Gamma(\delta_2)} \exp(-\theta_{kt}D_{ij}) \right] +``` +where ``\boldsymbol D`` is the matrix of pairwise dissimilarities between observations. + +## Point estimation +A clustering point-estimate ``\boldsymbol c^*`` can be determined by searching for a clustering that minimises the expectation of the Variation of Information distance (Wade and Ghahramani, 2018) from a clustering ``\boldsymbol c'`` chosen randomly from the posterior distribution. That is, +```math +\boldsymbol c^* = \argmin_{\boldsymbol c} \mathbb{E}_{\boldsymbol c'}[d_{VI}(\boldsymbol c, \boldsymbol c')] +``` +1. A naive method is to restrict the search space to those clusterings visited by the MCMC sampler. +2. A better method is the SALSO algorithm (Dahl et al., 2022), implemented in the [R](https://www.r-project.org/) package [salso](https://CRAN.R-project.org/package=salso). +The MCMC sampler in RedClust automatically computes a point estimate using one of the methods above. The choice of method can be specified in the options passed to the sampler.
See [`MCMCOptionsList`](@ref). + +## Citing this package +If you want to use this package in your work, please cite it as: + +Natarajan, A., De Iorio, M., Heinecke, A., Mayer, E. and Glenn, S., 2022. ‘Cohesion and Repulsion in Bayesian Distance Clustering’, arXiv [2107.05414](https://arxiv.org/abs/2107.05414). + +## References +Betancourt, B., Zanella, G. and Steorts, R. C. (2022), ‘Random partition models for microclustering +tasks’, _Journal of the American Statistical Association_ 117(539), 1215–1227. DOI: [10.1080/01621459.2020.1841647](https://doi.org/10.1080/01621459.2020.1841647). + +Dahl, D. B., Johnson, D. J. and Müller, P. (2022), ‘Search algorithms and loss functions +for Bayesian clustering’, _Journal of Computational and Graphical Statistics_ 0(0), 1–13. DOI: [10.1080/10618600.2022.2069779](https://doi.org/10.1080/10618600.2022.2069779). + +Miller, J., Betancourt, B., Zaidi, A., Wallach, H. and Steorts, R. C. (2015), ‘Microclustering: +When the cluster sizes grow sublinearly with the size of the data set’, arXiv +[1512.00792](https://arxiv.org/abs/1512.00792). + +Wade, S. and Ghahramani, Z. (2018), ‘Bayesian Cluster Analysis: Point Estimation and +Credible Balls (with Discussion)’, _Bayesian Analysis_ **13**(2), 559 – 626. DOI: [10.1214/17-BA1073](https://doi.org/10.1214/17-BA1073).
+ + + diff --git a/docs/src/reference.md b/docs/src/reference.md new file mode 100644 index 0000000..8c1b837 --- /dev/null +++ b/docs/src/reference.md @@ -0,0 +1,28 @@ +# Reference + +## Main functions +```@autodocs +Modules = [RedClust] +Pages = ["Prior.jl", "MCMC.jl"] +Order = [:function] +``` + +## Summary and clustering evaluation +```@autodocs +Modules = [RedClust] +Pages = ["SummaryFunctions.jl"] +Order = [:function] +``` + +## Convenience functions +```@autodocs +Modules = [RedClust] +Pages = ["Utility.jl"] +Order = [:function] +``` + +## Types +```@autodocs +Modules = [RedClust] +Order = [:type] +``` diff --git a/examples/basic_example.jl b/examples/basic_example.jl new file mode 100644 index 0000000..dff8e57 --- /dev/null +++ b/examples/basic_example.jl @@ -0,0 +1,62 @@ +using RedClust +using Random: seed! +using CairoMakie +using StatsBase: counts + +# Generate data +seed!(44) +K = 10 # Number of clusters +N = 100 # Number of points +data_σ = 0.25 # Variance of the normal kernel +data_dim = 10 # Data dimension +data = generatemixture(N, K; α = 10, σ = data_σ, dim = data_dim) +pnts, distM, clusts, probs, oracle_coclustering = data +# See the true adjacency matrix and oracle co-clustering matrices +fig1 = Figure() +Axis(fig1[1, 1], aspect = 1) +heatmap!(adjacencymatrix(clusts), colormap = :Blues) +fig1 +fig2 = Figure() +Axis(fig2[1, 1], aspect = 1) +heatmap!(oracle_coclustering, colormap = :Blues) +fig2 + +# Determine the best the prior hyperparameters +params = fitprior(pnts, "k-means", false).params + +# MCMC options +options = MCMCOptionsList(usesalso = true) +data = MCMCData(D = distM) + +# Run the sampler +result = runsampler(data, options, params) + +# Check the results +# Posterior coclustering matrix +fig3 = Figure() +Axis(fig3[1, 1], aspect = 1) +heatmap!(result.posterior_coclustering, colormap = :Blues) +fig3 +# Point-estimate adjacency matrix +fig4 = Figure() +Axis(fig4[1, 1], aspect = 1) +heatmap!(adjacencymatrix(result.pntestimate), colormap = 
:Blues) +fig4 + +# Posterior distribution of K +K_barplot = Figure(resolution = (400, 300), fontsize = 20) +ax = Axis(K_barplot[1, 1]) +barplot!(ax, minimum(unique(result.K)):maximum(unique(result.K)), counts(result.K)) +K_barplot +# Posterior distribution of r +r_hist = Figure(resolution = (400, 300), fontsize = 20) +ax = Axis(r_hist[1, 1]) +hist!(ax, result.r, bins = 25) +r_hist +# Posterior distribution of p +p_hist = Figure(resolution = (400, 300), fontsize = 20) +ax = Axis(p_hist[1, 1]) +hist!(ax, result.p, bins = 25) +p_hist +# Summary of MCMC and point estimate +summarise(result, clusts); \ No newline at end of file diff --git a/src/MCMC.jl b/src/MCMC.jl new file mode 100644 index 0000000..8aa62c6 --- /dev/null +++ b/src/MCMC.jl @@ -0,0 +1,544 @@ +using Random: rand +using StatsBase: sample, counts, mean_and_var, mean, var +using Distributions: Normal, Uniform, Gamma, Beta, logpdf, truncated +using SpecialFunctions: loggamma +using LinearAlgebra: I +using Clustering: kmedoids, varinfo +using ProgressBars: ProgressBar +using RCall: rcopy, @R_str + + + +function loglik( + data::MCMCData, + state::MCMCState, + params::PriorHyperparamsList, + clustering::Bool, + partition::Bool + )::Float64 + D = data.D + logD = data.logD + clusts = state.clusts + clustsizes = state.clustsizes + K = state.K + r = state.r + p = state.p + δ1 = params.δ1 + δ2 = params.δ2 + α = params.α + β = params.β + ζ = params.ζ + γ = params.γ + η = params.η + σ = params.σ + u = params.u + v = params.v + repulsion = params.repulsion + + C = findall(clustsizes .> 0) + n = size(D, 1) + L = 0 + + if clustering + # Cohesive part of likelihood + for k = 1:K + clust_k = clusts .== C[k] + sz_k = clustsizes[C[k]] + pairs_k = numpairs(sz_k) + a = α + δ1 * pairs_k + b = β + sum(D[clust_k, clust_k]) / 2 + L += (δ1 - 1) * sum(logD[clust_k, clust_k]) / 2 - pairs_k * loggamma(δ1) + + α * log(β) - loggamma(α) + + loggamma(a) - a * log(b) + end + + # Repulsive part of likelihood + if repulsion + for k = 1:K + 
clust_k = clusts .== C[k] + sz_k = clustsizes[C[k]] + for t = (k + 1):K + clust_t = clusts .== C[t] + sz_t = clustsizes[C[t]] + pairs_kt = sz_k * sz_t + z = ζ + δ2 * pairs_kt + g = γ + sum(D[clust_k, clust_t]) + L += (δ2 - 1) * sum(logD[clust_k, clust_t]) - pairs_kt * loggamma(δ2) + + ζ * log(γ) - loggamma(ζ) + + loggamma(z) - z * log(g) + end + end + end + end + + if partition + L += loggamma(K + 1) + (n - K) * log(p) + (r * K) * log(1 - p) - K * loggamma(r) + for nj in clustsizes[C] + L += log(nj) + loggamma(nj + r - 1) + end + end + return L +end + +function sample_r!(state::MCMCState, params::PriorHyperparamsList) + r = state.r + p = state.p + clustsizes = state.clustsizes + C = clustsizes[findall(clustsizes .> 0)] + K = state.K + η = params.η + σ = params.σ + proposalsd_r = params.proposalsd_r + temp = sample_r(r, p, C, K, η, σ, proposalsd_r) + state.r = temp.r + return (r = temp.r, accept = temp.accept) +end + +function sample_r( + r::Float64, + p::Float64, + C::Vector{Int}, + K::Integer, + η::Float64, + σ::Float64, + proposalsd_r::Float64 + ) + # proposal + proposal_distribution = truncated( + Normal(r, proposalsd_r), + lower = 0, + upper = Inf + ) + r_candidate = rand(proposal_distribution) + reverse_distribution = truncated( + Normal(r_candidate, proposalsd_r), + lower = 0, + upper = Inf + ) + + # calculate transition probability + logprobcandidate = (η-1) * log(r_candidate) + K * (r_candidate * log(1-p) - loggamma(r_candidate)) - r_candidate * σ + logprobcurrent = (η-1) * log(r) + K * (r * log(1-p) - loggamma(r)) - r * σ + for nk in C + logprobcandidate = logprobcandidate + loggamma(nk - 1 + r_candidate) + logprobcurrent = logprobcurrent + loggamma(nk - 1 + r) + end + # Log-ratio of proposal densities + logproposalratio = logpdf(proposal_distribution, r_candidate) - + logpdf(reverse_distribution, r) + + rnew = r + accept = false + # Metropolis acceptance step + if log(rand(Uniform())) < + minimum([0, logprobcandidate - logprobcurrent - logproposalratio]) 
# accept + rnew = r_candidate + accept = true + end + return (r = rnew, accept = accept) +end + +function sample_p!(state::MCMCState, params::PriorHyperparamsList)::Float64 + K = state.K + n = length(state.clusts) + r = state.r + u = params.u + v = params.v + state.p = sample_p(K, n, r, u, v) +end + +function sample_p( + K::Int, + n::Int, + r::Float64, + u::Float64, + v::Float64 + )::Float64 + rand(Beta(n - K + u, r * K + v)) +end + +## Sample clustering allocation labels via Gibbs sampling +function sample_labels_Gibbs!( + data::MCMCData, + state::MCMCState, + params::PriorHyperparamsList, + items_to_reallocate::Vector{Int} = Vector(1:length(state.clusts)), + candidate_clusts::Vector{ClustLabelType} = ClustLabelType[], + final_clusts::ClustLabelVector = ClustLabelVector() + ) + + D = data.D + logD = data.logD + clusts = state.clusts + clustsizes = state.clustsizes + n = length(clusts) + r = state.r + p = state.p + δ1 = params.δ1 + δ2 = params.δ2 + α = params.α + β = params.β + ζ = params.ζ + γ = params.γ + repulsion = params.repulsion + maxK = params.maxK + free_label_set = isempty(candidate_clusts) # free allocation of cluster labels is allowed + free_allocation = isempty(final_clusts) # force the result of the Gibbs scan (when we want to compute transition probability only) + log_transition_prob = 0 # transition probability of the Gibbs scan + + α_i = zeros(n) + β_i = zeros(n) + ζ_i = zeros(n) + γ_i = zeros(n) + sum_logD_i = zeros(n) + L2_ik_prime = zeros(n) + αβratio = α * log(β) - loggamma(α) + ζγratio = ζ * log(γ) - loggamma(ζ) + for i in items_to_reallocate + clustsizes[clusts[i]] = clustsizes[clusts[i]] - 1 + clusts[i] = -1 + C_i = findall(clustsizes .> 0) + K_i = length(C_i) + + candidate_clusts = C_i + if free_label_set && (maxK == 0 || K_i < maxK) && K_i < n + candidate_clusts = append!(candidate_clusts, findall(clustsizes .== 0)[1]) + end + m = length(candidate_clusts) + + # Calculate everything that we will need + + for k in C_i + clust_k = (clusts .== 
k) + sz_clust_k = clustsizes[k] + α_i[k] = α + δ1 * sz_clust_k + β_i[k] = β + sum(D[i, clust_k]) + ζ_i[k] = ζ + δ2 * sz_clust_k + γ_i[k] = γ + sum(D[i, clust_k]) + sum_logD_i[k] = sum(logD[i, clust_k]) + end + + L1 = zeros(m) + L2 = zeros(m) + log_prior_ratio = zeros(m) + logprobs = zeros(m) + + for k in 1:m # compute cohesive likelihood component for each proposed cluster + if clustsizes[candidate_clusts[k]] == 0 # new cluster + log_prior_ratio[k] = log(K_i + 1) + r * log(1 - p) + L1[k] = 0 + else + sz_candidate_clust_k = clustsizes[candidate_clusts[k]] + L1[k] = loggamma(α_i[candidate_clusts[k]]) + αβratio - + α_i[candidate_clusts[k]] * log(β_i[candidate_clusts[k]]) + + (δ1 - 1) * sum_logD_i[candidate_clusts[k]] - sz_candidate_clust_k * loggamma(δ1) + log_prior_ratio[k] = log(sz_candidate_clust_k + 1) + log(p) + log(sz_candidate_clust_k - 1 + r) - log(sz_candidate_clust_k) + end + end + + logprobs .= log_prior_ratio .+ L1 + + if repulsion + for t in C_i + L2_ik_prime[t] = loggamma(ζ_i[t]) - ζ_i[t] * log(γ_i[t]) + + ζγratio + (δ2 - 1) * sum_logD_i[t] - clustsizes[t] * loggamma(δ2) + end + L2_i = sum(L2_ik_prime[C_i]) + for k = 1:m + if clustsizes[candidate_clusts[k]] == 0 + L2[k] = L2_i + else + L2[k] = L2_i - L2_ik_prime[candidate_clusts[k]] + end + end + logprobs .+= L2 + end + if free_allocation + k = sample_logweights(logprobs) + ci_new = candidate_clusts[k] + else + ci_new = final_clusts[i] + k = findall(candidate_clusts .== ci_new)[1] + end + + clusts[i] = ci_new + clustsizes[ci_new] = clustsizes[ci_new] + 1 + + # calculate the transition probability + logprobs .+= minimum(logprobs) + probs = exp.(logprobs) + probs ./= sum(probs) + log_transition_prob += log(probs[k]) + end + state.K = sum(clustsizes .> 0) + return log_transition_prob +end + +function sample_labels!( + data::MCMCData, + state::MCMCState, + params::PriorHyperparamsList, + options::MCMCOptionsList + ) + # Unpack + + n = length(state.clusts) + r = state.r + p = state.p + numMH = options.numMH + 
numGibbs = options.numGibbs + + accept = fill(false, numMH) + split = fill(false, numMH) + for MH_counter in 1:numMH + clusts = state.clusts + clustsizes = state.clustsizes + K = state.K + # BEGIN MH + # Pick chaperones + i, j = sample(1:n, 2, replace = false) + ci = clusts[i] # labels of their current cluster allocations + cj = clusts[j] + + # automatically reject if we have max number of clusters + if params.maxK > 0 && ci == cj && length(findall(clustsizes .> 0)) >= params.maxK + continue + end + + # choose elements to reallocate + S = findall(clusts .== ci .|| clusts .== cj) + S = S[S .!= i .&& S .!= j] + + # initialise random launch state + claunch = copy(clusts) + szlaunch = copy(clustsizes) + Klaunch = K + if ci == cj + claunch[i] = findall(clustsizes .== 0)[1] + szlaunch[ci] = szlaunch[ci] - 1 + szlaunch[claunch[i]] = szlaunch[claunch[i]] + 1 + Klaunch = K + 1 + end + candidate_clusts = [claunch[i], claunch[j]] + for k in S + claunch[k] = sample(candidate_clusts) + szlaunch[clusts[k]] = szlaunch[clusts[k]] - 1 + szlaunch[claunch[k]] = szlaunch[claunch[k]] + 1 + end + launchstate = MCMCState(claunch, r, p, szlaunch, Klaunch) + + # restricted Gibbs scans + for scan_counter in 1:numGibbs + sample_labels_Gibbs!(data, + launchstate, params, S, candidate_clusts) + end + + if ci == cj # split proposal + split[MH_counter] = true + # final Gibbs scan + log_transition_prob = sample_labels_Gibbs!(data, + launchstate, params, S, candidate_clusts) + finalstate = launchstate + cfinal = finalstate.clusts + szfinal = finalstate.clustsizes + Kfinal = finalstate.K + + # log prior ratio for MH acceptance ratio + log_prior_ratio = log(K + 1) + r * log(1 - p) - log(p) - loggamma(r) + + loggamma(szfinal[cfinal[i]] - 1 + r) + loggamma(szfinal[cfinal[j]] - 1 + r) + + log(szfinal[cfinal[i]]) + log(szfinal[cfinal[j]]) + + -(loggamma(clustsizes[ci] - 1 + r) + log(clustsizes[ci])) + + # proposal density ratio pr(new | old) / pr(old | new) + log_proposal_ratio = log_transition_prob + # 
note that the reverse proposal density is just 1 + else # merge + cfinal = copy(launchstate.clusts) + szfinal = copy(launchstate.clustsizes) + Kfinal = launchstate.K + clust_i = findall(cfinal .== ci) + sz_clust_i = length(clust_i) + cfinal[clust_i] .= cj + szfinal[ci] = 0 + szfinal[cj] = szfinal[cj] + sz_clust_i + Kfinal -= 1 + finalstate = MCMCState(cfinal, r, p, szfinal, Kfinal) + + # log prior ratio for MH acceptance ratio + log_prior_ratio = -(log(K) + r * log(1 - p) - log(p) - loggamma(r)) + + loggamma(szfinal[cj] - 1 + r) + log(szfinal[cj]) + + -(loggamma(clustsizes[ci] - 1 + r) + loggamma(clustsizes[cj] - 1 + r) + + log(clustsizes[ci]) + log(clustsizes[cj])) + + # proposal density ratio pr(new | old) / pr(old | new) + log_transition_prob = sample_labels_Gibbs!(data, + launchstate, params, S, candidate_clusts, clusts) + + log_proposal_ratio = -log_transition_prob + # note that the reverse proposal density is just 1 + end + + # likelihood ratio for MH acceptance ratio + log_lik_ratio = loglik(data, + finalstate, params, true, false) - + loglik(data, state, params, true, false) + + # MH acceptance step + log_acceptance_ratio = minimum( + [0, log_prior_ratio + log_lik_ratio - log_proposal_ratio]) + if (log(rand(Uniform())) < log_acceptance_ratio) # accept + state = finalstate + accept[MH_counter] = true + end + # END MH + end + + + # Final Gibbs scan + temp = sample_labels_Gibbs!(data, state, params) + return (accept = accept, split = split) +end + +""" + runsampler(data[, options, params]) + +Runs the MCMC sampler on the data. + +# Arguments +- `data::MCMCData`: contains the distance matrix. +- `options = MCMCOptionsList()`: contains the number of iterations, burnin, etc. +- `params = PriorHyperparamsList()`: contains the prior hyperparameters for the model. + +# Returns +A struct of type `MCMCResult` containing the MCMC samples, convergence diagnostics, and summary statistics. 
+ +# See also +[`MCMCData`](@ref), [`MCMCOptionsList`](@ref), [`fitprior`](@ref), [`PriorHyperparamsList`](@ref). +""" +function runsampler(data::MCMCData, + options::MCMCOptionsList = MCMCOptionsList(), + params::Union{PriorHyperparamsList, Nothing} = nothing, + init::Union{MCMCState, Nothing} = nothing + )::MCMCResult + numiters = options.numiters + burnin = options.burnin + thin = options.thin + numsamples = options.numsamples + if isnothing(params) + params = fitprior(data.D, "k-medoids", true).params + end + if isnothing(init) + init = MCMCState( + clusts = kmedoids(data.D, + (params.maxK > 0 ? minimum([params.maxK, params.K_initial]) : params.K_initial); + maxiter=1000).assignments, + r = rand(Gamma(params.η, 1 / params.σ)), + p = rand(Beta(params.u, params.v)) + ) + end + result = MCMCResult(data, options, params) + state = init + + # Start sampling + j::Int = 1 + print("Start sampling...\n") + runtime = 0 + runtime = @elapsed ( + for i in ProgressBar(1:numiters) + result.r_acceptances[i] = sample_r!(state, params).accept + sample_p!(state, params) + temp = sample_labels!(data, state, params, options) + temprange = ((i-1) * options.numMH + 1) : (i * options.numMH) + result.splitmerge_acceptances[temprange] .= temp.accept + result.splitmerge_splits[temprange] .= temp.split + + # Record sample if necessary + if i > burnin && (i - burnin) % thin == 0 + result.clusts[j] .= sortlabels(state.clusts) + result.K[j] = state.K + result.r[j] = state.r + result.p[j] = state.p + result.loglik[j] = loglik(data, state, params, true, true) + j = j + 1 + end + end + ) + + print("Computing summary statistics and diagnostics...\n") + # Create co-clustering matrix and compute its IAC + result.posterior_coclustering = sum(adjacencymatrix.(result.clusts)) ./ numsamples + + # Get point estimate + if options.pointestimation + result.pntestimate = pointestimate(result.clusts; loss = "VI", usesalso = options.usesalso) + end + + + # Compute normalised autocorrelation and integrated 
autocorrelation coefficient of the samples
+	# For K
+	result.K_iac, result.K_ess, result.K_acf = iac_ess_acf(result.K)
+	result.K_mean, result.K_variance = mean_and_var(result.K)
+
+	# For r
+	result.r_iac, result.r_ess, result.r_acf = iac_ess_acf(result.r)
+	result.r_mean, result.r_variance = mean_and_var(result.r)
+
+	# For p
+	result.p_iac, result.p_ess, result.p_acf = iac_ess_acf(result.p)
+	result.p_mean, result.p_variance = mean_and_var(result.p)
+
+	# Acceptance rate of the MH steps
+	if options.numMH > 0
+		result.splitmerge_acceptance_rate = mean(result.splitmerge_acceptances)
+	else
+		result.splitmerge_acceptance_rate = 0
+	end
+	result.r_acceptance_rate = mean(result.r_acceptances)
+
+	# Miscellaneous
+	result.options = options
+	result.params = params
+	result.runtime = runtime
+	result.mean_iter_time = runtime / numiters
+	return result
+end
+
+"""
+	sample_rp(clustsizes[, options, params])
+
+Draws posterior samples of `r` and `p` conditional on a fixed vector of cluster
+sizes. Used by `fitprior` to calibrate the priors on `r` and `p` via empirical
+Bayes.
+
+# Arguments
+- `clustsizes::Vector{Int}`: cluster sizes; zero entries are ignored.
+- `options`: controls `numiters`, `burnin`, and `thin`.
+- `params`: supplies the hyperparameters `η`, `σ`, `u`, `v`, and `proposalsd_r`.
+
+# Returns
+A named tuple `(r, p)` of vectors holding the retained samples.
+"""
+function sample_rp(
+	clustsizes::Vector{Int},
+	options::MCMCOptionsList = MCMCOptionsList(),
+	params::PriorHyperparamsList = PriorHyperparamsList()
+	)
+	# Unpack parameters
+	numiters = options.numiters
+	burnin = options.burnin
+	thin = options.thin
+	η = params.η
+	σ = params.σ
+	proposalsd_r = params.proposalsd_r
+	u = params.u
+	v = params.v
+	C = clustsizes[findall(clustsizes .> 0)]	# drop empty clusters
+	n = sum(C)
+	K = length(C)
+
+	# Initialise the chain.
+	# NOTE: σ is a *rate* parameter (fitprior sets σ = rate(fitr)), but
+	# Distributions.Gamma takes a shape–scale pair, so draw with scale 1/σ.
+	# This matches the initialisation of r in `runsampler`; previously the
+	# rate was passed where a scale was expected.
+	r = rand(Gamma(η, 1 / σ))
+	p = rand(Beta(u, v))
+	numsamples = Integer(floor((numiters - burnin) / thin))
+	result = (r = zeros(numsamples), p = zeros(numsamples))
+	j = 1
+	for i in ProgressBar(1:numiters)
+		# Sample r
+		r = sample_r(r, p, C, K, η, σ, proposalsd_r).r
+		# Sample p
+		p = sample_p(K, n, r, u, v)
+
+		# Record sample if necessary
+		if i > burnin && (i - burnin) % thin == 0
+			result.r[j] = r
+			result.p[j] = p
+			j = j + 1
+		end
+	end
+	return result
+end
\ No newline at end of file
diff --git a/src/Plot.jl b/src/Plot.jl
new file mode 100644
index 0000000..93047c1
--- /dev/null
+++ b/src/Plot.jl
@@ -0,0 +1,17 @@
+using CairoMakie
+
+function
sqmatrixplot(M::Matrix, size_inches = (4.5, 4), fontsize = 18)
+	# Render a square matrix (e.g. a co-clustering matrix) as a heatmap.
+	size_pt = 72 .* size_inches	# Makie resolutions are in points; 72 pt per inch
+	fig = Figure(resolution = size_pt, fontsize = fontsize)
+	Axis(fig[1, 1], aspect = 1)	# square aspect so cells are square
+	heatmap!(M, colormap = :Blues)
+	return fig
+end
+
+# Plot a vector as a simple line trace (e.g. an MCMC trace plot).
+function lineplot(v::Vector{Float64}, size_inches = (4, 3), fontsize = 18)
+	size_pt = 72 .* size_inches
+	fig = Figure(resolution = size_pt, fontsize = fontsize)
+	Axis(fig[1, 1])
+	lines!(v)
+	return fig
+end
\ No newline at end of file
diff --git a/src/Prior.jl b/src/Prior.jl
new file mode 100644
index 0000000..574a9f3
--- /dev/null
+++ b/src/Prior.jl
@@ -0,0 +1,194 @@
+using Distributions: fit_mle, Beta, Gamma, shape, rate, Distributions
+using Clustering: kmeans, kmedoids
+using Distances: pairwise, Euclidean
+using StatsBase: counts, std
+using ProgressBars: ProgressBar
+using RCall: rcopy, @R_str
+
+"""
+	fitprior(data, algo[, diss]; Kmin = 1, Kmax = Int(floor(size(data)[end] / 2)), useR = false)
+
+Determines the best prior hyperparameters from the data. A notional clustering is obtained using k-means or k-medoids, and the distances are split into within-cluster distances and inter-cluster distances based on the notional clustering. These distances are then used to fit the prior hyperparameters using MLE and empirical Bayes sampling.
+
+# Arguments
+- `data::Union{Vector{Vector{Float64}}, Matrix{Float64}}`: can either be a vector of (possibly multi-dimensional) observations, or a matrix with each column an observation, or a square matrix of pairwise dissimilarities.
+- `algo::String`: must be one of `"k-means"` or `"k-medoids"`.
+- `diss::Bool = false`: if true, `data` will be assumed to be a pairwise dissimilarity matrix.
+- `Kmin::Integer = 1`: minimum number of clusters.
+- `Kmax::Integer = Int(floor(size(data)[end] / 2))`: maximum number of clusters. If left unspecified, it is set to half the number of observations.
+- `useR::Bool = false`: if `false`, will use the `kmeans` or `kmedoids` from the Julia package [`Clustering.jl`](https://juliastats.org/Clustering.jl/stable/). If `true`, will use `kmeans` or `cluster::pam` in R. + +# Returns +A named tuple containing the following +- `params::PriorHyperparamsList`: the fitted hyperparameters. +- `K::Integer`: the number of clusters in the notional clustering. +- `notionalclustering::ClustLabelVector`: the cluster labels in the notional clustering. + +# See also +[`PriorHyperparamsList`](@ref) +""" +function fitprior( + data::Union{Vector{Vector{Float64}}, Matrix{Float64}}, + algo::String, + diss::Bool = false; + Kmin = 1, + Kmax = Int(floor(size(data)[end] / 2)), + useR = false +) + if data isa Vector{Vector{Float64}} + if diss + @warn "diss = true but data is not a dissimilarity matrix. Assuming that the data is a vector of observations." + end + x = makematrix(data) + diss = false + else + x = data + end + N = size(x, 2) + + # Input validation + if diss && size(x, 1) != size(x, 2) + throw(ArgumentError("Supplied dissimilarity matrix is not square.")) + end + if algo == "k-means" && diss + throw(ArgumentError("Cannot use algorithm `k-means` with a dissimilarity matrix.")) + end + if algo != "k-means" && algo != "k-medoids" + throw(ArgumentError("Algo must be 'k-means' or 'k-medoids'.")) + end + + if !diss + dissM = pairwise(Euclidean(), x, dims = 2) + else + dissM = x + end + + + # Get notional clustering + print("Computing notional clustering.\n") + objective = Vector{Float64}(undef, Kmax - Kmin + 1) + if algo == "k-means" + if useR + input = x' + objective = rcopy( + R""" + sapply($Kmin:$Kmax, + function(k){kmeans($input, centers = k, nstart = 20)$tot.withinss}) + """ + ) + else + clustfn = kmeans + input = x + end + elseif algo == "k-medoids" + input = dissM + if useR + objective = rcopy( + R""" + library(cluster) + sapply($Kmin:$Kmax, + function(k){pam($dissM, k = k, diss = T, pamonce = 5)$objective[["swap"]]}) + """ + ) + 
else
+			clustfn = kmedoids
+		end
+	end
+
+	if !useR
+		for k = 1:(Kmax - Kmin + 1)
+			# objective[k] holds the cost for (Kmin + k - 1) clusters, so the
+			# clustering routine must be called with that offset count.
+			# (Previously `k` was passed directly, which computed objectives
+			# for the wrong cluster counts whenever Kmin > 1; the R branches
+			# and detectknee(Kmin:Kmax, ...) already use the offset range.)
+			temp = clustfn(input, Kmin + k - 1; maxiter=1000)
+			objective[k] = temp.totalcost
+			# if !temp.converged
+			# 	@warn "Clustering did not converge at K = $k"
+			# end
+		end
+	end
+	# Pick K at the "knee" of the objective-vs-K curve.
+	elbow = detectknee(Kmin:Kmax, objective)[1]
+	K = elbow
+	if useR
+		if (algo == "k-means")
+			notionalclustering = rcopy(
+				R"""
+				kmeans($input, centers = $K, nstart = 20)
+				"""
+			)[:cluster]
+		elseif algo == "k-medoids"
+			notionalclustering = rcopy(
+				R"""
+				library(cluster)
+				pam($input, k = $K, diss = T, pamonce = 5)
+				"""
+			)[:clustering]
+		end
+	else
+		notionalclustering = clustfn(input, K; maxiter=1000).assignments
+	end
+	# Split pairwise distances into within-cluster (A) and inter-cluster (B)
+	# sets according to the notional clustering.
+	notionaladjmatrix = adjacencymatrix(notionalclustering)
+	A = uppertriangle(dissM)[uppertriangle(notionaladjmatrix) .== 1]
+	B = uppertriangle(dissM)[uppertriangle(notionaladjmatrix) .== 0]
+
+	# Compute likelihood parameters by Gamma MLE on each distance set.
+	print("Computing likelihood hyperparameters.\n")
+	fitA = fit_mle(Gamma, A)
+	fitB = fit_mle(Gamma, B)
+	δ1 = shape(fitA)
+	α = length(A) * δ1
+	β = sum(A)
+	δ2 = shape(fitB)
+	ζ = length(B) * δ2
+	γ = sum(B)
+
+	# Compute partition prior parameters via empirical-Bayes sampling of
+	# (r, p) conditional on the notional cluster sizes.
+	print("Computing partition prior hyperparameters.\n")
+	clustsizes = counts(notionalclustering)
+	temp = sample_rp(clustsizes)
+	proposalsd_r = std(temp.r)
+	fitr = fit_mle(Gamma, temp.r)
+	η = shape(fitr)
+	σ = rate(fitr)
+	fitp = fit_mle(Beta, temp.p)
+	u, v = Distributions.params(fitp)
+
+	params = PriorHyperparamsList(
+		δ1 = δ1,
+		δ2 = δ2,
+		α = α,
+		β = β,
+		ζ = ζ,
+		γ = γ,
+		η = η,
+		σ = σ,
+		proposalsd_r = proposalsd_r,
+		u = u,
+		v = v,
+		K_initial = K
+	)
+	@NamedTuple{
+		params, K, notionalclustering,
+		objective}(
+		(params, K, notionalclustering, objective)
+	)
+end
+
+# Locate the "knee" of a curve: the point furthest from the straight line
+# joining the curve's endpoints. Returns the (x, y) pair at the knee.
+function detectknee(
+	xvalues::AbstractVector{<:Real},
+	yvalues::AbstractVector{<:Real}
+	)::NTuple{2,Real}
+	# Sort by x so the endpoints are the extreme x-values
+	ind = sortperm(xvalues)
+	x = xvalues[ind]
+	y = yvalues[ind]
+	x1 = x[1]; x2 = x[end]
+	y1 = y[1]; y2 = y[end]
+ + # Line between extreme values + a = (y2-y1)/(x2-x1) + b = y1 - a*x1 + + # Distance from points to line + distances = [abs(a * x[i] + b - y[i]) / sqrt(a^2 + 1) for i in eachindex(x)] + # Max distance point + maxind = argmax(distances) + x[maxind], y[maxind] +end \ No newline at end of file diff --git a/src/RedClust.jl b/src/RedClust.jl new file mode 100644 index 0000000..1cada67 --- /dev/null +++ b/src/RedClust.jl @@ -0,0 +1,16 @@ +module RedClust + +include("./Types.jl") +include("./Utility.jl") +include("./Prior.jl") +include("./MCMC.jl") +# include("./Plot.jl") +include("./SummaryFunctions.jl") + + +export fitprior, runsampler, # main functions +adjacencymatrix, sortlabels, uppertriangle, generatemixture, makematrix, pointestimate, # convenience +evaluateclustering, summarise, # summary functions +MCMCOptionsList, PriorHyperparamsList, MCMCData, MCMCResult # types + +end \ No newline at end of file diff --git a/src/SummaryFunctions.jl b/src/SummaryFunctions.jl new file mode 100644 index 0000000..58d7f27 --- /dev/null +++ b/src/SummaryFunctions.jl @@ -0,0 +1,109 @@ +using Clustering: randindex, varinfo, vmeasure, mutualinfo + +""" + evaluateclustering(clusts::ClustLabelVector, truth::ClustLabelVector) +Returns a tuple of values quantifying the accuracy of the clustering assignments in `clusts` with respect to the ground truth clustering assignments in `truth`. +# Return values +- `bloss`: Normalised Binder loss (= 1 - rand index). Lower is better. +- `ari`: Adjusted Rand index. Higher is better. +- `vidist`: Variation of Information distances. Lower is better. +- `vmeas`: V-Measure. Higher is better. +- `nmi`: Normalised Mutual Information. Higher is better. 
+""" +function evaluateclustering(clusts::ClustLabelVector, truth::ClustLabelVector) + bloss = binderloss(clusts, truth) + ari = randindex(clusts, truth)[1] + vidist = varinfo(clusts, truth) + vmeas = vmeasure(clusts, truth) + nmi = mutualinfo(clusts, truth) + return (bloss, ari = ari, vidist = vidist, vmeas = vmeas, nmi = nmi) +end + +function evaluateclustering(result::MCMCResult, truth::ClustLabelVector) + reftruth = Ref(truth) + bloss = binderloss.(result.clusts, reftruth) + ari = (x -> randindex(x, truth)).(result.clusts, reftruth) + vidist = varinfo.(result.clusts, reftruth) + vmeas = vmeasure.(result.clusts, reftruth) + nmi = mutualinfo.(result.clusts, reftruth) + return (bloss, ari = ari, vidist = vidist, vmeas = vmeas, nmi = nmi) +end + +function summarise(clusts::ClustLabelVector, truth::ClustLabelVector)::String + output = "" + temp = evaluateclustering(clusts, truth) + output *= "Clustering summary\n" + output *= "Number of clusters : $(length(unique(clusts)))\n" + output *= "Normalised Binder loss : $(temp.bloss)\n" + output *= "Variation of Information distance : $(temp.vidist)\n" + output *= "Adjusted Rand Index : $(temp.ari)\n" + output *= "Normalised Mutual Information : $(temp.nmi)\n" + output *= "V-Measure : $(temp.vmeas)\n" + return output +end + +""" + summarise(result::MCMCResult, truth::ClustLabelVector = [], printoutput = true) +Returns a string that summarises the MCMC output. If `truth` is a vector of cluster labels, the output also contains clustering metrics to quantify the accuracy of the point-estimate from the MCMC with respect to `truth`. If `printoutput = true`, the output will be printed to console before being returned. 
+""" +function summarise(result::MCMCResult, truth::ClustLabelVector=[], printoutput=true)::String + output = "" + output *= "General Summary\n" + output *= "Iterations : $(result.options.numiters)\n" + output *= "Burn-in : $(result.options.burnin)\n" + output *= "Thin : $(result.options.thin)\n" + output *= "Number of samples : $(result.options.numsamples)\n" + output *= "Number of intermediate Gibbs scans in each split-merge step : $(result.options.numGibbs)\n" + output *= "Number of split-merge steps per iteration : $(result.options.numMH)\n" + output *= "Acceptance rate for split-merge steps : $(result.splitmerge_acceptance_rate)\n" + output *= "Acceptance rate for sampling r : $(result.r_acceptance_rate)\n" + output *= "Runtime : $(result.runtime)\n" + output *= "Time per iteration : $(result.mean_iter_time)\n" + output *= "\n" + output *= "Summary for K\n" + output *= "IAC : $(result.K_iac)\n" + output *= "ESS : $(result.K_ess)\n" + output *= "ESS per sample : $(result.K_ess / result.options.numsamples)\n" + output *= "Posterior mean : $(result.K_mean)\n" + output *= "Posterior variance : $(result.K_variance)\n" + output *= "\n" + output *= "Summary for r\n" + output *= "IAC : $(result.r_iac)\n" + output *= "ESS : $(result.r_ess)\n" + output *= "ESS per sample : $(result.r_ess / result.options.numsamples)\n" + output *= "Posterior mean : $(result.r_mean)\n" + output *= "Posterior variance : $(result.r_variance)\n" + output *= "\n" + output *= "Summary for p\n" + output *= "IAC : $(result.p_iac)\n" + output *= "ESS : $(result.p_ess)\n" + output *= "ESS per sample : $(result.p_ess / result.options.numsamples)\n" + output *= "Posterior mean : $(result.p_mean)\n" + output *= "Posterior variance : $(result.p_variance)\n" + if length(truth) > 0 + temp = evaluateclustering(result.pntestimate, truth) + output *= "\n" + output *= "Clustering summary\n" + output *= "Number of clusters : $(length(unique(result.pntestimate)))\n" + output *= "Normalised Binder loss : 
$(temp.bloss)\n" + output *= "Variation of Information distance : $(temp.vidist)\n" + output *= "Adjusted Rand Index : $(temp.ari)\n" + output *= "Normalised Mutual Information : $(temp.nmi)\n" + output *= "V-Measure : $(temp.vmeas)\n" + end + if printoutput print(output) end + return output +end + +function binderloss( + a::ClustLabelVector, + b::ClustLabelVector, + normalised = true + )::Float64 + loss = sum(abs.(adjacencymatrix(a) .- adjacencymatrix(b))) + if normalised + n = length(a) + loss /= n * (n - 1) + end + return loss +end \ No newline at end of file diff --git a/src/Types.jl b/src/Types.jl new file mode 100644 index 0000000..c46f4d4 --- /dev/null +++ b/src/Types.jl @@ -0,0 +1,168 @@ +const ClustLabelType = Int +"Alias for Vector{Int}" +const ClustLabelVector = Vector{ClustLabelType} + +const _numGibbs_default = 5 +const _numMH_default = 1 + +""" +List of options for running the MCMC. + +Constructor: + MCMCOptionsList(; [numiters, burnin, thin, numGibbs, numMH, pointestimation, usesalso]) + +# Constructor arguments +- `numiters::Integer = 5000`: number of iterations to run. +- `burnin::Integer = floor(0.2 * numiters)`: number of iterations to discard as burn-in. +- `thin::Integer = 1`: will keep every `thin` samples. +- `numGibbs:Integer = 1`: number of intermediate Gibbs scans in the split-merge step. +- `numMH:Integer = 1`: number of split-merge steps per MCMC iteration. +- `pointestimation::Bool = true`: whether to compute a point-estimate clustering from the posterior samples. +- `usesalso::Bool = false`: whether to use the SALSO algorithm to compute a point estimate. +""" +Base.@kwdef struct MCMCOptionsList + numiters::Int = 5000 + burnin::Int = Integer(floor(0.2 * numiters)) + thin::Int = 1 + numGibbs::Int = _numGibbs_default + numMH::Int = _numMH_default + pointestimation::Bool = true + usesalso::Bool = false + numsamples::Int = Int(floor((numiters - burnin) / thin)) +end + +@doc raw""" +Contains the prior hyperparameters for the model. 
Call the constructor with the line + + PriorHyperparamsList(; [kwargs]) + +# Constructor arguments +- `δ1::Float64 = 1`: the parameter ``\delta_1``. +- `α::Float64 = 1`: the shape parameter ``\alpha`` in the prior for each ``\lambda_k``. +- `β::Float64 = 1`: the rate parameter ``\beta`` in the prior for each ``\lambda_k``. +- `δ2::Float64 = 1`: the parameter ``\delta_2``. +- `ζ::Float64 = 1`: the shape parameter ``\zeta`` in the prior for each ``\theta_{kt}``. +- `γ::Float64 = 1`: the rate parameter ``\gamma`` in the prior for each ``\theta_{kt}``. +- `η::Float64 = 1`: the shape parameter ``\eta`` in the prior for ``r``. +- `σ::Float64 = 1`: the rate parameter ``\sigma`` in the prior for ``r``. +- `u::Float64 = 1`: the parameter ``u`` in the prior for ``p``. +- `v::Float64 = 1`: the parameter ``v`` in the prior for ``p``. +- `proposalsd_r::Float64 = 1`: standard deviation of the truncated Normal proposal when sampling `r` via a Metropolis-Hastings step. +- `K_initial::Int = 1`: initial number of clusters to start the MCMC. +- `repulsion::Bool = true`: whether to use the repulsive component of the likelihood. +- `maxK::Int = 0`: maxmimum number of clusters to allow. If set to 0 then we can get up to `n` clusters, where `n` is the number of observations. + +# See also +[`fitprior`](@ref) +""" +Base.@kwdef mutable struct PriorHyperparamsList + δ1::Float64 = 1 + δ2::Float64 = 1 + α::Float64 = 1 + β::Float64 = 1 + ζ::Float64 = 1 + γ::Float64 = 1 + η::Float64 = 1 + σ::Float64 = 1 + proposalsd_r::Float64 = 1 + u::Float64 = 1 + v::Float64 = 1 + K_initial::Int = 1 + repulsion::Bool = true + maxK::Int = 0 +end + +Base.@kwdef mutable struct MCMCState + clusts::ClustLabelVector + r::Float64 + p::Float64 + clustsizes::Vector{Int} = counts(clusts, 1:length(clusts)) + K::Int = sum(clustsizes .> 0) +end + +"Contains the pairwise dissimilarities. Construct by calling `MCMCData(D)`, where `D` is a square matrix of pairwise dissimilarities." 
+Base.@kwdef struct MCMCData + D::Matrix{Float64} + logD::Matrix{Float64} = log.(D + I(size(D,1))) +end + +""" +Struct containing MCMC samples. +# Fields +- `options::MCMCOptionsList`: options passed to the sampler. +- `params::PriorHyperparamsList`: prior hyperparameters used by the sampler. +- `clusts::Vector{ClustLabelVector}`: contains the clustering allocations. `clusts[i]` is a vector containing the clustering allocation from the `i`th sample. +- `pntestimate::ClustLabelVector`: contains the point-estimate clustering allocation. If `options.pointestimation == false` then this is a vector of zeros. +- `posterior_coclustering::Matrix{Float64}`: the posterior coclustering matrix. +- `K::Vector{Int}`: posterior samples of ``K``, i.e., the number of clusters. `K[i]` is the number of clusters in `clusts[i]`. +- `r::Vector{Float64}`: posterior samples of the parameter ``r``. +- `p::Vector{Float64}`: posterior samples of the parameter ``p``. +- `K_mean`, `r_mean`, `p_mean`: posterior mean of `K`, `r`, and `p` respectively. +- `K_variance`, `r_variance`, `p_variance`: posterior variance of `K`, `r`, and `p` respectively. +- `K_acf::Vector{Float64}`, `r_acf::Vector{Float64}`, `p_acf::Vector{Float64}`: autocorrelation function for `K`, `r`, and `p` respectively. +- `K_iac`, `r_iac`, and `p_iac`: integrated autocorrelation coefficient for `K`, `r`, and `p` respectively. +- `K_ess::Float64`, `r_ess::Float64`, and `p_ess::Float64`: effective sample size for `K`, `r`, and `p` respectively. +- `loglik::Vector{Float64}`: log-likelihood for each sample. +- `splitmerge_splits`: Boolean vector indicating the iterations when a split proposal was used in the split-merge step. Has length `numMH * numiters` (see [`MCMCOptionsList`](@ref)). +- `splitmerge_acceptance_rate`: acceptance rate of the split-merge proposals. 
+- `r_acceptances`: Boolean vector indicating the iterations (including burnin and the thinned out iterations) where the Metropolis-Hastings proposal for `r` was accepted. +- `r_acceptance_rate:`: Metropolis-Hastings acceptance rate for `r`. +- `runtime`: total runtime for all iterations. +- `mean_iter_time`: average time taken for each iteration. +""" +Base.@kwdef mutable struct MCMCResult + clusts::Vector{ClustLabelVector} + posterior_coclustering::Matrix{Float64} + K::Vector{Int} + K_ess::Float64 + K_acf::Vector{Float64} + K_iac::Float64 + K_mean::Float64 + K_variance::Float64 + r::Vector{Float64} + r_ess::Float64 + r_acf::Vector{Float64} + r_iac::Float64 + r_mean::Float64 + r_variance::Float64 + p::Vector{Float64} + p_ess::Float64 + p_acf::Vector{Float64} + p_iac::Float64 + p_mean::Float64 + p_variance::Float64 + splitmerge_acceptances::Vector{Bool} + r_acceptances::Vector{Bool} + r_acceptance_rate::Float64 + splitmerge_splits::Vector{Bool} + splitmerge_acceptance_rate::Float64 + runtime::Float64 + mean_iter_time::Float64 + loglik::Vector{Float64} + options::MCMCOptionsList + params::PriorHyperparamsList + pntestimate::Vector{Int} + function MCMCResult(data::MCMCData, options::MCMCOptionsList, params::PriorHyperparamsList) + x = new() + numpts = size(data.D, 1) + x.options = options + x.params = params + numiters = options.numiters + numsamples = options.numsamples + numMH = options.numMH + x.clusts = [zeros(ClustLabelType, numpts) for i in 1:numsamples] + x.posterior_coclustering = zeros(numpts, numpts) + x.K = Int.(zeros(numsamples)) + x.K_acf = zeros(numsamples) + x.r = zeros(numsamples) + x.r_acf = zeros(numsamples) + x.p = zeros(numsamples) + x.p_acf = zeros(numsamples) + x.loglik = zeros(numsamples) + x.splitmerge_acceptances = Vector{Bool}(undef, numiters * numMH) + x.splitmerge_splits = Vector{Bool}(undef, numiters * numMH) + x.r_acceptances = Vector{Bool}(undef, numiters) + x.pntestimate = zeros(Int, numpts) + return x + end +end \ No newline at end of 
file diff --git a/src/Utility.jl b/src/Utility.jl new file mode 100644 index 0000000..5e484fc --- /dev/null +++ b/src/Utility.jl @@ -0,0 +1,155 @@ +using StatsBase: autocor, wsample, levelsmap +using Distributions: Dirichlet, MvNormal, pdf +using LinearAlgebra: I +using Random: rand +using Match +using RCall: @R_str, rcopy + +function numpairs(n::Int)::Int + Integer(n * (n - 1) / 2) +end + +function sample_logweights(logprobs::Vector{Float64})::Int + logprobs = logprobs .- minimum(logprobs) + u = rand(Uniform(), length(logprobs)) + argmax(-log.(-log.(u)) + logprobs) +end + +# Compute the integrated autocorrelation coefficient +function iac_ess_acf(x::AbstractVector{<:Real}) + acf = autocor(x) + iac = sum(acf) * 2 + ess = length(x) / iac + return (iac = iac, ess = ess, acf = acf) +end + +""" + adjacencymatrix(clusts::ClustLabelVector) +Returns the `n`x`n` adjacency matrix corresponding to the given cluster label vector `clusts`, where `n = length(clusts)`. +""" +function adjacencymatrix( + clusts::ClustLabelVector + )::Matrix{Bool} + clusts .== clusts' +end + +function uppertriangle( + M::Matrix{T} + )::Vector{T} where {T} + n = size(M, 1) + [M[i, j] for i in 1:n for j in (i + 1):n] +end + +""" + sortlabels(x::ClustLabelVector) +Returns a cluster label vector `y` such that `x` and `y` have the same adjacency structure and labels in `y` occur in sorted ascending order. +""" +function sortlabels( + x::ClustLabelVector + )::ClustLabelVector + temp = levelsmap(x) + [temp[i] for i in x] +end + +""" + generatemixture(N, K; α = K, dim = K, radius = 1, σ = 0.1) + +Generates a multivariate Normal mixture, with kernel weights generated from a Dirichlet prior. The kernels are centred at the vertices of a `dim`-dimensional simplex with edge length `radius`. + +# Arguments +- `N::Integer`: number of observations to generate. +- `K::Integer`: number of mixture kernels. +- `α::Float64 = K`: parameter for the Dirichlet prior. +- `dim::Integer = K`: dimension of the observations. 
+- `radius::Float64 = 1`: radius of the simplex whose vertices are the kernel means. +- `σ::Float64 = 0.1 `: variance of each kernel. + +# Returns + +Named tuple containing the following fields- + +- `pnts::Vector{Vector{Float64}}`: a vector of `N` observations. +- `distM::Matrix{Float64}`: an `N`x`N` matrix of pairwise Euclidean distances between the observations. +- `clusts::ClustLabelVector`: vector of `N` cluster assignments. +- `probs::Float64`: vector of `K` cluster weights generated from the Dirichlet prior, used to generate the observations. +- `oracle_coclustering::Matrix{Float64}`: `N`x`N` matrix of co-clustering probabilities, calculated assuming full knowledge of the cluster centres and cluster weights. +""" +function generatemixture(N, K; α = K, dim = K, radius = 1, σ = 0.1) + probs = rand(Dirichlet(K, α)) + clusts = sort(wsample(1:K, probs, N)) + + # generate cluster centres + clust_centres = fill(zeros(dim), K) + for i = 1:K + clust_centres[i] = cat(zeros(i - 1), radius, zeros(dim - i); dims = 1) + end + + # generate points + Σ = σ^2 * I(dim) + pnts = fill(zeros(dim), N) + oracle_posterior = zeros(K, N) + for i = 1:N + pnts[i] = rand(MvNormal(clust_centres[clusts[i]], Σ)) + end + # calculate oracle + numiters = 1000 + oracle_posterior = zeros(K, N) + oracle_coclustering = zeros(N, N) + for c in 1:numiters + tempprobs = rand(Dirichlet(K, α)) + for i = 1:N + for j = 1:K + oracle_posterior[j, i] = tempprobs[j] * pdf(MvNormal(clust_centres[j], Σ), pnts[i]) + end + oracle_posterior[:, i] .= oracle_posterior[:, i] ./ sum(oracle_posterior[:, i]) + end + oracle_coclustering += oracle_posterior' * oracle_posterior + end + oracle_coclustering ./= numiters #oracle coclustering matrix + distM = [pnts[i][j] for i in 1:N, j in 1:dim]' |> + (x -> pairwise(Euclidean(), x, dims = 2)) + return (pnts = pnts, distM = distM, clusts = clusts, probs = probs, oracle_coclustering = oracle_coclustering) +end + +"Convert a vector of vectors into a matrix, where each vector 
becomes a column in the matrix." +function makematrix(x::Vector{Vector{T}})::Matrix where {T} + [x[i][j] for j in 1:length(x[1]), i in 1:length(x)] +end + +""" + pointestimate(clusts::Vector{ClustLabelVector}; loss = "VI", usesalso = false) +Computes a point estimate from a vector of samples of cluster allocations by searching for a minimiser of the posterior expectation of some loss function. + +# Arguments +- `loss::String`: must be either `"VI"` or `"binder"`. Determines the loss function as either the Variation of Information distance or Binder loss. +`usesalso::Bool`: if true, the SALSO algorithm is used. If false, the search space is restricted to the list of samples passed to the function. +""" +function pointestimate(clusts::Vector{ClustLabelVector}; loss = "VI", usesalso = false) + if usesalso + clustsM = makematrix(clusts)' + pntestimate = rcopy(R""" + library(salso) + salso(x = $clustsM, loss = $loss) + """) + else + pntestimate = zeros(Int, length(clusts[1])) + lossfn = @match loss begin + "VI" => varinfo + "binder" => (x, y) -> binderloss(x, y, false) + end + npts = length(clusts[1]) + minscore = Inf + for clust in clusts + temp = mean(lossfn.(clusts, Ref(clust))) + if temp < minscore + minscore = temp + pntestimate .= clust + end + end + end + return pntestimate +end + + + + diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..02f40ff --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,3 @@ +[deps] +"Test" = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +"StatsBase" = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..d31eb96 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,143 @@ +using RedClust, Test +using StatsBase: sample + +@testset "Convenience functions" begin + + @testset "makematrix" begin + # check makematrix + m = 500 + n = 1000 + temp = [rand(m) for i in 1:n] + tempmatrix = makematrix(temp) + @test 
sum([sum(tempmatrix[:, i] .== temp[i]) for i in 1:n]) == m * n + end + + @testset "adjacencymatrix" begin + # check adjacency matrix + K = 20 + n = 500 + temp = sample(1:K, n) + adjmatrix = adjacencymatrix(temp) + tempsum = 0 + for (i, x) in pairs(adjmatrix) + tempsum += (adjmatrix[i] == (temp[i[1]] == temp[i[2]])) + end + @test tempsum == n * n + end + + @testset "sortlabels" begin + # check sortlabels + K = 20 + n = 500 + temp = sample(1:K, n) + @test n^2 == sum(adjacencymatrix(temp) .== adjacencymatrix(sortlabels(temp))) + end + +end + +K = 10 # Number of clusters +N = 100 # Number of points +data_σ = 0.25 # Variance of the normal kernel +data_dim = 10 # Data dimension +data = generatemixture(N, K; α = 10, σ = data_σ, dim = data_dim) +pnts, distM, clusts, probs, oracle_coclustering = data + +# check fitting the prior +@testset "Fitting the prior" begin + @test ( + try + fitprior(pnts, "k-means", false; useR = true) # kmeans with points, R + true + catch e + false + end + ) + @test ( + try + fitprior(pnts, "k-medoids", false; useR = true) # kmedoids with points, R + true + catch e + false + end + ) + @test ( + try + fitprior(distM, "k-medoids", true; useR = true) # kmedoids with distances, R + true + catch e + false + end + ) + @test ( + try + fitprior(pnts, "k-means", false; useR = false) # kmeans with points, Julia + true + catch e + false + end + ) + @test ( + try + fitprior(pnts, "k-medoids", false; useR = false) # kmedoids with points, Julia + true + catch e + false + end + ) + @test ( + try + fitprior(distM, "k-medoids", true; useR = false) # kmedoids with distances, Julia + true + catch e + false + end + ) + + @test_throws ArgumentError fitprior(distM, "hierarchical", true; useR = false) # check that only 2 methods allowed + + @test_logs (:warn,) fitprior(pnts, "k-means", true; useR = false) # check that dist is disregarded when pnts is supplied + + @test_throws ArgumentError fitprior(distM, "k-means", true; useR = false) # check that kmeans does not work 
when distances are given +end + + +# check the sampler +params = fitprior(pnts, "k-means", false; useR = false).params +options1 = MCMCOptionsList(usesalso = true) +options2 = MCMCOptionsList(usesalso = false) +options3 = MCMCOptionsList(pointestimation = false, usesalso = false) +data = MCMCData(D = distM) +@testset "Test sampler" begin + @test ( + try + result1 = runsampler(data, options1, params) # point estimation with salso + true + catch e + false + end + ) + @test ( + try + result2 = runsampler(data, options2, params) # point estimation without salso + true + catch e + false + end + ) + @test ( + try + result3 = runsampler(data, options3, params) # no point estimation + sum(result3.pntestimate .== 0) == length(result3.pntestimate) + catch e + false + end + ) + @test ( + try + result4 = runsampler(data) # test defaults + true + catch e + false + end + ) +end \ No newline at end of file