From 6a09838fa9f42fbad0e49a4a00ad3721b132b45a Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Thu, 16 Nov 2023 04:43:55 -0600
Subject: [PATCH 1/2] Add `rtol` convergence criterion

Converging based on an absolute tolerance implies that you
know something about what to expect for the range of
log-likelihoods. In cases where the ground truth may not
be known, it can be useful to allow termination based on
improvement that is small *relative* to the log-likelihood.
---
 src/classic_em.jl    | 11 +++++++----
 src/fit_em.jl        |  9 ++++++---
 src/stochastic_em.jl | 24 ++++++++++++++----------
 test/runtests.jl     | 30 ++++++++++++++++++++++++++++--
 4 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/src/classic_em.jl b/src/classic_em.jl
index 9a02929..ffe5aee 100644
--- a/src/classic_em.jl
+++ b/src/classic_em.jl
@@ -5,10 +5,11 @@ The EM algorithm was introduced by A. P. Dempster, N. M. Laird and D. B. Rubin i
 struct ClassicEM <: AbstractEM end
 
 """
-    fit_mle!(α::AbstractVector, dists::AbstractVector{F} where {F<:Distribution}, y::AbstractVecOrMat, method::ClassicEM; display=:none, maxiter=1000, atol=1e-3, robust=false)
+    fit_mle!(α::AbstractVector, dists::AbstractVector{F} where {F<:Distribution}, y::AbstractVecOrMat, method::ClassicEM; display=:none, maxiter=1000, atol=1e-3, rtol=nothing, robust=false)
 Use the EM algorithm to update the Distribution `dists` and weights `α` composing a mixture distribution.
 - `robust = true` will prevent the (log)likelihood to overflow to `-∞` or `∞`.
-- `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops. 
+- `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops.
+- `rtol` relative tolerance for convergence: the algorithm also stops if `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<rtol*(|ℓ⁽ⁱ⁺¹⁾| + |ℓ⁽ⁱ⁾|)/2`. This relative criterion is skipped when `rtol` is `nothing` (the default).
 - `display` value can be `:none`, `:iter`, `:final` to display Loglikelihood evolution at each iterations `:iter` or just the final one `:final`
 """
 function fit_mle!(
@@ -19,6 +20,7 @@ function fit_mle!(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
 )
 
@@ -58,7 +60,7 @@ function fit_mle!(
         push!(history["logtots"], logtotp)
         history["iterations"] += 1
 
-        if abs(logtotp - logtot) < atol
+        if abs(logtotp - logtot) < atol || (rtol !== nothing && abs(logtotp - logtot) < rtol * (abs(logtot) + abs(logtotp)) / 2)
             (display in [:iter, :final]) &&
                 println("EM converged in ", it, " iterations, final loglikelihood = ", logtotp)
             history["converged"] = true
@@ -88,6 +90,7 @@ function fit_mle!(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
 )
 
@@ -127,7 +130,7 @@ function fit_mle!(
         push!(history["logtots"], logtotp)
         history["iterations"] += 1
 
-        if abs(logtotp - logtot) < atol
+        if abs(logtotp - logtot) < atol || (rtol !== nothing && abs(logtotp - logtot) < rtol * (abs(logtot) + abs(logtotp)) / 2)
             (display in [:iter, :final]) &&
                 println("EM converged in ", it, " iterations, final loglikelihood = ", logtotp)
             history["converged"] = true
diff --git a/src/fit_em.jl b/src/fit_em.jl
index cd6017a..861a068 100644
--- a/src/fit_em.jl
+++ b/src/fit_em.jl
@@ -6,7 +6,7 @@ The `mix` input is a mixture that is used to initilize the EM algorithm.
 - `method` determines the algorithm used.
 - `infos = true` returns a `Dict` with informations on the algorithm (converged, iteration number, loglikelihood).
 - `robust = true` will prevent the (log)likelihood to overflow to `-∞` or `∞`.
-- `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops. 
+- `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops.
 - `display` value can be `:none`, `:iter`, `:final` to display Loglikelihood evolution at each iterations `:iter` or just the final one `:final`
 """
 function fit_mle(
@@ -17,6 +17,7 @@ function fit_mle(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
     infos = false,
 )
@@ -35,6 +36,7 @@ function fit_mle(
             display = display,
             maxiter = maxiter,
             atol = atol,
+            rtol = rtol,
             robust = robust,
         )
     else
@@ -47,6 +49,7 @@ function fit_mle(
             display = display,
             maxiter = maxiter,
             atol = atol,
+            rtol = rtol,
             robust = robust,
         )
     end
@@ -58,7 +61,7 @@ end
     fit_mle(mix::AbstractArray{<:MixtureModel}, y::AbstractVecOrMat, weights...; method = ClassicEM(), display=:none, maxiter=1000, atol=1e-3, robust=false, infos=false)
 
 Do the same as `fit_mle` for each (initial) mixtures in the mix array. Then it selects the one with the largest loglikelihood.
-Warning: It uses try and catch to avoid errors messages in case EM converges toward a singular solution (probably using robust should be enough in most case to avoid errors). 
+Warning: It uses try and catch to avoid errors messages in case EM converges toward a singular solution (probably using robust should be enough in most case to avoid errors).
 """
 function fit_mle(
     mix::AbstractArray{<:MixtureModel},
@@ -148,7 +151,7 @@ function E_step!(
     γ[:, :] = exp.(LL .- c)
 end
 
-# Utilities 
+# Utilities
 
 size_sample(y::AbstractMatrix) = size(y, 2)
 size_sample(y::AbstractVector) = length(y)
diff --git a/src/stochastic_em.jl b/src/stochastic_em.jl
index 3752cb4..c105076 100644
--- a/src/stochastic_em.jl
+++ b/src/stochastic_em.jl
@@ -1,5 +1,5 @@
 """
-    Base.@kwdef struct StochasticEM<:AbstractEM 
+    Base.@kwdef struct StochasticEM<:AbstractEM
         rng::AbstractRNG = Random.GLOBAL_RNG
     end
 The Stochastic EM algorithm was introduced by G. Celeux, and J. Diebolt. in 1985 in [*The SEM Algorithm: A probabilistic teacher algorithm derived from the EM algorithm for the mixture problem*](https://cir.nii.ac.jp/crid/1574231874553755008).
@@ -16,7 +16,7 @@ end
     fit_mle!(α::AbstractVector, dists::AbstractVector{F} where {F<:Distribution}, y::AbstractVecOrMat, method::StochasticEM; display=:none, maxiter=1000, atol=1e-3, robust=false)
 Use the stochastic EM algorithm to update the Distribution `dists` and weights `α` composing a mixture distribution.
 - `robust = true` will prevent the (log)likelihood to overflow to `-∞` or `∞`.
-- `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops. 
+- `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops.
 - `display` value can be `:none`, `:iter`, `:final` to display Loglikelihood evolution at each iterations `:iter` or just the final one `:final`
 """
 function fit_mle!(
@@ -27,6 +27,7 @@ function fit_mle!(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
 )
 
@@ -53,7 +54,7 @@ function fit_mle!(
         # S-step
         ẑ[:] .= [rand(method.rng, Categorical(ℙ...)) for ℙ in eachrow(γ)]
         cat = [findall(ẑ .== k) for k = 1:K]
-        
+
         # M-step
         # using ẑ, maximize (update) the parameters
         α[:] = length.(cat)/N
@@ -70,7 +71,7 @@ function fit_mle!(
         push!(history["logtots"], logtotp)
         history["iterations"] += 1
 
-        if abs(logtotp - logtot) < atol
+        if abs(logtotp - logtot) < atol || (rtol !== nothing && abs(logtotp - logtot) < rtol * (abs(logtot) + abs(logtotp)) / 2)
             (display in [:iter, :final]) &&
                 println("EM converged in ", it, " iterations, final loglikelihood = ", logtotp)
             history["converged"] = true
@@ -100,6 +101,7 @@ function fit_mle!(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
 )
 
@@ -126,7 +128,7 @@ function fit_mle!(
         # S-step
         ẑ = [rand(method.rng, Categorical(ℙ...)) for ℙ in eachrow(γ)]
         cat = [findall(ẑ .== k) for k = 1:K]
-        
+
         # M-step
         # using ẑ, maximize (update) the parameters
         α[:] = length.(cat)/N
@@ -143,7 +145,7 @@ function fit_mle!(
         push!(history["logtots"], logtotp)
         history["iterations"] += 1
 
-        if abs(logtotp - logtot) < atol
+        if abs(logtotp - logtot) < atol || (rtol !== nothing && abs(logtotp - logtot) < rtol * (abs(logtot) + abs(logtotp)) / 2)
             (display in [:iter, :final]) &&
                 println("EM converged in ", it, " iterations, final loglikelihood = ", logtotp)
             history["converged"] = true
@@ -174,6 +176,7 @@ function fit_mle!(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
 )
 
@@ -200,7 +203,7 @@ function fit_mle!(
         # S-step
         ẑ = [rand(method.rng, Categorical(ℙ...)) for ℙ in eachrow(γ)]
         cat = [findall(ẑ .== k) for k = 1:K]
-        
+
         # M-step
         # using ẑ, maximize (update) the parameters
         α[:] = [length(cat[k])*sum(w[cat[k]]) for k in 1:K]/sum(w)
@@ -217,7 +220,7 @@ function fit_mle!(
         push!(history["logtots"], logtotp)
         history["iterations"] += 1
 
-        if abs(logtotp - logtot) < atol
+        if abs(logtotp - logtot) < atol || (rtol !== nothing && abs(logtotp - logtot) < rtol * (abs(logtot) + abs(logtotp)) / 2)
             (display in [:iter, :final]) &&
                 println("EM converged in ", it, " iterations, final loglikelihood = ", logtotp)
             history["converged"] = true
@@ -248,6 +251,7 @@ function fit_mle!(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
 )
 
@@ -274,7 +278,7 @@ function fit_mle!(
         # S-step
         ẑ = [rand(method.rng, Categorical(ℙ...)) for ℙ in eachrow(γ)]
         cat = [findall(ẑ .== k) for k = 1:K]
-        
+
         # M-step
         # using ẑ, maximize (update) the parameters
         α[:] = [sum(w[cat[k]]) for k in 1:K]/sum(w)
@@ -291,7 +295,7 @@ function fit_mle!(
         push!(history["logtots"], logtotp)
         history["iterations"] += 1
 
-        if abs(logtotp - logtot) < atol
+        if abs(logtotp - logtot) < atol || (rtol !== nothing && abs(logtotp - logtot) < rtol * (abs(logtot) + abs(logtotp)) / 2)
             (display in [:iter, :final]) &&
                 println("EM converged in ", it, " iterations, final loglikelihood = ", logtotp)
             history["converged"] = true
diff --git a/test/runtests.jl b/test/runtests.jl
index bdfee57..6aaccd9 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -23,6 +23,15 @@ using Random
     @test isapprox(θ₁, p[1]...; rtol = rtol)
     @test isapprox(α, p[2][1]; rtol = rtol)
     @test isapprox(θ₂, p[2][2]; rtol = rtol)
+
+    # Test rtol
+    mix_mle2 =
+        fit_mle(mix_guess, y; display = :none, rtol = 1e-8, atol = 0, robust = false, infos = false)
+    p = params(mix_mle2)[1]
+    @test isapprox([β, 1 - β], probs(mix_mle2); rtol = rtol)
+    @test isapprox(θ₁, p[1]...; rtol = rtol)
+    @test isapprox(α, p[2][1]; rtol = rtol)
+    @test isapprox(θ₂, p[2][2]; rtol = rtol)
 end
 
 @testset "Stochastic EM Univariate continuous Mixture Exponential + Laplace" begin
@@ -53,6 +62,23 @@ end
     @test isapprox(μ, p[1][1]; rtol = rtol)
     @test isapprox(α, p[2][1]; rtol = rtol)
     @test isapprox(θ₂, p[2][2]; rtol = rtol)
+
+    mix_mle2 = fit_mle(
+        mix_guess,
+        y;
+        display = :none,
+        atol = 0,
+        rtol = 1e-6,
+        robust = false,
+        infos = false,
+        method = StochasticEM(),
+    )
+    p = params(mix_mle2)[1]
+    @test isapprox([β, 1 - β], probs(mix_mle2); rtol = rtol)
+    @test isapprox(θ₁, p[1][2]; rtol = rtol)
+    @test isapprox(μ, p[1][1]; rtol = rtol)
+    @test isapprox(α, p[2][1]; rtol = rtol)
+    @test isapprox(θ₂, p[2][2]; rtol = rtol)
 end
 
 @testset "Multivariate Gaussian Mixture" begin
@@ -143,7 +169,7 @@ end
     α = 1 / 2
     β = 0.3
 
-    rtol = 5e-2 #  
+    rtol = 5e-2 #
     d1 = MixtureModel([Normal(θ₁, σ₁), Normal(θ₂, σ₂)], [α, 1 - α])
     d2 = Normal(θ₀, σ₀)
     mix_true = MixtureModel([d1, d2], [β, 1 - β])
@@ -186,7 +212,7 @@ end
     α = 1 / 2
     β = 0.5
 
-    rtol = 5e-2 # 
+    rtol = 5e-2 #
     d1 = MixtureModel([Normal(θ₁, σ₁), Laplace(θ₂, σ₂)], [α, 1 - α])
     d2 = Normal(θ₀, σ₀)
     mix_true = MixtureModel([d1, d2], [β, 1 - β])

From 76f443551fd1346006c265ab0a3e35684c4bb76b Mon Sep 17 00:00:00 2001
From: Tim Holy <tim.holy@gmail.com>
Date: Thu, 16 Nov 2023 05:05:25 -0600
Subject: [PATCH 2/2] Add a couple missing rtols in docstrings

---
 src/fit_em.jl        | 7 +++++--
 src/stochastic_em.jl | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/fit_em.jl b/src/fit_em.jl
index 861a068..53aecb7 100644
--- a/src/fit_em.jl
+++ b/src/fit_em.jl
@@ -1,5 +1,5 @@
 """
-    fit_mle(mix::MixtureModel, y::AbstractVecOrMat, weights...; method = ClassicEM(), display=:none, maxiter=1000, atol=1e-3, robust=false, infos=false)
+    fit_mle(mix::MixtureModel, y::AbstractVecOrMat, weights...; method = ClassicEM(), display=:none, maxiter=1000, atol=1e-3, rtol=nothing, robust=false, infos=false)
 Use the an Expectation Maximization (EM) algorithm to maximize the Loglikelihood (fit) the mixture with an i.i.d sample `y`.
 The `mix` input is a mixture that is used to initilize the EM algorithm.
 - `weights` when provided, it will compute a weighted version of the EM. (Useful for fitting mixture of mixtures)
@@ -7,6 +7,7 @@ The `mix` input is a mixture that is used to initilize the EM algorithm.
 - `infos = true` returns a `Dict` with informations on the algorithm (converged, iteration number, loglikelihood).
 - `robust = true` will prevent the (log)likelihood to overflow to `-∞` or `∞`.
 - `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops.
+- `rtol` relative tolerance for convergence: the algorithm also stops if `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<rtol*(|ℓ⁽ⁱ⁺¹⁾| + |ℓ⁽ⁱ⁾|)/2`. This relative criterion is skipped when `rtol` is `nothing` (the default).
 - `display` value can be `:none`, `:iter`, `:final` to display Loglikelihood evolution at each iterations `:iter` or just the final one `:final`
 """
 function fit_mle(
@@ -58,7 +59,7 @@ function fit_mle(
 end
 
 """
-    fit_mle(mix::AbstractArray{<:MixtureModel}, y::AbstractVecOrMat, weights...; method = ClassicEM(), display=:none, maxiter=1000, atol=1e-3, robust=false, infos=false)
+    fit_mle(mix::AbstractArray{<:MixtureModel}, y::AbstractVecOrMat, weights...; method = ClassicEM(), display=:none, maxiter=1000, atol=1e-3, rtol=nothing, robust=false, infos=false)
 
 Do the same as `fit_mle` for each (initial) mixtures in the mix array. Then it selects the one with the largest loglikelihood.
 Warning: It uses try and catch to avoid errors messages in case EM converges toward a singular solution (probably using robust should be enough in most case to avoid errors).
@@ -71,6 +72,7 @@ function fit_mle(
     display = :none,
     maxiter = 1000,
     atol = 1e-3,
+    rtol = nothing,
     robust = false,
     infos = false,
 )
@@ -96,6 +98,7 @@ function fit_mle(
                 display = display,
                 maxiter = maxiter,
                 atol = atol,
+                rtol = rtol,
                 robust = robust,
                 infos = true,
             )
diff --git a/src/stochastic_em.jl b/src/stochastic_em.jl
index c105076..4cd4da4 100644
--- a/src/stochastic_em.jl
+++ b/src/stochastic_em.jl
@@ -17,6 +17,7 @@ end
 Use the stochastic EM algorithm to update the Distribution `dists` and weights `α` composing a mixture distribution.
 - `robust = true` will prevent the (log)likelihood to overflow to `-∞` or `∞`.
 - `atol` criteria determining the convergence of the algorithm. If the Loglikelihood difference between two iteration `i` and `i+1` is smaller than `atol` i.e. `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<atol`, the algorithm stops.
+- `rtol` relative tolerance for convergence: the algorithm also stops if `|ℓ⁽ⁱ⁺¹⁾ - ℓ⁽ⁱ⁾|<rtol*(|ℓ⁽ⁱ⁺¹⁾| + |ℓ⁽ⁱ⁾|)/2`. This relative criterion is skipped when `rtol` is `nothing` (the default).
 - `display` value can be `:none`, `:iter`, `:final` to display Loglikelihood evolution at each iterations `:iter` or just the final one `:final`
 """
 function fit_mle!(