JuliaStats · palday · Aug 2, 2024 · Aug 1, 2024 · Aug 1, 2024 · Aug 1, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,11 +1,15 @@
+MixedModels v4.25.3 Release Notes
+==============================
+- Fix a bug in the handling of rank deficiency in the `simulate[!]` code. This has important correctness implications for bootstrapping models with rank-deficient fixed effects (as can happen in the case of partial crossing of the fixed effects / missing cells). [#778]
+
 MixedModels v4.25.2 Release Notes
 ==============================
 - Use `public` keyword so that users don't see unnecessary docstring warnings on 1.11+. [#776]
 - Fix accidental export of `dataset` and `datasets` and make them `public` instead. [#776]
 
 MixedModels v4.25.1 Release Notes
 ==============================
-- Use more sophisticated checks on property names in `restoreoptsum` to allow for optsums saved by pre-v4.25 versions to be used with this version and later. [#775] 
+- Use more sophisticated checks on property names in `restoreoptsum` to allow for optsums saved by pre-v4.25 versions to be used with this version and later. [#775]
 
 MixedModels v4.25 Release Notes
 ==============================
@@ -549,3 +553,4 @@ Package dependencies
 [#773]: https://github.com/JuliaStats/MixedModels.jl/issues/773
 [#775]: https://github.com/JuliaStats/MixedModels.jl/issues/775
 [#776]: https://github.com/JuliaStats/MixedModels.jl/issues/776
+[#778]: https://github.com/JuliaStats/MixedModels.jl/issues/778
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MixedModels"
 uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
 author = ["Phillip Alday <[email protected]>", "Douglas Bates <[email protected]>", "Jose Bayoan Santiago Calderon <[email protected]>"]
-version = "4.25.2"
+version = "4.25.3"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -177,7 +177,7 @@ end
 
 """
     parametricbootstrap([rng::AbstractRNG], nsamp::Integer, m::MixedModel{T}, ftype=T;
-        β = coef(m), σ = m.σ, θ = m.θ, progress=true, optsum_overrides=(;))
+        β = fixef(m), σ = m.σ, θ = m.θ, progress=true, optsum_overrides=(;))
 
 Perform `nsamp` parametric bootstrap replication fits of `m`, returning a `MixedModelBootstrap`.
 
@@ -214,7 +214,7 @@ function parametricbootstrap(
     n::Integer,
     morig::MixedModel{T},
     ftype::Type{<:AbstractFloat}=T;
-    β::AbstractVector=coef(morig),
+    β::AbstractVector=fixef(morig),
     σ=morig.σ,
     θ::AbstractVector=morig.θ,
     use_threads::Bool=false,
@@ -232,9 +232,13 @@ function parametricbootstrap(
     if σ !== missing
         σ = T(σ)
     end
-    β, θ = convert(Vector{T}, β), convert(Vector{T}, θ)
-    βsc, θsc = similar(ftype.(β)), similar(ftype.(θ))
-    p, k = length(β), length(θ)
+    β = convert(Vector{T}, β)
+    θ = convert(Vector{T}, θ)
+    # scratch -- note that this is the length of the unpivoted coef vector 
+    βsc = coef(morig)
+    θsc = zeros(ftype, length(θ))
+    p = length(βsc)
+    k = length(θsc)
     m = deepcopy(morig)
     for (key, val) in pairs(optsum_overrides)
         setfield!(m.optsum, key, val)
@@ -251,7 +255,6 @@ function parametricbootstrap(
     samp = replicate(n; progress) do
         simulate!(rng, m; β, σ, θ)
         refit!(m; progress=false)
-        # @info "" m.optsum.feval
         (
             objective=ftype.(m.objective),
             σ=ismissing(m.σ) ? missing : ftype(m.σ),

diff --git a/src/simulate.jl b/src/simulate.jl
@@ -18,29 +18,33 @@ function simulate(m::MixedModel, args...; kwargs...)
 end
 
 """
-    simulate!(rng::AbstractRNG, m::MixedModel{T}; β=m.β, σ=m.σ, θ=T[])
-    simulate!(m::MixedModel; β=m.β, σ=m.σ, θ=m.θ)
+    simulate!(rng::AbstractRNG, m::MixedModel{T}; β=fixef(m), σ=m.σ, θ=T[])
+    simulate!(m::MixedModel; β=fixef(m), σ=m.σ, θ=m.θ)
 
 Overwrite the response (i.e. `m.trms[end]`) with a simulated response vector from model `m`.
 
 This simulation includes sampling new values for the random effects.
 
+`β` can be specified either as a pivoted, full rank coefficient vector (cf. [`fixef`](@ref))
+or as an unpivoted full dimension coefficient vector (cf. [`coef`](@ref)), where the entries
+corresponding to redundant columns will be ignored.
+
 !!! note
     Note that `simulate!` methods with a `y::AbstractVector` as the first argument
     (besides the RNG) and `simulate` methods return the simulated response. This is
     in contrast to `simulate!` methods with a `m::MixedModel` as the first argument,
     which modify the model's response and return the entire modified model.
 """
 function simulate!(
-    rng::AbstractRNG, m::LinearMixedModel{T}; β=coef(m), σ=m.σ, θ=T[]
+    rng::AbstractRNG, m::LinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]
 ) where {T}
     # XXX should we add support for doing something with weights?
     simulate!(rng, m.y, m; β, σ, θ)
     return unfit!(m)
 end
 
 function simulate!(
-    rng::AbstractRNG, m::GeneralizedLinearMixedModel{T}; β=coef(m), σ=m.σ, θ=T[]
+    rng::AbstractRNG, m::GeneralizedLinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]
 ) where {T}
     # note that these m.resp.y and m.LMM.y will later be synchronized in (re)fit!()
     # but for now we use them as distinct scratch buffers to avoid allocations
@@ -85,7 +89,7 @@ function _rand(rng::AbstractRNG, d::Distribution, location, scale=NaN, n=1)
     return rand(rng, dist) / n
 end
 
-function simulate!(m::MixedModel{T}; β=coef(m), σ=m.σ, θ=T[]) where {T}
+function simulate!(m::MixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]) where {T}
     return simulate!(Random.GLOBAL_RNG, m; β, σ, θ)
 end
 
@@ -121,7 +125,7 @@ function simulate!(
     y::AbstractVector,
     m::LinearMixedModel,
     newdata::Tables.ColumnTable;
-    β=m.β,
+    β=fixef(m),
     σ=m.σ,
     θ=m.θ,
 )
@@ -147,20 +151,18 @@ function simulate!(
 end
 
 function simulate!(
-    rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=m.β, σ=m.σ, θ=m.θ
+    rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=m.θ
 ) where {T}
-    length(β) == length(pivot(m)) || length(β) == m.feterm.rank ||
+    length(β) == length(pivot(m)) || length(β) == rank(m) ||
         throw(ArgumentError("You must specify all (non-singular) βs"))
 
     β = convert(Vector{T}, β)
     σ = T(σ)
     θ = convert(Vector{T}, θ)
     isempty(θ) || setθ!(m, θ)
 
-    if length(β) ≠ length(pivot(m))
-        padding = length(pivot(m)) - rank(m)
-        append!(β, fill(-0.0, padding))
-        invpermute!(β, pivot(m))
+    if length(β) == length(pivot(m))
+        β = view(view(β, pivot(m)), 1:rank(m))
     end
 
     # initialize y to standard normal
@@ -172,15 +174,15 @@ function simulate!(
     end
 
     # scale by σ and add fixed-effects contribution
-    return mul!(y, m.X, β, one(T), σ)
+    return mul!(y, fullrankx(m), β, one(T), σ)
 end
 
 function simulate!(
     rng::AbstractRNG,
     y::AbstractVector,
     m::GeneralizedLinearMixedModel,
     newdata::Tables.ColumnTable;
-    β=m.β,
+    β=fixef(m),
     σ=m.σ,
     θ=m.θ,
 )
@@ -209,7 +211,7 @@ function simulate!(
     rng::AbstractRNG,
     y::AbstractVector,
     m::GeneralizedLinearMixedModel{T};
-    β=m.β,
+    β=fixef(m),
     σ=m.σ,
     θ=m.θ,
 ) where {T}
@@ -250,7 +252,7 @@ function _simulate!(
 
     if length(β) == length(pivot(m))
         # unlike LMM, GLMM stores the truncated, pivoted vector directly
-        β = view(β, view(pivot(m), 1:(rank(m))))
+        β = view(view(β, pivot(m)), 1:rank(m))
     end
     fast = (length(m.θ) == length(m.optsum.final))
     setpar! = fast ? setθ! : setβθ!

diff --git a/test/bootstrap.jl b/test/bootstrap.jl
@@ -226,7 +226,7 @@ end
         rng = MersenneTwister(0);
         x = rand(rng, 100);
         data = (x = x, x2 = 1.5 .* x, y = rand(rng, [0,1], 100), z = repeat('A':'T', 5))
-        @testset "$family" for family in [Normal(), Bernoulli()] 
+        @testset "$family" for family in [Normal(), Bernoulli()]
             model = @suppress fit(MixedModel, @formula(y ~ x + x2 + (1|z)), data, family; progress=false)
             boot = quickboot(model, 10)
 
@@ -242,6 +242,33 @@ end
             yf = simulate(StableRNG(1), model; β=fixef(model))
             @test all(x -> isapprox(x...), zip(yc, yf))
         end
+
+        @testset "partial crossing" begin
+            id = lpad.(string.(1:40), 2, "0")
+            B = ["b0", "b1", "b2"]
+            C = ["c0", "c1", "c2", "c3", "c4"]
+            df = DataFrame(reshape(collect(Iterators.product(B, C, id)), :), [:b, :c, :id])
+            df[!, :y] .= 0
+            filter!(df) do row
+                b = last(row.b)
+                c = last(row.c)
+                return b != c
+            end
+
+            m = LinearMixedModel(@formula(y ~ 1 + b * c + (1|id)), df)
+            β = 1:rank(m)
+            σ = 1
+            simulate!(StableRNG(628), m; β, σ)
+            fit!(m)
+
+            boot = parametricbootstrap(StableRNG(271828), 1000,  m);
+            bootci = DataFrame(shortestcovint(boot))
+            filter!(:group => ismissing, bootci)
+            select!(bootci, :names => disallowmissing => :coef, :lower, :upper)
+            transform!(bootci, [:lower, :upper] => ByRow(middle) => :mean)
+
+            @test all(x -> isapprox(x[1], x[2]; atol=0.1),  zip(coef(m), bootci.mean))
+        end
     end
 end