From 19b90aa0384100fa9c3a38db631afbb89287b848 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 2 Aug 2024 17:34:04 +0000 Subject: [PATCH] rework passing and handling of FE coefficients to `simulate`! for rank deficient models (#778) * rework passing and handling of FE coefficients to `simulate`! for rank deficient models * add test * NEWS * version bump --- NEWS.md | 7 ++++++- Project.toml | 2 +- src/bootstrap.jl | 15 +++++++++------ src/simulate.jl | 34 ++++++++++++++++++---------------- test/bootstrap.jl | 29 ++++++++++++++++++++++++++++- 5 files changed, 62 insertions(+), 25 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3c413b95c..1a53f74ee 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +MixedModels v4.25.3 Release Notes +============================== +- Fix a bug in the handling of rank deficiency in the `simulate[!]` code. This has important correctness implications for bootstrapping models with rank-deficient fixed effects (as can happen in the case of partial crossing of the fixed effects / missing cells). [#778] + MixedModels v4.25.2 Release Notes ============================== - Use `public` keyword so that users don't see unnecessary docstring warnings on 1.11+. [#776] @@ -5,7 +9,7 @@ MixedModels v4.25.2 Release Notes MixedModels v4.25.1 Release Notes ============================== -- Use more sophisticated checks on property names in `restoreoptsum` to allow for optsums saved by pre-v4.25 versions to be used with this version and later. [#775] +- Use more sophisticated checks on property names in `restoreoptsum` to allow for optsums saved by pre-v4.25 versions to be used with this version and later. [#775] MixedModels v4.25 Release Notes ============================== @@ -549,3 +553,4 @@ Package dependencies [#773]: https://github.com/JuliaStats/MixedModels.jl/issues/773 [#775]: https://github.com/JuliaStats/MixedModels.jl/issues/775 [#776]: https://github.com/JuliaStats/MixedModels.jl/issues/776 +[#778]: https://github.com/JuliaStats/MixedModels.jl/issues/778 diff --git a/Project.toml b/Project.toml index d66f85475..122ab02b0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MixedModels" uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" author = ["Phillip Alday ", "Douglas Bates ", "Jose Bayoan Santiago Calderon "] -version = "4.25.2" +version = "4.25.3" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" diff --git a/src/bootstrap.jl b/src/bootstrap.jl index 269c5567f..c90dc33a7 100644 --- a/src/bootstrap.jl +++ b/src/bootstrap.jl @@ -177,7 +177,7 @@ end """ parametricbootstrap([rng::AbstractRNG], nsamp::Integer, m::MixedModel{T}, ftype=T; - β = coef(m), σ = m.σ, θ = m.θ, progress=true, optsum_overrides=(;)) + β = fixef(m), σ = m.σ, θ = m.θ, progress=true, optsum_overrides=(;)) Perform `nsamp` parametric bootstrap replication fits of `m`, returning a `MixedModelBootstrap`. @@ -214,7 +214,7 @@ function parametricbootstrap( n::Integer, morig::MixedModel{T}, ftype::Type{<:AbstractFloat}=T; - β::AbstractVector=coef(morig), + β::AbstractVector=fixef(morig), σ=morig.σ, θ::AbstractVector=morig.θ, use_threads::Bool=false, @@ -232,9 +232,13 @@ function parametricbootstrap( if σ !== missing σ = T(σ) end - β, θ = convert(Vector{T}, β), convert(Vector{T}, θ) - βsc, θsc = similar(ftype.(β)), similar(ftype.(θ)) - p, k = length(β), length(θ) + β = convert(Vector{T}, β) + θ = convert(Vector{T}, θ) + # scratch -- note that this is the length of the unpivoted coef vector + βsc = coef(morig) + θsc = zeros(ftype, length(θ)) + p = length(βsc) + k = length(θsc) m = deepcopy(morig) for (key, val) in pairs(optsum_overrides) setfield!(m.optsum, key, val) @@ -251,7 +255,6 @@ function parametricbootstrap( samp = replicate(n; progress) do simulate!(rng, m; β, σ, θ) refit!(m; progress=false) - # @info "" m.optsum.feval ( objective=ftype.(m.objective), σ=ismissing(m.σ) ? missing : ftype(m.σ), diff --git a/src/simulate.jl b/src/simulate.jl index 9fbeb4ec9..035a8c02c 100644 --- a/src/simulate.jl +++ b/src/simulate.jl @@ -18,13 +18,17 @@ function simulate(m::MixedModel, args...; kwargs...) end """ - simulate!(rng::AbstractRNG, m::MixedModel{T}; β=m.β, σ=m.σ, θ=T[]) - simulate!(m::MixedModel; β=m.β, σ=m.σ, θ=m.θ) + simulate!(rng::AbstractRNG, m::MixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]) + simulate!(m::MixedModel; β=fixef(m), σ=m.σ, θ=m.θ) Overwrite the response (i.e. `m.trms[end]`) with a simulated response vector from model `m`. This simulation includes sampling new values for the random effects. +`β` can be specified either as a pivoted, full rank coefficient vector (cf. [`fixef`](@ref)) +or as an unpivoted full dimension coefficient vector (cf. [`coef`](@ref)), where the entries +corresponding to redundant columns will be ignored. + !!! note Note that `simulate!` methods with a `y::AbstractVector` as the first argument (besides the RNG) and `simulate` methods return the simulated response. This is @@ -32,7 +36,7 @@ This simulation includes sampling new values for the random effects. which modify the model's response and return the entire modified model. """ function simulate!( - rng::AbstractRNG, m::LinearMixedModel{T}; β=coef(m), σ=m.σ, θ=T[] + rng::AbstractRNG, m::LinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=T[] ) where {T} # XXX should we add support for doing something with weights? simulate!(rng, m.y, m; β, σ, θ) @@ -40,7 +44,7 @@ function simulate!( end function simulate!( - rng::AbstractRNG, m::GeneralizedLinearMixedModel{T}; β=coef(m), σ=m.σ, θ=T[] + rng::AbstractRNG, m::GeneralizedLinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=T[] ) where {T} # note that these m.resp.y and m.LMM.y will later be synchronized in (re)fit!() # but for now we use them as distinct scratch buffers to avoid allocations @@ -85,7 +89,7 @@ function _rand(rng::AbstractRNG, d::Distribution, location, scale=NaN, n=1) return rand(rng, dist) / n end -function simulate!(m::MixedModel{T}; β=coef(m), σ=m.σ, θ=T[]) where {T} +function simulate!(m::MixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]) where {T} return simulate!(Random.GLOBAL_RNG, m; β, σ, θ) end @@ -121,7 +125,7 @@ function simulate!( y::AbstractVector, m::LinearMixedModel, newdata::Tables.ColumnTable; - β=m.β, + β=fixef(m), σ=m.σ, θ=m.θ, ) @@ -147,9 +151,9 @@ function simulate!( end function simulate!( - rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=m.β, σ=m.σ, θ=m.θ + rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=m.θ ) where {T} - length(β) == length(pivot(m)) || length(β) == m.feterm.rank || + length(β) == length(pivot(m)) || length(β) == rank(m) || throw(ArgumentError("You must specify all (non-singular) βs")) β = convert(Vector{T}, β) @@ -157,10 +161,8 @@ function simulate!( θ = convert(Vector{T}, θ) isempty(θ) || setθ!(m, θ) - if length(β) ≠ length(pivot(m)) - padding = length(pivot(m)) - rank(m) - append!(β, fill(-0.0, padding)) - invpermute!(β, pivot(m)) + if length(β) == length(pivot(m)) + β = view(view(β, pivot(m)), 1:rank(m)) end # initialize y to standard normal @@ -172,7 +174,7 @@ function simulate!( end # scale by σ and add fixed-effects contribution - return mul!(y, m.X, β, one(T), σ) + return mul!(y, fullrankx(m), β, one(T), σ) end function simulate!( @@ -180,7 +182,7 @@ function simulate!( y::AbstractVector, m::GeneralizedLinearMixedModel, newdata::Tables.ColumnTable; - β=m.β, + β=fixef(m), σ=m.σ, θ=m.θ, ) @@ -209,7 +211,7 @@ function simulate!( rng::AbstractRNG, y::AbstractVector, m::GeneralizedLinearMixedModel{T}; - β=m.β, + β=fixef(m), σ=m.σ, θ=m.θ, ) where {T} @@ -250,7 +252,7 @@ function _simulate!( if length(β) == length(pivot(m)) # unlike LMM, GLMM stores the truncated, pivoted vector directly - β = view(β, view(pivot(m), 1:(rank(m)))) + β = view(view(β, pivot(m)), 1:rank(m)) end fast = (length(m.θ) == length(m.optsum.final)) setpar! = fast ? setθ! : setβθ! diff --git a/test/bootstrap.jl b/test/bootstrap.jl index 576aad918..49c28441f 100644 --- a/test/bootstrap.jl +++ b/test/bootstrap.jl @@ -226,7 +226,7 @@ end rng = MersenneTwister(0); x = rand(rng, 100); data = (x = x, x2 = 1.5 .* x, y = rand(rng, [0,1], 100), z = repeat('A':'T', 5)) - @testset "$family" for family in [Normal(), Bernoulli()] + @testset "$family" for family in [Normal(), Bernoulli()] model = @suppress fit(MixedModel, @formula(y ~ x + x2 + (1|z)), data, family; progress=false) boot = quickboot(model, 10) @@ -242,6 +242,33 @@ end yf = simulate(StableRNG(1), model; β=fixef(model)) @test all(x -> isapprox(x...), zip(yc, yf)) end + + @testset "partial crossing" begin + id = lpad.(string.(1:40), 2, "0") + B = ["b0", "b1", "b2"] + C = ["c0", "c1", "c2", "c3", "c4"] + df = DataFrame(reshape(collect(Iterators.product(B, C, id)), :), [:b, :c, :id]) + df[!, :y] .= 0 + filter!(df) do row + b = last(row.b) + c = last(row.c) + return b != c + end + + m = LinearMixedModel(@formula(y ~ 1 + b * c + (1|id)), df) + β = 1:rank(m) + σ = 1 + simulate!(StableRNG(628), m; β, σ) + fit!(m) + + boot = parametricbootstrap(StableRNG(271828), 1000, m); + bootci = DataFrame(shortestcovint(boot)) + filter!(:group => ismissing, bootci) + select!(bootci, :names => disallowmissing => :coef, :lower, :upper) + transform!(bootci, [:lower, :upper] => ByRow(middle) => :mean) + + @test all(x -> isapprox(x[1], x[2]; atol=0.1), zip(coef(m), bootci.mean)) + end end end