Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rework passing and handling of FE coefficients to simulate! for rank deficient models #778

Merged
merged 9 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
MixedModels v4.25.3 Release Notes
==============================
- Fix a bug in the handling of rank deficiency in the `simulate[!]` code. This has important correctness implications for bootstrapping models with rank-deficient fixed effects (as can happen in the case of partial crossing of the fixed effects / missing cells). [#778]

MixedModels v4.25.2 Release Notes
==============================
- Use `public` keyword so that users don't see unnecessary docstring warnings on 1.11+. [#776]
- Fix accidental export of `dataset` and `datasets` and make them `public` instead. [#776]

MixedModels v4.25.1 Release Notes
==============================
- Use more sophisticated checks on property names in `restoreoptsum` to allow for optsums saved by pre-v4.25 versions to be used with this version and later. [#775]
- Use more sophisticated checks on property names in `restoreoptsum` to allow for optsums saved by pre-v4.25 versions to be used with this version and later. [#775]

MixedModels v4.25 Release Notes
==============================
Expand Down Expand Up @@ -549,3 +553,4 @@ Package dependencies
[#773]: https://github.com/JuliaStats/MixedModels.jl/issues/773
[#775]: https://github.com/JuliaStats/MixedModels.jl/issues/775
[#776]: https://github.com/JuliaStats/MixedModels.jl/issues/776
[#778]: https://github.com/JuliaStats/MixedModels.jl/issues/778
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "MixedModels"
uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
author = ["Phillip Alday <[email protected]>", "Douglas Bates <[email protected]>", "Jose Bayoan Santiago Calderon <[email protected]>"]
version = "4.25.2"
version = "4.25.3"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
Expand Down
15 changes: 9 additions & 6 deletions src/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ end

"""
parametricbootstrap([rng::AbstractRNG], nsamp::Integer, m::MixedModel{T}, ftype=T;
β = coef(m), σ = m.σ, θ = m.θ, progress=true, optsum_overrides=(;))
β = fixef(m), σ = m.σ, θ = m.θ, progress=true, optsum_overrides=(;))

Perform `nsamp` parametric bootstrap replication fits of `m`, returning a `MixedModelBootstrap`.

Expand Down Expand Up @@ -214,7 +214,7 @@ function parametricbootstrap(
n::Integer,
morig::MixedModel{T},
ftype::Type{<:AbstractFloat}=T;
β::AbstractVector=coef(morig),
β::AbstractVector=fixef(morig),
σ=morig.σ,
θ::AbstractVector=morig.θ,
use_threads::Bool=false,
Expand All @@ -232,9 +232,13 @@ function parametricbootstrap(
if σ !== missing
σ = T(σ)
end
β, θ = convert(Vector{T}, β), convert(Vector{T}, θ)
βsc, θsc = similar(ftype.(β)), similar(ftype.(θ))
p, k = length(β), length(θ)
β = convert(Vector{T}, β)
θ = convert(Vector{T}, θ)
# scratch -- note that this is the length of the unpivoted coef vector
βsc = coef(morig)
θsc = zeros(ftype, length(θ))
p = length(βsc)
k = length(θsc)
m = deepcopy(morig)
for (key, val) in pairs(optsum_overrides)
setfield!(m.optsum, key, val)
Expand All @@ -251,7 +255,6 @@ function parametricbootstrap(
samp = replicate(n; progress) do
simulate!(rng, m; β, σ, θ)
refit!(m; progress=false)
# @info "" m.optsum.feval
(
objective=ftype.(m.objective),
σ=ismissing(m.σ) ? missing : ftype(m.σ),
Expand Down
34 changes: 18 additions & 16 deletions src/simulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,33 @@ function simulate(m::MixedModel, args...; kwargs...)
end

"""
simulate!(rng::AbstractRNG, m::MixedModel{T}; β=m.β, σ=m.σ, θ=T[])
simulate!(m::MixedModel; β=m.β, σ=m.σ, θ=m.θ)
simulate!(rng::AbstractRNG, m::MixedModel{T}; β=fixef(m), σ=m.σ, θ=T[])
simulate!(m::MixedModel; β=fixef(m), σ=m.σ, θ=m.θ)
Comment on lines +21 to +22
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's worth documenting somewhere the difference between the possible kinds of inputs to this function. coef-style (all values in input model matrix order, pivoted out betas are -0.0) vs. fixef-style (only betas for full-rank columns, in pivot order)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


Overwrite the response (i.e. `m.trms[end]`) with a simulated response vector from model `m`.

This simulation includes sampling new values for the random effects.

`β` can be specified either as a pivoted, full rank coefficient vector (cf. [`fixef`](@ref))
or as an unpivoted full dimension coefficient vector (cf. [`coef`](@ref)), where the entries
corresponding to redundant columns will be ignored.

!!! note
Note that `simulate!` methods with a `y::AbstractVector` as the first argument
(besides the RNG) and `simulate` methods return the simulated response. This is
in contrast to `simulate!` methods with a `m::MixedModel` as the first argument,
which modify the model's response and return the entire modified model.
"""
function simulate!(
rng::AbstractRNG, m::LinearMixedModel{T}; β=coef(m), σ=m.σ, θ=T[]
rng::AbstractRNG, m::LinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]
) where {T}
# XXX should we add support for doing something with weights?
simulate!(rng, m.y, m; β, σ, θ)
return unfit!(m)
end

function simulate!(
rng::AbstractRNG, m::GeneralizedLinearMixedModel{T}; β=coef(m), σ=m.σ, θ=T[]
rng::AbstractRNG, m::GeneralizedLinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]
) where {T}
# note that these m.resp.y and m.LMM.y will later be synchronized in (re)fit!()
# but for now we use them as distinct scratch buffers to avoid allocations
Expand Down Expand Up @@ -85,7 +89,7 @@ function _rand(rng::AbstractRNG, d::Distribution, location, scale=NaN, n=1)
return rand(rng, dist) / n
end

function simulate!(m::MixedModel{T}; β=coef(m), σ=m.σ, θ=T[]) where {T}
function simulate!(m::MixedModel{T}; β=fixef(m), σ=m.σ, θ=T[]) where {T}
return simulate!(Random.GLOBAL_RNG, m; β, σ, θ)
end

Expand Down Expand Up @@ -121,7 +125,7 @@ function simulate!(
y::AbstractVector,
m::LinearMixedModel,
newdata::Tables.ColumnTable;
β=m.β,
β=fixef(m),
σ=m.σ,
θ=m.θ,
)
Expand All @@ -147,20 +151,18 @@ function simulate!(
end

function simulate!(
rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=m.β, σ=m.σ, θ=m.θ
rng::AbstractRNG, y::AbstractVector, m::LinearMixedModel{T}; β=fixef(m), σ=m.σ, θ=m.θ
) where {T}
length(β) == length(pivot(m)) || length(β) == m.feterm.rank ||
length(β) == length(pivot(m)) || length(β) == rank(m) ||
throw(ArgumentError("You must specify all (non-singular) βs"))

β = convert(Vector{T}, β)
σ = T(σ)
θ = convert(Vector{T}, θ)
isempty(θ) || setθ!(m, θ)

if length(β) ≠ length(pivot(m))
padding = length(pivot(m)) - rank(m)
append!(β, fill(-0.0, padding))
invpermute!(β, pivot(m))
if length(β) == length(pivot(m))
β = view(view(β, pivot(m)), 1:rank(m))
Comment on lines +164 to +165
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the branch that handles the coef case where we need to pull out the non--0.0 values in the correct order to multiply by fullrank(m) right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

correct.

end

# initialize y to standard normal
Expand All @@ -172,15 +174,15 @@ function simulate!(
end

# scale by σ and add fixed-effects contribution
return mul!(y, m.X, β, one(T), σ)
return mul!(y, fullrankx(m), β, one(T), σ)
end

function simulate!(
rng::AbstractRNG,
y::AbstractVector,
m::GeneralizedLinearMixedModel,
newdata::Tables.ColumnTable;
β=m.β,
β=fixef(m),
σ=m.σ,
θ=m.θ,
)
Expand Down Expand Up @@ -209,7 +211,7 @@ function simulate!(
rng::AbstractRNG,
y::AbstractVector,
m::GeneralizedLinearMixedModel{T};
β=m.β,
β=fixef(m),
σ=m.σ,
θ=m.θ,
) where {T}
Expand Down Expand Up @@ -250,7 +252,7 @@ function _simulate!(

if length(β) == length(pivot(m))
# unlike LMM, GLMM stores the truncated, pivoted vector directly
β = view(β, view(pivot(m), 1:(rank(m))))
β = view(view(β, pivot(m)), 1:rank(m))
end
fast = (length(m.θ) == length(m.optsum.final))
setpar! = fast ? setθ! : setβθ!
Expand Down
29 changes: 28 additions & 1 deletion test/bootstrap.jl
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ end
rng = MersenneTwister(0);
x = rand(rng, 100);
data = (x = x, x2 = 1.5 .* x, y = rand(rng, [0,1], 100), z = repeat('A':'T', 5))
@testset "$family" for family in [Normal(), Bernoulli()]
@testset "$family" for family in [Normal(), Bernoulli()]
model = @suppress fit(MixedModel, @formula(y ~ x + x2 + (1|z)), data, family; progress=false)
boot = quickboot(model, 10)

Expand All @@ -242,6 +242,33 @@ end
yf = simulate(StableRNG(1), model; β=fixef(model))
@test all(x -> isapprox(x...), zip(yc, yf))
end

@testset "partial crossing" begin
id = lpad.(string.(1:40), 2, "0")
B = ["b0", "b1", "b2"]
C = ["c0", "c1", "c2", "c3", "c4"]
df = DataFrame(reshape(collect(Iterators.product(B, C, id)), :), [:b, :c, :id])
df[!, :y] .= 0
filter!(df) do row
b = last(row.b)
c = last(row.c)
return b != c
end

m = LinearMixedModel(@formula(y ~ 1 + b * c + (1|id)), df)
β = 1:rank(m)
σ = 1
simulate!(StableRNG(628), m; β, σ)
fit!(m)

boot = parametricbootstrap(StableRNG(271828), 1000, m);
bootci = DataFrame(shortestcovint(boot))
filter!(:group => ismissing, bootci)
select!(bootci, :names => disallowmissing => :coef, :lower, :upper)
transform!(bootci, [:lower, :upper] => ByRow(middle) => :mean)

@test all(x -> isapprox(x[1], x[2]; atol=0.1), zip(coef(m), bootci.mean))
end
end
end

Expand Down
Loading