Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ddof to PowerDivergenceTest and ChisqTest to allow changes to degrees of freedom #288

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
169 changes: 104 additions & 65 deletions src/power_divergence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ function testname(x::PowerDivergenceTest)
end

# parameter of interest: name, value under h0, point estimate
population_param_of_interest(x::PowerDivergenceTest) = ("Multinomial Probabilities", x.theta0, x.thetahat)
population_param_of_interest(x::PowerDivergenceTest) = ("Multinomial Probabilities",
x.theta0, x.thetahat)
default_tail(test::PowerDivergenceTest) = :right

pvalue(x::PowerDivergenceTest; tail=:right) = pvalue(Chisq(x.df),x.stat; tail=tail)
Expand All @@ -49,8 +50,8 @@ pvalue(x::PowerDivergenceTest; tail=:right) = pvalue(Chisq(x.df),x.stat; tail=ta
Compute a confidence interval with coverage `level` for multinomial proportions using
one of the following methods. Possible values for `method` are:

- `:auto` (default): If the minimum of the expected cell counts exceeds 100, Quesenberry-Hurst
intervals are used, otherwise Sison-Glaz.
- `:auto` (default): If the minimum of the expected cell counts exceeds 100,
Quesenberry-Hurst intervals are used, otherwise Sison-Glaz.
- `:sison_glaz`: Sison-Glaz intervals
- `:bootstrap`: Bootstrap intervals
- `:quesenberry_hurst`: Quesenberry-Hurst intervals
Expand All @@ -59,8 +60,9 @@ one of the following methods. Possible values for `method` are:
# References

* Agresti, Alan. Categorical Data Analysis, 3rd Edition. Wiley, 2013.
* Sison, C.P and Glaz, J. Simultaneous confidence intervals and sample size determination
for multinomial proportions. Journal of the American Statistical Association,
* Sison, C.P and Glaz, J. Simultaneous confidence intervals and sample size
determination for multinomial proportions. Journal of the American Statistical
Association,
90:366-369, 1995.
* Quesensberry, C.P. and Hurst, D.C. Large Sample Simultaneous Confidence Intervals for
Multinational Proportions. Technometrics, 6:191-195, 1964.
Expand Down Expand Up @@ -102,28 +104,35 @@ end

# Bootstrap
function ci_bootstrap(x::PowerDivergenceTest,alpha::Float64, iters::Int64)
m = mapslices(x -> quantile(x, [alpha / 2, 1 - alpha / 2]), rand(Multinomial(x.n, convert(Vector{Float64}, x.thetahat)),iters) / x.n, dims=2)
Tuple{Float64,Float64}[(boundproportion(m[i,1]), boundproportion(m[i,2])) for i in 1:length(x.thetahat)]
m = mapslices(x -> quantile(x, [alpha / 2, 1 - alpha / 2]), rand(Multinomial(x.n,
convert(Vector{Float64}, x.thetahat)),iters) / x.n, dims=2)
Tuple{Float64,Float64}[(boundproportion(m[i,1]), boundproportion(m[i,2])) for i in
1:length(x.thetahat)]
end

# Quesenberry, Hurst (1964)
function ci_quesenberry_hurst(x::PowerDivergenceTest, alpha::Float64; GC::Bool=true)
m = length(x.thetahat)
cv = GC ? quantile(Chisq(1), 1 - alpha / m) : quantile(Chisq(m - 1), 1 - alpha)

u = (cv .+ 2 .* x.observed .+ sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .- x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv))
l = (cv .+ 2 .* x.observed .- sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .- x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv))
u = (cv .+ 2 .* x.observed .+ sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .-
x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv))
l = (cv .+ 2 .* x.observed .- sqrt.(cv .* (cv .+ 4 .* x.observed .* (x.n .-
x.observed) ./ x.n))) ./ (2 .* (x.n .+ cv))
Tuple{Float64,Float64}[(boundproportion(l[j]), boundproportion(u[j])) for j in 1:m]
end

# asymptotic simultaneous confidence interval
# Gold (1963)
function ci_gold(x::PowerDivergenceTest, alpha::Float64; correct::Bool=true, GC::Bool=true)
function ci_gold(x::PowerDivergenceTest, alpha::Float64; correct::Bool=true,
GC::Bool=true)
m = length(x.thetahat)
cv = GC ? quantile(Chisq(1), 1 - alpha / 2m) : quantile(Chisq(m - 1), 1 - alpha / 2)

u = [x.thetahat[j] + sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) + ifelse(correct, inv(2x.n), 0) for j in 1:m]
l = [x.thetahat[j] - sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) - ifelse(correct, inv(2x.n), 0) for j in 1:m]
u = [x.thetahat[j] + sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) +
ifelse(correct, inv(2x.n), 0) for j in 1:m]
l = [x.thetahat[j] - sqrt.(cv * x.thetahat[j] * (1 - x.thetahat[j]) / x.n) -
ifelse(correct, inv(2x.n), 0) for j in 1:m]
Tuple{Float64,Float64}[ (boundproportion(l[j]), boundproportion(u[j])) for j in 1:m]
end

Expand Down Expand Up @@ -172,7 +181,8 @@ function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Boo
m1[i] = mu[1]
m2[i] = mu[2] + mu[1] - mu[1]^2
m3[i] = mu[3] + mu[2] * (3 - 3mu[1]) + (mu[1] - 3mu[1]^2 + 2mu[1]^3)
m4[i] = mu[4] + mu[3] * (6 - 4mu[1]) + mu[2] * (7 - 12mu[1] + 6mu[1]^2) + mu[1] - 4mu[1]^2 + 6mu[1]^3 - 3mu[1]^4
m4[i] = (mu[4] + mu[3] * (6 - 4mu[1]) + mu[2] * (7 - 12mu[1] + 6mu[1]^2) +
mu[1] - 4mu[1]^2 + 6mu[1]^3 - 3mu[1]^4)
m5[i] = den
end
for i in 1:k
Expand All @@ -183,7 +193,8 @@ function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Boo
g1 = s3 / s2^(3/2)
g2 = s4 / s2^2

poly = 1 + g1 * (z^3 - 3z) / 6 + g2 * (z^4 - 6z^2 + 3) / 24 + g1^2 * (z^6 - 15z^4 + 45z^2 - 15) / 72
poly = 1 + g1 * (z^3 - 3z) / 6 + g2 * (z^4 - 6z^2 + 3) / 24 + g1^2 * (z^6 - 15z^4
+ 45z^2 - 15) / 72
f = poly * exp(-z^2 / 2) / sqrt(2π)
probx = prod(m5)

Expand Down Expand Up @@ -220,45 +231,55 @@ function ci_sison_glaz(x::PowerDivergenceTest, alpha::Float64; skew_correct::Boo
out[i,5] = min(x.thetahat[i] + cn + onen, 1)
end
if skew_correct
return Tuple{Float64,Float64}[(boundproportion(out[i,2]), boundproportion(out[i,3])) for i in 1:k]
return Tuple{Float64,Float64}[(boundproportion(out[i,2]),
boundproportion(out[i,3])) for i in 1:k]
else
return Tuple{Float64,Float64}[(boundproportion(out[i,4]), boundproportion(out[i,5])) for i in 1:k]
return Tuple{Float64,Float64}[(boundproportion(out[i,4]),
boundproportion(out[i,5])) for i in 1:k]
end
end

# power divergence statistic for testing goodness of fit
# Cressie and Read 1984; Read and Cressie 1988
# lambda = 1: Pearson's Chi-square statistic
# lambda -> 0: Converges to Likelihood Ratio test stat
# lambda -> -1: Converges to minimum discrimination information statistic (Gokhale, Kullback 1978)
# lambda -> -1: Converges to minimum discrimination information statistic (Gokhale,
# Kullback 1978)
# lambda = -2: Neyman Modified chi-squared statistic (Neyman 1949)
# lambda = -.5: Freeman-Tukey statistic (Freeman, Tukey 1950)

# Under regularity conditions, their asymptotic distributions are all the same (Drost 1989)
# Under regularity conditions, their asymptotic distributions are all the same (Drost
# 1989)
# Chi-squared null approximation works best for lambda near 2/3
"""
PowerDivergenceTest(x[, y]; lambda = 1.0, theta0 = ones(length(x))/length(x))
PowerDivergenceTest(x[, y]; lambda = 1.0, theta0 = ones(length(x))/length(x), ddof =
0)

Perform a Power Divergence test.

If `y` is not given and `x` is a matrix with one row or column, or `x` is a vector, then
a goodness-of-fit test is performed (`x` is treated as a one-dimensional contingency
table). In this case, the hypothesis tested is whether the population probabilities equal
those in `theta0`, or are all equal if `theta0` is not given.
those in `theta0`, or are all equal if `theta0` is not given.


If `x` is a matrix with at least two rows and columns, it is taken as a two-dimensional
contingency table. Otherwise, `x` and `y` must be vectors of the same length. The contingency
table is calculated using the `counts` function from the `StatsBase` package. Then the power
divergence test is conducted under the null hypothesis that the joint distribution of the
cell counts in a 2-dimensional contingency table is the product of the row and column
marginals.
contingency table. Otherwise, `x` and `y` must be vectors of the same length. The
contingency table is calculated using the `counts` function from the `StatsBase` package.
Then the power divergence test is conducted under the null hypothesis that the joint
distribution of the cell counts in a 2-dimensional contingency table is the product of the
row and column marginals.

Note that the entries of `x` (and `y` if provided) must be non-negative integers.

Computed confidence intervals by default are Quesenberry-Hurst intervals
if the minimum of the expected cell counts exceeds 100, and Sison-Glaz intervals otherwise.
See the [`confint(::PowerDivergenceTest)`](@ref) documentation for a list of
supported methods to compute confidence intervals.
The `ddof` parameter is the "delta degrees of freedom" adjustment to the number of degrees
of freedom used for calculation of p-values. The number of degrees of freedom is decreased
by `ddof`.

Computed confidence intervals by default are Quesenberry-Hurst intervals if the minimum of
the expected cell counts exceeds 100, and Sison-Glaz intervals otherwise. See the
[`confint(::PowerDivergenceTest)`](@ref) documentation for a list of supported methods to
compute confidence intervals.

The power divergence test is given by
```math
Expand All @@ -284,31 +305,36 @@ Implements: [`pvalue`](@ref), [`confint(::PowerDivergenceTest)`](@ref)

* Agresti, Alan. Categorical Data Analysis, 3rd Edition. Wiley, 2013.
"""
function PowerDivergenceTest(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat}
function PowerDivergenceTest(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector{U} =
ones(length(x))/length(x), ddof::Integer=0) where {T<:Integer,U<:AbstractFloat}

nrows, ncols = size(x)
n = sum(x)

#validate date
(any(x .< 0) || any(!isfinite, x)) && throw(ArgumentError("all entries must be nonnegative and finite"))
(any(x .< 0) || any(!isfinite, x)) && throw(ArgumentError("all entries must be
nonnegative and finite"))
n == 0 && throw(ArgumentError("at least one entry must be positive"))
(!isfinite(nrows) || !isfinite(ncols) || !isfinite(nrows*ncols)) && throw(ArgumentError("invalid number of rows or columns"))
(!isfinite(nrows) || !isfinite(ncols) || !isfinite(nrows*ncols)) &&
throw(ArgumentError("invalid number of rows or columns"))

if nrows > 1 && ncols > 1
rowsums = sum(x, dims=2)
colsums = sum(x, dims=1)
df = (nrows - 1) * (ncols - 1)
df = (nrows - 1) * (ncols - 1) - ddof
thetahat = x ./ n
xhat = rowsums * colsums / n
theta0 = xhat / n
V = Float64[(colsums[j]/n) * (rowsums[i]/n) * (1 - rowsums[i]/n) * (n - colsums[j]) for i in 1:nrows, j in 1:ncols]
V = Float64[(colsums[j]/n) * (rowsums[i]/n) * (1 - rowsums[i]/n) * (n -
colsums[j]) for i in 1:nrows, j in 1:ncols]
elseif nrows == 1 || ncols == 1
df = length(x) - 1
df = length(x) - 1 - ddof
xhat = reshape(n * theta0, size(x))
thetahat = x / n
V = reshape(n .* theta0 .* (1 .- theta0), size(x))

abs(1 - sum(theta0)) <= sqrt(eps()) || throw(ArgumentError("Probabilities must sum to one"))
abs(1 - sum(theta0)) <= sqrt(eps()) ||
throw(ArgumentError("Probabilities must sum to one"))
else
throw(ArgumentError("Number of rows or columns must be at least 1"))
end
Expand All @@ -331,24 +357,29 @@ function PowerDivergenceTest(x::AbstractMatrix{T}; lambda::U=1.0, theta0::Vector
stat *= 2 / (lambda * (lambda + 1))
end

PowerDivergenceTest(lambda, vec(theta0), stat, df, x, n, vec(thetahat), xhat, (x - xhat) ./ sqrt.(xhat), (x - xhat) ./ sqrt.(V))
PowerDivergenceTest(lambda, vec(theta0), stat, df, x, n, vec(thetahat), xhat, (x -
xhat) ./ sqrt.(xhat), (x - xhat) ./ sqrt.(V))
end

#convenience functions

#PDT
function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T}; lambda::U=1.0) where {T<:Integer,U<:AbstractFloat}
function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T},
levels::Levels{T}; lambda::U=1.0, ddof::Integer=0) where {T<:Integer,U<:AbstractFloat}
d = counts(x, y, levels)
PowerDivergenceTest(d, lambda=lambda)
PowerDivergenceTest(d, lambda=lambda, ddof=ddof)
end

function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T; lambda::U=1.0) where {T<:Integer,U<:AbstractFloat}
function PowerDivergenceTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T;
lambda::U=1.0, ddof::Integer=0) where {T<:Integer,U<:AbstractFloat}
d = counts(x, y, k)
PowerDivergenceTest(d, lambda=lambda)
PowerDivergenceTest(d, lambda=lambda, ddof=ddof)
end

PowerDivergenceTest(x::AbstractVector{T}; lambda::U=1.0, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} =
PowerDivergenceTest(reshape(x, length(x), 1), lambda=lambda, theta0=theta0)
PowerDivergenceTest(x::AbstractVector{T}; lambda::U=1.0, theta0::Vector{U} =
ones(length(x))/length(x), ddof::Integer=0) where {T<:Integer,U<:AbstractFloat} =
PowerDivergenceTest(reshape(x, length(x), 1), lambda=lambda, theta0=theta0,
ddof=ddof)

#ChisqTest
"""
Expand All @@ -360,35 +391,40 @@ with ``λ = 1``).
If `y` is not given and `x` is a matrix with one row or column, or `x` is a vector, then
a goodness-of-fit test is performed (`x` is treated as a one-dimensional contingency
table). In this case, the hypothesis tested is whether the population probabilities equal
those in `theta0`, or are all equal if `theta0` is not given.
those in `theta0`, or are all equal if `theta0` is not given. The `ddof` parameter is the
"delta degrees of freedom" adjustment to the number of degrees of freedom used for
calculation of p-values. The number of degrees of freedom is decreased by `ddof`.

If `x` is a matrix with at least two rows and columns, it is taken as a two-dimensional
contingency table. Otherwise, `x` and `y` must be vectors of the same length. The contingency
table is calculated using `counts` function from the `StatsBase` package. Then the power
divergence test is conducted under the null hypothesis that the joint distribution of the
cell counts in a 2-dimensional contingency table is the product of the row and column
marginals.
contingency table. Otherwise, `x` and `y` must be vectors of the same length. The
contingency table is calculated using `counts` function from the `StatsBase` package. Then
the power divergence test is conducted under the null hypothesis that the joint
distribution of the cell counts in a 2-dimensional contingency table is the product of the
row and column marginals.

Note that the entries of `x` (and `y` if provided) must be non-negative integers.

Implements: [`pvalue`](@ref), [`confint`](@ref)
"""
function ChisqTest(x::AbstractMatrix{T}) where T<:Integer
PowerDivergenceTest(x, lambda=1.0)
function ChisqTest(x::AbstractMatrix{T}; ddof::Integer=0) where T<:Integer
PowerDivergenceTest(x, lambda=1.0, ddof=ddof)
end

function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T}) where T<:Integer
function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T};
ddof::Integer=0) where T<:Integer
d = counts(x, y, levels)
PowerDivergenceTest(d, lambda=1.0)
PowerDivergenceTest(d, lambda=1.0, ddof=ddof)
end

function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T) where T<:Integer
function ChisqTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T;
ddof::Integer=0) where T<:Integer
d = counts(x, y, k)
PowerDivergenceTest(d, lambda=1.0)
PowerDivergenceTest(d, lambda=1.0, ddof=ddof)
end

ChisqTest(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} =
PowerDivergenceTest(reshape(x, length(x), 1), lambda=1.0, theta0=theta0)
ChisqTest(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x);
ddof::Integer=0) where {T<:Integer,U<:AbstractFloat} =
PowerDivergenceTest(reshape(x, length(x), 1), lambda=1.0, theta0=theta0, ddof=ddof)

#MultinomialLRTest
"""
Expand All @@ -403,11 +439,11 @@ table). In this case, the hypothesis tested is whether the population probabilit
those in `theta0`, or are all equal if `theta0` is not given.

If `x` is a matrix with at least two rows and columns, it is taken as a two-dimensional
contingency table. Otherwise, `x` and `y` must be vectors of the same length. The contingency
table is calculated using `counts` function from the `StatsBase` package. Then the power
divergence test is conducted under the null hypothesis that the joint distribution of the
cell counts in a 2-dimensional contingency table is the product of the row and column
marginals.
contingency table. Otherwise, `x` and `y` must be vectors of the same length. The
contingency table is calculated using `counts` function from the `StatsBase` package. Then
the power divergence test is conducted under the null hypothesis that the joint
distribution of the cell counts in a 2-dimensional contingency table is the product of the
row and column marginals.

Note that the entries of `x` (and `y` if provided) must be non-negative integers.

Expand All @@ -417,17 +453,20 @@ function MultinomialLRTest(x::AbstractMatrix{T}) where T<:Integer
PowerDivergenceTest(x, lambda=0.0)
end

function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T}, levels::Levels{T}) where T<:Integer
function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T},
levels::Levels{T}) where T<:Integer
d = counts(x, y, levels)
PowerDivergenceTest(d, lambda=0.0)
end

function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T) where T<:Integer
function MultinomialLRTest(x::AbstractVector{T}, y::AbstractVector{T}, k::T) where
T<:Integer
d = counts(x, y, k)
PowerDivergenceTest(d, lambda=0.0)
end

MultinomialLRTest(x::AbstractVector{T}, theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} =
MultinomialLRTest(x::AbstractVector{T},
theta0::Vector{U} = ones(length(x))/length(x)) where {T<:Integer,U<:AbstractFloat} =
PowerDivergenceTest(reshape(x, length(x), 1), lambda=0.0, theta0=theta0)

function show_params(io::IO, x::PowerDivergenceTest, ident="")
Expand Down
13 changes: 13 additions & 0 deletions test/power_divergence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -198,4 +198,17 @@ MultinomialLRTest(x,y,(1:3,1:3))
d = [113997 1031298
334453 37471]
PowerDivergenceTest(d)

# Test ddof for the ChisqTest
sprague252 marked this conversation as resolved.
Show resolved Hide resolved
x = [8,10,16,6]
probs = [0.15865525393145702, 0.341344746068543, 0.34134474606854304, 0.15865525393145702]
m = ChisqTest(x, probs, ddof=2)
@test pvalue(m) ≈ 0.17603510054227095

# Test ddof for the PowerDivergenceTest
x = [8,10,16,6]
probs = [0.15865525393145702, 0.341344746068543, 0.34134474606854304, 0.15865525393145702]
m = PowerDivergenceTest(x, lambda=1.0, theta0=probs, ddof=2)
@test pvalue(m) ≈ 0.17603510054227095

end