diff --git a/Project.toml b/Project.toml index 7ecf4b9a..3fc1e92b 100644 --- a/Project.toml +++ b/Project.toml @@ -9,8 +9,10 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" diff --git a/docs/src/index.md b/docs/src/index.md index 3de56ca6..3150cc7f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -21,6 +21,8 @@ Presently, summary statistics such as `mean`, `total`, `ratio`, and `quantile` c The package is built on top of [DataFrames.jl](https://dataframes.juliadata.org/stable/) and supports a variety of features for data manipulation. Plots are generated using [AlgebraOfGraphics](https://github.com/MakieOrg/AlgebraOfGraphics.jl). +Discourse [post](https://discourse.julialang.org/t/ann-announcing-survey-jl-for-analysis-of-complex-surveys/94667) announcing the package. + ## Plans We plan for efficient implementations of all the methods in R `survey`. Features for future releases will include: diff --git a/src/Survey.jl b/src/Survey.jl index 6960cfb5..729acee9 100644 --- a/src/Survey.jl +++ b/src/Survey.jl @@ -12,6 +12,8 @@ using AlgebraOfGraphics using CategoricalArrays using Random using Missings +using RData +using Downloads include("SurveyDesign.jl") include("bootstrap.jl") @@ -26,6 +28,8 @@ include("show.jl") include("ratio.jl") include("by.jl") include("jackknife.jl") +include("ht.jl") +include("analytical.jl") export load_data export AbstractSurveyDesign, SurveyDesign, ReplicateDesign @@ -37,5 +41,6 @@ export boxplot export bootweights export ratio export jackknifeweights, jackknife_variance +export HartleyRao end diff --git a/src/analytical.jl b/src/analytical.jl new file mode 100644 index 00000000..eec64202 --- /dev/null +++ b/src/analytical.jl @@ -0,0 +1,18 @@ +""" + Variance under Simple Random Sampling scheme + see Lohr pg78. +""" +function srs_variance(x::Symbol, design::AbstractSurveyDesign) + ŝ = var(design.data[!,x]) ./ nrow(design.data) + X̂ = wsum(design.data[!, x], weights(design.data[!, design.weights])) + DataFrame(ht_total = X̂) +end + +""" + Strata totals +""" +function strata_totals(x::Symbol, design::AbstractSurveyDesign) + gdf_strata = groupby(design.data, design.strata) + X̂ₛ = combine(gdf_strata, [x, design.weights] => ((a, b) -> wsum(a, b)) => :total) + return X̂ₛ +end \ No newline at end of file diff --git a/src/ht.jl b/src/ht.jl new file mode 100644 index 00000000..c5f5ce96 --- /dev/null +++ b/src/ht.jl @@ -0,0 +1,28 @@ +""" + Hartley Rao Approximation of the variance of the Horvitz-Thompson Estimator. + Avoids exhaustively calculating joint (inclusion) probabilities πᵢⱼ of the sampling scheme. + """ +function HartleyRao(x::Symbol, design::SurveyDesign, HT_total) + y = design.data[!,x] + πᵢ = design.data[!,design.allprobs] + hartley_rao_var = sum((1 .- ((design.data[!,design.sampsize] .- 1) ./ design.data[!,design.sampsize]) .* πᵢ) .* + ((y .* design.data[!,design.weights]) .- (HT_total ./ design.data[!,design.sampsize])) .^ 2) + return hartley_rao_var +end + +""" + HorvitzThompsonTotal(x, design) + +Horvitz-Thompson total +""" +function HorvitzThompsonTotal(x::Symbol, design::SurveyDesign) + X̂ₕₜ = wsum(design.data[!, x], weights(design.data[!, design.weights])) + var = HartleyRao(x, design, X̂ₕₜ) + DataFrame(total = X̂ₕₜ, var = var) +end + +function HorvitzThompsonTotal(x::Vector{Symbol}, design::SurveyDesign) + df = reduce(vcat, [HorvitzThompsonTotal(i, design) for i in x]) + insertcols!(df, 1, :names => String.(x)) + return df +end \ No newline at end of file diff --git a/src/total.jl b/src/total.jl index a451324c..67a8876d 100644 --- a/src/total.jl +++ b/src/total.jl @@ -17,8 +17,9 @@ julia> total(:api00, dclus1) ``` """ function total(x::Symbol, design::SurveyDesign) - X = wsum(design.data[!, x], weights(design.data[!, design.weights])) - DataFrame(total = X) + X̂ = wsum(design.data[!, x], weights(design.data[!, design.weights])) + variance = HartleyRao(x, design, X̂) + DataFrame(total = X̂, SE = sqrt(variance)) end """ diff --git a/test/Project.toml b/test/Project.toml index 0c99cc27..bc5620b6 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,3 +3,5 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" \ No newline at end of file diff --git a/test/analytical.jl b/test/analytical.jl new file mode 100644 index 00000000..d6f617dc --- /dev/null +++ b/test/analytical.jl @@ -0,0 +1,3 @@ +# @testset "analytical_ht_total" begin + +# end \ No newline at end of file diff --git a/test/ht.jl b/test/ht.jl new file mode 100644 index 00000000..4937ba1b --- /dev/null +++ b/test/ht.jl @@ -0,0 +1,12 @@ +@testset "hartley_rao" begin + ### I havent been able to reproduce below in R survey, which give somewhat different SE estimates + # base functionality SRS + tot = total(:api00, srs) + @test tot.SE[1] ≈ 57154.1 rtol = STAT_TOL + # Stratified + tot = total(:api00, dstrat) + @test tot.SE[1] ≈ 699405 rtol = STAT_TOL + # one stage cluster + tot = total(:api00, dclus1) + @test tot.SE[1] ≈ 4880240 rtol = STAT_TOL +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index d709103c..19712253 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -41,6 +41,9 @@ apiclus2 = load_data("apiclus2") # Load API dataset dclus2 = SurveyDesign(apiclus2; clusters = :dnum, weights = :pw) # Create SurveyDesign dclus2_boot = dclus2 |> bootweights # Create replicate design +# download and test using popular multistage stratified surveys +## TODO + # NHANES nhanes = load_data("nhanes") nhanes.seq1 = collect(1.0:5.0:42955.0) @@ -63,4 +66,6 @@ include("hist.jl") include("boxplot.jl") include("ratio.jl") include("show.jl") -include("jackknife.jl") \ No newline at end of file +include("jackknife.jl") +include("analytical.jl") +include("ht.jl")