Taking weighting seriously #487
base: master
@@ -0,0 +1,2 @@
{
}
@@ -0,0 +1,24 @@
\setlength{\LTpost}{0mm}

Review comment: What's this? :-)

\begin{longtable}{l|rrrrrrrr}
\toprule
\multicolumn{1}{l}{} & \multicolumn{2}{c}{\emph{G} = 50} & \multicolumn{2}{c}{\emph{G} = 100} & \multicolumn{2}{c}{\emph{G} = 150} & \multicolumn{2}{c}{\emph{G} = 200} \\
\cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9}
\multicolumn{1}{l}{} & \(\frac{\sqrt{G}\bar{X}_{n}}{\bar{\sigma}_{n}}\) & \(\frac{\sqrt{G}\bar{X}_{n}}{\hat{\sigma}_{n}}\) & \(\frac{\sqrt{G}\bar{X}_{n}}{\bar{\sigma}_{n}}\) & \(\frac{\sqrt{G}\bar{X}_{n}}{\hat{\sigma}_{n}}\) & \(\frac{\sqrt{G}\bar{X}_{n}}{\bar{\sigma}_{n}}\) & \(\frac{\sqrt{G}\bar{X}_{n}}{\hat{\sigma}_{n}}\) & \(\frac{\sqrt{G}\bar{X}_{n}}{\bar{\sigma}_{n}}\) & \(\frac{\sqrt{G}\bar{X}_{n}}{\hat{\sigma}_{n}}\) \\
\midrule\addlinespace[2.5pt]
\multicolumn{9}{l}{\(\alpha=1.5, \beta=1.9\)} \\
\midrule\addlinespace[2.5pt]
10\% & $0.03$ & $0.06$ & $0.04$ & $0.03$ & $0.03$ & $0.04$ & $0.05$ & $0.06$ \\
5\% & $0.02$ & $0.03$ & $0.02$ & $0.01$ & $0.00$ & $0.01$ & $0.04$ & $0.06$ \\
1\% & $0.01$ & $0.00$ & $0.01$ & $0.00$ & $0.00$ & $0.01$ & $0.01$ & $0.02$ \\
\midrule\addlinespace[2.5pt]
\multicolumn{9}{l}{\(\alpha=1.5, \beta=2.1\)} \\
\midrule\addlinespace[2.5pt]
10\% & $0.05$ & $0.03$ & $0.03$ & $0.08$ & $0.03$ & $0.03$ & $0.02$ & $0.08$ \\
5\% & $0.02$ & $0.02$ & $0.01$ & $0.05$ & $0.02$ & $0.01$ & $0.02$ & $0.05$ \\
1\% & $0.01$ & $0.00$ & $0.00$ & $0.02$ & $0.00$ & $0.00$ & $0.00$ & $0.01$ \\
\bottomrule
\end{longtable}
\begin{minipage}{\linewidth}
The table shows rejection rates for \(G \in \{50, 100, 150, 200\}\), \(\alpha = 1.5\), and \(\beta \in \{1.9, 2.1\}\). The Monte Carlo results are based on 10,000 simulations. The simulation standard errors are 0.009 at the 10\% level, 0.007 at the 5\% level, and 0.0031 at the 1\% level.\\
\end{minipage}
@@ -1,15 +1,20 @@ | ||
[deps] | ||
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" | ||
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" | ||
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" | ||
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" | ||
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" | ||
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" | ||
Optim = "429524aa-4258-5aef-a3af-852621145aeb" | ||
RCall = "6f49c342-dc21-5d91-9882-a32aef131414" | ||
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" | ||
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" | ||
Comment on lines
+9
to
+11
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we really need these? |
||
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" | ||
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" | ||
StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" | ||
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" | ||
|
||
[compat] | ||
DataFrames = "1" | ||
Documenter = "1" | ||
Optim = "1.6.2" | ||
Optim = "1.6.2" |
@@ -123,6 +123,110 @@ x: 4 -0.032673 0.0797865 -0.41 0.6831 -0.191048 0.125702
───────────────────────────────────────────────────────────────────────────
```

## Weighting

Both `lm` and `glm` allow weighted estimation. The four different
[types of weights](https://juliastats.org/StatsBase.jl/stable/weights/) defined in
[StatsBase.jl](https://github.com/JuliaStats/StatsBase.jl) can be used to fit a model:

- `AnalyticWeights` describe a non-random relative importance (usually between 0 and 1) for
  each observation. These weights may also be referred to as reliability weights, precision
  weights, or inverse variance weights. They are typically used when the observations being
  weighted are aggregate values (e.g., averages) with differing variances.
- `FrequencyWeights` describe the number of times (or frequency) each observation was seen.
  These weights may also be referred to as case weights or repeat weights.
- `ProbabilityWeights` represent the inverse of the sampling probability for each observation,
  providing a correction mechanism for under- or over-sampling certain population groups.
  These weights may also be referred to as sampling weights.
- `UnitWeights` attribute a weight of 1 to each observation, which corresponds
  to unweighted regression (the default).
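As a quick sketch of how these types are constructed (using only StatsBase; the weight values here are made up for illustration):

```julia
using StatsBase

w = [1, 2, 3, 4]

aw = AnalyticWeights(w)     # shorthand: aweights(w)
fw = FrequencyWeights(w)    # shorthand: fweights(w)
pw = ProbabilityWeights(w)  # shorthand: pweights(w)
uw = uweights(length(w))    # UnitWeights: a weight of 1 per observation

# Each weight vector caches its total, which weighted statistics use:
sum(fw)  # 10
```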

Review comment: can we add a comment somewhere how these weights are later treated in estimation?

To indicate which kind of weights should be used, the vector of weights must be wrapped in
one of the three weight types and then passed to the `wts` keyword argument.
The short-hand functions `aweights`, `fweights`, and `pweights` can be used to construct
`AnalyticWeights`, `FrequencyWeights`, and `ProbabilityWeights`, respectively.

We illustrate the API with randomly generated data.

```jldoctest weights
julia> using StableRNGs, DataFrames, GLM

julia> data = DataFrame(y = rand(StableRNG(1), 100), x = randn(StableRNG(2), 100), weights = repeat([1, 2, 3, 4], 25));

julia> m = lm(@formula(y ~ x), data)
LinearModel

y ~ 1 + x

Coefficients:
──────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
──────────────────────────────────────────────────────────────────────────
(Intercept)   0.517369    0.0280232  18.46    <1e-32   0.461758  0.57298
x            -0.0500249   0.0307201  -1.63    0.1066  -0.110988  0.0109382
──────────────────────────────────────────────────────────────────────────

julia> m_aweights = lm(@formula(y ~ x), data, wts=aweights(data.weights))
LinearModel

y ~ 1 + x

Coefficients:
──────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
──────────────────────────────────────────────────────────────────────────
(Intercept)   0.51673     0.0270707  19.09    <1e-34   0.463009  0.570451
x            -0.0478667   0.0308395  -1.55    0.1239  -0.109067  0.0133333
──────────────────────────────────────────────────────────────────────────

julia> m_fweights = lm(@formula(y ~ x), data, wts=fweights(data.weights))
LinearModel

y ~ 1 + x

Coefficients:
─────────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)   Lower 95%    Upper 95%
─────────────────────────────────────────────────────────────────────────────
(Intercept)   0.51673     0.0170172  30.37    <1e-84   0.483213    0.550246
x            -0.0478667   0.0193863  -2.47    0.0142  -0.0860494  -0.00968394
─────────────────────────────────────────────────────────────────────────────

julia> m_pweights = lm(@formula(y ~ x), data, wts=pweights(data.weights))
LinearModel

y ~ 1 + x

Coefficients:
───────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%   Upper 95%
───────────────────────────────────────────────────────────────────────────
(Intercept)   0.51673     0.0287193  17.99    <1e-32   0.459737  0.573722
x            -0.0478667   0.0265532  -1.80    0.0745  -0.100561  0.00482739
───────────────────────────────────────────────────────────────────────────
```

!!! warning

    In the old API, weights were passed as plain `AbstractVector`s and were silently
    treated as `FrequencyWeights` in the internal computation of standard errors and
    related quantities. Passing weights as an `AbstractVector` is still allowed for
    backward compatibility, but it is deprecated: such weights are now coerced to
    `FrequencyWeights` and a deprecation warning is issued.
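For example, a call following the old, deprecated pattern (a sketch continuing the session above, not doctested output) would still fit but emit a deprecation warning:

```julia
# Deprecated old API: a plain vector is silently coerced to FrequencyWeights,
# and a deprecation warning is issued.
m_old = lm(@formula(y ~ x), data, wts=data.weights)
```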

The type of the weights affects the variance of the estimated coefficients and all
quantities derived from it. The coefficient point estimates are the same regardless
of the type of weights.
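For instance, continuing the session above (a sketch rather than doctested output), the fitted models share point estimates while their standard errors differ, as the coefficient tables show:

```julia
# Point estimates coincide across weight types
# (the weights enter the normal equations identically):
coef(m_aweights) ≈ coef(m_fweights) ≈ coef(m_pweights)  # true

# Standard errors depend on the weight type:
stderror(m_aweights)  # ≈ [0.0270707, 0.0308395]
stderror(m_fweights)  # ≈ [0.0170172, 0.0193863]
```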

```jldoctest weights
julia> loglikelihood(m_aweights)
-16.296307561384253

julia> loglikelihood(m_fweights)
-25.51860961756451
```

## Comparing models with F-test

Comparisons between two or more linear models can be performed using the `ftest` function,

@@ -176,8 +280,8 @@ Many of the methods provided by this package have names similar to those in [R](
- `vcov`: variance-covariance matrix of the coefficient estimates

Note that the canonical link for negative binomial regression is `NegativeBinomialLink`,
but in practice one typically uses `LogLink`.
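As a sketch (the data frame and column names here are invented for illustration), a negative binomial fit with the log link can be done either via `glm` with a fixed dispersion parameter, or via `negbin`, which estimates the dispersion by maximum likelihood:

```julia
using GLM, DataFrames

# Hypothetical count data.
df = DataFrame(counts = [2, 4, 1, 0, 3, 7, 5, 2, 4, 1],
               group  = repeat([0, 1], 5))

# Fixed dispersion parameter (here 2.0) with the log link:
m1 = glm(@formula(counts ~ group), df, NegativeBinomial(2.0), LogLink())

# Let `negbin` estimate the dispersion by maximum likelihood:
m2 = negbin(@formula(counts ~ group), df, LogLink())
```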

```jldoctest methods
julia> using GLM, DataFrames, StatsBase

@@ -209,7 +313,9 @@ julia> round.(predict(mdl, test_data); digits=8)
9.33333333
```


The [`cooksdistance`](@ref) method computes
[Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) for each observation
used to fit a linear model, giving an estimate of the influence of each data point.
Note that it's currently only implemented for linear models without weights.

```jldoctest methods
Review comment: Remove this file?