JuliaStats · gragusa · Jun 10, 2022 · Jun 15, 2022 · Jun 15, 2022 · Jun 17, 2022
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -3,6 +3,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
@@ -12,4 +13,4 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 [compat]
 DataFrames = "1"
 Documenter = "1"
-Optim = "1.6.2"
+Optim = "1.6.2"
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -2,7 +2,7 @@
 
 ```@meta
 DocTestSetup = quote
-    using CategoricalArrays, DataFrames, Distributions, GLM, RDatasets
+    using CategoricalArrays, DataFrames, Distributions, GLM, RDatasets, StableRNGs
 end
 ```
 

diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -12,8 +12,8 @@ julia> using DataFrames, GLM, StatsBase
 
 julia> data = DataFrame(X=[1,2,3], Y=[2,4,7])
 3×2 DataFrame
- Row │ X      Y
-     │ Int64  Int64
+ Row │ X      Y     
+     │ Int64  Int64 
 ─────┼──────────────
    1 │     1      2
    2 │     2      4
@@ -61,7 +61,7 @@ julia> dof(ols)
 3
 
 julia> dof_residual(ols)
-1.0
+1
 
 julia> round(aic(ols); digits=5)
 5.84252
@@ -214,8 +214,8 @@ sales ^ 2    -6.94594e-9   3.72614e-9   -1.86    0.0725  -1.45667e-8  6.7487e-10
 ```jldoctest
 julia> data = DataFrame(X=[1,2,2], Y=[1,0,1])
 3×2 DataFrame
- Row │ X      Y
-     │ Int64  Int64
+ Row │ X      Y     
+     │ Int64  Int64 
 ─────┼──────────────
    1 │     1      1
    2 │     2      0
@@ -319,8 +319,8 @@ julia> using GLM, RDatasets
 
 julia> form = dataset("datasets", "Formaldehyde")
 6×2 DataFrame
- Row │ Carb     OptDen
-     │ Float64  Float64
+ Row │ Carb     OptDen  
+     │ Float64  Float64 
 ─────┼──────────────────
    1 │     0.1    0.086
    2 │     0.3    0.269
@@ -473,8 +473,8 @@ julia> dobson = DataFrame(Counts    = [18.,17,15,20,10,21,25,13,13],
                           Outcome   = categorical([1,2,3,1,2,3,1,2,3]),
                           Treatment = categorical([1,1,1,2,2,2,3,3,3]))
 9×3 DataFrame
- Row │ Counts   Outcome  Treatment
-     │ Float64  Cat…     Cat…
+ Row │ Counts   Outcome  Treatment 
+     │ Float64  Cat…     Cat…      
 ─────┼─────────────────────────────
    1 │    18.0  1        1
    2 │    17.0  2        1
@@ -510,32 +510,11 @@ julia> round(deviance(gm1), digits=5)
 
 In this example, we choose the best model from a set of λs, based on minimum BIC.
 
-```jldoctest
+```jldoctest; filter = r"(\d*)\.(\d{7})\d+" => s"\1.\2***"
 julia> using GLM, RDatasets, StatsBase, DataFrames, Optim
 
-julia> trees = DataFrame(dataset("datasets", "trees"))
-31×3 DataFrame
- Row │ Girth    Height  Volume  
-     │ Float64  Int64   Float64 
-─────┼──────────────────────────
-   1 │     8.3      70     10.3
-   2 │     8.6      65     10.3
-   3 │     8.8      63     10.2
-   4 │    10.5      72     16.4
-   5 │    10.7      81     18.8
-   6 │    10.8      83     19.7
-   7 │    11.0      66     15.6
-   8 │    11.0      75     18.2
-  ⋮  │    ⋮       ⋮        ⋮
-  25 │    16.3      77     42.6
-  26 │    17.3      81     55.4
-  27 │    17.5      82     55.7
-  28 │    17.9      80     58.3
-  29 │    18.0      80     51.5
-  30 │    18.0      80     51.0
-  31 │    20.6      87     77.0
-                 16 rows omitted
-
+julia> trees = DataFrame(dataset("datasets", "trees"));
+
 julia> bic_glm(λ) = bic(glm(@formula(Volume ~ Height + Girth), trees, Normal(), PowerLink(λ)));
 
 julia> optimal_bic = optimize(bic_glm, -1.0, 1.0);
@@ -554,9 +533,9 @@ Coefficients:
 ────────────────────────────────────────────────────────────────────────────
 (Intercept)  -1.07586    0.352543    -3.05    0.0023  -1.76684    -0.384892
 Height        0.0232172  0.00523331   4.44    <1e-05   0.0129601   0.0334743
-Girth         0.242837   0.00922555  26.32    <1e-99   0.224756    0.260919
+Girth         0.242837   0.00922556  26.32    <1e-99   0.224756    0.260919
 ────────────────────────────────────────────────────────────────────────────
 
 julia> round(optimal_bic.minimum, digits=5)
 156.37638
-```
+```
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -123,6 +123,108 @@ x: 4         -0.032673    0.0797865  -0.41    0.6831  -0.191048    0.125702
 ───────────────────────────────────────────────────────────────────────────
 ```
 
+## Weighting 
+
+Both `lm` and `glm` allow weighted estimation. The three different 
+[types of weights](https://juliastats.org/StatsBase.jl/stable/weights/) defined in 
+[StatsBase.jl](https://github.com/JuliaStats/StatsBase.jl) can be used to fit a model:
+
+- `AnalyticWeights` describe a non-random relative importance (usually between 0 and 1) for
+  each observation. These weights may also be referred to as reliability weights, precision
+  weights or inverse variance weights. These are typically used when the observations being
+  weighted are aggregate values (e.g., averages) with differing variances.
+- `FrequencyWeights` describe the number of times (or frequency) each observation was
+  observed. These weights may also be referred to as case weights or repeat weights.
+- `ProbabilityWeights` describe how the sample can be scaled back to the population. They
+  are usually the reciprocals of sampling probabilities, correcting for under- or
+  over-sampling of certain population groups (they are also called sampling weights).
+
+To indicate which kind of weights should be used, the vector of weights must be wrapped in
+one of the three weights types, and then passed to the `wts` keyword argument.
+Short-hand functions `aweights`, `fweights`, and `pweights` can be used to construct
+`AnalyticWeights`, `FrequencyWeights`, and `ProbabilityWeights`, respectively.
+
+We illustrate the API with randomly generated data.
+
+```jldoctest weights
+julia> using StableRNGs, DataFrames, GLM
+
+julia> data = DataFrame(y = rand(StableRNG(1), 100), x = randn(StableRNG(2), 100), weights = repeat([1, 2, 3, 4], 25), );
+
+julia> m = lm(@formula(y ~ x), data)
+LinearModel
+
+y ~ 1 + x
+
+Coefficients:
+──────────────────────────────────────────────────────────────────────────
+                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
+──────────────────────────────────────────────────────────────────────────
+(Intercept)   0.517369    0.0280232  18.46    <1e-32   0.461758  0.57298
+x            -0.0500249   0.0307201  -1.63    0.1066  -0.110988  0.0109382
+──────────────────────────────────────────────────────────────────────────
+
+julia> m_aweights = lm(@formula(y ~ x), data, wts=aweights(data.weights))
+LinearModel
+
+y ~ 1 + x
+
+Coefficients:
+──────────────────────────────────────────────────────────────────────────
+                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%  Upper 95%
+──────────────────────────────────────────────────────────────────────────
+(Intercept)   0.51673     0.0270707  19.09    <1e-34   0.463009  0.570451
+x            -0.0478667   0.0308395  -1.55    0.1239  -0.109067  0.0133333
+──────────────────────────────────────────────────────────────────────────
+
+julia> m_fweights = lm(@formula(y ~ x), data, wts=fweights(data.weights))
+LinearModel
+
+y ~ 1 + x
+
+Coefficients:
+─────────────────────────────────────────────────────────────────────────────
+                  Coef.  Std. Error      t  Pr(>|t|)   Lower 95%    Upper 95%
+─────────────────────────────────────────────────────────────────────────────
+(Intercept)   0.51673     0.0170172  30.37    <1e-84   0.483213    0.550246
+x            -0.0478667   0.0193863  -2.47    0.0142  -0.0860494  -0.00968394
+─────────────────────────────────────────────────────────────────────────────
+
+julia> m_pweights = lm(@formula(y ~ x), data, wts=pweights(data.weights))
+LinearModel
+
+y ~ 1 + x
+
+Coefficients:
+───────────────────────────────────────────────────────────────────────────
+                  Coef.  Std. Error      t  Pr(>|t|)  Lower 95%   Upper 95%
+───────────────────────────────────────────────────────────────────────────
+(Intercept)   0.51673     0.0287193  17.99    <1e-32   0.459737  0.573722
+x            -0.0478667   0.0265532  -1.80    0.0745  -0.100561  0.00482739
+───────────────────────────────────────────────────────────────────────────
+
+```
+
+!!! warning
+
+    In the old API, weights were passed as `AbstractVector`s and were silently treated
+    as `FrequencyWeights` in the internal computation of standard errors and related
+    quantities. Passing weights as an `AbstractVector` is still allowed for backward
+    compatibility, but it is deprecated. When weights are passed following the old API,
+    they are now coerced to `FrequencyWeights` and a deprecation warning is issued.
+
+The type of the weights will affect the variance of the estimated coefficients and the
+quantities involving this variance. The coefficient point estimates will be the same
+regardless of the type of weights.
+
+```jldoctest weights
+julia> loglikelihood(m_aweights)
+-16.296307561384253
+
+julia> loglikelihood(m_fweights)
+-25.51860961756451
+```
+
 ## Comparing models with F-test
 
 Comparisons between two or more linear models can be performed using the `ftest` function,
@@ -176,8 +278,8 @@ Many of the methods provided by this package have names similar to those in [R](
 - `vcov`: variance-covariance matrix of the coefficient estimates
 
 
-Note that the canonical link for negative binomial regression is `NegativeBinomialLink`, but
-in practice one typically uses `LogLink`.
+Note that the canonical link for negative binomial regression is `NegativeBinomialLink`, 
+but in practice one typically uses `LogLink`.
 
 ```jldoctest methods
 julia> using GLM, DataFrames, StatsBase
@@ -209,12 +311,14 @@ julia> round.(predict(mdl, test_data); digits=8)
  9.33333333
 ```
 
-The [`cooksdistance`](@ref) method computes [Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) for each observation used to fit a linear model, giving an estimate of the influence of each data point.
+The [`cooksdistance`](@ref) method computes
+[Cook's distance](https://en.wikipedia.org/wiki/Cook%27s_distance) for each observation
+used to fit a linear model, giving an estimate of the influence of each data point.
 Note that it's currently only implemented for linear models without weights.
 
 ```jldoctest methods
 julia> round.(cooksdistance(mdl); digits=8)
-3-element Vector{Float64}:
+3×1 Matrix{Float64}:
  2.5
  0.25
  2.5

diff --git a/src/GLM.jl b/src/GLM.jl
@@ -12,17 +12,18 @@ module GLM
     import Statistics: cor
     using StatsAPI
     import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual,
-                      loglikelihood, nullloglikelihood, nobs, stderror, vcov,
-                      residuals, predict, predict!,
-                      fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue
+                      loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, predict!,
+                      fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², 
+                      PValue, weights, leverage
     import StatsFuns: xlogy
     import SpecialFunctions: erfc, erfcinv, digamma, trigamma
     import StatsModels: hasintercept
     import Tables
     export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
-           loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
+           loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, predict!,
            fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
-           cooksdistance, hasintercept, dispersion, vif, gvif, termnames
+           cooksdistance, hasintercept, dispersion, vif, gvif, termnames, weights, AnalyticWeights,
+           ProbabilityWeights, FrequencyWeights, UnitWeights, uweights, fweights, pweights, aweights, leverage
 
     export
         # types

diff --git a/src/ftest.jl b/src/ftest.jl
@@ -57,7 +57,10 @@ F-statistic: 241.62 on 12 observations and 1 degrees of freedom, p-value: <1e-07
 """
 function ftest(mod::LinearModel)
     hasintercept(mod) || throw(ArgumentError("ftest only works for models with an intercept"))
-
+    wts = weights(mod)
+    if wts isa ProbabilityWeights
+        throw(ArgumentError("`ftest` for probability weighted models is not currently supported."))
+    end
     rss = deviance(mod)
     tss = nulldeviance(mod)
 
@@ -228,3 +231,7 @@ function show(io::IO, ftr::FTestResult{N}) where N
     end
     print(io, '─'^totwidth)
 end
+
+function ftest(r::LinearModel{T,<:ProbabilityWeights}) where {T}  # method selected by the weights type parameter; the body always throws
+    throw(ArgumentError("`ftest` for probability weighted models is not currently supported."))
+end  # NOTE(review): same check and message as the `wts isa ProbabilityWeights` guard in ftest(mod::LinearModel) - confirm whether both are needed