From b13ecb9e75315301a23105faa4ba6846abeabcd6 Mon Sep 17 00:00:00 2001 From: Mauricio 'Pacha' Vargas Sepulveda Date: Mon, 22 Jul 2024 20:53:54 -0400 Subject: [PATCH] refactored apes() and bias_corr() --- NAMESPACE | 2 +- NEWS.md | 3 +- R/bias_corr.R | 62 +++++------ R/capybara-package.R | 2 +- R/cpp11.R | 4 + R/feglm.R | 22 ++-- R/feglm_control.R | 2 +- R/feglm_offset.R | 54 +--------- R/helpers.R | 2 +- README.Rmd | 4 +- README.md | 4 +- dev/benchmarks_tests_agtpa_capybara_only.R | 2 +- dev/test-offset.R | 18 ++++ docs/index.html | 16 +-- docs/news/index.html | 2 +- docs/pkgdown.yml | 2 +- docs/reference/apes.html | 9 +- docs/reference/bias_corr.html | 6 +- docs/reference/feglm.html | 4 +- docs/reference/feglm_control.html | 2 +- docs/reference/felm.html | 2 +- docs/reference/fenegbin.html | 2 +- docs/reference/fepoisson.html | 2 +- man/bias_corr.Rd | 4 +- man/feglm_control.Rd | 2 +- src/00_main.h | 17 +++ src/05_glm_fit.cpp | 21 ++-- src/06_glm_offset_fit.cpp | 101 ++++++++++++++++++ ...elation.cpp => 07_kendall_correlation.cpp} | 0 src/cpp11.cpp | 8 ++ tests/testthat/test-apes-bias.R | 24 +++-- tests/testthat/test-felm.R | 1 - tests/testthat/test-fepoisson.R | 5 +- 33 files changed, 254 insertions(+), 157 deletions(-) create mode 100644 dev/test-offset.R create mode 100644 src/06_glm_offset_fit.cpp rename src/{06_kendall_correlation.cpp => 07_kendall_correlation.cpp} (100%) diff --git a/NAMESPACE b/NAMESPACE index bba62c5..6dbe056 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,8 +73,8 @@ importFrom(stats,rgamma) importFrom(stats,rlogis) importFrom(stats,rnorm) importFrom(stats,rpois) -importFrom(stats,sd) importFrom(stats,terms) +importFrom(stats,var) importFrom(stats,vcov) importFrom(utils,combn) useDynLib(capybara, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index 75468e8..1c31911 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,8 +3,7 @@ * Moves all the heavy computation to C++ using Armadillo and it exports the results to R. Previously, there were multiple data copies between R and C++ that added overhead to the computations. -* For a future release, I may rewrite the offset and APES computation to C++, - but with those the overhead is minimal. +* The previous versions returned MX by default, now it has to be specified. # capybara 0.5.2 diff --git a/R/bias_corr.R b/R/bias_corr.R index 5c412a6..1fa900a 100644 --- a/R/bias_corr.R +++ b/R/bias_corr.R @@ -16,7 +16,7 @@ #' weakly exogenous regressors, e.g. lagged outcome variables, we suggest to #' choose a bandwidth between one and four. Note that the order of factors to #' be partialed out is important for bandwidths larger than zero. -#' @param panel.structure a string equal to \code{"classic"} or \code{"network"} +#' @param panel_structure a string equal to \code{"classic"} or \code{"network"} #' which determines the structure of the panel used. \code{"classic"} denotes #' panel structures where for example the same cross-sectional units are #' observed several times (this includes pseudo panels). \code{"network"} @@ -59,7 +59,7 @@ bias_corr <- function( object = NULL, L = 0L, - panel.structure = c("classic", "network")) { + panel_structure = c("classic", "network")) { # Check validity of 'object' if (is.null(object)) { stop("'object' has to be specified.", call. = FALSE) @@ -67,20 +67,20 @@ bias_corr <- function( stop("'bias_corr' called on a non-'feglm' object.", call. = FALSE) } - # Check validity of 'panel.structure' - panel.structure <- match.arg(panel.structure) + # Check validity of 'panel_structure' + panel_structure <- match.arg(panel_structure) # Extract model information - beta.uncorr <- object[["coefficients"]] + beta_uncorr <- object[["coefficients"]] control <- object[["control"]] data <- object[["data"]] eps <- .Machine[["double.eps"]] family <- object[["family"]] formula <- object[["formula"]] lvls_k <- object[["lvls_k"]] - nms.sp <- names(beta.uncorr) + nms.sp <- names(beta_uncorr) nt <- object[["nobs"]][["nobs"]] - k.vars <- names(lvls_k) + k_vars <- names(lvls_k) k <- length(lvls_k) # Check if binary choice model @@ -100,11 +100,11 @@ bias_corr <- function( } # Check if provided object matches requested panel structure - if (panel.structure == "classic") { + if (panel_structure == "classic") { if (!(k %in% c(1L, 2L))) { stop( paste( - "panel.structure == 'classic' expects a one- or two-way fixed", + "panel_structure == 'classic' expects a one- or two-way fixed", "effect model." ), call. = FALSE @@ -114,7 +114,7 @@ bias_corr <- function( if (!(k %in% c(2L, 3L))) { stop( paste( - "panel.structure == 'network' expects a two- or three-way fixed", + "panel_structure == 'network' expects a two- or three-way fixed", "effects model." ), call. = FALSE @@ -129,17 +129,17 @@ bias_corr <- function( wt <- object[["weights"]] # Generate auxiliary list of indexes for different sub panels - k.list <- get_index_list_(k.vars, data) + k_list <- get_index_list_(k_vars, data) # Compute derivatives and weights eta <- object[["eta"]] mu <- family[["linkinv"]](eta) - mu.eta <- family[["mu.eta"]](eta) + mu_eta <- family[["mu.eta"]](eta) v <- wt * (y - mu) - w <- wt * mu.eta + w <- wt * mu_eta z <- wt * partial_mu_eta_(eta, family, 2L) if (family[["link"]] != "logit") { - h <- mu.eta / family[["variance"]](mu) + h <- mu_eta / family[["variance"]](mu) v <- h * v w <- h * w z <- h * z @@ -150,54 +150,54 @@ bias_corr <- function( if (control[["keep_mx"]]) { MX <- object[["MX"]] } else { - MX <- center_variables_(X, w, k.list, control[["center_tol"]], 10000L) + MX <- center_variables_r_(X, w, k_list, control[["center_tol"]], 10000L) } # Compute bias terms for requested bias correction - if (panel.structure == "classic") { + if (panel_structure == "classic") { # Compute \hat{B} and \hat{D} - b <- as.vector(group_sums_(MX * z, w, k.list[[1L]])) / 2.0 / nt + b <- as.vector(group_sums_(MX * z, w, k_list[[1L]])) / 2.0 / nt if (k > 1L) { - b <- b + as.vector(group_sums_(MX * z, w, k.list[[2L]])) / 2.0 / nt + b <- b + as.vector(group_sums_(MX * z, w, k_list[[2L]])) / 2.0 / nt } # Compute spectral density part of \hat{B} if (L > 0L) { - b <- (b + group_sums_spectral_(MX * w, v, w, L, k.list[[1L]])) / nt + b <- (b + group_sums_spectral_(MX * w, v, w, L, k_list[[1L]])) / nt } } else { # Compute \hat{D}_{1}, \hat{D}_{2}, and \hat{B} - b <- group_sums_(MX * z, w, k.list[[1L]]) / (2.0 * nt) - b <- (b + group_sums_(MX * z, w, k.list[[2L]])) / (2.0 * nt) + b <- group_sums_(MX * z, w, k_list[[1L]]) / (2.0 * nt) + b <- (b + group_sums_(MX * z, w, k_list[[2L]])) / (2.0 * nt) if (k > 2L) { - b <- (b + group_sums_(MX * z, w, k.list[[3L]])) / (2.0 * nt) + b <- (b + group_sums_(MX * z, w, k_list[[3L]])) / (2.0 * nt) } # Compute spectral density part of \hat{B} if (k > 2L && L > 0L) { - b <- (b + group_sums_spectral_(MX * w, v, w, L, k.list[[3L]])) / nt + b <- (b + group_sums_spectral_(MX * w, v, w, L, k_list[[3L]])) / nt } } # Compute bias-corrected structural parameters - beta <- beta.uncorr - solve(object[["hessian"]] / nt, b) + beta <- beta_uncorr - solve(object[["hessian"]] / nt, b) names(beta) <- nms.sp # Update \eta and first- and second-order derivatives eta <- feglm_offset_(object, X %*% beta) mu <- family[["linkinv"]](eta) - mu.eta <- family[["mu.eta"]](eta) + mu_eta <- family[["mu.eta"]](eta) v <- wt * (y - mu) - w <- wt * mu.eta + w <- wt * mu_eta if (family[["link"]] != "logit") { - h <- mu.eta / family[["variance"]](mu) + h <- mu_eta / family[["variance"]](mu) v <- h * v w <- h * w rm(h) } # Update centered regressor matrix - MX <- center_variables_r_(X, w, k.list, control[["center_tol"]], 10000L) + MX <- center_variables_r_(X, w, k_list, control[["center_tol"]], 10000L) colnames(MX) <- nms.sp # Update hessian @@ -209,9 +209,9 @@ bias_corr <- function( object[["eta"]] <- eta if (control[["keep_mx"]]) object[["MX"]] <- MX object[["hessian"]] <- H - object[["coefficients.uncorr"]] <- beta.uncorr - object[["bias.term"]] <- b - object[["panel.structure"]] <- panel.structure + object[["coefficients_uncorr"]] <- beta_uncorr + object[["bias_term"]] <- b + object[["panel_structure"]] <- panel_structure object[["bandwidth"]] <- L # Add additional class to result list diff --git a/R/capybara-package.R b/R/capybara-package.R index 3bd6236..1981179 100644 --- a/R/capybara-package.R +++ b/R/capybara-package.R @@ -18,7 +18,7 @@ #' @importFrom MASS negative.binomial theta.ml #' @importFrom rlang sym := #' @importFrom stats as.formula binomial model.matrix na.omit gaussian poisson -#' pnorm printCoefmat rgamma rlogis rnorm rpois terms vcov predict sd +#' pnorm printCoefmat rgamma rlogis rnorm rpois terms vcov predict var #' complete.cases #' @importFrom utils combn #' @useDynLib capybara, .registration = TRUE diff --git a/R/cpp11.R b/R/cpp11.R index cfa7e4a..68b05aa 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -28,6 +28,10 @@ feglm_fit_ <- function(beta_r, eta_r, y_r, x_r, wt_r, theta, family, control, k_ .Call(`_capybara_feglm_fit_`, beta_r, eta_r, y_r, x_r, wt_r, theta, family, control, k_list) } +feglm_offset_fit_ <- function(eta_r, y_r, offset_r, wt_r, family, control, k_list) { + .Call(`_capybara_feglm_offset_fit_`, eta_r, y_r, offset_r, wt_r, family, control, k_list) +} + kendall_cor_ <- function(m) { .Call(`_capybara_kendall_cor_`, m) } diff --git a/R/feglm.R b/R/feglm.R index d030568..37dce94 100644 --- a/R/feglm.R +++ b/R/feglm.R @@ -157,19 +157,15 @@ feglm <- function( } dimnames(fit[["hessian"]]) <- list(nms_sp, nms_sp) - # Generate result list ---- - reslist <- c( - fit, list( - nobs = nobs, - lvls_k = lvls_k, - nms_fe = nms_fe, - formula = formula, - data = data, - family = family, - control = control - ) - ) + # Add to fit list ---- + fit[["nobs"]] <- nobs + fit[["lvls_k"]] <- lvls_k + fit[["nms_fe"]] <- nms_fe + fit[["formula"]] <- formula + fit[["data"]] <- data + fit[["family"]] <- family + fit[["control"]] <- control # Return result list ---- - structure(reslist, class = "feglm") + structure(fit, class = "feglm") } diff --git a/R/feglm_control.R b/R/feglm_control.R index d145efd..31288f9 100644 --- a/R/feglm_control.R +++ b/R/feglm_control.R @@ -39,7 +39,7 @@ feglm_control <- function( limit = 10L, trace = FALSE, drop_pc = TRUE, - keep_mx = TRUE) { + keep_mx = FALSE) { # Check validity of tolerance parameters if (dev_tol <= 0.0 || center_tol <= 0.0) { stop( diff --git a/R/feglm_offset.R b/R/feglm_offset.R index 6108b20..b46ca70 100644 --- a/R/feglm_offset.R +++ b/R/feglm_offset.R @@ -27,7 +27,7 @@ feglm_offset_ <- function(object, offset) { # Generate auxiliary list of indexes to project out the fixed effects k_list <- get_index_list_(k_vars, data) - # Compute starting guess for \eta + # Compute starting guess for eta if (family[["family"]] == "binomial") { eta <- rep(family[["linkfun"]](sum(wt * (y + 0.5) / 2.0) / sum(wt)), nt) } else if (family[["family"]] %in% c("Gamma", "inverse.gaussian")) { @@ -36,54 +36,8 @@ feglm_offset_ <- function(object, offset) { eta <- rep(family[["linkfun"]](sum(wt * (y + 0.1)) / sum(wt)), nt) } - # Compute initial quantities for the maximization routine - mu <- family[["linkinv"]](eta) - dev <- sum(family[["dev.resids"]](y, mu, wt)) - Myadj <- as.matrix(numeric(nt)) - - # Start maximization of the log-likelihood - for (iter in seq.int(iter_max)) { - # Store \eta, \beta, and deviance of the previous iteration - eta_old <- eta - dev_old <- dev - - # Compute weights and dependent variable - mu.eta <- family[["mu.eta"]](eta) - w <- (wt * mu.eta^2) / family[["variance"]](mu) - yadj <- (y - mu) / mu.eta + eta - offset - - # Centering dependent variable and compute \eta update - Myadj <- center_variables_r_(Myadj + yadj, w, k_list, center_tol, 10000L) - eta_upd <- yadj - drop(Myadj) + offset - eta - - # Step-halving with three checks - # 1. finite deviance - # 2. valid \eta and \mu - # 3. improvement as in glm2 - rho <- 1.0 - for (inner.iter in seq.int(50L)) { - eta <- eta_old + rho * eta_upd - mu <- family[["linkinv"]](eta) - dev <- sum(family[["dev.resids"]](y, mu, wt)) - dev.crit <- is.finite(dev) - val.crit <- family[["valideta"]](eta) && family[["validmu"]](mu) - imp.crit <- (dev - dev_old) / (0.1 + abs(dev)) <= -dev_tol - if (dev.crit && val.crit && imp.crit) break - rho <- rho / 2.0 - } - - # Check if step-halving failed - if (!dev.crit || !val.crit) { - stop("Inner loop failed; cannot correct step size.", call. = FALSE) - } - - # Check termination condition - if (abs(dev - dev_old) / (0.1 + abs(dev)) < dev_tol) break - - # Update starting guesses for acceleration - Myadj <- Myadj - yadj - } - # Return eta - eta + if (is.integer(y)) { y <- as.numeric(y) } + feglm_offset_fit_(eta, y, offset, wt, family[["family"]], control, + k_list) } diff --git a/R/helpers.R b/R/helpers.R index 6b2d696..57bde2d 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -379,7 +379,7 @@ get_score_matrix_ <- function(object) { attr(X, "dimnames") <- NULL # Center variables - MX <- center_variables_(X, NA_real_, w, k.list, control[["center_tol"]], 10000L, FALSE) + MX <- center_variables_r_(X, w, k.list, control[["center_tol"]], 10000L) colnames(MX) <- nms_sp } diff --git a/README.Rmd b/README.Rmd index fcf6918..15a7f69 100644 --- a/README.Rmd +++ b/README.Rmd @@ -125,7 +125,7 @@ Median time for the different models in the book |:------------|-------:|-----------------:|------------:|-----------------:|----------------------------:|--------------:| | Alpaca | 0.4s | 2.6s | 1.6s | 2.0s | 3.1s | 5.3s | | Base R | 120.0s | 2.0m | 1380.0s | 1440.0s | 1380.0s | 1500.0s | -| **Capybara**| 0.3s | 2.2s | 1.4s | 1.7s | 2.1s | 3.9s | +| **Capybara**| 0.3s | 2.0s | 1.2s | 1.4s | 1.7s | 3.4s | | Fixest | 0.1s | 0.5s | 0.1s | 0.2s | 0.3s | 0.5s | Memory allocation for the same models @@ -134,7 +134,7 @@ Memory allocation for the same models |:------------|-------:|---------------:|-----------:|-----------------:|--------------------------:|-------------:| |Alpaca | 307MB | 341MB | 306MB | 336MB | 395MB | 541MB | |Base R | 3000MB | 3000MB | 12000MB | 12000GB | 12000GB | 12000MB | -|**Capybara** | 29MB | 34MB | 21MB | 24MB | 30MB | 47MB | +|**Capybara** | 27MB | 32MB | 20MB | 23MB | 29MB | 43MB | |Fixest | 44MB | 36MB | 27MB | 32MB | 41MB | 63MB | # Debugging diff --git a/README.md b/README.md index 344079d..64db857 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ Analysis](https://www.wto.org/english/res_e/publications_e/advancedguide2016_e.h | :----------- | -----: | --------------: | ----------: | ----------------: | -------------------------: | ------------: | | Alpaca | 0.4s | 2.6s | 1.6s | 2.0s | 3.1s | 5.3s | | Base R | 120.0s | 2.0m | 1380.0s | 1440.0s | 1380.0s | 1500.0s | -| **Capybara** | 0.3s | 2.2s | 1.4s | 1.7s | 2.1s | 3.9s | +| **Capybara** | 0.3s | 2.0s | 1.2s | 1.4s | 1.7s | 3.4s | | Fixest | 0.1s | 0.5s | 0.1s | 0.2s | 0.3s | 0.5s | Memory allocation for the same models @@ -130,7 +130,7 @@ Memory allocation for the same models | :----------- | -----: | --------------: | ----------: | ----------------: | -------------------------: | ------------: | | Alpaca | 307MB | 341MB | 306MB | 336MB | 395MB | 541MB | | Base R | 3000MB | 3000MB | 12000MB | 12000GB | 12000GB | 12000MB | -| **Capybara** | 29MB | 34MB | 21MB | 24MB | 30MB | 47MB | +| **Capybara** | 27MB | 32MB | 20MB | 23MB | 29MB | 43MB | | Fixest | 44MB | 36MB | 27MB | 32MB | 41MB | 63MB | # Debugging diff --git a/dev/benchmarks_tests_agtpa_capybara_only.R b/dev/benchmarks_tests_agtpa_capybara_only.R index 21c7d2e..236574f 100644 --- a/dev/benchmarks_tests_agtpa_capybara_only.R +++ b/dev/benchmarks_tests_agtpa_capybara_only.R @@ -1,7 +1,7 @@ # this is not just about speed/memory, but also about obtaining the same # slopes as in base R -library(alpaca) +library(capybara) library(dplyr) library(tidyr) library(janitor) diff --git a/dev/test-offset.R b/dev/test-offset.R new file mode 100644 index 0000000..e833619 --- /dev/null +++ b/dev/test-offset.R @@ -0,0 +1,18 @@ +test_that("offset works", { + m1 <- feglm(mpg ~ wt | cyl, data = mtcars, family = poisson()) + y <- predict(m1, type = "response") + o1 <- feglm_offset_(m1, y) + + # m2 <- alpaca::feglm(mpg ~ wt | cyl, data = mtcars, family = poisson()) + # o2 <- drop(alpaca:::feglmOffset(m2, y) + # datapasta::vector_paste(round(o2, 4)) + o2 <- c( + 3.018703, 3.011154, 3.056387, 3.001613, 2.979713, 2.995091, 2.976723, + 3.026537, 3.027809, 2.995612, 2.995612, 2.999650, 3.006936, 3.005836, + 2.977558, 2.974679, 2.975975, 3.094682, 3.062526, 3.053450, 3.029361, + 2.956144, 2.958109, 2.949010, 2.948902, 3.049442, 3.041447, 3.066858, + 2.964431, 2.992499, 2.955002, 3.018302 + ) + + expect_equal(round(o1, 4), round(o2, 4)) +}) diff --git a/docs/index.html b/docs/index.html index c951618..4b687da 100644 --- a/docs/index.html +++ b/docs/index.html @@ -179,11 +179,11 @@

Benchmarks Capybara 0.3s -2.2s +2.0s +1.2s 1.4s 1.7s -2.1s -3.9s +3.4s Fixest @@ -237,12 +237,12 @@

Benchmarks Capybara +27MB +32MB +20MB +23MB 29MB -34MB -21MB -24MB -30MB -47MB +43MB Fixest diff --git a/docs/news/index.html b/docs/news/index.html index 8d7cdfe..1330cb0 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -60,7 +60,7 @@

Changelog

  • Moves all the heavy computation to C++ using Armadillo and it exports the results to R. Previously, there were multiple data copies between R and C++ that added overhead to the computations.
  • -
  • For a future release, I may rewrite the offset and APES computation to C++, but with those the overhead is minimal.
  • +
  • The previous versions returned MX by default, now it has to be specified.
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index b16a896..1065a4d 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -3,5 +3,5 @@ pkgdown: 2.0.7 pkgdown_sha: ~ articles: intro: intro.html -last_built: 2024-07-22T03:41Z +last_built: 2024-07-23T00:52Z diff --git a/docs/reference/apes.html b/docs/reference/apes.html index 6032919..194abc6 100644 --- a/docs/reference/apes.html +++ b/docs/reference/apes.html @@ -186,7 +186,7 @@

Examples

mod_bc <- bias_corr(mod) summary(mod_bc) #> Formula: trade ~ lang | year -#> <environment: 0x5fa3c5f54f50> +#> <environment: 0x579102d4cff0> #> #> Family: Binomial #> @@ -204,9 +204,12 @@

Examples

# Compute bias-corrected average partial effects mod_ape_bc <- apes(mod_bc) -#> Error in if (panel_structure == "classic") { if (!(k %in% c(1L, 2L))) { stop(paste("panel_structure == 'classic' expects a one- or two-way fixed", "effects model."), call. = FALSE) }} else { if (!(k %in% c(2L, 3L))) { stop(paste("panel_structure == 'network' expects a two- or three-way fixed", "effects model."), call. = FALSE) }}: argument is of length zero summary(mod_ape_bc) -#> Error in eval(expr, envir, enclos): object 'mod_ape_bc' not found +#> Estimates: +#> Estimate Std. Error z value Pr(>|z|) +#> lang 0.05596 0.01513 3.699 0.000216 *** +#> --- +#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
diff --git a/docs/reference/bias_corr.html b/docs/reference/bias_corr.html index 2c72fce..4dc5d37 100644 --- a/docs/reference/bias_corr.html +++ b/docs/reference/bias_corr.html @@ -76,7 +76,7 @@

Asymptotic bias correction after fitting binary choice models with a
-
bias_corr(object = NULL, L = 0L, panel.structure = c("classic", "network"))
+
bias_corr(object = NULL, L = 0L, panel_structure = c("classic", "network"))
@@ -95,7 +95,7 @@

Arguments

be partialed out is important for bandwidths larger than zero.

-
panel.structure
+
panel_structure

a string equal to "classic" or "network" which determines the structure of the panel used. "classic" denotes panel structures where for example the same cross-sectional units are @@ -146,7 +146,7 @@

Examples

mod_bc <- bias_corr(mod) summary(mod_bc) #> Formula: trade ~ lang | year -#> <environment: 0x5fa3c58f51f0> +#> <environment: 0x579100916318> #> #> Family: Binomial #> diff --git a/docs/reference/feglm.html b/docs/reference/feglm.html index 908094d..09beb95 100644 --- a/docs/reference/feglm.html +++ b/docs/reference/feglm.html @@ -162,7 +162,7 @@

Examples

summary(mod) #> Formula: trade ~ log_dist + lang + cntg + clny | exp_year + imp_year -#> <environment: 0x5fa3c55b9a20> +#> <environment: 0x5791016e2240> #> #> Family: Poisson #> @@ -192,7 +192,7 @@

Examples

summary(mod, type = "clustered") #> Formula: trade ~ log_dist + lang + cntg + clny | exp_year + imp_year | #> pair -#> <environment: 0x5fa3c55b9a20> +#> <environment: 0x5791016e2240> #> #> Family: Poisson #> diff --git a/docs/reference/feglm_control.html b/docs/reference/feglm_control.html index ec24887..8f375a0 100644 --- a/docs/reference/feglm_control.html +++ b/docs/reference/feglm_control.html @@ -72,7 +72,7 @@

Set feglm Control Parameters

limit = 10L, trace = FALSE, drop_pc = TRUE, - keep_mx = TRUE + keep_mx = FALSE )
diff --git a/docs/reference/felm.html b/docs/reference/felm.html index ad1d1c1..82d4ea1 100644 --- a/docs/reference/felm.html +++ b/docs/reference/felm.html @@ -116,7 +116,7 @@

Examples

summary(mod) #> Formula: log(trade) ~ log_dist + lang + cntg + clny | exp_year + imp_year -#> <environment: 0x5fa3c75f1ab0> +#> <environment: 0x5791041ea358> #> #> Estimates: #> diff --git a/docs/reference/fenegbin.html b/docs/reference/fenegbin.html index a665741..be195d6 100644 --- a/docs/reference/fenegbin.html +++ b/docs/reference/fenegbin.html @@ -141,7 +141,7 @@

Examples

summary(mod) #> Formula: trade ~ log_dist + lang + cntg + clny | exp_year + imp_year -#> <environment: 0x5fa3c7540330> +#> <environment: 0x57910411c3a0> #> #> Family: Negative Binomial(1.1839) #> diff --git a/docs/reference/fepoisson.html b/docs/reference/fepoisson.html index 2c38a99..89fea9f 100644 --- a/docs/reference/fepoisson.html +++ b/docs/reference/fepoisson.html @@ -128,7 +128,7 @@

Examples

summary(mod) #> Formula: trade ~ log_dist + lang + cntg + clny | exp_year + imp_year -#> <environment: 0x5fa3c48bf728> +#> <environment: 0x579101725c50> #> #> Family: Poisson #> diff --git a/man/bias_corr.Rd b/man/bias_corr.Rd index 9026431..7825153 100644 --- a/man/bias_corr.Rd +++ b/man/bias_corr.Rd @@ -5,7 +5,7 @@ \title{Asymptotic bias correction after fitting binary choice models with a 1,2,3-way error component} \usage{ -bias_corr(object = NULL, L = 0L, panel.structure = c("classic", "network")) +bias_corr(object = NULL, L = 0L, panel_structure = c("classic", "network")) } \arguments{ \item{object}{an object of class \code{"feglm"}.} @@ -18,7 +18,7 @@ weakly exogenous regressors, e.g. lagged outcome variables, we suggest to choose a bandwidth between one and four. Note that the order of factors to be partialed out is important for bandwidths larger than zero.} -\item{panel.structure}{a string equal to \code{"classic"} or \code{"network"} +\item{panel_structure}{a string equal to \code{"classic"} or \code{"network"} which determines the structure of the panel used. \code{"classic"} denotes panel structures where for example the same cross-sectional units are observed several times (this includes pseudo panels). \code{"network"} diff --git a/man/feglm_control.Rd b/man/feglm_control.Rd index b5d7c89..9838e19 100644 --- a/man/feglm_control.Rd +++ b/man/feglm_control.Rd @@ -11,7 +11,7 @@ feglm_control( limit = 10L, trace = FALSE, drop_pc = TRUE, - keep_mx = TRUE + keep_mx = FALSE ) } \arguments{ diff --git a/src/00_main.h b/src/00_main.h index 926b2dd..d4153b2 100644 --- a/src/00_main.h +++ b/src/00_main.h @@ -27,3 +27,20 @@ Col solve_eta_(const Mat &MX, const Mat &MNU, Mat crossprod_(const Mat &X, const Col &w, const int &n, const int &p, const bool &weighted, const bool &root_weights); + +std::string tidy_family_(const std::string &family); + +Col link_inv_(const Col &eta, const std::string &fam); + +double dev_resids_(const Col &y, const Col &mu, + const double &theta, const Col &wt, + const std::string &fam); + +Col mu_eta_(Col &eta, const std::string &fam); + +Col variance_(const Col &mu, const double &theta, + const std::string &fam); + +bool valid_eta_(const Col &eta, const std::string &fam); + +bool valid_mu_(const Col &mu, const std::string &fam); diff --git a/src/05_glm_fit.cpp b/src/05_glm_fit.cpp index e608f21..9973818 100644 --- a/src/05_glm_fit.cpp +++ b/src/05_glm_fit.cpp @@ -371,22 +371,13 @@ Col variance_(const Col &mu, const double &theta, beta = beta_old + (rho * beta_upd); mu = link_inv_(eta, fam); dev = dev_resids_(y, mu, theta, wt, fam); - dev_ratio_inner = (dev - dev_old) / (0.1 + fabs(dev_old)); - - // std::cout << "iter: " << iter << std::endl; - // std::cout << "iter_inner: " << iter_inner << std::endl; - // std::cout << "beta old: " << beta_old.t() << std::endl; - // std::cout << "beta: " << beta.t() << std::endl; - // std::cout << "dev: " << dev << std::endl; - // std::cout << "dev_ratio_inner: " << dev_ratio_inner << std::endl; - // std::cout << "dev_tol: " << dev_tol << std::endl; + dev_ratio_inner = (dev - dev_old) / (0.1 + fabs(dev)); dev_crit = is_finite(dev); val_crit = (valid_eta_(eta, fam) && valid_mu_(mu, fam)); imp_crit = (dev_ratio_inner <= -dev_tol); if (dev_crit == true && val_crit == true && imp_crit == true) { - // std::cout << "ok" << std::endl; break; } @@ -433,10 +424,6 @@ Col variance_(const Col &mu, const double &theta, mu_eta = mu_eta_(eta, fam); w = (wt % square(mu_eta)) / variance_(mu, theta, fam); - // Center variables - - MX = center_variables_(as_Mat(x_r), w, k_list, center_tol, iter_center_max); - // Recompute Hessian H = crossprod_(MX, w, n, p, true, true); @@ -455,7 +442,11 @@ Col variance_(const Col &mu, const double &theta, out.push_back({"iter"_nm = iter + 1}); if (keep_mx == true) { - out.push_back({"MX"_nm = as_doubles_matrix(MX)}); + out.push_back({ + "MX"_nm = as_doubles_matrix( + center_variables_(as_Mat(x_r), w, k_list, center_tol, iter_center_max) + ) + }); } return out; diff --git a/src/06_glm_offset_fit.cpp b/src/06_glm_offset_fit.cpp new file mode 100644 index 0000000..51e7077 --- /dev/null +++ b/src/06_glm_offset_fit.cpp @@ -0,0 +1,101 @@ +#include "00_main.h" + +[[cpp11::register]] doubles feglm_offset_fit_( + const doubles &eta_r, const doubles &y_r, const doubles &offset_r, + const doubles &wt_r, const std::string &family, const list &control, + const list &k_list) { + // Type conversion + + Col eta = as_Col(eta_r); + Col y = as_Col(y_r); + Col offset = as_Col(offset_r); + Mat Myadj = Mat(y.n_elem, 1, fill::zeros); + Col wt = as_Col(wt_r); + + // Auxiliary variables (fixed) + + std::string fam = tidy_family_(family); + double center_tol = as_cpp(control["center_tol"]); + double dev_tol = as_cpp(control["dev_tol"]); + int iter, iter_max = as_cpp(control["iter_max"]); + int iter_center_max = 10000; + int iter_inner, iter_inner_max = 50; + + // Auxiliary variables (storage) + + Col mu = link_inv_(eta, fam); + double dev = dev_resids_(y, mu, 0.0, wt, fam); + + const int n = y.n_elem; + Col mu_eta(n), yadj(n); + Mat w(n, 1); + bool conv = false; + + bool dev_crit, val_crit, imp_crit; + double dev_old, dev_ratio, dev_ratio_inner, rho; + Col eta_upd(n), eta_old(n); + + // Maximize the log-likelihood + + for (iter = 0; iter < iter_max; ++iter) { + rho = 1.0; + eta_old = eta, dev_old = dev; + + // Compute weights and dependent variable + + mu_eta = mu_eta_(eta, fam); + w = (wt % square(mu_eta)) / variance_(mu, 0.0, fam); + yadj = (y - mu) / mu_eta + eta - offset; + + // Center variables + + Myadj = center_variables_(Myadj + yadj, w, k_list, center_tol, iter_center_max); + + // Compute update step and update eta + + // Step-halving with three checks: + // 1. finite deviance + // 2. valid eta and mu + // 3. improvement as in glm2 + + eta_upd = yadj - Myadj + offset - eta; + + for (iter_inner = 0; iter_inner < iter_inner_max; ++iter_inner) { + eta = eta_old + (rho * eta_upd); + mu = link_inv_(eta, fam); + dev = dev_resids_(y, mu, 0.0, wt, fam); + dev_ratio_inner = (dev - dev_old) / (0.1 + fabs(dev_old)); + + dev_crit = is_finite(dev); + val_crit = (valid_eta_(eta, fam) && valid_mu_(mu, fam)); + imp_crit = (dev_ratio_inner <= -dev_tol); + + if (dev_crit == true && val_crit == true && imp_crit == true) { + break; + } + + rho *= 0.5; + } + + // Check if step-halving failed (deviance and invalid eta or mu) + + if (dev_crit == false || val_crit == false) { + stop("Inner loop failed; cannot correct step size."); + } + + // Check convergence + + dev_ratio = fabs(dev - dev_old) / (0.1 + fabs(dev)); + + if (dev_ratio < dev_tol) { + conv = true; + break; + } + + // Update starting guesses for acceleration + + Myadj = Myadj - yadj; + } + + return as_doubles(eta); +} diff --git a/src/06_kendall_correlation.cpp b/src/07_kendall_correlation.cpp similarity index 100% rename from src/06_kendall_correlation.cpp rename to src/07_kendall_correlation.cpp diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 069099b..5d10912 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -54,6 +54,13 @@ extern "C" SEXP _capybara_feglm_fit_(SEXP beta_r, SEXP eta_r, SEXP y_r, SEXP x_r return cpp11::as_sexp(feglm_fit_(cpp11::as_cpp>(beta_r), cpp11::as_cpp>(eta_r), cpp11::as_cpp>(y_r), cpp11::as_cpp &>>(x_r), cpp11::as_cpp>(wt_r), cpp11::as_cpp>(theta), cpp11::as_cpp>(family), cpp11::as_cpp>(control), cpp11::as_cpp>(k_list))); END_CPP11 } +// 06_glm_offset_fit.cpp +doubles feglm_offset_fit_(const doubles & eta_r, const doubles & y_r, const doubles & offset_r, const doubles & wt_r, const std::string & family, const list & control, const list & k_list); +extern "C" SEXP _capybara_feglm_offset_fit_(SEXP eta_r, SEXP y_r, SEXP offset_r, SEXP wt_r, SEXP family, SEXP control, SEXP k_list) { + BEGIN_CPP11 + return cpp11::as_sexp(feglm_offset_fit_(cpp11::as_cpp>(eta_r), cpp11::as_cpp>(y_r), cpp11::as_cpp>(offset_r), cpp11::as_cpp>(wt_r), cpp11::as_cpp>(family), cpp11::as_cpp>(control), cpp11::as_cpp>(k_list))); + END_CPP11 +} // 06_kendall_correlation.cpp double kendall_cor_(const doubles_matrix<> & m); extern "C" SEXP _capybara_kendall_cor_(SEXP m) { @@ -73,6 +80,7 @@ extern "C" { static const R_CallMethodDef CallEntries[] = { {"_capybara_center_variables_r_", (DL_FUNC) &_capybara_center_variables_r_, 5}, {"_capybara_feglm_fit_", (DL_FUNC) &_capybara_feglm_fit_, 9}, + {"_capybara_feglm_offset_fit_", (DL_FUNC) &_capybara_feglm_offset_fit_, 7}, {"_capybara_get_alpha_", (DL_FUNC) &_capybara_get_alpha_, 3}, {"_capybara_group_sums_", (DL_FUNC) &_capybara_group_sums_, 3}, {"_capybara_group_sums_cov_", (DL_FUNC) &_capybara_group_sums_cov_, 3}, diff --git a/tests/testthat/test-apes-bias.R b/tests/testthat/test-apes-bias.R index e99002b..4ab15f7 100644 --- a/tests/testthat/test-apes-bias.R +++ b/tests/testthat/test-apes-bias.R @@ -2,15 +2,23 @@ test_that("apes/bias works", { trade_short <- trade_panel[trade_panel$year %in% 2002L:2006L, ] trade_short$trade <- ifelse(trade_short$trade > 100, 1L, 0L) - mod <- feglm(trade ~ lang | year, trade_short, family = binomial()) + mod1 <- feglm(trade ~ lang | year, trade_short, family = binomial()) + apes1 <- apes(mod1) + bias1 <- bias_corr(mod1) - # names(mod) - # length(mod) + # mod2 <- alpaca::feglm(trade ~ lang | year, trade_short, family = binomial()) + # apes2 <- alpaca::getAPEs(mod2) + # bias2 <- alpaca::biasCorr(mod2) + apes2 <- c("lang" = 0.05594) + bias2 <- c("lang" = 0.23390) - expect_output(print(mod)) + expect_output(print(mod1)) - expect_gt(length(coef(apes(mod))), 0) - expect_gt(length(coef(summary(apes(mod)))), 0) - expect_gt(length(coef(bias_corr(mod))), 0) - expect_gt(length(coef(summary(bias_corr(mod)))), 0) + expect_equal(length(coef(apes1)), 1) + expect_equal(round(coef(apes1), 5), apes2) + expect_equal(length(coef(summary(apes(mod1)))), 4) + + expect_equal(length(coef(bias1)), 1) + expect_equal(round(coef(bias1), 1), round(bias2, 1)) + expect_equal(length(coef(summary(bias1))), 4) }) diff --git a/tests/testthat/test-felm.R b/tests/testthat/test-felm.R index a28a4eb..8aa0991 100644 --- a/tests/testthat/test-felm.R +++ b/tests/testthat/test-felm.R @@ -1,5 +1,4 @@ test_that("felm works", { - load_all() m1 <- felm(mpg ~ wt | cyl, mtcars) m2 <- lm(mpg ~ wt + as.factor(cyl), mtcars) diff --git a/tests/testthat/test-fepoisson.R b/tests/testthat/test-fepoisson.R index 838c0b8..582dd66 100644 --- a/tests/testthat/test-fepoisson.R +++ b/tests/testthat/test-fepoisson.R @@ -29,18 +29,17 @@ test_that("fepoisson is similar to fixest", { fes <- fixed_effects(mod) n <- unname(mod[["nobs"]]["nobs"]) - p <- dim(mod[["MX"]])[2] expect_equal(length(fes), 2) expect_equal(length(fitted(mod)), n) expect_equal(length(predict(mod)), n) - expect_equal(length(coef(mod)), p) + expect_equal(length(coef(mod)), 4) expect_equal(length(fes), 2) expect_equal(round(fes[["exp_year"]][1:3], 3), c(10.195, 11.081, 11.260)) expect_equal(round(fes[["imp_year"]][1:3], 3), c(0.226, -0.254, 1.115)) smod <- summary(mod) - expect_equal(length(coef(smod)[, 1]), p) + expect_equal(length(coef(smod)[, 1]), 4) expect_output(summary_formula_(smod)) expect_output(summary_family_(smod)) expect_output(summary_estimates_(smod, 3))