
Commit

Merge pull request #264 from jr-leary7/dev
Dev
jr-leary7 authored Oct 29, 2024
2 parents 6f385a9 + 14e3650 commit 4648e53
Showing 20 changed files with 212 additions and 68 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: scLANE
Type: Package
Title: Model Gene Expression Dynamics with Spline-Based NB GLMs, GEEs, & GLMMs
Version: 0.8.6
Version: 0.8.7
Authors@R: c(person(given = c("Jack", "R."), family = "Leary", email = "[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")),
person(given = "Rhonda", family = "Bacher", email = "[email protected]", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X")))
Description: Our scLANE model uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time.
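
For context, a short sketch of the truncated power basis the Description refers to. This is the standard MARS-style definition, not text quoted from the package documentation, and the mean-model form below is illustrative only:

```latex
% Degree-1 truncated power ("hinge") basis about a knot t_k -- standard definition,
% not quoted from the package documentation:
(x - t_k)_{+} = \max(0,\, x - t_k), \qquad (t_k - x)_{+} = \max(0,\, t_k - x)
% Illustrative NB mean model over pseudotime x with log link:
\log \mu(x) = \beta_0 + \sum_{k} \left\{ \beta_{k1}\,(x - t_k)_{+} + \beta_{k2}\,(t_k - x)_{+} \right\}
```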
1 change: 1 addition & 0 deletions NAMESPACE
@@ -136,6 +136,7 @@ importFrom(stats,fitted.values)
importFrom(stats,hclust)
importFrom(stats,kmeans)
importFrom(stats,logLik)
importFrom(stats,model.matrix)
importFrom(stats,offset)
importFrom(stats,p.adjust)
importFrom(stats,p.adjust.methods)
7 changes: 7 additions & 0 deletions NEWS.md
@@ -1,3 +1,10 @@
# Changes in v0.8.7

+ Switched GEE fitting back to using `scale.fix = FALSE` and a fixed value for the negative-binomial overdispersion parameter, as this improves model fits.
+ Added an option to use a Lagrange multiplier (score) test in GEE mode instead of the default Wald test. The relevant argument is `gee.test` in `testDynamic()`.
+ Updated documentation and some tests.
+ Added a column called `Null_Fit_Notes` to the output of `getResultsDE()` to describe when and how null models fail. This doesn't happen frequently, but it's good information to have when it does.
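
A minimal usage sketch of the new score test and the `Null_Fit_Notes` column. This is hypothetical: aside from `gee.test` and `getResultsDE()`, the argument names, the `"score"` value, and the simulated objects are assumed from other examples in this diff and may differ from the actual API:

```r
# Hypothetical sketch -- argument names other than gee.test are assumptions
# inferred from the plotModelCoefs() example elsewhere in this diff.
cell_offset <- createCellOffset(sim_counts)
gee_models <- testDynamic(sim_counts,
                          pt = sim_pseudotime,
                          size.factor.offset = cell_offset,
                          is.gee = TRUE,
                          id.vec = sim_subject_ids,  # subject IDs for the GEE working correlation (hypothetical object)
                          gee.test = "score")        # Lagrange multiplier (score) test instead of the default Wald test
de_res <- getResultsDE(gee_models)
# new in v0.8.7: notes describing when & how null model fits failed
table(de_res$Null_Fit_Notes, useNA = "ifany")
```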

# Changes in v0.8.6

+ Changed GEE fitting to use `scale.fix = TRUE` throughout the package, as it appears to be faster and more statistically efficient based on simulated data benchmarking.
8 changes: 4 additions & 4 deletions R/backward_sel_WIC.R
@@ -7,7 +7,7 @@
#' @importFrom gamlss gamlss
#' @importFrom geeM geem
#' @importFrom MASS negative.binomial
#' @importFrom stats vcov
#' @importFrom stats vcov coef
#' @param Y The response variable. Defaults to NULL.
#' @param B_new The model matrix. Defaults to NULL.
#' @param is.gee Is the model a GEE? Defaults to FALSE.
@@ -36,8 +36,8 @@ backward_sel_WIC <- function(Y = NULL,
fit <- geeM::geem(Y ~ B_new - 1,
id = id.vec,
corstr = cor.structure,
family = MASS::negative.binomial(theta.hat, link = "log"),
scale.fix = TRUE,
family = MASS::negative.binomial(50, link = "log"),
scale.fix = FALSE,
sandwich = sandwich.var)
wald_stat <- unname(summary(fit)$wald.test[-1])^2
} else {
Expand All @@ -47,7 +47,7 @@ backward_sel_WIC <- function(Y = NULL,
vcov_mat <- try({ stats::vcov(fit, type = "all") }, silent = TRUE)
if (inherits(vcov_mat, "try-error")) {
covmat_unscaled <- chol2inv(fit$mu.qr$qr[1:(fit$mu.df - fit$mu.nl.df), 1:(fit$mu.df - fit$mu.nl.df), drop = FALSE])
wald_stat <- unname(coef(fit) / sqrt(diag(covmat_unscaled))^2)[-1]
wald_stat <- unname(stats::coef(fit) / sqrt(diag(covmat_unscaled))^2)[-1]
} else {
wald_stat <- unname((vcov_mat$coef / vcov_mat$se)[-c(1, length(vcov_mat$coef))]^2)
}
4 changes: 2 additions & 2 deletions R/getResultsDE.R
@@ -27,7 +27,7 @@ getResultsDE <- function(test.dyn.res = NULL,
fdr.cutoff = 0.01,
n.cores = 2L) {
# check inputs
if (is.null(test.dyn.res)) { stop("Please provide a result list.") }
if (is.null(test.dyn.res)) { stop("Please provide a result list from testDynamic().") }
if (!p.adj.method %in% stats::p.adjust.methods) { stop("Please choose a valid p-value adjustment method.") }
# set up parallel processing
if (n.cores > 1L) {
@@ -40,7 +40,7 @@
function(x) {
purrr::map_dfr(x,
function(y) {
as.data.frame(rbind(y[seq(13)])) %>%
as.data.frame(rbind(y[seq(14)])) %>%
dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z))))
})
}) %>%
29 changes: 12 additions & 17 deletions R/marge2.R
@@ -24,7 +24,7 @@
#' @param cor.structure If \code{is.gee = TRUE}, a string specifying the desired correlation structure for the NB GEE. Defaults to "ar1".
#' @param sandwich.var (Optional) Should the sandwich variance estimator be used instead of the model-based estimator? Defaults to FALSE.
#' @param approx.knot (Optional) Should the set of candidate knots be subsampled in order to speed up computation? This has little effect on the final fit, but can improve computation time somewhat. Defaults to TRUE.
#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 50.
#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 25.
#' @param glm.backend (Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS".
#' @param tols_score (Optional) The set tolerance for monitoring the convergence for the difference in score statistics between the parent and candidate model (this is the lack-of-fit criterion used for MARGE). Defaults to 0.00001.
#' @param minspan (Optional) A set minimum span value. Defaults to NULL.
@@ -63,7 +63,7 @@ marge2 <- function(X_pred = NULL,
cor.structure = "ar1",
sandwich.var = FALSE,
approx.knot = TRUE,
n.knot.max = 50,
n.knot.max = 25,
glm.backend = "MASS",
tols_score = 1e-5,
minspan = NULL,
Expand All @@ -84,12 +84,6 @@ marge2 <- function(X_pred = NULL,
}
q <- ncol(X_pred) # Number of predictor variables
B <- as.matrix(rep(1, NN)) # Start with the intercept model.
# estimate "known" theta for GEE models
if (is.gee || !is.glmm) {
theta_hat <- MASS::theta.mm(y = Y,
mu = mean(Y),
dfr = NN - 1)
}

pen <- 2 # penalty for GCV criterion -- could also switch to log(N) later
colnames(B) <- "Intercept"
@@ -110,6 +104,7 @@
breakFlag <- FALSE
ok <- TRUE
int.count <- 0
theta_hat <- 1

while (ok) { # this is such egregiously bad code lol
if (breakFlag) {
@@ -167,6 +162,7 @@
X_red <- seq(min(X_red),
max(X_red),
length.out = n.knot.max)
X_red <- round(X_red, 4)
}
} else {
# original candidate knot selection from 2017 Stoklosa & Warton paper
@@ -638,13 +634,13 @@
} else {
var_name_list1 <- vector("list")
if (trunc.type == 2) {
B_temp <- cbind(B, b1_new, b2_new) # Additive model with both truncated basis functions.
B_temp <- cbind(B, b1_new, b2_new) # Additive model with both truncated basis functions.
B_new <- cbind(b1_new, b2_new)
B_names <- c(B_name1, B_name2)
var_name_list1 <- c(var_name_list1, list(var_name))
var_name_list1 <- c(var_name_list1, list(var_name)) # Repeat it because there are two new truncated basis functions in the set.
} else {
B_temp <- cbind(B, b1_new) # Additive model with one truncated basis function (i.e., the positive part).
B_temp <- cbind(B, b1_new) # Additive model with one truncated basis function (i.e., the positive part).
B_new <- b1_new
B_names <- B_name1
var_name_list1 <- c(var_name_list1, list(var_name))
@@ -777,7 +773,7 @@
colnames(wic_mat_2)[(ncol_B + 1)] <- "Forward pass model"

wic_mat_2[1, (ncol_B + 1)] <- full.wic
wic1_2 <- backward_sel_WIC(Y = Y, B_new = B_new)
wic1_2 <- backward_sel_WIC(Y, B_new = B_new)
wic_mat_2[2, 2:(length(wic1_2) + 1)] <- wic1_2
WIC_2 <- sum(apply(wic_mat_2[seq(2), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new)
WIC_vec_2 <- c(WIC_vec_2, WIC_2)
@@ -787,8 +783,7 @@
B_new_2 <- as.matrix(B_new[, -(variable.lowest_2 + 1)])
cnames_2 <- c(cnames_2, list(colnames(B_new_2)))
for (i in 2:(ncol_B - 1)) {
wic1_2 <- backward_sel_WIC(Y = Y,
B_new = B_new_2)
wic1_2 <- backward_sel_WIC(Y, B_new = B_new_2)
if (i != (ncol_B - 1)) {
wic_mat_2[(i + 1), colnames(B_new_2)[-1]] <- wic1_2
WIC_2 <- sum(apply(wic_mat_2[seq(i + 1), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2)
@@ -837,24 +832,24 @@
final_mod <- geeM::geem(model_formula,
data = model_df,
id = id.vec,
family = MASS::negative.binomial(theta_hat, link = log),
family = MASS::negative.binomial(50, link = log),
corstr = cor.structure,
scale.fix = TRUE,
scale.fix = FALSE,
sandwich = sandwich.var)
} else {
if (glm.backend == "MASS") {
final_mod <- MASS::glm.nb(model_formula,
data = model_df,
method = "glm.fit2",
link = log,
init.theta = theta_hat,
init.theta = 1,
y = FALSE,
model = FALSE)
final_mod <- stripGLM(glm.obj = final_mod)
} else if (glm.backend == "speedglm") {
final_mod <- speedglm::speedglm(model_formula,
data = model_df,
family = MASS::negative.binomial(theta_hat, link = "log"),
family = MASS::negative.binomial(50, link = log),
trace = FALSE,
model = FALSE,
y = FALSE,
2 changes: 1 addition & 1 deletion R/plotModelCoefs.R
@@ -24,7 +24,7 @@
#' cell_offset <- createCellOffset(sim_counts)
#' scLANE_de_res <- getResultsDE(scLANE_models)
#' plotModelCoefs(scLANE_models,
#' gene = scLANE_de_res$Gene[1],
#' gene = "ACLY",
#' pt = sim_pseudotime,
#' expr.mat = sim_counts,
#' size.factor.offset = cell_offset)
2 changes: 1 addition & 1 deletion R/pullMARGESummary.R
@@ -20,7 +20,7 @@ pullMARGESummary <- function(marge.model = NULL,
sandwich.var = FALSE,
is.glmm = FALSE) {
# check inputs
if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary") }
if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary().") }
# handle the degenerate case
if (inherits(marge.model, "try-error")) {
res <- list(marge_pred_df = NULL,
5 changes: 3 additions & 2 deletions R/pullNullSummary.R
@@ -20,7 +20,7 @@ pullNullSummary <- function(null.model = NULL,
sandwich.var = FALSE,
is.glmm = FALSE) {
# check inputs
if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary") }
if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary().") }
# handle the degenerate case
if (inherits(null.model, "try-error")) {
res <- list(null_sumy_df = NULL,
@@ -73,7 +73,8 @@
res <- list(null_sumy_df = null_sumy_df,
null_pred_df = null_pred_df,
null_ll = ifelse(is.gee, NA_real_, as.numeric(stats::logLik(null.model))),
null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model)))
null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model)),
null_fit_notes = NA_character_)
}
return(res)
}
70 changes: 70 additions & 0 deletions R/scoreTestGEE.R
@@ -0,0 +1,70 @@
#' Use a Lagrange multiplier (score) test to compare nested GEE models.
#'
#' @name scoreTestGEE
#' @author Jack R. Leary
#' @description Performs a basic Lagrange multiplier test to determine whether an alternate model is significantly better than a nested null model. This is the GEE equivalent (kind of) of \code{\link{modelLRT}}. Be careful with small sample sizes.
#' @importFrom stats model.matrix predict pchisq
#' @param mod.1 The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL.
#' @param mod.0 The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL.
#' @param alt.df The dataframe used to fit the alternative model. Defaults to NULL.
#' @param null.df The dataframe used to fit the null model. Defaults to NULL.
#' @param sandwich.var Boolean specifying whether the sandwich variance-covariance matrix should be used. Defaults to FALSE.
#' @return A list containing the Score test statistic, a \emph{p}-value, and the degrees of freedom used in the test.
#' @details
#' \itemize{
#' \item Calculating the test statistic involves taking the inverse of the variance-covariance matrix of the coefficients. Ideally this would be done using the "true" inverse with something like \code{\link{solve}}, \code{\link{qr.solve}}, or \code{\link{chol2inv}}, but in practice this can cause issues when the variance-covariance matrix is near-singular. With this in mind, we use the Moore-Penrose pseudoinverse as implemented in \code{\link[MASS]{ginv}} instead, which leads to more stable results.
#' \item The \emph{p}-value is calculated using an asymptotic \eqn{\chi^2} distribution, with the degrees of freedom equal to the number of non-intercept coefficients in the alternative model.
#' }
#' @seealso \code{\link[geeM]{geem}}
#' @seealso \code{\link{waldTestGEE}}
#' @seealso \code{\link{modelLRT}}

scoreTestGEE <- function(mod.1 = NULL,
mod.0 = NULL,
alt.df = NULL,
null.df = NULL,
sandwich.var = FALSE) {
# check inputs
if (is.null(mod.1) || is.null(mod.0) || is.null(alt.df) || is.null(null.df)) { stop("Please provide all inputs to scoreTestGEE().") }
if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) {
res <- list(Score_Stat = NA_real_,
DF = NA_real_,
P_Val = NA_real_,
Notes = "No test performed due to model failure.")
return(res)
}
mod.1 <- mod.1$final_mod
if (!(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to scoreTestGEE().") }
if (length(coef(mod.0)) != 1) { stop("Null GEE model must be intercept-only.") }
if (length(coef(mod.1)) <= length(coef(mod.0))) {
# can't calculate Score statistic if both models are intercept-only
res <- list(Score_Stat = 0,
DF = 0,
P_Val = 1,
Notes = NA_character_)
} else {
X_null <- stats::model.matrix(mod.0, data = null.df)
X_alt <- stats::model.matrix(mod.1, data = alt.df)
residuals_null <- null.df$Y - exp(stats::predict(mod.0))
if (sandwich.var) {
V_null <- as.matrix(mod.0$var)
} else {
V_null <- as.matrix(mod.0$naiv.var)
}
X_alt_t <- t(X_alt)
U <- X_alt_t %*% residuals_null
V_U <- (X_alt_t * as.numeric(V_null)) %*% X_alt
V_U_inv <- try({ eigenMapMatrixInvert(V_U) }, silent = TRUE)
if (inherits(V_U_inv, "try-error")) {
V_U_inv <- eigenMapPseudoInverse(V_U)
}
test_stat <- as.numeric(t(U) %*% V_U_inv %*% U)
df1 <- ncol(X_alt) - ncol(X_null)
p_value <- 1 - stats::pchisq(test_stat, df = df1)
res <- list(Score_Stat = test_stat,
DF = df1,
P_Val = p_value,
Notes = NA_character_)
}
return(res)
}
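
In equation form, the test assembled in the branch above works out to the following. This is a sketch of what the code computes (not a general derivation); here \(\hat{v}_0\) is the model-based or sandwich variance of the null (intercept-only) coefficient, and a pseudoinverse replaces \(V_U^{-1}\) when \(V_U\) is near-singular:

```latex
% Sketch of the score statistic as computed by scoreTestGEE() above.
U = X_{\mathrm{alt}}^{\top}\left(y - \hat{\mu}_0\right), \qquad
\hat{\mu}_0 = \exp\!\left(X_{\mathrm{null}}\hat{\beta}_0\right), \qquad
V_U = \hat{v}_0\, X_{\mathrm{alt}}^{\top} X_{\mathrm{alt}}
\quad\Longrightarrow\quad
S = U^{\top} V_U^{-1} U \;\overset{a}{\sim}\; \chi^2_{\nu}, \qquad
\nu = \operatorname{ncol}(X_{\mathrm{alt}}) - \operatorname{ncol}(X_{\mathrm{null}}), \qquad
p = P\!\left(\chi^2_{\nu} > S\right)
```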
7 changes: 3 additions & 4 deletions R/stat_out_score_gee_null.R
@@ -35,10 +35,9 @@ stat_out_score_gee_null <- function(Y = NULL,
id = id.vec,
data = NULL,
corstr = cor.structure,
family = MASS::negative.binomial(theta.hat),
scale.fix = TRUE,
sandwich = FALSE,
maxit = 10)
family = MASS::negative.binomial(50, link = "log"),
scale.fix = FALSE,
sandwich = FALSE)
alpha_est <- ests$alpha
sigma_est <- ests$phi
mu_est <- as.matrix(stats::fitted.values(ests))
2 changes: 1 addition & 1 deletion R/summary.R
@@ -31,7 +31,7 @@ summary.scLANE <- function(test.dyn.res = NULL,
summary_stats$mean_adj_p_value <- mean(adj_p_values, na.rm = TRUE)
summary_stats$test_type <- ifelse(test.dyn.res[[1]][[1]]$Test_Stat_Type == "LRT",
"Likelihood Ratio Test",
"Wald Test")
ifelse(test.dyn.res[[1]][[1]]$Test_Stat_Type == "Wald", "Wald Test", "Score Test"))
class(summary_stats) <- "summary.scLANE"
return(summary_stats)
}
