Merge pull request #248 from jr-leary7/dev

Dev
jr-leary7 · Oct 12, 2024 · 5efbe2c · 5efbe2c
2 parents 4956c19 + d51b5d9
commit 5efbe2c
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 25 deletions.
diff --git a/R/marge2.R b/R/marge2.R
@@ -25,6 +25,7 @@
 #' @param sandwich.var (Optional) Should the sandwich variance estimator be used instead of the model-based estimator? Defaults to FALSE.
 #' @param approx.knot (Optional) Should the set of candidate knots be subsampled in order to speed up computation? This has little effect on the final fit, but can improve computation time somewhat. Defaults to TRUE.
 #' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses random sampling (don't worry, a random seed is set internally) to select this number of unique values from the reduced set of all candidate knots. Defaults to 50.
+#' @param glm.backend (Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS". 
 #' @param tols_score (Optional) The set tolerance for monitoring the convergence for the difference in score statistics between the parent and candidate model (this is the lack-of-fit criterion used for MARGE). Defaults to 0.00001.
 #' @param minspan (Optional) A set minimum span value. Defaults to NULL.
 #' @param return.basis (Optional) Whether the basis model matrix should be returned as part of the \code{marge} model object. Defaults to FALSE.
@@ -63,6 +64,7 @@ marge2 <- function(X_pred = NULL,
                    sandwich.var = FALSE,
                    approx.knot = TRUE,
                    n.knot.max = 50,
+                   glm.backend = "MASS", 
                    tols_score = 1e-5,
                    minspan = NULL,
                    return.basis = FALSE,
@@ -73,7 +75,7 @@ marge2 <- function(X_pred = NULL,
   if (is.gee & is.null(id.vec)) { stop("id.vec in marge2() must be non-null if is.gee = TRUE.") }
   if (is.gee & (!cor.structure %in% c("independence", "exchangeable", "ar1"))) { stop("cor.structure in marge2() must be a known type if is.gee = TRUE.") }
   if (is.gee & is.unsorted(id.vec)) { stop("Your data must be ordered by subject, please do so before running marge2() with is.gee = TRUE.") }
-
+  if (!glm.backend %in% c("MASS", "speedglm")) { stop("Please choose a valid GLM backend for model fitting.") }
   # Algorithm 2 (forward pass) as in Friedman (1991). Uses score statistics instead of RSS, etc.
   NN <- length(Y)  # Total sample size
   if (is.gee) {
@@ -837,18 +839,26 @@ marge2 <- function(X_pred = NULL,
                               scale.fix = FALSE,
                               sandwich = sandwich.var)
     } else {
-      final_mod <- MASS::glm.nb(model_formula,
-                                data = model_df,
-                                method = "glm.fit2",
-                                link = log,
-                                init.theta = theta_hat,
-                                y = FALSE,
-                                model = FALSE)
+      if (glm.backend == "MASS") {
+        final_mod <- MASS::glm.nb(model_formula,
+                                  data = model_df,
+                                  method = "glm.fit2",
+                                  link = log,
+                                  init.theta = theta_hat,
+                                  y = FALSE,
+                                  model = FALSE)
+        final_mod <- stripGLM(glm.obj = final_mod)
+      } else if (glm.backend == "speedglm") {
+        final_mod <- speedglm::speedglm(model_formula, 
+                                        data = model_df, 
+                                        family = MASS::negative.binomial(theta_hat, link = "log"), 
+                                        trace = FALSE, 
+                                        model = TRUE, 
+                                        y = FALSE, 
+                                        fitted = TRUE)
+      }
     }
     # format results
-    if (!is.gee) {
-      final_mod <- stripGLM(glm.obj = final_mod)
-    }
     res <- list(final_mod = final_mod,
                 basis_mtx = NULL,
                 WIC_mtx = NULL,

diff --git a/R/stripGLM.R b/R/stripGLM.R
@@ -9,8 +9,8 @@
 
 stripGLM <- function(glm.obj = NULL) {
   # check inputs
-  if (inherits(glm.obj, "try-error")) { return(glm.obj) }
   if (is.null(glm.obj)) { stop("You forgot to supply inputs to stripGLM().") }
+  if (inherits(glm.obj, "try-error")) { return(glm.obj) }
   if (!inherits(glm.obj, "glm")) { stop("Input to stripGLM() must be of class glm.") }
 
   # strip out unnecessary glm pieces

diff --git a/R/testDynamic.R b/R/testDynamic.R
@@ -14,24 +14,23 @@
 #' @importFrom withr with_output_sink
 #' @importFrom MASS glm.nb negative.binomial theta.mm
 #' @importFrom dplyr rename mutate relocate
-#' @importFrom broom.mixed tidy
 #' @importFrom purrr imap reduce
 #' @importFrom stats predict logLik deviance offset
 #' @importFrom geeM geem
 #' @importFrom glmmTMB glmmTMB nbinom2
 #' @param expr.mat Either a \code{SingleCellExperiment}, \code{Seurat}, or \code{CellDataSet} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.
 #' @param pt Either a \code{\link[slingshot]{SlingshotDataSet}} object from which pseudotime can be generated, or a data.frame containing the pseudotime or latent time estimates for each cell (can be multiple columns / lineages). Defaults to NULL.
 #' @param genes A character vector of genes to model. If not provided, defaults to all genes in \code{expr.mat}. Defaults to NULL.
-#' @param n.potential.basis.fns (Optional) The maximum number of possible basis functions. See the parameter \code{M} in \code{\link{marge2}}. Defaults to 5.
 #' @param size.factor.offset (Optional) An offset to be included in the final model fit. Can be generated easily with \code{\link{createCellOffset}}. Defaults to NULL.
 #' @param is.gee Should a GEE framework be used instead of the default GLM? Defaults to FALSE.
 #' @param cor.structure If the GEE framework is used, specifies the desired working correlation structure. Must be one of "ar1", "independence", or "exchangeable". Defaults to "ar1".
 #' @param gee.bias.correction.method Specify which small-sample bias correction to be used on the sandwich variance-covariance matrix prior to test statistic estimation. Options are "kc" and "df". Defaults to NULL, indicating the use of the model-based variance.
-#' @param id.vec If a GEE or GLMM framework is being used, a vector of subject IDs to use as input to \code{\link[geeM]{geem}} or \code{\link[glmmTMB]{glmmTMB}}. Defaults to NULL.
 #' @param is.glmm Should a GLMM framework be used instead of the default GLM? Defaults to FALSE.
-#' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 4L.
-#' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
+#' @param id.vec If a GEE or GLMM framework is being used, a vector of subject IDs to use as input to \code{\link[geeM]{geem}} or \code{\link[glmmTMB]{glmmTMB}}. Defaults to NULL.
 #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to TRUE.
+#' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
+#' @param n.potential.basis.fns (Optional) The maximum number of possible basis functions. See the parameter \code{M} in \code{\link{marge2}}. Defaults to 5.
+#' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 4L.
 #' @param verbose (Optional) A boolean indicating whether a progress bar should be printed to the console. Defaults to TRUE.
 #' @param random.seed (Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.
 #' @details
@@ -74,7 +73,7 @@ testDynamic <- function(expr.mat = NULL,
                         random.seed = 312) {
   # check inputs
   if (is.null(expr.mat) || is.null(pt)) { stop("You forgot some inputs to testDynamic().") }
-
+  
   # get raw counts from SingleCellExperiment or Seurat object & transpose to cell x gene dense matrix
   if (is.null(genes)) {
     genes <- rownames(expr.mat)
@@ -191,7 +190,7 @@ testDynamic <- function(expr.mat = NULL,
                  cor.structure = cor.structure,
                  sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE),
                  M = n.potential.basis.fns,
-                 approx.knot = approx.knot,
+                 approx.knot = approx.knot, 
                  return.basis = TRUE)
         }, silent = TRUE)
       } else if (is.glmm) {
@@ -253,20 +252,19 @@ testDynamic <- function(expr.mat = NULL,
                            se = TRUE)
         }, silent = TRUE)
       } else {
+        theta_hat <- MASS::theta.mm(y = null_mod_df$Y_null,
+                                    mu = mean(null_mod_df$Y_null),
+                                    dfr = length(null_mod_df$Y_null) - 1)
         null_mod <- try({
           MASS::glm.nb(null_mod_formula,
                        data = null_mod_df,
                        method = "glm.fit2",
                        y = FALSE,
                        model = FALSE,
-                       init.theta = 1,
+                       init.theta = theta_hat,
                        link = log)
         }, silent = TRUE)
-      }
-
-      # slim down GLM object if not a GEE / GLMM model (which are much smaller for some reason)
-      if (!(is.gee || is.glmm)) {
-        null_mod <- stripGLM(glm.obj = null_mod)
+        null_mod <- stripGLM(null_mod)
       }
 
       # record model fit status

diff --git a/man/marge2.Rd b/man/marge2.Rd