From 4dc10cd2a696b07e6d3fc2fc730de686e89456ba Mon Sep 17 00:00:00 2001
From: Jack Leary <j.leary@ufl.edu>
Date: Sat, 12 Oct 2024 13:07:52 -0400
Subject: [PATCH 1/3] added experimental speed flag to marge2()

---
 R/marge2.R         | 32 +++++++++++++++++++++-----------
 R/stripGLM.R       |  2 +-
 R/testDynamic.R    | 25 +++++++++++++------------
 man/marge2.Rd      |  3 +++
 man/testDynamic.Rd |  2 ++
 5 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/R/marge2.R b/R/marge2.R
index 1c238b4..9e308d7 100644
--- a/R/marge2.R
+++ b/R/marge2.R
@@ -25,6 +25,7 @@
 #' @param sandwich.var (Optional) Should the sandwich variance estimator be used instead of the model-based estimator? Default to FALSE.
 #' @param approx.knot (Optional) Should the set of candidate knots be subsampled in order to speed up computation? This has little effect on the final fit, but can improve computation time somewhat. Defaults to TRUE.
 #' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses random sampling (don't worry, a random seed is set internally) to select this number of unique values from the reduced set of all candidate knots. Defaults to 50.
+#' @param glm.backend (Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS". 
 #' @param tols_score (Optional) The set tolerance for monitoring the convergence for the difference in score statistics between the parent and candidate model (this is the lack-of-fit criterion used for MARGE). Defaults to 0.00001.
 #' @param minspan (Optional) A set minimum span value. Defaults to NULL.
 #' @param return.basis (Optional) Whether the basis model matrix should be returned as part of the \code{marge} model object. Defaults to FALSE.
@@ -63,6 +64,7 @@ marge2 <- function(X_pred = NULL,
                    sandwich.var = FALSE,
                    approx.knot = TRUE,
                    n.knot.max = 50,
+                   glm.backend = "MASS", 
                    tols_score = 1e-5,
                    minspan = NULL,
                    return.basis = FALSE,
@@ -73,7 +75,7 @@ marge2 <- function(X_pred = NULL,
   if (is.gee & is.null(id.vec)) { stop("id.vec in marge2() must be non-null if is.gee = TRUE.") }
   if (is.gee & (!cor.structure %in% c("independence", "exchangeable", "ar1"))) { stop("cor.structure in marge2() must be a known type if is.gee = TRUE.") }
   if (is.gee & is.unsorted(id.vec)) { stop("Your data must be ordered by subject, please do so before running marge2() with is.gee = TRUE.") }
-
+  if (!glm.backend %in% c("MASS", "speedglm")) { stop("Please choose a valid GLM backend for model fitting.") }
   # Algorithm 2 (forward pass) as in Friedman (1991). Uses score statistics instead of RSS, etc.
   NN <- length(Y)  # Total sample size
   if (is.gee) {
@@ -837,18 +839,26 @@ marge2 <- function(X_pred = NULL,
                               scale.fix = FALSE,
                               sandwich = sandwich.var)
     } else {
-      final_mod <- MASS::glm.nb(model_formula,
-                                data = model_df,
-                                method = "glm.fit2",
-                                link = log,
-                                init.theta = theta_hat,
-                                y = FALSE,
-                                model = FALSE)
+      if (glm.backend == "MASS") {
+        final_mod <- MASS::glm.nb(model_formula,
+                                  data = model_df,
+                                  method = "glm.fit2",
+                                  link = log,
+                                  init.theta = theta_hat,
+                                  y = FALSE,
+                                  model = FALSE)
+        final_mod <- stripGLM(glm.obj = final_mod)
+      } else if (glm.backend == "speedglm") {
+        final_mod <- speedglm::speedglm(model_formula, 
+                                        data = model_df, 
+                                        family = MASS::negative.binomial(theta_hat, link = "log"), 
+                                        trace = FALSE, 
+                                        model = TRUE, 
+                                        y = FALSE, 
+                                        fitted = TRUE)
+      }
     }
     # format results
-    if (!is.gee) {
-      final_mod <- stripGLM(glm.obj = final_mod)
-    }
     res <- list(final_mod = final_mod,
                 basis_mtx = NULL,
                 WIC_mtx = NULL,
diff --git a/R/stripGLM.R b/R/stripGLM.R
index c149616..c811d4d 100644
--- a/R/stripGLM.R
+++ b/R/stripGLM.R
@@ -9,8 +9,8 @@
 
 stripGLM <- function(glm.obj = NULL) {
   # check inputs
-  if (inherits(glm.obj, "try-error")) { return(glm.obj) }
   if (is.null(glm.obj)) { stop("You forgot to supply inputs to stripGLM().") }
+  if (inherits(glm.obj, "try-error")) { return(glm.obj) }
   if (!inherits(glm.obj, "glm")) { stop("Input to stripGLM() must be of class glm.") }
 
   # strip out unnecessary glm pieces
diff --git a/R/testDynamic.R b/R/testDynamic.R
index a7e583b..38dd468 100644
--- a/R/testDynamic.R
+++ b/R/testDynamic.R
@@ -14,7 +14,6 @@
 #' @importFrom withr with_output_sink
 #' @importFrom MASS glm.nb negative.binomial theta.mm
 #' @importFrom dplyr rename mutate relocate
-#' @importFrom broom.mixed tidy
 #' @importFrom purrr imap reduce
 #' @importFrom stats predict logLik deviance offset
 #' @importFrom geeM geem
@@ -22,16 +21,17 @@
 #' @param expr.mat Either a \code{SingleCellExperiment}, \code{Seurat}, or \code{CellDataSet} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.
 #' @param pt Either the output from \code{\link[slingshot]{SlingshotDataSet}} object from which pseudotime can be generated, or a data.frame containing the pseudotime or latent time estimates for each cell (can be multiple columns / lineages). Defaults to NULL.
 #' @param genes A character vector of genes to model. If not provided, defaults to all genes in \code{expr.mat}. Defaults to NULL.
-#' @param n.potential.basis.fns (Optional) The maximum number of possible basis functions. See the parameter \code{M} in \code{\link{marge2}}. Defaults to 5.
 #' @param size.factor.offset (Optional) An offset to be included in the final model fit. Can be generated easily with \code{\link{createCellOffset}}. Defaults to NULL.
 #' @param is.gee Should a GEE framework be used instead of the default GLM? Defaults to FALSE.
 #' @param cor.structure If the GEE framework is used, specifies the desired working correlation structure. Must be one of "ar1", "independence", or "exchangeable". Defaults to "ar1".
 #' @param gee.bias.correction.method Specify which small-sample bias correction to be used on the sandwich variance-covariance matrix prior to test statistic estimation. Options are "kc" and "df". Defaults to NULL, indicating the use of the model-based variance.
-#' @param id.vec If a GEE or GLMM framework is being used, a vector of subject IDs to use as input to \code{\link[geeM]{geem}} or \code{\link[glmmTMB]{glmmTMB}}. Defaults to NULL.
 #' @param is.glmm Should a GLMM framework be used instead of the default GLM? Defaults to FALSE.
-#' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 4L.
-#' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
+#' @param id.vec If a GEE or GLMM framework is being used, a vector of subject IDs to use as input to \code{\link[geeM]{geem}} or \code{\link[glmmTMB]{glmmTMB}}. Defaults to NULL.
 #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to TRUE.
+#' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
+#' @param n.potential.basis.fns (Optional) The maximum number of possible basis functions. See the parameter \code{M} in \code{\link{marge2}}. Defaults to 5.
+#' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 4L.
+#' @param glm.backend (Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS". 
 #' @param verbose (Optional) A boolean indicating whether a progress bar should be printed to the console. Defaults to TRUE.
 #' @param random.seed (Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.
 #' @details
@@ -74,7 +74,8 @@ testDynamic <- function(expr.mat = NULL,
                         random.seed = 312) {
   # check inputs
   if (is.null(expr.mat) || is.null(pt)) { stop("You forgot some inputs to testDynamic().") }
-
+  if (!glm.backend %in% c("MASS", "speedglm")) { stop("Please choose a valid GLM backend for model fitting.") }
+  
   # get raw counts from SingleCellExperiment or Seurat object & transpose to cell x gene dense matrix
   if (is.null(genes)) {
     genes <- rownames(expr.mat)
@@ -192,6 +193,7 @@ testDynamic <- function(expr.mat = NULL,
                  sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE),
                  M = n.potential.basis.fns,
                  approx.knot = approx.knot,
+                 glm.backend = glm.backend, 
                  return.basis = TRUE)
         }, silent = TRUE)
       } else if (is.glmm) {
@@ -253,20 +255,19 @@ testDynamic <- function(expr.mat = NULL,
                            se = TRUE)
         }, silent = TRUE)
       } else {
+        theta_hat <- MASS::theta.mm(y = null_mod_df$Y_null,
+                                    mu = mean(null_mod_df$Y_null),
+                                    dfr = length(null_mod_df$subject) - 1)
         null_mod <- try({
           MASS::glm.nb(null_mod_formula,
                        data = null_mod_df,
                        method = "glm.fit2",
                        y = FALSE,
                        model = FALSE,
-                       init.theta = 1,
+                       init.theta = theta_hat,
                        link = log)
         }, silent = TRUE)
-      }
-
-      # slim down GLM object if not a GEE / GLMM model (which are much smaller for some reason)
-      if (!(is.gee || is.glmm)) {
-        null_mod <- stripGLM(glm.obj = null_mod)
+        null_mod <- stripGLM(null_mod)
       }
 
       # record model fit status
diff --git a/man/marge2.Rd b/man/marge2.Rd
index 1f10a20..962edc1 100644
--- a/man/marge2.Rd
+++ b/man/marge2.Rd
@@ -16,6 +16,7 @@ marge2(
   sandwich.var = FALSE,
   approx.knot = TRUE,
   n.knot.max = 50,
+  glm.backend = "MASS",
   tols_score = 1e-05,
   minspan = NULL,
   return.basis = FALSE,
@@ -46,6 +47,8 @@ marge2(
 
 \item{n.knot.max}{(Optional) The maximum number of candidate knots to consider. Uses random sampling (don't worry, a random seed is set internally) to select this number of unique values from the reduced set of all candidate knots. Defaults to 50.}
 
+\item{glm.backend}{(Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS".}
+
 \item{tols_score}{(Optional) The set tolerance for monitoring the convergence for the difference in score statistics between the parent and candidate model (this is the lack-of-fit criterion used for MARGE). Defaults to 0.00001.}
 
 \item{minspan}{(Optional) A set minimum span value. Defaults to NULL.}
diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd
index efc641a..4d3d3b6 100644
--- a/man/testDynamic.Rd
+++ b/man/testDynamic.Rd
@@ -52,6 +52,8 @@ testDynamic(
 \item{verbose}{(Optional) A boolean indicating whether a progress bar should be printed to the console. Defaults to TRUE.}
 
 \item{random.seed}{(Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.}
+
+\item{glm.backend}{(Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS".}
 }
 \value{
 A list of lists, where each element is a gene and each gene contains sublists for each lineage. Each gene-lineage sublist contains a gene name, lineage number, default \code{marge} vs. null model test results, model statistics, and fitted values. Use \code{\link{getResultsDE}} to tidy the results.

From 671eeb3b0d9cd8aaf7d79f0c348b97af7112602d Mon Sep 17 00:00:00 2001
From: Jack Leary <j.leary@ufl.edu>
Date: Sat, 12 Oct 2024 13:23:13 -0400
Subject: [PATCH 2/3] fixed failing tests caused by previous commit

---
 R/testDynamic.R    | 5 +----
 man/testDynamic.Rd | 2 --
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/R/testDynamic.R b/R/testDynamic.R
index 38dd468..cef6914 100644
--- a/R/testDynamic.R
+++ b/R/testDynamic.R
@@ -31,7 +31,6 @@
 #' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
 #' @param n.potential.basis.fns (Optional) The maximum number of possible basis functions. See the parameter \code{M} in \code{\link{marge2}}. Defaults to 5.
 #' @param n.cores (Optional) If running in parallel, how many cores should be used? Defaults to 4L.
-#' @param glm.backend (Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS". 
 #' @param verbose (Optional) A boolean indicating whether a progress bar should be printed to the console. Defaults to TRUE.
 #' @param random.seed (Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.
 #' @details
@@ -74,7 +73,6 @@ testDynamic <- function(expr.mat = NULL,
                         random.seed = 312) {
   # check inputs
   if (is.null(expr.mat) || is.null(pt)) { stop("You forgot some inputs to testDynamic().") }
-  if (!glm.backend %in% c("MASS", "speedglm")) { stop("Please choose a valid GLM backend for model fitting.") }
   
   # get raw counts from SingleCellExperiment or Seurat object & transpose to cell x gene dense matrix
   if (is.null(genes)) {
@@ -192,8 +190,7 @@ testDynamic <- function(expr.mat = NULL,
                  cor.structure = cor.structure,
                  sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE),
                  M = n.potential.basis.fns,
-                 approx.knot = approx.knot,
-                 glm.backend = glm.backend, 
+                 approx.knot = approx.knot, 
                  return.basis = TRUE)
         }, silent = TRUE)
       } else if (is.glmm) {
diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd
index 4d3d3b6..efc641a 100644
--- a/man/testDynamic.Rd
+++ b/man/testDynamic.Rd
@@ -52,8 +52,6 @@ testDynamic(
 \item{verbose}{(Optional) A boolean indicating whether a progress bar should be printed to the console. Defaults to TRUE.}
 
 \item{random.seed}{(Optional) The random seed used to initialize RNG streams in parallel. Defaults to 312.}
-
-\item{glm.backend}{(Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS".}
 }
 \value{
 A list of lists, where each element is a gene and each gene contains sublists for each lineage. Each gene-lineage sublist contains a gene name, lineage number, default \code{marge} vs. null model test results, model statistics, and fitted values. Use \code{\link{getResultsDE}} to tidy the results.

From d51b5d98cd5730ef84c289089628b7946331ea41 Mon Sep 17 00:00:00 2001
From: Jack Leary <j.leary@ufl.edu>
Date: Sat, 12 Oct 2024 14:01:41 -0400
Subject: [PATCH 3/3] fixed failing tests introduced by prior commits -- closes
 #247

---
 R/testDynamic.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/testDynamic.R b/R/testDynamic.R
index cef6914..db4d098 100644
--- a/R/testDynamic.R
+++ b/R/testDynamic.R
@@ -254,7 +254,7 @@ testDynamic <- function(expr.mat = NULL,
       } else {
         theta_hat <- MASS::theta.mm(y = null_mod_df$Y_null,
                                     mu = mean(null_mod_df$Y_null),
-                                    dfr = length(null_mod_df$subject) - 1)
+                                    dfr = length(null_mod_df$Y_null) - 1)
         null_mod <- try({
           MASS::glm.nb(null_mod_formula,
                        data = null_mod_df,