minor bugs removed & tests updated

jr-leary7 · Sep 1, 2023 · 6f68e8e · 6f68e8e
1 parent 810d46a
commit 6f68e8e
Show file tree

Hide file tree

Showing 14 changed files with 94 additions and 79 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: scLANE
 Type: Package
 Title: Model gene expression dynamics with spline-based NB GLMs, GEEs, & GLMMs
-Version: 0.7.1
+Version: 0.7.2
 Authors@R: c(person(given = "Jack", family = "Leary", email = "[email protected]", role = c("aut", "cre")), 
              person(given = "Rhonda", family = "Bacher", email = "[email protected]", role = c("ctb", "fnd")))
 Description: This package uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# scLANE 0.7.2 
+
+* Added a function named `sortGenesHeatmap()` that aids in the creation of expression cascade heatmaps by sorting genes according to where in pseudotime their peak expression is. 
+* Changed the parameter `approx.knot` in the `testDynamic()` function to use (stochasticity-controlled) subsampling instead of `seq()` to reduce candidate knot space. 
+
 # scLANE 0.7.1
 
 * Changed input format of all functions to allow counts matrices formatted as `SingleCellExperiment` or `Seurat` objects, sparse matrices, or dense matrices.

diff --git a/R/marge2.R b/R/marge2.R
@@ -16,20 +16,20 @@
 #' @param X_pred A matrix of the predictor variables. Defaults to NULL.
 #' @param Y The response variable. Defaults to NULL.
 #' @param Y.offset (Optional) An vector of per-cell size factors to be included in the final model fit as an offset. Defaults to NULL.
-#' @param M A set threshold for the number of basis functions to be used. Defaults to 5.
+#' @param M A set threshold for the maximum number of basis functions to be chosen. Defaults to 5.
 #' @param is.gee Should the \code{geeM} package be used to fit a negative binomial GEE? Defaults to FALSE.
 #' @param id.vec If \code{is.gee = TRUE}, must be a vector of ID values for the observations. Data must be sorted such that the subjects are in order! Defaults to NULL.
-#' @param cor.structure If \code{is.gee = TRUE}, must be a string specifying the desired correlation structure for the NB GEE. Defaults to "ar1".
-#' @param approx.knot (Optional) Should the set of candidate knots be reduce in order to speed up computation? This has little effect on the final fit, but can improve computation time significantly. Defaults to TRUE.
-#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses uniform sampling to set this number of unique values from the reduced set of all candidate knots. Defaults to 20.
+#' @param cor.structure If \code{is.gee = TRUE}, a string specifying the desired correlation structure for the NB GEE. Defaults to "ar1".
+#' @param approx.knot (Optional) Should the set of candidate knots be subsampled in order to speed up computation? This has little effect on the final fit, but can improve computation time somewhat. Defaults to TRUE.
+#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses random sampling (don't worry, a random seed is set internally) to select this number of unique values from the reduced set of all candidate knots. Defaults to 50.
 #' @param tols_score (Optional) The set tolerance for monitoring the convergence for the difference in score statistics between the parent and candidate model (this is the lack-of-fit criterion used for MARGE). Defaults to 0.00001.
 #' @param minspan (Optional) A set minimum span value. Defaults to NULL.
 #' @param return.basis (Optional) Whether the basis model matrix should be returned as part of the \code{marge} model object. Defaults to FALSE.
 #' @param return.WIC (Optional) Whether the WIC matrix should be returned as part of the \code{marge} model object. Defaults to FALSE.
 #' @param return.GCV (Optional) Whether the final GCV value should be returned as part of the \code{marge} model object. Defaults to FALSE.
 #' @details
 #' \itemize{
-#' \item If models are being fit using an offset (as is recommended), it is assumed that the offset represents a library size factor (or similar quantity) generated using e.g., \code{\link{createCellOffset}} or \code{\link[scuttle]{computeLibraryFactors}}. Since this quantity represents a scaling factor divide by sequencing depth, the offset is formulated as \code{offset(log(1 / cell_offset))}. The inversion is necessary because the rate term, i.e. the sequencing depth, is the denominator of the estimated size factors.
+#' \item If models are being fit using an offset (as is recommended), it is assumed that the offset represents a library size factor (or similar quantity) generated using e.g., \code{\link{createCellOffset}} or \code{\link[scuttle]{computeLibraryFactors}}. Since this quantity represents a scaling factor divided by sequencing depth, the offset is formulated as \code{offset(log(1 / cell_offset))}. The inversion is necessary because the rate term, i.e. the sequencing depth, is the denominator of the estimated size factors.
 #' }
 #' @return An object of class \code{marge} containing the fitted model & other optional quantities of interest (basis function matrix, GCV, etc.).
 #' @references Friedman, J. (1991). Multivariate adaptive regression splines. \emph{The Annals of Statistics}, \strong{19}, 1--67.
@@ -69,7 +69,7 @@ marge2 <- function(X_pred = NULL,
                    id.vec = NULL,
                    cor.structure = "ar1",
                    approx.knot = TRUE,
-                   n.knot.max = 20,
+                   n.knot.max = 50,
                    tols_score = 1e-5,
                    minspan = NULL,
                    return.basis = FALSE,
@@ -78,7 +78,7 @@ marge2 <- function(X_pred = NULL,
   # check inputs
   if (is.null(X_pred) || is.null(Y)) { stop("Some required inputs to marge2() are missing.") }
   if (is.gee & is.null(id.vec)) { stop("id.vec in marge2() must be non-null if is.gee = TRUE.") }
-  if (is.gee & (!cor.structure %in% c("independence", "exchangeable", "ar1", "unstructured"))) { stop("cor.structure in marge2() must be a known type if is.gee = TRUE.") }
+  if (is.gee & (!cor.structure %in% c("independence", "exchangeable", "ar1"))) { stop("cor.structure in marge2() must be a known type if is.gee = TRUE.") }
   if (is.gee & is.unsorted(id.vec)) { stop("Your data must be ordered by subject, please do so before running marge2() with is.gee = TRUE.") }
 
   # Algorithm 2 (forward pass) as in Friedman (1991). Uses score statistics instead of RSS, etc.
@@ -166,7 +166,8 @@ marge2 <- function(X_pred = NULL,
         X_red2 <- max_span(X_red = X, q = q)
         X_red <- intersect(X_red1, X_red2)
         if (length(X_red) > n.knot.max) {
-          X_red <- seq(min(X_red), max(X_red), length.out = n.knot.max)
+          set.seed(as.integer(length(X_red)))
+          X_red <- sample(X_red, size = n.knot.max)
         }
       } else {
         # original candidate knot selection from 2017 Stoklosa & Warton paper
@@ -186,7 +187,8 @@ marge2 <- function(X_pred = NULL,
       in.set <- ifelse(ncol(B) > 1, sum(!var_name_vec %in% var_name), 0)
 
       for (t in seq(length(X_red))) {
-        b1_new <- matrix(tp1(x = X,  t = X_red[t]), ncol = 1)  # Pairs of truncated functions.
+        # pairs of truncated functions
+        b1_new <- matrix(tp1(x = X,  t = X_red[t]), ncol = 1)
         b2_new <- matrix(tp2(x = X, t = X_red[t]), ncol = 1)
 
         score_knot_both_int <- NULL

diff --git a/R/max_span.R b/R/max_span.R
@@ -1,9 +1,9 @@
 #' Truncates the predictor variable value to exclude extreme values in knots selection.
 #'
 #' @name max_span
-#' @param X_red : a vector of values for the predictor variable.
-#' @param q : the number of predictors used.
-#' @param alpha : see Friedman (1991) equation (45). The default is 0.05.
+#' @param X_red A vector of values for the predictor variable.
+#' @param q The number of predictors used.
+#' @param alpha See Friedman (1991) equation (45). Defaults to 0.05.
 #' @details Note that this equation comes from Friedman (1991) equation (45).
 #' @return \code{max_span} returns a vector of truncated predictor variable values.
 #' @author Jakub Stoklosa and David I. Warton.
@@ -18,10 +18,8 @@ max_span <- function(X_red = NULL,
   x <- sort(unique(X_red))
   maxspan <- round((3 - log2(alpha / q)))
   x_new <- x[-c(1:maxspan, floor(N - maxspan + 1):N)]
-  if (length(x_new) > 0) {
-    res <- x_new
-  } else {
-    res <- x
+  if (length(x_new) == 0) {
+    x_new <- x
   }
-  return(res)
+  return(x_new)
 }
diff --git a/R/min_span.R b/R/min_span.R
@@ -1,10 +1,10 @@
 #' A truncation function applied on the predictor variable for knot selection.
 #'
 #' @name min_span
-#' @param X_red : a vector of reduced predictor variable values. Defaults to NULL.
-#' @param q : the number of predictor variables used. Defaults to NULL.
-#' @param minspan : the set minimum span value. Defaults to \code{round((-log2(-(1 / (q * N)) * log(1 - alpha)) / 2.5))}.
-#' @param alpha : see Friedman (1991) equation (43). The default is 0.05.
+#' @param X_red A vector of reduced predictor variable values. Defaults to NULL.
+#' @param q The number of predictor variables used. Defaults to NULL.
+#' @param minspan The set minimum span value. Defaults to \code{round((-log2(-(1 / (q * N)) * log(1 - alpha)) / 2.5))}.
+#' @param alpha See Friedman (1991) equation (43). Defaults to 0.05.
 #' @details  This function selects a minimum span between the knots to mitigate runs of correlated noise in the input data and hence avoiding estimation issues, this equation comes from Friedman (1991) equation 43.
 #' @return \code{min_span} returns a vector of truncated predictor variable values.
 #' @author Jakub Stoklosa and David I. Warton.
@@ -20,7 +20,7 @@ min_span <- function(X_red = NULL,
   N <- length(X_red)
   x <- sort(X_red)
   if (is.null(minspan)) {
-    minspan <- round((-log2(-(1 / (q * N)) * log(1 - alpha)) / 2.5))
+    minspan <- round(-log2(-(1 / (q * N)) * log(1 - alpha)) / 2.5)
   }
   # run function
   x_new <- min(x, na.rm = TRUE)

diff --git a/R/plotModels.R b/R/plotModels.R
@@ -276,7 +276,8 @@ plotModels <- function(test.dyn.res = NULL,
                              size = 0.5)
     if (requireNamespace("ggh4x", quietly = TRUE)) {
       p <- p + ggh4x::facet_nested_wrap(~paste0("Lineage ", LINEAGE) + MODEL + ID,
-                                        nrow = length(levels(counts_df$MODEL)))
+                                        nrow = length(levels(counts_df$MODEL)),
+                                        strip = ggh4x::strip_nested(background_x = list(ggplot2::element_rect(linewidth = gg.theme$line$linewidth))))
     } else {
       p <- p + ggplot2::facet_wrap(~paste0("Lineage ", LINEAGE) + MODEL + ID)
     }
@@ -298,15 +299,18 @@ plotModels <- function(test.dyn.res = NULL,
                        fill = "Subject",
                        title = gene) +
          gg.theme +
-         ggplot2::theme(plot.title = ggplot2::element_text(face = "italic")) +
+         ggplot2::theme(plot.title = ggplot2::element_text(face = "italic"),
+                        strip.clip = "off",
+                        strip.background = ggplot2::element_rect(linewidth = gg.theme$line$linewidth)) +
          ggplot2::guides(color = ggplot2::guide_legend(override.aes = list(size = 4, alpha = 1)))
   } else {
     p <- ggplot2::ggplot(counts_df, ggplot2::aes(x = PT, y = COUNT, color = LINEAGE)) +
          ggplot2::geom_point(alpha = 0.5,
                              size = 0.5,
                              show.legend = ifelse(ncol(pt) > 1, TRUE, FALSE))
     if (requireNamespace("ggh4x", quietly = TRUE)) {
-      p <- p + ggh4x::facet_nested_wrap(~paste0("Lineage ", LINEAGE) + MODEL, )
+      p <- p + ggh4x::facet_nested_wrap(~paste0("Lineage ", LINEAGE) + MODEL,
+                                        strip = ggh4x::strip_nested(background_x = list(ggplot2::element_rect(linewidth = gg.theme$line$linewidth))))
     } else {
       p <- p + ggplot2::facet_wrap(~paste0("Lineage ", LINEAGE) + MODEL)
     }
@@ -326,7 +330,9 @@ plotModels <- function(test.dyn.res = NULL,
                        fill = "Lineage",
                        title = gene) +
          gg.theme +
-         ggplot2::theme(plot.title = ggplot2::element_text(face = "italic")) +
+         ggplot2::theme(plot.title = ggplot2::element_text(face = "italic"),
+                        strip.clip = "off",
+                        strip.background = ggplot2::element_rect(linewidth = gg.theme$line$linewidth)) +
          ggplot2::guides(color = ggplot2::guide_legend(override.aes = list(size = 4, alpha = 1)))
   }
   return(p)

diff --git a/R/sortGenesHeatmap.R b/R/sortGenesHeatmap.R
@@ -1,4 +1,4 @@
-#' Generate a table of fitted values and celltype metadata for genes of interest.
+#' Sort genes by where their peak expression occurs across pseudotime.
 #'
 #' @name sortGenesHeatmap
 #' @author Jack Leary
@@ -9,6 +9,7 @@
 #' @param heatmap.mat A matrix of raw or smoothed expression values with genes as columns and cells as rows. Defaults to NULL.
 #' @param pt.vec A numeric vector of pseudotime values for each cell i.e., for each row in the heatmap matrix. Defaults to NULL.
 #' @return A character vector of genes sorted by their peak expression values over pseudotime.
+#' @seealso \code{\link{smoothedCountsMatrix}}
 #' @export
 #' @examples
 #' \dontrun{

diff --git a/R/testDynamic.R b/R/testDynamic.R
@@ -30,8 +30,6 @@
 #' @param approx.knot (Optional) Should the knot space be reduced in order to improve computation time? Defaults to TRUE.
 #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to FALSE.
 #' @param track.time (Optional) A boolean indicating whether the amount of time the function takes to run should be tracked and printed to the console. Useful for debugging. Defaults to FALSE.
-#' @param log.file (Optional) A string indicating a \code{.txt} file to which iteration tracking should be written. Can be useful for debugging. Defaults to NULL.
-#' @param log.iter (Optional) If logging is enabled, how often should iterations be printed to the logfile. Defaults to 100.
 #' @details
 #' \itemize{
 #' \item If \code{expr.mat} is a \code{Seurat} object, counts will be extracted from the output of \code{\link[SeuratObject]{DefaultAssay}}. If using this functionality, check to ensure the specified assay is correct before running the function. If the input is a \code{SingleCellExperiment} object, the raw counts will be extracted with \code{\link[BiocGenerics]{counts}}.
@@ -72,9 +70,7 @@
 #'             parallel.exec = TRUE,
 #'             n.cores = 8,
 #'             is.glmm = TRUE,
-#'             id.vec = seu_obj$subject_id,
-#'             log.file = "scLANE_log.txt",
-#'             log.iter = 10)
+#'             id.vec = seu_obj$subject_id)
 #' }
 
 testDynamic <- function(expr.mat = NULL,
@@ -90,9 +86,7 @@ testDynamic <- function(expr.mat = NULL,
                         parallel.exec = TRUE,
                         n.cores = 2,
                         approx.knot = TRUE,
-                        track.time = FALSE,
-                        log.file = NULL,
-                        log.iter = 100) {
+                        track.time = FALSE) {
   # check inputs
   if (is.null(expr.mat) || is.null(pt)) { stop("You forgot some inputs to testDynamic().") }
   # get raw counts from SingleCellExperiment or Seurat object & transpose to cell x gene dense matrix
@@ -134,10 +128,11 @@ testDynamic <- function(expr.mat = NULL,
   if (is.null(genes)) {
     genes <- colnames(expr.mat)
   }
-  # set up parallel execution & time tracking
+  # set up time tracking
   if (track.time) {
     start_time <- Sys.time()
   }
+  # set up parallel processing
   if (parallel.exec) {
     cl <- parallel::makeCluster(n.cores)
     doParallel::registerDoParallel(cl)
@@ -150,16 +145,9 @@ testDynamic <- function(expr.mat = NULL,
   expr.mat <- bigstatsr::as_FBM(expr.mat,
                                 type = "integer",
                                 is_read_only = TRUE)
-  # set up logging to .txt file if desired
-  print_nums <- seq(0, length(genes), log.iter)[-1]
-  if (!is.null(log.file)) {
-    if (!substr(log.file, nchar(log.file) - 3, nchar(log.file)) == ".txt") {
-      log.file <- paste0(log.file, ".txt")
-    }
-  }
   # build list of objects to prevent from being sent to parallel workers
   necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "print_nums",
-                      "n_lineages", "id.vec", "cor.structure", "is.gee", "log.file", "log.iter", "glmm.adaptive", "size.factor.offset")
+                      "n_lineages", "id.vec", "cor.structure", "is.gee", "glmm.adaptive", "size.factor.offset")
   if (any(ls(envir = .GlobalEnv) %in% necessary_vars)) {
     no_export <- c(ls(envir = .GlobalEnv)[-which(ls(envir = .GlobalEnv) %in% necessary_vars)],
                    ls()[-which(ls() %in% necessary_vars)])
@@ -183,13 +171,6 @@ testDynamic <- function(expr.mat = NULL,
                                  .packages = package_list,
                                  .noexport = no_export,
                                  .verbose = FALSE) %dopar% {
-    if (!is.null(log.file) && i %in% print_nums) {
-      withr::with_output_sink(log.file,
-                              code = {
-                                cat(paste(Sys.time(), "- Starting iteration:", i, "\n"))
-                              },
-                              append = TRUE)
-    }
     lineage_list <- vector("list", n_lineages)
     for (j in seq(n_lineages)) {
       lineage_cells <- which(!is.na(pt[, j]))
@@ -565,7 +546,7 @@ testDynamic <- function(expr.mat = NULL,
                                                        is.glmm = is.glmm) %>%
                           dplyr::mutate(Gene = genes[i], Lineage = LETTERS[j]) %>%
                           dplyr::relocate(Gene, Lineage)
-        # compute LRT stat using asymptotic Chi-squared approximation
+        # compute test stat using asymptotic Chi-squared approximation
         if (is.gee) {
           test_res <- scLANE::waldTestGEE(mod.1 = marge_mod$final_mod, mod.0 = null_mod)
         } else {
@@ -602,7 +583,7 @@ testDynamic <- function(expr.mat = NULL,
                                   Null_Preds = null_pred_df,
                                   MARGE_Slope_Data = marge_slope_df)
       } else {
-        stop(paste0("Conditions for marge or null model fits not met for gene ",
+        stop(paste0("Conditions for scLANE or null model fits not met for gene ",
                     genes[i],
                     " on lineage ",
                     j))