Merge pull request #156 from jr-leary7/dev

Dev
jr-leary7 · Nov 10, 2023 · 9967dbb · 9967dbb
2 parents 131bc05 + 321b5a3
commit 9967dbb
Show file tree

Hide file tree

Showing 15 changed files with 399 additions and 30 deletions.
diff --git a/.github/workflows/render-README.yaml b/.github/workflows/render-README.yaml
@@ -22,9 +22,9 @@ jobs:
       - uses: r-lib/actions/setup-pandoc@v2
 
       - name: install CRAN packages
-        run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "dplyr", "purrr", "remotes", "devtools", "BiocManager"))'
+        run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "dplyr", "purrr", "remotes", "devtools", "BiocManager", "Seurat"), dependencies = TRUE)'
       - name: install BioConductor packages
-        run: Rscript -e 'BiocManager::install(c("SingleCellExperiment", "scater", "scran"))'
+        run: Rscript -e 'BiocManager::install(c("SingleCellExperiment", "scater", "scran", "scuttle", "bluster"))'
       - name: install GitHub packages
         run: Rscript -e 'remotes::install_github("jr-leary7/scLANE")'
       - name: render README

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -44,17 +44,22 @@ URL: https://github.com/jr-leary7/scLANE
 BugReports: https://github.com/jr-leary7/scLANE/issues
 Suggests: 
     covr, 
+    grid, 
     coop,
     uwot, 
     ggh4x, 
     knitr,
+    UCell, 
     irlba, 
     rlang, 
     igraph, 
+    gtable, 
+    ggpubr, 
     Seurat, 
     bluster, 
     cluster, 
     rmarkdown,
+    gridExtra, 
     BiocStyle, 
     slingshot, 
     gprofiler2,

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ export(embedGenes)
 export(enrichDynamicGenes)
 export(extractBreakpoints)
 export(fitGLMM)
+export(geneProgramScoring)
 export(getFittedValues)
 export(getKnotDist)
 export(getResultsDE)
@@ -14,10 +15,12 @@ export(modelLRT)
 export(nbGAM)
 export(npConvolve)
 export(plotClusteredGenes)
+export(plotModelCoefs)
 export(plotModels)
 export(smoothedCountsMatrix)
 export(sortGenesHeatmap)
 export(stripGLM)
+export(summarizeModel)
 export(testDynamic)
 export(testSlope)
 export(theme_scLANE)
@@ -29,6 +32,7 @@ importFrom(MASS,ginv)
 importFrom(MASS,glm.nb)
 importFrom(MASS,negative.binomial)
 importFrom(MASS,theta.mm)
+importFrom(Matrix,Matrix)
 importFrom(Matrix,colSums)
 importFrom(Matrix,t)
 importFrom(Rcpp,sourceCpp)
@@ -46,6 +50,8 @@ importFrom(dplyr,filter)
 importFrom(dplyr,group_by)
 importFrom(dplyr,if_else)
 importFrom(dplyr,inner_join)
+importFrom(dplyr,lag)
+importFrom(dplyr,lead)
 importFrom(dplyr,mutate)
 importFrom(dplyr,ntile)
 importFrom(dplyr,pull)
@@ -77,10 +83,12 @@ importFrom(ggplot2,facet_wrap)
 importFrom(ggplot2,geom_line)
 importFrom(ggplot2,geom_point)
 importFrom(ggplot2,geom_ribbon)
+importFrom(ggplot2,geom_vline)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,guide_legend)
 importFrom(ggplot2,guides)
 importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_x_continuous)
 importFrom(ggplot2,scale_y_continuous)
 importFrom(ggplot2,theme)
 importFrom(ggplot2,theme_classic)

diff --git a/NEWS.md b/NEWS.md
@@ -1,11 +1,13 @@
 # `scLANE` v0.7.7
 
-* Added DOI badge to README
-* Further compression of included datasets.
+* Added DOI badge to README. 
+* Better compression of included datasets.
+* Added `geneProgramScoring()` for module scoring of dynamic gene clusters.
+* Added `plotModelCoefs()` to annotate gene dynamics plots with a table of model coefficients. 
 
 # `scLANE` v0.7.6
 
-* Added Zenodo tracking.
+* Added [Zenodo tracking](https://doi.org/10.5281/zenodo.10030621).
 * Added simulated dataset to `data/`.
 
 # `scLANE` v0.7.5
@@ -21,7 +23,7 @@
 # `scLANE` v0.7.3
 
 * Added a function named `embedGenes()` that takes a smoothed counts matrix as input & returns PCA & UMAP embeddings along with a graph-based clustering. 
-* Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine dissimilarity. 
+* Updated the `clusterGenes()` function to be much more efficient as well as changing the distance metric used to be cosine distance. 
 * Added `theme_scLANE()` for output plots. 
 * Enhanced documentation. 
 * Increased test coverage.

diff --git a/R/geneProgramScoring.R b/R/geneProgramScoring.R
@@ -0,0 +1,79 @@
+#' Add per-cell module scores for gene programs.
+#'
+#' @name geneProgramScoring
+#' @author Jack Leary
+#' @import magrittr
+#' @importFrom Matrix Matrix
+#' @importFrom ggplot2 ggplot aes geom_point geom_vline geom_ribbon geom_line scale_x_continuous labs
+#' @importFrom scales label_number
+#' @description This function uses \code{\link[UCell]{ScoreSignatures_UCell}} to create a per-cell module score for each of the provided gene clusters. If the
+#' @param expr.mat Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.
+#' @param genes A character vector of gene IDs. Defaults to NULL.
+#' @param gene.clusters A factor containing the cluster assignment of each gene in \code{genes}. Defaults to NULL.
+#' @param program.labels (Optional) A character vector specifying a label for each gene cluster. Defaults to NULL.
+#' @param n.cores (Optional) The number of cores used under the hood in \code{\link[UCell]{ScoreSignatures_UCell}}. Defaults to 2.
+#' @return Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix.
+#' @export
+#' @examples
+#' \dontrun{
+#' geneProgramScoring(seu_obj,
+#'                    genes = gene_embed$gene,
+#'                    gene.clusters = gene_embed$leiden,
+#'                    program.labels = c("cell cycle", "organogenesis"))
+#' }
+
+geneProgramScoring <- function(expr.mat = NULL,
+                               genes = NULL,
+                               gene.clusters = NULL,
+                               program.labels = NULL,
+                               n.cores = 2) {
+  # check inputs
+  if (is.null(expr.mat) || is.null(genes) || is.null(gene.clusters)) { stop("Arguments to geneProgramScoring() are missing.") }
+  if (!is.factor(gene.clusters)) {
+    gene.clusters <- as.factor(gene.clusters)
+  }
+  # set program labels
+  cluster.labels <- unique(gene.clusters)
+  if (is.null(program.labels)) {
+    program.labels <- paste0("cluster_", cluster.labels)
+  } else {
+    program.labels <- gsub(" ", "_", program.labels)
+  }
+  # set up query matrix
+  if (inherits(expr.mat, "SingleCellExperiment")) {
+    counts_matrix <- BiocGenerics::counts(expr.mat)
+  } else if (inherits(expr.mat, "Seurat")) {
+    counts_matrix <- Seurat::GetAssayData(expr.mat,
+                                          slot = "counts",
+                                          assay = Seurat::DefaultAssay(expr.mat))
+  } else if (inherits(expr.mat, "matrix") || inherits(expr.mat, "array")) {
+    counts_matrix <- Matrix::Matrix(expr.mat, sparse = TRUE)
+  }
+  # set up feature list
+  program_list <- split(genes, gene.clusters)
+  names(program_list) <- program.labels
+  # run UCell
+  program_scores <- UCell::ScoreSignatures_UCell(counts_matrix,
+                                                 features = program_list,
+                                                 ncores = n.cores)
+  # reformat program scores depending on input format
+  if (inherits(expr.mat, "matrix") || inherits(expr.mat, "array") || inherits(expr.mat, "dgCMatrix")) {
+    colnames(program_scores) <- program.labels
+  } else {
+    for (g in seq(ncol(program_scores))) {
+      if (inherits(expr.mat, "SingleCellExperiment")) {
+        SummarizedExperiment::colData(expr.mat)[, program.labels[g]] <- program_scores[, g]
+      } else if (inherits(expr.mat, "Seurat")) {
+        expr.mat <- Seurat::AddMetaData(expr.mat,
+                                        metadata = program_scores[, g],
+                                        program.labels[g])
+      }
+    }
+  }
+  # return results
+  if (inherits(expr.mat, "matrix") || inherits(expr.mat, "array") || inherits(expr.mat, "dgCMatrix")) {
+    return(program_scores)
+  } else {
+    return(expr.mat)
+  }
+}
diff --git a/R/plotModelCoefs.R b/R/plotModelCoefs.R
@@ -0,0 +1,137 @@
+#' Plot gene dynamics with estimated coefficients.
+#'
+#' @name plotModelCoefs
+#' @author Jack Leary
+#' @import magrittr
+#' @importFrom dplyr select mutate lag lead
+#' @importFrom tidyr pivot_longer
+#' @importFrom ggplot2 ggplot aes geom_point geom_vline geom_ribbon geom_line scale_x_continuous labs
+#' @importFrom scales label_number
+#' @description Generate a plot of gene dynamics over a single pseudotime lineage, along with a table of coefficients across pseudotime intervals.
+#' @param test.dyn.res The output from \code{\link{testDynamic}}. Defaults to NULL.
+#' @param gene A character specifying which gene's dynamics should be plotted. Defaults to NULL.
+#' @param pt A data.frame of pseudotime values for each cell. Defaults to NULL.
+#' @param expr.mat Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of integer-valued counts with genes as rows & cells as columns. Defaults to NULL.
+#' @param size.factor.offset (Optional) An offset to be used to rescale the fitted values. Can be generated easily with \code{\link{createCellOffset}}. No need to provide if the GEE backend was used. Defaults to NULL.
+#' @param log1p.norm (Optional) Should log1p-normalized versions of expression & model predictions be returned as well? Defaults to TRUE.
+#' @param lineage A character vector specifying which lineage should be plotted. Should be letters, i.e. lineage "A" or "B". Defaults to "A".
+#' @return A \code{ggplot2} object displaying a gene dynamics plot & a table of coefficients across pseudotime intervals.
+#' @export
+#' @examples
+#' \dontrun{
+#' plotModelCoefs(gene_stats,
+#'                gene = "BRCA2",
+#'                pt = pt_df,
+#'                expr.mat = seu_obj,
+#'                size.factor.offset = cell_offset)
+#' }
+
+plotModelCoefs <- function(test.dyn.res = NULL,
+                           gene = NULL,
+                           pt = NULL,
+                           expr.mat = NULL,
+                           size.factor.offset = NULL,
+                           lineage = "A",
+                           log1p.norm = TRUE) {
+  # check inputs
+  if (is.null(test.dyn.res) || is.null(gene) || is.null(pt) || is.null(expr.mat)) { stop("Arguments to plotModelCoefs() are missing.") }
+  # pull fitted values
+  all_lineages <- gsub("Lineage_", "", names(test.dyn.res[[1]]))
+  if (length(all_lineages) == 1) {
+    gfv_filter <- NULL
+  } else {
+    gfv_filter <- all_lineages[all_lineages != lineage]
+  }
+  fitted_vals <- getFittedValues(test.dyn.res,
+                                 genes = gene,
+                                 pt = pt,
+                                 expr.mat = expr.mat,
+                                 size.factor.offset = size.factor.offset,
+                                 log1p.norm = log1p.norm,
+                                 filter.lineage = gfv_filter)
+  if (log1p.norm) {
+    fitted_vals <- dplyr::select(fitted_vals,
+                                 cell,
+                                 lineage,
+                                 pt,
+                                 gene,
+                                 rna = rna_log1p,
+                                 scLANE_pred = scLANE_pred_log1p,
+                                 scLANE_ci_ll = scLANE_ci_ll_log1p,
+                                 scLANE_ci_ul = scLANE_ci_ul_log1p)
+  } else {
+    fitted_vals <- dplyr::select(fitted_vals,
+                                 cell,
+                                 lineage,
+                                 pt,
+                                 gene,
+                                 rna,
+                                 scLANE_pred,
+                                 scLANE_ci_ll,
+                                 scLANE_ci_ul)
+
+  }
+  # generate dynamics plot
+  dyn_plot <- ggplot2::ggplot(fitted_vals, ggplot2::aes(x = pt, y = rna)) +
+              ggplot2::geom_point(size = 1.5,
+                                  alpha = 0.6,
+                                  stroke = 0,
+                                  color = "grey30") +
+              ggplot2::geom_vline(data = data.frame(gene = gene, knot = unique(test.dyn.res[[gene]][[paste0("Lineage_", lineage)]]$MARGE_Slope_Data$Breakpoint)),
+                                  mapping = ggplot2::aes(xintercept = knot),
+                                  linetype = "dashed",
+                                  color = "black",
+                                  linewidth = 0.75) +
+              ggplot2::geom_ribbon(ggplot2::aes(ymin = scLANE_ci_ll, ymax = scLANE_ci_ul),
+                                   linewidth = 0,
+                                   fill = "darkgreen",
+                                   alpha = 0.35) +
+              ggplot2::geom_line(ggplot2::aes(y = scLANE_pred),
+                                 color = "darkgreen",
+                                 linewidth = 0.75) +
+              ggplot2::scale_x_continuous(labels = scales::label_number(accuracy = 0.01)) +
+              ggplot2::labs(x = "Pseudotime",
+                            y = ifelse(log1p.norm, "Normalized Expression", "Expression")) +
+              theme_scLANE()
+  # generate coefficient summary
+  min_pt <- min(pt[, which(LETTERS == lineage)], na.rm = TRUE)
+  max_pt <- max(pt[, which(LETTERS == lineage)], na.rm = TRUE)
+  coef_sumy <- dplyr::select(test.dyn.res[[gene]][[paste0("Lineage_", lineage)]]$Gene_Dynamics,
+                             -dplyr::starts_with("Trend")) %>%
+               tidyr::pivot_longer(dplyr::starts_with("Slope"),
+                                   names_to = "Segment",
+                                   values_to = "Coef") %>%
+               dplyr::mutate(Breakpoint_Lag = dplyr::lag(Breakpoint),
+                             Breakpoint_Lead = dplyr::lead(Breakpoint),
+                             Interval = NA_character_,
+                             .before = 4) %>%
+               dplyr::mutate(Breakpoint_Lag = dplyr::if_else(is.na(Breakpoint_Lag), min_pt, Breakpoint_Lag),
+                             Breakpoint_Lead = dplyr::if_else(is.na(Breakpoint_Lead), max_pt, Breakpoint_Lead),
+                             Interval = paste0("(", round(Breakpoint_Lag, 3), ", ", round(Breakpoint_Lead, 3), ")")) %>%
+               dplyr::select(Interval, Coef) %>%
+               dplyr::mutate(Coef = round(Coef, 3))
+  # convert coefficient summary to grob
+  coef_sumy_grob <- gridExtra::tableGrob(coef_sumy,
+                                         cols = c("Interval", "Coefficient"),
+                                         theme = gridExtra::ttheme_minimal(base_size = 11,
+                                                                           core = list(fg_params = list(hjust = 0, x = 0.05)),
+                                                                           colhead = list(fg_params = list(hjust = 0, x = 0.05))),
+                                         rows = NULL) %>%
+                    gtable::gtable_add_grob(grobs = grid::rectGrob(gp = grid::gpar(fill = NA, lwd = 3)),
+                                            t = 1,
+                                            b = nrow(.),
+                                            l = 1,
+                                            r = ncol(.)) %>%
+                    gtable::gtable_add_grob(grobs = grid::rectGrob(gp = grid::gpar(fill = NA, lwd = 3)),
+                                            t = 1,
+                                            l = 1,
+                                            r = ncol(.))
+
+  # combine objects
+  dyn_plot_anno <- ggpubr::ggarrange(dyn_plot,
+                                     coef_sumy_grob,
+                                     ncol = 2,
+                                     nrow = 1,
+                                     widths = c(2, 1))
+  return(dyn_plot_anno)
+}
diff --git a/R/summarizeModel.R b/R/summarizeModel.R
@@ -10,6 +10,7 @@
 #' @param pt The predictor matrix of pseudotime. Defaults to NULL.
 #' @return A data.frame of the model coefficients, cutpoint intervals, and formatted equations.
 #' @seealso \code{\link{marge2}}
+#' @export
 #' @examples
 #' \dontrun{
 #' summarizeModel(marge.model = marge_mod, pt = pt_df)

diff --git a/README.Rmd b/README.Rmd
@@ -89,8 +89,9 @@ plotPCA(sim_data, colour_by = "subject") +
 
 Since we have multi-subject data, we can use any of the three model modes to run our DE testing. We'll start with the simplest model, the GLM, then work our way through the other options in order of increasing complexity. We first prepare our inputs - a dataframe containing our cell ordering, a set of genes to build models for, and a vector of per-cell size factors to be used as offsets during estimation. In reality, it's usually unnecessary to fit a model for every single gene in a dataset, as trajectories are usually estimated using a subset of the entire set of genes (usually a few thousand most highly variable genes). For the purpose of demonstration, we'll select 50 genes each from the dynamic and non-dynamic populations. 
 
-> [!NOTE]
-> In this case we're working with a single pseudotime lineage, though in real datasets several lineages often exist; in order to fit models for a subset of lineages simply remove the corresponding columns from the cell ordering dataframe passed as input to `testDynamic()`. 
+{% note %}
+**Note** In this case we're working with a single pseudotime lineage, though in real datasets several lineages often exist; in order to fit models for a subset of lineages simply remove the corresponding columns from the cell ordering dataframe passed as input to `testDynamic()`. 
+{% endnote %}
 
 ```{r prep-data}
 set.seed(312)
@@ -165,8 +166,9 @@ scLANE_models_glmm <- testDynamic(sim_data,
                                   n.cores = 4)
 ```
 
-> [!NOTE]
-> The GLMM mode is still under development, as we are working on further reducing runtime and increasing the odds of the underlying optimization process converging successfully. As such, updates will be frequent and functionality / results may shift slightly. 
+{% note %}
+**Note** The GLMM mode is still under development, as we are working on further reducing runtime and increasing the odds of the underlying optimization process converging successfully. As such, updates will be frequent and functionality / results may shift slightly. 
+{% endnote %}
 
 Like the GLM mode, the GLMM mode uses a likelihood ratio test to compare the null & alternate models. We fit the two nested models using maximum likelihood estimation instead of [REML](https://en.wikipedia.org/wiki/Restricted_maximum_likelihood) in order to perform this test; the null model in this case is a negative binomial GLMM with a random intercept for each subject.