diff --git a/NAMESPACE b/NAMESPACE index b08da02..ffb1af7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(extractBreakpoints) export(fitGLMM) export(geneProgramDrivers) export(geneProgramScoring) +export(geneProgramSignificance) export(getFittedValues) export(getKnotDist) export(getResultsDE) @@ -71,6 +72,7 @@ importFrom(furrr,future_map) importFrom(future,multisession) importFrom(future,plan) importFrom(future,sequential) +importFrom(gamlss,LR.test) importFrom(gamlss,gamlss) importFrom(gamlss,gamlss.control) importFrom(gamlss,pb) diff --git a/NEWS.md b/NEWS.md index 95d2a12..ea7a3d7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ + Added `geneProgramDrivers()` function to compute & test correlations of expression with gene module scores. + Updated documentation & unit tests. ++ Added `geneProgramSignificance` function to estimate associations between gene program module scores and pseudotime. # Changes in version 0.7.8 diff --git a/R/geneProgramDrivers.R b/R/geneProgramDrivers.R index 81d3472..f396f83 100644 --- a/R/geneProgramDrivers.R +++ b/R/geneProgramDrivers.R @@ -11,7 +11,7 @@ #' @importFrom doSNOW registerDoSNOW #' @importFrom stats cor.test p.adjust #' @importFrom dplyr arrange desc mutate filter -#' @description This function computes the correlation +#' @description This function computes the correlation between smoothed gene expression and gene program scores in order to identify genes are significantly associated with program scores i.e., the "drivers" of the gene program. #' @param expr.mat Either a \code{SingleCellExperiment} or \code{Seurat} object from which counts can be extracted, or a matrix of normalized counts with genes as rows & cells as columns. Defaults to NULL. #' @param genes A character vector of genes to test. Defaults to NULL. #' @param gene.program A vector of program scores as returned by \code{\link{geneProgramScoring}}. Defaults to NULL. @@ -39,7 +39,8 @@ #' program_drivers <- geneProgramDrivers(sim_counts, #' genes = gene_embed$gene, #' gene.program = sim_counts$cluster_0, -#' fdr.cutoff = 0.05) +#' fdr.cutoff = 0.05, +#' n.cores = 1L) geneProgramDrivers <- function(expr.mat = NULL, genes = NULL, diff --git a/R/geneProgramScoring.R b/R/geneProgramScoring.R index 631d2fc..fc8e2cc 100644 --- a/R/geneProgramScoring.R +++ b/R/geneProgramScoring.R @@ -8,7 +8,8 @@ #' @param genes A character vector of gene IDs. Defaults to NULL. #' @param gene.clusters A factor containing the cluster assignment of each gene in \code{genes}. Defaults to NULL. #' @param program.labels (Optional) A character vector specifying a label for each gene cluster. Defaults to NULL. -#' @param minmax.norm (Optional) Should each program's score be min-max normalized to be on [0, 1]? Defaults to FALSE. +#' @param minmax.norm (Optional) Should each program's score be min-max normalized to be on (0, 1)? Defaults to TRUE. +#' @param minmax.epsilon (Optional) The tolerance used to ensure that program scores equal to 0 or 1 do not occur. Defaults to 0.01. #' @param n.cores (Optional) The number of cores used under the hood in \code{\link[UCell]{ScoreSignatures_UCell}}. Defaults to 2. #' @return Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix. #' @seealso \code{\link[UCell]{ScoreSignatures_UCell}} @@ -31,7 +32,8 @@ geneProgramScoring <- function(expr.mat = NULL, genes = NULL, gene.clusters = NULL, program.labels = NULL, - minmax.norm = FALSE, + minmax.norm = TRUE, + minmax.epsilon = 1e-2, n.cores = 2L) { # check inputs if (is.null(expr.mat) || is.null(genes) || is.null(gene.clusters)) { stop("Arguments to geneProgramScoring() are missing.") } @@ -65,11 +67,11 @@ geneProgramScoring <- function(expr.mat = NULL, program_scores <- UCell::ScoreSignatures_UCell(counts_matrix, features = program_list, ncores = n.cores) - # min-max normalize if desired + # min-max normalize with tol to (0, 1) if desired if (minmax.norm) { program_scores <- purrr::map(seq(ncol(program_scores)), \(i) { scores <- program_scores[, i] - normed_scores <- (scores - min(scores)) / (max(scores) - min(scores)) + normed_scores <- minmax.epsilon + (((scores - min(scores)) * (1 - 2 * minmax.epsilon)) / (max(scores) - min(scores))) return(normed_scores) }) program_scores <- purrr::reduce(program_scores, cbind) diff --git a/R/geneProgramSignificance.R b/R/geneProgramSignificance.R new file mode 100644 index 0000000..65604c4 --- /dev/null +++ b/R/geneProgramSignificance.R @@ -0,0 +1,71 @@ +#' Test significance of gene program enrichment across a trajectory. +#' +#' @name geneProgramSignificance +#' @author Jack Leary +#' @import magrittr +#' @importFrom purrr map reduce +#' @importFrom gamlss gamlss LR.test +#' @importFrom dplyr arrange desc mutate +#' @importFrom stats p.adjust +#' @description This function fits a Beta GAM with pseudotime as a covariate and gene program score as the response. The fitted model is then compared to a null, intercept-only model in order to determine the significance of the relationship between gene program and pseudotime. +#' @param gene.programs A list of vectors of program scores as returned by \code{\link{geneProgramScoring}}. Defaults to NULL. +#' @param pt A vector of pseudotime values for each cell. May contain NAs, which are handled internally. Defaults to NULL. +#' @param program.labels (Optional) A character vector specifying a label for each gene cluster. Defaults to NULL. +#' @param p.adj.method (Optional) The method used to adjust \emph{p}-values for multiple hypothesis testing. Defaults to "holm". +#' @return A table of statistical output showing the significance of the association between pseudotime and program scores. +#' @seealso \code{\link{geneProgramScoring}} +#' @export +#' @examples +#' data(sim_counts) +#' data(scLANE_models) +#' data(sim_pseudotime) +#' smoothed_dynamics <- smoothedCountsMatrix(scLANE_models, +#' pt = sim_pseudotime, +#' n.cores = 1L) +#' gene_embed <- embedGenes(smoothed_dynamics$Lineage_A, n.cores = 1L) +#' sim_counts <- geneProgramScoring(sim_counts, +#' genes = gene_embed$gene, +#' gene.clusters = gene_embed$leiden, +#' n.cores = 1L) +#' program_enrichment_stats <- geneProgramSignificance(list(sim_counts$cluster_0), +#' pt = sim_pseudotime$PT, +#' program.labels = c("Program Name")) + +geneProgramSignificance <- function(gene.programs = NULL, + pt = NULL, + program.labels = NULL, + p.adj.method = "holm") { + # check inputs + if (is.null(gene.programs) || is.null(pt)) { stop("Inputs to geneProgramSignificance() are missing.") } + if (is.null(program.labels)) { + program.labels <- paste0("cluster_", seq(length(gene.programs)) - 1) + } + if (length(gene.programs) != length(program.labels)) { stop("program.labels must have the same length as gene.programs.") } + # iterate over gene programs and fit Beta GAMs + program_models <- purrr::map(seq(gene.programs), \(g) { + pseudotime_vec <- pt[!is.na(pt)] + program_score_vec <- gene.programs[[g]][!is.na(pt)] + alt_model <- gamlss::gamlss(program_score_vec ~ pseudotime_vec, + family = "BE", + data = NULL, + control = gamlss::gamlss.control(trace = FALSE)) + null_model <- gamlss::gamlss(program_score_vec ~ 1, + family = "BE", + data = NULL, + control = gamlss::gamlss.control(trace = FALSE)) + lr_test_res <- gamlss::LR.test(null = null_model, + alternative = alt_model, + print = FALSE) + res <- data.frame(Program = program.labels[g], + LRT_Stat = lr_test_res$chi, + DF = lr_test_res$df, + P_Value = lr_test_res$p.val) + return(res) + }) + # prepare test results + program_sig_df <- purrr::reduce(program_models, rbind) %>% + dplyr::arrange(P_Value, dplyr::desc(LRT_Stat)) %>% + dplyr::mutate(P_Value = dplyr::if_else(P_Value == 0, 2e-16, P_Value), + P_Value_Adj = stats::p.adjust(P_Value, method = p.adj.method)) + return(program_sig_df) +} diff --git a/man/geneProgramDrivers.Rd b/man/geneProgramDrivers.Rd index 8494d06..0806e5a 100644 --- a/man/geneProgramDrivers.Rd +++ b/man/geneProgramDrivers.Rd @@ -36,7 +36,7 @@ geneProgramDrivers( Either a \code{Seurat} or \code{SingleCellExperiment} object if \code{expr.mat} is in either form, or a data.frame containing per-cell program scores if \code{expr.mat} is a matrix. } \description{ -This function computes the correlation +This function computes the correlation between smoothed gene expression and gene program scores in order to identify genes are significantly associated with program scores i.e., the "drivers" of the gene program. } \examples{ data(sim_counts) @@ -53,7 +53,8 @@ sim_counts <- geneProgramScoring(sim_counts, program_drivers <- geneProgramDrivers(sim_counts, genes = gene_embed$gene, gene.program = sim_counts$cluster_0, - fdr.cutoff = 0.05) + fdr.cutoff = 0.05, + n.cores = 1L) } \seealso{ \code{\link{geneProgramScoring}} diff --git a/man/geneProgramScoring.Rd b/man/geneProgramScoring.Rd index adc7f66..9ffbbee 100644 --- a/man/geneProgramScoring.Rd +++ b/man/geneProgramScoring.Rd @@ -9,7 +9,8 @@ geneProgramScoring( genes = NULL, gene.clusters = NULL, program.labels = NULL, - minmax.norm = FALSE, + minmax.norm = TRUE, + minmax.epsilon = 0.01, n.cores = 2L ) } @@ -22,7 +23,9 @@ geneProgramScoring( \item{program.labels}{(Optional) A character vector specifying a label for each gene cluster. Defaults to NULL.} -\item{minmax.norm}{(Optional) Should each program's score be min-max normalized to be on [0, 1]? Defaults to FALSE.} +\item{minmax.norm}{(Optional) Should each program's score be min-max normalized to be on (0, 1)? Defaults to TRUE.} + +\item{minmax.epsilon}{(Optional) The tolerance used to ensure that program scores equal to 0 or 1 do not occur. Defaults to 0.01.} \item{n.cores}{(Optional) The number of cores used under the hood in \code{\link[UCell]{ScoreSignatures_UCell}}. Defaults to 2.} } diff --git a/man/geneProgramSignificance.Rd b/man/geneProgramSignificance.Rd new file mode 100644 index 0000000..bb9cb44 --- /dev/null +++ b/man/geneProgramSignificance.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/geneProgramSignificance.R +\name{geneProgramSignificance} +\alias{geneProgramSignificance} +\title{Test significance of gene program enrichment across a trajectory.} +\usage{ +geneProgramSignificance( + gene.programs = NULL, + pt = NULL, + program.labels = NULL, + p.adj.method = "holm" +) +} +\arguments{ +\item{gene.programs}{A list of vectors of program scores as returned by \code{\link{geneProgramScoring}}. Defaults to NULL.} + +\item{pt}{A vector of pseudotime values for each cell. May contain NAs, which are handled internally. Defaults to NULL.} + +\item{program.labels}{(Optional) A character vector specifying a label for each gene cluster. Defaults to NULL.} + +\item{p.adj.method}{(Optional) The method used to adjust \emph{p}-values for multiple hypothesis testing. Defaults to "holm".} +} +\value{ +A table of statistical output showing the significance of the association between pseudotime and program scores. +} +\description{ +This function fits a Beta GAM with pseudotime as a covariate and gene program score as the response. The fitted model is then compared to a null, intercept-only model in order to determine the significance of the relationship between gene program and pseudotime. +} +\examples{ +data(sim_counts) +data(scLANE_models) +data(sim_pseudotime) +smoothed_dynamics <- smoothedCountsMatrix(scLANE_models, + pt = sim_pseudotime, + n.cores = 1L) +gene_embed <- embedGenes(smoothed_dynamics$Lineage_A, n.cores = 1L) +sim_counts <- geneProgramScoring(sim_counts, + genes = gene_embed$gene, + gene.clusters = gene_embed$leiden, + n.cores = 1L) +program_enrichment_stats <- geneProgramSignificance(list(sim_counts$cluster_0), + pt = sim_pseudotime$PT, + program.labels = c("Program Name")) +} +\seealso{ +\code{\link{geneProgramScoring}} +} +\author{ +Jack Leary +} diff --git a/tests/testthat/test_scLANE.R b/tests/testthat/test_scLANE.R index 44f8aaf..20968f0 100644 --- a/tests/testthat/test_scLANE.R +++ b/tests/testthat/test_scLANE.R @@ -227,6 +227,10 @@ withr::with_output_sink(tempfile(), { sim_data_seu <- geneProgramScoring(sim_data_seu, genes = gene_embedding$gene, gene.clusters = gene_embedding$leiden) + # gene program significance + program_significance <- geneProgramSignificance(list(sim_data$cluster_0), + pt = pt_test$PT, + program.labels = c("Cluster0")) # gene program drivers program_drivers <- geneProgramDrivers(sim_data, genes = gene_embedding$gene, @@ -407,6 +411,11 @@ test_that("geneProgramScoring() output", { expect_equal(colnames(sim_data_seu@meta.data)[11], "cluster_1") }) +test_that("geneProgramSignificance() output", { + expect_s3_class(program_significance, "data.frame") + expect_equal(ncol(program_significance), 5) +}) + test_that("geneProgramDrivers() output", { expect_s3_class(program_drivers, "data.frame") expect_s3_class(program_drivers_seu, "data.frame")