From fe965d4a3fa26cc9c4b042455a2dc9686ccc2545 Mon Sep 17 00:00:00 2001 From: Jack Leary Date: Thu, 30 Nov 2023 13:03:19 -0500 Subject: [PATCH] updated README & minor docs changes --- .DS_Store | Bin 10244 -> 16388 bytes R/plotModelCoefs.R | 2 +- R/testDynamic.R | 4 ++-- README.Rmd | 38 ++++++++++---------------------------- inst/.DS_Store | Bin 6148 -> 6148 bytes inst/rmarkdown/.DS_Store | Bin 6148 -> 6148 bytes man/testDynamic.Rd | 2 +- 7 files changed, 14 insertions(+), 32 deletions(-) diff --git a/.DS_Store b/.DS_Store index 412ed02b166f258bab4560771829b84b56019b65..64e4768f65221a3de2f1ecd35892ca1107c9644b 100644 GIT binary patch literal 16388 zcmeHOYitzP6~1S8fmx>DF~tzCgLkP#3S=Q(8)Bn8SU*THnA-8$*g%54kHJitomqCq zHk6V&Bx$PV@tt$#+&goJ5CYv(xdb7dgb)D_noA$SV!VlA-YQ~&ohY3i zRVj!ifX4(LA(SU1OLS5o3XwB2Sa!3g0Ctcb*mtsw43QY$f1{95^$b#8&JGG zz(?T2hJYP71TNNp*dXx7>m%SJPy+$_e2C#e^Q4{6vfO%f;E7J3DVXB?VWA^!TQ%@XzFAlmDTeF z28QsaHvoCPx7U6FAbj(kqV)O4-{`8vnY%&G!co5ELD~V2{_*Q{m~+6q64qVYaj8lEAyon=pVuqkdviEJpFnY5-UYT;0edleLyh5iD4_-? zuaHOnQ1*(N+!f`_&F!J68uhxA7ADj9%5jot=qDyq;9~IdeXu z&%Xbd{rIYaiua)Fb2=4dKfbE!b1A2;S~-i+v*sA4htlSd;1zgjl~Ia6b4pb_?#87& z6gx*dE+x~4s-JuWd<1+1d<1+1d<6b)5ZH!24Uw%leO8FcPgCGkwVS8$6Ot3Nlb?Yv zSToA9EI*I*qVtms`16-2Kp~PNX`Ha9`F2Uu`PX?G-|qcKp4xrrvv#|-EZ?pdoo|;k z(W=SvQoH<2)T0lce0+c2`^Be96SEq;mFd`AydM1wVn20~<$DGVm(Sn`Yzp6(&}8xH zv4Vw7nObSRn##8a4Ob~HA+!OFzG_(L<^xTSDq9IWV7x-+;`LCDXZtQi z1xX&COxi-LX1?Bn+QV@uWA)_;G*z3z!@u*D7tYxa*)p~AAgQK0AYSt=YX7#J3)CvMW2#G^i!azbRE_4>>iiwU`k>MYav-Y!o+ zT8cLuz0yZ7x}#^M?}S4OZ3OcbZGa3plpgHcg5c3oW4ecZO9*Z(Z6Is^9>_GNW#uqaRCp2E?ru1@7Oq||5AYIX{l zP}u$Iot^?;RFQnxflB4bk-<@l6ho|IYy!75$X91$PL;XwB3Ia*szdP!jX zA8U?84pB-^QO0u)1ys zg4;clYhy zv$x}>^_mTZx?a?7k8>bA_<@+Fr|UPO;-x>anBj-cJk zCn^re9J1tw(lTn{g2laFY>7%s&@oDIq7*`>vuQiUE^I{$ON7!&wGe~bpcU?fdoAlG$b1T>G8o3cq3I%E!n4 z_kZ`^YY$mV2{dGW&g!H%Vq)kiS~2pwqO-HP`Mkru{_ef+9zo2tJKhgyen9gB+P@vp zs=de$Zq{ti4{jI8%^loarppJqjw_R;*ELmKmE~Yc-!?f;n;df7OCFK4XZ zvm8-(!#TRI!yYIP_SfD$|ADyPe#>jg@iU8Rssr2&f8_qq!}^!9i9{#e|G%sD|9_0j zKl=#y2rNnjAk>@cr62oBt8w=z>0bL79us)bMEPdv1tz?vWSU!B@iBM&ER6R_n?B2O zEWz|Y{{g^X-2NAfTUSKpLsIR?&WEPjT#gXM7Qn97{PJd628A4sadu|6gTb B{yzW! delta 219 zcmZo^U~CDHU|?WibSh0TWMEJLGC6=4L<{gtEEJolCpp=`hD8*}XAlBnMxZ!DPP$=m za(-^X=0J(%?3>v+SU4DkCi5uNvS%{nF%&bDOnxZRFnNJe;N(nY{>h4}*CwZ{@iQ|r z1VP0;KEyYBZzppJ^G`k}@@{gB82{z~30W3FW{|Bw$8ZA) tSCE4?3o?CYp3HA!IoX~^LVyY4Rt=CEpt}qvr`Sx~EN2wS2sCyTBLFn_Gfw~j diff --git a/R/plotModelCoefs.R b/R/plotModelCoefs.R index aeb8e36..57a311e 100644 --- a/R/plotModelCoefs.R +++ b/R/plotModelCoefs.R @@ -120,7 +120,7 @@ plotModelCoefs <- function(test.dyn.res = NULL, } # convert coefficient summary to grob coef_sumy_grob <- gridExtra::tableGrob(coef_sumy, - cols = c("Interval", "Coefficient"), + cols = c("Interval", "Slope"), theme = gridExtra::ttheme_minimal(base_size = 11, core = list(fg_params = list(hjust = 0, x = 0.05)), colhead = list(fg_params = list(hjust = 0, x = 0.05))), diff --git a/R/testDynamic.R b/R/testDynamic.R index 67c1d3d..3792612 100644 --- a/R/testDynamic.R +++ b/R/testDynamic.R @@ -2,7 +2,7 @@ #' #' @name testDynamic #' @author Jack Leary -#' @description This function tests whether a NB \code{marge} model is better than a null (intercept-only) NB GLM using the Likelihood Ratio Test. In effect, the test tells us whether a gene's expression changes (in any way) over pseudotime. +#' @description This function tests whether a NB \code{marge} model is better than a null (intercept-only) model using the Likelihood Ratio Test. In effect, the test tells us whether a gene's expression changes (in any way) over pseudotime. #' @import glm2 #' @import magrittr #' @importFrom Matrix t @@ -381,7 +381,7 @@ testDynamic <- function(expr.mat = NULL, total_time <- end_time - start_time total_time_units <- attributes(total_time)$units total_time_numeric <- as.numeric(total_time) - time_message <- paste0("\nscLANE testing completed for ", + time_message <- paste0("scLANE testing completed for ", length(genes), " genes across ", n_lineages, diff --git a/README.Rmd b/README.Rmd index b5d2ce7..7ffe868 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,7 +17,9 @@ knitr::opts_chunk$set(warning = FALSE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%", - fig.retina = TRUE) + fig.retina = TRUE, + fig.width = 5, + fig.height = 3) ``` # `scLANE` @@ -43,18 +45,16 @@ remotes::install_github("jr-leary7/scLANE") ## Model structure -The `scLANE` package enables users to accurately determine differential expression of genes over pseudotime or latent time, and to characterize gene's dynamics using interpretable model coefficients. `scLANE` builds upon the [`marge` modeling framework](https://github.com/JakubStats/marge), allowing users to characterize their trajectory's effects on mRNA expression using negative binomial [GLMs](https://en.wikipedia.org/wiki/Generalized_linear_model), [GEEs](https://en.wikipedia.org/wiki/Generalized_estimating_equation), or [GLMMs](https://en.wikipedia.org/wiki/Generalized_linear_mixed_model), depending on the experimental design & biological questions of interest. This modeling framework is an extension of the well-known [Multivariate Adapative Regression Splines (MARS)](https://en.wikipedia.org/wiki/Multivariate_adaptive_regression_spline) method, which uses truncated power basis splines to build nonlinear models using a [generalized cross validation (GCV) criterion](https://doi.org/10.48550/arXiv.1706.02495). +The `scLANE` package enables users to accurately determine differential expression of genes over pseudotime or latent time, and to characterize gene's dynamics using interpretable model coefficients. `scLANE` builds upon the `marge` modeling framework([GitHub](https://github.com/JakubStats/marge), [paper](https://doi.org/10.1080/10618600.2017.1360780)), allowing users to characterize their trajectory's effects on gene expression using negative binomial GLMs, GEEs, or GLMMs depending on the experimental design & biological questions of interest. This modeling framework is an extension of the [Multivariate Adapative Regression Splines (MARS)](https://en.wikipedia.org/wiki/Multivariate_adaptive_regression_spline) method, which builds nonlinear models out of piecewise linear components. `scLANE` is agnostic with respect to the ordering estimation method used, and can be implemented downstream of any pseudotime or RNA velocity method. A quickstart guide on how to use `scLANE` with simulated data continues below, and a more detailed vignette showcasing its performance on real data can be found [here](https://jr-leary7.github.io/quarto-site/tutorials/scLANE_Trajectory_DE.html). # Usage -Our method relies on a relatively simple test in order to define whether a given gene is differentially expressed (or "dynamic") over the provided trajectory. While the exact structure of the test differs by model backend, the concept is the same: the spline-based NB GLM / GEE / GLMM is treated as the alternate model, and a null model is fit using the corresponding model backend. If the GLM backend is used, then the null model is simply an intercept-only NB GLM; the GEE backend fits an intercept-only model with the same working correlation structure as the alternate model, and if the GLMM backend is used then the null model is an intercept-only model with random intercepts for each subject. The alternate hypothesis is thus that at least one of the estimated coefficients is significantly different from zero. We predict a given gene to be dynamic if the adjusted *p*-value of the test is less than an *a priori* threshold; the default threshold is $\alpha = 0.01$, and the default adjustment method is [the Holm correction](https://en.wikipedia.org/wiki/Holm–Bonferroni_method). +Our method relies on a relatively simple test in order to define whether a given gene is differentially expressed (or "dynamic") over the provided trajectory. While the exact structure of the test differs by model mode, the concept is the same: the spline-based NB GLM / GEE / GLMM is treated as the alternate model, and a null model is fit using the corresponding model mode. If the GLM mode is used, then the null model is simply an intercept-only NB GLM; the GEE mode fits an intercept-only model with the same working correlation structure as the alternate model, and if the GLMM mode is used then the null model is an intercept-only model with random intercepts for each subject. The alternate hypothesis is that at least one of the estimated coefficients is significantly different from zero. We predict a given gene to be dynamic if the adjusted *p*-value of the test is less than the default $\alpha = 0.01$ threshold, and classify it as static otherwise. ## Libraries -First we'll need to load a couple packages. - ```{r libraries, results='hide', message=FALSE} library(dplyr) library(scLANE) @@ -119,7 +119,7 @@ scLANE_models_glm <- testDynamic(sim_data, verbose = FALSE) ``` -After the function finishes running, we use `getResultsDE()` to generate a sorted table of DE test results, with one row for each gene & lineage. The GLM mode uses a simple likelihood ratio test to compare the null & alternate models, with the test statistic assumed to be [asymptotically Chi-squared distributed](https://en.wikipedia.org/wiki/Likelihood-ratio_test). +After the function finishes running, we use `getResultsDE()` to generate a sorted table of DE test results, with one row for each gene & lineage. The GLM mode uses a simple likelihood ratio test to compare the null & alternate models, with the test statistic assumed to be asymptotically Chi-squared distributed. ```{r glm-results} scLANE_res_glm <- getResultsDE(scLANE_models_glm) @@ -132,7 +132,7 @@ select(scLANE_res_glm, Gene, Lineage, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic_ ### GEE mode -The function call is essentially the same when using the GLM mode, with the exception of needing to provide a sorted vector of subject IDs & a desired correlation structure, the default being [the AR1 structure](https://en.wikipedia.org/wiki/Autoregressive_model). We also need to flip the `is.gee` flag in order to indicate that we'd like to fit estimating equations models (instead of mixed models). Since fitting GEEs is more computationally complex than fitting GLMs, DE testing with the GEE mode takes a bit longer. Using more cores and / or running the tests on an HPC cluster speeds things up considerably. +The function call is essentially the same when using the GLM mode, with the exception of needing to provide a sorted vector of subject IDs & a desired correlation structure. We also need to flip the `is.gee` flag in order to indicate that we'd like to fit estimating equations models (instead of mixed models). Since fitting GEEs is more computationally complex than fitting GLMs, DE testing with the GEE mode takes a bit longer. Using more cores and / or running the tests on an HPC cluster speeds things up considerably. ```{r gee-models} scLANE_models_gee <- testDynamic(sim_data, @@ -146,7 +146,7 @@ scLANE_models_gee <- testDynamic(sim_data, verbose = FALSE) ``` -We again generate the table of DE test results. The variance of the estimated coefficients is determined using [the sandwich estimator](https://online.stat.psu.edu/stat504/lesson/12/12.3), and a Wald test is used to compare the null & alternate models. +We again generate the table of DE test results. The variance of the estimated coefficients is determined using the sandwich estimator, and a Wald test is used to compare the null & alternate models. ```{r gee-results} scLANE_res_gee <- getResultsDE(scLANE_models_gee) @@ -176,7 +176,7 @@ scLANE_models_glmm <- testDynamic(sim_data, **Note:** The GLMM mode is still under development, as we are working on further reducing runtime and increasing the odds of the underlying optimization process converging successfully. As such, updates will be frequent and functionality / results may shift slightly. -Like the GLM mode, the GLMM mode uses a likelihood ratio test to compare the null & alternate models. We fit the two nested models using maximum likelihood estimation instead of [REML](https://en.wikipedia.org/wiki/Restricted_maximum_likelihood) in order to perform this test; the null model in this case is a negative binomial GLMM with a random intercept for each subject. +Like the GLM mode, the GLMM mode uses a likelihood ratio test to compare the null & alternate models. ```{r glmm-results} scLANE_res_glmm <- getResultsDE(scLANE_models_glmm) @@ -191,7 +191,7 @@ select(scLANE_res_glmm, Gene, Lineage, Test_Stat, P_Val, P_Val_Adj, Gene_Dynamic ### Model comparison -We can use the `plotModels()` to visually compare different types of models. It takes as input the results from `testDynamic()`, as well as a few specifications for which models & lineages should be plotted. While more complex visualizations can be created from our model output, this function gives us a good first glance at which models fit the underlying trend the best. Here we show the output generated using the GLM mode, split by model type. The intercept-only model shows the null hypothesis against which the scLANE model is compared using the likelihood ratio test and the GLM displays the inadequacy of monotonic modeling architectures for nonlinear dynamics. A GAM shows essentially the same trend as the `scLANE` model, though the fitted trend from `scLANE` is more interpretable & has a narrower confidence interval. +We can use the `plotModels()` to visually compare different types of models. It takes as input the results from `testDynamic()`, as well as a few specifications for which models & lineages should be plotted. While more complex visualizations can be created from our model output, this function gives us a good first glance at which models fit the underlying trend the best. Here we show the output generated using the GLM mode, split by model type. The intercept-only model shows the null hypothesis against which the scLANE model is compared using the likelihood ratio test and the GLM displays the inadequacy of monotonic modeling architectures for nonlinear dynamics. A GAM shows essentially the same trend as the `scLANE` model, though the fitted trend from `scLANE` is more interpretable. ```{r plot-models-glm} plotModels(scLANE_models_glm, @@ -205,22 +205,6 @@ plotModels(scLANE_models_glm, plot.scLANE = TRUE) ``` -Model comparison using the GEE mode is similar, with the only change being that we now provide a vector of subject IDs. - -```{r plot-models-gee} -plotModels(scLANE_models_gee, - gene = scLANE_res_gee$Gene[1], - is.gee = TRUE, - id.vec = sim_data$subject, - pt = order_df, - expr.mat = sim_data, - size.factor.offset = cell_offset, - plot.null = TRUE, - plot.glm = TRUE, - plot.gam = TRUE, - plot.scLANE = TRUE) -``` - When plotting the models generated using the GLMM mode, we split by lineage & color the points by subject ID instead of by lineage. The gene in question highlights the utility of the scLANE model, since the gene dynamics differ significantly by subject. ```{r plot-models-glmm, fig.width=9, fig.height=9} @@ -231,8 +215,6 @@ plotModels(scLANE_models_glmm, size.factor.offset = cell_offset, id.vec = sim_data$subject, is.glmm = TRUE, - plot.null = FALSE, - plot.glm = TRUE, plot.gam = TRUE, plot.scLANE = TRUE) ``` diff --git a/inst/.DS_Store b/inst/.DS_Store index f7fcef9fd29c7d49b680444a9d497b6bd130df7a..366ba3e45417a09d735a9326db360dbc37f0ec03 100644 GIT binary patch delta 345 zcmZoMXfc=|#>B!ku~2NHo+2a9#(>?7ixpUy7V&;WjXY(YYM3q9d*Q|GKEN$1sOn_x#3nfFl>I# z#LtN8AnnKqkd+K245B)qu~2NHo+2a5#(>?7j4YEAS;ZzRvD}_Km-XjlJ@%U$8!VVNvvcrs d099@lK_85meLM~JLp1^^S}6E^?= diff --git a/inst/rmarkdown/.DS_Store b/inst/rmarkdown/.DS_Store index 271a8005d45ebbf8b9dea66b53656fca55c7497b..72fada99b0e556b7cd70b084e3dc25c8c3c59d68 100644 GIT binary patch delta 168 zcmZoMXfc@J&&akhU^gQp+hiW5kM%j}hQZ1CxdjYhV7vxMaWa%Jq%!0(6fop4B!bz+ xx%ndMMoWRsZ1eKWkCkeW^TCE2@IQ`Gc~YmX6N|J4*;J!DAND{ delta 37 tcmZoMXfc@J&&aefU^nAr0}+