
Commit

Merge pull request #264 from jr-leary7/dev
Dev
jr-leary7 authored Oct 29, 2024
2 parents 6f385a9 + 14e3650 commit 4648e53
Showing 20 changed files with 212 additions and 68 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: scLANE
Type: Package
Title: Model Gene Expression Dynamics with Spline-Based NB GLMs, GEEs, & GLMMs
Version: 0.8.6
Version: 0.8.7
Authors@R: c(person(given = c("Jack", "R."), family = "Leary", email = "[email protected]", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")),
person(given = "Rhonda", family = "Bacher", email = "[email protected]", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X")))
Description: Our scLANE model uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time.
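
For context, a short sketch of the truncated power basis the Description refers to. This is the standard MARS-style definition, not text quoted from the package documentation, and the mean-model form below is illustrative only:

```latex
% Degree-1 truncated power ("hinge") basis about a knot t_k -- standard definition,
% not quoted from the package documentation:
(x - t_k)_{+} = \max(0,\, x - t_k), \qquad (t_k - x)_{+} = \max(0,\, t_k - x)
% Illustrative NB mean model over pseudotime x with log link:
\log \mu(x) = \beta_0 + \sum_{k} \left\{ \beta_{k1}\,(x - t_k)_{+} + \beta_{k2}\,(t_k - x)_{+} \right\}
```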
1 change: 1 addition & 0 deletions NAMESPACE
@@ -136,6 +136,7 @@ importFrom(stats,fitted.values)
importFrom(stats,hclust)
importFrom(stats,kmeans)
importFrom(stats,logLik)
importFrom(stats,model.matrix)
importFrom(stats,offset)
importFrom(stats,p.adjust)
importFrom(stats,p.adjust.methods)
7 changes: 7 additions & 0 deletions NEWS.md
@@ -1,3 +1,10 @@
# Changes in v0.8.7

+ Switched GEE fitting back to using `scale.fix = FALSE` and a fixed value for the negative-binomial overdispersion parameter, as this improves model fits.
+ Added an option to use a Lagrange multiplier (score) test in GEE mode instead of the default Wald test. The relevant argument is `gee.test` in `testDynamic()`.
+ Updated documentation and some tests.
+ Added a column called `Null_Fit_Notes` to the output of `getResultsDE()` to describe when and how null models fail. This doesn't happen frequently, but it's good information to have when it does.
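
A minimal usage sketch of the new score test and the `Null_Fit_Notes` column. This is hypothetical: aside from `gee.test` and `getResultsDE()`, the argument names, the `"score"` value, and the simulated objects are assumed from other examples in this diff and may differ from the actual API:

```r
# Hypothetical sketch -- argument names other than gee.test are assumptions
# inferred from the plotModelCoefs() example elsewhere in this diff.
cell_offset <- createCellOffset(sim_counts)
gee_models <- testDynamic(sim_counts,
                          pt = sim_pseudotime,
                          size.factor.offset = cell_offset,
                          is.gee = TRUE,
                          id.vec = sim_subject_ids,  # subject IDs for the GEE working correlation (hypothetical object)
                          gee.test = "score")        # Lagrange multiplier (score) test instead of the default Wald test
de_res <- getResultsDE(gee_models)
# new in v0.8.7: notes describing when & how null model fits failed
table(de_res$Null_Fit_Notes, useNA = "ifany")
```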

# Changes in v0.8.6

+ Changed GEE fitting to use `scale.fix = TRUE` throughout the package, as it appears to be faster and more statistically efficient based on simulated data benchmarking.
8 changes: 4 additions & 4 deletions R/backward_sel_WIC.R
@@ -7,7 +7,7 @@
#' @importFrom gamlss gamlss
#' @importFrom geeM geem
#' @importFrom MASS negative.binomial
#' @importFrom stats vcov
#' @importFrom stats vcov coef
#' @param Y The response variable. Defaults to NULL.
#' @param B_new The model matrix. Defaults to NULL.
#' @param is.gee Is the model a GEE? Defaults to FALSE.
@@ -36,8 +36,8 @@ backward_sel_WIC <- function(Y = NULL,
fit <- geeM::geem(Y ~ B_new - 1,
id = id.vec,
corstr = cor.structure,
family = MASS::negative.binomial(theta.hat, link = "log"),
scale.fix = TRUE,
family = MASS::negative.binomial(50, link = "log"),
scale.fix = FALSE,
sandwich = sandwich.var)
wald_stat <- unname(summary(fit)$wald.test[-1])^2
} else {
Expand All @@ -47,7 +47,7 @@ backward_sel_WIC <- function(Y = NULL,
vcov_mat <- try({ stats::vcov(fit, type = "all") }, silent = TRUE)
if (inherits(vcov_mat, "try-error")) {
covmat_unscaled <- chol2inv(fit$mu.qr$qr[1:(fit$mu.df - fit$mu.nl.df), 1:(fit$mu.df - fit$mu.nl.df), drop = FALSE])
wald_stat <- unname(coef(fit) / sqrt(diag(covmat_unscaled))^2)[-1]
wald_stat <- unname(stats::coef(fit) / sqrt(diag(covmat_unscaled))^2)[-1]
} else {
wald_stat <- unname((vcov_mat$coef / vcov_mat$se)[-c(1, length(vcov_mat$coef))]^2)
}
4 changes: 2 additions & 2 deletions R/getResultsDE.R
@@ -27,7 +27,7 @@ getResultsDE <- function(test.dyn.res = NULL,
fdr.cutoff = 0.01,
n.cores = 2L) {
# check inputs
if (is.null(test.dyn.res)) { stop("Please provide a result list.") }
if (is.null(test.dyn.res)) { stop("Please provide a result list from testDynamic().") }
if (!p.adj.method %in% stats::p.adjust.methods) { stop("Please choose a valid p-value adjustment method.") }
# set up parallel processing
if (n.cores > 1L) {
@@ -40,7 +40,7 @@
function(x) {
purrr::map_dfr(x,
function(y) {
as.data.frame(rbind(y[seq(13)])) %>%
as.data.frame(rbind(y[seq(14)])) %>%
dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z))))
})
}) %>%
29 changes: 12 additions & 17 deletions R/marge2.R
@@ -24,7 +24,7 @@
#' @param cor.structure If \code{is.gee = TRUE}, a string specifying the desired correlation structure for the NB GEE. Defaults to "ar1".
#' @param sandwich.var (Optional) Should the sandwich variance estimator be used instead of the model-based estimator? Defaults to FALSE.
#' @param approx.knot (Optional) Should the set of candidate knots be subsampled in order to speed up computation? This has little effect on the final fit, but can improve computation time somewhat. Defaults to TRUE.
#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 50.
#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 25.
#' @param glm.backend (Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS".
#' @param tols_score (Optional) The set tolerance for monitoring the convergence for the difference in score statistics between the parent and candidate model (this is the lack-of-fit criterion used for MARGE). Defaults to 0.00001.
#' @param minspan (Optional) A set minimum span value. Defaults to NULL.
@@ -63,7 +63,7 @@ marge2 <- function(X_pred = NULL,
cor.structure = "ar1",
sandwich.var = FALSE,
approx.knot = TRUE,
n.knot.max = 50,
n.knot.max = 25,
glm.backend = "MASS",
tols_score = 1e-5,
minspan = NULL,
Expand All @@ -84,12 +84,6 @@ marge2 <- function(X_pred = NULL,
}
q <- ncol(X_pred) # Number of predictor variables
B <- as.matrix(rep(1, NN)) # Start with the intercept model.
# estimate "known" theta for GEE models
if (is.gee || !is.glmm) {
theta_hat <- MASS::theta.mm(y = Y,
mu = mean(Y),
dfr = NN - 1)
}

pen <- 2 # penalty for GCV criterion -- could also switch to log(N) later
colnames(B) <- "Intercept"
@@ -110,6 +104,7 @@
breakFlag <- FALSE
ok <- TRUE
int.count <- 0
theta_hat <- 1

while (ok) { # this is such egregiously bad code lol
if (breakFlag) {
@@ -167,6 +162,7 @@
X_red <- seq(min(X_red),
max(X_red),
length.out = n.knot.max)
X_red <- round(X_red, 4)
}
} else {
# original candidate knot selection from 2017 Stoklosa & Warton paper
@@ -638,13 +634,13 @@
} else {
var_name_list1 <- vector("list")
if (trunc.type == 2) {
B_temp <- cbind(B, b1_new, b2_new) # Additive model with both truncated basis functions.
B_temp <- cbind(B, b1_new, b2_new) # Additive model with both truncated basis functions.
B_new <- cbind(b1_new, b2_new)
B_names <- c(B_name1, B_name2)
var_name_list1 <- c(var_name_list1, list(var_name))
var_name_list1 <- c(var_name_list1, list(var_name)) # Repeat it because there are two new truncated basis functions in the set.
} else {
B_temp <- cbind(B, b1_new) # Additive model with one truncated basis function (i.e., the positive part).
B_temp <- cbind(B, b1_new) # Additive model with one truncated basis function (i.e., the positive part).
B_new <- b1_new
B_names <- B_name1
var_name_list1 <- c(var_name_list1, list(var_name))
@@ -777,7 +773,7 @@
colnames(wic_mat_2)[(ncol_B + 1)] <- "Forward pass model"

wic_mat_2[1, (ncol_B + 1)] <- full.wic
wic1_2 <- backward_sel_WIC(Y = Y, B_new = B_new)
wic1_2 <- backward_sel_WIC(Y, B_new = B_new)
wic_mat_2[2, 2:(length(wic1_2) + 1)] <- wic1_2
WIC_2 <- sum(apply(wic_mat_2[seq(2), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new)
WIC_vec_2 <- c(WIC_vec_2, WIC_2)
@@ -787,8 +783,7 @@
B_new_2 <- as.matrix(B_new[, -(variable.lowest_2 + 1)])
cnames_2 <- c(cnames_2, list(colnames(B_new_2)))
for (i in 2:(ncol_B - 1)) {
wic1_2 <- backward_sel_WIC(Y = Y,
B_new = B_new_2)
wic1_2 <- backward_sel_WIC(Y, B_new = B_new_2)
if (i != (ncol_B - 1)) {
wic_mat_2[(i + 1), colnames(B_new_2)[-1]] <- wic1_2
WIC_2 <- sum(apply(wic_mat_2[seq(i + 1), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2)
@@ -837,24 +832,24 @@
final_mod <- geeM::geem(model_formula,
data = model_df,
id = id.vec,
family = MASS::negative.binomial(theta_hat, link = log),
family = MASS::negative.binomial(50, link = log),
corstr = cor.structure,
scale.fix = TRUE,
scale.fix = FALSE,
sandwich = sandwich.var)
} else {
if (glm.backend == "MASS") {
final_mod <- MASS::glm.nb(model_formula,
data = model_df,
method = "glm.fit2",
link = log,
init.theta = theta_hat,
init.theta = 1,
y = FALSE,
model = FALSE)
final_mod <- stripGLM(glm.obj = final_mod)
} else if (glm.backend == "speedglm") {
final_mod <- speedglm::speedglm(model_formula,
data = model_df,
family = MASS::negative.binomial(theta_hat, link = "log"),
family = MASS::negative.binomial(50, link = log),
trace = FALSE,
model = FALSE,
y = FALSE,
2 changes: 1 addition & 1 deletion R/plotModelCoefs.R
@@ -24,7 +24,7 @@
#' cell_offset <- createCellOffset(sim_counts)
#' scLANE_de_res <- getResultsDE(scLANE_models)
#' plotModelCoefs(scLANE_models,
#' gene = scLANE_de_res$Gene[1],
#' gene = "ACLY",
#' pt = sim_pseudotime,
#' expr.mat = sim_counts,
#' size.factor.offset = cell_offset)
2 changes: 1 addition & 1 deletion R/pullMARGESummary.R
@@ -20,7 +20,7 @@ pullMARGESummary <- function(marge.model = NULL,
sandwich.var = FALSE,
is.glmm = FALSE) {
# check inputs
if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary") }
if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary().") }
# handle the degenerate case
if (inherits(marge.model, "try-error")) {
res <- list(marge_pred_df = NULL,
5 changes: 3 additions & 2 deletions R/pullNullSummary.R
@@ -20,7 +20,7 @@ pullNullSummary <- function(null.model = NULL,
sandwich.var = FALSE,
is.glmm = FALSE) {
# check inputs
if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary") }
if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary().") }
# handle the degenerate case
if (inherits(null.model, "try-error")) {
res <- list(null_sumy_df = NULL,
@@ -73,7 +73,8 @@
res <- list(null_sumy_df = null_sumy_df,
null_pred_df = null_pred_df,
null_ll = ifelse(is.gee, NA_real_, as.numeric(stats::logLik(null.model))),
null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model)))
null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model)),
null_fit_notes = NA_character_)
}
return(res)
}
70 changes: 70 additions & 0 deletions R/scoreTestGEE.R
@@ -0,0 +1,70 @@
#' Use a Lagrange multiplier (score) test to compare nested GEE models.
#'
#' @name scoreTestGEE
#' @author Jack R. Leary
#' @description Performs a basic Lagrange multiplier test to determine whether an alternate model is significantly better than a nested null model. This is the GEE equivalent (kind of) of \code{\link{modelLRT}}. Be careful with small sample sizes.
#' @importFrom stats model.matrix predict pchisq
#' @param mod.1 The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL.
#' @param mod.0 The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL.
#' @param alt.df The dataframe used to fit the alternative model. Defaults to NULL.
#' @param null.df The dataframe used to fit the null model. Defaults to NULL.
#' @param sandwich.var Boolean specifying whether the sandwich variance-covariance matrix should be used. Defaults to FALSE.
#' @return A list containing the Score test statistic, a \emph{p}-value, and the degrees of freedom used in the test.
#' @details
#' \itemize{
#' \item Calculating the test statistic involves taking the inverse of the variance-covariance matrix of the coefficients. Ideally this would be done using the "true" inverse with something like \code{\link{solve}}, \code{\link{qr.solve}}, or \code{\link{chol2inv}}, but in practice this can cause issues when the variance-covariance matrix is near-singular. With this in mind, we use the Moore-Penrose pseudoinverse as implemented in \code{\link[MASS]{ginv}} instead, which leads to more stable results.
#' \item The \emph{p}-value is calculated using an asymptotic \eqn{\chi^2} distribution, with the degrees of freedom equal to the number of non-intercept coefficients in the alternative model.
#' }
#' @seealso \code{\link[geeM]{geem}}
#' @seealso \code{\link{waldTestGEE}}
#' @seealso \code{\link{modelLRT}}

scoreTestGEE <- function(mod.1 = NULL,
mod.0 = NULL,
alt.df = NULL,
null.df = NULL,
sandwich.var = FALSE) {
# check inputs
if (is.null(mod.1) || is.null(mod.0) || is.null(alt.df) || is.null(null.df)) { stop("Please provide all inputs to scoreTestGEE().") }
if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) {
res <- list(Score_Stat = NA_real_,
DF = NA_real_,
P_Val = NA_real_,
Notes = "No test performed due to model failure.")
return(res)
}
mod.1 <- mod.1$final_mod
if (!(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to scoreTestGEE().") }
if (length(coef(mod.0)) != 1) { stop("Null GEE model must be intercept-only.") }
if (length(coef(mod.1)) <= length(coef(mod.0))) {
# can't calculate Score statistic if both models are intercept-only
res <- list(Score_Stat = 0,
DF = 0,
P_Val = 1,
Notes = NA_character_)
} else {
X_null <- stats::model.matrix(mod.0, data = null.df)
X_alt <- stats::model.matrix(mod.1, data = alt.df)
residuals_null <- null.df$Y - exp(stats::predict(mod.0))
if (sandwich.var) {
V_null <- as.matrix(mod.0$var)
} else {
V_null <- as.matrix(mod.0$naiv.var)
}
X_alt_t <- t(X_alt)
U <- X_alt_t %*% residuals_null
V_U <- (X_alt_t * as.numeric(V_null)) %*% X_alt
V_U_inv <- try({ eigenMapMatrixInvert(V_U) }, silent = TRUE)
if (inherits(V_U_inv, "try-error")) {
V_U_inv <- eigenMapPseudoInverse(V_U)
}
test_stat <- as.numeric(t(U) %*% V_U_inv %*% U)
df1 <- ncol(X_alt) - ncol(X_null)
p_value <- 1 - stats::pchisq(test_stat, df = df1)
res <- list(Score_Stat = test_stat,
DF = df1,
P_Val = p_value,
Notes = NA_character_)
}
return(res)
}
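
In equation form, the test assembled in the branch above works out to the following. This is a sketch of what the code computes (not a general derivation); here \(\hat{v}_0\) is the model-based or sandwich variance of the null (intercept-only) coefficient, and a pseudoinverse replaces \(V_U^{-1}\) when \(V_U\) is near-singular:

```latex
% Sketch of the score statistic as computed by scoreTestGEE() above.
U = X_{\mathrm{alt}}^{\top}\left(y - \hat{\mu}_0\right), \qquad
\hat{\mu}_0 = \exp\!\left(X_{\mathrm{null}}\hat{\beta}_0\right), \qquad
V_U = \hat{v}_0\, X_{\mathrm{alt}}^{\top} X_{\mathrm{alt}}
\quad\Longrightarrow\quad
S = U^{\top} V_U^{-1} U \;\overset{a}{\sim}\; \chi^2_{\nu}, \qquad
\nu = \operatorname{ncol}(X_{\mathrm{alt}}) - \operatorname{ncol}(X_{\mathrm{null}}), \qquad
p = P\!\left(\chi^2_{\nu} > S\right)
```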
7 changes: 3 additions & 4 deletions R/stat_out_score_gee_null.R
@@ -35,10 +35,9 @@ stat_out_score_gee_null <- function(Y = NULL,
id = id.vec,
data = NULL,
corstr = cor.structure,
family = MASS::negative.binomial(theta.hat),
scale.fix = TRUE,
sandwich = FALSE,
maxit = 10)
family = MASS::negative.binomial(50, link = "log"),
scale.fix = FALSE,
sandwich = FALSE)
alpha_est <- ests$alpha
sigma_est <- ests$phi
mu_est <- as.matrix(stats::fitted.values(ests))
2 changes: 1 addition & 1 deletion R/summary.R
@@ -31,7 +31,7 @@ summary.scLANE <- function(test.dyn.res = NULL,
summary_stats$mean_adj_p_value <- mean(adj_p_values, na.rm = TRUE)
summary_stats$test_type <- ifelse(test.dyn.res[[1]][[1]]$Test_Stat_Type == "LRT",
"Likelihood Ratio Test",
"Wald Test")
ifelse(test.dyn.res[[1]][[1]]$Test_Stat_Type == "Wald", "Wald Test", "Score Test"))
class(summary_stats) <- "summary.scLANE"
return(summary_stats)
}
