diff --git a/R/testNhoods.R b/R/testNhoods.R index e7bd332..267b0ca 100644 --- a/R/testNhoods.R +++ b/R/testNhoods.R @@ -58,6 +58,9 @@ #' @param fail.on.error A logical scalar the determines the behaviour of the error reporting. Used for debugging only. #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the arguments for parallelisation. By default #' this will evaluate using \code{SerialParam()}. See \code{details}on how to use parallelisation in \code{testNhoods}. +#' @param force A logical scalar that overrides the default behaviour to nicely error when N < 50 and using a mixed +#' effect model. This is because model parameter estimation may be unstable with these sample sizes, and hence the +#' fixed effect GLM is recommended instead. If used with the LMM, a warning will be produced. #' #' @details #' This function wraps up several steps of differential abundance testing using @@ -84,6 +87,14 @@ #' consistency. While it is strictly feasible to compute multiple contrasts at once, the #' recommendation, for ease of interpretability, is to compute one at a time. #' +#' If using the GLMM option, i.e. including a random effect variable in the \code{design} +#' formula, then \code{testNhoods} will check for the sample size of the analysis. If this is +#' less than 60 it will stop and produce an error. It is \emph{strongly} recommended that +#' the GLMM is not used with relatively small sample sizes, i.e. N<60, and even up to N~100 +#' may have unstable parameter estimates across nhoods. This behaviour can be overriden by +#' setting \code{force=TRUE}, but also be aware that parameter estimates may not be +#' accurate. A warning will be produced to alert you to this fact. +#' #' @return A \code{data.frame} of model results, which contain: #' \describe{ #' \item{\code{logFC}:}{Numeric, the log fold change between conditions, or for @@ -152,7 +163,8 @@ testNhoods <- function(x, design, design.df, kinship=NULL, min.mean=0, model.contrasts=NULL, robust=TRUE, reduced.dim="PCA", REML=TRUE, norm.method=c("TMM", "RLE", "logMS"), cell.sizes=NULL, max.iters = 50, max.tol = 1e-5, glmm.solver=NULL, - subset.nhoods=NULL, fail.on.error=FALSE, BPPARAM=SerialParam()){ + subset.nhoods=NULL, fail.on.error=FALSE, BPPARAM=SerialParam(), + force=FALSE){ is.lmm <- FALSE geno.only <- FALSE if(is(design, "formula")){ @@ -223,6 +235,16 @@ testNhoods <- function(x, design, design.df, kinship=NULL, stop("design must be either a formula or model matrix") } + check.n <- nrow(x.model) < 60 + + if(is.lmm & check.n & isFALSE(force)){ + stop("You are attempting to use the GLMM with N=", nrow(x.model), ". It is ", + "strongly discouraged. To override this behaviour set force=TRUE") + } else if(is.lmm & check.n & force){ + warning("Running GLMM with small sample size, N=", nrow(x.model), ". Model ", + "estimates may not be reliable") + } + if(!is(x, "Milo")){ stop("Unrecognised input type - must be of class Milo") } else if(.check_empty(x, "nhoodCounts")){ diff --git a/tests/testthat/test_testNhoods.R b/tests/testthat/test_testNhoods.R index 78bc28e..4e4a1d2 100644 --- a/tests/testthat/test_testNhoods.R +++ b/tests/testthat/test_testNhoods.R @@ -250,16 +250,22 @@ test_that("Singular Hessians are detectable and fail appropriately", { # collinear fixed and random effects expect_error(suppressWarnings(testNhoods(sim1.mylo, design=~Condition + (1|Condition), - design.df=sim1.meta, glmm.solver="Fisher", fail.on.error=TRUE)), - "matrix is singular") + design.df=sim1.meta, glmm.solver="Fisher", force=TRUE, fail.on.error=TRUE)), + "Lowest traceback returned") }) test_that("Invalid formulae give expected errors", { expect_error(suppressWarnings(testNhoods(sim1.mylo, design=~Condition + (50|Condition), - design.df=sim1.meta, glmm.solver="Fisher")), + design.df=sim1.meta, force=TRUE, glmm.solver="Fisher")), "is an invalid formula for random effects") }) +test_that("Small sample sizes produce expected warnings and errors", { + expect_error(suppressWarnings(testNhoods(sim1.mylo, design=~Condition + (1|Condition), + design.df=sim1.meta, force=FALSE, glmm.solver="Fisher")), + "You are attempting to use the GLMM with N") +}) + test_that("NA or Inf cell sizes causes the expected errors", { cell.sizes.na <- colSums(nhoodCounts(sim1.mylo)) cell.sizes.na[1] <- NA