diff --git a/.github/workflows/RCMD_check.yml b/.github/workflows/RCMD_check.yml index 5eb4b75..5a31f1e 100644 --- a/.github/workflows/RCMD_check.yml +++ b/.github/workflows/RCMD_check.yml @@ -6,14 +6,12 @@ on: - main - master - devel - - alice_fast - - alice + - genetic_case pull_request: branches: - main - master - devel - - alice name: R-CMD-check @@ -42,11 +40,11 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.config.r }} - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Query dependencies run: | @@ -74,8 +72,8 @@ jobs: - name: Install dependencies run: | remotes::install_deps(dependencies = TRUE) - remotes::install_cran("rcmdcheck") - BiocManager::install("MouseGastrulationData") + remotes::install_cran(c("rcmdcheck", "XML", "RCurl")) + BiocManager::install(c("MouseGastrulationData", "MouseThymusAgeing", "scRNAseq")) shell: Rscript {0} - name: Check diff --git a/.gitignore b/.gitignore index c3a1b52..7698d6a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,6 @@ doc Meta docs .Rproj.user -*yml \ No newline at end of file +*yml +*.o +*~ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index ebb85d8..23ef2d6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,31 +1,36 @@ Package: miloR Type: Package Title: Differential neighbourhood abundance testing on a graph -Version: 1.9.99 +Version: 1.99.9 Authors@R: - c(person("Mike", "Morgan", role=c("aut", "cre"), email="michael.morgan@abdn.ac.uk"), - comment=c(ORCID="0000-0003-0757-0711"), + c(person("Mike", "Morgan", role=c("aut", "cre"), email="michael.morgan@abdn.ac.uk", + comment=c(ORCID="0000-0003-0757-0711")), person("Emma", "Dann", role=c("aut", "ctb"), email="ed6@sanger.ac.uk")) Description: Milo performs single-cell differential abundance testing. Cell states are modelled - as representative neighbourhoods on a nearest neighbour graph. Hypothesis testing is performed using a - negative bionomial generalized linear model. + as representative neighbourhoods on a nearest neighbour graph. Hypothesis testing is performed using either + a negative binomial generalized linear model or a negative binomial generalized linear mixed model.
License: GPL-3 + file LICENSE Encoding: UTF-8 URL: https://marionilab.github.io/miloR BugReports: https://github.com/MarioniLab/miloR/issues biocViews: SingleCell, MultipleComparison, FunctionalGenomics, Software +LinkingTo: Rcpp, + RcppArmadillo, + RcppEigen Depends: R (>= 4.0.0), edgeR Imports: BiocNeighbors, BiocGenerics, SingleCellExperiment, Matrix (>= 1.3-0), + MatrixGenerics, S4Vectors, stats, stringr, methods, igraph, irlba, + utils, cowplot, BiocParallel, BiocSingular, @@ -42,10 +47,11 @@ Imports: BiocNeighbors, ggrepel, ggbeeswarm, RColorBrewer, - grDevices + grDevices, + Rcpp, + numDeriv Suggests: testthat, - MASS, mvtnorm, scater, scran, @@ -59,7 +65,9 @@ Suggests: MouseThymusAgeing, magick, RCurl, + MASS, curl, + scRNAseq, graphics RoxygenNote: 7.2.3 NeedsCompilation: no @@ -71,15 +79,19 @@ Collate: 'buildGraph.R' 'calcNhoodExpression.R' 'calcNhoodDistance.R' + 'checkSeparation.R' 'countCells.R' 'findNhoodMarkers.R' 'graphSpatialFDR.R' + 'glmm.R' 'makeNhoods.R' 'milo.R' 'miloR-package.R' 'methods.R' 'plotNhoods.R' 'sim_discrete.R' + 'sim_family.R' + 'sim_nbglmm.R' 'sim_trajectory.R' 'testNhoods.R' 'testDiffExp.R' @@ -88,4 +100,6 @@ Collate: 'annotateNhoods.R' 'groupNhoods.R' 'findNhoodGroupMarkers.R' + 'RcppExports.R' + 'miloR.R' VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 241ef46..21ba4be 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,20 +10,30 @@ export("nhoodIndex<-") export("nhoodReducedDim<-") export("nhoods<-") export(.calc_distance) +export(.parse_formula) +export(.rEParse) export(Milo) +export(Satterthwaite_df) export(annotateNhoods) export(buildFromAdjacency) export(buildGraph) export(buildNhoodGraph) export(calcNhoodDistance) export(calcNhoodExpression) +export(checkSeparation) +export(computePvalue) export(countCells) export(findNhoodGroupMarkers) export(findNhoodMarkers) +export(fitGLMM) +export(glmmControl.defaults) export(graph) export(graphSpatialFDR) export(groupNhoods) +export(initialiseG) +export(initializeFullZ) export(makeNhoods) +export(matrix.trace) export(nhoodAdjacency) export(nhoodCounts) export(nhoodDistances) @@ -80,18 +90,26 @@ importFrom(BiocGenerics,which) importFrom(BiocNeighbors,KmknnParam) importFrom(BiocNeighbors,findKNN) importFrom(BiocParallel,SerialParam) +importFrom(BiocParallel,bplapply) +importFrom(BiocParallel,bpok) +importFrom(BiocParallel,bpoptions) +importFrom(BiocParallel,bptry) importFrom(BiocSingular,bsparam) importFrom(Matrix,Matrix) importFrom(Matrix,colSums) importFrom(Matrix,crossprod) +importFrom(Matrix,diag) importFrom(Matrix,rowMeans) importFrom(Matrix,rowSums) +importFrom(Matrix,solve) importFrom(Matrix,sparseMatrix) importFrom(Matrix,t) importFrom(Matrix,tcrossprod) importFrom(Matrix,tril) importFrom(Matrix,which) +importFrom(MatrixGenerics,colSums2) importFrom(RColorBrewer,brewer.pal) +importFrom(Rcpp,evalCpp) importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,SimpleList) importFrom(S4Vectors,coolcat) @@ -158,6 +176,7 @@ importFrom(matrixStats,colMedians) importFrom(methods,as) importFrom(methods,callNextMethod) importFrom(methods,slot) +importFrom(numDeriv,jacobian) importFrom(stats,as.formula) importFrom(stats,dist) importFrom(stats,hclust) @@ -167,8 +186,15 @@ importFrom(stats,na.exclude) importFrom(stats,na.fail) importFrom(stats,na.omit) importFrom(stats,na.pass) +importFrom(stats,pt) +importFrom(stats,runif) importFrom(stats,setNames) +importFrom(stats,var) importFrom(stringr,str_replace) importFrom(tibble,has_rownames) importFrom(tibble,rownames_to_column) importFrom(tidyr,pivot_longer) 
+importFrom(utils,tail) +importMethodsFrom(Matrix,"%*%") +importMethodsFrom(Matrix,t) +useDynLib(miloR) diff --git a/NEWS.md b/NEWS.md index 508112e..6d8426c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,10 @@ -# 1.5.0 (TBD) +# 1.7.1 (2023-02-15) ++ Patch to fix `NA` plotting in `plotDABeeswarm` + +# 1.6.0 (2022-11-02) ++ Vignette describing the use of contrasts in `testNhoods` + +# 1.5.0 (2022-04-27) + Introduce plotting function to visualise neighbourhood count distributions for nhoods interest: `plotNhoodCounts`. Implemented by Nick Hirschmüller # 1.3.1 (2022-01-07) diff --git a/R/AllClasses.R b/R/AllClasses.R index 4b02c3f..b2186ef 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -1,4 +1,8 @@ -#' The Milo container class +#' @title +#' Milo class definition +#' +#' @description +#' The class definition container to hold the data structures required for the Milo workflow. #' #' @slot graph An igraph object that represents the kNN graph #' @slot nhoods A CxN binary sparse matrix mapping cells to the neighbourhoods they belong to @@ -14,14 +18,17 @@ #' @slot nhoodGraph an igraph object that represents the graph of neighbourhoods #' @slot .k A hidden slot that stores the value of k used for graph building #' +#' @returns A Milo class object - see object builder help pages for details +#' #' @importClassesFrom Matrix dgCMatrix dsCMatrix dgTMatrix dgeMatrix ddiMatrix sparseMatrix setClassUnion("matrixORMatrix", c("matrix", "dgCMatrix", "dsCMatrix", "ddiMatrix", - "dgTMatrix", "dgeMatrix")) # is there a record for how long a virtual class can be?! + "dgTMatrix", "dgeMatrix")) setClassUnion("characterORNULL", c("character", "NULL")) setClassUnion("listORNULL", c("list", "NULL")) setClassUnion("numericORNULL", c("numeric", "NULL")) #' @aliases Milo #' @rdname Milo +#' #' @export #' @importFrom SingleCellExperiment SingleCellExperiment #' @importFrom S4Vectors SimpleList diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..8b0d7c5 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,159 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' GLMM parameter estimation using pseudo-likelihood with a custom covariance matrix +#' +#' Iteratively estimate GLMM fixed and random effect parameters, and variance +#' component parameters using Fisher scoring based on the Pseudo-likelihood +#' approximation to a Normal log-likelihood. This function incorporates a user-defined +#' covariance matrix, e.g. a kinship matrix for genetic analyses. +#' +#' @param Z mat - sparse matrix that maps random effect variable levels to +#' observations +#' @param X mat - sparse matrix that maps fixed effect variables to +#' observations +#' @param K mat - sparse matrix that defines the known covariance patterns between +#' individual observations. For example, a kinship matrix will then adjust for the +#' known/estimated genetic relationships between observations.
+#' @param muvec vec vector of estimated phenotype means +#' @param offsets vec vector of model offsets +#' @param curr_theta vec vector of initial parameter estimates +#' @param curr_beta vec vector of initial beta estimates +#' @param curr_u vec of initial u estimates +#' @param curr_sigma vec of initial sigma estimates +#' @param curr_G mat c X c matrix of variance components +#' @param y vec of observed counts +#' @param u_indices List a List, each element contains the indices of Z relevant +#' to each RE and all its levels +#' @param theta_conv double Convergence tolerance for parameter estimates +#' @param rlevels List containing mapping of RE variables to individual +#' levels +#' @param curr_disp double Dispersion parameter estimate +#' @param REML bool - use REML for variance component estimation +#' @param maxit int maximum number of iterations if theta_conv is FALSE +#' @param solver string which solver to use - either HE (Haseman-Elston regression) or Fisher scoring +#' @param vardist string which variance form to use NB = negative binomial, P=Poisson [not yet implemented]. +#' +#' @details Fit a NB-GLMM to the counts provided in \emph{y}. The model uses an iterative approach that +#' switches between the joint fixed and random effect parameter inference, and the variance component +#' estimation. A pseudo-likelihood approach is adopted to maximise the log-likelihood of the model +#' given the parameter estimates. The fixed and random effect parameters are estimated using +#' Henderson's mixed model equations, and the variance component parameters are then estimated with +#' the specified solver, i.e. Fisher scoring, Haseman-Elston or constrained Haseman-Elston regression. As +#' the domain of the variance components is [0, +\code{Inf}], any negative variance component estimates will +#' trigger the switch to the HE-NNLS solver until the model converges. +#' +#' @return A \code{list} containing the following elements (note: return types are dictated by Rcpp, so the R +#' types are described here): +#' \describe{ +#' \item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.} +#' \item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best +#' linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.} +#' \item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable. For this model the +#' last variance component corresponds to the input \emph{K} matrix.} +#' \item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.} +#' \item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. Is strictly <= \code{max.iter}.} +#' \item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line} +#' \item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.} +#' \item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the hessian, i.e.
the square roots of the diagonal elements.} +#' \item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.} +#' \item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.} +#' \item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.} +#' \item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance +#' component.} +#' \item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.} +#' \item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.} +#' \item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1} +#' \item{\code{VCOV:}}{\code{matrix} of the variance-covariance for all model fixed and random effect variable parameter estimates. +#' This is required to compute the degrees of freedom for the fixed effect parameter inference.} +#' \item{\code{CONVLIST:}}{\code{list} of \code{list} containing the parameter estimates and differences between current and previous +#' iteration estimates at each model iteration. These are included for each fixed effect, random effect and variance component parameter. +#' The list elements for each iteration are: \emph{ThetaDiff}, \emph{SigmaDiff}, \emph{beta}, \emph{u}, \emph{sigma}.} +#' } +#' +#' @author Mike Morgan +#' +#' @examples +#' NULL +#' +#' @name fitGeneticPLGlmm +#' +fitGeneticPLGlmm <- function(Z, X, K, muvec, offsets, curr_beta, curr_theta, curr_u, curr_sigma, curr_G, y, u_indices, theta_conv, rlevels, curr_disp, REML, maxit, solver, vardist) { + .Call('_miloR_fitGeneticPLGlmm', PACKAGE = 'miloR', Z, X, K, muvec, offsets, curr_beta, curr_theta, curr_u, curr_sigma, curr_G, y, u_indices, theta_conv, rlevels, curr_disp, REML, maxit, solver, vardist) +} + +#' GLMM parameter estimation using pseudo-likelihood +#' +#' Iteratively estimate GLMM fixed and random effect parameters, and variance +#' component parameters using Fisher scoring based on the Pseudo-likelihood +#' approximation to a Normal log-likelihood. +#' +#' @param Z mat - sparse matrix that maps random effect variable levels to +#' observations +#' @param X mat - sparse matrix that maps fixed effect variables to +#' observations +#' @param muvec vec vector of estimated phenotype means +#' @param offsets vec vector of model offsets +#' @param curr_theta vec vector of initial parameter estimates +#' @param curr_beta vec vector of initial beta estimates +#' @param curr_u vec of initial u estimates +#' @param curr_sigma vec of initial sigma estimates +#' @param curr_G mat c X c matrix of variance components +#' @param y vec of observed counts +#' @param u_indices List a List, each element contains the indices of Z relevant +#' to each RE and all its levels +#' @param theta_conv double Convergence tolerance for parameter estimates +#' @param rlevels List containing mapping of RE variables to individual +#' levels +#' @param curr_disp double Dispersion parameter estimate +#' @param REML bool - use REML for variance component estimation +#' @param maxit int maximum number of iterations if theta_conv is FALSE +#' @param solver string which solver to use - either HE (Haseman-Elston regression) or Fisher scoring +#' @param vardist string which variance form to use NB = negative binomial, P=Poisson [not yet implemented]. +#' +#' @details Fit a NB-GLMM to the counts provided in \emph{y}.
The model uses an iterative approach that +#' switches between the joint fixed and random effect parameter inference, and the variance component +#' estimation. A pseudo-likelihood approach is adopted to maximise the log-likelihood of the model +#' given the parameter estimates. The fixed and random effect parameters are estimated using +#' Henderson's mixed model equations, and the variance component parameters are then estimated with +#' the specified solver, i.e. Fisher scoring, Haseman-Elston or constrained Haseman-Elston regression. As +#' the domain of the variance components is [0, +\code{Inf}], any negative variance component estimates will +#' trigger the switch to the HE-NNLS solver until the model converges. +#' +#' @return A \code{list} containing the following elements (note: return types are dictated by Rcpp, so the R +#' types are described here): +#' \describe{ +#' \item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.} +#' \item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best +#' linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.} +#' \item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable.} +#' \item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.} +#' \item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. Is strictly <= \code{max.iter}.} +#' \item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line} +#' \item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.} +#' \item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the hessian, i.e. the square roots of the diagonal elements.} +#' \item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.} +#' \item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.} +#' \item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.} +#' \item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance +#' component.} +#' \item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.} +#' \item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.} +#' \item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1} +#' \item{\code{VCOV:}}{\code{matrix} of the variance-covariance for all model fixed and random effect variable parameter estimates. +#' This is required to compute the degrees of freedom for the fixed effect parameter inference.} +#' \item{\code{CONVLIST:}}{\code{list} of \code{list} containing the parameter estimates and differences between current and previous +#' iteration estimates at each model iteration. These are included for each fixed effect, random effect and variance component parameter.
+#' The list elements for each iteration are: \emph{ThetaDiff}, \emph{SigmaDiff}, \emph{beta}, \emph{u}, \emph{sigma}.} +#' } +#' +#' @author Mike Morgan +#' +#' @examples +#' NULL +#' +#' @name fitPLGlmm +fitPLGlmm <- function(Z, X, muvec, offsets, curr_beta, curr_theta, curr_u, curr_sigma, curr_G, y, u_indices, theta_conv, rlevels, curr_disp, REML, maxit, solver, vardist) { + .Call('_miloR_fitPLGlmm', PACKAGE = 'miloR', Z, X, muvec, offsets, curr_beta, curr_theta, curr_u, curr_sigma, curr_G, y, u_indices, theta_conv, rlevels, curr_disp, REML, maxit, solver, vardist) +} + diff --git a/R/annotateNhoods.R b/R/annotateNhoods.R index 5d4456a..e22cd54 100644 --- a/R/annotateNhoods.R +++ b/R/annotateNhoods.R @@ -13,6 +13,11 @@ #' \code{testNhoods}. #' @param coldata_col A character scalar determining which column of \code{colData(x)} stores #' the annotation to be added to the neighbourhoods +#' @param subset.nhoods A character, numeric or logical vector that will subset the annotation to the specific nhoods. If +#' a character vector these should correspond to row names of \code{nhoodCounts}. If a logical vector then +#' these should have the same \code{length} as \code{nrow} of \code{nhoodCounts}. If numeric, then these are assumed +#' to correspond to indices of \code{nhoodCounts} - if the maximal index is greater than \code{nrow(nhoodCounts(x))} +#' an error will be produced. This is necessary if \code{testNhoods} was run using \code{subset.nhoods=...}. #' #' @details #' For each neighbourhood, this calculates the most frequent value of \code{colData(x)[coldata_col]} @@ -34,19 +39,23 @@ #' #' @export #' @rdname annotateNhoods -annotateNhoods <- function(x, da.res, coldata_col){ - if(!is(x, "Milo")){ - stop("Unrecognised input type - must be of class Milo") - } - - if(!coldata_col %in% names(colData(x))){ - stop(coldata_col, " is not a column in colData(x)") - } +annotateNhoods <- function(x, da.res, coldata_col, subset.nhoods=NULL){ + if(!is(x, "Milo")){ + stop("Unrecognised input type - must be of class Milo") + } - if(ncol(nhoods(x)) != nrow(da.res)){ - stop("the number of rows in da.res does not match the number of neighbourhoods in nhoods(x). Are you sure da.res is the output of testNhoods(x)?") - } + if(!coldata_col %in% names(colData(x))){ + stop(coldata_col, " is not a column in colData(x)") + } + if(is.null(subset.nhoods)){ + if(ncol(nhoods(x)) != nrow(da.res)){ + stop("the number of rows in da.res does not match the number of neighbourhoods in nhoods(x). 
Are you sure da.res is the output of testNhoods(x) or did you use subset.nhoods?") + } + keep.nh <- rep(TRUE, ncol(nhoods(x))) + } else{ + keep.nh <- subset.nhoods + } anno_vec <- colData(x)[[coldata_col]] if (!is.factor(anno_vec)) { @@ -56,14 +65,14 @@ annotateNhoods <- function(x, da.res, coldata_col){ ## Count occurrence of labels in each nhood n.levels <- length(levels(anno_vec)) - nhood_counts <- vapply(seq_len(ncol(nhoods(x))), FUN=function(n) table(anno_vec[which(nhoods(x)[,n]==1)]), + nhood_counts <- vapply(seq_len(ncol(nhoods(x)[, keep.nh, drop=FALSE])), FUN=function(n) table(anno_vec[which(nhoods(x)[, keep.nh, drop=FALSE][, n]==1)]), FUN.VALUE=numeric(n.levels)) nhood_counts <- t(nhood_counts) - rownames(nhood_counts) <- seq_len(ncol(nhoods(x))) + rownames(nhood_counts) <- seq_len(ncol(nhoods(x)[, keep.nh, drop=FALSE])) ## Fetch the most frequent label - max_val <- apply(nhood_counts, 1, function(x) colnames(nhood_counts)[which.max(x)]) - max_frac <- apply(nhood_counts, 1, function(x) max(x)/sum(x)) + max_val <- apply(nhood_counts, 1, function(X) colnames(nhood_counts)[which.max(X)]) + max_frac <- apply(nhood_counts, 1, function(X) max(X)/sum(X)) ## Add to da.res table da.res[coldata_col] <- max_val diff --git a/R/buildGraph.R b/R/buildGraph.R index 584b08b..40ee418 100644 --- a/R/buildGraph.R +++ b/R/buildGraph.R @@ -63,13 +63,13 @@ NULL buildGraph <- function(x, k=10, d=50, transposed=FALSE, get.distance=FALSE, reduced.dim="PCA", BNPARAM=KmknnParam(), BSPARAM=bsparam(), BPPARAM=SerialParam()){ - + # check class of x to determine which function to call # in all cases it must return a Milo object with the graph slot populated # what is a better design principle here? make a Milo object here and just # have one function, or have a separate function for input data type? I # think the former probably. - + if(is(x, "Milo")){ # check for reducedDims if(length(reducedDimNames(x)) == 0){ @@ -114,10 +114,10 @@ buildGraph <- function(x, k=10, d=50, transposed=FALSE, get.distance=FALSE, attr(reducedDim(x, "PCA"), "rotation") <- x_pca$rotation reduced.dim <- "PCA" } - + x <- Milo(x) } - + .buildGraph(x, k=k, d=d, get.distance=get.distance, reduced.dim=reduced.dim, BNPARAM=BNPARAM, BSPARAM=BSPARAM, BPPARAM=BPPARAM) } @@ -131,17 +131,17 @@ buildGraph <- function(x, k=10, d=50, transposed=FALSE, get.distance=FALSE, reduced.dim="PCA", BNPARAM=KmknnParam(), BSPARAM=bsparam(), BPPARAM=SerialParam()){ - + nn.out <- .setup_knn_data(x=reducedDim(x, reduced.dim), d=d, k=k, BNPARAM=BNPARAM, BSPARAM=BSPARAM, BPPARAM=BPPARAM) - + # separate graph and distances?
At some point need to expand the distances # to the larger neighbourhood message("Constructing kNN graph with k:", k) zee.graph <- .neighborsToKNNGraph(nn.out$index, directed=FALSE) graph(x) <- zee.graph - + # adding distances if(isTRUE(get.distance)){ message("Retrieving distances from ", k, " nearest neighbours") @@ -149,7 +149,7 @@ buildGraph <- function(x, k=10, d=50, transposed=FALSE, get.distance=FALSE, # starting with a sparse matrix requires a coercion at each iteration # which uses up lots of memory and unncessary CPU time old.dist <- matrix(0L, ncol=ncol(x), nrow=ncol(x)) - + n.idx <- ncol(x) for(i in seq_len(n.idx)){ i.knn <- nn.out$index[i, ] @@ -168,10 +168,9 @@ buildGraph <- function(x, k=10, d=50, transposed=FALSE, get.distance=FALSE, #' @importFrom BiocNeighbors findKNN .setup_knn_data <- function(x, k, d=50, get.distance=FALSE, BNPARAM, BSPARAM, BPPARAM) { - + # Finding the KNNs - keep the distances # input should be cells X dimensions findKNN(x[, seq_len(d)], k=k, BNPARAM=BNPARAM, BPPARAM=BPPARAM, get.distance=get.distance) } - diff --git a/R/checkSeparation.R b/R/checkSeparation.R new file mode 100644 index 0000000..23a1cc4 --- /dev/null +++ b/R/checkSeparation.R @@ -0,0 +1,114 @@ +#' Check for separation of count distributions by variables +#' +#' Check the count distributions for each nhood according to a test +#' variable of interest. This is important for checking if there is separation +#' in the GLMM to inform either nhood subsetting or re-computation of the +#' NN-graph and refined nhoods. +#' @param x \code{\linkS4class{Milo}} object with a non-empty +#' \code{nhoodCounts} slot. +#' @param design.df A \code{data.frame} containing meta-data in which \code{condition} +#' is a column variable. The rownames must be the same as, or a subset of, the +#' colnames of \code{nhoodCounts(x)}. +#' @param condition A character scalar of the test variable contained in \code{design.df}. +#' This should be a factor variable; if it is numeric or character it will be cast to a +#' factor variable. +#' @param min.val A numeric scalar that sets the minimum number of counts across condition level +#' samples, below which separation is defined. +#' @param factor.check A logical scalar that sets the factor variable level checking. See \emph{details} +#' for more information. +#' +#' @details +#' This function checks across nhoods for separation based on the separate levels +#' of an input factor variable. It checks if \emph{condition} is a factor variable, +#' and if not it will cast it to a factor. Note that the function first checks for the +#' number of unique values - if this exceeds 50% of the number of elements an +#' error is generated. Users can override this behaviour with \code{factor.check=FALSE}. +#' +#' @return +#' A logical vector of the same length as \code{ncol(nhoodCounts(x))} where \emph{TRUE} +#' values represent nhoods where separation is detected. The output of this function +#' can be used to subset nhood-based analyses +#' e.g. \code{testNhoods(..., subset.nhoods=checkSeparation(x, ...))}.
+#' +#' @author Mike Morgan +#' +#' @examples +#' library(SingleCellExperiment) +#' ux.1 <- matrix(rpois(12000, 5), ncol=400) +#' ux.2 <- matrix(rpois(12000, 4), ncol=400) +#' ux <- rbind(ux.1, ux.2) +#' vx <- log2(ux + 1) +#' pca <- prcomp(t(vx)) +#' +#' sce <- SingleCellExperiment(assays=list(counts=ux, logcounts=vx), +#' reducedDims=SimpleList(PCA=pca$x)) +#' +#' milo <- Milo(sce) +#' milo <- buildGraph(milo, k=20, d=10, transposed=TRUE) +#' milo <- makeNhoods(milo, k=20, d=10, prop=0.3) +#' milo <- calcNhoodDistance(milo, d=10) +#' +#' cond <- rep("A", ncol(milo)) +#' cond.a <- sample(1:ncol(milo), size=floor(ncol(milo)*0.25)) +#' cond.b <- setdiff(1:ncol(milo), cond.a) +#' cond[cond.b] <- "B" +#' meta.df <- data.frame(Condition=cond, Replicate=c(rep("R1", 132), rep("R2", 132), rep("R3", 136))) +#' meta.df$SampID <- paste(meta.df$Condition, meta.df$Replicate, sep="_") +#' milo <- countCells(milo, meta.data=meta.df, samples="SampID") +#' +#' test.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)), "Replicate"=rep(c("R1", "R2", "R3"), 2)) +#' test.meta$Sample <- paste(test.meta$Condition, test.meta$Replicate, sep="_") +#' rownames(test.meta) <- test.meta$Sample +#' +#' check.sep <- checkSeparation(milo, design.df=test.meta, condition='Condition') +#' sum(check.sep) +#' +#' @name checkSeparation NULL + +#' @export checkSeparation <- function(x, design.df, condition, min.val=1, factor.check=TRUE){ + if(!any(colnames(design.df) %in% condition)){ + stop(condition, " is not a variable in design.df") + } + + if(.check_empty(x, "nhoodCounts")){ + stop("nhoodCounts not found - please run countCells() first") + } + + if(all(rownames(design.df) %in% as.character(seq_len(nrow(design.df))))){ + stop("Please add rownames to design.df that are the same as the colnames of nhoodCounts(x)") + } else{ + if(sum(rownames(design.df) %in% colnames(nhoodCounts(x))) < 1){ + stop("rownames of design.df are not a subset of nhoodCounts colnames") + } + } + + if(isTRUE(factor.check)){ + if(!is.factor(design.df[, condition])){ + n.unique <- length(unique(as.vector(design.df[, condition]))) + if(n.unique >= floor(nrow(design.df)/2.0)){ + stop("Too many levels in ", condition, ". This is not a suitable variable") + } else{ + cond.vec <- as.factor(design.df[, condition]) + } + } else{ + cond.vec <- design.df[, condition] + } + } else{ + cond.vec <- as.factor(design.df[, condition]) + } + + names(cond.vec) <- rownames(design.df) + + any_separate <- apply(nhoodCounts(x)[, names(cond.vec)], + FUN=function(NR, conditions, minval=1) { + cond.levels <- levels(conditions) + nr.tab <- unlist(by(NR, INDICES=conditions, FUN=sum, simplify=FALSE)) + any(nr.tab < minval) + }, MARGIN=1, conditions=cond.vec, minval=min.val) + return(any_separate) } + + + diff --git a/R/findNhoodGroupMarkers.R b/R/findNhoodGroupMarkers.R index 49d158d..4d207d3 100644 --- a/R/findNhoodGroupMarkers.R +++ b/R/findNhoodGroupMarkers.R @@ -52,6 +52,7 @@ #' #' #' @examples +#' NULL #' #' @name findNhoodGroupMarkers #' @export diff --git a/R/glmm.R b/R/glmm.R new file mode 100644 index 0000000..19e0539 --- /dev/null +++ b/R/glmm.R @@ -0,0 +1,630 @@ +#' Perform differential abundance testing using a NB-generalised linear mixed model +#' +#' This function will perform DA testing per-nhood using a negative binomial generalised linear mixed model +#' @param X A matrix containing the fixed effects of the model. +#' @param Z A matrix containing the random effects of the model.
+#' @param y A matrix containing the observed phenotype over each neighborhood. +#' @param offsets A vector containing the (log) offsets to apply normalisation for different numbers of cells across samples. +#' @param init.theta A column vector (m X 1 matrix) of initial estimates of fixed and random effect coefficients +#' @param Kin An n x n covariance matrix to explicitly model variation between observations +#' @param REML A logical value denoting whether REML (Restricted Maximum Likelihood) should be run. Default is TRUE. +#' @param random.levels A list describing the random effects of the model, and for each, the different unique levels. +#' @param glmm.control A list containing parameter values specifying the theta tolerance of the model, the maximum number of iterations to be run, +#' initial parameter values for the fixed (init.beta) and random effects (init.u), and glmm solver (see details). +#' @param dispersion A scalar value for the initial dispersion of the negative binomial. +#' @param geno.only A logical value that flags the model to use either just the \code{matrix} `Kin` or the supplied random effects. +#' @param solver a character value that determines which optimisation algorithm is used for the variance components. Must be either +#' HE (Haseman-Elston regression) or Fisher (Fisher scoring). +#' +#' @details +#' This function runs a negative binomial generalised linear mixed effects model. If mixed effects are detected in testNhoods, +#' this function is run to solve the model. The solver defaults to the \emph{Fisher} optimiser, and in the case of negative variance estimates +#' it will switch to the non-negative least squares (NNLS) Haseman-Elston solver. This behaviour can be pre-set by passing +#' \code{glmm.control$solver="HE"} for Haseman-Elston regression, which is the recommended solver when a covariance matrix is provided, +#' or \code{glmm.control$solver="HE-NNLS"} which is the constrained HE optimisation algorithm. +#' +#' @return A list containing the GLMM output, including inference results. The list elements are as follows: +#' \describe{ +#' \item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.} +#' \item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best +#' linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.} +#' \item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable.} +#' \item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.} +#' \item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. Is strictly <= \code{max.iter}.} +#' \item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line} +#' \item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.} +#' \item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the hessian, i.e.
the square roots of the diagonal elements.} +#' \item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.} +#' \item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.} +#' \item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.} +#' \item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance +#' component.} +#' \item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.} +#' \item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.} +#' \item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1} +#' \item{\code{VCOV:}}{\code{matrix} of the variance-covariance for all model fixed and random effect variable parameter estimates. +#' This is required to compute the degrees of freedom for the fixed effect parameter inference.} +#' \item{\code{DF:}}{\code{numeric} vector of the number of inferred degrees of freedom. For details see \link{Satterthwaite_df}.} +#' \item{\code{PVALS:}}{\code{numeric} vector of the computed p-values from a t-distribution with the inferred number of degrees of +#' freedom.} +#' \item{\code{ERROR:}}{\code{list} containing Rcpp error messages - used for internal checking.} +#' } +#' @author Mike Morgan +#' +#' @examples +#' data(sim_nbglmm) +#' random.levels <- list("RE1"=paste("RE1", levels(as.factor(sim_nbglmm$RE1)), sep="_"), +#' "RE2"=paste("RE2", levels(as.factor(sim_nbglmm$RE2)), sep="_")) +#' X <- as.matrix(data.frame("Intercept"=rep(1, nrow(sim_nbglmm)), "FE2"=as.numeric(sim_nbglmm$FE2))) +#' Z <- as.matrix(data.frame("RE1"=paste("RE1", as.numeric(sim_nbglmm$RE1), sep="_"), +#' "RE2"=paste("RE2", as.numeric(sim_nbglmm$RE2), sep="_"))) +#' y <- sim_nbglmm$Mean.Count +#' dispersion <- 0.5 +#' +#' glmm.control <- glmmControl.defaults() +#' glmm.control$theta.tol <- 1e-6 +#' glmm.control$max.iter <- 15 +#' model.list <- fitGLMM(X=X, Z=Z, y=y, offsets=rep(0, nrow(X)), random.levels=random.levels, +#' REML = TRUE, glmm.control=glmm.control, dispersion=dispersion, solver="Fisher") +#' model.list +#' +#' @name fitGLMM +#' +#' @importMethodsFrom Matrix %*% +#' @importFrom Matrix Matrix solve crossprod +#' @importFrom stats runif var +#' @export +fitGLMM <- function(X, Z, y, offsets, init.theta=NULL, Kin=NULL, + random.levels=NULL, REML=FALSE, + glmm.control=list(theta.tol=1e-6, max.iter=100, + init.sigma=NULL, init.beta=NULL, + init.u=NULL, solver=NULL), + dispersion = 1, geno.only=FALSE, + solver=NULL){ + + if(!is.null(solver)){ + glmm.control$solver <- solver + } + + if(!glmm.control$solver %in% c("HE", "Fisher", "HE-NNLS")){ + stop(glmm.control$solver, " not recognised - must be HE, HE-NNLS or Fisher") + } + + # model components + # X - fixed effects model matrix + # Z - random effects model matrix + # y - observed phenotype + + # check all dimensions conform + if(nrow(X) != nrow(Z) | nrow(X) != length(y)){ + stop("Dimensions of y, X and Z are discordant.
y: ", length(y), "x1, X:", + nrow(X), "x", ncol(X), ", Z:", nrow(Z), "x", ncol(Z)) + } + + theta.conv <- glmm.control[["theta.tol"]] # convergence for the parameters + max.hit <- glmm.control[["max.iter"]] + + # OLS for the betas is usually a good starting point for NR + if(is.null(glmm.control[["init.beta"]])){ + curr_beta <- solve((t(X) %*% X)) %*% t(X) %*% log(y + 1) + } else{ + curr_beta = matrix(glmm.control[["init.beta"]], ncol=1) + } + rownames(curr_beta) <- colnames(X) + + + if(isFALSE(geno.only) & !is.null(Kin)){ + # Kin must be nXn + if(nrow(Kin) != ncol(Kin)){ + stop("Input covariance matrix is not square: ", nrow(Kin), "x", ncol(Kin)) + } + + if(nrow(Kin) != nrow(Z)){ + stop("Input covariance matrix and Z design matrix are discordant: ", + nrow(Z), "x", ncol(Z), ", ", nrow(Kin), "x", ncol(Kin)) + } + + # create full Z with expanded random effect levels + full.Z <- initializeFullZ(Z=Z, cluster_levels=random.levels) + + # random value initiation from runif + if(is.null(glmm.control[["init.u"]])){ + curr_u <- matrix(runif(ncol(full.Z), 0, 1), ncol=1) + } else{ + curr_u <- matrix(glmm.control[["init.u"]], ncol=1) + } + rownames(curr_u) <- colnames(full.Z) + + # compute sample variances of the us + if(is.null(glmm.control[["init.sigma"]])){ + curr_sigma = Matrix(rep(var(y)/(ncol(Z) + 2), ncol(Z)), ncol=1, sparse=TRUE) # split evenly + # curr_sigma <- Matrix(runif(ncol(Z), 0, 1), ncol=1, sparse = TRUE) + } else{ + curr_sigma <- Matrix(glmm.control[["init.sigma"]], ncol=1, sparse=TRUE) + } + rownames(curr_sigma) <- colnames(Z) + + ## add the genetic components + ## augment Z with I + geno.I <- diag(nrow(full.Z)) + colnames(geno.I) <- paste0("CovarMat", seq_len(ncol(geno.I))) + full.Z <- do.call(cbind, list(full.Z, geno.I)) + + # add a genetic variance component + sigma_g <- Matrix(runif(1, 0, 1), ncol=1, nrow=1, sparse=TRUE) + rownames(sigma_g) <- "CovarMat" + curr_sigma <- do.call(rbind, list(curr_sigma, sigma_g)) + + # add genetic BLUPs + g_u <- matrix(runif(nrow(full.Z), 0, 1), ncol=1) + rownames(g_u) <- colnames(geno.I) + curr_u <- do.call(rbind, list(curr_u, g_u)) + + random.levels <- c(random.levels, list("CovarMat"=colnames(geno.I))) + + #compute variance-covariance matrix G + curr_G <- initialiseG(cluster_levels=random.levels, sigmas=curr_sigma, Kin=Kin) + } else if(isTRUE(geno.only) & !is.null(Kin)){ + # Kin must be nXn + if(nrow(Kin) != ncol(Kin)){ + stop("Input covariance matrix is not square: ", nrow(Kin), "x", ncol(Kin)) + } + + # if we only have a GRM then Z _is_ full.Z? + # full.Z <- initializeFullZ(Z, cluster_levels=random.levels) + full.Z <- Z + # should this be the matrix R? 
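+ # (note) genotype-only case: Z is used as full.Z directly, since each observation is its + # own random effect level and the covariance between observations is supplied by Kin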
+ colnames(full.Z) <- paste0(names(random.levels), seq_len(ncol(full.Z))) + + # random value initiation from runif + if(is.null(glmm.control[["init.u"]])){ + curr_u <- matrix(runif(ncol(full.Z), 0, 1), ncol=1) + } else{ + curr_u <- matrix(glmm.control[["init.u"]], ncol=1) + } + rownames(curr_u) <- colnames(full.Z) + + # compute sample variances of the us + if(is.null(glmm.control[["init.sigma"]])){ + curr_sigma = Matrix(rep(var(y)/(ncol(Z) + 2), length(random.levels)), ncol=1, sparse=TRUE) # split evenly + # curr_sigma <- Matrix(runif(length(random.levels), 0, 1), ncol=1, sparse = TRUE) + } else{ + curr_sigma <- Matrix(glmm.control[["init.sigma"]], ncol=1, sparse=TRUE) + } + + rownames(curr_sigma) <- names(random.levels) + + #compute variance-covariance matrix G + curr_G <- initialiseG(cluster_levels=random.levels, sigmas=curr_sigma, Kin=Kin) + } else if(is.null(Kin)){ + # create full Z with expanded random effect levels + full.Z <- initializeFullZ(Z=Z, cluster_levels=random.levels) + + # random value initiation from runif + if(is.null(glmm.control[["init.u"]])){ + curr_u <- matrix(runif(ncol(full.Z), 0, 1), ncol=1) + } else{ + curr_u <- matrix(glmm.control[["init.u"]], ncol=1) + } + rownames(curr_u) <- colnames(full.Z) + + # compute sample variances of the us + if(is.null(glmm.control[["init.sigma"]])){ + # random sigma or split variance across them? + curr_sigma = Matrix(rep(var(y)/(ncol(Z) + 2), ncol(Z)), ncol=1, sparse=TRUE) # split evenly + # curr_sigma <- Matrix(runif(ncol(Z), 0, 1), ncol=1, sparse = TRUE) + } else{ + curr_sigma <- Matrix(glmm.control[["init.sigma"]], ncol=1, sparse=TRUE) + } + rownames(curr_sigma) <- colnames(Z) + + #compute variance-covariance matrix G + curr_G <- initialiseG(cluster_levels=random.levels, sigmas=curr_sigma) + } + + # create a single variable for the thetas + curr_theta <- do.call(rbind, list(curr_beta, curr_u)) + if(nrow(curr_theta) != sum(c(ncol(X), ncol(full.Z)))){ + stop("Number of parameters does not match columns of X and Z") + } + + #compute mu.vec using inverse link function + mu.vec <- exp(offsets + (X %*% curr_beta) + (full.Z %*% curr_u)) + if(any(is.infinite(mu.vec))){ + stop("Infinite values in initial estimates - reconsider model") + } + + if(isTRUE(any(is.na(mu.vec[, 1])))){ + if(isTRUE(any(is.na(offsets)))){ + stop("NA values in offsets - remove these samples before re-running model") + } else{ + stop("NA values in initial estimates - remove these samples before re-running model") + } + } + + # be careful here as the colnames of full.Z might match multiple RE levels <- big source of bugs!!!
+ u_indices <- sapply(seq_along(names(random.levels)), + FUN=function(RX) { + which(colnames(full.Z) %in% random.levels[[RX]]) + }, simplify=FALSE) + + if(sum(unlist(lapply(u_indices, length))) != ncol(full.Z)){ + stop("Non-unique column names in Z - please ensure these are unique") + } + + # flatten column matrices to vectors + mu.vec <- mu.vec[, 1] + curr_beta <- curr_beta[, 1] + + curr_sigma <- curr_sigma[, 1] + curr_u <- curr_u[, 1] + curr_theta <- curr_theta[, 1] + + if(is.null(Kin)){ + final.list <- tryCatch(fitPLGlmm(Z=full.Z, X=X, muvec=mu.vec, offsets=offsets, curr_beta=curr_beta, + curr_theta=curr_theta, curr_u=curr_u, curr_sigma=curr_sigma, + curr_G=as.matrix(curr_G), y=y, u_indices=u_indices, theta_conv=theta.conv, rlevels=random.levels, + curr_disp=dispersion, REML=REML, maxit=max.hit, solver=glmm.control$solver, vardist="NB"), + error=function(err){ + return(list("FE"=NA, "RE"=NA, "Sigma"=NA, + "converged"=FALSE, "Iters"=NA, "Dispersion"=NA, + "Hessian"=NA, "SE"=NA, "t"=NA, "PSVAR"=NA, + "COEFF"=NA, "P"=NA, "Vpartial"=NA, "Ginv"=NA, + "Vsinv"=NA, "Winv"=NA, "VCOV"=NA, "LOGLIHOOD"=NA, + "DF"=NA, "PVALS"=NA, + "ERROR"=err)) + }) + } else{ + final.list <- tryCatch(fitGeneticPLGlmm(Z=full.Z, X=X, K=as.matrix(Kin), offsets=offsets, + muvec=mu.vec, curr_beta=curr_beta, + curr_theta=curr_theta, curr_u=curr_u, curr_sigma=curr_sigma, + curr_G=curr_G, y=y, u_indices=u_indices, theta_conv=theta.conv, rlevels=random.levels, + curr_disp=dispersion, REML=REML, maxit=max.hit, solver=glmm.control$solver, vardist="NB"), + error=function(err){ + return(list("FE"=NA, "RE"=NA, "Sigma"=NA, + "converged"=FALSE, "Iters"=NA, "Dispersion"=NA, + "Hessian"=NA, "SE"=NA, "t"=NA, "PSVAR"=NA, + "COEFF"=NA, "P"=NA, "Vpartial"=NA, "Ginv"=NA, + "Vsinv"=NA, "Winv"=NA, "VCOV"=NA, "LOGLIHOOD"=NA, + "DF"=NA, "PVALS"=NA, + "ERROR"=err)) + }) + } + + if(!all(is.na(unlist(final.list[c(1:3)])))){ + # compute Z scores, DF and P-values + mint <- length(curr_beta) + cint <- length(curr_u) + + dfs <- Satterthwaite_df(final.list[["COEFF"]], mint, cint, final.list[["SE"]], final.list[["Sigma"]], final.list[["FE"]], + final.list[["Vpartial"]], final.list[["VCOV"]], final.list[["Ginv"]], random.levels) + pvals <- computePvalue(final.list[["t"]], dfs) + + if(any(is.infinite(pvals))){ + warning("Setting infinite p-values to NA") + pvals[is.infinite(pvals)] <- NA + } + + final.list[["DF"]] <- dfs + final.list[["PVALS"]] <- pvals + } + + + # final checks + inf.params <- is.infinite(c(final.list[["Sigma"]], final.list[["FE"]], final.list[["RE"]])) + if(sum(inf.params) > 0){ + stop("Infinite parameter estimates - reconsider model or increase sample size") + } + + return(final.list) +} + + +#' Construct the initial G matrix +#' +#' This function maps the variance estimates onto the full \code{c x q} levels for each random effect. This +#' ensures that the matrices commute in the NB-GLMM solver. This function is included for reference, and +#' should not be used directly. +#' @param cluster_levels A \code{list} containing the random effect levels for each variable +#' @param sigmas A \code{matrix} of c X 1, i.e. a column vector, containing the variance component estimates +#' @param Kin A \code{matrix} containing a user-supplied covariance matrix +#' +#' @details Broadcast the variance component estimates to the full \code{c\*q x c\*q} matrix. +#' +#' @return \code{matrix} of the full broadcast variance component estimates.
+#' @author Mike Morgan & Alice Kluzer +#' +#' @examples +#' data(sim_nbglmm) +#' random.levels <- list("RE1"=paste("RE1", levels(as.factor(sim_nbglmm$RE1)), sep="_"), +#' "RE2"=paste("RE2", levels(as.factor(sim_nbglmm$RE2)), sep="_")) +#' rand.sigma <- matrix(runif(2), ncol=1) +#' rownames(rand.sigma) <- names(random.levels) +#' big.G <- initialiseG(random.levels, rand.sigma) +#' dim(big.G) +#' +#' @importFrom Matrix sparseMatrix diag +#' @export +initialiseG <- function(cluster_levels, sigmas, Kin=NULL){ + # construct the correct size of G given the random effects and variance components + # names of cluster_levels and columns of Z must match + # the independent sigmas go on the diagonal and the off-diagonal are the crossed/interactions + # sigmas must be named + sum.levels <- sum(unlist(lapply(cluster_levels, length))) + G <- sparseMatrix(i=sum.levels, j=sum.levels, repr="C", x=0L) + dimnames(G) <- list(unlist(cluster_levels), unlist(cluster_levels)) + i <- j <- 1 + + for(x in seq_len(nrow(sigmas))){ + x.q <- length(cluster_levels[[rownames(sigmas)[x]]]) + if(is.null(Kin)){ + diag(G[c(i:(i+x.q-1)), c(i:(i+x.q-1)), drop=FALSE]) <- sigmas[x, ] # is this sufficient to transform the sigma to the model scale? + } else{ + if(rownames(sigmas[x, , drop=FALSE]) %in% c("Genetic")){ + diag(G[c(i:(i+x.q-1)), c(i:(i+x.q-1)), drop=FALSE]) <- sigmas[x, ] * Kin + }else{ + diag(G[c(i:(i+x.q-1)), c(i:(i+x.q-1)), drop=FALSE]) <- sigmas[x, ] # is this sufficient to transform the sigma to the model scale? + } + } + + i <- j <- i+x.q + } + return(as.matrix(G)) +} + + +#' Construct the full Z matrix +#' +#' Using a simplified version of the \code{n x c} Z matrix, with one column per variable, construct the fully broadcast +#' \code{n x (c*q)} binary matrix that maps each individual onto the random effect variable levels. It is not intended +#' for this function to be called by the user directly, but it can be useful to debug mappings between random effect +#' levels and input variables. +#' @param Z An \code{n x c} matrix containing the numeric or character levels +#' @param cluster_levels A \code{list} that maps the column names of Z onto the individual levels +#' @param stand.cols A logical scalar that determines if Z* should be computed, which is the row-centered and +#' scaled version of the full Z matrix +#' +#' @details +#' To make sure that matrices commute it is necessary to construct the full \code{n x c*q} matrix. This is a binary +#' matrix where each level of each random effect occupies a column, and the samples/observations are mapped onto +#' the correct levels based on the input Z. +#' +#' @return \code{matrix} Fully broadcast Z matrix with one column per random effect level for all random effect variables +#' in the model.
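+#' +#' As a minimal sketch with toy data, each character column of Z is expanded into one binary +#' column per level: +#' \preformatted{ +#' Z <- cbind("RE1"=c("a", "b", "a")) +#' initializeFullZ(Z, list("RE1"=c("a", "b"))) +#' # rows map onto levels: (1, 0), (0, 1), (1, 0) +#' }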
+#' @author Mike Morgan & Alice Kluzer +#' +#' @examples +#' data(sim_nbglmm) +#' random.levels <- list("RE1"=paste("RE1", levels(as.factor(sim_nbglmm$RE1)), sep="_"), +#' "RE2"=paste("RE2", levels(as.factor(sim_nbglmm$RE2)), sep="_")) +#' Z <- as.matrix(data.frame("RE1"=paste("RE1", as.numeric(sim_nbglmm$RE1), sep="_"), +#' "RE2"=paste("RE2", as.numeric(sim_nbglmm$RE2), sep="_"))) +#' fullZ <- initializeFullZ(Z, random.levels) +#' dim(Z) +#' dim(fullZ) +#' +#' @importMethodsFrom Matrix %*% +#' @importFrom Matrix Matrix diag +#' @export +initializeFullZ <- function(Z, cluster_levels, stand.cols=FALSE){ + + # construct the full Z with all random effect levels + n.cols <- ncol(Z) + z.names <- colnames(Z) + if(is.null(z.names)){ + stop("Columns of Z must have valid names") + } + + # check that all of the levels are present in random.levels AND the + # entries of Z + all.present <- unlist(sapply(seq_along(cluster_levels), FUN=function(PX){ + if(is.numeric(unique(Z[, PX])) & ncol(Z) != nrow(Z)){ + all(cluster_levels[[PX]] %in% paste0(names(cluster_levels)[PX], unique(Z[, PX]))) + } else if(ncol(Z) == nrow(Z)){ + all(cluster_levels[[PX]] %in% colnames(Z)) + } else { + all(cluster_levels[[PX]] %in% unique(Z[, PX])) + } + }, simplify=FALSE)) + + + if(!all(all.present)){ + stop("Columns of Z are discordant with input random effect levels") + } + + col.classes <- apply(Z, 2, class) + i.z.list <- list() + for(i in seq_len(n.cols)){ + i.class <- col.classes[i] + if(i.class %in% c("factor")){ # treat as factors + i.levels <- levels(Z[, i, drop=FALSE]) + i.levels <- as.factor(paste(sort(as.integer(i.levels)))) + i.z <- sapply(i.levels, FUN=function(X) (Z[, i] == X) + 0, simplify=TRUE) + } else if(i.class %in% c("character")){ + i.levels <- as.factor(unique(Z[, i, drop=FALSE])) # ordering is arbitrary + i.z <- sapply(i.levels, FUN=function(X) (Z[, i] == X) + 0, simplify=TRUE) + } else if(i.class %in% c("numeric")){ # split into unique levels if integer levels + i.mod <- all(Z[, i, drop=FALSE] %% 1 == 0) + if(isTRUE(i.mod)){ + i.levels <- unique(Z[, i]) + i.levels <- as.factor(paste(sort(as.integer(i.levels)))) + i.z <- sapply(i.levels, FUN=function(X) (Z[, i] == X) + 0, simplify=TRUE) + } else{ + i.z <- Z[, i, drop=FALSE] # if float then treat as continuous + } + } else if(i.class %in% c("integer")){ + i.levels <- unique(Z[, i]) + i.levels <- as.factor(paste(sort(as.integer(i.levels)))) + i.z <- sapply(i.levels, FUN=function(X) (Z[, i] == X) + 0, simplify=TRUE) + } + + colnames(i.z) <- cluster_levels[[colnames(Z)[i]]] + + # to standardise or not? + if(isTRUE(stand.cols)){ + q <- ncol(i.z) + i.ident <- diag(1L, nrow=nrow(i.z), ncol=nrow(i.z)) + i.star <- i.z - ((i.ident %*% i.z)/q) + i.z <- i.star + } + + i.z.list[[colnames(Z)[i]]] <- i.z + } + full.Z <- do.call(cbind, i.z.list) + return(full.Z) +} + + +#' Compute the trace of a matrix +#' +#' Exactly what it says on the tin - compute the sum of the matrix diagonal +#' @param x A \code{matrix} +#' +#' @details It computes the matrix trace of a square matrix. +#' +#' @return \code{numeric} scalar of the matrix trace. +#' @author Mike Morgan +#' +#' @examples +#' matrix.trace(matrix(runif(9), ncol=3, nrow=3)) +#' +#' @importFrom Matrix diag +#' @export +matrix.trace <- function(x){ + # check is square matrix first + x.dims <- dim(x) + if(x.dims[1] != x.dims[2]){ + stop("matrix is not square") + } else{ + return(sum(diag(x))) + } +} + +#' glmm control default values +#' +#' This will give the default values for the GLMM solver +#' @param ... 
see \code{fitGLMM} for details +# +#' @details The default value for the parameter estimation convergence is 1e-6, and the +#' maximum number of iterations is 100. In practice, if the solver converges it generally does +#' so fairly quickly on moderately well conditioned problems. The default solver is Fisher +#' scoring, but this will switch (with a warning produced) to the NNLS Haseman-Elston solver +#' if negative variance estimates are found. +#' +#' @return \code{list} containing the default values for the GLMM solver. This can be saved in the +#' user environment and then passed to \link{testNhoods} directly to modify the convergence +#' criteria of the solver that is used. +#' \describe{ +#' \item{\code{theta.tol:}}{\code{numeric} scalar that sets the convergence threshold for the +#' parameter inference - this is applied globally to fixed and random effect parameters, and +#' to the variance estimates.} +#' \item{\code{max.iter:}}{\code{numeric} scalar that sets the maximum number of iterations that +#' the NB-GLMM will run for.} +#' \item{\code{solver:}}{\code{character} scalar that sets the solver to use. Valid values are +#' \emph{Fisher}, \emph{HE} or \emph{HE-NNLS}. See \link{fitGLMM} for details.} +#' } +#' @author Mike Morgan +#' @examples +#' mmcontrol <- glmmControl.defaults() +#' mmcontrol +#' mmcontrol$solver <- "HE-NNLS" +#' mmcontrol +#' +#' @export +glmmControl.defaults <- function(...){ + # return the default glmm control values + return(list(theta.tol=1e-6, max.iter=100, solver='Fisher')) +} + + +#' Compute the p-value for the fixed effect parameters +#' +#' Based on the asymptotic t-distribution, compute the 2-tailed p-value that the estimate != 0. This +#' function is not intended to be used directly, but is included for reference or if an alternative +#' estimate of the degrees of freedom is available. +#' @param Zscore A numeric vector containing the Z scores for each fixed effect parameter +#' @param df A numeric vector containing the estimated degrees of freedom for each fixed effect +#' parameter +#' +#' @details Based on sampling from a 2-tailed t-distribution with \code{df} degrees of freedom, +#' compute the probability that the calculated \code{Zscore} is greater than or equal to what would be +#' expected from random chance.
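+#' +#' A minimal sketch with arbitrary t-scores and degrees of freedom: +#' \preformatted{ +#' computePvalue(Zscore=c(2.1, -0.5), df=c(8.4, 11.2)) +#' # identical to 2 * pt(abs(c(2.1, -0.5)), c(8.4, 11.2), lower.tail=FALSE) +#' }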
+#' @return Numeric vector of p-values, 1 per fixed effect parameter +#' @author Mike Morgan & Alice Kluzer +#' @examples +#' NULL +#' +#' @importFrom stats pt +#' @export +computePvalue <- function(Zscore, df) { + pval <- 2*(pt(abs(Zscore), df, lower.tail=FALSE)) + return(pval) +} + + +#' @importMethodsFrom Matrix %*% t +#' @importFrom Matrix solve diag +###---- first calculate g = derivative of C with respect to sigma ---- +function_jac <- function(x, coeff.mat, mint, cint, G_inv, random.levels) { + UpperLeft <- coeff.mat[c(1:mint), c(1:mint)] + UpperRight <- coeff.mat[c(1:mint), c((mint+1):(mint+cint))] + LowerLeft <- coeff.mat[c((mint+1):(mint+cint)), c(1:mint)] + LowerRight <- coeff.mat[c((mint+1):(mint+cint)), c((mint+1):(mint+cint))] - G_inv + + n <- length(random.levels) + diag(LowerRight) <- diag(LowerRight) + rep(1/x, times=lengths(random.levels)) #when extending to random slopes, this needs to be changed to a matrix and added to LowerRight directly + C <- solve(UpperLeft - UpperRight %*% solve(LowerRight) %*% LowerLeft) +} + + +#' Compute degrees of freedom using Satterthwaite method +#' +#' This function is not intended to be called by the user, and is included for reference +#' @param coeff.mat A \code{matrix} class object containing the coefficient matrix from the mixed model equations +#' @param mint A numeric scalar of the number of fixed effect variables in the model +#' @param cint A numeric scalar of the number of random effect variables in the model +#' @param SE A \code{1 x mint} \code{matrix}, i.e. column vector, containing the standard errors of the fixed effect +#' parameter estimates +#' @param curr_sigma A \code{1 x cint matrix}, i.e. column vector, of the variance component parameter estimates +#' @param curr_beta A \code{1 x mint matrix}, i.e. column vector, of the fixed effect parameter estimates +#' @param V_partial A \code{list} of the partial derivatives for each fixed and random effect variable in the model +#' @param V_a A \code{c+m x c+m} variance-covariance matrix of the fixed and random effect variable parameter estimates +#' @param G_inv A \code{nxc X nxc} inverse matrix containing the variance component estimates +#' @param random.levels A \code{list} containing the mapping between the random effect variables and each respective set +#' of levels for said variable. +#' +#' @details The Satterthwaite degrees of freedom are computed, which estimate the number of degrees of freedom in the +#' NB-GLMM based on the ratio of the squared standard errors and the product of the Jacobians of the variance-covariance matrix +#' from the fixed effect variable parameter estimation with the full variance-covariance matrix. For more details see +#' Satterthwaite FE, Biometrics Bulletin (1946) Vol 2 No 6, pp110-114. +#' +#' @return \code{matrix} containing the inferred number of degrees of freedom for the specific model.
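+#' +#' In the implementation below, for each fixed effect \eqn{i} this reduces to \eqn{2 SE_i^2 / (g_i' V_a g_i)}, +#' where \eqn{g_i} stacks the derivatives of the variance of \eqn{\beta_i} with respect to each variance +#' component. A sketch of one pass of the per-parameter loop, with toy numbers: +#' \preformatted{ +#' SE_i <- 0.31; g_i <- matrix(c(0.02, 0.01), ncol=1); V_a <- diag(2) * 0.5 +#' (2 * SE_i^2) / (t(g_i) %*% V_a %*% g_i) +#' }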
+#' @author Mike Morgan & Alice Kluzer +#' @examples +#' NULL +#' +#' @importMethodsFrom Matrix %*% t +#' @importFrom Matrix solve diag +#' @importFrom numDeriv jacobian +#' @export +Satterthwaite_df <- function(coeff.mat, mint, cint, SE, curr_sigma, curr_beta, V_partial, V_a, G_inv, random.levels) { + + if(any(class(curr_sigma) %in% c("Matrix", "matrix", "dgeMatrix", "dgCMatrix"))){ + curr_sigma <- as.vector(curr_sigma) + } else{ + stop("Class of curr_sigma not recognised") + } + + jac <- jacobian(func=function_jac, x=curr_sigma, coeff.mat=coeff.mat, mint=mint, cint=cint, G_inv=G_inv, random.levels=random.levels) + jac_list <- lapply(1:ncol(jac), function(i) + array(jac[, i], dim=rep(length(curr_beta), 2))) #when extending to random slopes, this would have to be reformatted into list, where each element belongs to one random effect + + # V_a is provided externally + df <- rep(NA, length(curr_beta)) + for (i in 1:length(curr_beta)) { + jac_var_beta <- matrix(unlist(lapply(lapply(jac_list, diag), `[[`, i)), ncol=1) # could this be done with AD? + denom <- t(jac_var_beta) %*% (V_a) %*% jac_var_beta #g' Va g + df[i] <- (2*(SE[i]^2))/denom + } + return(as.matrix(df)) +} diff --git a/R/graphSpatialFDR.R b/R/graphSpatialFDR.R index 5a1204a..e128016 100644 --- a/R/graphSpatialFDR.R +++ b/R/graphSpatialFDR.R @@ -29,7 +29,7 @@ #' defined. k-distance uses the distance to the kth nearest neighbour #' of the index vertex, neighbour-distance uses the average within-neighbourhood #' Euclidean distance in reduced dimensional space, max uses the largest within-neighbourhood distance -#' from the index vertex, and graph-overlap uses the total number of cells overlapping between +#' from the index vertex, and graph-overlap uses the total number of cells overlapping between #' neighborhoods (distance-independent measure). The frequency-weighted version of the #' BH method is then applied to the p-values, as in \code{cydar}. #' @@ -56,7 +56,6 @@ graphSpatialFDR <- function(x.nhoods, graph, pvalues, k=NULL, weighting='k-dista haspval <- !is.na(pvalues) if (!all(haspval)) { - coords <- coords[haspval, , drop=FALSE] pvalues <- pvalues[haspval] } @@ -140,7 +139,7 @@ graphSpatialFDR <- function(x.nhoods, graph, pvalues, k=NULL, weighting='k-dista } else{ stop("k-distance weighting requires either a distance matrix or reduced dimensions.") } - + } else if(weighting == "graph-overlap"){ # no distance matrix is required here # compute overlap between neighborhoods @@ -151,7 +150,7 @@ graphSpatialFDR <- function(x.nhoods, graph, pvalues, k=NULL, weighting='k-dista } else{ stop("No neighborhoods found - please run makeNhoods first") } - + } else{ stop("Weighting option not recognised - must be either k-distance, neighbour-distance, max or graph-overlap") } diff --git a/R/makeNhoods.R b/R/makeNhoods.R index e5b8c78..5f351d2 100644 --- a/R/makeNhoods.R +++ b/R/makeNhoods.R @@ -57,10 +57,10 @@ makeNhoods <- function(x, prop=0.1, k=21, d=30, refined=TRUE, reduced_dims="PCA" if(is(x, "Milo")){ message("Checking valid object") # check that a graph has been built - if(!.valid_graph(graph(x))){ + if(!.valid_graph(miloR::graph(x))){ stop("Not a valid Milo object - graph is missing. 
Please run buildGraph() first.") } - graph <- graph(x) + x.graph <- miloR::graph(x) if(isTRUE(refined) & refinement_scheme == "reduced_dim"){ X_reduced_dims <- reducedDim(x, reduced_dims) @@ -76,13 +76,14 @@ makeNhoods <- function(x, prop=0.1, k=21, d=30, refined=TRUE, reduced_dims="PCA" stop("Rownames of reduced dimensions do not match cell IDs") } } + } else if(is(x, "igraph")){ if(isTRUE(refined) & refinement_scheme == "reduced_dim" & !is.matrix(reduced_dims)) { stop("No reduced dimensions matrix provided - required for refined sampling with refinement_scheme = reduced_dim.") } - graph <- x + x.graph <- x if(isTRUE(refined) & refinement_scheme == "reduced_dim"){ X_reduced_dims <- reduced_dims @@ -100,7 +101,7 @@ makeNhoods <- function(x, prop=0.1, k=21, d=30, refined=TRUE, reduced_dims="PCA" stop("Data format: ", class(x), " not recognised. Should be Milo or igraph.") } - random_vertices <- .sample_vertices(graph, prop, return.vertices = TRUE) + random_vertices <- .sample_vertices(x.graph, prop, return.vertices = TRUE) if (isFALSE(refined)) { sampled_vertices <- random_vertices @@ -108,7 +109,7 @@ makeNhoods <- function(x, prop=0.1, k=21, d=30, refined=TRUE, reduced_dims="PCA" if(refinement_scheme == "reduced_dim"){ sampled_vertices <- .refined_sampling(random_vertices, X_reduced_dims, k) } else if (refinement_scheme == "graph") { - sampled_vertices <- .graph_refined_sampling(random_vertices, graph) + sampled_vertices <- .graph_refined_sampling(random_vertices, x.graph) } else { stop("When refined == TRUE, refinement_scheme must be one of \"reduced_dim\" or \"graph\".") } @@ -125,7 +126,7 @@ makeNhoods <- function(x, prop=0.1, k=21, d=30, refined=TRUE, reduced_dims="PCA" } # Is there an alternative to using a for loop to populate the sparseMatrix here? # if vertex names are set (as can happen with graphs from 3rd party tools), then set rownames of nh_mat - v.class <- V(graph)$name + v.class <- V(x.graph)$name if(is(x, "Milo")){ rownames(nh_mat) <- colnames(x) @@ -133,12 +134,12 @@ makeNhoods <- function(x, prop=0.1, k=21, d=30, refined=TRUE, reduced_dims="PCA" if(is.null(v.class) & refinement_scheme == "reduced_dim"){ rownames(nh_mat) <- rownames(X_reduced_dims) } else if(!is.null(v.class)){ - rownames(nh_mat) <- V(graph)$name + rownames(nh_mat) <- V(x.graph)$name } } for (X in seq_len(length(sampled_vertices))){ - nh_mat[unlist(neighborhood(graph, order = 1, nodes = sampled_vertices[X])), X] <- 1 #changed to include index cells + nh_mat[unlist(neighborhood(x.graph, order = 1, nodes = sampled_vertices[X])), X] <- 1 #changed to include index cells } # need to add the index cells. @@ -152,6 +153,7 @@ makeNhoods <- function(x, prop=0.1, k=21, d=30, refined=TRUE, reduced_dims="PCA" } } + #' @importFrom BiocNeighbors findKNN #' @importFrom matrixStats colMedians .refined_sampling <- function(random_vertices, X_reduced_dims, k){ diff --git a/R/methods.R b/R/methods.R index a753748..8497448 100644 --- a/R/methods.R +++ b/R/methods.R @@ -1,6 +1,5 @@ ######## getter/setter Methods ######## -#' @title -#' Get and set methods for Milo objects +#' @title Get and set methods for Milo objects #' #' @description #' Get and set methods for Milo object slots. 
Generally speaking these methods
@@ -64,6 +63,8 @@
 #' \item{\code{show(x)}:}{Prints information to the console regarding the \code{\linkS4class{Milo}} object.}
 #' }
 #'
+#' @return See individual methods for return values
+#'
 #' @author Mike Morgan
 #'
 #' @name Milo-methods
diff --git a/R/milo.R b/R/milo.R
index feb9b72..738c532 100644
--- a/R/milo.R
+++ b/R/milo.R
@@ -1,5 +1,7 @@
+#' @title
 #' The Milo constructor
 #'
+#' @description
 #' The Milo class extends the SingleCellExperiment class and is designed to
 #' work with neighbourhoods of cells. Therefore, it inherits from the
 #' \linkS4class{SingleCellExperiment} class and follows the same usage
@@ -33,7 +35,7 @@
 #' expression matrix. In this case it simply constructs a SingleCellExperiment
 #' and fills the relevant slots, such as reducedDims.
 #'
-#' @return a Milo object
+#' @returns a Milo object
 #'
 #' @author Mike Morgan
 #'
@@ -118,15 +120,14 @@ Milo <- function(...,
                  graph=list(),
                  nhoodDistances=Matrix(0L, sparse=TRUE),
                  nhoodIndex=list(),
                  nhoodExpression=Matrix(0L, sparse=TRUE),
                  .k=NULL)
-
-    reducedDims(out) <- reducedDims(sce)
+
+    reducedDims(out) <- SimpleList()
     altExps(out) <- list()
-
+
     if (objectVersion(out) >= "1.11.3"){
         colPairs(out) <- SimpleList()
         rowPairs(out) <- SimpleList()
     }
-
     out
 }
diff --git a/R/miloR-package.R b/R/miloR-package.R
index c7b4fff..2b2a608 100644
--- a/R/miloR-package.R
+++ b/R/miloR-package.R
@@ -8,4 +8,6 @@
 #' @author Mike Morgan & Emma Dann
 #'
 #' @name miloR-package
+#'
+#' @returns The miloR package
 NULL
diff --git a/R/miloR.R b/R/miloR.R
new file mode 100644
index 0000000..dfc826b
--- /dev/null
+++ b/R/miloR.R
@@ -0,0 +1,11 @@
+#' miloR
+#'
+#' Milo performs single-cell differential abundance testing. Cell states are modelled
+#' as representative neighbourhoods on a nearest neighbour graph. Hypothesis testing is performed using a
+#' negative binomial generalized linear model.
+#'
+#' @docType package
+#' @importFrom Rcpp evalCpp
+#' @useDynLib miloR
+#' @name miloR
+NULL
diff --git a/R/plotNhoods.R b/R/plotNhoods.R
index 25faead..998211a 100644
--- a/R/plotNhoods.R
+++ b/R/plotNhoods.R
@@ -146,6 +146,7 @@ plotNhoodGraph <- function(x, layout="UMAP", colour_by=NA, subset.nhoods=NULL, s
     } else{
         col_vals <- colData(x)[as.numeric(vertex_attr(nh_graph)$name), colour_by]
     }
+
     if(!is.factor(col_vals)){
         if(!is.numeric(col_vals)) {
             col_vals <- as.character(col_vals)
diff --git a/R/sim_family.R b/R/sim_family.R
new file mode 100644
index 0000000..aa2e51c
--- /dev/null
+++ b/R/sim_family.R
@@ -0,0 +1,35 @@
+#' sim_family
+#'
+#' Simulated counts data from a series of simulated family trees
+#'
+#' Data are simulated counts from 30 families and include X and Z design matrices,
+#' as well as a single large kinship matrix. Kinships between family members are
+#' dictated by the simulated family, i.e. sibs=0.5, parent-sib=0.5, sib-grandparent=0.25, etc.
+#' These kinships, along with 2 other random effects, are used to induce a defined covariance
+#' between simulated observations as such:
+#'
+#' Z:= random effect design matrix, n X q
+#' G:= matrix of variance components, including kinship matrix
+#'
+#' L := Chol(ZGZ^T), i.e. LL^T = ZGZ^T, the Cholesky factor of the random effect contribution
+#' to the sample covariance
+#' Ysim:= simulated means based on exp(offset + Xbeta + Zb)
+#' Y = LYsim := simulated means with defined covariance
+#'
+#'
+#' @docType data
+#' @usage data(sim_family)
+#'
+#' @format A list containing a \code{data.frame} in the "DF" slot containing the
+#' mean counts and meta-data, and a \code{matrix} containing the kinship matrix
+#' across all families in the "IBD" slot.
+#'
+#' @keywords datasets
+#'
+#'
+#' @examples
+#' NULL
+#'
+#' @name sim_family
+#' @aliases sim_family
+NULL
diff --git a/R/sim_nbglmm.R b/R/sim_nbglmm.R
new file mode 100644
index 0000000..cf6dee6
--- /dev/null
+++ b/R/sim_nbglmm.R
@@ -0,0 +1,37 @@
+#' sim_nbglmm
+#'
+#' Simulated counts data from a NB-GLMM for a single trait
+#'
+#' Data are simulated counts from 50 samples in a single data frame, from which the
+#' X and Z design matrices can be constructed (see examples). There are 2 random effects and 2 fixed
+#' effect variables used to simulate the count trait.
+#'
+#'
+#' @docType data
+#' @usage data(sim_nbglmm)
+#'
+#' @format A \code{data.frame} \emph{sim_nbglmm} containing the following columns:
+#' \describe{
+#' \item{\code{Mean:}}{\code{numeric} containing the base mean computed as the linear combination of the
+#' simulated fixed and random effect weights multiplied by their respective weight matrices.}
+#' \item{\code{Mean.Count:}}{\code{numeric} containing the integer count values randomly sampled from a negative
+#' binomial distribution with mean = \emph{Mean} and dispersion = \emph{r}}
+#' \item{\code{r:}}{\code{numeric} containing the dispersion value used to simulate the integer counts in
+#' \emph{Mean.Count}.}
+#' \item{\code{Intercept:}}{\code{numeric} of all 1s which can be used to set the intercept term in the X design
+#' matrix.}
+#' \item{\code{FE1:}}{\code{numeric} a binary fixed effect variable taking on values [0, 1]}
+#' \item{\code{FE2:}}{\code{numeric} a continuous fixed effect variable}
+#' \item{\code{RE1:}}{\code{numeric} a random effect variable with 10 levels}
+#' \item{\code{RE2:}}{\code{numeric} a random effect variable with 7 levels}
+#' }
+#'
+#' @keywords datasets
+#'
+#' @examples
+#' data(sim_nbglmm)
+#' head(sim_nbglmm)
+#'
+#' @name sim_nbglmm
+#' @aliases sim_nbglmm
+NULL
diff --git a/R/testNhoods.R b/R/testNhoods.R
index f332ffb..012f8d5 100644
--- a/R/testNhoods.R
+++ b/R/testNhoods.R
@@ -11,10 +11,15 @@
 #' argument
 #' @param design.df A \code{data.frame} containing meta-data to which \code{design}
 #' refers to
+#' @param kinship (optional) An n X n \code{matrix} containing pair-wise relationships between
+#' observations, such as expected relationships or computed from SNPs/SNVs/other genetic variants.
+#' Row names and column names should correspond to the column names of \code{nhoods(x)} and rownames
+#' of \code{design.df}.
 #' @param min.mean A scalar used to threshold neighbourhoods on the minimum
 #' average cell counts across samples.
 #' @param model.contrasts A string vector that defines the contrasts used to perform
-#' DA testing.
+#' DA testing. For a specific comparison we recommend a single contrast be passed to
+#' \code{testNhoods}.
+#' More details can be found in the vignette \code{milo_contrasts}.
 #' @param fdr.weighting The spatial FDR weighting scheme to use. Choice from max,
 #' neighbour-distance, graph-overlap or k-distance (default). If \code{none} is passed no
 #' spatial FDR correction is performed and returns a vector of NAs.
@@ -28,9 +33,31 @@
 #' method by Anders & Huber, 2010, to compute normalisation factors relative to a reference computed from
 #' the geometric mean across samples. The latter methods provides a degree of robustness against false positives
 #' when there are very large compositional differences between samples.
+#' @param cell.sizes A named numeric vector of cell numbers per experimental sample. Names should correspond
+#' to the columns of \code{nhoodCounts}. This can be used to define the model normalisation factors based on
+#' a set of numbers instead of the \code{colSums(nhoodCounts(x))}. The example use-case is when performing an
+#' analysis of a subset of nhoods while retaining the need to normalise based on the numbers of cells
+#' collected for each experimental sample to avoid compositional biases. Infinite or NA values will give an error.
 #' @param reduced.dim A character scalar referring to the reduced dimensional slot used to compute distances for
 #' the spatial FDR. This should be the same as used for graph building.
-#'
+#' @param REML A logical scalar that controls the variance component behaviour to use either restricted maximum
+#' likelihood (REML) or maximum likelihood (ML). The former is recommended to account for the bias in the ML
+#' variance estimates.
+#' @param glmm.solver A character scalar that determines which GLMM solver is applied. Must be one of: Fisher, HE
+#' or HE-NNLS. HE or HE-NNLS are recommended when supplying a user-defined covariance matrix.
+#' @param max.iters A scalar that determines the maximum number of iterations to run the GLMM solver if it does
+#' not reach the convergence tolerance threshold.
+#' @param max.tol A scalar that determines the GLMM solver convergence tolerance. It is recommended to keep
+#' this number small to provide some confidence that the parameter estimates are at least in a feasible region
+#' and close to a \emph{local} optimum.
+#' @param subset.nhoods A character, numeric or logical vector that will subset the analysis to the specific nhoods. If
+#' a character vector these should correspond to row names of \code{nhoodCounts}. If a logical vector then
+#' these should have the same \code{length} as \code{nrow} of \code{nhoodCounts}. If numeric, then these are assumed
+#' to correspond to indices of \code{nhoodCounts} - if the maximal index is greater than \code{nrow(nhoodCounts(x))}
+#' an error will be produced.
+#' @param fail.on.error A logical scalar that determines the behaviour of the error reporting. Used for debugging only.
+#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the arguments for parallelisation. By default
+#' this will evaluate using \code{SerialParam()}. See \code{details} on how to use parallelisation in \code{testNhoods}.
 #'
 #' @details
 #' This function wraps up several steps of differential abundance testing using
@@ -40,6 +67,15 @@
 #' Quasi-Likelihood F-test in \code{glmQLFTest} for DA testing. FDR correction
 #' is performed separately as the default multiple-testing correction is
 #' inappropriate for neighbourhoods with overlapping cells.
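For orientation while reviewing, here is a minimal, hedged sketch of how the new arguments documented above compose into a GLMM call; `milo.obj` and `milo.design` are hypothetical stand-ins for a prepared Milo object and its sample-level metadata, and the solver/iteration settings simply echo the documented defaults:

```r
library(miloR)
library(BiocParallel)

# a (1 | Replicate) term in the design formula triggers the NB-GLMM branch;
# a kinship matrix could additionally be passed via the kinship= argument
da.res <- testNhoods(milo.obj,
                     design=~Condition + (1 | Replicate),
                     design.df=milo.design,
                     glmm.solver="Fisher",   # HE/HE-NNLS recommended with a kinship matrix
                     REML=TRUE,
                     max.iters=50, max.tol=1e-5,
                     BPPARAM=SerialParam())  # e.g. MulticoreParam(4) to parallelise over nhoods
head(da.res)
```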
+#' The GLMM testing cannot be performed using \code{edgeR}, however, a separate
+#' function \code{fitGLMM} can be used to fit a mixed effect model to each
+#' nhood (see \code{fitGLMM} docs for details).
+#'
+#' Parallelisation is currently only enabled for the NB-LMM and uses the BiocParallel paradigm. In
+#' general the GLM implementation in \code{glmQLFit} is sufficiently fast that it does not require
+#' parallelisation. Parallelisation requires the user to pass a \linkS4class{BiocParallelParam} object
+#' with the parallelisation arguments contained therein. This relies on the user to specify how
+#' to parallelise - for details see the \code{BiocParallel} package.
 #'
 #' @return A \code{data.frame} of model results, which contain:
 #' \describe{
@@ -97,34 +133,87 @@
 NULL

 #' @export
-#' @importFrom stats model.matrix
 #' @importFrom Matrix colSums rowMeans
-#' @importFrom stats dist median
+#' @importFrom MatrixGenerics colSums2
+#' @importFrom utils tail
+#' @importFrom stats dist median model.matrix
 #' @importFrom limma makeContrasts
+#' @importFrom BiocParallel bplapply SerialParam bptry bpok bpoptions
 #' @importFrom edgeR DGEList estimateDisp glmQLFit glmQLFTest topTags calcNormFactors
-testNhoods <- function(x, design, design.df,
+testNhoods <- function(x, design, design.df, kinship=NULL,
                        fdr.weighting=c("k-distance", "neighbour-distance", "max", "graph-overlap", "none"),
-                       min.mean=0, model.contrasts=NULL, robust=TRUE, reduced.dim="PCA",
-                       norm.method=c("TMM", "RLE", "logMS")){
+                       min.mean=0, model.contrasts=NULL, robust=TRUE, reduced.dim="PCA", REML=TRUE,
+                       norm.method=c("TMM", "RLE", "logMS"), cell.sizes=NULL,
+                       max.iters = 50, max.tol = 1e-5, glmm.solver=NULL,
+                       subset.nhoods=NULL, fail.on.error=FALSE, BPPARAM=SerialParam()){
+    is.lmm <- FALSE
+    geno.only <- FALSE

     if(is(design, "formula")){
-        model <- model.matrix(design, data=design.df)
-        rownames(model) <- rownames(design.df)
+        # parse to find random and fixed effects - need to double check the formula is valid
+        parse <- unlist(strsplit(gsub(design, pattern="~", replacement=""), split="+", fixed=TRUE))
+        if(length(parse) < 1){
+            stop("Formula ", design, " is not a proper formula - variables should be separated by '+'")
+        }
+
+        find_re <- any(grepl(parse, pattern="1\\W?\\|"))
+        if(!find_re & any(grepl(parse, pattern="[0-9]?\\W?\\|\\W?"))){
+            stop(design, " is an invalid formula for random effects. Use (1 | VARIABLE) format.")
+        }
+
+        if(find_re | !is.null(kinship)){
+            message("Random effects found")
+
+            is.lmm <- TRUE
+            if(find_re & is.null(kinship)){
+                # make model matrices for fixed and random effects
+                z.model <- .parse_formula(design, design.df, vtype="re")
+                rownames(z.model) <- rownames(design.df)
+            } else if(find_re & !is.null(kinship)){
+                if(!all(rownames(kinship) == rownames(design.df))){
+                    stop("Genotype rownames do not match design.df rownames")
+                }
+
+                z.model <- .parse_formula(design, design.df, vtype="re")
+                rownames(z.model) <- rownames(design.df)
+            } else if(!find_re & !is.null(kinship)){
+                z.model <- diag(nrow(kinship))
+                colnames(z.model) <- paste0("Genetic", seq_len(nrow(kinship)))
+                rownames(z.model) <- rownames(design.df)
+                geno.only <- TRUE
+            }
+
+            # this will always implicitly include an intercept term - perhaps
+            # this shouldn't be the case?
+            x.model <- .parse_formula(design, design.df, vtype="fe")
+            rownames(x.model) <- rownames(design.df)
+
+            if(any(rownames(x.model) != rownames(z.model))){
+                stop("Discordant sample names for mixed model design matrices")
+            }
+
+        } else{
+            x.model <- model.matrix(design, data=design.df)
+            rownames(x.model) <- rownames(design.df)
+        }
     } else if(is(design, "matrix")){
-        model <- design
-        if(nrow(model) != nrow(design.df)){
+        x.model <- design
+        if(nrow(x.model) != nrow(design.df)){
             stop("Design matrix and model matrix are not the same dimensionality")
         }

-        if(any(rownames(model) != rownames(design.df))){
+        if(any(rownames(x.model) != rownames(design.df))){
             warning("Design matrix and model matrix dimnames are not the same")
             # check if rownames are a subset of the design.df
-            check.names <- any(rownames(model) %in% rownames(design.df))
+            check.names <- any(rownames(x.model) %in% rownames(design.df))
             if(isTRUE(check.names)){
-                rownames(model) <- rownames(design.df)
+                rownames(x.model) <- rownames(design.df)
             } else{
                 stop("Design matrix and model matrix rownames are not a subset")
             }
         }
+    } else{
+        stop("design must be either a formula or model matrix")
     }

     if(!is(x, "Milo")){
@@ -142,14 +231,26 @@ testNhoods <- function(x, design, design.df,
     }

     subset.counts <- FALSE
-    if(ncol(nhoodCounts(x)) != nrow(model)){
+    if(ncol(nhoodCounts(x)) != nrow(x.model)){
         # need to allow for design.df with a subset of samples only
-        if(all(rownames(model) %in% colnames(nhoodCounts(x)))){
+        if(all(rownames(x.model) %in% colnames(nhoodCounts(x)))){
             message("Design matrix is a strict subset of the nhood counts")
             subset.counts <- TRUE
         } else{
-            stop("Design matrix (", nrow(model), ") and nhood counts (",
-                 ncol(nhoodCounts(x)), ") are not the same dimension")
+            stop("Design matrix (", nrow(x.model), ") and nhood counts (", ncol(nhoodCounts(x)), ") are not the same dimension")
         }
     }
+
+    if(is.lmm){
+        if(ncol(nhoodCounts(x)) != nrow(z.model)){
+            # need to allow for design.df with a subset of samples only
+            if(all(rownames(z.model) %in% colnames(nhoodCounts(x)))){
+                message("Random effects design matrix is a strict subset of the nhood counts")
+                subset.counts <- TRUE
+            } else{
+                stop("Random effects design matrix (", nrow(z.model), ") and nhood counts (",
+                     ncol(nhoodCounts(x)), ") are not the same dimension")
+            }
+        }
+    }
@@ -158,66 +259,288 @@ testNhoods <- function(x, design, design.df,
     # what is the cost of cast a matrix that is already dense vs. testing it's class
     if(min.mean > 0){
         if(isTRUE(subset.counts)){
-            keep.nh <- rowMeans(nhoodCounts(x)[, rownames(model)]) >= min.mean
+            if(!is.null(subset.nhoods)){
+                mean.keep <- rowMeans(nhoodCounts(x)[, rownames(x.model)]) >= min.mean
+                if(is.character(subset.nhoods)){
+                    if(all(subset.nhoods %in% rownames(nhoodCounts(x)))){
+                        keep.nh <- rownames(nhoodCounts(x)) %in% subset.nhoods
+                    } else{
+                        stop("Nhood subsetting is illegal - use same names as in rownames of nhoodCounts")
+                    }
+                } else if(is.logical(subset.nhoods)){
+                    if(length(subset.nhoods) != nrow(nhoodCounts(x))){
+                        stop("Logical subset vector must be same length as nrow nhoodCounts")
+                    } else{
+                        keep.nh <- subset.nhoods
+                    }
+                } else if(is.numeric(subset.nhoods)){
+                    if(max(subset.nhoods) > nrow(nhoodCounts(x))){
+                        stop("Maximum index is out of bounds: ", max(subset.nhoods))
+                    } else{
+                        keep.nh <- seq_len(nrow(nhoodCounts(x))) %in% subset.nhoods
+                    }
+                } else{
+                    stop("Subsetting vector type not recognised: ", class(subset.nhoods))
+                }
+
+                keep.nh <- mean.keep & keep.nh
+
+            } else{
+                keep.nh <- rowMeans(nhoodCounts(x)[, rownames(x.model)]) >= min.mean
+            }
+        } else if(!is.null(subset.nhoods)){
+            mean.keep <- rowMeans(nhoodCounts(x)[, rownames(x.model)]) >= min.mean
+            if(is.character(subset.nhoods)){
+                if(all(subset.nhoods %in% rownames(nhoodCounts(x)))){
+                    keep.nh <- rownames(nhoodCounts(x)) %in% subset.nhoods
+                } else{
+                    stop("Nhood subsetting is illegal - use same names as in rownames of nhoodCounts")
+                }
+            } else if(is.logical(subset.nhoods)){
+                if(length(subset.nhoods) != nrow(nhoodCounts(x))){
+                    stop("Logical subset vector must be same length as nrow nhoodCounts")
+                } else{
+                    keep.nh <- subset.nhoods
+                }
+            } else if(is.numeric(subset.nhoods)){
+                if(max(subset.nhoods) > nrow(nhoodCounts(x))){
+                    stop("Maximum index is out of bounds: ", max(subset.nhoods))
+                } else{
+                    keep.nh <- seq_len(nrow(nhoodCounts(x))) %in% subset.nhoods
+                }
+            } else{
+                stop("Subsetting vector type not recognised: ", class(subset.nhoods))
+            }
+
+            keep.nh <- mean.keep & keep.nh
         } else{
             keep.nh <- rowMeans(nhoodCounts(x)) >= min.mean
         }
+    } else if(!is.null(subset.nhoods)){
+        if(is.character(subset.nhoods)){
+            if(all(subset.nhoods %in% rownames(nhoodCounts(x)))){
+                keep.nh <- rownames(nhoodCounts(x)) %in% subset.nhoods
+            } else{
+                stop("Nhood subsetting is illegal - use same names as in rownames of nhoodCounts")
+            }
+        } else if(is.logical(subset.nhoods)){
+            if(length(subset.nhoods) != nrow(nhoodCounts(x))){
+                stop("Logical subset vector must be same length as nrow nhoodCounts")
+            } else{
+                keep.nh <- subset.nhoods
+            }
+        } else if(is.numeric(subset.nhoods)){
+            if(max(subset.nhoods) > nrow(nhoodCounts(x))){
+                stop("Maximum index is out of bounds: ", max(subset.nhoods))
+            } else{
+                keep.nh <- seq_len(nrow(nhoodCounts(x))) %in% subset.nhoods
+            }
+        } else{
+            stop("Subsetting vector type not recognised: ", class(subset.nhoods))
+        }
+
     } else{
         if(isTRUE(subset.counts)){
-            keep.nh <- rep(TRUE, nrow(nhoodCounts(x)[, rownames(model)]))
+            keep.nh <- rep(TRUE, nrow(nhoodCounts(x)[, rownames(x.model)]))
         }else{
             keep.nh <- rep(TRUE, nrow(nhoodCounts(x)))
         }
     }

     if(isTRUE(subset.counts)){
-        keep.samps <- intersect(rownames(model), colnames(nhoodCounts(x)[keep.nh, ]))
+        keep.samps <- intersect(rownames(x.model), colnames(nhoodCounts(x)[keep.nh, ]))
     } else{
         keep.samps <- colnames(nhoodCounts(x)[keep.nh, ])
     }

-    if(any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) != rownames(model)) & !any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) %in% rownames(model))){
+    if(any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) != rownames(x.model)) & !any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) %in% rownames(x.model))){
         stop("Sample names in design matrix and nhood counts are not matched. Set appropriate rownames in design matrix.")
-    } else if(any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) != rownames(model)) & any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) %in% rownames(model))){
+    } else if(any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) != rownames(x.model)) & any(colnames(nhoodCounts(x)[keep.nh, keep.samps]) %in% rownames(x.model))){
         warning("Sample names in design matrix and nhood counts are not matched. Reordering")
-        model <- model[colnames(nhoodCounts(x)[keep.nh, keep.samps]), ]
+        x.model <- x.model[colnames(nhoodCounts(x)[keep.nh, keep.samps]), ]
+        if(is.lmm){
+            z.model <- z.model[colnames(nhoodCounts(x)[keep.nh, keep.samps]), , drop = FALSE]
+        }
+    }
+
+    if(isTRUE(is.null(cell.sizes))){
+        cell.sizes <- colSums(nhoodCounts(x)[keep.nh, keep.samps])
+    } else{
+        check_inf_na <- any(is.na(cell.sizes)) | any(is.infinite(cell.sizes))
+        if(isTRUE(check_inf_na)){
+            stop("NA or Infinite values found in cell.sizes - remove these samples and re-try")
+        }
     }

     if(length(norm.method) > 1){
         message("Using TMM normalisation")
         dge <- DGEList(counts=nhoodCounts(x)[keep.nh, keep.samps],
-                       lib.size=colSums(nhoodCounts(x)[keep.nh, keep.samps]))
+                       lib.size=cell.sizes)
         dge <- calcNormFactors(dge, method="TMM")
     } else if(norm.method %in% c("TMM")){
         message("Using TMM normalisation")
         dge <- DGEList(counts=nhoodCounts(x)[keep.nh, keep.samps],
-                       lib.size=colSums(nhoodCounts(x)[keep.nh, keep.samps]))
+                       lib.size=cell.sizes)
         dge <- calcNormFactors(dge, method="TMM")
     } else if(norm.method %in% c("RLE")){
         message("Using RLE normalisation")
         dge <- DGEList(counts=nhoodCounts(x)[keep.nh, keep.samps],
-                       lib.size=colSums(nhoodCounts(x)[keep.nh, keep.samps]))
+                       lib.size=cell.sizes)
         dge <- calcNormFactors(dge, method="RLE")
     }else if(norm.method %in% c("logMS")){
         message("Using logMS normalisation")
         dge <- DGEList(counts=nhoodCounts(x)[keep.nh, keep.samps],
-                       lib.size=colSums(nhoodCounts(x)[keep.nh, keep.samps]))
+                       lib.size=cell.sizes)
     }

-    dge <- estimateDisp(dge, model)
-    fit <- glmQLFit(dge, model, robust=robust)
-    if(!is.null(model.contrasts)){
-        mod.constrast <- makeContrasts(contrasts=model.contrasts, levels=model)
-        res <- as.data.frame(topTags(glmQLFTest(fit, contrast=mod.constrast),
-                                     sort.by='none', n=Inf))
-    } else{
-        n.coef <- ncol(model)
-        res <- as.data.frame(topTags(glmQLFTest(fit, coef=n.coef), sort.by='none', n=Inf))
+    dge <- estimateDisp(dge, x.model)
+
+    if (is.lmm) {
+        message("Running GLMM model - this may take a few minutes")
+
+        if(isFALSE(geno.only)){
+            rand.levels <- lapply(seq_along(colnames(z.model)), FUN=function(X) {
+                zx <- unique(z.model[, X])
+                if(is.numeric(zx)){
+                    paste0(colnames(z.model)[X], zx)
+                } else{
+                    zx
+                }
+            })
+            names(rand.levels) <- colnames(z.model)
+        } else{
+            rand.levels <- list("Genetic"=colnames(z.model))
+        }
+
+        # extract tagwise dispersion for glmm
+        # re-scale these to allow for non-zero variances
+        dispersion <- dge$tagwise.dispersion
+
+        # the norm factors need to be logged for use as model offsets
+        offsets <- log(dge$samples$norm.factors)
+        glmm.cont <- list(theta.tol=max.tol, max.iter=max.iters, solver=glmm.solver)
+
+        # wrapper function is the same for all analyses
+        glmmWrapper <- function(Y, disper, Xmodel, Zmodel, off.sets, randlevels,
+                                reml, glmm.contr, genonly=FALSE, kin.ship=NULL,
+                                BPPARAM=BPPARAM, error.fail=FALSE){
+            # this needs to be able to run with BiocParallel
+            bp.list <- bptry({bplapply(seq_len(nrow(Y)), BPOPTIONS=bpoptions(stop.on.error = error.fail),
+                                       FUN=function(i, Xmodel, Zmodel, Y, off.sets,
+                                                    randlevels, disper, genonly,
+                                                    kin.ship, glmm.contr, reml){
+                                           fitGLMM(X=Xmodel, Z=Zmodel, y=Y[i, ], offsets=off.sets,
+                                                   random.levels=randlevels, REML = reml,
+                                                   dispersion=disper[i], geno.only=genonly,
+                                                   Kin=kin.ship, glmm.control=glmm.contr)
+                                       }, BPPARAM=BPPARAM,
+                                       Xmodel=Xmodel, Zmodel=Zmodel, Y=Y, off.sets=off.sets,
+                                       randlevels=randlevels, disper=disper, genonly=genonly,
+                                       kin.ship=kin.ship, glmm.contr=glmm.cont, reml=reml)
+            }) # bp.list is a bplist_error object if any nhood fails - parsed below
+
+            # parse the bplist_error object
+            if(all(bpok(bp.list))){
+                model.list <- as(bp.list, "list")
+            } else{
+                model.list <- list()
+                # set the failed results to all NA
+                for(x in seq_along(bp.list)){
+                    if(!bpok(bp.list)[x]){
+                        bperr <- attr(bp.list[[x]], "traceback")
+                        if(isTRUE(error.fail)){
+                            stop(bperr)
+                        }
+
+                        model.list[[x]] <- list("FE"=NA, "RE"=NA, "Sigma"=NA,
+                                                "converged"=FALSE, "Iters"=NA, "Dispersion"=NA,
+                                                "Hessian"=NA, "SE"=NA, "t"=NA, "PSVAR"=NA,
+                                                "COEFF"=NA, "P"=NA, "Vpartial"=NA, "Ginv"=NA,
+                                                "Vsinv"=NA, "Winv"=NA, "VCOV"=NA, "LOGLIHOOD"=NA,
+                                                "DF"=NA, "PVALS"=NA,
+                                                "ERROR"=bperr)
+                    }else{
+                        model.list[[x]] <- bp.list[[x]]
+                    }
+                }
+            }
+            return(model.list)
+        }
+
+        if(!is.null(kinship)){
+            if(isTRUE(geno.only)){
+                message("Running genetic model with ", nrow(kinship), " individuals")
+            } else{
+                message("Running genetic model with ", nrow(z.model), " observations")
+            }
+        }
+
+        # the fitting call is identical across cases once geno.only and kinship are set
+        fit <- glmmWrapper(Y=dge$counts, disper = 1/dispersion, Xmodel=x.model, Zmodel=z.model,
+                           off.sets=offsets, randlevels=rand.levels, reml=REML, glmm.contr = glmm.cont,
+                           genonly = geno.only, kin.ship=kinship,
+                           BPPARAM=BPPARAM, error.fail=fail.on.error)
+
+        # give warning about how many neighborhoods didn't converge and error if > 50% nhoods failed
+        n.nhoods <- length(fit)
+        half.n <- floor(n.nhoods * 0.5)
+        if (sum(!(unlist(lapply(fit, `[[`, "converged"))), na.rm = TRUE)/length(unlist(lapply(fit, `[[`, "converged"))) > 0){
+            if(sum(is.na(unlist(lapply(fit, `[[`, "FE")))) >= half.n){
+                err.list <- paste(unique(unlist(lapply(fit, `[[`, "ERROR"))), collapse="\n")
+                stop("Lowest traceback returned: ", err.list) # all unique error messages
+            } else{
+                warning(paste(sum(!unlist(lapply(fit, `[[`, "converged")), na.rm = TRUE), "out of", length(unlist(lapply(fit, `[[`, "converged"))),
+                              "neighborhoods did not converge; increase number of iterations?"))
+            }
+
+        }
+
+        # res has to reflect output from glmQLFit - express variance as a proportion as well.
+        # this only reports the final fixed effect parameter
+        ret.beta <- ncol(x.model)
+
+        # CPM must be computed per-sample: divide each column by its own total, then average
+        res <- cbind.data.frame("logFC" = unlist(lapply(lapply(fit, `[[`, "FE"), function(x) x[ret.beta])),
+                                "logCPM"=log2((rowMeans(t(t(nhoodCounts(x)[keep.nh, ])/colSums2(nhoodCounts(x)))))*1e6),
+                                "SE"= unlist(lapply(lapply(fit, `[[`, "SE"), function(x) x[ret.beta])),
+                                "tvalue" = unlist(lapply(lapply(fit, `[[`, "t"), function(x) x[ret.beta])),
+                                "PValue" = unlist(lapply(lapply(fit, `[[`, "PVALS"), function(x) x[ret.beta])),
+                                matrix(unlist(lapply(fit, `[[`, "Sigma")), ncol=length(rand.levels), byrow=TRUE),
+                                "Converged"=unlist(lapply(fit, `[[`, "converged")), "Dispersion" = unlist(lapply(fit, `[[`, "Dispersion")),
+                                "Loglikelihood"=unlist(lapply(fit, `[[`, "LOGLIHOOD")))
+
+        rownames(res) <- seq_along(fit)
+        colnames(res)[6:(6+length(rand.levels)-1)] <- paste(names(rand.levels), "variance")
+    } else {
+
+        fit <- glmQLFit(dge, x.model, robust=robust)
+        if(!is.null(model.contrasts)){
+            mod.contrast <- makeContrasts(contrasts=model.contrasts, levels=x.model)
+            res <- as.data.frame(topTags(glmQLFTest(fit, contrast=mod.contrast),
+                                         sort.by='none', n=Inf))
+        } else{
+            n.coef <- ncol(x.model)
+            res <- as.data.frame(topTags(glmQLFTest(fit, coef=n.coef), sort.by='none', n=Inf))
+        }
     }

     res$Nhood <- as.numeric(rownames(res))
-    message("Performing spatial FDR correction with", fdr.weighting[1], " weighting")
+    message("Performing spatial FDR correction with ", fdr.weighting[1], " weighting")
     mod.spatialfdr <- graphSpatialFDR(x.nhoods=nhoods(x),
                                       graph=graph(x),
                                       weighting=fdr.weighting,
@@ -226,7 +549,6 @@ testNhoods <- function(x, design, design.df,
                                       indices=nhoodIndex(x),
                                       distances=nhoodDistances(x),
                                       reduced.dimensions=reducedDim(x, reduced.dim))
-
     res$SpatialFDR[order(res$Nhood)] <- mod.spatialfdr
     res
 }
diff --git a/R/utils.R b/R/utils.R
index 8a338f9..ca803e7 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -63,6 +63,52 @@
     x
 }

+
+# parse design formula
+#' @export
+.parse_formula <- function(in.form, design.df, vtype=c("re", "fe")){
+    ## parse the formula and return the X and Z matrices
+    # need to decide on how to handle intercept terms - i.e. FE or RE
+    sp.form <- unlist(strsplit(as.character(in.form),
+                               split="+", fixed=TRUE))
+
+    vtype <- match.arg(vtype)
+    if(vtype %in% c("re")){
+        v.terms <- unlist(lapply(sp.form, FUN=function(sp) {
+            return(ifelse(grepl(trimws(sp), pattern="\\|"), .rEParse(trimws(sp)), NA))
+        }))
+        v.terms <- v.terms[!is.na(v.terms)]
+        d.mat <- as.matrix(design.df[, trimws(v.terms)])
+        if (is.character(d.mat)) {
+            d.mat <- matrix(unlist(lapply(data.frame(d.mat)[, , drop = FALSE],
+                                          function(x) as.integer(factor(x)))), ncol = length(v.terms))
+        }
+        colnames(d.mat) <- trimws(v.terms)
+    } else if(vtype %in% c("fe")){
+        v.terms <- trimws(unlist(sp.form[!grepl(trimws(sp.form), pattern="~|\\|")]))
+        if(length(v.terms) > 1){
+            v.terms <- paste(v.terms, collapse=" + ")
+        }
+
+        d.mat <- model.matrix(as.formula(paste("~ 1 +", v.terms)), data = design.df)
+        d.mat <- d.mat[ ,!grepl("1*\\|", colnames(d.mat))]
+    } else{
+        stop("vtype ", vtype, " not recognised")
+    }
+
+    return(d.mat)
+}
+
+
+#' @export
+.rEParse <- function(re.form) {
+
+    .x <- gsub(unlist(strsplit(re.form, split="|", fixed=TRUE)),
+               pattern="\\)", replacement="")
+
+    return(.x[length(.x)])
+}
+
+
 ######################################
 ## neighbourhood grouping functions
 ######################################
diff --git a/README.md b/README.md
index 9f7734a..fad304e 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ devtools::install_github("MarioniLab/miloR", ref="devel")
 2. [Milo example on mouse gastrulation dataset](https://rawcdn.githack.com/MarioniLab/miloR/7c7f906b94a73e62e36e095ddb3e3567b414144e/vignettes/milo_gastrulation.html#5_Finding_markers_of_DA_populations): this includes a demo for downstream analysis functions.
 3. [Integrating miloR in scanpy/anndata workflow](https://github.com/MarioniLab/milo_analysis_2020/blob/main/notebooks/milo_in_python.ipynb) (see also [`milopy`](https://github.com/emdann/milopy) for a full workflow in python)
 4. [Specifying contrasts of interest for differential abundance testing with Milo](https://bioconductor.org/packages/release/bioc/vignettes/miloR/inst/doc/milo_contrasts.html)
+5. [Using a mixed effect model for dependent samples](https://raw.githack.com/MarioniLab/miloR/devel/vignettes/milo_glmm.html)

 ### Example work flow
 An example of the `Milo` work flow to get started:
@@ -61,7 +62,6 @@ milo.design <- milo.design[milo.design$Freq > 0, ]
 rownames(milo.design) <- milo.design$Sample
 milo.design <- milo.design[colnames(nhoodCounts(milo.obj)),]

-
 milo.res <- testNhoods(milo.obj, design=~Condition, design.df=milo.design)
 head(milo.res)
 ```
diff --git a/data/sim_family.RData b/data/sim_family.RData
new file mode 100644
index 0000000..8c7dfb8
Binary files /dev/null and b/data/sim_family.RData differ
diff --git a/data/sim_nbglmm.RData b/data/sim_nbglmm.RData
new file mode 100644
index 0000000..7348fd1
Binary files /dev/null and b/data/sim_nbglmm.RData differ
diff --git a/man/Satterthwaite_df.Rd b/man/Satterthwaite_df.Rd
new file mode 100644
index 0000000..a11a98d
--- /dev/null
+++ b/man/Satterthwaite_df.Rd
@@ -0,0 +1,61 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/glmm.R
+\name{Satterthwaite_df}
+\alias{Satterthwaite_df}
+\title{Compute degrees of freedom using Satterthwaite method}
+\usage{
+Satterthwaite_df(
+  coeff.mat,
+  mint,
+  cint,
+  SE,
+  curr_sigma,
+  curr_beta,
+  V_partial,
+  V_a,
+  G_inv,
+  random.levels
+)
+}
+\arguments{
+\item{coeff.mat}{A \code{matrix} class object containing the coefficient matrix from the mixed model equations}
+
+\item{mint}{A numeric scalar of the number of fixed effect variables in the model}
+
+\item{cint}{A numeric scalar of the number of random effect variables in the model}
+
+\item{SE}{A \code{1 x mint} \code{matrix}, i.e. column vector, containing the standard errors of the fixed effect
+parameter estimates}
+
+\item{curr_sigma}{A \code{1 x cint matrix}, i.e. column vector, of the variance component parameter estimates}
+
+\item{curr_beta}{A \code{1 x mint matrix}, i.e. column vector, of the fixed effect parameter estimates}
+
+\item{V_partial}{A \code{list} of the partial derivatives for each fixed and random effect variable in the model}
+
+\item{V_a}{A \code{c+m x c+m} variance-covariance matrix of the fixed and random effect variable parameter estimates}
+
+\item{G_inv}{A \code{nxc X nxc} inverse matrix containing the variance component estimates}
+
+\item{random.levels}{A \code{list} containing the mapping between the random effect variables and each respective set
+of levels for said variable.}
+}
+\value{
+\code{matrix} containing the inferred number of degrees of freedom for the specific model.
+}
+\description{
+This function is not intended to be called by the user, and is included for reference
+}
+\details{
+The Satterthwaite degrees of freedom are computed, which estimates the number of degrees of freedom in the
+NB-GLMM based on the ratio of the squared standard errors to the product of the Jacobians of the variance-covariance matrix
+from the fixed effect variable parameter estimation with the full variance-covariance matrix. For more details see
+Satterthwaite FE, Biometrics Bulletin (1946) Vol 2 No 6, pp. 110-114.
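For reviewers, the df computation documented here reduces to a simple ratio; a minimal R sketch with illustrative toy numbers (not package output) makes the formula concrete:

```r
# Satterthwaite df for one fixed effect: df_i = 2 * SE_i^2 / (g' V_a g),
# where g is the Jacobian of Var(beta_i) with respect to the variance components
se.beta <- 0.25                      # illustrative standard error for beta_i
g <- matrix(c(0.01, 0.02), ncol=1)   # illustrative Jacobian column vector
V_a <- diag(c(0.5, 0.3))             # illustrative variance-covariance of the VC estimates
df <- as.numeric((2 * se.beta^2) / (t(g) %*% V_a %*% g))
df   # ~735 with these toy values; Satterthwaite_df() computes this per fixed effect
```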
+}
+\examples{
+NULL
+
+}
+\author{
+Mike Morgan & Alice Kluzer
+}
diff --git a/man/annotateNhoods.Rd b/man/annotateNhoods.Rd
index cc19a12..3fe2574 100644
--- a/man/annotateNhoods.Rd
+++ b/man/annotateNhoods.Rd
@@ -4,7 +4,7 @@
 \alias{annotateNhoods}
 \title{Add annotations from colData to DA testing results}
 \usage{
-annotateNhoods(x, da.res, coldata_col)
+annotateNhoods(x, da.res, coldata_col, subset.nhoods = NULL)
 }
 \arguments{
 \item{x}{A \code{\linkS4class{Milo}} object containing single-cell gene expression
@@ -15,6 +15,12 @@ and neighbourhoods.}

 \item{coldata_col}{A character scalar determining which column of \code{colData(x)}
 stores the annotation to be added to the neighbourhoods}
+
+\item{subset.nhoods}{A character, numeric or logical vector that will subset the annotation to the specific nhoods. If
+a character vector these should correspond to row names of \code{nhoodCounts}. If a logical vector then
+these should have the same \code{length} as \code{nrow} of \code{nhoodCounts}. If numeric, then these are assumed
+to correspond to indices of \code{nhoodCounts} - if the maximal index is greater than \code{nrow(nhoodCounts(x))}
+an error will be produced. This is necessary if \code{testNhoods} was run using \code{subset.nhoods=...}.}
 }
 \value{
 A \code{data.frame} of model results (as \code{da.res} input) with two new columns: (1) \code{coldata_col} storing
diff --git a/man/checkSeparation.Rd b/man/checkSeparation.Rd
new file mode 100644
index 0000000..5b2cdb0
--- /dev/null
+++ b/man/checkSeparation.Rd
@@ -0,0 +1,77 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/checkSeparation.R
+\name{checkSeparation}
+\alias{checkSeparation}
+\title{Check for separation of count distributions by variables}
+\arguments{
+\item{x}{\code{\linkS4class{Milo}} object with a non-empty
+\code{nhoodCounts} slot.}
+
+\item{design.df}{A \code{data.frame} containing meta-data in which \code{condition}
+is a column variable. The rownames must be the same as, or a subset of, the
+colnames of \code{nhoodCounts(x)}.}
+
+\item{condition}{A character scalar of the test variable contained in \code{design.df}.
+This should be a factor variable; if it is numeric or character it will be cast to a
+factor variable.}
+
+\item{min.val}{A numeric scalar that sets the minimum number of counts across condition level
+samples, below which separation is defined.}
+
+\item{factor.check}{A logical scalar that sets the factor variable level checking. See \emph{details}
+for more information.}
+}
+\value{
+A logical vector of the same length as \code{nrow(nhoodCounts(x))} where \emph{TRUE}
+values represent nhoods where separation is detected. The output of this function
+can be used to subset nhood-based analyses
+e.g. \code{testNhoods(..., subset.nhoods=checkSeparation(x, ...))}.
+}
+\description{
+Check the count distributions for each nhood according to a test
+variable of interest. This is important for checking if there is separation
+in the GLMM to inform either nhood subsetting or re-computation of the
+NN-graph and refined nhoods.
+}
+\details{
+This function checks across nhoods for separation based on the separate levels
+of an input factor variable. It checks if \emph{condition} is a factor variable,
+and if not it will cast it to a factor. Note that the function first checks for the
+number of unique values - if this exceeds 50\% of the number of elements an
+error is generated. Users can override this behaviour with \code{factor.check=FALSE}.
+}
+\examples{
+library(SingleCellExperiment)
+ux.1 <- matrix(rpois(12000, 5), ncol=400)
+ux.2 <- matrix(rpois(12000, 4), ncol=400)
+ux <- rbind(ux.1, ux.2)
+vx <- log2(ux + 1)
+pca <- prcomp(t(vx))
+
+sce <- SingleCellExperiment(assays=list(counts=ux, logcounts=vx),
+                            reducedDims=SimpleList(PCA=pca$x))
+
+milo <- Milo(sce)
+milo <- buildGraph(milo, k=20, d=10, transposed=TRUE)
+milo <- makeNhoods(milo, k=20, d=10, prop=0.3)
+milo <- calcNhoodDistance(milo, d=10)
+
+cond <- rep("A", ncol(milo))
+cond.a <- sample(1:ncol(milo), size=floor(ncol(milo)*0.25))
+cond.b <- setdiff(1:ncol(milo), cond.a)
+cond[cond.b] <- "B"
+meta.df <- data.frame(Condition=cond, Replicate=c(rep("R1", 132), rep("R2", 132), rep("R3", 136)))
+meta.df$SampID <- paste(meta.df$Condition, meta.df$Replicate, sep="_")
+milo <- countCells(milo, meta.data=meta.df, samples="SampID")
+
+test.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)), "Replicate"=rep(c("R1", "R2", "R3"), 2))
+test.meta$Sample <- paste(test.meta$Condition, test.meta$Replicate, sep="_")
+rownames(test.meta) <- test.meta$Sample
+
+check.sep <- checkSeparation(milo, design.df=test.meta, condition='Condition')
+sum(check.sep)
+
+}
+\author{
+Mike Morgan
+}
diff --git a/man/computePvalue.Rd b/man/computePvalue.Rd
new file mode 100644
index 0000000..b9c99cf
--- /dev/null
+++ b/man/computePvalue.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/glmm.R
+\name{computePvalue}
+\alias{computePvalue}
+\title{Compute the p-value for the fixed effect parameters}
+\usage{
+computePvalue(Zscore, df)
+}
+\arguments{
+\item{Zscore}{A numeric vector containing the Z scores for each fixed effect parameter}
+
+\item{df}{A numeric vector containing the estimated degrees of freedom for each fixed effect
+parameter}
+}
+\value{
+Numeric vector of p-values, 1 per fixed effect parameter
+}
+\description{
+Based on the asymptotic t-distribution, compute the 2-tailed p-value that the estimate != 0. This
+function is not intended to be used directly, but is included for reference or if an alternative
+estimate of the degrees of freedom is available.
+}
+\details{
+Based on a 2-tailed t-distribution with \code{df} degrees of freedom,
+compute the probability that the calculated \code{Zscore} is greater than or equal to what would be
+expected from random chance.
+}
+\examples{
+NULL
+
+}
+\author{
+Mike Morgan & Alice Kluzer
+}
diff --git a/man/findNhoodGroupMarkers.Rd b/man/findNhoodGroupMarkers.Rd
index 7e8831b..f64efcd 100644
--- a/man/findNhoodGroupMarkers.Rd
+++ b/man/findNhoodGroupMarkers.Rd
@@ -76,3 +76,7 @@ negative binomial GLM (for details see \code{\link[edgeR]{edgeR-package}}).
 When the latter it is recommended to set \code{gene.offset=TRUE} as this
 behaviour adjusts the model offsets by the number of detected genes in each cell.
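Since the example block added here is a NULL stub, a hedged sketch of typical usage may help reviewers; `milo.obj`, `da.res` and the `"Sample"` column are hypothetical, and the argument names are taken from this package's documented interface for these functions:

```r
# assign nhoods to groups from the DA results, then test for group-level markers;
# gene.offset=TRUE applies the detected-genes-per-cell offset described above
da.res <- groupNhoods(milo.obj, da.res)
nhood.markers <- findNhoodGroupMarkers(milo.obj, da.res,
                                       aggregate.samples=TRUE, sample_col="Sample")
head(nhood.markers)
```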
 }
+\examples{
+NULL
+
+}
diff --git a/man/fitGLMM.Rd b/man/fitGLMM.Rd
new file mode 100644
index 0000000..4bb92e8
--- /dev/null
+++ b/man/fitGLMM.Rd
@@ -0,0 +1,108 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/glmm.R
+\name{fitGLMM}
+\alias{fitGLMM}
+\title{Perform differential abundance testing using a NB-generalised linear mixed model}
+\usage{
+fitGLMM(
+  X,
+  Z,
+  y,
+  offsets,
+  init.theta = NULL,
+  Kin = NULL,
+  random.levels = NULL,
+  REML = FALSE,
+  glmm.control = list(theta.tol = 1e-06, max.iter = 100, init.sigma = NULL, init.beta =
+    NULL, init.u = NULL, solver = NULL),
+  dispersion = 1,
+  geno.only = FALSE,
+  solver = NULL
+)
+}
+\arguments{
+\item{X}{A matrix containing the fixed effects of the model.}
+
+\item{Z}{A matrix containing the random effects of the model.}
+
+\item{y}{A matrix containing the observed phenotype over each neighborhood.}
+
+\item{offsets}{A vector containing the (log) offsets to apply normalisation for different numbers of cells across samples.}
+
+\item{init.theta}{A column vector (m X 1 matrix) of initial estimates of fixed and random effect coefficients}
+
+\item{Kin}{A n x n covariance matrix to explicitly model variation between observations}
+
+\item{random.levels}{A list describing the random effects of the model, and for each, the different unique levels.}
+
+\item{REML}{A logical value denoting whether REML (Restricted Maximum Likelihood) should be run. Default is FALSE.}
+
+\item{glmm.control}{A list containing parameter values specifying the theta tolerance of the model, the maximum number of iterations to be run,
+initial parameter values for the fixed (init.beta) and random effects (init.u), and glmm solver (see details).}
+
+\item{dispersion}{A scalar value for the initial dispersion of the negative binomial.}
+
+\item{geno.only}{A logical value that flags the model to use either just the \code{matrix} `Kin` or the supplied random effects.}
+
+\item{solver}{A character value that determines which optimisation algorithm is used for the variance components. Must be one of
+Fisher (Fisher scoring), HE (Haseman-Elston regression) or HE-NNLS (constrained Haseman-Elston regression).}
+}
+\value{
+A list containing the GLMM output, including inference results. The list elements are as follows:
+\describe{
+\item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.}
+\item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best
+linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.}
+\item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable.}
+\item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.}
+\item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. Is strictly <= \code{max.iter}.}
+\item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line}
+\item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.}
+\item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the hessian, i.e.
the square roots of the diagonal elements.}
+\item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.}
+\item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.}
+\item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.}
+\item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance
+component.}
+\item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.}
+\item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.}
+\item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1}
+\item{\code{VCOV:}}{\code{matrix} of the variance-covariance for all model fixed and random effect variable parameter estimates.
+This is required to compute the degrees of freedom for the fixed effect parameter inference.}
+\item{\code{DF:}}{\code{numeric} vector of the number of inferred degrees of freedom. For details see \link{Satterthwaite_df}.}
+\item{\code{PVALS:}}{\code{numeric} vector of the computed p-values from a t-distribution with the inferred number of degrees of
+freedom.}
+\item{\code{ERROR:}}{\code{list} containing Rcpp error messages - used for internal checking.}
+}
+}
+\description{
+This function will perform DA testing per-nhood using a negative binomial generalised linear mixed model
+}
+\details{
+This function fits a negative binomial generalised linear mixed effects model. If mixed effects are detected in testNhoods,
+this function is run to solve the model. The solver defaults to the \emph{Fisher} optimiser, and in the case of negative variance estimates
+it will switch to the non-negative least squares (NNLS) Haseman-Elston solver. This behaviour can be pre-set by passing
+\code{glmm.control$solver="HE"} for Haseman-Elston regression, which is the recommended solver when a covariance matrix is provided,
+or \code{glmm.control$solver="HE-NNLS"} which is the constrained HE optimisation algorithm.
+}
+\examples{
+data(sim_nbglmm)
+random.levels <- list("RE1"=paste("RE1", levels(as.factor(sim_nbglmm$RE1)), sep="_"),
+                      "RE2"=paste("RE2", levels(as.factor(sim_nbglmm$RE2)), sep="_"))
+X <- as.matrix(data.frame("Intercept"=rep(1, nrow(sim_nbglmm)), "FE2"=as.numeric(sim_nbglmm$FE2)))
+Z <- as.matrix(data.frame("RE1"=paste("RE1", as.numeric(sim_nbglmm$RE1), sep="_"),
+                          "RE2"=paste("RE2", as.numeric(sim_nbglmm$RE2), sep="_")))
+y <- sim_nbglmm$Mean.Count
+dispersion <- 0.5
+
+glmm.control <- glmmControl.defaults()
+glmm.control$theta.tol <- 1e-6
+glmm.control$max.iter <- 15
+model.list <- fitGLMM(X=X, Z=Z, y=y, offsets=rep(0, nrow(X)), random.levels=random.levels,
+                      REML = TRUE, glmm.control=glmm.control, dispersion=dispersion, solver="Fisher")
+model.list
+
+}
+\author{
+Mike Morgan
+}
diff --git a/man/fitGeneticPLGlmm.Rd b/man/fitGeneticPLGlmm.Rd
new file mode 100644
index 0000000..8b7336e
--- /dev/null
+++ b/man/fitGeneticPLGlmm.Rd
@@ -0,0 +1,125 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{fitGeneticPLGlmm}
+\alias{fitGeneticPLGlmm}
+\title{GLMM parameter estimation using pseudo-likelihood with a custom covariance matrix}
+\usage{
+fitGeneticPLGlmm(
+  Z,
+  X,
+  K,
+  muvec,
+  offsets,
+  curr_beta,
+  curr_theta,
+  curr_u,
+  curr_sigma,
+  curr_G,
+  y,
+  u_indices,
+  theta_conv,
+  rlevels,
+  curr_disp,
+  REML,
+  maxit,
+  solver,
+  vardist
+)
+}
+\arguments{
+\item{Z}{mat - sparse matrix that maps random effect variable levels to
+observations}
+
+\item{X}{mat - sparse matrix that maps fixed effect variables to
+observations}
+
+\item{K}{mat - sparse matrix that defines the known covariance patterns between
+individual observations. For example, a kinship matrix will then adjust for the
+known/estimated genetic relationships between observations.}
+
+\item{muvec}{vec vector of estimated phenotype means}
+
+\item{offsets}{vec vector of model offsets}
+
+\item{curr_beta}{vec vector of initial beta estimates}
+
+\item{curr_theta}{vec vector of initial parameter estimates}
+
+\item{curr_u}{vec of initial u estimates}
+
+\item{curr_sigma}{vec of initial sigma estimates}
+
+\item{curr_G}{mat c X c matrix of variance components}
+
+\item{y}{vec of observed counts}
+
+\item{u_indices}{List a List, each element contains the indices of Z relevant
+to each RE and all its levels}
+
+\item{theta_conv}{double Convergence tolerance for parameter estimates}
+
+\item{rlevels}{List containing mapping of RE variables to individual
+levels}
+
+\item{curr_disp}{double Dispersion parameter estimate}
+
+\item{REML}{bool - use REML for variance component estimation}
+
+\item{maxit}{int maximum number of iterations if theta_conv is FALSE}
+
+\item{solver}{string which solver to use - either HE (Haseman-Elston regression) or Fisher scoring}
+
+\item{vardist}{string which variance form to use NB = negative binomial, P=Poisson [not yet implemented]}
+}
+\value{
+A \code{list} containing the following elements (note: return types are dictated by Rcpp, so the R
+types are described here):
+\describe{
+\item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.}
+\item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best
+linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.}
+\item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable.
For this model the last variance component corresponds to the input \emph{K} matrix.}
+\item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.}
+\item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. Is strictly <= \code{max.iter}.}
+\item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line}
+\item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.}
+\item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the hessian, i.e. the square roots of the diagonal elements.}
+\item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.}
+\item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.}
+\item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.}
+\item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance
+component.}
+\item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.}
+\item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.}
+\item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1}
+\item{\code{VCOV:}}{\code{matrix} of the variance-covariance for all model fixed and random effect variable parameter estimates.
+This is required to compute the degrees of freedom for the fixed effect parameter inference.}
+\item{\code{CONVLIST:}}{\code{list} of \code{list} containing the parameter estimates and differences between current and previous
+iteration estimates at each model iteration. These are included for each fixed effect, random effect and variance component parameter.
+The list elements for each iteration are: \emph{ThetaDiff}, \emph{SigmaDiff}, \emph{beta}, \emph{u}, \emph{sigma}.}
+}
+}
+\description{
+Iteratively estimate GLMM fixed and random effect parameters, and variance
+component parameters using Fisher scoring based on the pseudo-likelihood
+approximation to a Normal log-likelihood. This function incorporates a user-defined
+covariance matrix, e.g. a kinship matrix for genetic analyses.
+}
+\details{
+Fit a NB-GLMM to the counts provided in \emph{y}. The model uses an iterative approach that
+switches between the joint fixed and random effect parameter inference, and the variance component
+estimation. A pseudo-likelihood approach is adopted to maximise the log-likelihood of the model
+given the parameter estimates. The fixed and random effect parameters are estimated using
+Henderson's mixed model equations, and the variance component parameters are then estimated with
+the specified solver, i.e. Fisher scoring, Haseman-Elston or constrained Haseman-Elston regression. As
+the domain of the variance components is [0, \code{Inf}], any negative variance component estimates will
+trigger the switch to the HE-NNLS solver until the model converges.
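As a reference for the mixed model equations named in these details, here is a toy, dense-matrix sketch of a single joint solve; all quantities are illustrative, and the package performs the equivalent sparse computation in C++:

```r
# Henderson's MME, one iteration: solve jointly for (beta, u)
# [ X'WX        X'WZ      ] [beta]   [X'Wy*]
# [ Z'WX   Z'WZ + G^{-1}  ] [u   ] = [Z'Wy*]
set.seed(42)
n <- 20
X <- cbind(1, rnorm(n))            # toy fixed effect design: intercept + 1 covariate
Z <- model.matrix(~ 0 + gl(4, 5))  # toy random effect design: one RE with 4 levels
W <- diag(n)                       # working weights (identity for illustration)
Ginv <- diag(rep(2, 4))            # toy inverse variance component matrix
ystar <- rnorm(n)                  # toy working (pseudo-data) vector
lhs <- rbind(cbind(t(X) %*% W %*% X, t(X) %*% W %*% Z),
             cbind(t(Z) %*% W %*% X, t(Z) %*% W %*% Z + Ginv))
rhs <- rbind(t(X) %*% W %*% ystar, t(Z) %*% W %*% ystar)
theta.hat <- solve(lhs, rhs)       # stacked (beta, u) estimates
theta.hat
```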
+}
+\examples{
+NULL
+
+}
+\author{
+Mike Morgan
+}
diff --git a/man/fitPLGlmm.Rd b/man/fitPLGlmm.Rd
new file mode 100644
index 0000000..16ec164
--- /dev/null
+++ b/man/fitPLGlmm.Rd
@@ -0,0 +1,118 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{fitPLGlmm}
+\alias{fitPLGlmm}
+\title{GLMM parameter estimation using pseudo-likelihood}
+\usage{
+fitPLGlmm(
+  Z,
+  X,
+  muvec,
+  offsets,
+  curr_beta,
+  curr_theta,
+  curr_u,
+  curr_sigma,
+  curr_G,
+  y,
+  u_indices,
+  theta_conv,
+  rlevels,
+  curr_disp,
+  REML,
+  maxit,
+  solver,
+  vardist
+)
+}
+\arguments{
+\item{Z}{mat - sparse matrix that maps random effect variable levels to
+observations}
+
+\item{X}{mat - sparse matrix that maps fixed effect variables to
+observations}
+
+\item{muvec}{vec vector of estimated phenotype means}
+
+\item{offsets}{vec vector of model offsets}
+
+\item{curr_beta}{vec vector of initial beta estimates}
+
+\item{curr_theta}{vec vector of initial parameter estimates}
+
+\item{curr_u}{vec of initial u estimates}
+
+\item{curr_sigma}{vec of initial sigma estimates}
+
+\item{curr_G}{mat c X c matrix of variance components}
+
+\item{y}{vec of observed counts}
+
+\item{u_indices}{List a List, each element contains the indices of Z relevant
+to each RE and all its levels}
+
+\item{theta_conv}{double Convergence tolerance for parameter estimates}
+
+\item{rlevels}{List containing mapping of RE variables to individual
+levels}
+
+\item{curr_disp}{double Dispersion parameter estimate}
+
+\item{REML}{bool - use REML for variance component estimation}
+
+\item{maxit}{int maximum number of iterations if theta_conv is FALSE}
+
+\item{solver}{string which solver to use - either HE (Haseman-Elston regression) or Fisher scoring}
+
+\item{vardist}{string which variance form to use NB = negative binomial, P=Poisson [not yet implemented]}
+}
+\value{
+A \code{list} containing the following elements (note: return types are dictated by Rcpp, so the R
+types are described here):
+\describe{
+\item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.}
+\item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best
+linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.}
+\item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable.}
+\item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.}
+\item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. Is strictly <= \code{max.iter}.}
+\item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line}
+\item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.}
+\item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the hessian, i.e.
the square roots of the diagonal elements.}
+\item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.}
+\item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.}
+\item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.}
+\item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance
+component.}
+\item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.}
+\item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.}
+\item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1.}
+\item{\code{VCOV:}}{\code{matrix} of the variance-covariance estimates for all model fixed and random effect parameters.
+This is required to compute the degrees of freedom for the fixed effect parameter inference.}
+\item{\code{CONVLIST:}}{\code{list} of \code{list} containing the parameter estimates and the differences between the current and previous
+iteration estimates at each model iteration. These are included for each fixed effect, random effect and variance component parameter.
+The list elements for each iteration are: \emph{ThetaDiff}, \emph{SigmaDiff}, \emph{beta}, \emph{u}, \emph{sigma}.}
+}
+}
+\description{
+Iteratively estimate GLMM fixed and random effect parameters, and variance
+component parameters using Fisher scoring based on the pseudo-likelihood
+approximation to a Normal log-likelihood.
+}
+\details{
+Fit a NB-GLMM to the counts provided in \emph{y}. The model uses an iterative approach that
+switches between the joint fixed and random effect parameter inference, and the variance component
+estimation. A pseudo-likelihood approach is adopted to maximise the log-likelihood of the model
+given the parameter estimates. The fixed and random effect parameters are estimated using
+Henderson's mixed model equations, and the variance component parameters are then estimated with
+the specified solver, i.e. Fisher scoring, Haseman-Elston or constrained Haseman-Elston regression. As
+the domain of the variance components is [0, +\code{Inf}], any negative variance component estimates will
+trigger the switch to the HE-NNLS solver until the model converges.
+}
+\examples{
+NULL
+
+}
+\author{
+Mike Morgan
+}
diff --git a/man/glmmControl.defaults.Rd b/man/glmmControl.defaults.Rd
new file mode 100644
index 0000000..c8a7012
--- /dev/null
+++ b/man/glmmControl.defaults.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/glmm.R
+\name{glmmControl.defaults}
+\alias{glmmControl.defaults}
+\title{glmm control default values}
+\usage{
+glmmControl.defaults(...)
+}
+\arguments{
+\item{...}{see \code{fitGLMM} for details}
+}
+\value{
+\code{list} containing the default values for the GLMM solver. This can be saved in the
+user environment and then passed to \link{testNhoods} directly to modify the convergence
+criteria of the solver that is used.
+\describe{
+\item{\code{theta.tol:}}{\code{numeric} scalar that sets the convergence threshold for the
+parameter inference - this is applied globally to fixed and random effect parameters, and
+to the variance estimates.}
+\item{\code{max.iter:}}{\code{numeric} scalar that sets the maximum number of iterations that
+the NB-GLMM will run for.}
+\item{\code{solver:}}{\code{character} scalar that sets the solver to use. Valid values are
+\emph{Fisher}, \emph{HE} or \emph{HE-NNLS}.
See \link{fitGLMM} for details.}
+}
+}
+\description{
+This will give the default values for the GLMM solver.
+}
+\details{
+The default value for the parameter estimation convergence tolerance is 1e-6, and the
+maximum number of iterations is 100. In practice, if the solver converges it generally does
+so fairly quickly on moderately well-conditioned problems. The default solver is Fisher
+scoring, but this will switch (with a warning produced) to the NNLS Haseman-Elston solver
+if negative variance estimates are found.
+}
+\examples{
+mmcontrol <- glmmControl.defaults()
+mmcontrol
+mmcontrol$solver <- "HE-NNLS"
+mmcontrol
+
+}
+\author{
+Mike Morgan
+}
diff --git a/man/graphSpatialFDR.Rd b/man/graphSpatialFDR.Rd
index 1489fdb..431bbcd 100644
--- a/man/graphSpatialFDR.Rd
+++ b/man/graphSpatialFDR.Rd
@@ -46,7 +46,7 @@ Each neighbourhood is weighted according to the weighting scheme defined.
k-distance uses the distance to the kth nearest neighbour of the index vertex,
neighbour-distance uses the average within-neighbourhood Euclidean distance in
reduced dimensional space, max uses the largest within-neighbourhood distance
-from the index vertex, and graph-overlap uses the total number of cells overlapping between
+from the index vertex, and graph-overlap uses the total number of cells overlapping between
neighborhoods (distance-independent measure). The frequency-weighted version of the BH
method is then applied to the p-values, as in \code{cydar}.
}
diff --git a/man/initialiseG.Rd b/man/initialiseG.Rd
new file mode 100644
index 0000000..a1e9439
--- /dev/null
+++ b/man/initialiseG.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/glmm.R
+\name{initialiseG}
+\alias{initialiseG}
+\title{Construct the initial G matrix}
+\usage{
+initialiseG(cluster_levels, sigmas, Kin = NULL)
+}
+\arguments{
+\item{cluster_levels}{A \code{list} containing the random effect levels for each variable}
+
+\item{sigmas}{A \code{matrix} of c X 1, i.e. a column vector, containing the variance component estimates}
+
+\item{Kin}{A \code{matrix} containing a user-supplied covariance matrix}
+}
+\value{
+\code{matrix} of the full broadcast variance component estimates.
+}
+\description{
+This function maps the variance estimates onto the full \code{c x q} levels for each random effect. This
+ensures that the matrices commute in the NB-GLMM solver. This function is included for reference, and
+should not be used directly.
+}
+\details{
+Broadcast the variance component estimates to the full \code{c\*q x c\*q} matrix.
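+
+As an illustrative sketch only (not the exact implementation), for two random effects with
+\code{q1} and \code{q2} levels the broadcast matrix is block diagonal, with each variance
+component repeated across the levels of its variable:
+
+\preformatted{
+q1 <- 3; q2 <- 2
+sigmas <- c(0.5, 0.1)
+G <- diag(c(rep(sigmas[1], q1), rep(sigmas[2], q2)))
+}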
+}
+\examples{
+data(sim_nbglmm)
+random.levels <- list("RE1"=paste("RE1", levels(as.factor(sim_nbglmm$RE1)), sep="_"),
+                      "RE2"=paste("RE2", levels(as.factor(sim_nbglmm$RE2)), sep="_"))
+rand.sigma <- matrix(runif(2), ncol=1)
+rownames(rand.sigma) <- names(random.levels)
+big.G <- initialiseG(random.levels, rand.sigma)
+dim(big.G)
+
+}
+\author{
+Mike Morgan & Alice Kluzer
+}
diff --git a/man/initializeFullZ.Rd b/man/initializeFullZ.Rd
new file mode 100644
index 0000000..31e208e
--- /dev/null
+++ b/man/initializeFullZ.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/glmm.R
+\name{initializeFullZ}
+\alias{initializeFullZ}
+\title{Construct the full Z matrix}
+\usage{
+initializeFullZ(Z, cluster_levels, stand.cols = FALSE)
+}
+\arguments{
+\item{Z}{An \code{n x c} matrix containing the numeric or character levels}
+
+\item{cluster_levels}{A \code{list} that maps the column names of Z onto the individual levels}
+
+\item{stand.cols}{A logical scalar that determines if Z*, the row-centred and scaled version
+of the full Z matrix, should be computed}
+}
+\value{
+\code{matrix} Fully broadcast Z matrix with one column per random effect level for all random effect variables
+in the model.
+}
+\description{
+Using a simplified version of the \code{n x c} Z matrix, with one column per variable, construct the fully broadcast
+\code{n x (c*q)} binary matrix that maps each individual onto the random effect variable levels. It is not intended
+for this function to be called by the user directly, but it can be useful to debug mappings between random effect
+levels and input variables.
+}
+\details{
+To make sure that matrices commute it is necessary to construct the full \code{n x c*q} matrix. This is a binary
+matrix where each level of each random effect occupies a column, and the samples/observations are mapped onto
+the correct levels based on the input Z.
+}
+\examples{
+data(sim_nbglmm)
+random.levels <- list("RE1"=paste("RE1", levels(as.factor(sim_nbglmm$RE1)), sep="_"),
+                      "RE2"=paste("RE2", levels(as.factor(sim_nbglmm$RE2)), sep="_"))
+Z <- as.matrix(data.frame("RE1"=paste("RE1", as.numeric(sim_nbglmm$RE1), sep="_"),
+                          "RE2"=paste("RE2", as.numeric(sim_nbglmm$RE2), sep="_")))
+fullZ <- initializeFullZ(Z, random.levels)
+dim(Z)
+dim(fullZ)
+
+}
+\author{
+Mike Morgan & Alice Kluzer
+}
diff --git a/man/matrix.trace.Rd b/man/matrix.trace.Rd
new file mode 100644
index 0000000..ec41a00
--- /dev/null
+++ b/man/matrix.trace.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/glmm.R
+\name{matrix.trace}
+\alias{matrix.trace}
+\title{Compute the trace of a matrix}
+\usage{
+matrix.trace(x)
+}
+\arguments{
+\item{x}{A \code{matrix}}
+}
+\value{
+\code{numeric} scalar of the matrix trace.
+}
+\description{
+Exactly what it says on the tin - compute the sum of the matrix diagonal.
+}
+\details{
+It computes the matrix trace of a square matrix.
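+
+This is equivalent to \code{sum(diag(x))} in base R:
+
+\preformatted{
+m <- matrix(runif(9), ncol=3, nrow=3)
+all.equal(matrix.trace(m), sum(diag(m)))
+}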
+}
+\examples{
+matrix.trace(matrix(runif(9), ncol=3, nrow=3))
+
+}
+\author{
+Mike Morgan
+}
diff --git a/man/matrixORMatrix-class.Rd b/man/matrixORMatrix-class.Rd
deleted file mode 100644
index 05bc2e9..0000000
--- a/man/matrixORMatrix-class.Rd
+++ /dev/null
@@ -1,35 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/AllClasses.R
-\docType{class}
-\name{matrixORMatrix-class}
-\alias{matrixORMatrix-class}
-\title{The Milo container class}
-\description{
-The Milo container class
-}
-\section{Slots}{
-
-\describe{
-\item{\code{graph}}{An igraph object that represents the kNN graph}
-
-\item{\code{nhoods}}{A CxN binary sparse matrix mapping cells to the neighbourhoods they belong to}
-
-\item{\code{nhoodDistances}}{An list of PxN sparse matrices of Euclidean distances
-between vertices in each neighbourhood, one matrix per neighbourhood}
-
-\item{\code{nhoodCounts}}{An NxM sparse matrix of cells counts in each neighourhood
-across M samples}
-
-\item{\code{nhoodIndex}}{A list of the index vertices for each neighbourhood}
-
-\item{\code{nhoodExpression}}{An GxN matrix of genes X neighbourhoods containing
-average gene expression levels across cells in each neighbourhood}
-
-\item{\code{nhoodReducedDim}}{a list of reduced dimensional representations of
-neighbourhoods, including projections into lower dimension space}
-
-\item{\code{nhoodGraph}}{an igraph object that represents the graph of neighbourhoods}
-
-\item{\code{.k}}{A hidden slot that stores the value of k used for graph building}
-}}
-
diff --git a/man/methods.Rd b/man/methods.Rd
index f630c12..285beb8 100644
--- a/man/methods.Rd
+++ b/man/methods.Rd
@@ -42,6 +42,9 @@
\alias{show}
\alias{show,Milo-method}
\title{Get and set methods for Milo objects}
+\value{
+See individual methods for return values
+}
\description{
Get and set methods for Milo object slots. Generally speaking these methods
are used internally, but they allow the user to assign their own externally computed
diff --git a/man/miloR-package.Rd b/man/miloR-package.Rd
index a4b8c4f..970cb41 100644
--- a/man/miloR-package.Rd
+++ b/man/miloR-package.Rd
@@ -3,6 +3,9 @@
\name{miloR-package}
\alias{miloR-package}
\title{The miloR package}
+\value{
+The miloR package
+}
\description{
The \pkg{miloR} package provides modular functions to perform differential
abundance testing on replicated single-cell experiments. For details please
diff --git a/man/miloR.Rd b/man/miloR.Rd
new file mode 100644
index 0000000..caafc01
--- /dev/null
+++ b/man/miloR.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/miloR.R
+\docType{package}
+\name{miloR}
+\alias{miloR}
+\title{miloR}
+\description{
+Milo performs single-cell differential abundance testing. Cell states are modelled
+as representative neighbourhoods on a nearest neighbour graph. Hypothesis testing is performed using a
+negative binomial generalized linear model.
+}
diff --git a/man/sim_family.Rd b/man/sim_family.Rd
new file mode 100644
index 0000000..9a17d7a
--- /dev/null
+++ b/man/sim_family.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sim_family.R
+\docType{data}
+\name{sim_family}
+\alias{sim_family}
+\title{sim_family}
+\format{
+A list containing a \code{data.frame} in the "DF" slot containing the
+mean counts and meta-data, and a \code{matrix} containing the kinship matrix
+across all families in the "IBD" slot.
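+For example, after loading the data the two slots can be inspected with:
+\preformatted{
+data(sim_family)
+names(sim_family)     # the "DF" and "IBD" slots
+dim(sim_family$IBD)   # square kinship matrix, one row/column per individual
+}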
+}
+\usage{
+data(sim_family)
+}
+\description{
+Simulated counts data from a series of simulated family trees
+}
+\details{
+Data are simulated counts from 30 families and include X and Z design matrices,
+as well as a single large kinship matrix. Kinships between family members are
+dictated by the simulated family, i.e. sibs=0.5, parent-sib=0.5, sib-grandparent=0.25, etc.
+These kinships, along with 2 other random effects, are used to induce a defined covariance
+between simulated observations as such:
+
+Z := random effect design matrix, n X q
+G := matrix of variance components, including kinship matrix
+
+L := Chol(ZGZ^T), i.e. LL^T = ZGZ^T, the Cholesky factor of the random effect contribution
+to the sample covariance
+Ysim := simulated means based on exp(offset + Xbeta + Zb)
+Y = L Ysim := simulated means with the defined covariance
+}
+\examples{
+NULL
+
+}
+\keyword{datasets}
diff --git a/man/sim_nbglmm.Rd b/man/sim_nbglmm.Rd
new file mode 100644
index 0000000..44b49b6
--- /dev/null
+++ b/man/sim_nbglmm.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sim_nbglmm.R
+\docType{data}
+\name{sim_nbglmm}
+\alias{sim_nbglmm}
+\title{sim_nbglmm}
+\format{
+A \code{data.frame} \emph{sim_nbglmm} containing the following columns:
+\describe{
+\item{\code{Mean:}}{\code{numeric} containing the base mean computed as the linear combination of the
+simulated fixed and random effect weights multiplied by their respective weight matrices.}
+\item{\code{Mean.Count:}}{\code{numeric} containing the integer count values randomly sampled from a negative
+binomial distribution with mean = \emph{Mean} and dispersion = \emph{r}.}
+\item{\code{r:}}{\code{numeric} containing the dispersion value used to simulate the integer counts in
+\emph{Mean.Count}.}
+\item{\code{Intercept:}}{\code{numeric} of all 1s which can be used to set the intercept term in the X design
+matrix.}
+\item{\code{FE1:}}{\code{numeric} a binary fixed effect variable taking on values [0, 1]}
+\item{\code{FE2:}}{\code{numeric} a continuous fixed effect variable}
+\item{\code{RE1:}}{\code{numeric} a random effect variable with 10 levels}
+\item{\code{RE2:}}{\code{numeric} a random effect variable with 7 levels}
+}
+}
+\usage{
+data(sim_nbglmm)
+}
+\description{
+Simulated counts data from an NB-GLMM for a single trait
+}
+\details{
+Data are simulated counts from 50 samples in a single data frame, from which the
+X and Z design matrices can be constructed (see examples). There are 2 random effects and 2 fixed
+effect variables used to simulate the count trait.
+}
+\examples{
+data(sim_nbglmm)
+head(sim_nbglmm)
+
+}
+\keyword{datasets}
diff --git a/man/testNhoods.Rd b/man/testNhoods.Rd
index 5220d22..522113b 100644
--- a/man/testNhoods.Rd
+++ b/man/testNhoods.Rd
@@ -16,11 +16,17 @@ argument}

\item{design.df}{A \code{data.frame} containing meta-data to which \code{design} refers to}

+\item{kinship}{(optional) An n X n \code{matrix} containing pair-wise relationships between
+observations, such as expected relationships or computed from SNPs/SNVs/other genetic variants.
+Row names and column names should correspond to the column names of \code{nhoods(x)} and rownames
+of \code{design.df}.}
+
\item{min.mean}{A scalar used to threshold neighbourhoods on the minimum average cell counts
across samples.}

\item{model.contrasts}{A string vector that defines the contrasts used to perform
-DA testing.}
+DA testing. For a specific comparison we recommend a single contrast be passed to
+\code{testNhoods}.
More details can be found in the vignette \code{milo_contrasts}.}

\item{fdr.weighting}{The spatial FDR weighting scheme to use. Choice from max,
neighbour-distance, graph-overlap or k-distance (default). If \code{none} is passed no
@@ -38,8 +44,39 @@ method by Anders & Huber, 2010, to compute normalisation factors relative to a r
the geometric mean across samples. The latter methods provides a degree of robustness against
false positives when there are very large compositional differences between samples.}

+\item{cell.sizes}{A named numeric vector of cell numbers per experimental sample. Names should correspond
+to the columns of \code{nhoodCounts}. This can be used to define the model normalisation factors based on
+a set of numbers instead of the \code{colSums(nhoodCounts(x))}. The example use-case is when performing an
+analysis of a subset of nhoods while retaining the need to normalise based on the numbers of cells
+collected for each experimental sample to avoid compositional biases. Infinite or NA values will give an error.}
+
\item{reduced.dim}{A character scalar referring to the reduced dimensional slot used to compute
distances for the spatial FDR. This should be the same as used for graph building.}
+
+\item{REML}{A logical scalar that controls the variance component behaviour to use either restricted maximum
+likelihood (REML) or maximum likelihood (ML). The former is recommended to account for the bias in the ML
+variance estimates.}
+
+\item{glmm.solver}{A character scalar that determines which GLMM solver is applied. Must be one of: Fisher, HE
+or HE-NNLS. HE or HE-NNLS are recommended when supplying a user-defined covariance matrix.}
+
+\item{max.iters}{A scalar that determines the maximum number of iterations to run the GLMM solver if it does
+not reach the convergence tolerance threshold.}
+
+\item{max.tol}{A scalar that determines the GLMM solver convergence tolerance. It is recommended to keep
+this number small to provide some confidence that the parameter estimates are at least in a feasible region
+and close to a \emph{local} optimum.}
+
+\item{subset.nhoods}{A character, numeric or logical vector that will subset the analysis to the specific nhoods. If
+a character vector these should correspond to row names of \code{nhoodCounts}. If a logical vector then
+these should have the same \code{length} as \code{nrow} of \code{nhoodCounts}. If numeric, then these are assumed
+to correspond to indices of \code{nhoodCounts} - if the maximal index is greater than \code{nrow(nhoodCounts(x))}
+an error will be produced.}
+
+\item{fail.on.error}{A logical scalar that determines the behaviour of the error reporting. Used for debugging only.}
+
+\item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying the arguments for parallelisation. By default
+this will evaluate using \code{SerialParam()}. See \code{details} on how to use parallelisation in \code{testNhoods}.}
}
\value{
A \code{data.frame} of model results, which contain:
@@ -73,6 +110,15 @@ function sets the \code{lib.sizes} to the colSums(x), and uses the
Quasi-Likelihood F-test in \code{glmQLFTest} for DA testing. FDR correction
is performed separately as the default multiple-testing correction is
inappropriate for neighbourhoods with overlapping cells.
+The GLMM testing cannot be performed using \code{edgeR}, however, a separate
+function \code{fitGLMM} can be used to fit a mixed effect model to each
+nhood (see \code{fitGLMM} docs for details).
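+
+As a sketch only (the objects \code{milo}, \code{meta.df} and \code{kin} here are hypothetical),
+a mixed model run with a random effect and a kinship matrix might look like:
+
+\preformatted{
+res <- testNhoods(milo, design = ~ FE1 + (1 | RE1), design.df = meta.df,
+                  kinship = kin, glmm.solver = "HE-NNLS", REML = TRUE,
+                  BPPARAM = BiocParallel::SerialParam())
+}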
+
+Parallelisation is currently only enabled for the NB-GLMM and uses the BiocParallel paradigm. In
+general the GLM implementation in \code{glmQLFit} is sufficiently fast that it does not require
+parallelisation. Parallelisation requires the user to pass a \linkS4class{BiocParallelParam} object
+with the parallelisation arguments contained therein. This relies on the user to specify how the
+parallelisation is performed - for details see the \code{BiocParallel} package.
}
\examples{
library(SingleCellExperiment)
diff --git a/miloR.R b/miloR.R
new file mode 100644
index 0000000..dfc826b
--- /dev/null
+++ b/miloR.R
@@ -0,0 +1,11 @@
+#' miloR
+#'
+#' Milo performs single-cell differential abundance testing. Cell states are modelled
+#' as representative neighbourhoods on a nearest neighbour graph. Hypothesis testing is performed using a
+#' negative binomial generalized linear model.
+#'
+#' @docType package
+#' @importFrom Rcpp evalCpp
+#' @useDynLib miloR
+#' @name miloR
+NULL
diff --git a/src/Makevars b/src/Makevars
new file mode 100644
index 0000000..17b95df
--- /dev/null
+++ b/src/Makevars
@@ -0,0 +1,5 @@
+#PKG_CXXFLAGS = $(CXX11STD) -Wall -pedantic
+CXX11STD = CXX11
+PKG_CXXFLAGS = $(CFLAGS) $(CXX11STD)
+PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
+ALL_CXXFLAGS = $(R_XTRA_CXXFLAGS) $(PKG_CXXFLAGS) $(CXXPICFLAGS) $(SHLIB_CXXFLAGS) $(CXXFLAGS)
diff --git a/src/Makevars.win b/src/Makevars.win
new file mode 100644
index 0000000..9c6c4da
--- /dev/null
+++ b/src/Makevars.win
@@ -0,0 +1,2 @@
+PKG_CXXFLAGS = -std=c++11
+PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS)
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
new file mode 100644
index 0000000..62cca5b
--- /dev/null
+++ b/src/RcppExports.cpp
@@ -0,0 +1,82 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <RcppArmadillo.h>
+#include <RcppEigen.h>
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+#ifdef RCPP_USE_GLOBAL_ROSTREAM
+Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
+#endif
+
+// fitGeneticPLGlmm
+List fitGeneticPLGlmm(const arma::mat& Z, const arma::mat& X, const arma::mat& K, arma::vec muvec, arma::vec offsets, arma::vec curr_beta, arma::vec curr_theta, arma::vec curr_u, arma::vec curr_sigma, arma::mat curr_G, const arma::vec& y, List u_indices, double theta_conv, const List& rlevels, double curr_disp, const bool& REML, const int& maxit, std::string solver, std::string vardist);
+RcppExport SEXP _miloR_fitGeneticPLGlmm(SEXP ZSEXP, SEXP XSEXP, SEXP KSEXP, SEXP muvecSEXP, SEXP offsetsSEXP, SEXP curr_betaSEXP, SEXP curr_thetaSEXP, SEXP curr_uSEXP, SEXP curr_sigmaSEXP, SEXP curr_GSEXP, SEXP ySEXP, SEXP u_indicesSEXP, SEXP theta_convSEXP, SEXP rlevelsSEXP, SEXP curr_dispSEXP, SEXP REMLSEXP, SEXP maxitSEXP, SEXP solverSEXP, SEXP vardistSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const arma::mat& >::type Z(ZSEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP);
+    Rcpp::traits::input_parameter< const arma::mat& >::type K(KSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type muvec(muvecSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type offsets(offsetsSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type curr_beta(curr_betaSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type curr_theta(curr_thetaSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type curr_u(curr_uSEXP);
+    Rcpp::traits::input_parameter< arma::vec >::type
curr_sigma(curr_sigmaSEXP); + Rcpp::traits::input_parameter< arma::mat >::type curr_G(curr_GSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type y(ySEXP); + Rcpp::traits::input_parameter< List >::type u_indices(u_indicesSEXP); + Rcpp::traits::input_parameter< double >::type theta_conv(theta_convSEXP); + Rcpp::traits::input_parameter< const List& >::type rlevels(rlevelsSEXP); + Rcpp::traits::input_parameter< double >::type curr_disp(curr_dispSEXP); + Rcpp::traits::input_parameter< const bool& >::type REML(REMLSEXP); + Rcpp::traits::input_parameter< const int& >::type maxit(maxitSEXP); + Rcpp::traits::input_parameter< std::string >::type solver(solverSEXP); + Rcpp::traits::input_parameter< std::string >::type vardist(vardistSEXP); + rcpp_result_gen = Rcpp::wrap(fitGeneticPLGlmm(Z, X, K, muvec, offsets, curr_beta, curr_theta, curr_u, curr_sigma, curr_G, y, u_indices, theta_conv, rlevels, curr_disp, REML, maxit, solver, vardist)); + return rcpp_result_gen; +END_RCPP +} +// fitPLGlmm +List fitPLGlmm(const arma::mat& Z, const arma::mat& X, arma::vec muvec, arma::vec offsets, arma::vec curr_beta, arma::vec curr_theta, arma::vec curr_u, arma::vec curr_sigma, arma::mat curr_G, const arma::vec& y, List u_indices, double theta_conv, const List& rlevels, double curr_disp, const bool& REML, const int& maxit, std::string solver, std::string vardist); +RcppExport SEXP _miloR_fitPLGlmm(SEXP ZSEXP, SEXP XSEXP, SEXP muvecSEXP, SEXP offsetsSEXP, SEXP curr_betaSEXP, SEXP curr_thetaSEXP, SEXP curr_uSEXP, SEXP curr_sigmaSEXP, SEXP curr_GSEXP, SEXP ySEXP, SEXP u_indicesSEXP, SEXP theta_convSEXP, SEXP rlevelsSEXP, SEXP curr_dispSEXP, SEXP REMLSEXP, SEXP maxitSEXP, SEXP solverSEXP, SEXP vardistSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::mat& >::type Z(ZSEXP); + Rcpp::traits::input_parameter< const arma::mat& >::type X(XSEXP); + Rcpp::traits::input_parameter< arma::vec >::type muvec(muvecSEXP); + Rcpp::traits::input_parameter< arma::vec >::type offsets(offsetsSEXP); + Rcpp::traits::input_parameter< arma::vec >::type curr_beta(curr_betaSEXP); + Rcpp::traits::input_parameter< arma::vec >::type curr_theta(curr_thetaSEXP); + Rcpp::traits::input_parameter< arma::vec >::type curr_u(curr_uSEXP); + Rcpp::traits::input_parameter< arma::vec >::type curr_sigma(curr_sigmaSEXP); + Rcpp::traits::input_parameter< arma::mat >::type curr_G(curr_GSEXP); + Rcpp::traits::input_parameter< const arma::vec& >::type y(ySEXP); + Rcpp::traits::input_parameter< List >::type u_indices(u_indicesSEXP); + Rcpp::traits::input_parameter< double >::type theta_conv(theta_convSEXP); + Rcpp::traits::input_parameter< const List& >::type rlevels(rlevelsSEXP); + Rcpp::traits::input_parameter< double >::type curr_disp(curr_dispSEXP); + Rcpp::traits::input_parameter< const bool& >::type REML(REMLSEXP); + Rcpp::traits::input_parameter< const int& >::type maxit(maxitSEXP); + Rcpp::traits::input_parameter< std::string >::type solver(solverSEXP); + Rcpp::traits::input_parameter< std::string >::type vardist(vardistSEXP); + rcpp_result_gen = Rcpp::wrap(fitPLGlmm(Z, X, muvec, offsets, curr_beta, curr_theta, curr_u, curr_sigma, curr_G, y, u_indices, theta_conv, rlevels, curr_disp, REML, maxit, solver, vardist)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_miloR_fitGeneticPLGlmm", (DL_FUNC) &_miloR_fitGeneticPLGlmm, 19}, + {"_miloR_fitPLGlmm", (DL_FUNC) &_miloR_fitPLGlmm, 18}, + {NULL, NULL, 0} +}; + +RcppExport 
void R_init_miloR(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/src/computeMatrices.cpp b/src/computeMatrices.cpp
new file mode 100644
index 0000000..ce27ea3
--- /dev/null
+++ b/src/computeMatrices.cpp
@@ -0,0 +1,447 @@
+#include "computeMatrices.h"
+#include <RcppArmadillo.h>
+#include <RcppEigen.h>
+#include "utils.h"
+// [[Rcpp::depends(RcppArmadillo)]]
+using namespace Rcpp;
+
+arma::vec computeYStar(arma::mat X, arma::vec curr_beta, arma::mat Z, arma::mat Dinv, arma::vec curr_u, arma::vec y,
+                       arma::vec offsets){
+    // compute pseudovariable
+    int n = X.n_rows;
+    arma::vec ystar(n);
+    ystar = (offsets + (X * curr_beta) + (Z * curr_u)) + Dinv * (y - exp(offsets + (X * curr_beta) + (Z * curr_u)));
+    return ystar;
+}
+
+
+arma::mat computeVmu(arma::vec mu, double r, std::string vardist){
+    int n = mu.size();
+    arma::mat Vmu(n, n);
+
+    if(vardist == "NB"){
+        Vmu = computeVmuNB(mu, r);
+    } else if(vardist == "P"){
+        Vmu = computeVmuPoisson(mu);
+    }
+
+    return Vmu;
+}
+
+
+arma::mat computeVmuNB(arma::vec mu, double r){
+    int n = mu.size();
+    arma::mat Vmu(n, n);
+
+    Vmu.diag() = (pow(mu, 2)/r) + mu;
+
+    return Vmu;
+}
+
+arma::mat computeVmuPoisson(arma::vec mu){
+    int n = mu.size();
+    arma::mat Vmu(n, n);
+
+    Vmu.diag() = mu;
+
+    return Vmu;
+}
+
+arma::mat computeW(double disp, arma::mat Dinv, std::string vardist){
+    int n = Dinv.n_cols;
+    arma::mat W(n, n);
+
+    if(vardist == "NB"){
+        W = computeWNB(disp, Dinv);
+    } else if(vardist == "P"){
+        W = computeWPoisson(Dinv);
+    }
+
+    return W;
+}
+
+
+arma::mat computeWNB(double disp, arma::mat Dinv){
+    int n = Dinv.n_cols;
+    arma::mat W(n, n);
+
+    // this should be trivial as it is a diagonal matrix
+    // D^-1 * V_mu * D^-1 simplifies to a diagonal matrix
+    // of 1/disp + 1/mu_i which is ((1/phi) * I) + Dinv <- we don't need any multiplication!!
+    arma::mat idisp = arma::eye(n, n);
+    idisp = (1/disp) * idisp;
+    W = idisp + Dinv;
+    return W;
+}
+
+
+arma::mat computeWPoisson(arma::mat Dinv){
+    int n = Dinv.n_cols;
+    arma::mat W(n, n);
+
+    // this should be trivial as it is a diagonal matrix
+    // in the Poisson case this simplifies to 1/mu
+    W = Dinv;
+    return W;
+}
+
+
+arma::mat computeVStar(arma::mat Z, arma::mat G, arma::mat W){
+    int n = Z.n_rows;
+    arma::mat vstar(n, n);
+    // Z.brief_print("Z\n");
+    vstar = (Z * G * Z.t()) + W;
+
+    return vstar;
+}
+
+
+arma::mat computePREML (arma::mat Vsinv, arma::mat X){
+    int n = Vsinv.n_cols;
+
+    arma::mat P(n, n);
+    arma::mat spvsinv(Vsinv);
+    arma::mat sinternal(X.t() * Vsinv * X);
+    arma::mat _toinvert(sinternal);
+    arma::mat _sintP(inv(_toinvert));
+
+    P = Vsinv - (Vsinv * X * _sintP * X.t() * Vsinv); // dense matrix version is faster than sparse here ¯\_(ツ)_/¯
+
+    return P;
+}
+
+
+arma::mat initialiseG (List u_indices, arma::vec sigmas){
+    // construct the correct size of G given the random effects and variance components
+    // the independent sigmas go on the diagonal and the off-diagonal are the crossed/interactions
+    // this doesn't actually handle the off-diagonal interactions yet
+    int c = u_indices.size();
+    int stot = 0;
+
+    // sum total number of levels
+    for(int i=0; i < c; i++){
+        StringVector _ir = u_indices(i);
+        stot += _ir.size();
+    }
+
+    arma::mat G(stot, stot);
+    G = G.zeros();
+
+    // this only fills the diagonal elements of G
+    unsigned long i = 0;
+    unsigned long j = 0;
+
+    for(unsigned long k=0; k < stot; k++){
+        i = k;
+        j = k;
+        for(int x = 0; x < c; x++){
+            arma::uvec _r = u_indices(x);
+            unsigned long q = _r.size();
+            double _s = sigmas(x);
+
+            for(int l=0; l < q; l++){
+                unsigned long _lu = _r(l);
+
+                if(k == _lu - 1){
+                    G(i, j) = _s;
+                }
+            }
+        }
+    }
+
+    return G;
+}
+
+
+arma::mat subMatG (double sigma, arma::mat broadcast){
+    // construct the submatrix for the input variance component
+    int nrow = broadcast.n_rows;
+    int ncol = broadcast.n_cols;
+
+    arma::mat subG(nrow, ncol);
+    subG = sigma * broadcast;
+
+    return subG;
+}
+
+
+
+arma::mat initialiseG_G (List u_indices, arma::vec sigmas, arma::mat Kin){
+    // construct the correct size of G given the random effects and variance components
+    // the independent sigmas go on the diagonal and the off-diagonal are the crossed/interactions
+    // this doesn't actually handle the off-diagonal interactions yet
+    // for the arbitrary covariance case multiply by Kin
+    // is the "genetic" sigma always last?
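+    // G is assembled block-by-block below: an identity block scaled by its sigma for each
+    // standard RE, with the sigma-scaled kinship matrix Kin as the final block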
+ int c = u_indices.size(); + Rcpp::List Glist(1); // to store the first G + + for(int x = 0; x < c; x++){ + // create the broadcast matrix + arma::uvec _r = u_indices(x); // the vector of indices of Z that map to the RE + unsigned long q = _r.size(); // the number of levels for the RE + double _s = sigmas(x); // the sigma of the RE + + arma::mat sG(q, q); + + if(x == c - 1){ + unsigned long n = Kin.n_cols; + if(q != n){ + stop("RE indices and dimensions of covariance do not match"); + } else{ + sG = subMatG(_s, Kin); + // sG.brief_print("sG\n"); + } + } else{ + // create the rxr identity matrix + arma::mat rEye(q, q, arma::fill::eye); + sG = subMatG(_s, rEye); + } + + // grow G at each iteration here + if(x == 0){ + Glist(0) = sG; + } else{ + unsigned long sg_cols = sG.n_cols; + unsigned long sg_rows = sG.n_rows; + + arma::mat G = Glist(0); + + unsigned long g_cols = G.n_cols; + unsigned long g_rows = G.n_rows; + + arma::mat gright(g_rows, sg_cols); + arma::mat gleft(sg_rows, g_cols); + + arma::mat top(g_rows, g_cols + sg_cols); + arma::mat bottom(sg_rows, sg_cols + g_cols); + + top = arma::join_rows(G, gright); + bottom = arma::join_rows(gleft, sG); + + arma::mat _G(sg_rows + g_rows, sg_cols + g_cols); + _G = arma::join_cols(top, bottom); + Glist(0) = _G; + } + } + + arma::mat G = Glist(0); + return G; +} + + +arma::mat invGmat_G (List u_indices, arma::vec sigmas, arma::mat Kin){ + // first construct the correct sized G, i.e. c x c, then broadcast this to all RE levels + // make little G inverse + int c = u_indices.size(); + int stot = 0; + + // sum total number of levels + for(int i=0; i < c; i++){ + StringVector _ir = u_indices(i); + stot += _ir.size(); + } + + // arma::uvec _Gindex(c); // G is always square + Rcpp::List Glist(1); // to store the first G + + for(int x = 0; x < c; x++){ + // create the broadcast matrix + arma::uvec _r = u_indices(x); // the vector of indices of Z that map to the RE + unsigned long q = _r.size(); // the number of levels for the RE + double _s = sigmas(x); // the sigma of the RE + double _sinv = 1/_s; + + arma::mat sG(q, q); + + if(x == c - 1){ + unsigned long n = Kin.n_cols; + if(q != n){ + stop("RE indices and dimensions of covariance do not match"); + } else{ + sG = subMatG(_sinv, Kin); // this needs to be Kin^-1 + } + } else{ + // create the rxr identity matrix + arma::mat rEye(q, q, arma::fill::eye); + sG = subMatG(_sinv, rEye); + } + + // grow G at each iteration here + if(x == 0){ + Glist(0) = sG; + } else{ + unsigned long sg_cols = sG.n_cols; + unsigned long sg_rows = sG.n_rows; + + arma::mat G = Glist(0); + + unsigned long g_cols = G.n_cols; + unsigned long g_rows = G.n_rows; + + arma::mat gright(g_rows, sg_cols); + arma::mat gleft(sg_rows, g_cols); + + arma::mat top(g_rows, g_cols + sg_cols); + arma::mat bottom(sg_rows, sg_cols + g_cols); + + top = arma::join_rows(G, gright); + bottom = arma::join_rows(gleft, sG); + + arma::mat _G(sg_rows + g_rows, sg_cols + g_cols); + _G = arma::join_cols(top, bottom); + Glist(0) = _G; + } + } + + arma::mat G = Glist(0); + return G; +} + + + +arma::mat invGmat (List u_indices, arma::vec sigmas){ + // first construct the correct sized G, i.e. 
c x c, then broadcast this to all RE levels
+    // make little G inverse
+    int c = u_indices.size();
+    int stot = 0;
+    arma::vec lsigma(c);
+
+    for(int k = 0; k < c; k++){
+        lsigma(k) = 1/sigmas(k);
+    }
+
+    // sum total number of levels
+    for(int i=0; i < c; i++){
+        StringVector _ir = u_indices(i);
+        stot += _ir.size();
+    }
+
+    arma::mat G(stot, stot);
+    G = G.zeros();
+
+    // this only fills the diagonal elements of G
+    unsigned long i = 0;
+    unsigned long j = 0;
+
+    for(unsigned long k=0; k < stot; k++){
+        i = k;
+        j = k;
+        for(int x = 0; x < c; x++){
+            arma::uvec _r = u_indices(x);
+            unsigned long q = _r.size();
+            double _s = lsigma(x);
+
+            for(int l=0; l < q; l++){
+                unsigned long _lu = _r(l);
+
+                if(k == _lu - 1){
+                    G(i, j) = _s;
+                }
+            }
+        }
+    }
+
+    return G;
+}
+
+
+// arma::mat makePCGFill(const List& u_indices, const arma::mat& Kinv){
+//     // this makes a matrix of the same dimension as Ginv but without
+//     // the variance components
+//
+//     // first construct the correct sized G, i.e. c x c, then broadcast this to all RE levels
+//     // make little G inverse
+//     int c = u_indices.size();
+//     int stot = 0;
+//
+//     // sum total number of levels
+//     for(int i=0; i < c; i++){
+//         StringVector _ir = u_indices(i);
+//         stot += _ir.size();
+//     }
+//
+//     // arma::uvec _Gindex(c); // G is always square
+//     Rcpp::List Glist(1); // to store the first G
+//
+//     for(int x = 0; x < c; x++){
+//         // create the broadcast matrix
+//         arma::uvec _r = u_indices(x); // the vector of indices of Z that map to the RE
+//         unsigned long q = _r.size(); // the number of levels for the RE
+//
+//         arma::mat sG(q, q);
+//
+//         if(x == c - 1){
+//             unsigned long n = Kinv.n_cols;
+//             if(q != n){
+//                 stop("RE indices and dimensions of covariance do not match");
+//             } else{
+//                 sG = Kinv; // sub in 1.0 for 1/sigma
+//             }
+//         } else{
+//             // create the rxr identity matrix
+//             arma::mat sG(q, q, arma::fill::eye);
+//         }
+//
+//         // grow G at each iteration here
+//         if(x == 0){
+//             unsigned long ig_cols = sG.n_cols;
+//             unsigned long ig_rows = sG.n_rows;
+//             Glist(0) = sG;
+//         } else{
+//             unsigned long sg_cols = sG.n_cols;
+//             unsigned long sg_rows = sG.n_rows;
+//
+//             arma::mat G = Glist(0);
+//
+//             unsigned long g_cols = G.n_cols;
+//             unsigned long g_rows = G.n_rows;
+//
+//             arma::mat gright(g_rows, sg_cols);
+//             arma::mat gleft(sg_rows, g_cols);
+//
+//             arma::mat top(g_rows, g_cols + sg_cols);
+//             arma::mat bottom(sg_rows, sg_cols + g_cols);
+//
+//             top = arma::join_rows(G, gright);
+//             bottom = arma::join_rows(gleft, sG);
+//
+//             arma::mat _G(sg_rows + g_rows, sg_cols + g_cols);
+//             _G = arma::join_cols(top, bottom);
+//             Glist(0) = _G;
+//         }
+//     }
+//
+//     arma::mat G = Glist(0);
+//     return G;
+// }
+
+
+arma::mat broadcastInverseMatrix(arma::mat matrix, const unsigned int& n){
+    // take the individual nxn matrices where n=N/2
+    arma::mat A(n, n);
+    unsigned int m = 2*n;
+    A = matrix.submat(0, 0, n-1, n-1);
+
+    // check for singular sub-matrix
+    double _rcond = arma::rcond(A);
+    bool is_singular;
+    is_singular = _rcond < 1e-9;
+
+    if(is_singular){
+        Rcpp::stop("Kinship sub-matrix is singular");
+    }
+
+    arma::mat Ainv(n, n);
+    Ainv = arma::inv(A);
+
+    arma::mat top(n, m, arma::fill::zeros);
+    top = arma::join_rows(Ainv, Ainv);
+    arma::mat bot(n, m, arma::fill::zeros);
+    bot = arma::join_rows(Ainv, Ainv);
+
+    arma::mat kinverse(m, m, arma::fill::zeros);
+    kinverse = arma::join_cols(top, bot);
+
+    return kinverse;
+}
+
diff --git a/src/computeMatrices.h b/src/computeMatrices.h
new file mode 100644
index 0000000..5e704f7
--- /dev/null
+++ b/src/computeMatrices.h
@@ -0,0 +1,25 @@
+#ifndef COMPUTEMATRICES_H
+#define COMPUTEMATRICES_H
+
+#include <RcppArmadillo.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+
+arma::vec computeYStar (arma::mat X, arma::vec curr_beta, arma::mat Z, arma::mat Dinv,
+                        arma::vec curr_u, arma::vec y, arma::vec offsets);
+arma::mat computeVmu (arma::vec mu, double r, std::string vardist);
+arma::mat computeVmuPoisson(arma::vec mu);
+arma::mat computeVmuNB(arma::vec mu, double r);
+arma::mat computeW (double disp, arma::mat Dinv, std::string vardist);
+arma::mat computeWNB(double disp, arma::mat Dinv);
+arma::mat computeWPoisson(arma::mat Dinv);
+arma::mat computeVStar (arma::mat Z, arma::mat G, arma::mat W);
+arma::mat computePREML (arma::mat Vsinv, arma::mat X);
+arma::mat initialiseG (Rcpp::List rlevels, arma::vec sigmas);
+arma::mat initialiseG_G (Rcpp::List u_indices, arma::vec sigmas, arma::mat Kin);
+arma::mat invGmat (Rcpp::List rlevels, arma::vec sigmas);
+arma::mat invGmat_G (Rcpp::List u_indices, arma::vec sigmas, arma::mat Kin);
+arma::mat subMatG (double sigma, arma::mat broadcast);
+// arma::mat makePCGFill(const Rcpp::List& u_indices, const arma::mat& Kinv);
+arma::mat broadcastInverseMatrix(arma::mat matrix, const unsigned int& n);
+
+#endif
diff --git a/src/fitGeneticPLGlmm.cpp b/src/fitGeneticPLGlmm.cpp
new file mode 100644
index 0000000..15bcff3
--- /dev/null
+++ b/src/fitGeneticPLGlmm.cpp
@@ -0,0 +1,381 @@
+#include <string>
+#include <RcppArmadillo.h>
+#include <RcppEigen.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+// [[Rcpp::depends(RcppEigen)]]
+#include "paramEst.h"
+#include "computeMatrices.h"
+#include "invertPseudoVar.h"
+#include "pseudovarPartial.h"
+#include "multiP.h"
+#include "inference.h"
+#include "utils.h"
+using namespace Rcpp;
+
+
+//' GLMM parameter estimation using pseudo-likelihood with a custom covariance matrix
+//'
+//' Iteratively estimate GLMM fixed and random effect parameters, and variance
+//' component parameters using Fisher scoring based on the pseudo-likelihood
+//' approximation to a Normal log-likelihood. This function incorporates a user-defined
+//' covariance matrix, e.g. a kinship matrix for genetic analyses.
+//'
+//' @param Z mat - sparse matrix that maps random effect variable levels to
+//' observations
+//' @param X mat - sparse matrix that maps fixed effect variables to
+//' observations
+//' @param K mat - sparse matrix that defines the known covariance patterns between
+//' individual observations. For example, a kinship matrix will then adjust for the
+//' known/estimated genetic relationships between observations.
+//' @param muvec vec vector of estimated phenotype means
+//' @param offsets vec vector of model offsets
+//' @param curr_theta vec vector of initial parameter estimates
+//' @param curr_beta vec vector of initial beta estimates
+//' @param curr_u vec of initial u estimates
+//' @param curr_sigma vec of initial sigma estimates
+//' @param curr_G mat c X c matrix of variance components
+//' @param y vec of observed counts
+//' @param u_indices List a List, each element contains the indices of Z relevant
+//' to each RE and all its levels
+//' @param theta_conv double Convergence tolerance for parameter estimates
+//' @param rlevels List containing mapping of RE variables to individual
+//' levels
+//' @param curr_disp double Dispersion parameter estimate
+//' @param REML bool - use REML for variance component estimation
+//' @param maxit int maximum number of iterations if theta_conv is FALSE
+//' @param solver string which solver to use - either HE (Haseman-Elston regression) or Fisher scoring
+//' @param vardist string which variance form to use: NB = negative binomial, P = Poisson [not yet implemented].
+//'
+//' @details Fit a NB-GLMM to the counts provided in \emph{y}. The model uses an iterative approach that
+//' switches between the joint fixed and random effect parameter inference, and the variance component
+//' estimation. A pseudo-likelihood approach is adopted to maximise the log-likelihood of the model
+//' given the parameter estimates. The fixed and random effect parameters are estimated using
+//' Henderson's mixed model equations, and the variance component parameters are then estimated with
+//' the specified solver, i.e. Fisher scoring, Haseman-Elston or constrained Haseman-Elston regression. As
+//' the domain of the variance components is [0, +\code{Inf}], any negative variance component estimates will
+//' trigger the switch to the HE-NNLS solver until the model converges.
+//'
+//' @return A \code{list} containing the following elements (note: return types are dictated by Rcpp, so the R
+//' types are described here):
+//' \describe{
+//' \item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.}
+//' \item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best
+//' linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.}
+//' \item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable. For this model the
+//' last variance component corresponds to the input \emph{K} matrix.}
+//' \item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.}
+//' \item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. It is strictly <= \code{max.iter}.}
+//' \item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line.}
+//' \item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.}
+//' \item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the Hessian, i.e.
the square roots of the diagonal elements.}
+//' \item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.}
+//' \item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.}
+//' \item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.}
+//' \item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance
+//' component.}
+//' \item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.}
+//' \item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.}
+//' \item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1.}
+//' \item{\code{VCOV:}}{\code{matrix} of the variance-covariance estimates for all model fixed and random effect parameters.
+//' This is required to compute the degrees of freedom for the fixed effect parameter inference.}
+//' \item{\code{CONVLIST:}}{\code{list} of \code{list} containing the parameter estimates and the differences between the current and previous
+//' iteration estimates at each model iteration. These are included for each fixed effect, random effect and variance component parameter.
+//' The list elements for each iteration are: \emph{ThetaDiff}, \emph{SigmaDiff}, \emph{beta}, \emph{u}, \emph{sigma}.}
+//' }
+//'
+//' @author Mike Morgan
+//'
+//' @examples
+//' NULL
+//'
+//' @name fitGeneticPLGlmm
+//'
+// [[Rcpp::export]]
+List fitGeneticPLGlmm(const arma::mat& Z, const arma::mat& X, const arma::mat& K,
+                      arma::vec muvec, arma::vec offsets, arma::vec curr_beta,
+                      arma::vec curr_theta, arma::vec curr_u, arma::vec curr_sigma,
+                      arma::mat curr_G, const arma::vec& y, List u_indices,
+                      double theta_conv,
+                      const List& rlevels, double curr_disp, const bool& REML, const int& maxit,
+                      std::string solver,
+                      std::string vardist){
+
+    // no guarantee that Pi exists before C++ 20(?!?!?!)
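+    // (std::numbers::pi is only guaranteed from C++20; a constexpr literal keeps this portable)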
+    constexpr double pi = 3.14159265358979323846;
+
+    // declare all variables
+    List outlist(14);
+    int iters=0;
+    int stot = Z.n_cols;
+    const int& c = curr_sigma.size();
+    const int& m = X.n_cols;
+    const int& n = X.n_rows;
+    bool meet_cond = false;
+    double constval = 0.0; // value at which to constrain values
+    double _intercept = constval; // intercept for HE regression
+    double delta_up = 2.0 * curr_disp;
+    double delta_lo = 0.0;
+    double update_disp = 0.0;
+    double disp_diff = 0.0;
+
+    // setup matrices
+    arma::mat D(n, n);
+    D.zeros();
+    arma::mat Dinv(n, n);
+    Dinv.zeros();
+
+    arma::vec y_star(n);
+
+    arma::mat Vmu(n, n);
+    arma::mat W(n, n);
+    arma::mat Winv(n, n);
+
+    arma::mat V_star(n, n);
+    arma::mat V_star_inv(n, n);
+    arma::mat P(n, n);
+
+    arma::mat coeff_mat(m+c, m+c);
+    List V_partial(c);
+    V_partial = pseudovarPartial_G(Z, K, u_indices);
+    // compute outside the loop
+    List VP_partial(c);
+
+    arma::vec score_sigma(c);
+    arma::mat information_sigma(c, c);
+    arma::vec sigma_update(c);
+    arma::vec sigma_diff(sigma_update.size());
+    sigma_diff.zeros();
+
+    arma::mat G_inv(stot, stot);
+    arma::mat Zstar(n, stot);
+    arma::mat Gfill(stot, stot);
+
+    arma::vec theta_update(m+stot);
+    arma::vec theta_diff(theta_update.size());
+    theta_diff.zeros();
+
+    List conv_list(maxit+1);
+
+    // create a uvec of sigma indices
+    arma::uvec _sigma_index(c);
+    for(int i=0; i < c; i++){
+        _sigma_index[i] = i+1;
+    }
+
+    // setup vectors to index the theta updates
+    // assume always in order of beta then u
+    arma::uvec beta_ix(m);
+    for(int x=0; x < m; x++){
+        beta_ix[x] = x;
+    }
+
+    arma::uvec u_ix(stot);
+    for(int px = 0; px < stot; px++){
+        u_ix[px] = m + px;
+    }
+
+    // we only need to invert the Kinship once
+    unsigned long _kn = K.n_cols;
+    arma::mat Kinv(_kn, _kn);
+
+    // check this isn't singular first - it could be due to a block structure
+    double _rcond = arma::rcond(K);
+    bool is_singular;
+    is_singular = _rcond < 1e-9;
+
+    // check for singular condition
+    if(is_singular){
+        // first try to invert the top block which should be N/2 x N/2
+        Rcpp::warning("Kinship is singular - attempting broadcast inverse");
+        double nhalfloat = (double)n/2;
+        unsigned int nhalf = nhalfloat;
+        Kinv = broadcastInverseMatrix(K, nhalf);
+    } else{
+        Kinv = arma::inv(K); // this could be very slow
+    }
+
+    bool converged = false;
+    bool _phi_est = true; // control if we re-estimate phi or not
+
+    // initial optimisation of dispersion
+    // switch this to a golden-section search?
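+    // phiGoldenSearch (named for a golden-section search) proposes an updated dispersion
+    // within the bracket [delta_lo, delta_up]; the bracket is then re-centred on the
+    // accepted estimate immediately below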
+ update_disp = phiGoldenSearch(curr_disp, delta_lo, delta_up, c, + muvec, G_inv, pi, + curr_u, curr_sigma, y); + disp_diff = abs(curr_disp - update_disp); + curr_disp = update_disp; + // make the upper and lower bounds based on the current value, + // but 0 < lo < up < 1.0 + delta_lo = std::max(0.0, curr_disp - (curr_disp*0.5)); + delta_up = std::max(0.0, curr_disp); + + while(!meet_cond){ + D.diag() = muvec; // data space + Dinv = D.i(); + y_star = computeYStar(X, curr_beta, Z, Dinv, curr_u, y, offsets); // data space + + Vmu = computeVmu(muvec, curr_disp, vardist); + W = computeW(curr_disp, Dinv, vardist); + Winv = W.i(); + + V_star = computeVStar(Z, curr_G, W); // K is implicitly included in curr_G + V_star_inv = invertPseudoVar(Winv, curr_G, Z); + + if(REML){ + P = computePREML(V_star_inv, X); + // take the partial derivative outside the while loop, just keep the P*\dVar\dSigma + VP_partial = pseudovarPartial_P(V_partial, P); + + score_sigma = sigmaScoreREML_arma(VP_partial, y_star, P); + information_sigma = sigmaInfoREML_arma(VP_partial, P); + } else{ + List VP_partial = V_partial; + score_sigma = sigmaScore(y_star, curr_beta, X, VP_partial, V_star_inv); + information_sigma = sigmaInformation(V_star_inv, VP_partial); + }; + + // choose between HE regression and Fisher scoring for variance components + // sigma_update is always 1 element longer than the others with HE, but we need to keep track of this + if(solver == "HE"){ + // try Haseman-Elston regression instead of Fisher scoring + sigma_update = estHasemanElstonGenetic(Z, P, u_indices, y_star, K); + } else if (solver == "HE-NNLS"){ + // for the first iteration use the current non-zero estimate + arma::dvec _curr_sigma(c+1); + _curr_sigma.fill(constval); + + if(iters > 0){ + _curr_sigma[0] = _intercept; + _curr_sigma.elem(_sigma_index) = curr_sigma; // is this valid to set elements like this? + } + + sigma_update = estHasemanElstonConstrainedGenetic(Z, P, u_indices, y_star, K, _curr_sigma, iters); + _intercept = sigma_update[0]; + sigma_update = sigma_update.tail(c); + + }else if(solver == "Fisher"){ + sigma_update = fisherScore(information_sigma, score_sigma, curr_sigma); + } + + // if we have negative sigmas then we need to switch solver + if(any(sigma_update < 0.0)){ + warning("Negative variance components - re-running with NNLS"); + solver = "HE-NNLS"; + // for the first iteration use the current non-zero estimate + arma::dvec _curr_sigma(c+1, arma::fill::zeros); + _curr_sigma.fill(constval); + + // if these are all zero then it can only be that they are initial estimates + if(iters > 0){ + _curr_sigma[0] = _intercept; + _curr_sigma.elem(_sigma_index) = curr_sigma; // is this valid to set elements like this? 
+            }
+            sigma_update = estHasemanElstonConstrainedGenetic(Z, P, u_indices, y_star, K, _curr_sigma, iters);
+        }
+
+        sigma_diff = abs(sigma_update - curr_sigma); // needs to be an unsigned real value
+
+        // update sigma, G, and G_inv
+        curr_sigma = sigma_update;
+        curr_G = initialiseG_G(u_indices, curr_sigma, K);
+        G_inv = invGmat_G(u_indices, curr_sigma, Kinv);
+
+        // Update the dispersion with the new variances
+        update_disp = phiMME(y_star, curr_sigma);
+
+        // Next, solve pseudo-likelihood GLMM equations to compute solutions for beta and u
+        // compute the coefficient matrix
+        coeff_mat = coeffMatrix(X, Winv, Z, G_inv); //model space
+        theta_update = solveEquations(stot, m, Winv, Zstar.t(), X.t(), coeff_mat, curr_beta, curr_u, y_star); //model space
+
+        LogicalVector _check_theta = check_na_arma_numeric(theta_update);
+        bool _any_ystar_na = any(_check_theta).is_true(); // .is_true required for proper type casting to bool
+
+        if(_any_ystar_na){
+            List this_conv(5);
+            this_conv = List::create(_["ThetaDiff"]=theta_diff, _["SigmaDiff"]=sigma_diff, _["beta"]=curr_beta,
+                                     _["u"]=curr_u, _["sigma"]=curr_sigma);
+            conv_list(iters-1) = this_conv;
+            warning("NaN in theta update");
+            break;
+        }
+
+        theta_diff = abs(theta_update - curr_theta);
+
+        curr_theta = theta_update; //model space
+        curr_beta = curr_theta.elem(beta_ix); //model space
+        curr_u = curr_theta.elem(u_ix); //model space
+
+        muvec = exp(offsets + (X * curr_beta) + (Z * curr_u)); // data space
+        LogicalVector _check_mu = check_na_arma_numeric(muvec);
+        bool _any_na = any(_check_mu).is_true(); // .is_true required for proper type casting to bool
+
+        LogicalVector _check_inf = check_inf_arma_numeric(muvec);
+        bool _any_inf = any(_check_inf).is_true();
+
+        if(_any_na){
+            stop("NA estimates in linear predictor - consider an alternative model");
+        }
+
+        if(_any_inf){
+            stop("Infinite parameter estimates - consider an alternative model");
+        }
+
+        iters++;
+
+        bool _thconv = false;
+        _thconv = all(theta_diff < theta_conv);
+
+        bool _siconv = false;
+        _siconv = all(sigma_diff < theta_conv);
+
+        bool _ithit = false;
+        _ithit = iters > maxit;
+
+        meet_cond = ((_thconv && _siconv) || _ithit);
+        converged = _thconv && _siconv;
+
+        // compute final log-likelihood
+        // make non-broadcast G matrix
+        arma::mat littleG(c, c, arma::fill::zeros);
+
+        for(int i=0; i < c; i++){
+            littleG(i, i) = curr_sigma(i);
+        }
diff --git a/src/fitPLGlmm.cpp b/src/fitPLGlmm.cpp
new file mode 100644
--- /dev/null
+++ b/src/fitPLGlmm.cpp
+#include <string>
+#include <RcppArmadillo.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+#include "paramEst.h"
+#include "computeMatrices.h"
+#include "invertPseudoVar.h"
+#include "pseudovarPartial.h"
+#include "multiP.h"
+#include "inference.h"
+#include "utils.h"
+using namespace Rcpp;
+
+//' GLMM parameter estimation using pseudo-likelihood
+//'
+//' Iteratively estimate GLMM fixed and random effect parameters, and variance
+//' component parameters using Fisher scoring based on the pseudo-likelihood
+//' approximation to a Normal log-likelihood.
+//'
+//' @param Z mat - sparse matrix that maps random effect variable levels to
+//' observations
+//' @param X mat - sparse matrix that maps fixed effect variables to
+//' observations
+//' @param muvec vec vector of estimated phenotype means
+//' @param offsets vec vector of model offsets
+//' @param curr_theta vec vector of initial parameter estimates
+//' @param curr_beta vec vector of initial beta estimates
+//' @param curr_u vec of initial u estimates
+//' @param curr_sigma vec of initial sigma estimates
+//' @param curr_G mat c X c matrix of variance components
+//' @param y vec of observed counts
+//' @param u_indices List a List, each element contains the indices of Z relevant
+//' to each RE and all its levels
+//' @param theta_conv double Convergence tolerance for parameter estimates
+//' @param rlevels List containing mapping of RE variables to individual
+//' levels
+//' @param curr_disp double Dispersion parameter estimate
+//' @param REML bool - use REML for variance component estimation
+//' @param maxit int maximum number of iterations if theta_conv is FALSE
+//' @param solver string which solver to use - either HE (Haseman-Elston regression) or Fisher scoring
+//' @param vardist string which variance form to use: NB = negative binomial, P = Poisson [not yet implemented].
+//'
+//' @details Fit a NB-GLMM to the counts provided in \emph{y}. The model uses an iterative approach that
+//' switches between the joint fixed and random effect parameter inference, and the variance component
+//' estimation. A pseudo-likelihood approach is adopted to maximise the log-likelihood of the model
+//' given the parameter estimates. The fixed and random effect parameters are estimated using
+//' Henderson's mixed model equations, and the variance component parameters are then estimated with
+//' the specified solver, i.e. Fisher scoring, Haseman-Elston or constrained Haseman-Elston regression. As
+//' the domain of the variance components is [0, +\code{Inf}], any negative variance component estimates will
+//' trigger the switch to the HE-NNLS solver until the model converges.
+//'
+//' @return A \code{list} containing the following elements (note: return types are dictated by Rcpp, so the R
+//' types are described here):
+//' \describe{
+//' \item{\code{FE}:}{\code{numeric} vector of fixed effect parameter estimates.}
+//' \item{\code{RE}:}{\code{list} of the same length as the number of random effect variables. Each slot contains the best
+//' linear unbiased predictors (BLUPs) for the levels of the corresponding RE variable.}
+//' \item{\code{Sigma:}}{\code{numeric} vector of variance component estimates, 1 per random effect variable.}
+//' \item{\code{converged:}}{\code{logical} scalar of whether the model has reached the convergence tolerance or not.}
+//' \item{\code{Iters:}}{\code{numeric} scalar with the number of iterations that the model ran for. It is strictly <= \code{max.iter}.}
+//' \item{\code{Dispersion:}}{\code{numeric} scalar of the dispersion estimate computed off-line.}
+//' \item{\code{Hessian:}}{\code{matrix} of 2nd derivative elements from the fixed and random effect parameter inference.}
+//' \item{\code{SE:}}{\code{matrix} of standard error estimates, derived from the Hessian, i.e.
the square roots of the diagonal elements.}
+//' \item{\code{t:}}{\code{numeric} vector containing the computed t-score for each fixed effect variable.}
+//' \item{\code{COEFF:}}{\code{matrix} containing the coefficient matrix from the mixed model equations.}
+//' \item{\code{P:}}{\code{matrix} containing the elements of the REML projection matrix.}
+//' \item{\code{Vpartial:}}{\code{list} containing the partial derivatives of the (pseudo)variance matrix with respect to each variance
+//' component.}
+//' \item{\code{Ginv:}}{\code{matrix} of the inverse variance components broadcast to the full Z matrix.}
+//' \item{\code{Vsinv:}}{\code{matrix} of the inverse pseudovariance.}
+//' \item{\code{Winv:}}{\code{matrix} of the inverse elements of W = D^-1 V D^-1.}
+//' \item{\code{VCOV:}}{\code{matrix} of the variance-covariance estimates for all model fixed and random effect variable parameters.
+//' This is required to compute the degrees of freedom for the fixed effect parameter inference.}
+//' \item{\code{CONVLIST:}}{\code{list} of \code{list} containing the parameter estimates and differences between current and previous
+//' iteration estimates at each model iteration. These are included for each fixed effect, random effect and variance component parameter.
+//' The list elements for each iteration are: \emph{ThetaDiff}, \emph{SigmaDiff}, \emph{beta}, \emph{u}, \emph{sigma}.}
+//' }
+//'
+//' @author Mike Morgan
+//'
+//' @examples
+//' NULL
+//'
+//' @name fitPLGlmm
+// [[Rcpp::export]]
+List fitPLGlmm(const arma::mat& Z, const arma::mat& X, arma::vec muvec,
+ arma::vec offsets, arma::vec curr_beta,
+ arma::vec curr_theta, arma::vec curr_u, arma::vec curr_sigma,
+ arma::mat curr_G, const arma::vec& y, List u_indices,
+ double theta_conv,
+ const List& rlevels, double curr_disp, const bool& REML, const int& maxit,
+ std::string solver,
+ std::string vardist){
+
+ // pi is only standardised from C++20 (std::numbers::pi), so define it explicitly
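+
+ // Iteration outline: (1) linearise the model to get the working response y* and
+ // weights W; (2) update the variance components sigma with the chosen solver;
+ // (3) solve the mixed model equations for beta and u; (4) repeat until both the
+ // theta and sigma changes fall below theta_conv, or maxit is reached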
+ constexpr double pi = 3.14159265358979323846; + + // declare all variables + List outlist(12); + int iters=0; + int stot = Z.n_cols; + const int& c = curr_sigma.size(); + const int& m = X.n_cols; + const int& n = X.n_rows; + bool meet_cond = false; + double constval = 1e-8; // value at which to constrain values + double _intercept = constval; // intercept for HE regression + double delta_up = 2.0 * curr_disp; + double delta_lo = 0.0; + double update_disp = 0.0; + double disp_diff = 0.0; + + // setup matrices + arma::mat D(n, n); + D.zeros(); + arma::mat Dinv(n, n); + Dinv.zeros(); + + arma::vec y_star(n); + + arma::mat Vmu(n, n); + Vmu.zeros(); + arma::mat W(n, n); + W.zeros(); + arma::mat Winv(n, n); + Winv.zeros(); + + arma::mat V_star(n, n); + V_star.zeros(); + arma::mat V_star_inv(n, n); + V_star_inv.zeros(); + arma::mat P(n, n); + P.zeros(); + + arma::mat coeff_mat(m+c, m+c); + coeff_mat.zeros(); + List V_partial(c); + V_partial = pseudovarPartial_C(Z, u_indices); + // compute outside the loop + List VP_partial(c); + + arma::vec score_sigma(c); + arma::mat information_sigma(c, c); + information_sigma.zeros(); + arma::vec sigma_update(c); + arma::vec sigma_diff(sigma_update.size()); + sigma_diff.zeros(); + + arma::mat G_inv(stot, stot); + G_inv.zeros(); + + arma::vec theta_update(m+stot); + arma::vec theta_diff(theta_update.size()); + theta_diff.zeros(); + + List conv_list(maxit+1); + + // create a uvec of sigma indices + arma::uvec _sigma_index(c); + for(int i=0; i < c; i++){ + _sigma_index[i] = i+1; + } + + // setup vectors to index the theta updates + // assume always in order of beta then u + arma::uvec beta_ix(m); + for(int x=0; x < m; x++){ + beta_ix[x] = x; + } + + arma::uvec u_ix(stot); + for(int px = 0; px < stot; px++){ + u_ix[px] = m + px; + } + + bool converged = false; + bool _phi_est = true; // control if we re-estimate phi or not + + // // initial optimisation of dispersion + update_disp = phiGoldenSearch(curr_disp, delta_lo, delta_up, c, + muvec, G_inv, pi, + curr_u, curr_sigma, y); + disp_diff = abs(curr_disp - update_disp); + curr_disp = update_disp; + // make the upper and lower bounds based on the current value, + // but 0 < lo < up < ?? + delta_lo = std::max(0.0, curr_disp - (curr_disp*0.5)); + delta_up = std::max(0.0, curr_disp); + + while(!meet_cond){ + D.diag() = muvec; + + // check for all zero eigen values + arma::cx_vec d_eigenval = arma::eig_gen(D); // this needs to handle complex values + LogicalVector _check_zero = check_zero_arma_complex(d_eigenval); + bool _all_zero = any(_check_zero).is_true(); + + if(_all_zero){ + stop("Zero eigenvalues in D - do you have collinear variables?"); + } + + Dinv = D.i(); + y_star = computeYStar(X, curr_beta, Z, Dinv, curr_u, y, offsets); + Vmu = computeVmu(muvec, curr_disp, vardist); + + W = computeW(curr_disp, Dinv, vardist); + Winv = W.i(); + V_star = computeVStar(Z, curr_G, W); + V_star_inv = invertPseudoVar(Winv, curr_G, Z); + + if(REML){ + P = computePREML(V_star_inv, X); + // take the partial derivative outside the while loop, just keep the P*\dVar\dSigma + VP_partial = pseudovarPartial_P(V_partial, P); + + score_sigma = sigmaScoreREML_arma(VP_partial, y_star, P); + information_sigma = sigmaInfoREML_arma(VP_partial, P); + } else{ + // theres a strange bug that means assigning V_partial to VP_partial + // doesn't copy over the contents of the list - perhaps it needs to + // be a pointer? 
Crude solve is to just to pre-multiply by I + P = arma::eye(n, n); + VP_partial = pseudovarPartial_P(V_partial, P); + // List VP_partial = V_partial; + score_sigma = sigmaScore(y_star, curr_beta, X, VP_partial, V_star_inv); + information_sigma = sigmaInformation(V_star_inv, VP_partial); + }; + + // choose between HE regression and Fisher scoring for variance components + // would a hybrid approach work here? If any HE estimates are zero switch + // to NNLS using these as the initial estimates? + if(solver == "HE"){ + // try Haseman-Elston regression instead of Fisher scoring + if(REML){ + sigma_update = estHasemanElston(Z, P, u_indices, y_star); + } else{ + sigma_update = estHasemanElstonML(Z, u_indices, y_star); + } + + } else if(solver == "HE-NNLS"){ + // for the first iteration use the current non-zero estimate + arma::dvec _curr_sigma(c+1, arma::fill::zeros); + _curr_sigma.fill(constval); + + // if these are all zero then it can only be that they are initial estimates + if(iters > 0){ + _curr_sigma[0] = _intercept; + _curr_sigma.elem(_sigma_index) = curr_sigma; // is this valid to set elements like this? + } + if(REML){ + sigma_update = estHasemanElstonConstrained(Z, P, u_indices, y_star, _curr_sigma, iters); + } else{ + sigma_update = estHasemanElstonConstrainedML(Z, u_indices, y_star, _curr_sigma, iters); + } + }else if(solver == "Fisher"){ + sigma_update = fisherScore(information_sigma, score_sigma, curr_sigma); + } + + // if we have negative sigmas then we need to switch solver + if(any(sigma_update < 0.0)){ + warning("Negative variance components - re-running with NNLS"); + solver = "HE-NNLS"; + // for the first iteration use the current non-zero estimate + arma::dvec _curr_sigma(c+1, arma::fill::zeros); + _curr_sigma.fill(constval); + + // if these are all zero then it can only be that they are initial estimates + if(iters > 0){ + _curr_sigma[0] = _intercept; + _curr_sigma.elem(_sigma_index) = curr_sigma; // is this valid to set elements like this? + } + if(REML){ + sigma_update = estHasemanElstonConstrained(Z, P, u_indices, y_star, _curr_sigma, iters); + } else{ + sigma_update = estHasemanElstonConstrainedML(Z, u_indices, y_star, _curr_sigma, iters); + } + } + + // update sigma, G, and G_inv + curr_sigma = sigma_update; + curr_G = initialiseG(u_indices, curr_sigma); + G_inv = invGmat(u_indices, curr_sigma); + + // Update the dispersion with the new variances + update_disp = phiMME(y_star, curr_sigma); + + // Next, solve pseudo-likelihood GLMM equations to compute solutions for B and u + // compute the coefficient matrix + coeff_mat = coeffMatrix(X, Winv, Z, G_inv); + theta_update = solveEquations(stot, m, Winv, Z.t(), X.t(), coeff_mat, curr_beta, curr_u, y_star); + theta_diff = abs(theta_update - curr_theta); + + // inference + curr_theta = theta_update; + curr_beta = curr_theta.elem(beta_ix); + curr_u = curr_theta.elem(u_ix); + + // need to check for infinite and NA values here... 
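+ // log link: the fitted means are mu = exp(offset + X*beta + Z*u), so overflow in
+ // the linear predictor surfaces here as Inf or NaN values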
+ muvec = exp(offsets + (X * curr_beta) + (Z * curr_u)); + LogicalVector _check_mu = check_na_arma_numeric(muvec); + bool _any_na = any(_check_mu).is_true(); // .is_true required for proper type casting to bool + + LogicalVector _check_inf = check_inf_arma_numeric(muvec); + bool _any_inf = any(_check_inf).is_true(); + + if(_any_na){ + stop("NA estimates in linear predictor - consider an alternative model"); + } + + if(_any_inf){ + stop("Infinite parameter estimates - consider an alternative model"); + } + + iters++; + + bool _thconv = false; + _thconv = all(theta_diff < theta_conv); + + bool _siconv = false; + _siconv = all(sigma_diff < theta_conv); + + bool _ithit = false; + _ithit = iters > maxit; + + meet_cond = ((_thconv && _siconv) || _ithit); + converged = _thconv && _siconv; + + // compute final loglihood + // make non-broadcast G matrix + arma::mat littleG(c, c, arma::fill::zeros); + + for(int i=0; i +#include +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppEigen)]] +// using namespace Rcpp; + +// All functions used for inference +arma::vec computeSE(const int& m, const int& c, const arma::mat& coeff_mat) { + // compute the fixed effect standard errors from the MME coefficient matrix + + const int& l = coeff_mat.n_cols; // this should be m + c + const int& p = coeff_mat.n_rows; // this should be m + c + + const int& scol = m + c; + const int& srow = m + c; + if(l != scol){ + Rcpp::Rcout << scol << std::endl; + Rcpp::Rcout << l << std::endl; + Rcpp::Rcout << p << std::endl; + Rcpp::stop("N cols and input dimensions m + c are not equal"); + } + + if(p != srow){ + Rcpp::stop("N rows and input dimensions m + c are not equal"); + } + + arma::mat ul(coeff_mat.submat(0, 0, m-1, m-1)); // m X m + arma::mat ur(coeff_mat.submat(0, m, m-1, m+c-1)); // m X l + arma::mat ll(coeff_mat.submat(m, 0, m+c-1, m-1)); // p X m + arma::mat lr(coeff_mat.submat(m, m, m+c-1, m+c-1)); // p X l + + + arma::mat _se(ul - ur * lr.i() * ll); // m X m - (m X c X m) <- this should commute + arma::vec se(m+c); + // will need a check here for singular hessians... 
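+ // _se is the Schur complement of the random effect block of the coefficient
+ // matrix; its inverse approximates the fixed effect covariance, and the SEs
+ // are the square roots of that inverse's diagonal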
+ try{ + double _rcond = arma::rcond(_se); + bool is_singular; + is_singular = _rcond < 1e-12; + + // check for singular condition + if(is_singular){ + Rcpp::stop("Standard Error coefficient matrix is computationally singular"); + } + + arma::mat _seInv(_se.i()); + // arma::vec se(arma::sqrt(_seInv.diag())); + se = arma::sqrt(_seInv.diag()); + } catch(std::exception &ex){ + forward_exception_to_r(ex); + } catch(...){ + Rf_error("c++ exception (unknown reason)"); + } + + return se; +} + + +arma::vec computeTScore(const arma::vec& curr_beta, const arma::vec& SE){ + + const int& m = curr_beta.size(); + const int& selength = SE.size(); + + if(m != selength){ + Rcpp::stop("standard errors and beta estimate sizes differ"); + } + + arma::vec tscore(m); + + for(int x=0; x < m; x++){ + double _beta = curr_beta[x]; + double _se = SE[x]; + + double _t = _beta/_se; + tscore[x] = _t; + } + + return tscore; +} + + +arma::mat varCovar(const Rcpp::List& psvari, const int& c){ + arma::mat Va(c, c); + for(int i=0; i < c; i++){ + arma::mat _ips = psvari(i); // why isn't this + for(int j=i; j < c; j++){ + arma::mat _jps = psvari(j); + arma::mat _ij(_ips * _jps); + Va(i, j) = 2 * (1/(arma::trace(_ij))); + if(i != j){ + Va(j, i) = 2 * (1/(arma::trace(_ij))); + } + } + } + + return Va; +} + diff --git a/src/inference.h b/src/inference.h new file mode 100644 index 0000000..226f5e8 --- /dev/null +++ b/src/inference.h @@ -0,0 +1,11 @@ +#ifndef INFERENCE_H +#define INFERENCE_H + +#include +// [[Rcpp::depends(RcppArmadillo)]] + +arma::vec computeSE(const int& m, const int& c, const arma::mat& coeff_mat); +arma::vec computeTScore(const arma::vec& curr_beta, const arma::vec& SE); +arma::mat varCovar(const Rcpp::List& psvari, const int& c); + +#endif diff --git a/src/invertPseudoVar.cpp b/src/invertPseudoVar.cpp new file mode 100644 index 0000000..7195195 --- /dev/null +++ b/src/invertPseudoVar.cpp @@ -0,0 +1,47 @@ +#include +// [[Rcpp::depends(RcppArmadillo)]] +#include "invertPseudoVar.h" +using namespace Rcpp; + +arma::mat invertPseudoVar(arma::mat A, arma::mat B, arma::mat Z){ + // make sparse - helps with the many matrix multiplications + int c = B.n_cols; + int n = A.n_cols; + + arma::sp_mat spA(A); + arma::sp_mat spB(B); + arma::sp_mat spZ(Z); + + arma::sp_mat I = arma::speye(c, c); // create the cxc identity matrix + arma::sp_mat omt(n, n); + arma::sp_mat mid(c, c); + arma::sp_mat AZB(A.n_rows, B.n_cols); + arma::mat out_omt(omt); + + AZB = spA * spZ * spB; + mid = I + (spZ.t() * AZB); // If we know the structure in B can we simplify this more??? 
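+ // Woodbury identity: (W + Z G Z^T)^-1 = W^-1 - W^-1 Z G (I + Z^T W^-1 Z G)^-1 Z^T W^-1,
+ // so only the small matrix 'mid' needs a dense inverse rather than the full n x n pseudovariance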
+ arma::mat dmid(mid);
+
+ try{
+ // double _rcond = arma::rcond(mid);
+ double _rcond = arma::rcond(dmid);
+ bool is_singular;
+ is_singular = _rcond < 1e-12;
+
+ // check for singular condition
+ if(is_singular){
+ Rcpp::stop("Pseudovariance component matrix is computationally singular");
+ }
+
+ arma::sp_mat midinv(dmid.i());
+ omt = spA - (AZB * midinv * spZ.t() * spA); // stacked multiplications like this appear to be slow
+ out_omt = arma::mat(omt); // assign to the outer variable rather than shadowing it
+ return out_omt;
+ } catch(std::exception &ex){
+ forward_exception_to_r(ex);
+ } catch(...){
+ Rf_error("c++ exception (unknown reason)");
+ }
+
+ return out_omt;
+}
diff --git a/src/invertPseudoVar.h b/src/invertPseudoVar.h
new file mode 100644
index 0000000..d3fcb12
--- /dev/null
+++ b/src/invertPseudoVar.h
@@ -0,0 +1,9 @@
+#ifndef INVERTPSEUDOVAR_H
+#define INVERTPSEUDOVAR_H
+
+#include <RcppArmadillo.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+
+arma::mat invertPseudoVar(arma::mat A, arma::mat B, arma::mat Z);
+
+#endif
diff --git a/src/multiP.cpp b/src/multiP.cpp
new file mode 100644
index 0000000..2446a30
--- /dev/null
+++ b/src/multiP.cpp
@@ -0,0 +1,22 @@
+#include <RcppArmadillo.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+#include "multiP.h"
+using namespace Rcpp;
+
+List multiP(List partials, arma::mat psvar_in){
+ int nsize = partials.size();
+ int ps_nrows = psvar_in.n_rows; // this should be the column dimension
+ List out(nsize);
+
+ for(int k = 0; k < nsize; k++){
+ arma::mat _p = partials[k];
+ int ncol = _p.n_cols;
+
+ arma::mat _P(ps_nrows, ncol); // an empty matrix with rows from psvar_in, cols from _p
+ _P = (psvar_in * _p);
+
+ out[k] = _P;
+ }
+
+ return out;
+}
diff --git a/src/multiP.h b/src/multiP.h
new file mode 100644
index 0000000..9c53fb2
--- /dev/null
+++ b/src/multiP.h
@@ -0,0 +1,9 @@
+#ifndef MULTIP_H
+#define MULTIP_H
+
+#include <RcppArmadillo.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+
+Rcpp::List multiP(Rcpp::List partials, arma::mat psvar_in);
+
+#endif
diff --git a/src/paramEst.cpp b/src/paramEst.cpp
new file mode 100644
index 0000000..83bdc10
--- /dev/null
+++ b/src/paramEst.cpp
@@ -0,0 +1,1042 @@
+#include "paramEst.h"
+#include "computeMatrices.h"
+#include "utils.h"
+#include <RcppArmadillo.h>
+#include <RcppEigen.h>
+#include <cmath>
+// [[Rcpp::depends(RcppArmadillo)]]
+// [[Rcpp::depends(RcppEigen)]]
+// using namespace Rcpp;
+
+// All functions used in parameter estimation
+
+arma::vec sigmaScoreREML_arma (const Rcpp::List& pvstar_i, const arma::vec& ystar, const arma::mat& P){
+ // Armadillo implementation
+ // sparsifying doesn't speed up - overhead is too high
+ const int& c = pvstar_i.size();
+ arma::vec reml_score(c);
+
+ for(int i=0; i < c; i++){
+ const arma::mat& P_pvi = pvstar_i(i); // this is P * partial derivative
+ double lhs = -0.5 * arma::trace(P_pvi);
+ arma::mat mid1(1, 1);
+ mid1 = arma::trans(ystar) * P_pvi * P * ystar;
+ double rhs = 0.5 * mid1(0, 0); // use (row, col) indexing - [0, 0] invokes the C++ comma operator
+
+ reml_score[i] = lhs + rhs;
+ }
+
+ return reml_score;
+}
+
+
+arma::mat sigmaInfoREML_arma (const Rcpp::List& pvstari, const arma::mat& P){
+ // REML Fisher/expected information matrix
+ // sparsifying this is baaaad for performance
+ const int& c = pvstari.size();
+ arma::mat sinfo(c, c);
+
+ // this is a symmetric matrix so only need to fill the upper or
+ // lower triangle, roughly halving the work
+ for(int i=0; i < c; i++){
+ const arma::mat& _ipP = pvstari[i]; // this is P * \d Var/ \dsigma
+
+ for(int j=i; j < c; j++){
+ const arma::mat& P_jp = pvstari[j]; // this is P * \d Var/ \dsigma
+ arma::mat a_ij(_ipP * P_jp); // this is the biggest bottleneck - it takes >2s!
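+ // entry (i, j) of the expected information is 0.5 * tr(P dV/dsigma_i P dV/dsigma_j) -
+ // the list elements are already premultiplied by P, so only the trace of the product remains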
+ double _artr = arma::trace(a_ij); + + sinfo(i, j) = 0.5 * _artr; + if(i != j){ + sinfo(j, i) = 0.5 * _artr; + } + + } + } + + return sinfo; +} + + +arma::vec sigmaScore (arma::vec ystar, arma::vec beta, arma::mat X, Rcpp::List V_partial, arma::mat V_star_inv){ + + int c = V_partial.size(); + int n = X.n_rows; + arma::vec score(c); + arma::vec ystarminx(n); + ystarminx = ystar - (X * beta); + + for(int i=0; i < c; i++){ + arma::mat _ip = V_partial[i]; + double lhs = -0.5 * arma::trace(V_star_inv * _ip); + arma::mat rhs_mat(1, 1); + + rhs_mat = ystarminx.t() * V_star_inv * _ip * V_star_inv * ystarminx; + score[i] = lhs + 0.5 * rhs_mat(0, 0); + } + + return score; +} + + +arma::mat sigmaInformation (arma::mat V_star_inv, Rcpp::List V_partial){ + int c = V_partial.size(); + int n = V_star_inv.n_cols; + arma::mat sinfo = arma::zeros(c, c); + + for(int i=0; i < c; i++){ + arma::mat _ip = V_partial(i); + for(int j=0; j < c; j++){ + arma::mat _jp = V_partial(j); + + arma::mat _inmat(n, n); + double _tr = 0.0; + _inmat = V_star_inv * _ip * V_star_inv * _jp; + _tr = 0.5 * arma::trace(_inmat); + + sinfo(i, j) = _tr; + } + } + + return sinfo; +} + + +arma::vec fisherScore (const arma::mat& hess, const arma::vec& score_vec, const arma::vec& theta_hat){ + // sequentially update the parameter using the Newton-Raphson algorithm + // theta ~= theta_hat + hess^-1 * score + // this needs to be in a direction of descent towards a minimum + int m = theta_hat.size(); + arma::vec theta(m, arma::fill::zeros); + arma::mat hessinv(hess.n_rows, hess.n_cols); + + // will need a check here for singular hessians... + try{ + double _rcond = arma::rcond(hess); + bool is_singular; + is_singular = _rcond < 1e-9; + + // check for singular condition + if(is_singular){ + Rcpp::stop("Variance Component Hessian is computationally singular"); + } + + hessinv = arma::inv(hess); // always use pinv? solve() and inv() are most sensitive than R versions + theta = theta_hat + (hessinv * score_vec); + } catch(std::exception const& ex){ + forward_exception_to_r(ex); + } catch(...){ + Rf_error("c++ exception (unknown reason)"); + } + + return theta; +} + + +arma::mat coeffMatrix(const arma::mat& X, const arma::mat& Winv, const arma::mat& Z, const arma::mat& Ginv){ + // compute the components of the coefficient matrix for the MMEs + // sparsification _does_ help here, despite the added overhead + // unsparsify + int c = Z.n_cols; + int m = X.n_cols; + + // arma::sp_mat ul(m, m); + // arma::sp_mat ur(m, c); + // arma::sp_mat ll(c, m); + // arma::sp_mat lr(c, c); + // + // arma::sp_mat lhs_top(m, m+c); + // arma::sp_mat lhs_bot(c, m+c); + // + // arma::sp_mat res(m+c, m+c); + // + // arma::sp_mat sX(X); + // arma::sp_mat sZ(Z); + // arma::sp_mat sW(Winv); + // arma::sp_mat sG(Ginv); + // + // ul = sX.t() * sW * sX; + // ur = sX.t() * sW * sZ; + // ll = sZ.t() * sW * sX; + // lr = (sZ.t() * sW * sZ) + sG; + + arma::mat ul(m, m); + arma::mat ur(m, c); + arma::mat ll(c, m); + arma::mat lr(c, c); + + arma::mat lhs_top(m, m+c); + arma::mat lhs_bot(c, m+c); + + arma::mat lhs(m+c, m+c); + + ul = X.t() * Winv * X; + ur = X.t() * Winv * Z; + ll = Z.t() * Winv * X; + lr = (Z.t() * Winv * Z) + Ginv; + + lhs_top = arma::join_rows(ul, ur); // join_rows matches the rows i.e. glue columns together + lhs_bot = arma::join_rows(ll, lr); + + lhs = arma::join_cols(lhs_top, lhs_bot); // join_cols matches the cols, i.e. glue rows together + // res = arma::join_cols(lhs_top, lhs_bot); // join_cols matches the cols, i.e. 
glue rows together + // arma::mat lhs(res); + return lhs; +} + + +arma::vec solveEquations (const int& c, const int& m, const arma::mat& Winv, const arma::mat& Zt, const arma::mat& Xt, + const arma::mat& coeffmat, const arma::vec& beta, const arma::vec& u, const arma::vec& ystar){ + // solve the mixed model equations + arma::vec rhs_beta(m); + arma::vec rhs_u(c); + arma::mat rhs(m+c, 1); + + arma::vec theta_up(m+c, arma::fill::zeros); + + rhs_beta.col(0) = Xt * Winv * ystar; + rhs_u.col(0) = Zt * Winv * ystar; + + rhs = arma::join_cols(rhs_beta, rhs_u); + + // need a check for singular hessian here + double _rcond = arma::rcond(coeffmat); + try{ + // double _rcond = arma::rcond(coeffmat); + bool is_singular; + is_singular = _rcond < 1e-9; + + // check for singular condition + if(is_singular){ + // this happens when G^-1 contains NaN values <- how does this happen + // and how do we prevent it? + Rcpp::stop("Coefficients Hessian is computationally singular"); + } + + // can we just use solve here instead? + // if the coefficient matrix is singular then do we resort to pinv? + theta_up = arma::solve(coeffmat, rhs, arma::solve_opts::no_approx); + } catch(std::exception const& ex){ + forward_exception_to_r(ex); + } catch(...){ + Rf_error("c++ exception (unknown reason)"); + } + return theta_up; +} + + +arma::mat computeZstar(const arma::mat& Z, const arma::vec& curr_sigma, const Rcpp::List& u_indices){ + // use the A = LL^T, ZL = Z* trick similar to lme4 - described in more detail + // in https://onlinelibrary.wiley.com/doi/epdf/10.1046/j.1439-0388.2002.00327.x + // this assumes that G is PSD - it will fail if it is not + + // compute the Cholesky of G + int stot = Z.n_cols; + int n = Z.n_rows; + arma::mat G(stot, stot); + G = initialiseG(u_indices, curr_sigma); + + arma::mat cholG(stot, stot); + try{ + cholG = arma::chol(G, "lower"); + } catch(std::exception &ex){ + Rcpp::stop("G is not positive (semi) definite - Cholesky failed"); + // forward_exception_to_r(ex); + } catch(...){ + Rf_error("c++ exception (unknown reason)"); + } + + // construct Z*=ZL + arma::mat Zstar(n, stot); + Zstar = Z * cholG; + + return Zstar; +} + + +// arma::vec solveEquationsPCG (const int& c, const int& m, const arma::mat& Winv, const arma::mat& Zt, const arma::mat& Xt, +// arma::mat coeffmat, arma::vec curr_theta, const arma::vec& ystar, const double& conv_tol){ +// // solve the mixed model equations with a preconditioned conjugate gradient +// // A = coeffmat +// // x = theta +// // b = rhs +// +// arma::vec rhs_beta(m); +// arma::vec rhs_u(c); +// arma::mat rhs(m+c, 1); +// +// arma::vec theta_up(m+c, arma::fill::zeros); +// +// rhs_beta.col(0) = Xt * Winv * ystar; +// rhs_u.col(0) = Zt * Winv * ystar; +// +// rhs = arma::join_cols(rhs_beta, rhs_u); +// +// // I'll assume any preconditioning has already been applied +// // need a check for singular hessian here +// try{ +// double _rcond = arma::rcond(coeffmat); +// bool is_singular; +// is_singular = _rcond < 1e-9; +// +// // check for singular condition +// if(is_singular){ +// Rcpp::stop("Coefficients Hessian is computationally singular"); +// } +// +// // can we just use solve here instead? +// // if the coefficient matrix is singular then do we resort to pinv? +// // is it worth doing a quick analysis of the eigen values of coeff? +// // the _rcond might be sufficient to tell us if the matrix is ill-conditioned +// // do we need to know a priori if we have a few large eigenvalues?? 
+// // maybe this could be tweaked by setting the convergence criterai to > 0? +// theta_up = conjugateGradient(coeffmat, curr_theta, rhs, conv_tol); +// // theta_up = arma::solve(coeffmat, rhs); +// +// } catch(std::exception &ex){ +// forward_exception_to_r(ex); +// } catch(...){ +// Rf_error("c++ exception (unknown reason)"); +// } +// +// return theta_up; +// } + + +// arma::vec conjugateGradient(arma::mat A, arma::vec x, arma::vec b, double conv_tol){ +// // use conjugate gradients to solve the system of linear equations +// // Ax = b +// // Algorithm: +// // r_0 <- Ax_0 - b, p_0 <- -r_0, k <- 0 +// // while r_k != 0: +// // alpha_k <- (rk^T * rk)/(pk^T * A * pk) +// // x_k+1 <- xk + alpha_k * pK +// // r_k+1 <- rk + alpha_k * A * pK +// // beta_k+1 <- r rk+1 + beta_k+1 * pk +// // pk+1 <- -r_k+1 + beta_k+1 * pk +// // k++ +// +// // need to generate x_0 from the current estimates: [beta u] +// const unsigned int m = A.n_cols; +// const unsigned int n = b.size(); +// +// arma::dcolvec xk(m); +// xk = x; // use current estimates as x0 +// arma::vec xk_update(m); +// xk_update = arma::dcolvec(m); +// // x0.randu(); // initial x values +// +// double alpha_k = 0.0; +// double beta_k = 0.0; +// +// arma::dcolvec rk(n); +// arma::dcolvec rk_update(n); +// arma::dcolvec pk(m); +// arma::dcolvec pk_update(m); +// +// rk = (A * xk) - b; +// pk = -rk; +// unsigned int k = 0; +// +// Rcpp::LogicalVector _check_rzero = check_tol_arma_numeric(rk, conv_tol); +// bool _all_rk_zero = Rcpp::all(_check_rzero).is_false(); // .is_false() required for proper type casting to bool +// +// while(_all_rk_zero){ // evaluates true until all rk are zero +// alpha_k = (rk.t() * rk).eval()(0,0)/(pk.t() * A * pk).eval()(0, 0); // needed to convert vector inner product to scalar +// xk_update = xk + alpha_k * pk; +// rk_update = rk + alpha_k * A * pk; +// beta_k = (rk_update.t() * rk_update).eval()(0, 0)/(rk.t() * rk).eval()(0, 0); // needed to convert vector inner product to scalar +// pk_update = -rk_update + beta_k * pk; +// +// rk = rk_update; +// pk = pk_update; +// xk = xk_update; +// +// _check_rzero = check_tol_arma_numeric(rk, conv_tol); +// _all_rk_zero = Rcpp::all(_check_rzero).is_false(); // .is_false() required for proper type casting to bool +// k++; +// } +// +// Rprintf("CG completed in %u iterations\n", k); +// return xk_update; +// } + + +arma::vec estHasemanElstonGenetic(const arma::mat& Z, const arma::mat& PREML, + const Rcpp::List& u_indices, const arma::vec& ystar, const arma::mat& Kin){ + // use HasemanElston regression to estimate variance components + // vectorize everything + // we will also estimate a "residual" variance parameter + unsigned int n = ystar.size(); + unsigned int c = u_indices.size(); // number of variance components + unsigned long nsq = n * (n + 1)/2; //size of vectorised components using just upper or lower triangle of covariance matrix + + arma::mat Ycovar(n, n); + Ycovar = PREML * (ystar * ystar.t()) * PREML; // project onto REML P matrix + + // select the upper triangular elements, including the diagonal + arma::uvec upper_indices = trimatu_ind(arma::size(Ycovar)); + arma::vec Ybig = Ycovar(upper_indices); + + // sequentially vectorise ZZ^T - this automatically adds a vectorised identity matrix + // for the "residual" variance + arma::mat vecZ(nsq, c+1); + vecZ = vectoriseZGenetic(Z, u_indices, PREML, Kin); // projection already applied + + // solve by linear least squares + arma::vec _he_update(c+1); + arma::vec he_update(c); + + // use OSL here for starters + 
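// Haseman-Elston regresses the upper triangle of the projected outer product
+ // P y* y*^T P on the vectorised covariance structure of each random effect
+ // (with the kinship matrix K as the final, genetic, component), so each
+ // regression coefficient is an estimate of one variance component
+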
_he_update = arma::solve(vecZ, Ybig);
+ he_update = _he_update.tail(c);
+
+ return he_update;
+}
+
+
+arma::vec estHasemanElston(const arma::mat& Z, const arma::mat& PREML, const Rcpp::List& u_indices, const arma::vec& ystar){
+ // use Haseman-Elston regression to estimate variance components
+ // vectorize everything
+ // we will also estimate a "residual" variance parameter
+ unsigned int n = ystar.size();
+ unsigned int c = u_indices.size(); // number of variance components
+ unsigned long nsq = n * (n + 1)/2; //size of vectorised components using just upper or lower triangle of covariance matrix
+
+ // sparsify just the multiplication steps.
+ // arma::mat Ycovar(n, n);
+ arma::sp_mat sYcovar(n, n);
+ arma::sp_mat sP(PREML);
+ arma::sp_mat YT(ystar * ystar.t());
+
+ sYcovar = sP * YT * sP; // project onto REML P matrix
+ // Ycovar = PREML * (ystar * ystar.t()) * PREML; // project onto REML P matrix
+
+ // select the upper triangular elements, including the diagonal
+ arma::mat Ycovar(sYcovar);
+ arma::uvec upper_indices = trimatu_ind(arma::size(Ycovar));
+ arma::vec Ybig = Ycovar(upper_indices);
+
+ // sequentially vectorise ZZ^T - this automatically adds a vectorised identity matrix
+ // for the "residual" variance
+ arma::mat vecZ(nsq, c+1);
+ vecZ = vectoriseZ(Z, u_indices, PREML); // projection already applied
+
+ // solve by linear least squares
+ arma::vec _he_update(c+1);
+ arma::vec he_update(c);
+
+ // use OLS here for starters
+ _he_update = arma::solve(vecZ, Ybig, arma::solve_opts::fast);
+ he_update = _he_update.tail(c);
+
+ return he_update;
+}
+
+
+arma::vec estHasemanElstonML(const arma::mat& Z, const Rcpp::List& u_indices, const arma::vec& ystar){
+ // use Haseman-Elston regression to estimate variance components
+ // vectorize everything
+ // we will also estimate a "residual" variance parameter
+ unsigned int n = ystar.size();
+ unsigned int c = u_indices.size(); // number of variance components
+ unsigned long nsq = n * (n + 1)/2; //size of vectorised components using just upper or lower triangle of covariance matrix
+
+ // sparsify just the multiplication steps.
+ // arma::mat Ycovar(n, n);
+ arma::mat Ycovar(ystar * ystar.t());
+
+ // select the upper triangular elements, including the diagonal
+ arma::uvec upper_indices = trimatu_ind(arma::size(Ycovar));
+ arma::vec Ybig = Ycovar(upper_indices);
+
+ // sequentially vectorise ZZ^T - this automatically adds a vectorised identity matrix
+ // for the "residual" variance
+ arma::mat vecZ(nsq, c+1);
+ vecZ = vectoriseZML(Z, u_indices); // projection already applied
+
+ // solve by linear least squares
+ arma::vec _he_update(c+1);
+ arma::vec he_update(c);
+
+ // use OLS here for starters
+ _he_update = arma::solve(vecZ, Ybig, arma::solve_opts::fast);
+ he_update = _he_update.tail(c);
+
+ return he_update;
+}
+
+
+arma::vec estHasemanElstonConstrained(const arma::mat& Z, const arma::mat& PREML, const Rcpp::List& u_indices,
+ const arma::vec& ystar, arma::vec he_update, const int& Iters){
+ // use constrained Haseman-Elston regression to estimate variance components - using a NNLS estimator
+ // vectorize everything
+ // we will also estimate a "residual" variance parameter
+ // however, there is no reason this "residual" parameter has to be constrained...
+ unsigned int n = ystar.size();
+ unsigned int c = u_indices.size(); // number of variance components
+ unsigned long nsq = n * (n + 1)/2; //size of vectorised components using just upper or lower triangle of covariance matrix
+
+ // sparsify just the multiplication steps.
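+ // projecting by the REML P matrix removes the fixed effect contribution from
+ // y* y*^T before the regression, matching the REML score equations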
+ // arma::mat Ycovar(n, n);
+ arma::sp_mat sYcovar(n, n);
+ arma::sp_mat sP(PREML);
+ arma::sp_mat YT(ystar * ystar.t());
+
+ sYcovar = sP * YT * sP; // project onto REML P matrix
+ // Ycovar = PREML * (ystar * ystar.t()) * PREML; // project onto REML P matrix
+ // select the upper triangular elements, including the diagonal
+ arma::mat Ycovar(sYcovar);
+ arma::uvec upper_indices = trimatu_ind(arma::size(Ycovar));
+ arma::vec Ybig = Ycovar(upper_indices);
+
+ // sequentially vectorise ZZ^T - this automatically adds a vectorised identity matrix
+ // for the "residual" variance
+ arma::mat vecZ(nsq, c+1);
+ vecZ = vectoriseZ(Z, u_indices, PREML); // projection already applied
+
+ // solve by linear least squares
+ arma::vec _he_update(c+1);
+
+ // first check if we can get a non-negative OLS estimate
+ arma::dvec _ols = arma::solve(vecZ, Ybig, arma::solve_opts::fast);
+ if(any(_ols < 1e-8)){
+ // use NNLS here - Lawson and Hanson algorithm or FAST-NNLS
+ // the latter is only applicable when vecZ is PD - need to check this with all positive eigenvalues
+ bool _ispd;
+ _ispd = check_pd_matrix(vecZ);
+
+ if(_ispd){
+ // use the FAST NNLS solver
+ _he_update = fastNnlsSolve(vecZ, Ybig);
+ } else{
+ // have to use slower implementation from Lawson and Hanson
+ _he_update = nnlsSolve(vecZ, Ybig, he_update, Iters);
+ }
+ he_update = _he_update.tail(c);
+ } else{
+ he_update = _ols.tail(c);
+ }
+
+ return he_update;
+}
+
+
+arma::vec estHasemanElstonConstrainedML(const arma::mat& Z, const Rcpp::List& u_indices,
+ const arma::vec& ystar, arma::vec he_update, const int& Iters){
+ // use constrained Haseman-Elston regression to estimate variance components - using a NNLS estimator
+ // vectorize everything
+ // we will also estimate a "residual" variance parameter
+ // however, there is no reason this "residual" parameter has to be constrained...
+ unsigned int n = ystar.size();
+ unsigned int c = u_indices.size(); // number of variance components
+ unsigned long nsq = n * (n + 1)/2; //size of vectorised components using just upper or lower triangle of covariance matrix
+
+ // sparsify just the multiplication steps.
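+ // ML variant: no REML projection is applied here - the raw outer product y* y*^T is used directly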
+ arma::mat Ycovar(ystar * ystar.t());
+
+ // select the upper triangular elements, including the diagonal
+ arma::uvec upper_indices = trimatu_ind(arma::size(Ycovar));
+ arma::vec Ybig = Ycovar(upper_indices);
+
+ // sequentially vectorise ZZ^T - this automatically adds a vectorised identity matrix
+ // for the "residual" variance
+ arma::mat vecZ(nsq, c+1);
+ vecZ = vectoriseZML(Z, u_indices); // projection already applied
+
+ // solve by linear least squares
+ arma::vec _he_update(c+1);
+
+ // first check if we can get a non-negative OLS estimate
+ arma::dvec _ols = arma::solve(vecZ, Ybig, arma::solve_opts::fast);
+ if(any(_ols < 1e-8)){
+ // use NNLS here - Lawson and Hanson algorithm or FAST-NNLS
+ // the latter is only applicable when vecZ is PD - need to check this with all positive eigenvalues
+ bool _ispd;
+ _ispd = check_pd_matrix(vecZ);
+
+ if(_ispd){
+ // use the FAST NNLS solver
+ _he_update = fastNnlsSolve(vecZ, Ybig);
+ } else{
+ // have to use slower implementation from Lawson and Hanson
+ _he_update = nnlsSolve(vecZ, Ybig, he_update, Iters);
+ }
+ he_update = _he_update.tail(c);
+ } else{
+ he_update = _ols.tail(c);
+ }
+
+ return he_update;
+}
+
+
+arma::vec estHasemanElstonConstrainedGenetic(const arma::mat& Z, const arma::mat& PREML,
+ const Rcpp::List& u_indices,
+ const arma::vec& ystar, const arma::mat& Kin,
+ arma::vec he_update, const int& Iters){
+ // use constrained Haseman-Elston regression to estimate variance components - using a NNLS estimator
+ // vectorize everything
+ // we will also estimate a "residual" variance parameter
+ unsigned int n = ystar.size();
+ unsigned int c = u_indices.size(); // number of variance components
+ unsigned long nsq = n * (n + 1)/2; //size of vectorised components using just upper or lower triangle of covariance matrix
+
+ arma::mat Ycovar(n, n);
+ Ycovar = PREML * (ystar * ystar.t()) * PREML; // project onto REML P matrix
+
+ // select the upper triangular elements, including the diagonal
+ arma::uvec upper_indices = trimatu_ind(arma::size(Ycovar));
+ arma::vec Ybig = Ycovar(upper_indices);
+
+ // sequentially vectorise ZZ^T - this automatically adds a vectorised identity matrix
+ // for the "residual" variance
+ arma::mat vecZ(nsq, c+1);
+ vecZ = vectoriseZGenetic(Z, u_indices, PREML, Kin); // projection already applied
+
+ arma::vec _he_update(c+1);
+
+ // first check if we can get a non-negative OLS estimate
+ arma::dvec _ols = arma::solve(vecZ, Ybig, arma::solve_opts::fast);
+ if(any(_ols < 1e-8)){
+ // use NNLS here - Lawson and Hanson algorithm or FAST-NNLS
+ // the latter is only applicable when vecZ is PD - need to check this with all positive eigenvalues
+ bool _ispd;
+ _ispd = check_pd_matrix(vecZ);
+
+ if(_ispd){
+ // use the FAST NNLS solver
+ _he_update = fastNnlsSolve(vecZ, Ybig);
+ } else{
+ // have to use slower implementation from Lawson and Hanson
+ _he_update = nnlsSolve(vecZ, Ybig, he_update, Iters);
+ }
+ he_update = _he_update.tail(c);
+ } else{
+ he_update = _ols.tail(c);
+ }
+
+ return he_update;
+}
+
+
+arma::vec nnlsSolve(const arma::mat& vecZ, const arma::vec& Y, arma::vec nnls_update, const int& Iters){
+ // Lawson and Hanson algorithm for constrained NNLS
+ // initial params
+ // P is the passive set - i.e. the indices of the non-negative estimates
+ // R is the active set - the indices of the constrained estimates, i.e. those held at zero
+ // d = 0 is a solution vector
+ // w is a vector of the Lagrangian multipliers
+
+ // need to implement a check for infinite loop - have a max iters argument?
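+ // Outline (Lawson & Hanson): start with all coefficients active (clamped at zero);
+ // repeatedly move the variable with the largest positive Lagrange multiplier
+ // w = vecZ^T (Y - vecZ * x) into the passive set, re-solve the unconstrained least
+ // squares problem on the passive set, and step back towards feasibility whenever a
+ // passive estimate goes negative; stop when all multipliers are non-positive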
+ double constval = 0.0; // value at which to constrain values
+ unsigned int inner_count = 0;
+ unsigned int outer_count = 0;
+ double EPS = 1e-6;
+ unsigned int m = vecZ.n_cols;
+ arma::ivec P(m, arma::fill::ones); // the indices have to be set to negative values to be empty
+
+ for(int i=0; i < m; i++){
+ if(nnls_update[i] <= constval){
+ P[i] = -1;
+ }
+ }
+
+ // all indices need to be active if all are zero - i.e. the first iteration
+ // what happens if all estimates get regularized to exactly zero?
+ arma::ivec R(m, arma::fill::ones); // these are the indices of the active set
+
+ for(int i=0; i < m; i++){
+ if(nnls_update[i] > constval){
+ R[i] = -1;
+ }
+ }
+
+ arma::dvec w(m);
+ w = vecZ.t() * (Y - vecZ*nnls_update); //Lagrangian multipliers
+
+ // _hessian.print("(X^TX)^-1");
+ bool check_R;
+ bool check_wR;
+ bool check_wP;
+ bool check_conditions;
+ check_R = all(R < 0);
+ check_wR = all(w.elem(find(R > 0)) <= constval); // these should be 0 or negative
+ check_wP = all(abs(w.elem(find(P > 0)) - constval) < EPS); // these should be ~0
+ check_conditions = check_R || (check_wR && check_wP);
+
+ double max_w = 0.0;
+
+ while(!check_conditions){ // set a tolerance here to check for positive Lagrangian
+ max_w = max(w.elem(find(R > 0))); // what if several are identical?
+ unsigned int max_j = 0;
+
+ // get the index of the maximal Lagrangian multiplier
+ // need to find the maximum value in w^R, but the index needs to be from w
+ // turn this into a function?
+ for(int i=0; i < m; i++){
+ if(abs(w[i] - max_w) <= EPS){
+ max_j = i;
+ }
+ }
+
+ P[max_j] = -1 * P[max_j]; // this should reverse the sign
+ R[max_j] = -1 * R[max_j];
+
+ // find the elements >= 0
+ arma::uvec select_P = find(P > 0);
+ arma::vec s_all(m);
+ s_all.fill(constval);
+ s_all.elem(select_P) = arma::solve(vecZ.cols(select_P), Y); // is this faster?
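+ // if any passive-set estimate has gone negative, step back towards the previous
+ // feasible point by alpha = min over offending coordinates of x / (x - s)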
+ + double min_sp = s_all.elem(select_P).min(); + double alpha; // the step size + // outer_count++; + + while(min_sp <= 0){ + inner_count = 0; + // recompute selection of P element restricted to negative estimates in S + arma::uvec select_sP = find(P > 0 && s_all < 0); + arma::vec diffVec(select_sP.size()); + + // zero divisions create problems here + // need a check for inf, -inf and nan + diffVec = nnls_update.elem(select_sP)/(nnls_update.elem(select_sP) - s_all.elem(select_sP)); + alpha = diffVec.min(); + + // update estimates + nnls_update = nnls_update + (alpha * (s_all - nnls_update)); + + // switch any zeros to P and negatives to R + arma::uvec _isswitch(m, arma::fill::ones); + + for(int i=0; i < m; i++){ + if(nnls_update[i] <= constval){ + _isswitch[i] = 0; + } + } + + arma::uvec switch_j = find(_isswitch == 0); + + P.elem(switch_j) = -1 * P.elem(switch_j); + R.elem(switch_j) = -1 * R.elem(switch_j); + select_P = find(P > 0); + + // update the elements of S + s_all.fill(constval); + s_all.elem(select_P) = arma::solve(vecZ.cols(select_P), Y); + min_sp = s_all.elem(select_P).min(); + // inner_count++; + } + + nnls_update = s_all; + w = vecZ.t() * (Y - vecZ * nnls_update); + check_R = all(R < 0); + check_wR = all(w.elem(find(R > 0)) <= constval); // these should be 0 or negative + check_wP = all(abs(abs(w.elem(find(P > 0))) - constval) < EPS); // these should be ~0 + check_conditions = check_R || (check_wR && check_wP); + } + + // at convergence w^R < 0 and w^P = 0 + return nnls_update; +} + + +arma::vec fastNnlsSolve(const arma::mat& vecZ, const arma::vec& Y){ + // This uses the Fast Approximate Solution Trajectory NNLS + // from https://stackoverflow.com/questions/58006606/rcpp-implementation-of-fast-nonnegative-least-squares + + // assumes the problem is not ill-conditioned - how realistic is this? + // vecZ is almost _never_ square + int m = vecZ.n_rows; + arma::mat A(m, m); + A = vecZ * vecZ.t(); // is this right? I don't think that it is! 
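+ // NB: for min ||vecZ * x - Y|| the normal equations use A = vecZ^T vecZ and
+ // b = vecZ^T Y; forming vecZ * vecZ^T as above only matches when vecZ is
+ // square and symmetric, so the doubt in the comment above seems warranted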
+ arma::vec nnls_update = arma::solve(A, Y, + arma::solve_opts::likely_sympd + arma::solve_opts::fast); + + while(any(nnls_update < 0)){ + // define the feasible set P as all estimates > 0 + arma::uvec nonzero = find(nnls_update > 0); + + // reset estimates + nnls_update.zeros(); + + // now solve the OLS problem for params in the feasible set + nnls_update(nonzero) = arma::solve(vecZ.submat(nonzero, nonzero), Y.elem(nonzero), + arma::solve_opts::likely_sympd + arma::solve_opts::fast); + } + return nnls_update; +} + + +arma::mat vectoriseZ(const arma::mat& Z, const Rcpp::List& u_indices, const arma::mat& P){ + // sequentially vectorise the columns ZZ^T that map to each random effect + // pre- and post- multiply by the REML projection matrix + int c = u_indices.size(); + int n = Z.n_rows; + unsigned long nsq = n * (n + 1)/2; + + Rcpp::List _Zelements(1); + arma::mat bigI(n, n, arma::fill::eye); // this should be the identity which we vectorise + + // select the upper triangular elements, including the diagonal + arma::uvec upper_indices = arma::trimatu_ind(arma::size(bigI)); + arma::mat _vI = bigI(upper_indices); + _Zelements(0) = _vI; + + for(int i=0; i < c; i++){ + // extract the elements of u_indices + arma::uvec u_idx = u_indices(i); + unsigned int q = u_idx.n_rows; + arma::mat _subZ(n, q); + unsigned int qmin = arma::min(u_idx); + unsigned int qmax = arma::max(u_idx); + _subZ = Z.cols(qmin-1, qmax-1); + + arma::mat _ZZT(n, n); + _ZZT = P * (_subZ * _subZ.t()) * P; // REML projection + + // vectorise + arma::vec _vecZ = _ZZT(upper_indices); + arma::mat _vecZZT = _Zelements(0); + unsigned long _zc = _vecZZT.n_cols; + + arma::mat _vZ(nsq, _zc+1); + _vZ = arma::join_rows(_vecZZT, _vecZ); + _Zelements(0) = _vZ; + } + + arma::mat vecMat = _Zelements(0); + return vecMat; +} + + +arma::mat vectoriseZML(const arma::mat& Z, const Rcpp::List& u_indices){ + // sequentially vectorise the columns ZZ^T that map to each random effect + // pre- and post- multiply by the REML projection matrix + int c = u_indices.size(); + int n = Z.n_rows; + unsigned long nsq = n * (n + 1)/2; + + Rcpp::List _Zelements(1); + arma::mat bigI(n, n, arma::fill::eye); // this should be the identity which we vectorise + + // select the upper triangular elements, including the diagonal + arma::uvec upper_indices = arma::trimatu_ind(arma::size(bigI)); + arma::mat _vI = bigI(upper_indices); + _Zelements(0) = _vI; + + for(int i=0; i < c; i++){ + // extract the elements of u_indices + arma::uvec u_idx = u_indices(i); + unsigned int q = u_idx.n_rows; + arma::mat _subZ(n, q); + unsigned int qmin = arma::min(u_idx); + unsigned int qmax = arma::max(u_idx); + _subZ = Z.cols(qmin-1, qmax-1); + + arma::mat _ZZT(n, n); + _ZZT = (_subZ * _subZ.t()); + + // vectorise + arma::vec _vecZ = _ZZT(upper_indices); + arma::mat _vecZZT = _Zelements(0); + unsigned long _zc = _vecZZT.n_cols; + + arma::mat _vZ(nsq, _zc+1); + _vZ = arma::join_rows(_vecZZT, _vecZ); + _Zelements(0) = _vZ; + } + + arma::mat vecMat = _Zelements(0); + return vecMat; +} + +arma::mat vectoriseZGenetic(const arma::mat& Z, const Rcpp::List& u_indices, + const arma::mat& P, const arma::mat& Kin){ + // sequentially vectorise the columns ZZ^T that map to each random effect + // pre- and post- multiply by the REML projection matrix + int c = u_indices.size(); + int n = Z.n_rows; + // unsigned long nsq = pow(static_cast(n), 2); + unsigned long nsq = n * (n + 1)/2; + + Rcpp::List _Zelements(1); + arma::mat bigI(n, n, arma::fill::ones); // should this be column of 1s + + // select the 
upper triangular elements, including the diagonal + arma::uvec upper_indices = arma::trimatu_ind(arma::size(bigI)); + arma::mat _vI = bigI(upper_indices); + _Zelements(0) = _vI; + + for(int i=0; i < c; i++){ + // extract the elements of u_indices + arma::uvec u_idx = u_indices(i); + unsigned int q = u_idx.n_rows; + arma::mat _subZ(n, q); + unsigned int qmin = arma::min(u_idx); + unsigned int qmax = arma::max(u_idx); + _subZ = Z.cols(qmin-1, qmax-1); + + // always set the last component to the genetic variance if there is a kinship matrix + if(i == c-1){ + arma::vec _vecZ = Kin(upper_indices); + arma::mat _vecZZT = _Zelements(0); + unsigned long _zc = _vecZZT.n_cols; + + arma::mat _vZ(nsq, _zc+1); + _vZ = arma::join_rows(_vecZZT, _vecZ); + _Zelements(0) = _vZ; + } else{ + // compute Z_i Z_i^T + arma::mat _ZZT(n, n); + _ZZT = P * (_subZ * _subZ.t()) * P; // REML projection + + // vectorise + arma::vec _vecZ = _ZZT(upper_indices); + arma::mat _vecZZT = _Zelements(0); + unsigned long _zc = _vecZZT.n_cols; + + arma::mat _vZ(nsq, _zc+1); + _vZ = arma::join_rows(_vecZZT, _vecZ); + _Zelements(0) = _vZ; + } + } + + arma::mat vecMat = _Zelements(0); + return vecMat; +} + + +double phiLineSearch(double disp, double lower, double upper, const int& c, + const arma::vec& mu, const arma::mat& Ginv, double pi, + const arma::vec& curr_u, const arma::vec& sigma, + const arma::vec& y){ + // perform a bisection search for dispersion + // evaluate the loglihood at each bound + arma::mat littleG(c, c, arma::fill::zeros); + + for(int i=0; i= tol){ + if(c_logli > d_logli){ + upper = pd; + pd = pc; + d_logli = c_logli; + pc = lower + r*(upper - lower); + c_logli = nbLogLik(mu, pc, y) - normlihood; + } else{ + lower = pc; + pc = pd; + c_logli = d_logli; + pd = upper - r*(upper - lower); + d_logli = nbLogLik(mu, pd, y) - normlihood; + } + } + + // return the mid-point between [c, d] + double new_disp = (pc + pd)/2.0; + return new_disp; +} + + +double phiMME(const arma::vec& y, const arma::vec& curr_sigma){ + // use the pseudovariance and method of moments + double ps_bar = arma::var(y); + double y_bar = arma::mean(y); + double sigma_sum = arma::sum(y); + double denom = 0.0; + double disp_mme = 0.0; + + denom = ps_bar - y_bar - sigma_sum; + // check for near zero denom that could blow + // up the estimate + if(denom < 1e-3){ + disp_mme = 0; + } else{ + disp_mme = 1/denom; + } + + return disp_mme; +} + + + +double nbLogLik(const arma::vec& mu, double phi, const arma::vec& y){ + double logli = 0.0; + arma::vec logli_indiv(y.n_rows); + arma::vec muphi(y.n_rows); + muphi = mu/(mu + phi); + + // element wise multiplication of y and other equation elements + logli_indiv = y % arma::log(mu/(mu + phi)) + (phi * (1 - (mu/(mu+phi)))) + (arma::lgamma(y+1) - std::lgamma(phi)); + + logli = arma::sum(logli_indiv); + return logli; +} + + +double normLogLik(const int& c, const arma::mat& Ginv, const arma::mat& G, + const arma::vec& curr_u, double pi){ + double cdouble = (double)c; + double detG = arma::det(G); + double logdet = std::log(detG); + + arma::vec normlog_indiv = ((cdouble/2.0) * std::log(2*pi)) - (0.5 * logdet) - (0.5 * (curr_u.t() * Ginv * curr_u)); + double normlihood = arma::sum(normlog_indiv); + + return normlihood; +} + + + diff --git a/src/paramEst.h b/src/paramEst.h new file mode 100644 index 0000000..3fc776d --- /dev/null +++ b/src/paramEst.h @@ -0,0 +1,53 @@ +#ifndef PARAMEST_H +#define PARAMEST_H + +#include +#include +// [[Rcpp::depends(RcppArmadillo)]] +// [[Rcpp::depends(RcppEigen)]] + +arma::vec 
sigmaScoreREML_arma (const Rcpp::List& pvstar_i, const arma::vec& ystar, const arma::mat& P); +arma::mat sigmaInfoREML_arma (const Rcpp::List& pvstari, const arma::mat& P); +arma::vec sigmaScore (arma::vec ystar, arma::vec beta, arma::mat X, Rcpp::List V_partial, arma::mat V_star_inv); +arma::mat sigmaInformation (arma::mat V_star_inv, Rcpp::List V_partial); +arma::vec fisherScore (const arma::mat& hess, const arma::vec& score_vec, const arma::vec& theta_hat); +arma::vec solveEquations (const int& c, const int& m, const arma::mat& Winv, const arma::mat& Zt, const arma::mat& Xt, + const arma::mat& coeffmat, const arma::vec& beta, const arma::vec& u, const arma::vec& ystar); +// arma::vec solveEquationsPCG (const int& c, const int& m, const arma::mat& Winv, const arma::mat& Zt, const arma::mat& Xt, +// const arma::mat& coeffmat, const arma::vec& curr_theta, const arma::vec& ystar, const double& conv_tol); +arma::mat coeffMatrix(const arma::mat& X, const arma::mat& Winv, const arma::mat& Z, const arma::mat& Ginv); +arma::mat computeZstar(const arma::mat& Z, const arma::vec& curr_sigma, const Rcpp::List& u_indices); +// arma::vec conjugateGradient(const arma::mat& A, const arma::vec& x, const arma::vec& b, double conv_tol); +arma::vec estHasemanElston(const arma::mat& Z, const arma::mat& PREML, + const Rcpp::List& u_indices, const arma::vec& ystar); +arma::vec estHasemanElstonML(const arma::mat& Z, const Rcpp::List& u_indices, + const arma::vec& ystar); +arma::vec estHasemanElstonGenetic(const arma::mat& Z, const arma::mat& PREML, + const Rcpp::List& u_indices, const arma::vec& ystar, + const arma::mat& Kin); +arma::vec estHasemanElstonConstrained(const arma::mat& Z, const arma::mat& PREML, const Rcpp::List& u_indices, + const arma::vec& ystar, arma::vec he_update, const int& Iters); +arma::vec estHasemanElstonConstrainedML(const arma::mat& Z, const Rcpp::List& u_indices, + const arma::vec& ystar, arma::vec he_update, const int& Iters); +arma::vec estHasemanElstonConstrainedGenetic(const arma::mat& Z, const arma::mat& PREML, const Rcpp::List& u_indices, + const arma::vec& ystar, const arma::mat& Kin, arma::vec he_update, + const int& Iters); +arma::vec nnlsSolve(const arma::mat& vecZ, const arma::vec& Y, arma::vec nnls_update, const int& Iters); +arma::vec fastNnlsSolve(const arma::mat& vecZ, const arma::vec& Y); +arma::mat vectoriseZ(const arma::mat& Z, const Rcpp::List& u_indices, const arma::mat& P); +arma::mat vectoriseZML(const arma::mat& Z, const Rcpp::List& u_indices); +arma::mat vectoriseZGenetic(const arma::mat& Z, const Rcpp::List& u_indices, + const arma::mat& P, const arma::mat& Kin); +double phiLineSearch(double disp, double lower, double upper, const int& c, + const arma::vec& mu, const arma::mat& Ginv, double pi, + const arma::vec& curr_u, const arma::vec& sigma, + const arma::vec& y); +double phiGoldenSearch(double disp, double lower, double upper, const int& c, + const arma::vec& mu, const arma::mat& Ginv, double pi, + const arma::vec& curr_u, const arma::vec& sigma, + const arma::vec& y); +double phiMME(const arma::vec& y, const arma::vec& curr_sigma); +double nbLogLik(const arma::vec& mu, double phi, const arma::vec& y); +double normLogLik(const int& c, const arma::mat& Ginv, const arma::mat& G, + const arma::vec& curr_u, double pi); +#endif diff --git a/src/pseudovarPartial.cpp b/src/pseudovarPartial.cpp new file mode 100644 index 0000000..10442f6 --- /dev/null +++ b/src/pseudovarPartial.cpp @@ -0,0 +1,82 @@ +#include +// [[Rcpp::depends(RcppArmadillo)]] +#include 
"pseudovarPartial.h" +using namespace Rcpp; + +List pseudovarPartial(arma::mat x, List rlevels, StringVector cnames){ + // this currently doesn't support sparse matrices - it's not super clear how to do + // that concretely without defining some sparse matrix class somewhere along the line + + unsigned int items = rlevels.size(); + List outlist(items); + + for(unsigned int i = 0; i < items; i++){ + StringVector lelements = rlevels[i]; + IntegerVector icol = match(lelements, cnames); // need to add a check here in case of NAs or no matches + int low = min(icol)-1; // need indexing correction from R to C++ + int hi = max(icol)-1; + + // Need to output an S4 object - arma::sp_mat uses implicit interconversion for support dg Matrices + arma::mat omat(x.cols(low, hi) * x.cols(low, hi).t()); + outlist[i] = omat; + } + + return outlist; +} + + +List pseudovarPartial_C(arma::mat Z, List u_indices){ + // A Rcpp specific implementation that uses positional indexing rather than character indexes + unsigned int items = u_indices.size(); + List outlist(items); + + for(unsigned int i = 0; i < items; i++){ + arma::uvec icols = u_indices[i]; + + arma::mat omat(Z.cols(icols - 1) * Z.cols(icols - 1).t()); + outlist[i] = omat; + } + + return outlist; +} + + +List pseudovarPartial_P(List V_partial, const arma::mat& P){ + // A Rcpp specific implementation that uses positional indexing rather than character indexes + // don't be tempted to sparsify this - the overhead of casting is too expensive + unsigned int items = V_partial.size(); + List outlist(items); + + for(unsigned int i = 0; i < items; i++){ + // Need to output an S4 object - arma::sp_mat uses implicit interconversion for support dg Matrices + arma::mat _omat = V_partial(i); + arma::mat omat(P * _omat); + outlist[i] = omat; + } + + return outlist; + +} + + +List pseudovarPartial_G(arma::mat Z, const arma::mat& K, List u_indices){ + // A Rcpp specific implementation that uses positional indexing rather than character indexes + unsigned int items = u_indices.size(); + List outlist(items); + + for(unsigned int i = 0; i < items; i++){ + if(i == items - 1){ + arma::uvec icols = u_indices[i]; + arma::mat _omat(Z.cols(icols - 1) * K * Z.cols(icols - 1).t()); + outlist[i] = _omat; // K is equivalent to ZZ^T + } else{ + arma::uvec icols = u_indices[i]; + // Need to output an S4 object - arma::sp_mat uses implicit interconversion for support dg Matrices + arma::mat _omat(Z.cols(icols - 1) * Z.cols(icols - 1).t()); + outlist[i] = _omat; + } + } + + return outlist; + +} diff --git a/src/pseudovarPartial.h b/src/pseudovarPartial.h new file mode 100644 index 0000000..e7c5fef --- /dev/null +++ b/src/pseudovarPartial.h @@ -0,0 +1,12 @@ +#ifndef PSEUDOVARPARTIAL_H +#define PSEUDOVARPARTIAL_H + +#include +// [[Rcpp::depends(RcppArmadillo)]] + +Rcpp::List pseudovarPartial(arma::mat x, Rcpp::List rlevels, Rcpp::StringVector cnames); +Rcpp::List pseudovarPartial_C(arma::mat Z, Rcpp::List u_indices); +Rcpp::List pseudovarPartial_G(arma::mat Z, const arma::mat& G, Rcpp::List u_indices); +Rcpp::List pseudovarPartial_P(Rcpp::List V_partial, const arma::mat& P); + +#endif diff --git a/src/utils.cpp b/src/utils.cpp new file mode 100644 index 0000000..a6f0465 --- /dev/null +++ b/src/utils.cpp @@ -0,0 +1,113 @@ +#include +#include +// [[Rcpp::depends(RcppArmadillo)]] +#include "utils.h" + +// utility functions +Rcpp::LogicalVector check_na_arma_numeric(arma::vec X){ + // don't being function names with '_' + // input is an arma::vec + const int& n = X.size(); + 
Rcpp::LogicalVector _out(n);
+ bool _isnan;
+
+ for(int i=0; i < n; i++){
+ // check for NaN of any kind
+ _isnan = std::isnan(X[i]);
+ _out[i] = _isnan;
+ }
+
+ return _out;
+}
+
+
+Rcpp::LogicalVector check_inf_arma_numeric(arma::vec X){
+ // don't begin function names with '_'
+ // input is an arma::vec
+ const int& n = X.size();
+ Rcpp::LogicalVector _out(n);
+ bool _isinf;
+
+ for(int i=0; i < n; i++){
+ // check for non-finite values
+ _isinf = !std::isfinite(X[i]);
+ _out[i] = _isinf;
+ }
+
+ return _out;
+}
+
+
+Rcpp::LogicalVector check_zero_arma_numeric(arma::vec X){
+ // don't begin function names with '_'
+ // input is an arma::vec
+ const int& n = X.size();
+ Rcpp::LogicalVector _out(n);
+ bool _iszero;
+
+ for(int i=0; i < n; i++){
+ // check for exact zeros
+ _iszero = X[i] == 0;
+ _out[i] = _iszero;
+ }
+
+ return _out;
+}
+
+
+Rcpp::LogicalVector check_zero_arma_complex(arma::cx_vec X){
+ // don't begin function names with '_'
+ // input is an arma::cx_vec
+ const int& n = X.size();
+ Rcpp::LogicalVector _out(n);
+ bool _iszero;
+
+ for(int i=0; i < n; i++){
+ // check for complex zeros
+ _iszero = X[i] == 0.0;
+ _out[i] = _iszero;
+ }
+
+ return _out;
+}
+
+
+Rcpp::LogicalVector check_tol_arma_numeric(arma::vec X, double tol){
+ // don't begin function names with '_'
+ // input is an arma::vec
+ const int& n = X.size();
+ Rcpp::LogicalVector _out(n);
+ bool _iszero;
+
+ for(int i=0; i < n; i++){
+ // check whether the value is within the tolerance
+ _iszero = X[i] <= tol;
+ _out[i] = _iszero;
+ }
+
+ return _out;
+}
+
+bool check_pd_matrix(arma::mat A){
+ // check that the matrix A is positive definite - i.e. all positive eigenvalues
+ // A must be square
+ unsigned int m = A.n_cols;
+ unsigned int n = A.n_rows;
+ bool _is_sym;
+
+ if(m != n){
+ return false;
+ }
+
+ _is_sym = A.is_symmetric();
+
+ if(!_is_sym){
+ Rcpp::stop("matrix A is not symmetric");
+ return false;
+ }
+
+ arma::vec eigenvals = arma::eig_sym(A);
+ bool alltrue;
+ alltrue = all(eigenvals > 0.0);
+ return alltrue;
+}
diff --git a/src/utils.h b/src/utils.h
new file mode 100644
index 0000000..bc6f299
--- /dev/null
+++ b/src/utils.h
@@ -0,0 +1,13 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <RcppArmadillo.h>
+// [[Rcpp::depends(RcppArmadillo)]]
+
+Rcpp::LogicalVector check_na_arma_numeric(arma::vec x);
+Rcpp::LogicalVector check_inf_arma_numeric(arma::vec X);
+Rcpp::LogicalVector check_zero_arma_numeric(arma::vec X);
+Rcpp::LogicalVector check_zero_arma_complex(arma::cx_vec X);
+Rcpp::LogicalVector check_tol_arma_numeric(arma::vec X, double tol);
+bool check_pd_matrix(arma::mat A);
+#endif
diff --git a/tests/testthat/test_annotateNhoods.R b/tests/testthat/test_annotateNhoods.R
index da333fd..9bb5e69 100644
--- a/tests/testthat/test_annotateNhoods.R
+++ b/tests/testthat/test_annotateNhoods.R
@@ -1,13 +1,15 @@
 context("Testing annotateNhoods function")
-library(miloR)
 ### Set up a mock data set using simulated data
-library(SingleCellExperiment)
-library(scran)
-library(scater)
-library(irlba)
-library(MASS)
-library(mvtnorm)
+suppressWarnings({
+ library(miloR)
+ library(SingleCellExperiment)
+ library(scran)
+ library(scater)
+ library(irlba)
+ library(MASS)
+ library(mvtnorm)
+})
 
 set.seed(42)
 r.n <- 1000
@@ -104,7 +106,7 @@ colData(sim1.mylo) <- DataFrame(meta.df)
 test_that("Wrong input gives errors", {
 # Subsetted da.res
 expect_error(annotateNhoods(sim1.mylo, da_df[1:10,], "Block"),
- "the number of rows in da.res does not match the number of neighbourhoods in nhoods(x).
Are you sure da.res is the output of testNhoods(x)?", + "the number of rows in da.res does not match the number of neighbourhoods in nhoods(x). Are you sure da.res is the output of testNhoods(x) or did you use subset.nhoods?", fixed=TRUE ) @@ -124,3 +126,4 @@ test_that("The fractions are right", { da_anno <- annotateNhoods(sim1.mylo, da_df, "Block") expect_true(all(da_anno[,"Block_fraction"] == 1 )) }) + diff --git a/tests/testthat/test_checkSeparation.R b/tests/testthat/test_checkSeparation.R new file mode 100644 index 0000000..e703b6a --- /dev/null +++ b/tests/testthat/test_checkSeparation.R @@ -0,0 +1,140 @@ +context("Testing checkSeparation function") + +### Set up a mock data set using simulated data +suppressWarnings({ + library(miloR) + library(SingleCellExperiment) + library(scran) + library(scater) + library(irlba) + library(MASS) + library(mvtnorm) +}) + +set.seed(42) +r.n <- 1000 +n.dim <- 50 +block1.cells <- 500 +# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues? +block1.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50)))) +block1.eigens <- block1.eigens[order(block1.eigens)] +block1.p <- qr.Q(qr(matrix(rnorm(block1.cells^2, mean=4, sd=0.01), block1.cells))) +block1.sigma <- crossprod(block1.p, block1.p*block1.eigens) +block1.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block1.cells, mean=2, sd=0.01), sigma=block1.sigma)) + + +block2.cells <- 500 +# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues? +block2.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50)))) +block2.eigens <- block2.eigens[order(block2.eigens)] +block2.p <- qr.Q(qr(matrix(rnorm(block2.cells^2, mean=4, sd=0.01), block2.cells))) +block2.sigma <- crossprod(block2.p, block2.p*block2.eigens) +block2.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block2.cells, mean=4, sd=0.01), sigma=block2.sigma)) + + +block3.cells <- 650 +# select a set of eigen values for the covariance matrix of each block, say 50 eigenvalues? 
+block3.eigens <- sapply(1:n.dim, FUN=function(X) rexp(n=1, rate=abs(runif(n=1, min=0, max=50))))
+block3.eigens <- block3.eigens[order(block3.eigens)]
+block3.p <- qr.Q(qr(matrix(rnorm(block3.cells^2, mean=4, sd=0.01), block3.cells)))
+block3.sigma <- crossprod(block3.p, block3.p*block3.eigens)
+block3.gex <- abs(rmvnorm(n=r.n, mean=rnorm(n=block3.cells, mean=5, sd=0.01), sigma=block3.sigma))
+
+sim1.gex <- do.call(cbind, list("b1"=block1.gex, "b2"=block2.gex, "b3"=block3.gex))
+colnames(sim1.gex) <- paste0("Cell", 1:ncol(sim1.gex))
+rownames(sim1.gex) <- paste0("Gene", 1:nrow(sim1.gex))
+sim1.pca <- prcomp_irlba(t(sim1.gex), n=50, scale.=TRUE, center=TRUE)
+
+set.seed(42)
+block1.cond <- rep("A", block1.cells)
+block1.a <- sample(1:block1.cells, size=floor(block1.cells*0.9))
+block1.b <- setdiff(1:block1.cells, block1.a)
+block1.cond[block1.b] <- "B"
+
+block2.cond <- rep("A", block2.cells)
+block2.a <- sample(1:block2.cells, size=floor(block2.cells*0.05))
+block2.b <- setdiff(1:block2.cells, block2.a)
+block2.cond[block2.b] <- "B"
+
+block3.cond <- rep("A", block3.cells)
+block3.a <- sample(1:block3.cells, size=floor(block3.cells*0.5))
+block3.b <- setdiff(1:block3.cells, block3.a)
+block3.cond[block3.b] <- "B"
+
+meta.df <- data.frame("Block"=c(rep("B1", block1.cells), rep("B2", block2.cells), rep("B3", block3.cells)),
+                      "Condition"=c(block1.cond, block2.cond, block3.cond),
+                      "Replicate"=c(rep("R1", floor(block1.cells*0.33)), rep("R2", floor(block1.cells*0.33)),
+                                    rep("R3", block1.cells-(2*floor(block1.cells*0.33))),
+                                    rep("R1", floor(block2.cells*0.33)), rep("R2", floor(block2.cells*0.33)),
+                                    rep("R3", block2.cells-(2*floor(block2.cells*0.33))),
+                                    rep("R1", floor(block3.cells*0.33)), rep("R2", floor(block3.cells*0.33)),
+                                    rep("R3", block3.cells-(2*floor(block3.cells*0.33)))))
+colnames(meta.df) <- c("Block", "Condition", "Replicate")
+# define a "sample" as the combination of condition and replicate
+meta.df$Sample <- paste(meta.df$Condition, meta.df$Replicate, sep="_")
+meta.df$Vertex <- c(1:nrow(meta.df))
+
+sim1.sce <- SingleCellExperiment(assays=list(logcounts=sim1.gex),
+                                 reducedDims=list("PCA"=sim1.pca$x))
+
+sim1.mylo <- Milo(sim1.sce)
+
+# build a graph - this can take a while for large graphs - will need to play
+# around with the parallelisation options
+sim1.mylo <- buildGraph(sim1.mylo, k=21, d=30)
+
+# define neighbourhoods - this is slow for large data sets
+# how can this be sped up? There are probably some parallelisable steps
+sim1.mylo <- makeNhoods(sim1.mylo, k=21, prop=0.1, refined=TRUE,
+                        d=30,
+                        reduced_dims="PCA")
+sim1.mylo <- calcNhoodDistance(sim1.mylo, d=30)
+
+sim1.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)),
+                        "Replicate"=rep(c("R1", "R2", "R3"), 2))
+sim1.meta$Sample <- paste(sim1.meta$Condition, sim1.meta$Replicate, sep="_")
+rownames(sim1.meta) <- sim1.meta$Sample
+
+sim1.mylo <- countCells(sim1.mylo, samples="Sample", meta.data=meta.df)
+
+
+test_that("Wrong input gives errors", {
+    # no rownames - can't do subsetting
+    rownames(sim1.meta) <- NULL
+    expect_error(checkSeparation(sim1.mylo, sim1.meta, "Condition"),
+                 "Please add rownames to design.df that are the same as the colnames of nhoodCounts",
+                 fixed=TRUE)
+
+    rownames(sim1.meta) <- paste0("NOTSAMPLE", sim1.meta$Sample)
+    expect_error(checkSeparation(sim1.mylo, sim1.meta, "Condition"),
+                 "rownames of design.df are not a subset of nhoodCounts colnames",
+                 fixed=TRUE)
+
+    # Asking for column not in coldata
+    expect_error(checkSeparation(sim1.mylo, sim1.meta, "BLAH"),
+                 "BLAH is not a variable in design",
+                 fixed=TRUE)
+
+    nhoodCounts(sim1.mylo) <- matrix(0L)
+    expect_error(checkSeparation(sim1.mylo, sim1.meta, "Condition"),
+                 "nhoodCounts not found - please run countCells() first",
+                 fixed=TRUE)
+
+})
+
+test_that("Factor checking handles expected formats", {
+    sim1.meta$Variable <- seq_len(nrow(sim1.meta))
+    expect_error(checkSeparation(sim1.mylo, sim1.meta, "Variable"),
+                 "Too many levels in Variable",
+                 fixed=FALSE)
+})
+
+test_that("Output is the expected format", {
+    # set the first 5 nhood counts to 0 for one condition level
+    nhoodCounts(sim1.mylo)[c(1:5), sim1.meta$Sample[sim1.meta$Condition %in% c("A")]] <- 0
+    expect_gte(sum(checkSeparation(sim1.mylo, sim1.meta, "Condition", min.val=1)), 5)
+
+    expect_identical(length(checkSeparation(sim1.mylo, sim1.meta, "Condition", min.val=1)), nrow(nhoodCounts(sim1.mylo)))
+})
+
+
diff --git a/tests/testthat/test_glmm.R b/tests/testthat/test_glmm.R
new file mode 100644
index 0000000..724c20a
--- /dev/null
+++ b/tests/testthat/test_glmm.R
@@ -0,0 +1,121 @@
+context("Testing fitGLMM function")
+### Set up a mock data set using simulated data
+suppressWarnings({
+    library(miloR)
+    library(SingleCellExperiment)
+    library(scran)
+    library(scater)
+    library(irlba)
+})
+
+##### ------- Simulate data ------- #####
+data(sim_family)
+sim.df <- sim_family$DF
+
+set.seed(42)
+random.levels <- list("Fam"=paste0("Fam", unique(as.numeric(as.factor(sim.df$Fam)))))
+X <- as.matrix(data.frame("Intercept"=rep(1, nrow(sim.df)), "FE2"=as.numeric(sim.df$FE2)))
+Z <- as.matrix(data.frame("Fam"=as.numeric(as.factor(sim.df$Fam))))
+y <- sim.df$Mean.Count
+dispersion <- 0.5
+mmcontrol <- glmmControl.defaults()
+mmcontrol$solver <- "Fisher"
+
+test_that("Discordant input matrices give errors", {
+    # truncated y
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=Z, y=y[seq_len(nrow(X)-1)], offsets=rep(0, nrow(X)), random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol), "Dimensions of")
+
+    # truncated X
+    set.seed(42)
+    expect_error(fitGLMM(X=X[seq_len(nrow(X)-1), ], Z=Z, y=y, offsets=rep(0, nrow(X)), random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol), "Dimensions of")
+
+    # truncated Z
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=Z[seq_len(nrow(Z)-1), , drop=FALSE], y=y, offsets=rep(0, nrow(X)), random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol),
+                 "Dimensions of")
+
+    # non-square covariance matrix
+    kin <- sim_family$IBD
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=Z, y=y, offsets=rep(0, nrow(X)), Kin=kin[seq_len(nrow(kin)-1), , drop=FALSE],
+                         random.levels=random.levels, REML = TRUE, dispersion=dispersion, glmm.control=mmcontrol),
+                 "Input covariance matrix is not square")
+
+    # non-square covariance matrix - covariance only model
+    kin <- sim_family$IBD
+    g.Z <- diag(nrow(Z))
+    colnames(g.Z) <- paste0("Genetic", seq_len(ncol(g.Z)))
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=g.Z, y=y, offsets=rep(0, nrow(X)), Kin=kin[seq_len(nrow(kin)-1), , drop=FALSE], geno.only=TRUE,
+                         random.levels=random.levels, REML = TRUE, dispersion=dispersion, glmm.control=mmcontrol),
+                 "Input covariance matrix is not square")
+
+    # discordant covariance and Z dimensions - RE and covariance
+    set.seed(42)
+    expect_error(fitGLMM(X=X[seq_len(nrow(Z)-1), , drop=FALSE], Z=Z[seq_len(nrow(Z)-1), , drop=FALSE], y=y[seq_len(nrow(Z)-1)],
+                         offsets=rep(0, nrow(X)), Kin=kin, random.levels=random.levels, REML = TRUE, dispersion=dispersion,
+                         glmm.control=mmcontrol),
+                 "Input covariance matrix and Z design matrix are discordant")
+
+    # random levels and Z matrix are discordant
+    wrong.Z <- Z
+    wrong.Z[wrong.Z == 5] <- 1 # arbitrarily re-set family IDs
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=wrong.Z,
+                         y=y, offsets=rep(0, nrow(X)),
+                         random.levels=random.levels, REML = TRUE, dispersion=dispersion, glmm.control=mmcontrol),
+                 "Columns of Z are discordant with input random effect levels")
+
+    # invalid column names in Z
+    inv.Z <- Z
+    colnames(inv.Z) <- NULL
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=inv.Z, y=y, offsets=rep(0, nrow(X)), random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol), "Columns of Z must have valid names")
+
+    # non-unique column names in Z
+    fail.random.levels <- list("RE1"=paste("RE1", unique(as.numeric(as.factor(sim.df$RE1))), sep="_"),
+                               "RE2"=paste("RE1", as.numeric(unique(sim.df$RE2)), sep="_"))
+    fail.X <- as.matrix(data.frame("Intercept"=rep(1, nrow(sim.df)), "FE2"=as.numeric(sim.df$FE2)))
+    fail.Z <- as.matrix(data.frame("RE1"=as.numeric(as.factor(sim.df$RE1)), "RE2"=as.numeric(sim.df$RE2)))
+    fail.y <- sim.df$Mean.Count
+
+    set.seed(42)
+    expect_error(fitGLMM(X=fail.X, Z=fail.Z, y=fail.y, offsets=rep(0, nrow(X)), random.levels=fail.random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol),
+                 "Columns of Z are discordant with input random effect levels")
+
+})
+
+test_that("Infinite and NA values fail as expected", {
+    inf.offsets <- rep(0, nrow(X))
+    inf.offsets[sample(length(inf.offsets), size=1)] <- Inf
+
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=Z, y=y, offsets=inf.offsets, random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol), "Infinite values in initial estimates")
+
+    na.offsets <- rep(0, nrow(X))
+    na.offsets[sample(length(na.offsets), size=1)] <- NA
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=Z, y=y, offsets=na.offsets, random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol), "NA values in offsets")
+
+    na.X <- X
+    na.X[sample(seq_len(nrow(X)), size=1), 2] <- NA
+    set.seed(42)
+    expect_error(fitGLMM(X=na.X, Z=Z, y=y, offsets=rep(0, nrow(X)), random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol), "NAs values in initial estimates")
+
+    # force infinite values with large offsets
+    set.seed(42)
+    expect_error(fitGLMM(X=X, Z=Z, y=y, offsets=rep(10000, nrow(X)),
+                         random.levels=random.levels, REML = TRUE,
+                         dispersion=dispersion, glmm.control=mmcontrol), "Infinite values in initial estimates - reconsider model")
+
+})
+
+
+
diff --git a/tests/testthat/test_testNhoods.R b/tests/testthat/test_testNhoods.R
index 4bd1e80..dd7f7f1 100644
--- a/tests/testthat/test_testNhoods.R
+++ b/tests/testthat/test_testNhoods.R
@@ -8,6 +8,7 @@
 library(scater)
 library(irlba)
 library(MASS)
 library(mvtnorm)
+library(BiocParallel)
 
 set.seed(42)
 r.n <- 1000
@@ -237,3 +238,42 @@ test_that("Providing a subset model.matrix is reproducible", {
     expect_identical(kd.ref1, kd.ref2)
 })
 
+sim1.meta$Condition_num <- paste0("Condition_num", c(1, 1, 1, 0, 0, 0))
+sim1.meta$Replicate_num <- paste0("Replicate_num", c(1, 2, 3, 1, 2, 3))
+sim1.meta$Replicate2 <- paste0("Replicate2", c(1, 2, 1, 2, 1, 2))
+
+test_that("Singular Hessians are detectable and fail appropriately", {
+    set.seed(42)
+    # having a singular Hessian depends on some of the starting values <- this test needs to
+    # be reproducible and not depend on setting a specific seed. The easiest way might be to have
+    # a variance component that is effectively 0.
+
+    # collinear fixed and random effects
+    expect_error(suppressWarnings(testNhoods(sim1.mylo, design=~Condition + (1|Condition),
+                                             design.df=sim1.meta, glmm.solver="Fisher", fail.on.error=TRUE)),
+                 "Coefficients Hessian is computationally singular")
+})
+
+test_that("Invalid formulae give expected errors", {
+    expect_error(suppressWarnings(testNhoods(sim1.mylo, design=~Condition + (50|Condition),
+                                             design.df=sim1.meta, glmm.solver="Fisher")),
+                 "is an invalid formula for random effects")
+})
+
+test_that("NA or Inf cell sizes cause the expected errors", {
+    cell.sizes.na <- colSums(nhoodCounts(sim1.mylo))
+    cell.sizes.na[1] <- NA
+    expect_error(suppressWarnings(testNhoods(sim1.mylo, design=~Condition,
+                                             design.df=sim1.meta,
+                                             cell.sizes=cell.sizes.na)),
+                 "NA or Infinite values found in cell\\.sizes")
+
+    cell.sizes.inf <- colSums(nhoodCounts(sim1.mylo))
+    cell.sizes.inf[1] <- Inf
+    expect_error(suppressWarnings(testNhoods(sim1.mylo, design=~Condition,
+                                             design.df=sim1.meta,
+                                             cell.sizes=cell.sizes.inf)),
+                 "NA or Infinite values found in cell\\.sizes")
+})
+
+
diff --git a/vignettes/milo_contrasts.Rmd b/vignettes/milo_contrasts.Rmd
index 0debea8..82c4cf8 100644
--- a/vignettes/milo_contrasts.Rmd
+++ b/vignettes/milo_contrasts.Rmd
@@ -121,6 +121,24 @@ mod.constrast
 This shows the contrast matrix. If we want to test each of these comparisons then we need to pass them sequentially to `testNhoods`, then apply an additional multiple testing correction to the spatial FDR values.
 
+```{r}
+contrast1.res <- testNhoods(thy.milo, design=~ Age, design.df=thy.design, fdr.weighting="graph-overlap", model.contrasts = mod.constrast)
+head(contrast1.res)
+```
+
+This matrix of contrasts will perform a quasi-likelihood F-test over all contrasts, hence a single p-value and spatial FDR are returned. Log fold changes are returned for
+all contrasts on each level of the `Age` variable, which gives 20 log-fold change columns in the output - this is the default behaviour of `glmQLFTest` in the `edgeR` package,
+which is what Milo uses for hypothesis testing. In general, and to avoid confusion, we recommend testing each pair of contrasts separately if these are the comparisons
+of interest, as shown below.
+
+```{r}
+# compare weeks 4 and 16, with week 4 as the reference.
+cont.4vs16.res <- testNhoods(thy.milo, design=~ Age, design.df=thy.design, fdr.weighting="graph-overlap", model.contrasts = mod.constrast[c("Age4wk"), c("Age4wk - Age16wk")])
+head(cont.4vs16.res)
+```
+
+Now we have a single logFC which compares nhood abundance between week 4 and week 16.
+
 Contrasts are not limited to these simple pair-wise comparisons, we can also group levels together for comparisons. For instance, imagine we want to know what the effect of the cell counts in the week 1 mice is _compared to all other time points_.
@@ -131,15 +149,15 @@
 mod.constrast <- makeContrasts(contrasts=ave.contrast, levels=model)
 mod.constrast
 ```
-
 In this contrasts matrix we can see that we have taken the average effect over the other time points. Now running this using `testNhoods`
 
 ```{r}
 da_results <- testNhoods(thy.milo, design = ~ 0 + Age, design.df = thy.design, model.contrasts = ave.contrast, fdr.weighting="graph-overlap")
 table(da_results$SpatialFDR < 0.1)
+head(da_results)
 ```
 
-In this comparison there are `r sum(da_results$SpatialFDR < 0.1)` DA nhoods - which we can visualise on a superimposed single-cell UMAP.
+The results table shows that in this comparison there are `r sum(da_results$SpatialFDR < 0.1)` DA nhoods - which we can visualise on a superimposed single-cell UMAP.
 
 ```{r, fig.width=10, fig.height=4.5}
 thy.milo <- buildNhoodGraph(thy.milo)
diff --git a/vignettes/milo_glmm.Rmd b/vignettes/milo_glmm.Rmd
new file mode 100644
index 0000000..ab50667
--- /dev/null
+++ b/vignettes/milo_glmm.Rmd
@@ -0,0 +1,269 @@
+---
+title: "Mixed effect models for Milo DA testing"
+author: "Mike Morgan"
+date: "14/03/2023"
+output:
+  BiocStyle::html_document:
+    toc_float: true
+  BiocStyle::pdf_document: default
+package: miloR
+vignette: |
+  %\VignetteIndexEntry{Mixed effect models for Milo DA testing}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+    collapse = FALSE
+)
+```
+
+
+```{r setup, message=FALSE, warning=FALSE}
+library(miloR)
+library(SingleCellExperiment)
+library(scater)
+library(scran)
+library(dplyr)
+library(patchwork)
+library(scRNAseq)
+library(scuttle)
+library(irlba)
+library(BiocParallel)
+library(ggplot2)
+```
+
+
+# Introduction
+
+Our first implementation of Milo used a negative binomial GLM to perform hypothesis testing and identify differentially abundant neighbourhoods. GLMs are incredibly powerful,
+but they have certain limitations. Notably, they assume that the observations of the dependent variable (nhood counts) are (conditionally) independently and identically distributed - that means
+there can't be any relationship between the individual counts, e.g. they can't be derived from the same individual. When counts do share such an origin, this creates a dependency between count observations in the
+same nhood and can lead to inflated type I error rates. This is especially problematic for genetic analyses, as organisms of the same species share a genetic
+ancestry, which in humans only goes back ~100,000 years. In the simplest example, imagine we performed single-cell experiments on individuals from multiple families, and from
+each family we included siblings and parents. Within a family the individuals would share on average 50% of their DNA, so the observations for DA testing wouldn't be independent.
+For more distantly related individuals the relationships are smaller, but can be non-trivial, particularly as sample sizes increase. It's not just genetics that leads to
+dependencies: multiple measurements from the same individual, e.g. at multiple time points or from different organs, can also introduce dependency between observations.
+
+We have opted to use GLMMs to address this problem as they are very powerful and can explicitly account for fairly arbitrary dependencies, as long as these can be encoded either
+as a multi-level factor variable (sometimes referred to as clusters in the mixed effect model literature) or by an n x n covariance matrix.
+
+For the purpose of demonstrating how to use Milo in GLMM mode I'll use the `KotliarovPBMCData` data set from the `scRNAseq` package. These data are derived from SLE patients with
+variable treatment responses. This should allow us to model the batching as a random effect variable while testing for differential abundance between the high and low drug
+responders.
+
+# Load data
+
+We will use the `KotliarovPBMCData` data set as there are multiple groups that we can compare.
+
+```{r}
+pbmc.sce <- KotliarovPBMCData(mode="rna", ensembl=TRUE) # download the PBMC data from Kotliarov _et al._
+pbmc.sce
+```
+
+# Data processing and normalisation
+
+```{r}
+set.seed(42)
+# remove sparser cells
+drop.cells <- colSums(counts(pbmc.sce)) < 1000
+pbmc.sce <- computeSumFactors(pbmc.sce[, !drop.cells])
+pbmc.sce <- logNormCounts(pbmc.sce)
+
+pbmc.hvg <- modelGeneVar(pbmc.sce)
+pbmc.hvg$FDR[is.na(pbmc.hvg$FDR)] <- 1
+reducedDim(pbmc.sce, "PCA") <- prcomp_irlba(t(logcounts(pbmc.sce[pbmc.hvg$FDR < 0.1, ])), n=50, center=TRUE, scale.=TRUE)$x
+pbmc.sce
+```
+
+# Define cell neighbourhoods
+
+```{r, fig.height=4.1, fig.width=10.5}
+set.seed(42)
+pbmc.sce <- runUMAP(pbmc.sce, n_neighbors=30, pca=50) # add a UMAP for plotting results later
+
+pbmc.milo <- Milo(pbmc.sce) # from the SCE object
+reducedDim(pbmc.milo, "UMAP") <- reducedDim(pbmc.sce, "UMAP")
+
+plotUMAP(pbmc.milo, colour_by="adjmfc.time") + plotUMAP(pbmc.milo, colour_by="sampleid")
+```
+
+These UMAPs show how the different constituent cell types of PBMCs are distributed across the drug response categories (left) and samples (right). Next we build the KNN graph and
+define neighbourhoods to quantify cell abundance across our experimental samples.
+
+```{r}
+set.seed(42)
+# we build the KNN graph
+pbmc.milo <- buildGraph(pbmc.milo, k = 60, d = 30)
+pbmc.milo <- makeNhoods(pbmc.milo, prop = 0.05, k = 60, d=30, refined = TRUE, refinement_scheme="graph") # make nhoods using graph-only as this is faster
+colData(pbmc.milo)$ObsID <- paste(colData(pbmc.milo)$tenx_lane, colData(pbmc.milo)$sampleid, sep="_")
+pbmc.milo <- countCells(pbmc.milo, meta.data = data.frame(colData(pbmc.milo)), samples="ObsID") # make the nhood X sample counts matrix
+pbmc.milo
+```
+
+Do we have a good distribution of nhood sizes?
+
+```{r}
+plotNhoodSizeHist(pbmc.milo)
+```
+
+The smallest nhood is 60 (we used k=60) - that should be sufficient for the number of samples (N~120).
+
+# Demonstrating the GLMM syntax
+
+Now we have the pieces in place for DA testing, and can demonstrate how to use the GLMM. We should first consider what our random effect variable is. There is a fair bit of debate on
+what constitutes a random effect vs. a fixed effect. As a rule of thumb, we can ask if the groups are randomly selected from a larger population of possible groups. For instance,
+if we recruited patients from 5 hospitals, could we consider the hospital as a random effect if there are actually 500 hospitals that we could have chosen? For these PBMC data
+we don't have a variable in the experiment that fits this definition _per se_, so we will use `tenx_lane` as the random effect, on the assumption that cells were randomly assigned to batches.
+
+```{r}
+pbmc.design <- data.frame(colData(pbmc.milo))[,c("tenx_lane", "adjmfc.time", "sample", "sampleid", "timepoint", "ObsID")]
+pbmc.design <- distinct(pbmc.design)
+rownames(pbmc.design) <- pbmc.design$ObsID
+## Reorder rownames to match columns of nhoodCounts(milo)
+pbmc.design <- pbmc.design[colnames(nhoodCounts(pbmc.milo)), , drop=FALSE]
+table(pbmc.design$adjmfc.time)
+```
+
+We encode the fixed effect variables as normal - but the random effects are different. We encode them as `(1|variable)`, which tells the model that this is a random intercept. There
+are also random-slope GLMMs, but Milo doesn't currently work with these. There are a few other arguments we have to pass to `testNhoods`. We need to consider which solver we use, as
+the parameter estimation is a little more complex. The options are 'Fisher', 'HE' or 'HE-NNLS'; the latter refers to a constrained optimisation for the variance components. If at
+any point during the optimisation negative variance estimates are encountered, then Milo will produce a warning message and automatically switch to 'HE-NNLS'. As we are estimating
+variances, we also want these to be unbiased, so we use restricted maximum likelihood (`REML=TRUE`). Note that NB-GLMMs are by construction slower than NB-GLMs as there are additional
+matrix inversion steps that don't scale very nicely. While we have made every effort to reduce the computational burden, we are still limited by the bounds of matrix algebra!
+
+```{r}
+set.seed(42)
+rownames(pbmc.design) <- pbmc.design$ObsID
+bpparam <- SerialParam()
+register(bpparam)
+
+da_results <- testNhoods(pbmc.milo, design = ~ adjmfc.time + (1|tenx_lane), design.df = pbmc.design, fdr.weighting="graph-overlap",
+                         glmm.solver="HE-NNLS", REML=TRUE, norm.method="TMM", BPPARAM = bpparam, fail.on.error=FALSE)
+table(da_results$SpatialFDR < 0.1)
+```
+
+We can see that the GLMM produces a warning if parameters don't converge - this is important because we want to know if we can trust our estimates or not. One way to handle this
+is to increase `max.iters` (the default is 50) - the downside is that this will increase the compute time and doesn't guarantee convergence. If the nhood counts are very sparse then this
+can cause problems as there isn't much information/variance from which to learn the (locally) optimal parameter values. An additional point is that the GLMM may fail on some nhoods,
+likely due to singular Hessian matrices during parameter estimation. These nhoods will have results with all `NA` values.
+
+```{r}
+which(is.na(da_results$logFC))
+```
+
+In this analysis there are `r length(which(is.na(da_results$logFC)))` nhood models that failed. If this happens for a large number of nhoods then there may be issues with the
+combination of variables in the model, the nhood counts might have a lot of zero-values, or there could be separation. For the latter, `checkSeparation` can be used to check for all-zero
+values in the testing variables of interest. If any nhoods have perfect separation between zero and non-zero counts on a variable level then these nhoods can be excluded from the
+analysis.
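As a minimal sketch of what that exclusion could look like (assuming `testNhoods` exposes the `subset.nhoods` argument that the package's error messages refer to, and that `checkSeparation` returns one logical value per nhood, as its tests suggest):

```r
# flag nhoods where one level of the test variable sits entirely below min.val
is.sep <- checkSeparation(pbmc.milo, design.df=pbmc.design,
                          condition="adjmfc.time", min.val=3)

# refit the GLMM on the non-separated nhoods only
da_nosep <- testNhoods(pbmc.milo, design=~adjmfc.time + (1|tenx_lane),
                       design.df=pbmc.design, fdr.weighting="graph-overlap",
                       glmm.solver="HE-NNLS", REML=TRUE, norm.method="TMM",
                       subset.nhoods=!is.sep, fail.on.error=FALSE)
```

The chunk below identifies which nhoods are affected in these data.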
+
+```{r}
+which(checkSeparation(pbmc.milo, design.df=pbmc.design, condition="adjmfc.time", min.val=3))
+```
+
+Here we can see that nhood 1632 can be separated into counts >= 3 and < 3 on our test variable `adjmfc.time` - this is the same nhood that encounters a model failure in our GLMM. We
+can also visualise the count distribution to confirm this using `plotNhoodCounts` (kindly contributed by Nick Hirschmüller).
+
+```{r}
+plotNhoodCounts(pbmc.milo, which(checkSeparation(pbmc.milo, design.df=pbmc.design, condition="adjmfc.time", min.val=3)),
+                design.df=pbmc.design, condition="adjmfc.time")
+```
+
+This shows the extremely low counts in the d0 high group, or more specifically, that only a single observation is non-zero.
+
+As with the GLM implementation, the GLMM calculates a log fold-change and corrected P-value for each neighbourhood; here we find significant differential
+abundance between conditions for `r sum(da_results$SpatialFDR < 0.1)` neighbourhoods. The hypothesis testing is slightly different - for the GLM we use `edgeR::glmQLFTest`, which
+performs an F-test on the quasi-likelihood negative binomial fit. In the GLMM we use a pseudo-likelihood, so instead we opt for a Wald-type test on the fixed effect variable
+log-fold change; FDR correction is performed in the same way.
+
+The output of `testNhoods` with a GLMM has some additional columns that are worth exploring.
+
+```{r}
+head(da_results[!is.na(da_results$logFC), ])
+```
+
+Due to the way parameter estimation is performed in the GLMM, we can directly compute standard errors (SE column) - these are used to compute the subsequent test statistic (tvalue)
+and p-value. We next have the variance parameter estimate for each random effect variable (here 'tenx_lane variance'). As we use constrained optimisation to prevent negative
+estimates, some of these values will be bounded at 1e-8. We then have a column that records which nhoods converged - this can be useful for checking if, for example, the
+inference is different between converged and not-converged nhoods. We also return the estimated dispersion value and the log pseudo-likelihood, in addition to the same columns in
+the results table as when using the GLM.
+
+We can also inspect our results as we would for the GLM, by using the neighbourhood graph produced by `buildNhoodGraph`.
+
+```{r, fig.width=10, fig.height=4.5}
+pbmc.milo <- buildNhoodGraph(pbmc.milo, overlap=25)
+
+# we need to subset the plotting results as plotNhoodGraphDA can't handle the NAs internally
+plotUMAP(pbmc.milo, colour_by="adjmfc.time") + plotNhoodGraphDA(pbmc.milo, da_results[!is.na(da_results$logFC), ],
+                                                                subset.nhoods=!is.na(da_results$logFC), alpha=0.1) +
+  plot_layout(guides="auto")
+```
+
+We can see that there are some complex differences between the high and low responder patients. How does this compare to running the same analysis with a GLM, using the batching
+variable as a fixed effect?
+
+```{r}
+set.seed(42)
+# we need to place the test variable at the end of the formula
+glm_results <- testNhoods(pbmc.milo, design = ~ tenx_lane + adjmfc.time, design.df = pbmc.design, fdr.weighting="graph-overlap",
+                          REML=TRUE, norm.method="TMM", BPPARAM = bpparam)
+table(glm_results$SpatialFDR < 0.1)
+```
+
+The first and obvious difference is that we have fewer DA nhoods. We can attribute this to the fact that the GLM uses more degrees of freedom to model the batching variable, which
+reduces the overall statistical power.
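To make the degrees-of-freedom argument concrete, we can count the mean parameters each design spends; this is a quick sketch using base R's `model.matrix`, not part of the original analysis:

```r
# the GLM estimates one coefficient per tenx_lane level on top of the condition effect
ncol(model.matrix(~ tenx_lane + adjmfc.time, data=pbmc.design))

# the GLMM estimates only the fixed effects, plus a single variance
# component that summarises the whole batching structure
ncol(model.matrix(~ adjmfc.time, data=pbmc.design)) + 1
```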
+
+```{r, fig.width=10, fig.height=4.5}
+plotUMAP(pbmc.milo, colour_by="adjmfc.time") + plotNhoodGraphDA(pbmc.milo, glm_results, alpha=0.1) +
+  plot_layout(guides="auto")
+```
+
+We can do a side-by-side comparison of the estimated log fold changes from the GLM and GLMM.
+
+```{r}
+comp.da <- merge(da_results, glm_results, by='Nhood')
+comp.da$Sig <- "none"
+comp.da$Sig[comp.da$SpatialFDR.x < 0.1 & comp.da$SpatialFDR.y < 0.1] <- "Both"
+comp.da$Sig[comp.da$SpatialFDR.x >= 0.1 & comp.da$SpatialFDR.y < 0.1] <- "GLM"
+comp.da$Sig[comp.da$SpatialFDR.x < 0.1 & comp.da$SpatialFDR.y >= 0.1] <- "GLMM"
+
+ggplot(comp.da, aes(x=logFC.x, y=logFC.y)) +
+    geom_point(data=comp.da[, c("logFC.x", "logFC.y")], aes(x=logFC.x, y=logFC.y),
+               colour='grey80', size=1) +
+    geom_point(aes(colour=Sig)) +
+    labs(x="GLMM LFC", y="GLM LFC") +
+    facet_wrap(~Sig) +
+    NULL
+```
+
+This shows that the parameter estimates are extremely similar - this is what we _should_ expect to see. We can see that both models identify the nhoods with the strongest DA. The
+difference appears in the nhoods that are more modestly DA - the GLMM has more power to identify these.
+
+# A note on when to use GLMM vs. GLM
+
+In general, GLMMs require larger sample sizes than GLMs - the power gain comes from the narrower scope of the GLMM in its inference, which leads to (generally) smaller standard
+errors and thus bigger test statistics. That doesn't mean that GLMMs are inherently better than GLMs - with great power comes great responsibility, and it's easy to abuse
+a mixed effect model. In general I wouldn't recommend using a GLMM with fewer than 50 observations, nor without a good case for including a variable as a random effect. The simplest
+case for this is where you have multiple observations per experimental individual/sample, and thus the nhood counts are not i.i.d. The other obvious case, as discussed in the introduction,
+is for genetic analyses or for time-course data.
+
+
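To make the repeated-measures case concrete, such a design might be encoded as below; this is a sketch only, and `Donor` is a hypothetical metadata column rather than part of the PBMC design used above:

```r
# hypothetical: several samples per donor make the nhood counts non-i.i.d.,
# so the donor is modelled as a random intercept
da_repeat <- testNhoods(pbmc.milo, design=~adjmfc.time + (1|Donor),
                        design.df=pbmc.design, fdr.weighting="graph-overlap",
                        glmm.solver="Fisher", REML=TRUE, norm.method="TMM")
```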
+
+**Session Info**
+
+```{r}
+sessionInfo()
+```
+
+
diff --git a/vignettes/milo_glmm.html b/vignettes/milo_glmm.html
new file mode 100644
index 0000000..61177d8
--- /dev/null
+++ b/vignettes/milo_glmm.html
@@ -0,0 +1,2341 @@
[2,341 lines of rendered HTML omitted: this file is the knitted output of the milo_glmm.Rmd vignette above and duplicates its content.]