New updates to improve package stability (#20)
* Improve Fine-Gray model simulation.

* Fine-Gray model: increase the frequency of the second (competing) outcome in simulated data.

* Add a new fold-split function that honors a user-supplied foldid for the Fine-Gray model, to avoid discrepancies caused by the different input formats of the Cox and Fine-Gray models.

* Fix a bug in the logistic regression two-step procedure.

* Improve stability of two-stage selection when only one pair can be selected.

* Enable parallel computation for cross-validation.

* Fix bugs in mcv.FLORAL and a.FLORAL.

* Fix a bug in the Fine-Gray model with longitudinal covariates.

* Fix a bug in the finegray method.

* Update README file.

* Update NEWS.

* Increment version number to 0.2.0.9000.
tengfei-emory authored Jan 8, 2024
1 parent d10d714 commit 02277a8
Showing 14 changed files with 474 additions and 194 deletions.
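
Before the file-by-file diff, a quick usage sketch of the headline feature. This is not part of the commit itself: it reuses the `simu()` call from the package's documented examples and assumes the default family is appropriate for the simulated linear-model outcome.

```r
library(FLORAL)

set.seed(23420)
dat <- simu(n = 50, p = 30, model = "linear")

# New in 0.2.0.9000: ncore > 1 runs the ncv cross-validation folds in
# parallel via doParallel/foreach; ncore = 1 keeps the previous serial path.
fit <- FLORAL(x = dat$xcount, y = dat$y, ncv = 5, ncore = 2,
              progress = FALSE, step2 = FALSE, plot = FALSE)
```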
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: FLORAL
Type: Package
Title: Fit Log-Ratio Lasso Regression for Compositional Data
Version: 0.2.0
Version: 0.2.0.9000
Date: 2023-07-04
Authors@R: c(
person("Teng", "Fei", , "[email protected]", role = c("aut", "cre", "cph"),
8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,3 +1,11 @@
# FLORAL (development version)

* Improves stability when fitting Fine-Gray model with longitudinal covariates.

* Enables parallel computation for cross-validation.

* Fixes several bugs as reported in Issues.

# FLORAL 0.2.0

* Including more examples in document compared to CRAN version (0.1.0.9000)
34 changes: 24 additions & 10 deletions R/FLORAL.R
@@ -15,6 +15,7 @@
#' @param a A scalar between 0 and 1: \code{a} is the weight for lasso penalty while \code{1-a} is the weight for ridge penalty.
#' @param mu Value of penalty for the augmented Lagrangian
#' @param ncv Folds of cross-validation. Use \code{NULL} if cross-validation is not wanted.
#' @param ncore Number of cores for parallel computing for cross-validation. Default is 1.
#' @param intercept \code{TRUE} or \code{FALSE}, indicating whether an intercept should be estimated.
#' @param foldid A vector of fold indicator. Default is \code{NULL}.
#' @param step2 \code{TRUE} or \code{FALSE}, indicating whether a second-stage feature selection for specific ratios should be performed for the features selected by the main lasso algorithm. Will only be performed if cross validation is enabled.
@@ -45,7 +46,7 @@
#' # fit <- FLORAL(dat$xcount,survival::Surv(dat$t,dat$d,type="mstate"),failcode=1,
#' # family="finegray",progress=FALSE,step2=FALSE)
#'
#' @import Rcpp ggplot2 survival glmnet dplyr
#' @import Rcpp ggplot2 survival glmnet dplyr doParallel foreach doRNG parallel
#' @importFrom survcomp concordance.index
#' @importFrom reshape melt
#' @importFrom utils combn
@@ -69,6 +70,7 @@ FLORAL <- function(x,
a=1,
mu=1,
ncv=5,
ncore=1,
intercept=FALSE,
foldid=NULL,
step2=TRUE,
@@ -103,7 +105,8 @@
foldid,
step2,
progress,
plot)
plot,
ncore=ncore)
}

}else if(family == "binomial"){
@@ -123,7 +126,8 @@
foldid,
step2,
progress,
plot)
plot,
ncore=ncore)
}

}else if(family == "cox"){
@@ -165,7 +169,8 @@
foldid,
step2,
progress,
plot)
plot,
ncore=ncore)

}

@@ -183,7 +188,8 @@
foldid,
step2,
progress,
plot)
plot,
ncore=ncore)

}
}else if (family == "finegray"){
@@ -210,9 +216,10 @@

if (is.null(failcode)){
warning("`failcode` is `NULL`. Using the first failure type as default")
df_FG <- finegray(Surv(tstart, tstop, dd) ~ ., id=id, data=xy, etype=failcode,timefix = FALSE)
failcode = 1
df_FG <- finegray(Surv(tstart, tstop, dd, type="mstate") ~ ., id=id, data=xy, etype=failcode,timefix = FALSE)
}else{
df_FG <- finegray(Surv(tstart, tstop, dd) ~ ., id=id, data=xy, etype=failcode,timefix = FALSE)
df_FG <- finegray(Surv(tstart, tstop, dd, type="mstate") ~ ., id=id, data=xy, etype=failcode,timefix = FALSE)
}

newx <- as.matrix(df_FG[,colnames(x)])
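
The `type="mstate"` addition above is the substance of the finegray fix: `finegray()` needs the status variable flagged as multi-state to tell the competing events apart. A toy sketch with hypothetical data, mirroring the call in the diff:

```r
library(survival)

# Toy competing-risks data: dd = 0 (censored), 1 (event of interest), 2 (competing)
toy <- data.frame(id = 1:6,
                  tstart = rep(0, 6),
                  tstop  = c(2, 3, 4, 5, 6, 7),
                  dd     = c(0, 1, 2, 1, 0, 2),
                  x1     = rnorm(6))

# With type = "mstate", dd is treated as a multi-state status, and finegray()
# builds the weighted (fgstart, fgstop, fgstatus, fgwt) data set for the
# failure type given by etype.
df_FG <- finegray(Surv(tstart, tstop, dd, type = "mstate") ~ ., id = id,
                  data = toy, etype = 1, timefix = FALSE)
```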
@@ -234,7 +241,8 @@
foldid,
step2,
progress,
plot)
plot,
ncore=ncore)

}

@@ -270,7 +278,8 @@
foldid,
step2,
progress,
plot)
plot,
ncore=ncore)

}

@@ -378,7 +387,7 @@
#' dat <- simu(n=50,p=30,model="linear")
#' fit <- mcv.FLORAL(mcv=2,ncore=1,x=dat$xcount,y=dat$y,ncv=2,progress=FALSE,step2=TRUE,plot=FALSE)
#'
#' @import Rcpp ggplot2 survival glmnet dplyr doParallel foreach doRNG parallel
#' @import Rcpp ggplot2 survival glmnet dplyr doParallel foreach parallel
#' @importFrom survcomp concordance.index
#' @importFrom reshape melt
#' @importFrom utils combn
@@ -444,6 +453,7 @@ mcv.FLORAL <- function(mcv=10,
a,
mu,
ncv,
ncore=1,
intercept,
foldid=NULL,
step2,
@@ -499,6 +509,7 @@
a,
mu,
ncv,
ncore=1,
intercept,
foldid=NULL,
step2,
@@ -715,6 +726,7 @@ a.FLORAL <- function(a=c(0.1,0.5,1),
a = a[i],
mu,
ncv,
ncore=1,
intercept,
foldid=foldid,
step2,
@@ -769,6 +781,7 @@ a.FLORAL <- function(a=c(0.1,0.5,1),
a = a[1],
mu,
ncv,
ncore=1,
intercept,
foldid=NULL,
step2,
@@ -796,6 +809,7 @@ a.FLORAL <- function(a=c(0.1,0.5,1),
a = a[i],
mu,
ncv,
ncore=1,
intercept,
foldid=fit0$foldid,
step2,
141 changes: 96 additions & 45 deletions R/LogRatioCoxLasso.R
@@ -13,7 +13,8 @@ LogRatioCoxLasso <- function(x,
plot=TRUE,
mcv="Deviance",
loop1=FALSE,
loop2=FALSE){
loop2=FALSE,
ncore=1){

ptm <- proc.time()

@@ -66,7 +67,7 @@
if (!is.null(ncv)){

cvdev <- matrix(0,nrow=length(lidx),ncol=ncv)
cvdevnull <- rep(0,ncol=ncv)
# cvdevnull <- rep(0,ncol=ncv)

if (is.null(foldid)){
labels <- coxsplity(as.matrix(y),ncv)
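
`coxsplity` (the fold splitter referenced here, related to the commit note on fold splitting) is not shown in this diff. Purely as a hypothetical sketch of the assumed behavior — spreading events and censored observations evenly across folds so no fold is starved of events:

```r
# Hypothetical sketch only; not the package's actual coxsplity() implementation.
coxsplity_sketch <- function(d, ncv) {
  labels <- integer(length(d))
  labels[d == 1] <- sample(rep_len(1:ncv, sum(d == 1)))  # events
  labels[d == 0] <- sample(rep_len(1:ncv, sum(d == 0)))  # censored
  labels
}
```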
@@ -76,66 +77,112 @@

# labels <- caret::createFolds(factor(d),k=ncv)

for (cv in 1:ncv){
if (ncore == 1){

if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n"))

# train.x <- x[-labels[[cv]],]
# train.d <- d[-labels[[cv]]]
# train.t <- t[-labels[[cv]]]
# test.x <- x[labels[[cv]],]
# test.d <- d[labels[[cv]]]
# test.t <- t[labels[[cv]]]

train.x <- x[labels!=cv,]
train.d <- d[labels!=cv]
train.t <- t[labels!=cv]
test.x <- x[labels==cv,]
test.d <- d[labels==cv]
test.t <- t[labels==cv]

cv.devnull <- 0
train.tj <- sort(train.t[train.d==1])
for (j in 1:length(train.tj)){
cv.devnull <- cv.devnull + log(sum(train.t >= train.tj[j]))
for (cv in 1:ncv){

if (progress) cat(paste0("Algorithm running for cv dataset ",cv," out of ",ncv,": \n"))

# train.x <- x[-labels[[cv]],]
# train.d <- d[-labels[[cv]]]
# train.t <- t[-labels[[cv]]]
# test.x <- x[labels[[cv]],]
# test.d <- d[labels[[cv]]]
# test.t <- t[labels[[cv]]]

train.x <- x[labels!=cv,]
train.d <- d[labels!=cv]
train.t <- t[labels!=cv]
test.x <- x[labels==cv,]
test.d <- d[labels==cv]
test.t <- t[labels==cv]

cv.devnull <- 0
train.tj <- sort(train.t[train.d==1])
for (j in 1:length(train.tj)){
cv.devnull <- cv.devnull + log(sum(train.t >= train.tj[j]))
}
cv.devnull <- 2*cv.devnull

cvfit <- cox_enet_al(train.x,train.t,train.d,train.tj,length(lidx),mu,100,lambda[lidx],wcov,a,adjust,ncov,cv.devnull,progress,loop1,loop2,notcv=FALSE)

cv.devnull <- 0
loglik <- rep(0,length(lidx))
linprod <- test.x %*% cvfit$beta

if (mcv == "Deviance"){

test.tj <- sort(test.t[test.d==1])
for (j in 1:length(test.tj)){
cv.devnull <- cv.devnull + log(sum(test.t >= test.tj[j]))
if (sum(test.t >= test.tj[j]) > 1){
cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] -
log(colSums(exp(linprod[test.t >= test.tj[j],])) + 1e-8)
}else if (sum(test.t >= test.tj[j]) == 1){
cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] -
log(exp(linprod[test.t >= test.tj[j],]) + 1e-8)
}
#- log(accu(link(widx))+1e-8)
}

# cvdevnull[cv] <- 2*cv.devnull
cvdev[,cv] <- -2*cvdev[,cv]

}else if (mcv == "Cindex"){

for (kk in 1:length(lidx)){
cvdev[kk,cv] <- 1-concordance.index(x=linprod[,kk],surv.time=test.t,surv.event=test.d)$c.index
}

}
# cvmse[,cv] <- apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y - exp(x)/(1+exp(x)))^2)/length(test.y))

}
cv.devnull <- 2*cv.devnull

cvfit <- cox_enet_al(train.x,train.t,train.d,train.tj,length(lidx),mu,100,lambda[lidx],wcov,a,adjust,ncov,cv.devnull,progress,loop1,loop2,notcv=FALSE)
}else if (ncore > 1){

cv.devnull <- 0
loglik <- rep(0,length(lidx))
linprod <- test.x %*% cvfit$beta
if (progress) warning(paste0("Using ", ncore ," core for cross-validation computation."))

if (mcv == "Deviance"){
cl <- makeCluster(ncore)
registerDoParallel(cl)

cvdev <- foreach(cv=1:ncv,.combine=cbind) %dopar% {

train.x <- x[labels!=cv,]
train.d <- d[labels!=cv]
train.t <- t[labels!=cv]
test.x <- x[labels==cv,]
test.d <- d[labels==cv]
test.t <- t[labels==cv]

cv.devnull <- 0
train.tj <- sort(train.t[train.d==1])
for (j in 1:length(train.tj)){
cv.devnull <- cv.devnull + log(sum(train.t >= train.tj[j]))
}
cv.devnull <- 2*cv.devnull

cvfit <- cox_enet_al(train.x,train.t,train.d,train.tj,length(lidx),mu,100,lambda[lidx],wcov,a,adjust,ncov,cv.devnull,FALSE,loop1,loop2,notcv=FALSE)

linprod <- test.x %*% cvfit$beta
cv.dev <- rep(0,length(lidx))

test.tj <- sort(test.t[test.d==1])
for (j in 1:length(test.tj)){
cv.devnull <- cv.devnull + log(sum(test.t >= test.tj[j]))
if (sum(test.t >= test.tj[j]) > 1){
cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] -
cv.dev <- cv.dev + linprod[test.t == test.tj[j],] -
log(colSums(exp(linprod[test.t >= test.tj[j],])) + 1e-8)
}else if (sum(test.t >= test.tj[j]) == 1){
cvdev[,cv] <- cvdev[,cv] + linprod[test.t == test.tj[j],] -
cv.dev <- cv.dev + linprod[test.t == test.tj[j],] -
log(exp(linprod[test.t >= test.tj[j],]) + 1e-8)
}
#- log(accu(link(widx))+1e-8)
}

cvdevnull[cv] <- 2*cv.devnull
cvdev[,cv] <- -2*cvdev[,cv]

}else if (mcv == "Cindex"){

for (kk in 1:length(lidx)){
cvdev[kk,cv] <- 1-concordance.index(x=linprod[,kk],surv.time=test.t,surv.event=test.d)$c.index
}

cv.dev <- -2*cv.dev
cv.dev
}



# cvmse[,cv] <- apply(cbind(1,test.x) %*% rbind(t(cvfit$beta0),cvfit$beta),2,function(x) sum((test.y - exp(x)/(1+exp(x)))^2)/length(test.y))
stopCluster(cl)

}
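
The `ncore > 1` branch above follows the standard doParallel/foreach recipe: register a cluster, let `%dopar%` dispatch the folds, and `cbind` the per-fold results into the `cvdev` matrix. A self-contained skeleton of that pattern, with placeholder work in place of the real fold fit:

```r
library(doParallel)

ncv <- 5
cl <- makeCluster(2)   # e.g. ncore = 2
registerDoParallel(cl)

# Each iteration returns one per-fold vector; .combine = cbind assembles
# a matrix with one column per fold, matching how cvdev is used above.
cvdev <- foreach(cv = 1:ncv, .combine = cbind) %dopar% {
  rnorm(10)            # placeholder for the per-fold deviance vector
}

stopCluster(cl)
dim(cvdev)             # 10 x 5
```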

@@ -230,6 +277,8 @@ LogRatioCoxLasso <- function(x,
stepglmnet <- cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")
x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)]
idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)]
}else{
idxs <- as.vector(idxs)
}

df_step2 <- data.frame(t=t,d=d,x=x.select.min)
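
This new `else` branch is the single-pair stability fix noted in the commit message: `cv.glmnet` cannot refit on a single column, so when exactly one log-ratio pair survives step 2, the pair indices are flattened to a vector instead of being pruned further. A sketch with hypothetical values (the guard condition sits above this hunk and is assumed here):

```r
# Hypothetical: columns of idxs index the selected taxa pairs.
idxs <- matrix(c(3L, 7L), nrow = 2)  # exactly one pair selected (taxa 3 and 7)

if (ncol(idxs) > 1) {
  # several pairs: refit with cv.glmnet and keep the nonzero columns (as above)
} else {
  idxs <- as.vector(idxs)            # single pair: skip the refit, keep c(3L, 7L)
}
```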
@@ -259,6 +308,8 @@
stepglmnet <- cv.glmnet(x=x.select.min,y=Surv(t,d),type.measure = "deviance",family="cox")
x.select.min <- x.select.min[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)]
idxs <- idxs[,which(stepglmnet$glmnet.fit$beta[,stepglmnet$index[1]]!=0)]
}else{
idxs <- as.vector(idxs)
}

df_step2 <- data.frame(t=t,d=d,x=x.select.min)