From 38ba74a36d6a65a4e49402210d54c93727649ef3 Mon Sep 17 00:00:00 2001
From: Peter Carbonetto
Date: Tue, 16 Jul 2024 08:34:16 -0500
Subject: [PATCH] Implemented first draft of script compile_lda_runs.R.

---
 scripts/compile_lda_runs.R | 52 ++++++++++++++++++++++++++++++++++++++
 scripts/run_lda.R          |  3 ---
 scripts/run_lda.sbatch     |  8 +++---
 scripts/run_lda_all.sh     | 46 ++++++++++++++++++++++++++++++++-
 4 files changed, 101 insertions(+), 8 deletions(-)
 create mode 100644 scripts/compile_lda_runs.R

diff --git a/scripts/compile_lda_runs.R b/scripts/compile_lda_runs.R
new file mode 100644
index 0000000..04f529d
--- /dev/null
+++ b/scripts/compile_lda_runs.R
@@ -0,0 +1,52 @@
+# Compile the LDA runs for one data set into a single .RData file.
+library(tools)
+library(stringr)
+library(tm)
+library(topicmodels)
+
+# Combine results from all files of the form lda-*.rds in this
+# directory.
+out.dir <- "../output/nips/rds"
+
+# List all the RDS files containing the model fits.
+files <- Sys.glob(file.path(out.dir,"lda-*.rds"))
+n <- length(files)
+if (n == 0)
+  stop("No files of the form lda-*.rds were found in ",out.dir)
+
+# Set up two data structures: "fits", a list used to store all the
+# results; and "dat", a data frame summarizing the model parameters
+# and optimization settings used to produce these fits.
+fits <- vector("list",n)
+labels <- str_remove(basename(files),"\\.rds$")
+names(fits) <- labels
+dat <- data.frame(label = labels,
+                  k = 0,
+                  method = "",
+                  extrapolate = FALSE,
+                  stringsAsFactors = FALSE)
+
+# Load the results from the RDS files.
+for (i in seq_len(n)) {
+  # NOTE: assumes labels look like lda-<data>-<method>[-ex]-k=<k>.
+  out <- readRDS(files[i])
+  fits[[i]] <- out$lda
+  dat[i,"k"] <- out$lda@k
+  dat[i,"method"] <- unlist(strsplit(labels[i],"-"))[3]
+  dat[i,"extrapolate"] <- grepl("ex",labels[i],fixed = TRUE)
+}
+
+# Reorder the results in "fits" and "dat".
+dat <- transform(dat,method = factor(method,c("em","scd")))
+i <- with(dat,order(k,extrapolate,method))
+dat <- dat[i,]
+fits <- fits[i]
+rownames(dat) <- NULL
+
+# Convert "k" to a factor (after sorting, which used the numeric k).
+dat <- transform(dat,k = factor(k))
+
+# Save "dat" and "fits" to lda.RData, then recompress the file.
+save(list = c("dat","fits"),
+     file = "lda.RData")
+resaveRdaFiles("lda.RData")
diff --git a/scripts/run_lda.R b/scripts/run_lda.R
index 154412f..c803794 100755
--- a/scripts/run_lda.R
+++ b/scripts/run_lda.R
@@ -57,9 +57,6 @@ k <- ncol(fit0$F)
 # -------
 # Now we are ready to perform variational inference for the LDA model.
 #
-# For the droplet data with k = 10, this step took roughly 6 min per
-# iteration.
-#
 # For the 68k PBMC data with k = 10, this step took roughly 20 min
 # per iteration.
 #
diff --git a/scripts/run_lda.sbatch b/scripts/run_lda.sbatch
index 1f70f35..61ec1ec 100644
--- a/scripts/run_lda.sbatch
+++ b/scripts/run_lda.sbatch
@@ -1,10 +1,10 @@
 #!/bin/bash
 
-#SBATCH --partition=broadwl
+#SBATCH --partition=mstephens
 #SBATCH --account=pi-mstephens
 #SBATCH --cpus-per-task=4
-#SBATCH --mem=8G
-#SBATCH --time=24:00:00
+#SBATCH --mem=32G
+#SBATCH --time=48:00:00
 
 # This script allocates computing resources (CPUs, memory), loads R,
 # and runs run_lda.R. See run_lda_all.sh for examples illustrating how
@@ -14,7 +14,7 @@
 #
 # (a) .libPaths()[1] should be "/home/pcarbo/R_libs"
 #
-# (b) Change "mem" to 32G for pbmc_68k.RData.
+# (b) Change "mem" to 32G and "time" to 48 h for pbmc_68k.RData.
 #
 
 # Get the command-line arguments.
diff --git a/scripts/run_lda_all.sh b/scripts/run_lda_all.sh
index fd611a3..6689ca4 100644
--- a/scripts/run_lda_all.sh
+++ b/scripts/run_lda_all.sh
@@ -150,4 +150,48 @@ sbatch ${MAIN_SCRIPT} droplet droplet/rds/fit-droplet-scd-ex-k=11
 sbatch ${MAIN_SCRIPT} droplet droplet/rds/fit-droplet-scd-ex-k=12
 
 # Run LDA on the 68k pbmc data.
-# TO DO.
+#                     data     initfile
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=2
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=3
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=4
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=5
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=6
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=7
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=8
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=9
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=10
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=11
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-k=12
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=2
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=3
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=4
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=5
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=6
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=7
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=8
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=9
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=10
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=11
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-em-ex-k=12
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=2
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=3
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=4
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=5
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=6
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=7
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=8
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=9
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=10
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=11
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-k=12
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=2
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=3
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=4
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=5
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=6
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=7
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=8
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=9
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=10
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=11
+sbatch ${MAIN_SCRIPT} pbmc_68k pbmc68k/rds/fit-pbmc68k-scd-ex-k=12