From 7bf43d56b373d51655c4721a905f6920c15aafc6 Mon Sep 17 00:00:00 2001
From: kopardev
Date: Fri, 24 Nov 2023 13:14:03 -0500
Subject: [PATCH] chore: typo fixing

---
 aspen                                        | 773 ++++++++++++++++++
 .../scripts/_aggregate_motif_enrichments.py  |  26 +
 workflow/scripts/aggregate_results.Rmd       | 161 ++++
 workflow/scripts/aggregate_results_runner.R  |  83 ++
 4 files changed, 1043 insertions(+)
 create mode 100755 aspen
 create mode 100644 workflow/scripts/_aggregate_motif_enrichments.py
 create mode 100644 workflow/scripts/aggregate_results.Rmd
 create mode 100644 workflow/scripts/aggregate_results_runner.R

diff --git a/aspen b/aspen
new file mode 100755
index 0000000..b677238
--- /dev/null
+++ b/aspen
@@ -0,0 +1,773 @@
#!/usr/bin/env bash
# Author: Vishal Koparde, Ph.D.
# CCBR, NCI
# (c) 2021
#
# wrapper script to run the snakemake pipeline
# a) on an interactive node (runlocal) OR
# b) submit to the slurm job scheduler (run)
#
# DISCLAIMER: This wrapper only works on BIOWULF

# set -exo pipefail
# module purge
retries="2"

##########################################################################################
# functions
##########################################################################################

function get_git_commitid_tag() {
# This function gets the latest git commit id and tag
# Input is the PIPELINE_HOME folder, which is a git repo
  cd $1
  gid=$(git rev-parse HEAD)
  tag=$(git describe --tags $gid 2>/dev/null)
  echo -ne "$gid\t$tag"
}
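
# Illustrative example (hypothetical tag): for a clone of ASPEN whose HEAD is
# commit 7bf43d5 carrying the tag v1.0.0, calling
#   get_git_commitid_tag $PIPELINE_HOME
# would emit the tab-separated pair
#   7bf43d56b373d51655c4721a905f6920c15aafc6	v1.0.0
# with an empty tag field when no tag is reachable via "git describe --tags".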

function printbanner() {
versionnumber=$1
cat << EOF
____ ____ ___  ____ _  _
|__| [__  |__] |___ |\ |
|  | ___] |    |___ | \| v${versionnumber}
EOF
}

##########################################################################################
# initial setup
##########################################################################################

## setting PIPELINE_HOME
PIPELINE_HOME=$(readlink -f $(dirname "$0"))

# set snakefile
SNAKEFILE="${PIPELINE_HOME}/workflow/Snakefile"

VERSIONFILE="${PIPELINE_HOME}/VERSION"

# get github commit tag
GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME)

##########################################################################################
# Some more setup
##########################################################################################

PYTHONVERSION="python/3.10"
SNAKEMAKEVERSION="snakemake"
#SINGULARITYVERSION="singularity/3.7.4"
SINGULARITYVERSION="singularity"
ASPENVERSION=$(head -n1 $VERSIONFILE|awk '{print $1}')

# set defaults
GENOME="hg38"
SUPPORTED_GENOMES="hg19 hg38 mm10 mmul10 bosTau9 hs1"

# essential files
# these are relative to the workflow's base folder
# these are copied into the WORKDIR
ESSENTIAL_FILES="config/config.yaml config/samples.tsv resources/cluster.json resources/tools.yaml"
ESSENTIAL_FOLDERS="workflow/scripts"

SCRIPTNAME="$0"
SCRIPTDIRNAME=$(readlink -f $(dirname $0))
SCRIPTBASENAME=$(readlink -f $(basename $0))

# set extra singularity bindings, comma separated
# /data/CCBR_Pipeliner is required for fastq_screen_config.txt to work
EXTRA_SINGULARITY_BINDS="/lscratch,/data/CCBR_Pipeliner"

##########################################################################################
# USAGE
##########################################################################################

function usage() { cat << EOF

##########################################################################################

Welcome to
EOF
printbanner $ASPENVERSION
cat << EOF

A_TAC_S_eq A_nalysis P_ip_E_li_N_e

##########################################################################################

This pipeline was built by CCBR (https://bioinformatics.ccr.cancer.gov/ccbr)
Please contact Vishal Koparde for comments/questions (vishal.koparde@nih.gov)

##########################################################################################

Here is a list of genomes supported by aspen:

  * hg19    [Human]
  * hg38    [Human]
  * mm10    [Mouse]
  * mmul10  [Macaca mulatta (Rhesus monkey) or rheMac10]
  * bosTau9 [Bos taurus (cattle)]
  * hs1     [Human, T2T-CHM13]

aspen calls peaks using the following tools:

  * MACS2
  * Genrich [RECOMMENDED FOR USE]

USAGE:
  bash ${SCRIPTNAME} -w/--workdir=<WORKDIR> -m/--runmode=<RUNMODE>

Required Arguments:
1.  WORKDIR : [Type: String]: Absolute or relative path to the output folder with write permissions.

2.  RUNMODE : [Type: String] Valid options:
    * init      : initialize workdir
    * dryrun    : dry run snakemake to generate DAG
    * run       : run with slurm
    * runlocal  : run without submitting to sbatch
    ADVANCED RUNMODES (use with caution!!)
    * unlock    : unlock WORKDIR if locked by snakemake. NEVER UNLOCK A WORKDIR WHERE THE PIPELINE IS CURRENTLY RUNNING!
    * reconfig  : recreate config file in WORKDIR (debugging option). EDITS TO config.yaml WILL BE LOST!
    * reset     : DELETE workdir dir and re-init it (debugging option). EDITS TO ALL FILES IN WORKDIR WILL BE LOST!
    * printbinds: print singularity binds (paths)
    * local     : same as runlocal

Optional Arguments:

--genome|-g    : genome, e.g. hg38
--manifest|-s  : absolute path to samples.tsv. This will be copied to the output folder (--runmode=init only)
--singcache|-c : path to a singularity cache directory (exported as SINGULARITY_CACHEDIR)
--useenvmod|-e : pass the "--use-envmodules" option to Snakemake, i.e. use environment modules on HPC instead of containers (containers are the default)
--help|-h      : print this help

Example commands:
  bash ${SCRIPTNAME} -w=/my/output/folder -m=init
  bash ${SCRIPTNAME} -w=/my/output/folder -m=dryrun
  bash ${SCRIPTNAME} -w=/my/output/folder -m=run

##########################################################################################

VersionInfo:
  python          : $PYTHONVERSION
  snakemake       : $SNAKEMAKEVERSION
  pipeline_home   : $PIPELINE_HOME
  git commit/tag  : $GIT_COMMIT_TAG
  aspen_version   : v${ASPENVERSION}

##########################################################################################

EOF
}

##########################################################################################
# ERR
##########################################################################################

function err() { usage && cat <<< "
#
# ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR
#
  $@
#
# ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR
#
" 1>&2 && exit 1; }


##########################################################################################
# INIT
##########################################################################################

function init() {

# This function initializes the workdir by:
# 1. creating the working dir
# 2. copying essential files like config.yaml and samples.tsv into the workdir
# 3. setting up the logs and stats folders

printbanner $ASPENVERSION

# create output folder
if [ -d $WORKDIR ];then err "Folder $WORKDIR already exists!"; fi
mkdir -p $WORKDIR

# copy essential files
# for f in $ESSENTIAL_FILES;do
f="${PIPELINE_HOME}/config/config.yaml"
echo "Copying essential file: $f"
fbn=$(basename $f)
sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" \
    -e "s/WORKDIR/${WORKDIR//\//\\/}/g" \
    -e "s/GENOME/${GENOME}/g" $f > $WORKDIR/$fbn

for f in $MANIFEST ${PIPELINE_HOME}/resources/cluster.json ${PIPELINE_HOME}/resources/tools.yaml
do
  echo "Copying essential file: $f"
  fbn=$(basename $f)
  cp $f $WORKDIR/$fbn
done

# copy essential folders
for f in $ESSENTIAL_FOLDERS;do
  # rsync -az --progress ${PIPELINE_HOME}/$f $WORKDIR/
  cp -rv ${PIPELINE_HOME}/$f ${WORKDIR}/
done

cd ${WORKDIR}
curl https://raw.githubusercontent.com/CCBR/Tools/master/Biowulf/jobby -O
curl https://raw.githubusercontent.com/CCBR/Tools/master/Biowulf/run_jobby_on_snakemake_log -O

chmod a+x ${WORKDIR}/jobby
chmod a+x ${WORKDIR}/run_jobby_on_snakemake_log

# create log and stats folders
if [ ! -d $WORKDIR/logs ]; then mkdir -p $WORKDIR/logs;echo "Logs Dir: $WORKDIR/logs";fi
if [ ! -d $WORKDIR/stats ];then mkdir -p $WORKDIR/stats;echo "Stats Dir: $WORKDIR/stats";fi

cat << EOF
Done Initializing : $WORKDIR
You can now edit  : $WORKDIR/config.yaml and
                    $WORKDIR/samples.tsv
EOF

}

##########################################################################################
# set random str
##########################################################################################

function _set_rand_str() {
  x=$(mktemp)
  rm -rf $x
  RAND_STR=$(echo $x|awk -F"." '{print $NF}')
}
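
# Illustrative example: mktemp creates (and the rm immediately removes) a file
# such as /tmp/tmp.vG2QRDF36L; the awk call keeps everything after the last ".",
# so RAND_STR would be set to "vG2QRDF36L". This random suffix is later used to
# name the throwaway driver scripts written to ${HOME}.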

##########################################################################################
# CHECK ESSENTIAL FILES
##########################################################################################

function check_essential_files() {

# Checks if the files essential for starting the pipeline exist in the workdir

  if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!"; fi
  for f in config.yaml samples.tsv cluster.json tools.yaml; do
    if [ ! -f $WORKDIR/$f ]; then err "Error: '${f}' file not found in workdir ... initialize first!";fi
  done

}

##########################################################################################
# set --rerun-triggers to "mtime" ... only for newer snakemake
##########################################################################################

function set_snakemake_rerun_triggers() {
  runcheck
  snakemakeVer=$(snakemake --version 2>/dev/null)
  verlte() {
    [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ]
  }
  verlt() {
    [ "$1" = "$2" ] && return 1 || verlte $1 $2
  }
  snakemakeOld=$(verlt $snakemakeVer 7.8 && echo "yes" || echo "no") # check if snakemake is older than 7.8
  if [[ "$snakemakeVer" == "" || "$snakemakeOld" == "no" ]];then
    RERUNTRIGGERS="--rerun-triggers mtime"
  else
    RERUNTRIGGERS=""
  fi
}
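
# Quick sanity check of the sort -V comparison above (illustrative versions):
#   verlt "7.3.8"  "7.8" -> true  : snakemake is older than 7.8, so RERUNTRIGGERS stays empty
#   verlt "7.18.2" "7.8" -> false : snakemake is 7.8+, so "--rerun-triggers mtime" is added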

##########################################################################################
# RECONFIG ... recreate config.yaml and overwrite the old version
##########################################################################################

function reconfig(){
# Rebuild the config file and replace config.yaml in the WORKDIR
# This is only for dev purposes, when new key-value pairs are being added to the config file

  check_essential_files
  sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" \
      -e "s/WORKDIR/${WORKDIR//\//\\/}/g" \
      -e "s/GENOME/${GENOME}/g" \
      ${PIPELINE_HOME}/config/config.yaml > $WORKDIR/config.yaml
  echo "$WORKDIR/config.yaml has been updated!"

}

##########################################################################################
# SET SINGULARITY BINDS ... bind required singularity folders appropriately
##########################################################################################

function set_singularity_binds(){
# This function tries to infer which folders need to be bound
# biowulf specific
  echo "$PIPELINE_HOME" > ${WORKDIR}/tmp1
  echo "$WORKDIR" >> ${WORKDIR}/tmp1
  grep -o '\/.*' <(cat ${WORKDIR}/config.yaml ${WORKDIR}/samples.tsv) | tr '\t' '\n' | grep -v ' \|\/\/' | sort | uniq >> ${WORKDIR}/tmp1
  grep "^/" ${WORKDIR}/tmp1 | grep gpfs | awk -F'/' -v OFS='/' '{print $1,$2,$3,$4,$5}' | sort | uniq > ${WORKDIR}/tmp2
  grep "^/" ${WORKDIR}/tmp1 | grep -v gpfs | awk -F'/' -v OFS='/' '{print $1,$2,$3}' | sort | uniq > ${WORKDIR}/tmp3
  while read a;do readlink -f $a;done < ${WORKDIR}/tmp3 > ${WORKDIR}/tmp4
  binds=$(cat ${WORKDIR}/tmp2 ${WORKDIR}/tmp3 ${WORKDIR}/tmp4 | sort | uniq | tr '\n' ',')
  rm -f ${WORKDIR}/tmp?
  binds=$(echo $binds | awk '{print substr($1,1,length($1)-1)}')
  SINGULARITY_BINDS="-B $EXTRA_SINGULARITY_BINDS,$binds"
  SINGULARITY_STR="--use-singularity --singularity-args \"${SINGULARITY_BINDS}\""
}
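
# Illustrative example (hypothetical paths): if config.yaml references
# /data/CCBR_Pipeliner/db/indexes and samples.tsv points at fastqs under
# /gpfs/gsfs10/users/someuser/project/fastqs, the derived bind argument would
# resemble:
#   SINGULARITY_BINDS="-B /lscratch,/data/CCBR_Pipeliner,/data/someuser,/gpfs/gsfs10/users"
# i.e. gpfs paths are truncated to five components and everything else to three.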

function set_cluster_arg(){
  CLUSTER_SBATCH_CMD="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}"
  clustername=$(scontrol show config|grep ClusterName|awk '{print $NF}')
  if [[ "$clustername" == "biowulf" ]];then
    CLUSTER_SBATCH_CMD="${CLUSTER_SBATCH_CMD} --gres {cluster.gres}"
  fi
}
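
# Illustrative expansion (hypothetical cluster.json values): for a rule whose
# cluster.json entry sets threads=4, partition=norm, time=8:00:00 and mem=16g,
# snakemake would fill in the {cluster.*} placeholders and submit roughly:
#   sbatch --cpus-per-task 4 -p norm -t 8:00:00 --mem 16g \
#     --job-name <name> --output <log> --error <log>
# with "--gres {cluster.gres}" appended only when running on biowulf.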

##########################################################################################
# RUNCHECK ... check essential files and load required modules
##########################################################################################

function runcheck(){
# Check "job-essential" files and load required modules

  check_essential_files
  # MODULE_STR="module load $PYTHONVERSION $SNAKEMAKEVERSION singularity"
  MODULE_STR=$(
cat << END_HEREDOC
command -V python 2>/dev/null || module load $PYTHONVERSION || (>&2 echo "module $PYTHONVERSION could not be loaded"; exit 1)
command -V snakemake 2>/dev/null || module load $SNAKEMAKEVERSION || (>&2 echo "module $SNAKEMAKEVERSION could not be loaded"; exit 1)
command -V singularity 2>/dev/null || module load singularity || (>&2 echo "module singularity could not be loaded"; exit 1)
END_HEREDOC
  )
# If not on BIOWULF, change MODULE_STR so that python, snakemake and singularity are all expected in PATH

  command -V python 2>/dev/null || module load $PYTHONVERSION || (>&2 echo "module $PYTHONVERSION could not be loaded"; exit 1)
  command -V snakemake 2>/dev/null || module load $SNAKEMAKEVERSION || (>&2 echo "module $SNAKEMAKEVERSION could not be loaded"; exit 1)
  command -V singularity 2>/dev/null || module load singularity || (>&2 echo "module singularity could not be loaded"; exit 1)

}

##########################################################################################
# DRYRUN ... also run automatically before the actual run
##########################################################################################

function dryrun() {
# Dry-run
  runcheck
  if [ -f ${WORKDIR}/dryrun.log ]; then
    modtime=$(stat ${WORKDIR}/dryrun.log |grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g")
    mv ${WORKDIR}/dryrun.log ${WORKDIR}/logs/dryrun.${modtime}.log
    if [ -f ${WORKDIR}/dryrun_git_commit.txt ];then
      mv ${WORKDIR}/dryrun_git_commit.txt ${WORKDIR}/logs/dryrun_git_commit.${modtime}.txt
    fi
  fi
  run "--dry-run" | tee ${WORKDIR}/dryrun.log && \
  echo "Git Commit/Tag: $GIT_COMMIT_TAG" > ${WORKDIR}/dryrun_git_commit.txt
}
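
# The stat/awk/sed chain above turns the log's "Modify" timestamp into a sortable
# suffix, e.g. "Modify: 2023-11-24 13:14:03.000000000 -0500" becomes
# "20231124131403", so archived logs line up chronologically in ${WORKDIR}/logs.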

##########################################################################################
# UNLOCK
##########################################################################################

function unlock() {
# Unlock the workdir if a previous snakemake run ended abruptly

  runcheck
  run "--unlock"
}

##########################################################################################
# DAG
##########################################################################################

function dag() {
  runcheck
  snakemake -s $SNAKEFILE --configfile ${WORKDIR}/config.yaml --forceall --dag |dot -Teps > ${WORKDIR}/dag.eps
}

##########################################################################################
# RUNLOCAL ... run directly on a local interactive node ... no submission to SLURM
##########################################################################################

function runlocal() {
# If the pipeline is fired up on an interactive node (with sinteractive), this function runs the pipeline

  runcheck
  set_singularity_binds
  set_cluster_arg
  if [ "$SLURM_JOB_ID" == "" ];then err "runlocal can only be done on an interactive node"; exit 1; fi
  module load singularity
  run "--dry-run" && echo "Dry-run was successful .... starting local execution" && \
  run "local"
}

##########################################################################################
# RUNSLURM ... submit the head job to slurm, which will spawn the other jobs on SLURM
##########################################################################################

function runslurm() {
# Submit the execution of the pipeline to the biowulf job scheduler (slurm)

  runcheck
  set_singularity_binds
  set_cluster_arg
  run "--dry-run" && \
  echo "Dry-run was successful .... submitting jobs to job-scheduler" && \
  run "slurm"
}

##########################################################################################
# CREATE RUNINFO ... create runinfo.yaml in the workdir
##########################################################################################

function create_runinfo {
  modtime=$1
  if [ "$modtime" == "" ];then
    modtime=$(stat ${WORKDIR}/runinfo.yaml|grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g")
  fi
  if [ -f ${WORKDIR}/runinfo.yaml ];then
    mv ${WORKDIR}/runinfo.yaml ${WORKDIR}/stats/runinfo.${modtime}.yaml
  fi
  echo "Pipeline Dir: $PIPELINE_HOME" > ${WORKDIR}/runinfo.yaml
  echo "Git Commit/Tag: $GIT_COMMIT_TAG" >> ${WORKDIR}/runinfo.yaml
  userlogin=$(whoami)
  if [[ `which finger 2>/dev/null` ]];then
    username=$(finger $userlogin |grep ^Login | awk -F"Name: " '{print $2}');
  elif [[ `which lslogins 2>/dev/null` ]];then
    username=$(lslogins -u $userlogin | grep ^Geco | awk -F": " '{print $2}' | awk '{$1=$1;print}');
  else username="NOTFOUND";fi
  echo "Login: $userlogin" >> ${WORKDIR}/runinfo.yaml
  echo "Name: $username" >> ${WORKDIR}/runinfo.yaml
  g=$(groups)
  echo "Groups: $g" >> ${WORKDIR}/runinfo.yaml
  d=$(date)
  echo "Date/Time: $d" >> ${WORKDIR}/runinfo.yaml
}
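
# Illustrative runinfo.yaml produced by this function (hypothetical values):
#   Pipeline Dir: /data/CCBR_Pipeliner/Pipelines/ASPEN
#   Git Commit/Tag: 7bf43d56b373d51655c4721a905f6920c15aafc6	v1.0.0
#   Login: someuser
#   Name: Some User
#   Groups: someuser CCBR
#   Date/Time: Fri Nov 24 13:14:03 EST 2023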
-eq "0" ];then + snakemake -s $SNAKEFILE \ + --directory $WORKDIR \ + --report ${WORKDIR}/runslurm_snakemake_report.html \ + --configfile $CONFIGFILE +fi + +EOF + + cd $WORKDIR + if [[ "$EXPORT_SING_CACHE_DIR_CMD" != "" ]];then + $EXPORT_SING_CACHE_DIR_CMD && \ + sbatch submit_script.sbatch + else + sbatch submit_script.sbatch + fi + +########################################################################################## +# unlock or dry-run +########################################################################################## + + else # for unlock and dryrun + + _set_rand_str + set_cluster_arg + + cat > ${HOME}/${RAND_STR} << EOF +#/bin/bash +set -exo pipefail +$MODULE_STR + +snakemake $1 -s $SNAKEFILE \ +--directory $WORKDIR \ +--printshellcmds \ +--latency-wait 120 \ +--configfile $CONFIGFILE \ +--cluster-config $CLUSTERFILE \ +--cluster "$CLUSTER_SBATCH_CMD" \ +-j 500 \ +--rerun-incomplete \ +$RERUNTRIGGERS \ +--keep-going \ +--reason \ +--stats ${WORKDIR}/snakemake.stats + +EOF + + if [[ "$EXPORT_SING_CACHE_DIR_CMD" != "" ]];then + $EXPORT_SING_CACHE_DIR_CMD && \ + bash ${HOME}/${RAND_STR} + else + bash ${HOME}/${RAND_STR} + fi + + rm -rf ${HOME}/${RAND_STR} + + fi + +} + +########################################################################################## +# RESET ... delete workdir and then initialize +########################################################################################## + +function reset() { +# Delete the workdir and re-initialize it + printbanner $ASPENVERSION + echo "Working Dir: $WORKDIR" + if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!";fi + echo "Deleting $WORKDIR" + rm -rf $WORKDIR + echo "Re-Initializing $WORKDIR" + init +} + +########################################################################################## +# Print singularity binds and exist +########################################################################################## + +function printbinds(){ + printbanner $ASPENVERSION + set_singularity_binds + echo $SINGULARITY_BINDS +} + +########################################################################################## +# MAIN ... command line argument parsing +########################################################################################## + +function main(){ +# Main function which parses all arguments + + if [ $# -eq 0 ]; then usage; exit 1; fi + + for i in "$@" + do + case $i in + -m=*|--runmode=*) + RUNMODE="${i#*=}" + ;; + -w=*|--workdir=*) + WORKDIR="${i#*=}" + ;; + -e|--useenvmod) + USE_ENVMODULES=1 + ;; + -c=*|--singcache=*) + SING_CACHE_DIR="${i#*=}" + ;; + -s=*|--manifest=*) + MANIFEST="${i#*=}" + if [ ! -f $MANIFEST ];then err "File $MANIFEST does NOT exist!";fi + ;; + -g=*|--genome=*) + GENOME="${i#*=}" + found=0 + for g in $SUPPORTED_GENOMES;do + if [[ "$GENOME" == "$g" ]];then + found=1 + break + fi + done + if [[ "$found" == "0" ]];then + err "$GENOME is not supported by ASPEN; Supported genomes are: $SUPPORTED_GENOMES" + exit 1 + fi + ;; + --version) + printbanner ${ASPENVERSION} && exit 0; + ;; + -h|--help) + usage && exit 0;; + *) + err "Unknown argument: $i!" 
  WORKDIR=$(readlink -f "$WORKDIR")
  if [[ -z $MANIFEST ]];then
    MANIFEST=${PIPELINE_HOME}/config/samples.tsv
  fi
  echo "Working Dir      : $WORKDIR"
  echo "Samples Manifest : $MANIFEST"

  # required files
  CONFIGFILE="${WORKDIR}/config.yaml"
  CLUSTERFILE="${WORKDIR}/cluster.json"
  JOBBY="${WORKDIR}/jobby"
  JOBBY2="${WORKDIR}/run_jobby_on_snakemake_log"

  # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh"

  if [[ ! -z "$SING_CACHE_DIR" ]];then
    EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""
  else
    EXPORT_SING_CACHE_DIR_CMD=""
  fi

  case $RUNMODE in
    init) init && exit 0;;
    dag) dag && exit 0;;
    dryrun) dryrun && exit 0;;
    unlock) unlock && exit 0;;
    run) runslurm && exit 0;;
    runlocal) runlocal && exit 0;;
    reset) reset && exit 0;;
    dry) dryrun && exit 0;; # hidden option
    local) runlocal && exit 0;; # hidden option
    reconfig) reconfig && exit 0;; # hidden option for debugging
    printbinds) printbinds && exit 0;; # hidden option
    *) err "Unknown RUNMODE \"$RUNMODE\"";;
  esac
}

# call the main function

main "$@"
diff --git a/workflow/scripts/_aggregate_motif_enrichments.py b/workflow/scripts/_aggregate_motif_enrichments.py
new file mode 100644
index 0000000..dc80f8a
--- /dev/null
+++ b/workflow/scripts/_aggregate_motif_enrichments.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# Aggregate AME motif-enrichment results into a single Excel workbook,
# with one sheet per replicate.
import sys
import glob
import pathlib

import pandas as pd

inputfolder = sys.argv[1]
outexcelfile = sys.argv[2]
# recursively search for AME result files under inputfolder
files = glob.glob(f"{inputfolder}/**/ame_results.txt", recursive=True)
files.sort()
repname2file = dict()
with pd.ExcelWriter(outexcelfile) as writer:
    for f in files:
        x = pathlib.Path(f)
        # keep only files that sit directly inside a *.narrowPeak_motif_enrichment folder
        y = list(
            filter(
                lambda p: "narrowPeak_motif_enrichment" in p,
                map(str, x.parents),
            )
        )
        if len(y) != 1:
            continue
        # replicate name = that folder's basename, up to the first "."
        repname = pathlib.Path(y[0]).name.split(".")[0]
        repname2file[repname] = f
        df = pd.read_csv(f, sep="\t", header=0, skip_blank_lines=True)
        df.to_excel(writer, sheet_name=repname, index=False)
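
# Illustrative layout (hypothetical paths): a file found at
#   <inputfolder>/rep1.genrich.narrowPeak_motif_enrichment/ame_results.txt
# has exactly one parent whose path mentions "narrowPeak_motif_enrichment", so
# its table is written to the sheet named "rep1" of <outexcelfile>; hits whose
# paths match at more than one level are skipped by the len(y) != 1 guard.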
diff --git a/workflow/scripts/aggregate_results.Rmd b/workflow/scripts/aggregate_results.Rmd
new file mode 100644
index 0000000..2575cf2
--- /dev/null
+++ b/workflow/scripts/aggregate_results.Rmd
@@ -0,0 +1,161 @@
---
title: "aggregate_DESeq2_results.Rmd"
output:
# pdf_document:
#   toc: yes
  html_document:
    toc: yes
    toc_float: yes
    code_folding: hide
params:
  rawcountsmatrix: ROI.counts.tsv
  coldata: sampleinfo.tsv
  diffatacdir: /Users/kopardevn/Documents/Projects/ccbr1266/ATACseq/DiffAtac
  FC_cutoff: 2
  FDR_cutoff: 0.05
  indexcols: Geneid
  excludecols: Chr,Start,End,Strand,Length
  outtsv: all_diff_atacs.tsv
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages(library("DESeq2"))
suppressPackageStartupMessages(library("edgeR"))
suppressPackageStartupMessages(library("tidyverse"))
suppressPackageStartupMessages(library("dplyr"))
suppressPackageStartupMessages(library("DT"))
suppressPackageStartupMessages(library("reshape2"))
suppressPackageStartupMessages(library("pander"))
suppressPackageStartupMessages(library("plotly"))
suppressPackageStartupMessages(library("ggplot2"))
suppressPackageStartupMessages(library("ggfortify"))
suppressPackageStartupMessages(library("ggrepel"))
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("ComplexHeatmap"))

# read in sampleinfo
coldata=params$coldata
sampleinfo=read.csv(file=coldata,header=FALSE,sep="\t",comment.char = "#",strip.white = TRUE)
colnames(sampleinfo) = c("replicateName","sampleName")

# read in counts matrix
rawcountsmatrix=params$rawcountsmatrix
countsdf=read.csv(file=rawcountsmatrix,header=TRUE,sep="\t",comment.char = "#",check.names = FALSE,strip.white = TRUE)
original_countsdf = countsdf
nroi=nrow(countsdf)

excludecols=unlist(strsplit(params$excludecols,","))
indexcols=unlist(strsplit(params$indexcols,","))
countmatrix=countsdf %>% dplyr::select(-all_of(excludecols)) %>% column_to_rownames(var="Geneid")
countmatrix=countmatrix[,sampleinfo$replicateName]
sampleinfo$librarySize=colSums(countmatrix)
nsamples=nrow(sampleinfo)
```

## ROIs

Total of `r nroi` ROIs.

## Sample Stats

Total of `r nsamples` replicates.

```{r metadata,include=TRUE,echo=FALSE,cache=FALSE}
pander(sampleinfo,style="rmarkdown")
```

## PCA

```{r allsamplepca,include=TRUE,echo=FALSE,results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE}
dds1 <- DESeqDataSetFromMatrix(countData = as.matrix(countmatrix),
                               colData = sampleinfo[,c("replicateName","sampleName")],
                               design = ~ 1)
dds1 <- DESeq(dds1)
rld1 <- vst(dds1)
assayrld1 = as.data.frame(assay(rld1))
assayrld1$row_variance = rowVars(as.matrix(assayrld1))
assayrld1 = arrange(assayrld1,desc(row_variance))
zero_variance_rows=assayrld1$row_variance<1e-5
assayrld1$row_variance = NULL
assayrld1 = assayrld1[!zero_variance_rows,]
assayrld1_original=assayrld1
if (nrow(assayrld1) > 50000) {
  assayrld1 = assayrld1[1:50000,]
}
pca1=prcomp(t(assayrld1),scale. = T)
m.pc1 = round(pca1$sdev[1]^2/sum(pca1$sdev^2)*100,2)
m.pc2 = round(pca1$sdev[2]^2/sum(pca1$sdev^2)*100,2)
m.pc3 = round(pca1$sdev[3]^2/sum(pca1$sdev^2)*100,2)
xlab=paste0("PC1(",m.pc1,"%)")
ylab=paste0("PC2(",m.pc2,"%)")
samples <- as.factor(sampleinfo$sampleName)
p <- ggplot(pca1$x,aes(x=PC1,y=PC2,label=rownames(pca1$x)))+
  geom_point(aes(col=samples,shape=samples))+scale_shape_manual(values=seq(0,15))+
  geom_text_repel(max.overlaps = 15,size=2)+
  xlab(xlab)+ylab(ylab)+
  theme_light(base_size = 10)+
  ggtitle("PCA (all samples)")+
  theme_light()
print(p)
```

```{r degs,include=TRUE,echo=FALSE,results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE}
allfiles = list.files(path=params$diffatacdir,pattern="_vs_")
degfiles = allfiles[grepl(".tsv",allfiles)]
contrasts = gsub(".tsv","",degfiles)
# deg_file=paste(params$diffatacdir,"DGCR8_KO_vs_Wt.tsv",sep="/")
add_prefix <- function(string,prefix) {
  return (paste(prefix,string,sep="_"))
}
get_sig_geneids <- function(df,fc=2,fdr=0.05) {
  df %>% filter(log2FoldChange > fc) %>% filter(padj < fdr) -> df_up
  df %>% filter(log2FoldChange < (-1 * fc)) %>% filter(padj < fdr) -> df_down
  return (c(df_up$Geneid,df_down$Geneid))
}
read_deg_file <- function(deg_file) {
  df = read.csv(file=deg_file,header=TRUE,sep="\t",comment.char = "#",strip.white = TRUE)
  df %>% select(c("Geneid","log2FoldChange","pvalue","padj")) -> df
  return (df)
}
get_anno_from_deg_file <- function(deg_file) {
  df = read.csv(file=deg_file,header=TRUE,sep="\t",comment.char = "#",strip.white = TRUE)
  df %>% select(-c("baseMean","lfcSE","stat","width","strand","log2FoldChange","pvalue","padj")) -> df
  return (df)
}
rename_columns <- function(df,contrast) {
  colnames(df) <- c("Geneid",add_prefix(string="log2FoldChange",prefix=contrast),add_prefix(string="pvalue",prefix=contrast),add_prefix(string="padj",prefix=contrast))
  return(df)
}
all_sig_geneids = c()
dfs = list()
collist = c()
for (i in 1:length(degfiles)){
  # i=1
  deg_file=paste(params$diffatacdir,degfiles[i],sep="/")
  contrast=contrasts[i]
  x <- read_deg_file(deg_file)
  all_sig_geneids <- c(all_sig_geneids,get_sig_geneids(x,params$FC_cutoff,params$FDR_cutoff))
  dfs[[contrast]] <- rename_columns(x,contrast)
  for (j in c("log2FoldChange","pvalue","padj")){
    k=add_prefix(j,contrast)
    collist = c(collist,k)
  }
}
all_sig_geneids <- unique(all_sig_geneids)
dfs[["anno"]] <- get_anno_from_deg_file(deg_file=paste(params$diffatacdir,degfiles[1],sep="/"))
dfs %>% purrr::reduce(inner_join, by="Geneid") -> merged_dfs
write.table(merged_dfs,file=params$outtsv,quote = FALSE,sep="\t",row.names = FALSE,col.names = TRUE)
DT::datatable(merged_dfs,rownames = FALSE) %>% DT::formatRound(columns=collist,digits=3)
```

## Heatmap

Considering all significantly UP or DOWN ROIs aggregated across all contrasts (total of `r length(all_sig_geneids)` ROIs).

```{r heatmap,include=TRUE,echo=FALSE,results='hide',fig.width=4,fig.height=10,warning=FALSE,message=FALSE,error=FALSE}
heatmap_matrix=assay(rld1)
heatmap_matrix=heatmap_matrix[all_sig_geneids,]
Heatmap(matrix = t(scale(t(heatmap_matrix),scale = TRUE)),show_row_names = FALSE,name=" ")
```
diff --git a/workflow/scripts/aggregate_results_runner.R b/workflow/scripts/aggregate_results_runner.R
new file mode 100644
index 0000000..bde78ed
--- /dev/null
+++ b/workflow/scripts/aggregate_results_runner.R
@@ -0,0 +1,83 @@
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("argparse"))


# create parser object
parser <- ArgumentParser(description="Wrapper for running aggregate_results.Rmd")


parser$add_argument("-m", "--countsmatrix",
                    type="character",
                    help="path to the counts matrix",
                    required=TRUE)
parser$add_argument("-a", "--diffatacdir",
                    type="character",
                    help="DiffAtac directory",
                    required=TRUE)
parser$add_argument("-c", "--coldata",
                    type="character",
                    help="coldata or sampleinfo TSV file",
                    required=TRUE)
parser$add_argument("-f", "--foldchange",
                    type="double",
                    help="log2FC threshold",
                    required=FALSE,
                    default=2.0)
parser$add_argument("-p", "--fdr",
                    type="double",
                    help="adj. p-value threshold",
                    required=FALSE,
                    default=0.05)
parser$add_argument("-i", "--indexcols",
                    type="character",
                    help="comma-separated list of index columns",
                    required=TRUE)
parser$add_argument("-e", "--excludecols",
                    type="character",
                    help="comma-separated list of columns to exclude",
                    required=TRUE)
parser$add_argument("-t", "--tmpdir",
                    type="character",
                    help="temp dir",
                    required=FALSE,
                    default="/tmp")
parser$add_argument("-s", "--scriptsdir",
                    type="character",
                    help="scripts dir",
                    required=TRUE)

args <- parser$parse_args()
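
# Illustrative invocation (hypothetical paths):
#   Rscript aggregate_results_runner.R \
#     -m /path/to/ROI.counts.tsv \
#     -a /path/to/DiffAtac \
#     -c /path/to/sampleinfo.tsv \
#     -i Geneid \
#     -e Chr,Start,End,Strand,Length \
#     -s /path/to/workflow/scripts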
p-value threshold", + required=FALSE, + default=0.05) +parser$add_argument("-i", "--indexcols", + type="character", + help="comma-separated list of index columns", + required=TRUE) +parser$add_argument("-e", "--excludecols", + type="character", + help="comma-separated list of columns to exclude", + required=TRUE) +parser$add_argument("-t", "--tmpdir", + type="character", + help="temp dir", + required=FALSE, + default="/tmp") +parser$add_argument("-s", "--scriptsdir", + type="character", + help="scripts dir", + required=TRUE) + +args <- parser$parse_args() + +# check inputs +for ( f in c(args$countmatrix,args$coldata)) { + if (!file.exists(f)){ + outstr=paste("ERROR: File: ",f,"Does not exists!") + message(outstr) + quit(status = 1) + } + if (file.access(f,mode=4)!=0){ + outstr=paste("ERROR: File: ",f,"is not readable!") + message(outstr) + quit(status = 1) + } +} +setwd(args$diffatacdir) + + +outprefix = "all_diff_atacs" +outhtml = paste0(args$diffatacdir,"/",outprefix,".html") +outtsv = paste0(args$diffatacdir,"/",outprefix,".tsv") + +myparams <- list(rawcountsmatrix = args$countsmatrix, + coldata = args$coldata, + FC_cutoff = args$foldchange, + FDR_cutoff = args$fdr, + diffatacdir = args$diffatacdir, + indexcols = args$indexcols, + excludecols = args$excludecols, + outtsv = outtsv) + +rmarkdown::render(paste(args$scriptsdir,'aggregate_results.Rmd',sep="/"), + params = myparams, + output_file = outhtml, + intermediates_dir = args$tmpdir)