From 7bf43d56b373d51655c4721a905f6920c15aafc6 Mon Sep 17 00:00:00 2001
From: kopardev
Date: Fri, 24 Nov 2023 13:14:03 -0500
Subject: [PATCH] chore: typo fixing

---
 aspen                                        | 773 ++++++++++++++++++
 .../scripts/_aggregate_motif_enrichments.py  |  26 +
 workflow/scripts/aggregate_results.Rmd       | 161 ++++
 workflow/scripts/aggregate_results_runner.R  |  83 ++
 4 files changed, 1043 insertions(+)
 create mode 100755 aspen
 create mode 100644 workflow/scripts/_aggregate_motif_enrichments.py
 create mode 100644 workflow/scripts/aggregate_results.Rmd
 create mode 100644 workflow/scripts/aggregate_results_runner.R

diff --git a/aspen b/aspen
new file mode 100755
index 0000000..b677238
--- /dev/null
+++ b/aspen
@@ -0,0 +1,773 @@
#!/usr/bin/env bash
# Author: Vishal Koparde, Ph.D.
# CCBR, NCI
# (c) 2021
#
# wrapper script to run the snakemake pipeline
# a) on an interactive node (runlocal) OR
# b) submit to the slurm job scheduler (run)
#
# DISCLAIMER: This wrapper only works on BIOWULF

# set -exo pipefail
# module purge
retries="2"

##########################################################################################
# functions
##########################################################################################

function get_git_commitid_tag() {
# This function gets the latest git commit id and tag
# Input is the PIPELINE_HOME folder, which is a git repo
  cd $1
  gid=$(git rev-parse HEAD)
  tag=$(git describe --tags $gid 2>/dev/null)
  echo -ne "$gid\t$tag"
}
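
# Illustrative example (hypothetical tag): for a clone of ASPEN whose HEAD is
# commit 7bf43d5 carrying the tag v1.0.0, calling
#   get_git_commitid_tag $PIPELINE_HOME
# would emit the tab-separated pair
#   7bf43d56b373d51655c4721a905f6920c15aafc6	v1.0.0
# with an empty tag field when no tag is reachable via "git describe --tags".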

function printbanner() {
versionnumber=$1
cat << EOF
____ ____ ___  ____ _  _
|__| [__  |__] |___ |\ |
|  | ___] |    |___ | \| v${versionnumber}
EOF
}

##########################################################################################
# initial setup
##########################################################################################

## setting PIPELINE_HOME
PIPELINE_HOME=$(readlink -f $(dirname "$0"))

# set snakefile
SNAKEFILE="${PIPELINE_HOME}/workflow/Snakefile"

VERSIONFILE="${PIPELINE_HOME}/VERSION"

# get github commit tag
GIT_COMMIT_TAG=$(get_git_commitid_tag $PIPELINE_HOME)

##########################################################################################
# Some more setup
##########################################################################################

PYTHONVERSION="python/3.10"
SNAKEMAKEVERSION="snakemake"
#SINGULARITYVERSION="singularity/3.7.4"
SINGULARITYVERSION="singularity"
ASPENVERSION=$(head -n1 $VERSIONFILE|awk '{print $1}')

# set defaults
GENOME="hg38"
SUPPORTED_GENOMES="hg19 hg38 mm10 mmul10 bosTau9 hs1"

# essential files
# these are relative to the workflow's base folder
# these are copied into the WORKDIR
ESSENTIAL_FILES="config/config.yaml config/samples.tsv resources/cluster.json resources/tools.yaml"
ESSENTIAL_FOLDERS="workflow/scripts"

SCRIPTNAME="$0"
SCRIPTDIRNAME=$(readlink -f $(dirname $0))
SCRIPTBASENAME=$(readlink -f $(basename $0))

# set extra singularity bindings, comma separated
# /data/CCBR_Pipeliner is required for fastq_screen_config.txt to work
EXTRA_SINGULARITY_BINDS="/lscratch,/data/CCBR_Pipeliner"

##########################################################################################
# USAGE
##########################################################################################

function usage() { cat << EOF

##########################################################################################

Welcome to
EOF
printbanner $ASPENVERSION
cat << EOF

A_TAC_S_eq A_nalysis P_ip_E_li_N_e

##########################################################################################

This pipeline was built by CCBR (https://bioinformatics.ccr.cancer.gov/ccbr)
Please contact Vishal Koparde for comments/questions (vishal.koparde@nih.gov)

##########################################################################################

Here is a list of genomes supported by aspen:

  * hg19    [Human]
  * hg38    [Human]
  * mm10    [Mouse]
  * mmul10  [Macaca mulatta (Rhesus monkey) or rheMac10]
  * bosTau9 [Bos taurus (cattle)]
  * hs1     [Human, T2T-CHM13]

aspen calls peaks using the following tools:

  * MACS2
  * Genrich [RECOMMENDED FOR USE]

USAGE:
  bash ${SCRIPTNAME} -w/--workdir=<WORKDIR> -m/--runmode=<RUNMODE>

Required Arguments:
1.  WORKDIR : [Type: String]: Absolute or relative path to the output folder with write permissions.

2.  RUNMODE : [Type: String] Valid options:
    * init      : initialize workdir
    * dryrun    : dry run snakemake to generate DAG
    * run       : run with slurm
    * runlocal  : run without submitting to sbatch
    ADVANCED RUNMODES (use with caution!!)
    * unlock    : unlock WORKDIR if locked by snakemake. NEVER UNLOCK A WORKDIR WHERE THE PIPELINE IS CURRENTLY RUNNING!
    * reconfig  : recreate config file in WORKDIR (debugging option). EDITS TO config.yaml WILL BE LOST!
    * reset     : DELETE workdir dir and re-init it (debugging option). EDITS TO ALL FILES IN WORKDIR WILL BE LOST!
    * printbinds: print singularity binds (paths)
    * local     : same as runlocal

Optional Arguments:

--genome|-g    : genome, e.g. hg38
--manifest|-s  : absolute path to samples.tsv. This will be copied to the output folder (--runmode=init only)
--singcache|-c : path to a singularity cache directory (exported as SINGULARITY_CACHEDIR)
--useenvmod|-e : pass the "--use-envmodules" option to Snakemake, i.e. use environment modules on HPC instead of containers (containers are the default)
--help|-h      : print this help

Example commands:
  bash ${SCRIPTNAME} -w=/my/output/folder -m=init
  bash ${SCRIPTNAME} -w=/my/output/folder -m=dryrun
  bash ${SCRIPTNAME} -w=/my/output/folder -m=run

##########################################################################################

VersionInfo:
  python          : $PYTHONVERSION
  snakemake       : $SNAKEMAKEVERSION
  pipeline_home   : $PIPELINE_HOME
  git commit/tag  : $GIT_COMMIT_TAG
  aspen_version   : v${ASPENVERSION}

##########################################################################################

EOF
}

##########################################################################################
# ERR
##########################################################################################

function err() { usage && cat <<< "
#
# ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR
#
  $@
#
# ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR
#
" 1>&2 && exit 1; }


##########################################################################################
# INIT
##########################################################################################

function init() {

# This function initializes the workdir by:
# 1. creating the working dir
# 2. copying essential files like config.yaml and samples.tsv into the workdir
# 3. setting up the logs and stats folders

printbanner $ASPENVERSION

# create output folder
if [ -d $WORKDIR ];then err "Folder $WORKDIR already exists!"; fi
mkdir -p $WORKDIR

# copy essential files
# for f in $ESSENTIAL_FILES;do
f="${PIPELINE_HOME}/config/config.yaml"
echo "Copying essential file: $f"
fbn=$(basename $f)
sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" \
    -e "s/WORKDIR/${WORKDIR//\//\\/}/g" \
    -e "s/GENOME/${GENOME}/g" $f > $WORKDIR/$fbn

for f in $MANIFEST ${PIPELINE_HOME}/resources/cluster.json ${PIPELINE_HOME}/resources/tools.yaml
do
  echo "Copying essential file: $f"
  fbn=$(basename $f)
  cp $f $WORKDIR/$fbn
done

# copy essential folders
for f in $ESSENTIAL_FOLDERS;do
  # rsync -az --progress ${PIPELINE_HOME}/$f $WORKDIR/
  cp -rv ${PIPELINE_HOME}/$f ${WORKDIR}/
done

cd ${WORKDIR}
curl https://raw.githubusercontent.com/CCBR/Tools/master/Biowulf/jobby -O
curl https://raw.githubusercontent.com/CCBR/Tools/master/Biowulf/run_jobby_on_snakemake_log -O

chmod a+x ${WORKDIR}/jobby
chmod a+x ${WORKDIR}/run_jobby_on_snakemake_log

# create log and stats folders
if [ ! -d $WORKDIR/logs ]; then mkdir -p $WORKDIR/logs;echo "Logs Dir: $WORKDIR/logs";fi
if [ ! -d $WORKDIR/stats ];then mkdir -p $WORKDIR/stats;echo "Stats Dir: $WORKDIR/stats";fi

cat << EOF
Done Initializing : $WORKDIR
You can now edit  : $WORKDIR/config.yaml and
                    $WORKDIR/samples.tsv
EOF

}

##########################################################################################
# set random str
##########################################################################################

function _set_rand_str() {
  x=$(mktemp)
  rm -rf $x
  RAND_STR=$(echo $x|awk -F"." '{print $NF}')
}
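
# Illustrative example: mktemp creates (and the rm immediately removes) a file
# such as /tmp/tmp.vG2QRDF36L; the awk call keeps everything after the last ".",
# so RAND_STR would be set to "vG2QRDF36L". This random suffix is later used to
# name the throwaway driver scripts written to ${HOME}.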

##########################################################################################
# CHECK ESSENTIAL FILES
##########################################################################################

function check_essential_files() {

# Checks if the files essential for starting the pipeline exist in the workdir

  if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!"; fi
  for f in config.yaml samples.tsv cluster.json tools.yaml; do
    if [ ! -f $WORKDIR/$f ]; then err "Error: '${f}' file not found in workdir ... initialize first!";fi
  done

}

##########################################################################################
# set --rerun-triggers to "mtime" ... only for newer snakemake
##########################################################################################

function set_snakemake_rerun_triggers() {
  runcheck
  snakemakeVer=$(snakemake --version 2>/dev/null)
  verlte() {
    [ "$1" = "`echo -e "$1\n$2" | sort -V | head -n1`" ]
  }
  verlt() {
    [ "$1" = "$2" ] && return 1 || verlte $1 $2
  }
  snakemakeOld=$(verlt $snakemakeVer 7.8 && echo "yes" || echo "no") # check if snakemake is older than 7.8
  if [[ "$snakemakeVer" == "" || "$snakemakeOld" == "no" ]];then
    RERUNTRIGGERS="--rerun-triggers mtime"
  else
    RERUNTRIGGERS=""
  fi
}
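
# Quick sanity check of the sort -V comparison above (illustrative versions):
#   verlt "7.3.8"  "7.8" -> true  : snakemake is older than 7.8, so RERUNTRIGGERS stays empty
#   verlt "7.18.2" "7.8" -> false : snakemake is 7.8+, so "--rerun-triggers mtime" is added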

##########################################################################################
# RECONFIG ... recreate config.yaml and overwrite the old version
##########################################################################################

function reconfig(){
# Rebuild the config file and replace config.yaml in the WORKDIR
# This is only for dev purposes, when new key-value pairs are being added to the config file

  check_essential_files
  sed -e "s/PIPELINE_HOME/${PIPELINE_HOME//\//\\/}/g" \
      -e "s/WORKDIR/${WORKDIR//\//\\/}/g" \
      -e "s/GENOME/${GENOME}/g" \
      ${PIPELINE_HOME}/config/config.yaml > $WORKDIR/config.yaml
  echo "$WORKDIR/config.yaml has been updated!"

}

##########################################################################################
# SET SINGULARITY BINDS ... bind required singularity folders appropriately
##########################################################################################

function set_singularity_binds(){
# This function tries to infer which folders need to be bound
# biowulf specific
  echo "$PIPELINE_HOME" > ${WORKDIR}/tmp1
  echo "$WORKDIR" >> ${WORKDIR}/tmp1
  grep -o '\/.*' <(cat ${WORKDIR}/config.yaml ${WORKDIR}/samples.tsv) | tr '\t' '\n' | grep -v ' \|\/\/' | sort | uniq >> ${WORKDIR}/tmp1
  grep "^/" ${WORKDIR}/tmp1 | grep gpfs | awk -F'/' -v OFS='/' '{print $1,$2,$3,$4,$5}' | sort | uniq > ${WORKDIR}/tmp2
  grep "^/" ${WORKDIR}/tmp1 | grep -v gpfs | awk -F'/' -v OFS='/' '{print $1,$2,$3}' | sort | uniq > ${WORKDIR}/tmp3
  while read a;do readlink -f $a;done < ${WORKDIR}/tmp3 > ${WORKDIR}/tmp4
  binds=$(cat ${WORKDIR}/tmp2 ${WORKDIR}/tmp3 ${WORKDIR}/tmp4 | sort | uniq | tr '\n' ',')
  rm -f ${WORKDIR}/tmp?
  binds=$(echo $binds | awk '{print substr($1,1,length($1)-1)}')
  SINGULARITY_BINDS="-B $EXTRA_SINGULARITY_BINDS,$binds"
  SINGULARITY_STR="--use-singularity --singularity-args \"${SINGULARITY_BINDS}\""
}
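
# Illustrative example (hypothetical paths): if config.yaml references
# /data/CCBR_Pipeliner/db/indexes and samples.tsv points at fastqs under
# /gpfs/gsfs10/users/someuser/project/fastqs, the derived bind argument would
# resemble:
#   SINGULARITY_BINDS="-B /lscratch,/data/CCBR_Pipeliner,/data/someuser,/gpfs/gsfs10/users"
# i.e. gpfs paths are truncated to five components and everything else to three.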

function set_cluster_arg(){
  CLUSTER_SBATCH_CMD="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}"
  clustername=$(scontrol show config|grep ClusterName|awk '{print $NF}')
  if [[ "$clustername" == "biowulf" ]];then
    CLUSTER_SBATCH_CMD="${CLUSTER_SBATCH_CMD} --gres {cluster.gres}"
  fi
}
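
# Illustrative expansion (hypothetical cluster.json values): for a rule whose
# cluster.json entry sets threads=4, partition=norm, time=8:00:00 and mem=16g,
# snakemake would fill in the {cluster.*} placeholders and submit roughly:
#   sbatch --cpus-per-task 4 -p norm -t 8:00:00 --mem 16g \
#     --job-name <name> --output <log> --error <log>
# with "--gres {cluster.gres}" appended only when running on biowulf.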

##########################################################################################
# RUNCHECK ... check essential files and load required modules
##########################################################################################

function runcheck(){
# Check "job-essential" files and load required modules

  check_essential_files
  # MODULE_STR="module load $PYTHONVERSION $SNAKEMAKEVERSION singularity"
  MODULE_STR=$(
cat << END_HEREDOC
command -V python 2>/dev/null || module load $PYTHONVERSION || (>&2 echo "module $PYTHONVERSION could not be loaded"; exit 1)
command -V snakemake 2>/dev/null || module load $SNAKEMAKEVERSION || (>&2 echo "module $SNAKEMAKEVERSION could not be loaded"; exit 1)
command -V singularity 2>/dev/null || module load singularity || (>&2 echo "module singularity could not be loaded"; exit 1)
END_HEREDOC
  )
# If not on BIOWULF, change MODULE_STR so that python, snakemake and singularity are all expected in PATH

  command -V python 2>/dev/null || module load $PYTHONVERSION || (>&2 echo "module $PYTHONVERSION could not be loaded"; exit 1)
  command -V snakemake 2>/dev/null || module load $SNAKEMAKEVERSION || (>&2 echo "module $SNAKEMAKEVERSION could not be loaded"; exit 1)
  command -V singularity 2>/dev/null || module load singularity || (>&2 echo "module singularity could not be loaded"; exit 1)

}

##########################################################################################
# DRYRUN ... also run automatically before the actual run
##########################################################################################

function dryrun() {
# Dry-run
  runcheck
  if [ -f ${WORKDIR}/dryrun.log ]; then
    modtime=$(stat ${WORKDIR}/dryrun.log |grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g")
    mv ${WORKDIR}/dryrun.log ${WORKDIR}/logs/dryrun.${modtime}.log
    if [ -f ${WORKDIR}/dryrun_git_commit.txt ];then
      mv ${WORKDIR}/dryrun_git_commit.txt ${WORKDIR}/logs/dryrun_git_commit.${modtime}.txt
    fi
  fi
  run "--dry-run" | tee ${WORKDIR}/dryrun.log && \
  echo "Git Commit/Tag: $GIT_COMMIT_TAG" > ${WORKDIR}/dryrun_git_commit.txt
}
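
# The stat/awk/sed chain above turns the log's "Modify" timestamp into a sortable
# suffix, e.g. "Modify: 2023-11-24 13:14:03.000000000 -0500" becomes
# "20231124131403", so archived logs line up chronologically in ${WORKDIR}/logs.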

##########################################################################################
# UNLOCK
##########################################################################################

function unlock() {
# Unlock the workdir if a previous snakemake run ended abruptly

  runcheck
  run "--unlock"
}

##########################################################################################
# DAG
##########################################################################################

function dag() {
  runcheck
  snakemake -s $SNAKEFILE --configfile ${WORKDIR}/config.yaml --forceall --dag |dot -Teps > ${WORKDIR}/dag.eps
}

##########################################################################################
# RUNLOCAL ... run directly on a local interactive node ... no submission to SLURM
##########################################################################################

function runlocal() {
# If the pipeline is fired up on an interactive node (with sinteractive), this function runs the pipeline

  runcheck
  set_singularity_binds
  set_cluster_arg
  if [ "$SLURM_JOB_ID" == "" ];then err "runlocal can only be done on an interactive node"; exit 1; fi
  module load singularity
  run "--dry-run" && echo "Dry-run was successful .... starting local execution" && \
  run "local"
}

##########################################################################################
# RUNSLURM ... submit the head job to slurm, which will spawn the other jobs on SLURM
##########################################################################################

function runslurm() {
# Submit the execution of the pipeline to the biowulf job scheduler (slurm)

  runcheck
  set_singularity_binds
  set_cluster_arg
  run "--dry-run" && \
  echo "Dry-run was successful .... submitting jobs to job-scheduler" && \
  run "slurm"
}

##########################################################################################
# CREATE RUNINFO ... create runinfo.yaml in the workdir
##########################################################################################

function create_runinfo {
  modtime=$1
  if [ "$modtime" == "" ];then
    modtime=$(stat ${WORKDIR}/runinfo.yaml|grep Modify|awk '{print $2,$3}'|awk -F"." '{print $1}'|sed "s/ //g"|sed "s/-//g"|sed "s/://g")
  fi
  if [ -f ${WORKDIR}/runinfo.yaml ];then
    mv ${WORKDIR}/runinfo.yaml ${WORKDIR}/stats/runinfo.${modtime}.yaml
  fi
  echo "Pipeline Dir: $PIPELINE_HOME" > ${WORKDIR}/runinfo.yaml
  echo "Git Commit/Tag: $GIT_COMMIT_TAG" >> ${WORKDIR}/runinfo.yaml
  userlogin=$(whoami)
  if [[ `which finger 2>/dev/null` ]];then
    username=$(finger $userlogin |grep ^Login | awk -F"Name: " '{print $2}');
  elif [[ `which lslogins 2>/dev/null` ]];then
    username=$(lslogins -u $userlogin | grep ^Geco | awk -F": " '{print $2}' | awk '{$1=$1;print}');
  else username="NOTFOUND";fi
  echo "Login: $userlogin" >> ${WORKDIR}/runinfo.yaml
  echo "Name: $username" >> ${WORKDIR}/runinfo.yaml
  g=$(groups)
  echo "Groups: $g" >> ${WORKDIR}/runinfo.yaml
  d=$(date)
  echo "Date/Time: $d" >> ${WORKDIR}/runinfo.yaml
}
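
# Illustrative runinfo.yaml produced by this function (hypothetical values):
#   Pipeline Dir: /data/CCBR_Pipeliner/Pipelines/ASPEN
#   Git Commit/Tag: 7bf43d56b373d51655c4721a905f6920c15aafc6	v1.0.0
#   Login: someuser
#   Name: Some User
#   Groups: someuser CCBR
#   Date/Time: Fri Nov 24 13:14:03 EST 2023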
-eq "0" ];then + snakemake -s $SNAKEFILE \ + --directory $WORKDIR \ + --report ${WORKDIR}/runslurm_snakemake_report.html \ + --configfile $CONFIGFILE +fi + +EOF + + cd $WORKDIR + if [[ "$EXPORT_SING_CACHE_DIR_CMD" != "" ]];then + $EXPORT_SING_CACHE_DIR_CMD && \ + sbatch submit_script.sbatch + else + sbatch submit_script.sbatch + fi + +########################################################################################## +# unlock or dry-run +########################################################################################## + + else # for unlock and dryrun + + _set_rand_str + set_cluster_arg + + cat > ${HOME}/${RAND_STR} << EOF +#/bin/bash +set -exo pipefail +$MODULE_STR + +snakemake $1 -s $SNAKEFILE \ +--directory $WORKDIR \ +--printshellcmds \ +--latency-wait 120 \ +--configfile $CONFIGFILE \ +--cluster-config $CLUSTERFILE \ +--cluster "$CLUSTER_SBATCH_CMD" \ +-j 500 \ +--rerun-incomplete \ +$RERUNTRIGGERS \ +--keep-going \ +--reason \ +--stats ${WORKDIR}/snakemake.stats + +EOF + + if [[ "$EXPORT_SING_CACHE_DIR_CMD" != "" ]];then + $EXPORT_SING_CACHE_DIR_CMD && \ + bash ${HOME}/${RAND_STR} + else + bash ${HOME}/${RAND_STR} + fi + + rm -rf ${HOME}/${RAND_STR} + + fi + +} + +########################################################################################## +# RESET ... delete workdir and then initialize +########################################################################################## + +function reset() { +# Delete the workdir and re-initialize it + printbanner $ASPENVERSION + echo "Working Dir: $WORKDIR" + if [ ! -d $WORKDIR ];then err "Folder $WORKDIR does not exist!";fi + echo "Deleting $WORKDIR" + rm -rf $WORKDIR + echo "Re-Initializing $WORKDIR" + init +} + +########################################################################################## +# Print singularity binds and exist +########################################################################################## + +function printbinds(){ + printbanner $ASPENVERSION + set_singularity_binds + echo $SINGULARITY_BINDS +} + +########################################################################################## +# MAIN ... command line argument parsing +########################################################################################## + +function main(){ +# Main function which parses all arguments + + if [ $# -eq 0 ]; then usage; exit 1; fi + + for i in "$@" + do + case $i in + -m=*|--runmode=*) + RUNMODE="${i#*=}" + ;; + -w=*|--workdir=*) + WORKDIR="${i#*=}" + ;; + -e|--useenvmod) + USE_ENVMODULES=1 + ;; + -c=*|--singcache=*) + SING_CACHE_DIR="${i#*=}" + ;; + -s=*|--manifest=*) + MANIFEST="${i#*=}" + if [ ! -f $MANIFEST ];then err "File $MANIFEST does NOT exist!";fi + ;; + -g=*|--genome=*) + GENOME="${i#*=}" + found=0 + for g in $SUPPORTED_GENOMES;do + if [[ "$GENOME" == "$g" ]];then + found=1 + break + fi + done + if [[ "$found" == "0" ]];then + err "$GENOME is not supported by ASPEN; Supported genomes are: $SUPPORTED_GENOMES" + exit 1 + fi + ;; + --version) + printbanner ${ASPENVERSION} && exit 0; + ;; + -h|--help) + usage && exit 0;; + *) + err "Unknown argument: $i!" 
  WORKDIR=$(readlink -f "$WORKDIR")
  if [[ -z $MANIFEST ]];then
    MANIFEST=${PIPELINE_HOME}/config/samples.tsv
  fi
  echo "Working Dir      : $WORKDIR"
  echo "Samples Manifest : $MANIFEST"

  # required files
  CONFIGFILE="${WORKDIR}/config.yaml"
  CLUSTERFILE="${WORKDIR}/cluster.json"
  JOBBY="${WORKDIR}/jobby"
  JOBBY2="${WORKDIR}/run_jobby_on_snakemake_log"

  # CLUSTERSTATUSCMD="${PIPELINE_HOME}/resources/cluster_status.sh"

  if [[ ! -z "$SING_CACHE_DIR" ]];then
    EXPORT_SING_CACHE_DIR_CMD="export SINGULARITY_CACHEDIR=\"${SING_CACHE_DIR}\""
  else
    EXPORT_SING_CACHE_DIR_CMD=""
  fi

  case $RUNMODE in
    init) init && exit 0;;
    dag) dag && exit 0;;
    dryrun) dryrun && exit 0;;
    unlock) unlock && exit 0;;
    run) runslurm && exit 0;;
    runlocal) runlocal && exit 0;;
    reset) reset && exit 0;;
    dry) dryrun && exit 0;; # hidden option
    local) runlocal && exit 0;; # hidden option
    reconfig) reconfig && exit 0;; # hidden option for debugging
    printbinds) printbinds && exit 0;; # hidden option
    *) err "Unknown RUNMODE \"$RUNMODE\"";;
  esac
}

# call the main function

main "$@"
diff --git a/workflow/scripts/_aggregate_motif_enrichments.py b/workflow/scripts/_aggregate_motif_enrichments.py
new file mode 100644
index 0000000..dc80f8a
--- /dev/null
+++ b/workflow/scripts/_aggregate_motif_enrichments.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# Aggregate AME motif-enrichment results into a single Excel workbook,
# with one sheet per replicate.
import sys
import glob
import pathlib

import pandas as pd

inputfolder = sys.argv[1]
outexcelfile = sys.argv[2]
# recursively search for AME result files under inputfolder
files = glob.glob(f"{inputfolder}/**/ame_results.txt", recursive=True)
files.sort()
repname2file = dict()
with pd.ExcelWriter(outexcelfile) as writer:
    for f in files:
        x = pathlib.Path(f)
        # keep only files that sit directly inside a *.narrowPeak_motif_enrichment folder
        y = list(
            filter(
                lambda p: "narrowPeak_motif_enrichment" in p,
                map(str, x.parents),
            )
        )
        if len(y) != 1:
            continue
        # replicate name = that folder's basename, up to the first "."
        repname = pathlib.Path(y[0]).name.split(".")[0]
        repname2file[repname] = f
        df = pd.read_csv(f, sep="\t", header=0, skip_blank_lines=True)
        df.to_excel(writer, sheet_name=repname, index=False)
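
# Illustrative layout (hypothetical paths): a file found at
#   <inputfolder>/rep1.genrich.narrowPeak_motif_enrichment/ame_results.txt
# has exactly one parent whose path mentions "narrowPeak_motif_enrichment", so
# its table is written to the sheet named "rep1" of <outexcelfile>; hits whose
# paths match at more than one level are skipped by the len(y) != 1 guard.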
diff --git a/workflow/scripts/aggregate_results.Rmd b/workflow/scripts/aggregate_results.Rmd
new file mode 100644
index 0000000..2575cf2
--- /dev/null
+++ b/workflow/scripts/aggregate_results.Rmd
@@ -0,0 +1,161 @@
---
title: "aggregate_DESeq2_results.Rmd"
output:
# pdf_document:
#   toc: yes
  html_document:
    toc: yes
    toc_float: yes
    code_folding: hide
params:
  rawcountsmatrix: ROI.counts.tsv
  coldata: sampleinfo.tsv
  diffatacdir: /Users/kopardevn/Documents/Projects/ccbr1266/ATACseq/DiffAtac
  FC_cutoff: 2
  FDR_cutoff: 0.05
  indexcols: Geneid
  excludecols: Chr,Start,End,Strand,Length
  outtsv: all_diff_atacs.tsv
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
suppressPackageStartupMessages(library("DESeq2"))
suppressPackageStartupMessages(library("edgeR"))
suppressPackageStartupMessages(library("tidyverse"))
suppressPackageStartupMessages(library("dplyr"))
suppressPackageStartupMessages(library("DT"))
suppressPackageStartupMessages(library("reshape2"))
suppressPackageStartupMessages(library("pander"))
suppressPackageStartupMessages(library("plotly"))
suppressPackageStartupMessages(library("ggplot2"))
suppressPackageStartupMessages(library("ggfortify"))
suppressPackageStartupMessages(library("ggrepel"))
suppressPackageStartupMessages(library("RColorBrewer"))
suppressPackageStartupMessages(library("ComplexHeatmap"))

# read in sampleinfo
coldata=params$coldata
sampleinfo=read.csv(file=coldata,header=FALSE,sep="\t",comment.char = "#",strip.white = TRUE)
colnames(sampleinfo) = c("replicateName","sampleName")

# read in counts matrix
rawcountsmatrix=params$rawcountsmatrix
countsdf=read.csv(file=rawcountsmatrix,header=TRUE,sep="\t",comment.char = "#",check.names = FALSE,strip.white = TRUE)
original_countsdf = countsdf
nroi=nrow(countsdf)

excludecols=unlist(strsplit(params$excludecols,","))
indexcols=unlist(strsplit(params$indexcols,","))
countmatrix=countsdf %>% dplyr::select(-all_of(excludecols)) %>% column_to_rownames(var="Geneid")
countmatrix=countmatrix[,sampleinfo$replicateName]
sampleinfo$librarySize=colSums(countmatrix)
nsamples=nrow(sampleinfo)
```

## ROIs

Total of `r nroi` ROIs.

## Sample Stats

Total of `r nsamples` replicates.

```{r metadata,include=TRUE,echo=FALSE,cache=FALSE}
pander(sampleinfo,style="rmarkdown")
```

## PCA

```{r allsamplepca,include=TRUE,echo=FALSE,results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE}
dds1 <- DESeqDataSetFromMatrix(countData = as.matrix(countmatrix),
                               colData = sampleinfo[,c("replicateName","sampleName")],
                               design = ~ 1)
dds1 <- DESeq(dds1)
rld1 <- vst(dds1)
assayrld1 = as.data.frame(assay(rld1))
assayrld1$row_variance = rowVars(as.matrix(assayrld1))
assayrld1 = arrange(assayrld1,desc(row_variance))
zero_variance_rows=assayrld1$row_variance<1e-5
assayrld1$row_variance = NULL
assayrld1 = assayrld1[!zero_variance_rows,]
assayrld1_original=assayrld1
if (nrow(assayrld1) > 50000) {
  assayrld1 = assayrld1[1:50000,]
}
pca1=prcomp(t(assayrld1),scale. = T)
m.pc1 = round(pca1$sdev[1]^2/sum(pca1$sdev^2)*100,2)
m.pc2 = round(pca1$sdev[2]^2/sum(pca1$sdev^2)*100,2)
m.pc3 = round(pca1$sdev[3]^2/sum(pca1$sdev^2)*100,2)
xlab=paste0("PC1(",m.pc1,"%)")
ylab=paste0("PC2(",m.pc2,"%)")
samples <- as.factor(sampleinfo$sampleName)
p <- ggplot(pca1$x,aes(x=PC1,y=PC2,label=rownames(pca1$x)))+
  geom_point(aes(col=samples,shape=samples))+scale_shape_manual(values=seq(0,15))+
  geom_text_repel(max.overlaps = 15,size=2)+
  xlab(xlab)+ylab(ylab)+
  theme_light(base_size = 10)+
  ggtitle("PCA (all samples)")+
  theme_light()
print(p)
```

```{r degs,include=TRUE,echo=FALSE,results='hide',cache=FALSE,warning=FALSE,message=FALSE,error=FALSE}
allfiles = list.files(path=params$diffatacdir,pattern="_vs_")
degfiles = allfiles[grepl(".tsv",allfiles)]
contrasts = gsub(".tsv","",degfiles)
# deg_file=paste(params$diffatacdir,"DGCR8_KO_vs_Wt.tsv",sep="/")
add_prefix <- function(string,prefix) {
  return (paste(prefix,string,sep="_"))
}
get_sig_geneids <- function(df,fc=2,fdr=0.05) {
  df %>% filter(log2FoldChange > fc) %>% filter(padj < fdr) -> df_up
  df %>% filter(log2FoldChange < (-1 * fc)) %>% filter(padj < fdr) -> df_down
  return (c(df_up$Geneid,df_down$Geneid))
}
read_deg_file <- function(deg_file) {
  df = read.csv(file=deg_file,header=TRUE,sep="\t",comment.char = "#",strip.white = TRUE)
  df %>% select(c("Geneid","log2FoldChange","pvalue","padj")) -> df
  return (df)
}
get_anno_from_deg_file <- function(deg_file) {
  df = read.csv(file=deg_file,header=TRUE,sep="\t",comment.char = "#",strip.white = TRUE)
  df %>% select(-c("baseMean","lfcSE","stat","width","strand","log2FoldChange","pvalue","padj")) -> df
  return (df)
}
rename_columns <- function(df,contrast) {
  colnames(df) <- c("Geneid",add_prefix(string="log2FoldChange",prefix=contrast),add_prefix(string="pvalue",prefix=contrast),add_prefix(string="padj",prefix=contrast))
  return(df)
}
all_sig_geneids = c()
dfs = list()
collist = c()
for (i in 1:length(degfiles)){
  # i=1
  deg_file=paste(params$diffatacdir,degfiles[i],sep="/")
  contrast=contrasts[i]
  x <- read_deg_file(deg_file)
  all_sig_geneids <- c(all_sig_geneids,get_sig_geneids(x,params$FC_cutoff,params$FDR_cutoff))
  dfs[[contrast]] <- rename_columns(x,contrast)
  for (j in c("log2FoldChange","pvalue","padj")){
    k=add_prefix(j,contrast)
    collist = c(collist,k)
  }
}
all_sig_geneids <- unique(all_sig_geneids)
dfs[["anno"]] <- get_anno_from_deg_file(deg_file=paste(params$diffatacdir,degfiles[1],sep="/"))
dfs %>% purrr::reduce(inner_join, by="Geneid") -> merged_dfs
write.table(merged_dfs,file=params$outtsv,quote = FALSE,sep="\t",row.names = FALSE,col.names = TRUE)
DT::datatable(merged_dfs,rownames = FALSE) %>% DT::formatRound(columns=collist,digits=3)
```

## Heatmap

Considering all significantly UP or DOWN ROIs aggregated across all contrasts (total of `r length(all_sig_geneids)` ROIs).

```{r heatmap,include=TRUE,echo=FALSE,results='hide',fig.width=4,fig.height=10,warning=FALSE,message=FALSE,error=FALSE}
heatmap_matrix=assay(rld1)
heatmap_matrix=heatmap_matrix[all_sig_geneids,]
Heatmap(matrix = t(scale(t(heatmap_matrix),scale = TRUE)),show_row_names = FALSE,name=" ")
```
diff --git a/workflow/scripts/aggregate_results_runner.R b/workflow/scripts/aggregate_results_runner.R
new file mode 100644
index 0000000..bde78ed
--- /dev/null
+++ b/workflow/scripts/aggregate_results_runner.R
@@ -0,0 +1,83 @@
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("argparse"))


# create parser object
parser <- ArgumentParser(description="Wrapper for running aggregate_results.Rmd")


parser$add_argument("-m", "--countsmatrix",
                    type="character",
                    help="path to the counts matrix",
                    required=TRUE)
parser$add_argument("-a", "--diffatacdir",
                    type="character",
                    help="DiffAtac directory",
                    required=TRUE)
parser$add_argument("-c", "--coldata",
                    type="character",
                    help="coldata or sampleinfo TSV file",
                    required=TRUE)
parser$add_argument("-f", "--foldchange",
                    type="double",
                    help="log2FC threshold",
                    required=FALSE,
                    default=2.0)
parser$add_argument("-p", "--fdr",
                    type="double",
                    help="adj. p-value threshold",
                    required=FALSE,
                    default=0.05)
parser$add_argument("-i", "--indexcols",
                    type="character",
                    help="comma-separated list of index columns",
                    required=TRUE)
parser$add_argument("-e", "--excludecols",
                    type="character",
                    help="comma-separated list of columns to exclude",
                    required=TRUE)
parser$add_argument("-t", "--tmpdir",
                    type="character",
                    help="temp dir",
                    required=FALSE,
                    default="/tmp")
parser$add_argument("-s", "--scriptsdir",
                    type="character",
                    help="scripts dir",
                    required=TRUE)

args <- parser$parse_args()
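
# Illustrative invocation (hypothetical paths):
#   Rscript aggregate_results_runner.R \
#     -m /path/to/ROI.counts.tsv \
#     -a /path/to/DiffAtac \
#     -c /path/to/sampleinfo.tsv \
#     -i Geneid \
#     -e Chr,Start,End,Strand,Length \
#     -s /path/to/workflow/scripts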
p-value threshold", + required=FALSE, + default=0.05) +parser$add_argument("-i", "--indexcols", + type="character", + help="comma-separated list of index columns", + required=TRUE) +parser$add_argument("-e", "--excludecols", + type="character", + help="comma-separated list of columns to exclude", + required=TRUE) +parser$add_argument("-t", "--tmpdir", + type="character", + help="temp dir", + required=FALSE, + default="/tmp") +parser$add_argument("-s", "--scriptsdir", + type="character", + help="scripts dir", + required=TRUE) + +args <- parser$parse_args() + +# check inputs +for ( f in c(args$countmatrix,args$coldata)) { + if (!file.exists(f)){ + outstr=paste("ERROR: File: ",f,"Does not exists!") + message(outstr) + quit(status = 1) + } + if (file.access(f,mode=4)!=0){ + outstr=paste("ERROR: File: ",f,"is not readable!") + message(outstr) + quit(status = 1) + } +} +setwd(args$diffatacdir) + + +outprefix = "all_diff_atacs" +outhtml = paste0(args$diffatacdir,"/",outprefix,".html") +outtsv = paste0(args$diffatacdir,"/",outprefix,".tsv") + +myparams <- list(rawcountsmatrix = args$countsmatrix, + coldata = args$coldata, + FC_cutoff = args$foldchange, + FDR_cutoff = args$fdr, + diffatacdir = args$diffatacdir, + indexcols = args$indexcols, + excludecols = args$excludecols, + outtsv = outtsv) + +rmarkdown::render(paste(args$scriptsdir,'aggregate_results.Rmd',sep="/"), + params = myparams, + output_file = outhtml, + intermediates_dir = args$tmpdir)