From 9780f1b8c9c74911f2752de74b5bed4f537c382f Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 16 Oct 2023 10:34:21 -0400 Subject: [PATCH 1/4] refactor!: update CLI from CHAMPAGNE --- assets/slurm_header_biowulf.sh | 12 +++ assets/slurm_header_frce.sh | 12 +++ conf/biowulf.config | 59 +++++-------- conf/frce.config | 29 +++++++ conf/interactive.config | 6 ++ conf/slurm.config | 27 ++++++ nextflow.config | 7 +- src/__main__.py | 92 ++++++++++---------- src/util.py | 152 ++++++++++++++++++++------------- 9 files changed, 251 insertions(+), 145 deletions(-) create mode 100644 assets/slurm_header_biowulf.sh create mode 100644 assets/slurm_header_frce.sh create mode 100644 conf/frce.config create mode 100644 conf/interactive.config create mode 100644 conf/slurm.config diff --git a/assets/slurm_header_biowulf.sh b/assets/slurm_header_biowulf.sh new file mode 100644 index 0000000..c63a778 --- /dev/null +++ b/assets/slurm_header_biowulf.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=1g +#SBATCH --time=1-00:00:00 +#SBATCH --parsable +#SBATCH -J "CRUISE" +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --output "log/slurm_%j.log" +#SBATCH --output "log/slurm_%j.log" + +module load nextflow +NXF_SINGULARITY_CACHEDIR=/data/CCBR_Pipeliner/SIFS diff --git a/assets/slurm_header_frce.sh b/assets/slurm_header_frce.sh new file mode 100644 index 0000000..5441095 --- /dev/null +++ b/assets/slurm_header_frce.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +#SBATCH --cpus-per-task=1 +#SBATCH --mem=1g +#SBATCH --time=1-00:00:00 +#SBATCH --parsable +#SBATCH -J "CRUISE" +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH --output "log/slurm_%j.log" +#SBATCH --output "log/slurm_%j.log" + +module load nextflow +NXF_SINGULARITY_CACHEDIR=/mnt/projects/CCBR-Pipelines/SIFs diff --git a/conf/biowulf.config b/conf/biowulf.config index f3faf79..cd7a6ea 100644 --- a/conf/biowulf.config +++ b/conf/biowulf.config @@ -1,51 +1,36 @@ params { - config_profile_description = 
'Biowulf nf-core config' - config_profile_contact = 'staff@hpc.nih.gov' - config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html' - max_memory = '224 GB' - max_cpus = 32 - max_time = '72 h' - - igenomes_base = '/fdb/igenomes/' -} - -executor { - - $slurm { - queue = 'norm' - queueSize = 200 - pollInterval = '2 min' - queueStatInterval = '5 min' - submitRateLimit = '6/1min' - retry.maxAttempts = 1 + config_profile_description = 'Biowulf nf-core config' + config_profile_contact = 'staff@hpc.nih.gov' + config_profile_url = 'https://hpc.nih.gov/apps/nextflow.html' + max_memory = '224 GB' + max_cpus = 32 + max_time = '72 h' + + igenomes_base = '/fdb/igenomes/' + + // CCBR shared resource paths + index_dir = '/data/CCBR_Pipeliner/db/PipeDB/Indices' + fastq_screen { + conf = "assets/fastq_screen_biowulf.conf" + db_dir = '/data/CCBR_Pipeliner/db/PipeDB/lib/fastq_screen_db/' } } singularity { enabled = true autoMounts = true - cacheDir = "/data/$USER/.singularity" + cacheDir = "/data/CCBR_Pipeliner/SIFS" envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH' } env { - SINGULARITY_CACHEDIR="/data/$USER/.singularity" - PYTHONNOUSERSITE = 1 + SINGULARITY_CACHEDIR = "/data/CCBR_Pipeliner/SIFS" } - -process { - executor = 'slurm' - maxRetries = 1 - - clusterOptions = ' --gres=lscratch:200 ' - - scratch = '/lscratch/$SLURM_JOBID' - - stageInMode = 'symlink' - stageOutMode = 'rsync' - - // for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps - cache = 'lenient' -} +process.clusterOptions = ' --gres=lscratch:200 ' +process.scratch = '/lscratch/$SLURM_JOBID' +process.stageInMode = 'symlink' +process.stageOutMode = 'rsync' +// for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps +process.cache = 'lenient' diff --git a/conf/frce.config b/conf/frce.config new file mode 100644 index 0000000..4f132a8 --- /dev/null +++ b/conf/frce.config @@ -0,0 +1,29 @@ 
+params { + config_profile_description = 'FRCE config' + max_memory = '224 GB' + max_cpus = 32 + max_time = '72 h' + + // CCBR shared resource paths + index_dir = null // TODO + fastq_screen { + conf = "assets/fastq_screen_frce.conf" // TODO + db_dir = null // TODO + } +} + +singularity { + enabled = true + autoMounts = true + cacheDir = "/mnt/projects/CCBR-Pipelines/SIFs" + envWhitelist='https_proxy,http_proxy,ftp_proxy,DISPLAY,SLURM_JOBID,SINGULARITY_BINDPATH' +} + +process.scratch = null // TODO + +process.stageInMode = 'symlink' +process.stageOutMode = 'rsync' + +// for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps +process.cache = 'lenient' +} diff --git a/conf/interactive.config b/conf/interactive.config new file mode 100644 index 0000000..432ede8 --- /dev/null +++ b/conf/interactive.config @@ -0,0 +1,6 @@ +params { + config_profile_name = 'Run tasks locally on a slurm interactive node' + max_memory = '220 GB' + max_cpus = 56 + max_time = '12 h' +} diff --git a/conf/slurm.config b/conf/slurm.config new file mode 100644 index 0000000..efe327b --- /dev/null +++ b/conf/slurm.config @@ -0,0 +1,27 @@ + +executor { + + $slurm { + queue = 'norm' + queueSize = 200 + pollInterval = '2 min' + queueStatInterval = '5 min' + submitRateLimit = '6/1min' + retry.maxAttempts = 1 + } +} + +process { + executor = 'slurm' + maxRetries = 1 + + clusterOptions = ' --gres=lscratch:200 ' + + scratch = '/lscratch/$SLURM_JOBID' + + stageInMode = 'symlink' + stageOutMode = 'rsync' + + // for running pipeline on group sharing data directory, this can avoid inconsistent files timestamps + cache = 'lenient' +} diff --git a/nextflow.config b/nextflow.config index 5ddce2a..af7be36 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,8 +57,11 @@ profiles { biowulf { includeConfig "conf/biowulf.config" } - slurmint { - includeConfig "conf/slurmint.config" + slurm { + includeConfig "conf/slurm.config" + } + interactive { + includeConfig 
"conf/interactive.config" } ci_stub { includeConfig "conf/ci_stub.config" diff --git a/src/__main__.py b/src/__main__.py index 448e5f9..71029de 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -1,5 +1,5 @@ """ -Entrypoint for pipeline CLI +Entrypoint for CRUISE CLI Check out the wiki for a detailed look at customizing this file: https://github.com/beardymcjohnface/Snaketool/wiki/Customising-your-Snaketool @@ -20,21 +20,6 @@ def common_options(func): """Common options decorator for use with click commands.""" options = [ - click.option( - "--configfile", - default="nextflow.config", - help="Custom config file", - show_default=True, - ), - click.option( - "--paramsfile", default=None, help="Custom params file", show_default=True - ), - click.option( # when threads=None, uses max available - "--threads", - help="Number of threads to use", - default=None, - show_default=True, - ), click.argument("nextflow_args", nargs=-1), ] for option in reversed(options): @@ -47,7 +32,7 @@ def common_options(func): ) @click.version_option(get_version(), "-v", "--version", is_flag=True) def cli(): - """CRUISE description TODO + """CHromAtin iMmuno PrecipitAtion sequencinG aNalysis pipEline For more options, run: cruise [command] --help""" @@ -56,15 +41,17 @@ def cli(): help_msg_extra = """ \b -CLUSTER EXECUTION: -cruise run ... -profile [profile],[profile],... -For information on Nextflow config and profiles see: -https://www.nextflow.io/docs/latest/config.html#config-profiles -\b -RUN EXAMPLES: -Use singularity: cruise run ... -profile singularity -Specify threads: cruise run ... --threads [threads] -Add NextFlow args: cruise run ... -work-dir workDir -with-docker +EXAMPLES: +Execute with slurm: + cruise run ... --mode slurm +Preview the processes that will run: + cruise run ... --mode local -preview +Add nextflow args (anything supported by `nextflow run`): + cruise run ... 
-work-dir path/to/workDir +Run with a specific installation of cruise: + cruise run --main path/to/cruise/main.nf ... +Run with a specific tag, branch, or commit from GitHub: + cruise run --main CCBR/CRUISE -r v0.1.0 ... """ @@ -74,32 +61,47 @@ def cli(): help_option_names=["-h", "--help"], ignore_unknown_options=True ), ) +@click.option( + "--main", + "main_path", + help="Path to the cruise main.nf file or the GitHub repo (CCBR/CRUISE). Defaults to the version installed in the $PATH.", + type=str, + default=nek_base(os.path.join("main.nf")), + show_default=True, +) +@click.option( + "--mode", + "_mode", + help="Run mode (slurm, local)", + type=str, + default="local", + show_default=True, +) @common_options -def run(**kwargs): +def run(main_path, _mode, **kwargs): """Run the workflow""" - # optional: merge config from CLI with nf config - # run! + if ( # this is the only acceptable github repo option for cruise + main_path != "CCBR/CRUISE" + ): + # make sure the path exists + if not os.path.exists(main_path): + raise FileNotFoundError( + f"Path to the cruise main.nf file not found: {main_path}" + ) + run_nextflow( - nextfile_path=nek_base(os.path.join("main.nf")), # Full path to Nextflow file + nextfile_path=main_path, + mode=_mode, **kwargs, ) @click.command() -@click.option( - "--configfile", - default="nextflow.config", - help="Copy template config to file", - show_default=True, -) -def config(configfile, **kwargs): - """Copy the system default config files""" - for filename in ("nextflow.config", "params.yml"): - if os.path.exists(nek_base(filename)): - copy_config( - local_config=configfile, - system_config=nek_base(filename), - ) +def init(**kwargs): + """Initialize the working directory by copying the system default config files""" + paths = ("nextflow.config", "conf/", "assets/") + copy_config(paths) + os.mkdir("log/") @click.command() @@ -109,7 +111,7 @@ def citation(**kwargs): cli.add_command(run) -cli.add_command(config) +cli.add_command(init) # 
cli.add_command(citation) # TODO uncomment if cruise is published in a journal or Zenodo diff --git a/src/util.py b/src/util.py index ac1d41e..c4e7985 100644 --- a/src/util.py +++ b/src/util.py @@ -1,13 +1,13 @@ -import sys +from time import localtime, strftime +import click +import collections.abc import os +import pprint +import shutil +import stat import subprocess +import sys import yaml -import collections.abc -from shutil import copyfile -import stat -from time import localtime, strftime - -import click def nek_base(rel_path): @@ -48,9 +48,16 @@ def append_config_block(nf_config="nextflow.config", scope=None, **kwargs): f.write("}\n") -def copy_config(local_config=None, system_config=None): - msg(f"Copying system default config to {local_config}") - copyfile(system_config, local_config) +def copy_config(config_paths, overwrite=True): + msg(f"Copying default config files to current working directory") + for local_config in config_paths: + system_config = nek_base(local_config) + if os.path.isfile(system_config): + shutil.copyfile(system_config, local_config) + elif os.path.isdir(system_config): + shutil.copytree(system_config, local_config, dirs_exist_ok=overwrite) + else: + raise FileNotFoundError(f"Cannot copy {system_config} to {local_config}") def read_config(file): @@ -106,68 +113,91 @@ def list_commands(self, ctx: click.Context): return list(self.commands) -def is_biowulf(): - is_biowulf = False - for env_var in ("HOSTNAME", "SLURM_SUBMIT_HOST"): - if env_var in os.environ.keys() and os.environ[env_var] == "biowulf.nih.gov": - is_biowulf = True - return is_biowulf +def scontrol_show(): + scontrol_dict = dict() + scontrol_out = subprocess.run( + "scontrol show config", shell=True, capture_output=True, text=True + ).stdout + if len(scontrol_out) > 0: + for line in scontrol_out.split("\n"): + line_split = line.split("=") + if len(line_split) > 1: + scontrol_dict[line_split[0].strip()] = line_split[1].strip() + return scontrol_dict + + +hpc_options = { + 
"biowulf": {"profile": "biowulf", "slurm": "assets/slurm_header_biowulf.sh"}, + "fnlcr": { + "profile": "frce", + "slurm": "assets/slurm_header_frce.sh", + }, +} + + +def get_hpc(): + scontrol_out = scontrol_show() + if "ClusterName" in scontrol_out.keys(): + hpc = scontrol_out["ClusterName"] + else: + hpc = None + return hpc def run_nextflow( - paramsfile=None, - configfile=None, nextfile_path=None, merge_config=None, threads=None, nextflow_args=None, + mode="local", ): - """Run a Nextflow workfile""" + """Run a Nextflow workflow""" nextflow_command = ["nextflow", "run", nextfile_path] - - if paramsfile: - # copy sys default params if needed - copy_config( - local_config=paramsfile, - system_config=nek_base("params.yaml"), - ) - # read the params - nf_config = read_config(paramsfile) - # merge in command line params if provided - if merge_config: - update_config(nf_config, merge_config) - # update params file - write_config(nf_config, paramsfile) - nextflow_command += ["-params-file", paramsfile] - # display the runtime params - msg_box("Runtime parameters", errmsg=yaml.dump(nf_config, Dumper=yaml.Dumper)) - - if configfile: - if not os.path.exists(configfile): - copy_config( - local_config=configfile, - system_config=nek_base("nextflow.config"), - ) - - # add threads - if threads: # when threads=None, uses max available - append_config_block(scope="executor", cpus=threads) - - nextflow_command += ["-c", configfile] - - # display the runtime configuration - # msg_box("Launcher Configuration", errmsg=open(configfile, "r").read()) # TODO verbose flag to toggle printing config? - - # add any additional Nextflow commands - if nextflow_args: - nextflow_command += list(nextflow_args) - # make sure bins are executable for nextflow processes chmod_bins_exec() - # Run Nextflow!!! 
+ hpc = get_hpc() + if mode == "slurm" and not hpc: + raise ValueError("mode is 'slurm' but no HPC environment was detected") + # add any additional Nextflow commands + args_dict = dict() + prev_arg = "" + for arg in nextflow_args: + if arg.startswith("-"): + args_dict[arg] = "" + elif prev_arg.startswith("-"): + args_dict[prev_arg] = arg + prev_arg = arg + # make sure profile matches biowulf or frce + profiles = ( + set(args_dict["-profile"].split(",")) + if "-profile" in args_dict.keys() + else set() + ) + if mode == "slurm": + profiles.add("slurm") + if hpc: + profiles.add(hpc_options[hpc]["profile"]) + args_dict["-profile"] = ",".join(sorted(profiles)) + nextflow_command += list(f"{k} {v}" for k, v in args_dict.items()) + + # Print nextflow command nextflow_command = " ".join(str(nf) for nf in nextflow_command) - if is_biowulf(): - nextflow_command = f'bash -c "module load nextflow && {nextflow_command}"' msg_box("Nextflow command", errmsg=nextflow_command) - subprocess.run(nextflow_command, shell=True, check=True) + + if mode == "slurm": + slurm_filename = "submit_slurm.sh" + with open(slurm_filename, "w") as sbatch_file: + with open(nek_base(hpc_options[hpc]["slurm"]), "r") as template: + sbatch_file.writelines(template.readlines()) + sbatch_file.write(nextflow_command) + run_command = f"sbatch {slurm_filename}" + msg_box("Slurm batch job", errmsg=run_command) + elif mode == "local": + if hpc: + nextflow_command = f'bash -c "module load nextflow && {nextflow_command}"' + run_command = nextflow_command + else: + raise ValueError(f"mode {mode} not recognized") + # Run Nextflow!!! 
+ subprocess.run(run_command, shell=True, check=True) From d84638ebba4df5503fe7395dae5119814f62d19e Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 16 Oct 2023 10:35:13 -0400 Subject: [PATCH 2/4] feat: adapt biowulf install script from champagne --- bin/install.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100755 bin/install.sh diff --git a/bin/install.sh b/bin/install.sh new file mode 100755 index 0000000..94fa848 --- /dev/null +++ b/bin/install.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +version=$1 + +repo_path=/data/CCBR_Pipeliner/Pipelines/CRUISE/cruise-dev/ +install_path=/data/CCBR_Pipeliner/Pipelines/CRUISE/${version} +bin_path=${install_path}/bin/ + +. "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" +conda activate py311 + +echo "Installing CRUISE to ${install_path}" +pip install ${repo_path} --target ${install_path} +chmod +x ${install_path}/cruise/bin/*.* + +if [[ ":$PATH:" != *":${bin_path}:"* ]];then + export PATH="${PATH}:${bin_path}" +fi + +if [[ ":$PYTHONPATH:" != *":${install_path}:"* ]];then + export PYTHONPATH="${PYTHONPATH}:${install_path}" +fi From 77bd0ebf7c710e90d519984cdd63aacfb9fb7b12 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 16 Oct 2023 10:44:22 -0400 Subject: [PATCH 3/4] chore: update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68198cd..db76b89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ - fc (fold change) - bf (bayes factor) - pr (precision recall) +- CLI (#16) + - `cruise init` to initialize a project directory. + - `cruise run` + - `--mode slurm` to submit a slurm job to biowulf or frce. + - `--main path/to/main.nf` to select a different local install of cruise, or specify the repo (`CCBR/CRUISE`) to get it from GitHub. + --8<-- "README.md" - Information on who the pipeline was developed for, and a statement if it's only been tested on Biowulf.
For example: diff --git a/docs/nextflow.md b/docs/nextflow.md new file mode 100644 index 0000000..6bddf24 --- /dev/null +++ b/docs/nextflow.md @@ -0,0 +1,23 @@ +# nextflow pipeline + +You can run the nextflow pipeline directly by specifying this GitHub repo. +You will need nextflow and either singularity or docker installed. +In this case you don't need to run `cruise init` first, +as the config files will be accessed directly from the GitHub repo. + +```sh +nextflow run CCBR/CRUISE -profile test,singularity +``` + +You can specify a specific version, tag, or branch with `-r`: + +```sh +nextflow run CCBR/CRUISE -r v1.0.0 -profile test,singularity +``` + +Create and use a custom reference genome: + +```sh +nextflow run CCBR/CRUISE -profile test -entry MAKE_REFERENCE +nextflow run CCBR/CRUISE -profile test -c results/test/genome/custom_genome.config +``` diff --git a/docs/release-guide.md b/docs/release-guide.md new file mode 100644 index 0000000..b9d2c15 --- /dev/null +++ b/docs/release-guide.md @@ -0,0 +1,23 @@ +# Release Guide + +## How to test a pre-release on biowulf + +Install the development version of cruise. + +```sh +# activate the conda env for development +. "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" +conda activate py311 + +# go to the source on biowulf and update +cd /data/CCBR_Pipeliner/Pipelines/CRUISE/cruise-dev +git pull +# optionally switch to different branch if needed + +# install the version to a hidden path (e.g. .dev, .v1.0.0.9000) in /data/CCBR_Pipeliner/Pipelines/CRUISE +cd .. 
+pip install ./cruise-dev -t ./.dev +# add it to your PATH and PYTHONPATH with: +export PATH="$PATH:/data/CCBR_Pipeliner/Pipelines/CRUISE/.dev/bin/" +export PYTHONPATH="$PYTHONPATH:/data/CCBR_Pipeliner/Pipelines/CRUISE/.dev/" +``` diff --git a/docs/user-guide/getting-started.md b/docs/user-guide/getting-started.md deleted file mode 100644 index fbb0744..0000000 --- a/docs/user-guide/getting-started.md +++ /dev/null @@ -1,48 +0,0 @@ -This should set the stage for all of the pipeline requirements. Examples are listed below. - -# Overview - -The CARLISLE github repository is stored locally, and will be used for project deployment. Multiple projects can be deployed from this one point simultaneously, without concern. - -## 1. Getting Started - -## 1.1 Introduction - -The CARLISLE Pipelie beings with raw FASTQ files and performs trimming followed by alignment using [BOWTIE2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml). Data is then normalized through either the use of an user-species species (IE E.Coli) spike-in control or through the determined library size. Peaks are then called using [MACS2](https://hbctraining.github.io/Intro-to-ChIPseq/lessons/05_peak_calling_macs.html), [SEACR](https://github.com/FredHutch/SEACR), and [GoPEAKS](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02707-w) with various options selected by the user. Peaks are then annotated, and summarized into reports. If designated, differential analysis is performed using [DESEQ2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html). QC reports are also generated with each project using [FASTQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) and [MULTIQC](https://multiqc.info/). Annotations are added using [HOMER](http://homer.ucsd.edu/homer/ngs/annotation.html) and [ROSE](https://github.com/stjude/ROSE). 
GSEA Enrichment analysis predictions are added using [CHIPENRICH](https://bioconductor.org/packages/devel/bioc/vignettes/chipenrich/inst/doc/chipenrich-vignette.html). - -The following are sub-commands used within CARLISLE: - -- initialize: initialize the pipeline -- dryrun: predict the binding of peptides to any MHC molecule -- cluster: execute the pipeline on the Biowulf HPC -- local: execute a local, interactive, session -- git: execute GitHub actions -- unlock: unlock directory -- DAG: create DAG report -- report: create SNAKEMAKE report -- testrun: copies test manifests and files to WORKDIR - -## 1.2 Setup Dependencies - -CARLISLE has several dependencies listed below. These dependencies can be installed by a sysadmin. All dependencies will be automatically loaded if running from Biowulf. - -- bedtools: "bedtools/2.30.0" -- bedops: "bedops/2.4.40" - -## 1.3 Login to the cluster - -CARLISLE has been exclusively tested on Biowulf HPC. Login to the cluster's head node and move into the pipeline location. - -``` -# ssh into cluster's head node -ssh -Y $USER@biowulf.nih.gov -``` - -## 1.4 Load an interactive session - -An interactive session should be started before performing any of the pipeline sub-commands, even if the pipeline is to be executed on the cluster. - -``` -# Grab an interactive node -sinteractive --time=12:00:00 --mem=8gb --cpus-per-task=4 --pty bash -``` diff --git a/docs/user-guide/output.md b/docs/user-guide/output.md deleted file mode 100644 index dd5bee5..0000000 --- a/docs/user-guide/output.md +++ /dev/null @@ -1,73 +0,0 @@ -This should include all pertitant information about output files, including extensions that differentiate files. An example is provided below. - -# 4. 
Expected Outputs - -The following directories are created under the WORKDIR/results directory: - -- alignment_stats: this directory include information on the alignment of each sample -- peaks: this directory contains a sub-directory that relates to the quality threshold used. - - quality threshold - - contrasts: this directory includes the contrasts for each line listed in the contrast manifest - - peak_caller: this directory includes all peak calls from each peak_caller (SEACR, MACS2, GOPEAKS) for each sample - - annotation - - go_enrichment: this directory includes gene set enrichment pathway predictions - - homer: this directory includes the annotation output from HOMER - - rose: this directory includes the annotation output from ROSE - -``` -├── alignment_stats -├── bam -├── peaks -│   ├── 0.05 -│   │   ├── contrasts -│   │   │   ├── contrast_id1.dedup_status -│   │   │   └── contrast_id2.dedup_status -│   │   ├── gopeaks -│   │   │   ├── annotation -│   │   │   │   ├── go_enrichment -│   │   │   │   │   ├── contrast_id1.dedup_status.go_enrichment_tables -│   │   │   │   │   └── contrast_id2.dedup_status.go_enrichment_html_report -│   │   │   │   ├── homer -│   │   │   │   │   ├── replicate_id1_vs_control_id.dedup_status.gopeaks_broad.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   │   ├── replicate_id1_vs_control_id.dedup_status.gopeaks_narrow.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.gopeaks_broad.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.gopeaks_narrow.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   └── rose -│   │   │   │   ├── replicate_id1_vs_control_id.dedup_status.gopeaks_broad.12500 -│   │   │   │   ├── 
replicate_id1_vs_control_id.dedup_status.gopeaks_narrow.12500 -│   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.dedup.gopeaks_broad.12500 -│   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.dedup.gopeaks_narrow.12500 -│   │   │   └── peak_output -│   │   ├── macs2 -│   │   │   ├── annotation -│   │   │   │   ├── go_enrichment -│   │   │   │   │   ├── contrast_id1.dedup_status.go_enrichment_tables -│   │   │   │   │   └── contrast_id2.dedup_status.go_enrichment_html_report -│   │   │   │   ├── homer -│   │   │   │   │   ├── replicate_id1_vs_control_id.dedup_status.macs2_narrow.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   │   ├── replicate_id1_vs_control_id.dedup_status.macs2_broad.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.macs2_narrow.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.macs2_broad.motifs -│   │   │   │   │   │   ├── homerResults -│   │   │   │   │   │   └── knownResults -│   │   │   │   └── rose -│   │   │   │   ├── replicate_id1_vs_control_id.dedup_status.macs2_broad.12500 -│   │   │   │   ├── replicate_id1_vs_control_id.dedup_status.macs2_narrow.12500 -│   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.macs2_broad.12500 -│   │   │   │   ├── replicate_id2_vs_control_id.dedup_status.macs2_narrow.12500 -│   │   │   └── peak_output -``` diff --git a/docs/user-guide/preparing-files.md b/docs/user-guide/preparing-files.md deleted file mode 100644 index 5ba6f0d..0000000 --- a/docs/user-guide/preparing-files.md +++ /dev/null @@ -1,90 +0,0 @@ -This should describe any input files needed, including config files, manifest files, and sample files. An example is provided below. - -# 2. 
Preparing Files - -The pipeline is controlled through editing configuration and manifest files. Defaults are found in the /WORKDIR/config and /WORKDIR/manifest directories, after initialization. - -## 2.1 Configs - -The configuration files control parameters and software of the pipeline. These files are listed below: - -- config/config.yaml -- resources/cluster.yaml -- resources/tools.yaml - -### 2.1.1 Cluster Config - -The cluster configuration file dictates the resources to be used during submission to Biowulf HPC. There are two different ways to control these parameters - first, to control the default settings, and second, to create or edit individual rules. These parameters should be edited with caution, after significant testing. - -### 2.1.2 Tools Config - -The tools configuration file dictates the version of each software or program that is being used in the pipeline. - -### 2.1.3 Config YAML - -There are several groups of parameters that are editable for the user to control the various aspects of the pipeline. These are : - -- Folders and Paths - - These parameters will include the input and output files of the pipeline, as well as list all manifest names. -- User parameters - - These parameters will control the pipeline features. These include thresholds and whether to perform processes. -- References - - These parameters will control the location of index files, spike-in references, adaptors and species calling information. - -#### 2.1.3.1 User Parameters - -##### 2.1.3.1.1 Duplication Status - -Users can select duplicated peaks (dedup) or non-deduplicated peaks (no_dedup) through the user parameter. - -``` -dupstatus: "dedup, no_dedup" -``` - -##### 2.1.3.1.2 Macs2 additional option - -MACS2 can be run with or without the control. 
adding a control will increase peak specificity -Selecting "Y" for the `macs2_control` will run the paired control sample provided in the sample manifest - -#### 2.1.3.2 References - -Additional reference files may be added to the pipeline, if other species were to be used. - -The absolute file paths which must be included are: - -1. fa: "/path/to/species.fa" -2. blacklist: "/path/to/blacklistbed/species.bed" - -The following information must be included: - -1. regions: "list of regions to be included; IE chr1 chr2 chr3" -2. macs2_g: "macs2 genome shorthand; IE mm IE hs" - -## 2.2 Preparing Manifests - -There are two manifests, one which required for all pipelines and one that is only required if running a differential analysis. These files describe information on the samples and desired contrasts. The paths of these files are defined in the snakemake_config.yaml file. These files are: - -- samplemanifest -- contrasts - -### 2.2.1 Samples Manifest (REQUIRED) - -This manifest will include information to sample level information. 
It includes the following column headers: - -- sampleName: the sample name WITHOUT replicate number (IE "SAMPLE") -- replicateNumber: the sample replicate number (IE "1") -- isControl: whether the sample should be identified as a control (IE "Y") -- controlName: the name of the control to use for this sample (IE "CONTROL") -- controlReplicateNumber: the replicate number of the control to use for this sample (IE "1") -- path_to_R1: the full path to R1 fastq file (IE "/path/to/sample1.R1.fastq") -- path_to_R2: the full path to R1 fastq file (IE "/path/to/sample2.R2.fastq") - -An example sampleManifest file is shown below: - -| sampleName | replicateNumber | isControl | controlName | controlReplicateNumber | path_to_R1 | path_to_R2 | -| ------------------------------- | --------------- | --------- | ------------------------------- | ---------------------- | ----------------------------------------------------------------- | ----------------------------------------------------------------- | -| 53_H3K4me3 | 1 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/53_H3K4me3_1.R1.fastq.gz | PIPELINE_HOME/.test/53_H3K4me3_1.R2.fastq.gz | -| 53_H3K4me3 | 2 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/53_H3K4me3_2.R1.fastq.gz | PIPELINE_HOME/.test/53_H3K4me3_2.R2.fastq.gz | -| HN6_H3K4me3 | 1 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/HN6_H3K4me3_1.R1.fastq.gz | PIPELINE_HOME/.test/HN6_H3K4me3_1.R2.fastq.gz | -| HN6_H3K4me3 | 2 | N | HN6_IgG_rabbit_negative_control | 1 | PIPELINE_HOME/.test/HN6_H3K4me3_2.R1.fastq.gz | PIPELINE_HOME/.test/HN6_H3K4me3_2.R2.fastq.gz | -| HN6_IgG_rabbit_negative_control | 1 | Y | - | - | PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R1.fastq.gz | PIPELINE_HOME/.test/HN6_IgG_rabbit_negative_control_1.R2.fastq.gz | diff --git a/docs/user-guide/run.md b/docs/user-guide/run.md deleted file mode 100644 index 493caab..0000000 --- a/docs/user-guide/run.md +++ /dev/null @@ -1,52 +0,0 @@ -This 
should include all information about the various run commands provided within the pipeline. - -# 3. Running the Pipeline - -## 3.1 Pipeline Overview - -The Snakemake workflow has a multiple options: - -``` -Usage: bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle -m/--runmode= -w/--workdir= -1. RUNMODE: [Type: String] Valid options: - *) init : initialize workdir - *) run : run with slurm - *) reset : DELETE workdir dir and re-init it - *) dryrun : dry run snakemake to generate DAG - *) unlock : unlock workdir if locked by snakemake - *) runlocal : run without submitting to sbatch - *) testrun: run on cluster with included test dataset -2. WORKDIR: [Type: String]: Absolute or relative path to the output folder with write permissions. -``` - -## 3.2 Commands explained - -The following explains each of the command options: - -- Preparation Commands - - init (REQUIRED): This must be performed before any Snakemake run (dry, local, cluster) can be performed. This will copy the necessary config, manifest and Snakefiles needed to run the pipeline to the provided output directory. - - dryrun (OPTIONAL): This is an optional step, to be performed before any Snakemake run (local, cluster). This will check for errors within the pipeline, and ensure that you have read/write access to the files needed to run the full pipeline. -- Processing Commands - - local: This will run the pipeline on a local node. NOTE: This should only be performed on an interactive node. - - run: This will submit a master job to the cluster, and subsequent sub-jobs as needed to complete the workflow. An email will be sent when the pipeline begins, if there are any errors, and when it completes. -- Other Commands (All optional) - - unlock: This will unlock the pipeline if an error caused it to stop in the middle of a run. 
- - testrun: This will run a test of the pipeline with test data - -To run any of these commands, follow the syntax: - -``` -bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle --runmode=COMMAND --workdir=/path/to/output/dir -``` - -## 3.3 Typical Workflow - -A typical command workflow, running on the cluster, is as follows: - -``` -bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle --runmode=init --workdir=/path/to/output/dir - -bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle --runmode=dryrun --workdir=/path/to/output/dir - -bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle --runmode=run --workdir=/path/to/output/dir -``` diff --git a/docs/user-guide/test-info.md b/docs/user-guide/test-info.md deleted file mode 100644 index ac02374..0000000 --- a/docs/user-guide/test-info.md +++ /dev/null @@ -1,71 +0,0 @@ -This should walk the user through the steps of running the pipeline using test data - -# 5. Pipeline Tutorial - -Welcome to the CARLISLE Pipeline Tutorial! - -## 5.1 Getting Started - -Review the information on the [Getting Started](https://ccbr.github.io/CARLISLE/user-guide/getting-started/) page for a complete overview of the pipeline. The tutorial below will use test data available on NIH Biowulf HPC only. All example code will assume you are running v1.0 of the pipeline, using test data available on GitHub. - -A. Change working directory to the CARLISLE repository - -B. Initialize Pipeline - -``` -bash ./path/to/dir/carlisle --runmode=init --workdir=/path/to/output/dir -``` - -## 5.2 About the test data - -This test data consists of sub-sampled inputs, consisting of two pairs of two replicate samples and one control. The reference to be used is hg38. - -## 5.3 Submit the test data - -Test data is included in the .test directory as well as the config directory.
- -A. Run the test command to prepare the data, perform a dry-run and submit to the cluster - -``` -bash ./path/to/dir/carlisle --runmode=testrun --workdir=/path/to/output/dir - -``` - -- An expected output for the `testrun` is as follows: - -``` -Job stats: -job count min threads max threads ------------------------------ ------- ------------- ------------- -DESeq 24 1 1 -align 9 56 56 -alignstats 9 2 2 -all 1 1 1 -bam2bg 9 4 4 -create_contrast_data_files 24 1 1 -create_contrast_peakcaller_files 12 1 1 -create_reference 1 32 32 -create_replicate_sample_table 1 1 1 -diffbb 24 1 1 -filter 18 2 2 -findMotif 96 6 6 -gather_alignstats 1 1 1 -go_enrichment 12 1 1 -gopeaks_broad 16 2 2 -gopeaks_narrow 16 2 2 -macs2_broad 16 2 2 -macs2_narrow 16 2 2 -make_counts_matrix 24 1 1 -multiqc 2 1 1 -qc_fastqc 9 1 1 -rose 96 2 2 -seacr_relaxed 16 2 2 -seacr_stringent 16 2 2 -spikein_assessment 1 1 1 -trim 9 56 56 -total 478 1 56 -``` - -## 5.4 Review outputs - -Review the expected outputs on the [Output](https://ccbr.github.io/CARLISLE/user-guide/output/) page. If there are errors, review and perform the steps described on the [Troubleshooting](https://ccbr.github.io/CARLISLE/user-guide/troubleshooting/) page as needed. diff --git a/docs/user-guide/troubleshooting.md b/docs/user-guide/troubleshooting.md deleted file mode 100644 index 3ac5234..0000000 --- a/docs/user-guide/troubleshooting.md +++ /dev/null @@ -1,35 +0,0 @@ -This should include basic information on how to troubleshoot the pipeline. It should also include the main pipeline developers' contact information for users to utilize, as needed. - -# Troubleshooting - -Recommended steps to troubleshoot the pipeline. - -## 1.1 Email - -Check your email for an email regarding pipeline failure. You will receive an email from slurm@biowulf.nih.gov with the subject: Slurm Job_id=[#] Name=CARLISLE Failed, Run time [time], FAILED, ExitCode 1 - -## 1.2 Review the log files - -Review the logs in two ways: - -1.
Review the master slurm file: This file will be found in `/path/to/results/dir/` and is titled `slurm-[jobid].out`. Reviewing this file will tell you what rule errored, and for any local SLURM jobs, provide error details. -2. Review the individual rule log files: After reviewing the master slurm-file, review the specific rules that failed within the `/path/to/results/dir/logs/`. Each rule will include a `.err` and `.out` file, with the following formatting: `{rulename}.{masterjobID}.{individualruleID}.{wildcards from the rule}.{out or err}` - -## 1.3 Restart the run - -After addressing the issue, unlock the output directory, perform another dry-run and check the status of the pipeline, then resubmit to the cluster. - -``` -#unlock dir -bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle --runmode=unlock --workdir=/path/to/output/dir - -#perform dry-run -bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle --runmode=dryrun --workdir=/path/to/output/dir - -#submit to cluster -bash ./data/CCBR_Pipeliner/Pipelines/CARLISLE/carlisle --runmode=run --workdir=/path/to/output/dir -``` - -## 1.4 Contact information - -If, after troubleshooting, the error cannot be resolved, or if a bug is found, please create an [issue](https://github.com/CCBR/CARLISLE/issues) and send an email to [Samantha Chill](mailto:samantha.sevilla@nih.gov).