diff --git a/lectures/bestpractices-detail/bestpractices-detail.Rmd b/lectures/bestpractices-detail/bestpractices-detail.Rmd deleted file mode 100644 index 37a524c..0000000 --- a/lectures/bestpractices-detail/bestpractices-detail.Rmd +++ /dev/null @@ -1,593 +0,0 @@ ---- -title: -author: Per Unneberg -date: "`r format(Sys.time(), '%d %B, %Y')`" -output: - revealjs::revealjs_presentation: - css: ../revealjs.css - includes: - in_header: ../footer.html - self_contained: true - highlight: tango - fig_width: 10 - fig_height: 8 - fig_caption: false - toc: true - toc_depth: 2 - slide_level: 2 - reveal_options: - slideNumber: true - previewLinks: true - minScale: 1 - maxScale: 1 - height: 1400 - width: 1200 ---- - - -```{r snakemake-byoc-knitr, echo=FALSE, eval=TRUE, include=TRUE } -library(knitr) -knitr::opts_chunk$set(warning = FALSE, message = FALSE, - fig.width=12, fig.height=10, autodep=TRUE, echo=TRUE, - cache=FALSE, include=TRUE, eval=TRUE, tidy=FALSE, error=TRUE, - comment="", - class.source = "numberLines", - class.output = c("numberLines chunkout")) -knitr::knit_hooks$set(inline = function(x) { - prettyNum(x, big.mark=" ") -}) -``` - -```{r snakemake-byoc-libs, echo=FALSE, cache=FALSE } -library(ggplot2) -library(viridis) -bw <- theme_bw(base_size=24) %+replace% theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) -theme_set(bw) -curdir <- getwd() -``` - -```{bash setup-snakemake-best-practice-repo, echo=FALSE } -if [ ! -d snakemake_best_practice ]; then -git clone git@github.com:NBISweden/snakemake_best_practice.git -fi -``` - - - -# - -
-
-

Snakemake BYOC 2021

-

Best practices, wrappers, schemas, report, config files, and more

-

Per Unneberg

-

2021-09-30

-
-
- -# Contents - -
-

A best practice repo

-

Wrappers and scripts

-

Configuration and schemas

-

Reports

-

Coding practices and hints

-
- -
-

Setup

- -- Very simple examples with snakefiles and code to run -- All snakefiles and code is available in code repository - https://github.com/NBISweden/snakemake_best_practice/ -- code has been run with Snakemake version 6.8.1 - -
- - -# [Snakemake best practices summary](https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html) - -Linting -: Snakemake (>=5.11) comes with a code quality checker (a so called - linter). It is highly recommended to run the linter before - publishing any workflow, asking questions on Stack Overflow or - filing issues on Github. - -Formatting -: There is an automatic formatter for Snakemake workflows, called - Snakefmt, which should be applied to any Snakemake workflow before - publishing it. - -Testing -: It is a good idea to add some minimal test data and configure Github - Actions for continuously testing the workflow on each new commit. - -Structure -: Stick to a standardized structure. - -Configuration -: Configuration of a workflow should be handled via config files and, - if needed, tabular configuration like sample sheets (either via - Pandas or PEPs). Use such configuration for metadata and experiement - information, **not for runtime specific configuration like threads, - resources and output folders**. For those, just rely on Snakemake’s - CLI arguments like --set-threads, --set-resources, - --set-default-resources, and --directory. - -Filenames -: Try to keep filenames short, but informative. - -Rules and functions -: Try to keep Python code like helper functions separate from rules. - -Wrappers -: Make use of Snakemake wrappers whenever possible - - -# A best practice repo - -
- -Clone the repo (`git clone -git@github.com:NBISweden/snakemake_best_practice.git`) and list -contents: - -```{bash snakemake-byoc-2021-bp-overview, cache=TRUE } -tree -a -d -L 2 -I '.snakemake|.git' snakemake_best_practice -``` - -
- -## What does it do? - -Excerpts from README.md: -```{r snakemake-byoc-2021-bp-readme, code=readLines("snakemake_best_practice/README.md")[c(1:25)], eval=FALSE, highlight=FALSE } - -``` - -
- -```{r snakemake-byoc-2021-bp-readme-tail, code=readLines("snakemake_best_practice/README.md")[c(106:112)], eval=FALSE, highlight=FALSE, attr.source='startFrom="106"' } - -``` - -Use a test data set for test driven development of the workflow. It -also gives a new user a quick idea of how to organize input files and -configuration. - -
- -## Dry-run the test suite - -```{bash snakemake-byoc-2021-dry-run } -cd snakemake_best_practice/.test -snakemake -s ../workflow/Snakefile -n -q -F -``` - -
- - -```{r cd_to_snakemake_best_practice_1, echo=FALSE } -knitr::opts_knit$set(root.dir="snakemake_best_practice/.test") -``` - -```{bash dry-run-fig, echo=TRUE } -snakemake -s ../workflow/Snakefile --rulegraph 2> /dev/null | dot -T png > rulegraph.png -``` - -```{r cd_back_1, echo=FALSE } -knitr::opts_knit$set(root.dir=curdir) -``` - - -```{r dry-run-fig-png, fig.align="center" , echo=FALSE} -knitr::include_graphics("./snakemake_best_practice/.test/rulegraph.png") -``` - -
- - -## Snakefile - -
- -```{python Snakefile, code=readLines("snakemake_best_practice/workflow/Snakefile"), eval=FALSE } - -``` - -
- - - - - -## Stuff common to all snakefiles - -
- -`workflow/rules/common.smk` - -```{python common-smk, code=readLines("snakemake_best_practice/workflow/rules/common.smk"), eval=FALSE } - -``` -
- - - - -## Input functions - - -`workflow/rules/inputfunctions.smk` - -```{python inputfunctions-smk, code=readLines("snakemake_best_practice/workflow/rules/inputfunctions.smk")[1:39], eval=FALSE } - -``` - -## Pseudo-target functions - - -`workflow/rules/inputfunctions.smk` - -```{python inputfunctions-smk-2, code=readLines("snakemake_best_practice/workflow/rules/inputfunctions.smk")[41:76], eval=FALSE, attr.source='startFrom="41"' } - -``` - - - -## Rules - -`workflow/rules/mapping.smk` - -```{python mapping-smk-rule-example, code=readLines("snakemake_best_practice/workflow/rules/mapping.smk")[4:31], eval=FALSE, attr.source='startFrom="4"' } - -``` - - -# Scripts and wrappers - -`script` directive: - -- point to external script -- path relative to **file containing the directive** - -## R script - -`workflow/rules/qc.smk`: - -```{python qc_plot_samtools_coverage, code=readLines("snakemake_best_practice/workflow/rules/qc.smk")[63:81], attr.source='startFrom="63"', eval=FALSE } - -``` - -`workflow/scripts/plot_coverage.R` - -```{r snakemake-byoc-2021-r-script, code=readLines("snakemake_best_practice/workflow/scripts/plot_coverage.R"), eval=FALSE } - -``` -S4 attributes map to rule directives (e.g. `snakemake@input[["png"]]`) - -## rmarkdown scripts - -`workflow/rules/qc.smk`: - -```{python qc_rmarkdown, code=readLines("snakemake_best_practice/workflow/rules/qc.smk")[118:129], attr.source='startFrom="118"', eval=FALSE } - -``` -
-`workflow/scripts/rmarkdown.Rmd` - -```{r rmarkdown-script, code=readLines("snakemake_best_practice/workflow/scripts/rmarkdown.Rmd"), eval=FALSE } - -``` -
- - - -## python script - - -`workflow/rules/qc.smk`: - -
-```{python qc_plot_samtools_coverage_collate, code=readLines("snakemake_best_practice/workflow/rules/qc.smk")[84:99], attr.source='startFrom="84"', eval=FALSE } - -``` -
- -`workflow/scripts/plot_coverage.py` - -
- -```{python snakemake-byoc-2021-python-script, code=readLines("snakemake_best_practice/workflow/scripts/plot_coverage.py"), eval=FALSE} - -``` -
- -Rule directives accessible via `snakemake` object (e.g. `snakemake.input.txt`) - - -## jupyter notebook integration - -`workflow/rules/qc.smk` - -```{python jupyter-notebook-rule, code=readLines("snakemake_best_practice/workflow/rules/qc.smk")[102:115], eval=FALSE, attr.source='startFrom="102"'} - -``` -Generate output in `.tests` with -```{bash jupyter-notebook-output, eval=FALSE} -snakemake --use-conda -s ../workflow/Snakefile reports/qc/notebook.html -j 1 -``` - -
- -To edit, start `jupyter-notebook` and open `workflow/notebooks/notebook.py.ipynb`: - -```{bash start-jupyter-notebook, eval=FALSE } -jupyter-notebook workflow/notebooks/notebook.py.ipynb -``` - -
- -## Wrappers - -`wrapper` directive: - -- Reusable wrapper scripts around e.g. command-line tools -- [The Snakemake Wrappers - repository](https://snakemake-wrappers.readthedocs.io/en/stable/) - contains a collection of reusable wrappers. -- accession format: `{version}/bio/{tool}/{command}` - -

Example

- -```{python qc-fastqc-wrapper, code=readLines("snakemake_best_practice/workflow/rules/qc.smk")[22:43], eval=FALSE, attr.source='startFrom="22"'} - -``` -[fastqc wrapper documentation](https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/fastqc.html) - -# Configuration and schemas - -The workflow can be configured with a configuration file: - -`workflow/config/config.yaml` - -```{r configuration-example, code=readLines("snakemake_best_practice/config/config.yaml"), eval=FALSE, highlight=FALSE} - -``` - -
- -`workflow/config/sample.tsv` - -```{r configuration-example-sample, code=readLines("snakemake_best_practice/config/samples.tsv"), eval=FALSE, highlight=FALSE} - -``` - -`workflow/config/reads.tsv` - -```{r configuration-example-reads, code=readLines("snakemake_best_practice/config/reads.tsv")[1:4], eval=FALSE, highlight=FALSE} - -``` - -
- -
- -Question: is there a way to validate configuration files, require -inputs and make sure they conform to some predefined format? - -
- - -## Configuration schemas - -Schema benefits according to [https://json-schema.org/](): - -- describes your existing data formats -- provides human- and machine-readable **documentation** -- validates data input - -
- -`workflow/schemas/samples.schema.yaml` - -```{r configuration-sample-schema, code=readLines("snakemake_best_practice/workflow/schemas/samples.schema.yaml"), eval=FALSE, highlight=FALSE} - -``` -
- - -## Configuration schemas - -
- -`workflow/schemas/config.schema.yaml` - -```{r configuration-config-schema, code=readLines("snakemake_best_practice/workflow/schemas/config.schema.yaml"), eval=FALSE, highlight=FALSE} - -``` - -
- -Recall validation step in `workflow/rules/common.smk`: - -```{python configuration-config-schema-validation, code=readLines("snakemake_best_practice/workflow/rules/common.smk")[15:20], eval=FALSE, attr.source='startFrom="15"'} - -``` - -
-
- - -# Reports - - -From snakemake 5.1 and on, generate detailed self-contained HTML -reports that encompass runtime statistics, provenance information, -workflow topology and results - -## The report directive - -`workflow/Snakefile`: - -```{python snakemake-report, code=readLines("snakemake_best_practice/workflow/Snakefile")[13:13], eval=FALSE, attr.source='startFrom="13"'} - -``` - -Workflow report template defined by `workflow/report/workflow.rst`. - - -
- -Use `report` flag to target results for inclusion in report, which -could optionally point to an rst file for captioning. - -`workflow/rules/qc.smk`: - -```{python r-plot-report, code=readLines("snakemake_best_practice/workflow/rules/qc.smk")[63:81], eval=FALSE, attr.source='startFrom="63"'} - -``` -
- -## Workflow rst template files - -Template files are [restructured text -format](https://docutils.sourceforge.io/docs/user/rst/quickstart.html) -(rst) files. - -`workflow/report/workflow.rst`: - -```{r snakemake-workflow-rst, code=readLines("snakemake_best_practice/workflow/report/workflow.rst"), eval=FALSE, highlight=FALSE } - -``` -`workflow/report/coverage.rst`: - -```{r snakemake-coverage-rst, code=readLines("snakemake_best_practice/workflow/report/coverage.rst"), eval=FALSE, highlight=FALSE } - -``` - -
-

Creating the report

- -```{r cd_to_sbp_report, echo=FALSE } -knitr::opts_knit$set(root.dir="snakemake_best_practice/.test") -``` - -```{bash snakemake-create-report, eval=TRUE, cache=TRUE } -snakemake -s ../workflow/Snakefile --report report.html -``` - -```{r cd_back_sbp_report, echo=FALSE } -knitr::opts_knit$set(root.dir=curdir) -``` - -
- -# Coding practices and hints - -

snakemake --lint

- -A linter is a code quality checker that analyzes your code and -highlights issues that need to be resolved to follow best practices. - - -```{r cd_to_snakemake_best_practice_2, echo=FALSE } -knitr::opts_knit$set(root.dir="snakemake_best_practice") -``` - - -```{bash snakemake-lint } -snakemake --lint -``` - - -

snakefmt

- -[snakefmt](https://github.com/snakemake/snakefmt) is an automated code -formatter that should be applied to the workflow prior to publication. - -```{bash snakemake-fmt } -snakefmt --compact-diff workflow/Snakefile -``` - -```{r cd_back_2, echo=FALSE } -knitr::opts_knit$set(root.dir=curdir) -``` - - -## Pre-commit - for the git power user - -[Git hooks](https://git-scm.com/docs/githooks) can -be used to identify simple issues before submission to code review. - -[Pre-commit](https://pre-commit.com) is -a "framework for managing and maintaining multi-language pre-commit -hooks". - -

Write a config file

- -
- -`.pre-commit-config.yaml`: - -```{r pre-commit-config, code=readLines("snakemake_best_practice/.pre-commit-config.yaml"), eval=FALSE, highlight=FALSE } - -``` - -
- -

Usage

- -Install git hooks - -```{bash pre-commit, eval=FALSE} -pre-commit install -``` - -and see how many warnings you get when you try to commit! - -## Github actions for continuous integration - -[Snakemake github -action](https://github.com/snakemake/snakemake-github-action) allows -running the test suite on github to make sure commits and pull -requests don't break the workflow. - -`.github/workflows/main.yaml`: - -
- -```{r github-actions, code=readLines("snakemake_best_practice/.github/workflows/main.yaml"), eval=FALSE, highlight=FALSE } - -``` - -
- -## On project file structure vs workflow file structure - -Example from my config which is loosely modelled on the -[drivendata](http://drivendata.github.io/cookiecutter-data-science/) -setup and similar to the NBIS reproducibility file structure: - -```{bash project-file-structure, cache=TRUE, echo=FALSE } -tree -a -d -L 2 -I '.snakemake|.git' project -``` - -Different snakemake workflows live in `opt` (see [File System Hierachy -standard](https://www.pathname.com/fhs/pub/fhs-2.3.html) for choice of -name). Launching from project root could then look like - -```{bash project-structure-launch, eval=FALSE} -snakemake -s opt/datasources-smk/workflow/Snakefile -j 1 -``` - - - -# Questions? diff --git a/lectures/bestpractices-detail/bestpractices-detail.qmd b/lectures/bestpractices-detail/bestpractices-detail.qmd new file mode 100644 index 0000000..3a00920 --- /dev/null +++ b/lectures/bestpractices-detail/bestpractices-detail.qmd @@ -0,0 +1,948 @@ +--- +title: Best practices in detail +subtitle: An overview of best practices, wrappers, schemas, report, config files, and more +author: Per Unneberg +date: "2 September, 2022" +institute: NBIS +from: markdown+emoji +format: + revealjs: + theme: + - white + - ../custom.scss + self-contained: false + toc: false + toc-depth: 1 + slide-level: 2 + slide-number: true + preview-links: true + chalkboard: true + # Multiple logos not possible; would need to make custom logo combining both logos + footer: Snakemake BYOC 2022 - Best practices + logo: https://nbis.se/assets/img/logos/nbislogo-green.svg + smaller: true + highlight-style: gruvbox + fig-height: 3 + fig-width: 3 +execute: + echo: true + warning: false + cache: false + include: true + autodep: true + eval: true + error: true +knitr: + opts_chunk: + code-fold: false + tidy: false + fig-format: svg +--- + +## Setup {.unnumbered .unlisted} + + + +```{r } +#| label: setup +#| echo: false +#| eval: true +#| cache: false +library(ggplot2) +library(viridis) +bw <- theme_bw(base_size=24) %+replace% theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +theme_set(bw) +snakemake_version <- system("snakemake --version", intern=TRUE) +knitr::knit_hooks$set(inline = function(x) { + prettyNum(x, big.mark=",") + }) +``` + +- Examples based on more advanced snakefiles and code to run +- All snakefiles and code are available in code repository + [https://github.com/NBISweden/snakemake_best_practice/](https://github.com/NBISweden/snakemake_best_practice/) +- Code has been run with Snakemake version `r snakemake_version` + +The best practice example workflow is a mapping and basic qc workflow +where snakemake best practices have been applied. + + +::: {.fragment} + +#### Objective + +The objective of the lecture is to provide an overview of some +advanced features and how to structure your code. Hopefully it can +give you some ideas for developing your workflow. The material can be +a bit overwhelming so see it as a smörgåsbord where you can pick +things to your liking. + +::: + +# Snakemake best practices + + +## [Snakemake best practices summary](https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html) + +::: {.incremental} + +Linting +: Snakemake (>=5.11) comes with a code quality checker (a so called + linter). It is highly recommended to run the linter before + publishing any workflow, asking questions on Stack Overflow or + filing issues on Github. 
+ +Formatting +: There is an automatic formatter for Snakemake workflows, called + Snakefmt, which should be applied to any Snakemake workflow before + publishing it. + +Testing +: It is a good idea to add some minimal test data and configure Github + Actions for continuously testing the workflow on each new commit. + +Structure +: Stick to a standardized structure. + +Configuration +: Configuration of a workflow should be handled via config files and, + if needed, tabular configuration like sample sheets (either via + Pandas or PEPs). Use such configuration for metadata and experiment + information, **not for runtime specific configuration like threads, + resources and output folders**. For those, just rely on Snakemake’s + CLI arguments like --set-threads, --set-resources, + --set-default-resources, and --directory. + +Filenames +: Try to keep filenames short, but informative. + +Rules and functions +: Try to keep Python code like helper functions separate from rules. + +Wrappers +: Make use of Snakemake wrappers whenever possible + +::: + +::: {.notes} + +- not necessary to follow these guidelines - suggestions +- there is however a need to comply with format to publish workflow in + snakemake workflow collection +- order of importance: structure, filenames > test > configuration > lint/format > wrappers +- if only snakemake were python + +::: + +## A best practice repo - standardized structure + + +Clone the repo (`git clone +git@github.com:NBISweden/snakemake_best_practice.git`) and list +contents: + +:::: {.columns} + +::: {.column width="40%"} + + +
+ +```{bash } +#| label: snakemake-byoc-2021-bp-overview-dir +#| cache: true +#| eval: true +#| echo: false +tree -a -F -L 2 -I '.snakemake|LICENSE|.git|resources*references.fasta|resources*README.md|Dockerfile|environment.yaml|.gitignore|.gitattributes|.editorconfig|.pre-commit-config.yaml|config*config.yaml|config*samples.tsv|config*reads.tsv|.ipynb_checkpoints|.myprofile|logs|reports|results|interim|*.~undo-tree~|*.png|*.zip|*.html|.github' ../../snakemake_best_practice | sed -e "s/\.\.\/\.\.\///g" | grep -v directories +``` + +
+ + +::: + +::: {.column width="60%"} + +::: {.incremental} + +.test +: Designated test directory containing a small data set which ideally + should suffice to run all or parts of the workflow. Useful for + test-drived development. + +README.md +: Describe what the workflow does and how to use it + +workflow +: Contains top-level `Snakefile` that includes rules files stored in + the `rules` sub-directory. NB: this is the main entry point to the + workflow. + +workflow/envs +: conda environment files loaded by rules + +workflow/notebooks +: notebooks that can be called by the workflow + +workflow/report +: workflow report templates + +workflow/rules +: workflow rules + +workflow/schemas +: schema files that describe and define configuration file and data formats + +workflow/scripts +: scripts called by workflow + + +::: + +::: + +:::: + + + +::: {.notes} + +Emphasize that **structure** is one of the important aspects + +::: + + + +## What does it do? + +The repo should contain a README.md describing briefly what the +workflow does. Here are some excerpts: + +```{r code=readLines("../README.md")[c(1:25)]} +#| label: snakemake-byoc-2021-bp-readme +#| eval: false +#| highlight: false +``` + +::: {.fragment} + +```{r code=readLines("../README.md")[c(106:112)]} +#| label: snakemake-byoc-2021-bp-readme-tail +#| eval: false +#| highlight: false +#| attr-source: startFrom="106" +``` + + +::: + +::: {.fragment} + +Use a test data set for test driven development of the workflow. It +also gives a new user a quick idea of how to organize input files and +configuration. + + +::: + + + + +## Dry-run the test suite + +```{bash } +#| label: snakemake-byoc-2021-dry-run-echo +#| eval: false +cd snakemake_best_practice/.test +snakemake -s ../workflow/Snakefile -n -q -F +``` + +```{bash } +#| label: snakemake-byoc-2021-dry-run +#| echo: false +snakemake -s ../workflow/Snakefile -n -q -F +``` + +## Draw the workflow + +```{bash } +#| label: dry-run-fig-command +#| echo: true +#| eval: false +snakemake -s ../workflow/Snakefile --rulegraph | dot | display +``` + +```{bash } +#| label: dry-run-fig +#| fig-format: svg +#| output: asis +#| echo: false +snakemake -s ../workflow/Snakefile --rulegraph | dot -T svg | grep -v "" +tree -a -N -F -L 2 -I '.snakemake|LICENSE|.git|resources*references.fasta|resources*README.md|Dockerfile|environment.yaml|.gitignore|.gitattributes|.editorconfig|.pre-commit-config.yaml|config*config.yaml|config*samples.tsv|config*reads.tsv|.ipynb_checkpoints|.myprofile|logs|reports|results|interim|*.~undo-tree~|*.png|*.zip|*.html|.github' ../../snakemake_best_practice | sed -z "s/\n/
\n/g;s/Snakefile/Snakefile<\/span>/;s/\.\.\/\.\.\///" | head -n -2 +echo "
" +``` +::: + + +::: {.notes} + +- explain pseudo-targets +- point out the two common idioms for collecting targets: +1. expand +2. input functions + +::: + +## Stuff common to all snakefiles + +::: {.absolute top=50 left=-200 } + +```{bash } +#| label: snakemake-byoc-2022-common-margin-tree +#| cache: false +#| eval: true +#| echo: false +#| results: asis +echo "
"
+tree -a -N -F -L 3 -I '.snakemake|LICENSE|.git|resources*references.fasta|resources*README.md|Dockerfile|environment.yaml|.gitignore|.gitattributes|.editorconfig|.pre-commit-config.yaml|config*config.yaml|config*samples.tsv|config*reads.tsv|.ipynb_checkpoints|.myprofile|logs|reports|results|interim|*.~undo-tree~|*.png|*.zip|*.html|.github' -P "*.smk" ../../snakemake_best_practice | sed -z "s/\n/
\n/g;s/common.smk/common.smk<\/span>/;s/\.\.\/\.\.\///" | head -n -2 +echo "
" +``` +::: + +
+ + +```{python code=readLines("../workflow/rules/common.smk")} +#| filename: workflow/rules/common.smk +#| label: common-smk +#| eval: false +#| code-line-numbers: "|15-16|21-22|31-32|40-42" +``` +
+ + + + +## Rules + +::: {.absolute top=50 left=-200 } + +```{bash } +#| label: snakemake-byoc-2022-qc-margin-tree +#| cache: false +#| eval: true +#| echo: false +#| results: asis +echo "
"
+tree -a -N -F -L 3 -I '.snakemake|LICENSE|.git|resources*references.fasta|resources*README.md|Dockerfile|environment.yaml|.gitignore|.gitattributes|.editorconfig|.pre-commit-config.yaml|config*config.yaml|config*samples.tsv|config*reads.tsv|.ipynb_checkpoints|.myprofile|logs|reports|results|interim|*.~undo-tree~|*.png|*.zip|*.html|.github' -P "*.smk" ../../snakemake_best_practice | sed -z "s/\n/
\n/g;s/qc.smk/qc.smk<\/span>/;s/\.\.\/\.\.\///" | head -n -2 +echo "
" +``` +::: + +```{python code=readLines("../workflow/rules/qc.smk")[46:60]} +#| filename: workflow/rules/qc.smk +#| label: qc-smk-rule-example +#| eval: false +#| attr-source: startFrom="46" +#| code-line-numbers: "|8-9" +``` + + +::: {.fragment} + + +::: {.absolute top=350 left=-200 } + +```{bash } +#| label: snakemake-byoc-2022-samtools-env-margin-tree +#| cache: false +#| eval: true +#| echo: false +#| results: asis +echo "
"
+tree -a -N -F -L 3 -I '.snakemake|LICENSE|.git|resources*references.fasta|resources*README.md|Dockerfile|environment.yaml|.gitignore|.gitattributes|.editorconfig|.pre-commit-config.yaml|config*config.yaml|config*samples.tsv|config*reads.tsv|.ipynb_checkpoints|.myprofile|logs|reports|results|interim|*.~undo-tree~|*.png|*.zip|*.html|.github' -P "samtools.yaml|R.yaml|multiqc.yaml|jupyter.yaml|bwa.yaml" ../../snakemake_best_practice | sed -z "s/\n/
\n/g;s/samtools.yaml/samtools.yaml<\/span>/;s/\.\.\/\.\.\///" | head -n -2 +echo "
" +``` +::: + +```{python code=readLines("../workflow/envs/samtools.yaml") } +#| filename: workflow/envs/samtools.yaml +#| label: samtools-conda-env +#| eval: false +#| attr-source: startFrom="46" +``` + +::: + + +# Questions? {.unnumbered .unlisted} + +# Additional topics {.unnumbered .unlisted} + +- scripts and wrappers +- input functions and pseudo rules +- configuration and schemas +- reports +- coding practices + +# Scripts and wrappers + + +::: {.notes} + + +`script` directive: + +- point to external script +- path relative to **file containing the directive** + +::: + +## R script + + + +```{python code=readLines("../workflow/rules/qc.smk")[63:81]} +#| filename: workflow/rules/qc.smk +#| label: qc_plot_samtools_coverage +#| attr-source: startFrom="63" +#| eval: false +#| code-line-numbers: "18-19" +``` + + +```{r code=readLines("../workflow/scripts/plot_coverage.R") } +#| filename: workflow/scripts/plot_coverage.R +#| label: snakemake-byoc-2021-r-script +#| eval: false +``` +S4 attributes map to rule directives (e.g. `snakemake@input[["png"]]`) + +## rmarkdown scripts + + +```{python code=readLines("../workflow/rules/qc.smk")[118:129] } +#| filename: workflow/rules/qc.smk +#| label: qc_rmarkdown +#| attr-source: startFrom="118" +#| eval: false +#| code-line-numbers: "11-12" +``` + +
+ +```{r code=readLines("../workflow/scripts/rmarkdown.Rmd") } +#| filename: workflow/scripts/rmarkdown.Rmd +#| label: rmarkdown-script +#| eval: false +``` +
+ + + +## python script + +
+```{python code=readLines("../workflow/rules/qc.smk")[84:99] } +#| filename: workflow/rules/qc.smk +#| label: qc_plot_samtools_coverage_collate +#| attr-source: startFrom="84" +#| eval: false +#| code-line-numbers: "15-16" +``` +
+ +
+ +```{python code=readLines("../workflow/scripts/plot_coverage.py") } +#| filename: workflow/scripts/plot_coverage.py +#| label: snakemake-byoc-2021-python-script +#| eval: false +``` +
+ +Rule directives accessible via `snakemake` object (e.g. `snakemake.input.txt`) + + +## jupyter notebook integration + +```{python code=readLines("../workflow/rules/qc.smk")[102:115] } +#| filename: workflow/rules/qc.smk +#| label: jupyter-notebook-rule +#| eval: false +#| attr-sources: startFrom="102" +#| code-line-numbers: "13-14" +``` +Generate output in `.tests` with +```{bash } +#| label: jupyter-notebook-output +#| eval: false +snakemake --use-conda -s ../workflow/Snakefile reports/qc/notebook.html -j 1 +``` + +::: {.fragment} + +To edit, start `jupyter-notebook` and open `workflow/notebooks/notebook.py.ipynb`: + +```{bash } +#| label: start-jupyter-notebook +#| eval: false +jupyter-notebook workflow/notebooks/notebook.py.ipynb +``` + +::: + +## Wrappers + +`wrapper` directive: + +- Reusable wrapper scripts around e.g. command-line tools +- [The Snakemake Wrappers + repository](https://snakemake-wrappers.readthedocs.io/en/stable/) + contains a collection of reusable wrappers. +- accession format: `{version}/bio/{tool}/{command}` + +### Example ### + + + +```{python code=readLines("../workflow/rules/qc.smk")[22:43] } +#| filename: workflow/rules/qc.smk +#| label: qc-fastqc-wrapper +#| eval: false +#| attr-source: startFrom="22" +#| code-line-numbers: "21-22" +``` +[fastqc wrapper documentation](https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/fastqc.html) + + + + +# Input functions and pseudo-rules +## Input functions + +Instead of specifying strings or lists of strings as input files, +snakemake can also make use of functions that return single or lists +of input files. Function **must** accept a single argument that +corresponds to the wildcards object which is generated by the rule. + +:::: {.columns} + +::: {.column width="50%"} + +::: {.fragment} + +```{python code=readLines("../workflow/rules/inputfunctions.smk")[1:39]} +#| filename: workflow/rules/inputfunctions.smk +#| label: inputfunctions-smk +#| eval: false +#| code-line-numbers: "|10-21" +``` + +::: + + +::: + +::: {.column width="50%"} +::: {.fragment} + + +```{python code=readLines("../workflow/rules/mapping.smk")[34:73]} +#| filename: workflow/rules/mapping.smk +#| label: inputfunctions-mapping-smk +#| eval: false +#| code-line-numbers: "|19-21" +``` + +::: + +::: + +:::: + +## Pseudo-target functions + + +::: {.fragment} + +Input functions also come in handy for **pseudo-targets** (i.e. rules +with only the `input` keyword). In this example the functions generate +workflow output target file names in different ways. 


:::

:::: {.columns}

::: {.column width="50%"}

::: {.fragment}

```{python code=readLines("../workflow/rules/inputfunctions.smk")[41:76]}
#| filename: workflow/rules/inputfunctions.smk
#| label: inputfunctions-smk-2
#| eval: false
#| attr-source: startFrom="41"
#| class-output: "scroll-400"
#| code-line-numbers: "|4-8"
```
:::

:::

::: {.column width="50%"}

::: {.fragment}

```{python code=readLines("../workflow/Snakefile")[39:42]}
#| filename: workflow/Snakefile
#| label: inputfunctions-snakefile-2
#| eval: false
#| class-output: "scroll-400"
```

:::

:::

::::


# Configuration and schemas
## Configuration and schemas

The workflow can be configured with a configuration file:

```{r code=readLines("../config/config.yaml") }
#| filename: config/config.yaml
#| label: configuration-example
#| eval: false
#| highlight: false
```

::: {.fragment}

```{r code=readLines("../config/samples.tsv") }
#| filename: config/samples.tsv
#| label: configuration-example-sample
#| eval: false
#| highlight: false
```

```{r code=readLines("../config/reads.tsv")[1:4] }
#| filename: config/reads.tsv
#| label: configuration-example-reads
#| eval: false
#| highlight: false
```
:::

::: {.fragment}

Question: is there a way to validate configuration files, require
inputs and make sure they conform to some predefined format?

:::


## Configuration schemas

Schema benefits according to [https://json-schema.org/](https://json-schema.org/):

- describes your existing data formats
- provides human- and machine-readable **documentation**
- validates data input

::: {.fragment}

:::: {.columns}

::: {.column width="50%"}

```{r code=readLines("../workflow/schemas/samples.schema.yaml") }
#| filename: workflow/schemas/samples.schema.yaml
#| label: configuration-sample-schema
#| eval: false
#| highlight-style: false
#| code-line-numbers: "|7-18|21-24"
```

:::

::: {.column width="50%"}

```{r code=readLines("../config/samples.tsv") }
#| filename: config/samples.tsv
#| label: configuration-example-sample-2
#| eval: false
#| highlight: false
```

:::

::::

:::

## Configuration schemas

+ + + +```{r code=readLines("../workflow/schemas/config.schema.yaml") } +#| filename: workflow/schemas/config.schema.yaml +#| label: configuration-config-schema +#| eval: false +#| highlight: false +``` + +::: {.fragment} + +Recall validation step in `workflow/rules/common.smk`: + +```{python code=readLines("../workflow/rules/common.smk")[15:20] } +#| filename: workflow/rules/common.smk +#| label: configuration-config-schema-validation +#| eval: false +#| attr-source: startFrom="15" +``` + +::: +
+ + + +# Reports +## Reports + +From snakemake 5.1 and on, generate detailed self-contained HTML +reports that encompass runtime statistics, provenance information, +workflow topology and results + +## The report directive + +```{python code=readLines("../workflow/Snakefile")[13:13] } +#| filename: workflow/Snakefile +#| label: snakemake-report +#| eval: false +#| attr-source: startFrom="13" +``` + +Workflow report template defined by `workflow/report/workflow.rst`. + + +::: {.fragment} + +Use `report` flag to target results for inclusion in report, which +could optionally point to an rst file for captioning. + + + +```{python code=readLines("../workflow/rules/qc.smk")[63:81] } +#| filename: workflow/rules/qc.smk +#| label: r-plot-report +#| eval: false +#| attr-source: startFrom="63" +#| code-line-numbers: "|4-8" +``` +::: + +## Workflow rst template files + +Template files are [restructured text +format](https://docutils.sourceforge.io/docs/user/rst/quickstart.html) +(rst) files. + + + +```{r code=readLines("../workflow/report/workflow.rst") } +#| filename: workflow/report/workflow.rst +#| label: snakemake-workflow-rst +#| eval: false +#| highlight: false +``` + + + +```{r code=readLines("../workflow/report/coverage.rst") } +#| filename: workflow/report/coverage.rst +#| label: snakemake-coverage-rst +#| eval: false +#| highlight: false +``` + +::: {.fragment} + +### Creating the report ### + + +```{bash } +#| label: snakemake-create-report +#| eval: true +#| cache: true +snakemake -s ../workflow/Snakefile --report report.html +``` +::: + + +# Coding practices +## Coding practices and hints + +##### snakemake --lint ##### + +A linter is a code quality checker that analyzes your code and +highlights issues that need to be resolved to follow best practices. + +```{bash } +#| label: snakemake-lint-echo +#| echo: true +#| eval: false +snakemake --lint +``` + + + +```{bash } +#| label: snakemake-lint +#| class-output: "scroll-300" +#| echo: false +snakemake -s ../workflow/Snakefile --lint +``` + +##### snakefmt ##### + +[snakefmt](https://github.com/snakemake/snakefmt) is an automated code +formatter that should be applied to the workflow prior to publication. + +```{bash } +#| label: snakemake-fmt-echo +#| echo: true +#| eval: false +snakefmt --check --compact-diff +``` +```{bash } +#| label: snakemake-fmt +#| echo: false +#| eval: true +snakefmt --check --compact-diff ../workflow/Snakefile +``` + + +## Pre-commit - for the git power user + +[Git hooks](https://git-scm.com/docs/githooks) can +be used to identify simple issues before submission to code review. + +[Pre-commit](https://pre-commit.com) is +a "framework for managing and maintaining multi-language pre-commit +hooks". + +##### Write a config file ##### + +
+ + + +```{r pre-commit-config, code=readLines("../.pre-commit-config.yaml"), eval=FALSE, highlight=FALSE } +#| filename: .pre-commit-config.yaml +#| label: pre-commit-config +#| eval: false +#| highlight: false +``` + +
+ +##### Usage ##### + + +Install git hooks + +```{bash } +#| label: pre-commit +#| eval: false +pre-commit install +``` + +and see how many warnings you get when you try to commit! + +## Github actions for continuous integration + +[Snakemake github +action](https://github.com/snakemake/snakemake-github-action) allows +running the test suite on github to make sure commits and pull +requests don't break the workflow. + + + +
+ +```{r code=readLines("../.github/workflows/main.yaml") } +#| filename: .github/workflows/main.yaml +#| label: github-actions +#| eval: false +#| highlight: false +``` + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lectures/custom.scss b/lectures/custom.scss new file mode 100644 index 0000000..b90442e --- /dev/null +++ b/lectures/custom.scss @@ -0,0 +1,55 @@ +/* SCSS custom modifications for NBIS presentations using quarto, revealjs and rmarkdown */ + +/*-- scss:defaults --*/ + +/*-- scss:rules --*/ + +.scroll-1000 { + max-height: 1000px; + overflow-y: auto; + background-color: inherit; +} +.scroll-400 { + max-height: 400px; + overflow-y: auto; + background-color: inherit; +} +.scroll-300 { + max-height: 300px; + overflow-y: auto; + background-color: inherit; +} +.scroll-200 { + max-height: 200px; + overflow-y: auto; + background-color: inherit; +} + +pre.out { + background-color: lightgreen; +} +pre.sourceCode.src { + background-color: lightblue; +} + +.green { color: #85be42; font-weight: bold } + +code.tree { + line-height: 8px; + font-size: 18px; +} + +code.stree { + line-height: 6.6px; + font-size: 12px; +} + +code.sstree { + line-height: 5.6px; + font-size: 10px; +} + + +code.large { + font-size: 20px; +} diff --git a/lectures/reproducibility-tools/Makefile b/lectures/reproducibility-tools/Makefile new file mode 100644 index 0000000..55cee41 --- /dev/null +++ b/lectures/reproducibility-tools/Makefile @@ -0,0 +1,8 @@ +all: reproducibility-tools.html + +%.html: %.Rmd + Rscript -e 'rmarkdown::render("$<")' + +# OPENSSL_CONF due to https://github.com/nodejs/node/issues/43132#issuecomment-1130503287 +%.pdf: %.html + OPENSSL_CONF=/dev/null Rscript -e 'library(webshot); webshot("$<", "$@")' diff --git a/lectures/reproducibility-tools/reproducibility-tools.Rmd b/lectures/reproducibility-tools/reproducibility-tools.Rmd index ef2a506..ba0669b 100644 --- a/lectures/reproducibility-tools/reproducibility-tools.Rmd +++ b/lectures/reproducibility-tools/reproducibility-tools.Rmd @@ -1,5 +1,5 @@ --- -title: "Combining Tools for Reproducible Research with Snakemake" +title: "Reproducible Research and Snakemake" subtitle: "Snakemake BYOC NBIS course" date: "`r format(Sys.time(), '%d %B, %Y')`" output: @@ -19,7 +19,7 @@ layout: true class: center, middle -.HUGE[Combining Tools for Reproducible Research with Snakemake] +.HUGE[Reproducible Research and Snakemake] ```{r Setup, echo = FALSE, message = FALSE} # Knitr setup @@ -33,436 +33,392 @@ library("kableExtra") --- -# Reproducibility is rarer than you think +# Reproducibility -The results of only 26% out of 204 randomly selected papers in the journal -*Science* could be reproduced.1 - -.tiny[1 Stodden et. al (2018). "An empirical analysis of journal policy effectiveness for computational reproducibility". PNAS. 115 (11): 2584-2589] +- Reproducible research is about being able to replicate the results of a study +- It is an important aspect of the scientific method +- **Computational reproducibility** is one part of it +- Ideally, given the **same data** and the **same code**, there are identical outcomes -- -> Many journals are revising author guidelines to include data and code -> availability. +*Code* encompasses +- The workflow itself (→ `Snakefile`) +- The helper scripts you are calling (→ `scripts/`) +- The 3rd-party tools you are running/the execution environment (→ this lecture) + + +--- + +# Computational reproducibility + +Why the effort? + +.tiny[M. Schwab et al. *Making scientific computations reproducible*. 
https://dx.doi.org/10.1109/5992.881708] + +> Because many researchers typically forget details +> of their own work, they are not unlike strangers +> when returning to projects after time away. +> Thus, efforts to communicate your work to +> strangers can actually help you communicate +> with yourself over time. -- -> (...) an improvement over no policy, but currently insufficient for -> reproducibility. +→ **You** are part of the target audience --- -# Combining Tools for Reproducible Research with Snakemake +# Don’t be *that* person -.center[] +*Science* implemented a replication policy in 2011. +A study in 2018 requested raw data and code in accordance with the policy. +Some answers: -- +> When you approach a PI for the source codes and raw data, you better explain who you are, +> whom you work for, why you need the data and what you are going to do with it. -* Track your Snakemake code with .green[Git] and share it in a remote .green[repository] on GitHub or BitBucket (not covered in this lecture) +  -- -* Combine Snakemake with .green[Conda] and/or .green[containers] to make the compute environment reproducible +> I have to say that this is a very unusual request without any explanation! +> Please ask your supervisor to send me an email with a detailed, and I mean detailed, explanation. -- -* Integrate foreign workflow management systems such as .green[Nextflow] pipelines into your Snakemake workflow +(26% out of 204 randomly selected papers in the journal could be reproduced.) + +.tiny[Stodden et. al (2018). *An empirical analysis of journal policy effectiveness for computational reproducibility* https://doi.org/10.1073/pnas.1708290115] --- -# Conda +# Combine tools to make research reproducible -* Is a .green[package, dependency, and environment] manager +.center[] -- - > packages: any type of program (_e.g._ bowtie2, snakemake etc.) +* Track code changes over time with .green[Git] and share it on [GitHub](https://github.com) (not this talk) - > dependency: other software required by a package +-- - > environment: a distinct collection of packages +* Make your workflow reproducible with a workflow manager (.green[Snakemake], .green[Nextflow], .green[WDL]) -- -* Keeps track of the dependencies between packages in each environment +* Make the execution environment reproducible with .green[Conda] environments and/or .green[containers] + --- -# Conda +# Conda: a .green[package], .green[dependency], and .green[environment] manager -## 1. Running a Snakemake rule with a Conda environment +* Conda installs packages +* Packages come from a central repository at https://anaconda.org/ +* Users can contribute their own packages via *channels* +* Highly recommended: The [Bioconda](https://bioconda.github.io/) channel --- +--- -* Make sure you have Conda .green[installed] (Miniconda or Anaconda) +# Using Conda --- +* Install Conda, for example with [Miniconda](https://docs.conda.io/en/latest/miniconda.html) -* Find your Conda .green[package] on http://anaconda.org +* Set up the [Bioconda](https://bioconda.github.io/) channel -- -* Create a Conda .green[environment file] (e.g. 
`bwa.yaml`) - -```{python conda env one, eval = FALSE} -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bwa=0.7.17 +* Install Samtools and BWA into a new **Conda environment** named `mapping`: +```{bash, eval=FALSE} +$ conda create -n mapping samtools bwa ``` -.tiny[source: [best practice example](https://github.com/NBISweden/snakemake_best_practice)] +-- +* Conda also installs all .green[dependencies] – other software required by Samtools and/or BWA. -- -* Store your `yaml` files in a directory for environments +To use the tools in the environment, .green[activate] it: +```{bash, eval=FALSE} +$ conda activate mapping +$ samtools --version +samtools 1.15.1 +``` -- - -* For reproducibility, it is important to keep include package .green[versions] in your environment file +* Install a tool into an existing environment: +```{bash, eval=FALSE} +conda install -n mapping bowtie2 +``` +(Leaving out `-n mapping` installs into the currently active environment.) --- -# Conda +# Conda environments -## 1. Running a Snakemake rule with a Conda environment - -* Add the .green[path] to the Conda environment `yaml` file to your rule using `conda` +* You can have as many environments as you wish -- -```{python conda rule, eval = FALSE} -rule map_bwa_index: - output: expand("{{ref}}{ext}", ext=[".amb", ".ann", ".bwt", ".pac", ".sa"]) - input: config["ref"] - log: "logs/bwa/index/{ref}.log" - conda: "../envs/bwa.yaml" - shell: - "bwa index {input}" -``` - -.tiny[modified from: [best practice example](https://github.com/NBISweden/snakemake_best_practice)] +* Environments are independent -- -* Start your workflow on the command line with `--use-conda` +* If something is broken, simply delete the environment and start over -```{bash snakemake use conda, eval=FALSE} -$ snakemake --use-conda +-- + +```{bash, eval=FALSE} +$ conda env remove -n mapping ``` -- -* This doesn't work if you use `run` (instead of `shell` or `script`) +* To test a new tool, install it into a fresh Conda environment. Delete the environment to uninstall. + +-- + +* Find packages by searching [anaconda.org](https://anaconda.org) or with `conda search` + --- -# Conda +# Conda environment files -## 2. Using one Conda environment for the entire workflow +* Conda environments can be created from .green[environment files] in YAML format. -- -* Write a Conda .green[environment file] that includes all tools used by the workflow (save it as e.g. `environment.yaml`) +* Example `bwa.yaml`: -```{python conda env big, eval=FALSE} -name: best-practice-smk +```{yaml conda env one, eval = FALSE} channels: - conda-forge - bioconda - - default + - defaults dependencies: - - snakemake=6.8.0 - - python=3.8 - - pandas=1.3.3 - - jupyter=1.0 - - jupyter_contrib_nbextensions=0.5.1 - - jupyterlab_code_formatter=1.4 - bwa=0.7.17 - - multiqc=1.11 - - r-ggplot2=3.3.5 - - samtools=1.13 ``` -.tiny[source: [best practice example](https://github.com/NBISweden/snakemake_best_practice)] +-- +* Create the environment: +```{bash, eval = FALSE} +$ conda env create -n bwa -f bwa.yaml +``` --- -# Conda +# Snakemake + Conda -## 2. 
Using one Conda environment for the entire workflow +## Option one: A single environment for the entire workflow -* .green[Create] the environment - --- - -```{bash conda create, eval=FALSE} -$ conda env create -f environment.yml +* Write an environment file (`environment.yaml`) that includes .green[all tools used by the workflow]: +```{python conda env big, eval=FALSE} +name: best-practice-smk +channels: + - conda-forge + - bioconda + - default +dependencies: + - snakemake=6.8.0 # ← Snakemake is part of the environment +... + - multiqc=1.11 # ← Version numbers for reproducibility + - samtools=1.13 ``` -- - -* .green[Activate] your Conda environment - -```{bash conda activate, eval=FALSE} +* Create the environment, activate it and run the workflow within it: +```{bash snakemake conda env, eval=FALSE} +$ conda env create -f environment.yml $ conda activate best-practice-smk +$ snakemake ``` -- +* Possibly helpful: `conda export -n envname > environment.yaml` -* Start your Snakemake workflow - -```{bash snakemake conda env, eval=FALSE} -(best-practice-smk) [...] $ snakemake -``` +.tiny[source: [best practice example](https://github.com/NBISweden/snakemake_best_practice)] --- +# Snakemake + Conda -# Containers +## Option two: Rule-specific environments -## What can I use containers for? +You can let Snakemake create and activate Conda environments for you. -- - -* Run applications securely .green[isolated] in a container, packaged with .green[all dependencies and libraries] +1. Create the environment file, such as `envs/bwa.yaml` (`envs/` is best practice) -- - -* As advanced .green[environment manager] - --- - -* To package your .green[code] with the environment it needs - --- - -* To package a whole .green[workflow] (*e.g.* to accompany a manuscript) - +1. Add the `conda:` directive to the rule: +```{python conda rule, eval = FALSE} +rule create_bwa_index: + output: ... + input: ... + conda: "envs/bwa.yaml" # ← Path to environment YAML file + shell: + "bwa index {input}" +``` -- - -* And much more +1. Run `snakemake --use-conda` -- -## Docker vs. Singularity - --- +* Snakemake creates the environment for you and re-uses it next time +* If the YAML file changes, the environment is re-created +* `conda:` does not work if you use `run:` (instead of `shell:` or `script:`) -* Docker was developed for .green[any operating system] except high-performance computing (HPC) clusters --- +.tiny[modified from: [best practice example](https://github.com/NBISweden/snakemake_best_practice)] -* Singularity is an open source container platform suitable for .green[HPC clusters] --- -# Containers - -## Docker nomenclature +# Using a "module" system --- +* Conda environments can be large and slow to create -* A Docker .green[file] is a recipe used to build a Docker .green[image] +* Some cluster operators frown upon using it -- -* A Docker .green[image] is a standalone executable package of software - --- - -* A Docker .green[container] is a standard unit of software run on the Docker Engine +* UPPMAX and other clusters have a .green[module] command for getting access to software: +``` +$ module load bioinfo-tools bwa +``` -- -* .green[DockerHub] is an online service for sharing Docker images +* Snakemake supports this with the `envmodules:` directive: +```{bash, eval = FALSE} +rule create_bwa_index: + output: ... + input: ... 
+ envmodules: + "bioinfo-tools", + "bwa", + conda: "envs/bwa.yaml" # ← Fallback + shell: + "bwa index {input}" +``` --- +* Run with `snakemake --use-envmodules` -* Docker images can be converted into Singularity images +* For reproducibility, [the Snakemake documentation recommends](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#using-environment-modules) to also include a `conda:` section --- # Containers -## 1. Running Snakemake rules with Singularity +* Containers represent another way of packaging applications -- -* Snakemake can run a rule .green[isolated] in a container, using Singularity +* Each container contains the application itself and .green[all system-level dependencies and libraries] (that is, a functional Linux installation) -- -* All Conda packages are available as Docker and Singularity images, _e.g._ on http://biocontainers.pro (bioconda channel) +* It is fully .green[isolated] from the other software on the machine: + By default, the tools in the container can only access what is in the container. -- -* Many other Docker images are available on [DockerHub](https://hub.docker.com/) - --- - -* Or build your own Docker or Singularity images +* The most common software for managing containers is .green[Docker] --- # Containers -## 1. Running Snakemake rules with Singularity +## Docker nomenclature -- - -* Make sure your system has Singularity .green[installed] +* A Docker .green[image] is a standalone executable package of software (on disk) -- - -* Find the Docker or Singularity .green[image] in which you want to run the rule +* A .green[Dockerfile] is a recipe used to build a Docker .green[image] -- - -* Add the .green[link] to the container image (or the path to a Singularity `*.sif` file) to your rule using the `container` directive - -```{python singularity rule, eval = FALSE} -rule NAME: - input: - "table.txt" - output: - "plots/myplot.pdf" - container: - "docker://joseespinosa/docker-r-ggplot2" - script: - "scripts/plot-stuff.R" -``` - -.tiny[source: [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#running-jobs-in-containers)] +* A Docker .green[container] is a standard unit of software run on the Docker Engine + (running an image gives a container) -- - -* Start your workflow on the command line with `--use-singularity` - -```{bash snakemake use singularity, eval=FALSE} -$ snakemake --use-singularity -``` - ---- - -# Containers - -## 2. Packaging your Snakemake workflow in a Docker container +* .green[DockerHub] is an online service for sharing Docker images -- -* Make sure your system has Docker .green[installed] - --- +## Docker vs Singularity -* Write a .green[Docker file], _e.g._ [see this example](https://github.com/NBISweden/workshop-reproducible-research/blob/main/tutorials/docker/Dockerfile) +* On high-performance clusters (HPC), Docker is often not installed due to security concerns. + .green[Singularity] is often available as an alternative. -- +* Docker images can be converted into Singularity images - * Start with the official `Ubuntu` image - * Install Miniconda and other required tools (_e.g._ Snakemake) - * Add the project files (e.g. `Snakefile`, `config.yaml`, `environment.yaml`) - * Install the Conda environment containing all packages run by the workflow +-- +* → Singularity can be used to run Docker containers --- -# Containers +# Running Snakemake jobs in containers -## 2. 
Packaging your Snakemake workflow in a Docker container +Snakemake can run a jobs in a container using Singularity -* Create a Docker .green[image] from your Docker file (_e.g._ called `my_workflow`) - -```{bash docker image, eval=FALSE} -$ docker build -t my_workflow . -``` +* Ensure your system has Singularity installed -- -* .green[Run] your container, _e.g._ - -```{bash docker run, eval=FALSE} -$ docker run my_workflow -``` +* Find a Docker or Singularity image with the tool to run (https://biocontainers.pro/ or [DockerHub](https://hub.docker.com/)) -- -* .green[Share] your Docker file on GitHub or BitBucket, or your Docker image on DockerHub - ---- - -# Combinations of Conda and Containers +* Add the `container:` directive to your rule: -## Combine Conda-based package management with running jobs in containers +```{python singularity rule, eval = FALSE} +rule minimap2_version: + container: "docker://quay.io/biocontainers/minimap2:2.24--h5bf99c6_0" # ← "docker://" is needed + shell: + "minimap2 --version" +``` -- -* A container can be specified globally (for the entire workflow) for a workflow - with rule-specific Conda environments - -* Snakemake then runs each job in this container with its corresponding Conda - environment when run with `--use-conda --use-singularity` +* Start your workflow on the command line with `--use-singularity` -.tiny[More info: [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#ad-hoc-combination-of-conda-package-management-with-containers) & [best practice example](https://github.com/NBISweden/snakemake_best_practice)] +```{bash snakemake use singularity, eval=FALSE} +$ snakemake --use-singularity -j 1 +... +Pulling singularity image docker://quay.io/biocontainers/minimap2:2.24--h5bf99c6_0. +... +Activating singularity image .../.snakemake/singularity/342e6ddbac7e5929a11e6ae9350454c0.simg +INFO: Converting SIF file to temporary sandbox... +2.24-r1122 +INFO: Cleaning up image... +... +``` --- -# Combinations of Conda and Containers +# Containers – advanced topics -## Containerization of Conda-based workflows +* A [Docker image to use for *all* rules can be specified](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#running-jobs-in-containers) -- - -* Snakemake can automatically generate a Docker file that contains all - Conda environments used by the rules of the workflow using the flag `--containerize` - -.tiny[More info: [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#containerization-of-conda-based-workflows)] - ---- - -# Integrating foreign workflow management systems - -* From version 6.2 on, Snakemake can run workflows written in other workflow - management systems such as .green[Nextflow] +* You can package your entire workflow into a Docker image by writing a .green[Dockerfile]. + [See this example](https://github.com/NBISweden/workshop-reproducible-research/blob/0ee1eca78ccefbd06fbeb2c0aba37030230df90d/tutorials/containers/Dockerfile) + - Snakemake runs *inside* the container. 
+ - To run the workflow, only Docker or Singularity is needed -- - -.pull-left[ - -* The workflow runs in .green[Snakemake] until a rule to run the foreign workflow is reached - -* In this rule, Snakemake .green[hands over] to the other workflow manager - -* Afterwards, .green[Snakemake] continues to run rules processing the output files of the foreign workflow - -] +* [Conda and containers can be combined]([Snakemake documentation](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#ad-hoc-combination-of-conda-package-management-with-containers): Specify a global container, run with `--use-conda --use-singularity`, and Snakemake creates the Conda environment within the container. -- - -.pull-right[ - -```{python nextflow, eval = FALSE} -rule chipseq_pipeline: - input: - input="design.csv", - fasta="data/genome.fasta", - gtf="data/genome.gtf", - output: - "multiqc/broadPeaks/multiqc_report.html", - params: - pipeline="nf-core/chipseq", - revision="1.2.1", - profile=["conda"], - handover: True - wrapper: - "0.74.0/utils/nextflow" -``` - -.tiny[More info & source: [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/snakefiles/foreign_wms.html)] - -] +* [Snakemake can automatically generate a Dockerfile](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#containerization-of-conda-based-workflows) + that contains all Conda environments used by the rules of the workflow using the flag + `--containerize`. --- @@ -472,7 +428,7 @@ There are many ways to use other .green[tools for reproducible research] togethe -- -* Use .green[Git] to version control, backup and share your code +* Use .green[Git] for version control, backup and share your code -- @@ -480,20 +436,20 @@ There are many ways to use other .green[tools for reproducible research] togethe -- -* Run your rules in isolated Singularity .green[containers] +* Run your rules in isolated Docker/Singularity .green[containers] -- * Package your entire workflow in a .green[Docker container] --- - -* Run pipelines written in .green[other workflow management systems] in your Snakemake workflow ---- + diff --git a/lectures/reproducibility-tools/reproducibility-tools.pdf b/lectures/reproducibility-tools/reproducibility-tools.pdf index fe8990c..c2fa933 100644 Binary files a/lectures/reproducibility-tools/reproducibility-tools.pdf and b/lectures/reproducibility-tools/reproducibility-tools.pdf differ diff --git a/lectures/running_snakemake/examples/Snakefile1 b/lectures/running_snakemake/examples/Snakefile1 new file mode 100644 index 0000000..c2c65c6 --- /dev/null +++ b/lectures/running_snakemake/examples/Snakefile1 @@ -0,0 +1,13 @@ +rule bwa_mem_CHS_HG00512: + output: + "bam/CHS.HG00512.bam" + input: + "resources/ref.fa", + "data/CHS.HG00512_1.fastq.gz", + "data/CHS.HG00512_2.fastq.gz", + shell: + "bwa mem -t 1 {input}" + + +rule all: + input: expand("bam/{sample}.bam", sample=["CHS.HG00512", "PUR.HG00731"]) diff --git a/lectures/running_snakemake/examples/envs/bwa.yaml b/lectures/running_snakemake/examples/envs/bwa.yaml new file mode 100644 index 0000000..95b7375 --- /dev/null +++ b/lectures/running_snakemake/examples/envs/bwa.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bwa=0.7.17 diff --git a/lectures/running_snakemake/examples/ex1.smk b/lectures/running_snakemake/examples/ex1.smk new file mode 100644 index 0000000..9402008 --- /dev/null +++ b/lectures/running_snakemake/examples/ex1.smk @@ -0,0 +1,9 @@ +rule bwa_mem_CHS_HG00512: + output: + 
"bam/CHS.HG00512.bam" + input: + "resources/ref.fa", + "data/CHS.HG00512_1.fastq.gz", + "data/CHS.HG00512_2.fastq.gz", + shell: + "bwa mem -t 1 {input} | samtools view -b -o {output}" diff --git a/lectures/running_snakemake/examples/ex2.smk b/lectures/running_snakemake/examples/ex2.smk new file mode 100644 index 0000000..a2f52d2 --- /dev/null +++ b/lectures/running_snakemake/examples/ex2.smk @@ -0,0 +1,9 @@ +rule bwa_mem_wildcard: + output: + "bam/{sample}.bam" + input: + "resources/ref.fa", + "data/{sample}_1.fastq.gz", + "data/{sample}_2.fastq.gz", + shell: + "bwa mem -t 1 {input} | samtools view -b -o {output}" diff --git a/lectures/running_snakemake/examples/ex3.smk b/lectures/running_snakemake/examples/ex3.smk new file mode 100644 index 0000000..0b07602 --- /dev/null +++ b/lectures/running_snakemake/examples/ex3.smk @@ -0,0 +1,12 @@ +rule all: + input: expand("bam/{sample}.bam", sample=["CHS.HG00512", "PUR.HG00731"]) + +rule bwa_mem_wildcard: + output: + "bam/{sample}.bam" + input: + "resources/ref.fa", + "data/{sample}_1.fastq.gz", + "data/{sample}_2.fastq.gz", + shell: + "bwa mem -t 1 {input} | samtools view -b -o {output}" diff --git a/lectures/running_snakemake/examples/ex4.smk b/lectures/running_snakemake/examples/ex4.smk new file mode 100644 index 0000000..6c0da5a --- /dev/null +++ b/lectures/running_snakemake/examples/ex4.smk @@ -0,0 +1,14 @@ +rule all: + input: expand("bam/{sample}.bam", sample=["CHS.HG00512", "PUR.HG00731"]) + +rule bwa_mem_wildcard: + output: + "bam/{sample}.bam" + input: + "resources/ref.fa", + "data/{sample}_1.fastq.gz", + "data/{sample}_2.fastq.gz", + conda: + "envs/bwa.yaml" + shell: + "bwa mem -t 1 {input} | samtools view -b -o {output}" diff --git a/lectures/running_snakemake/examples/ex5.smk b/lectures/running_snakemake/examples/ex5.smk new file mode 100644 index 0000000..4f51476 --- /dev/null +++ b/lectures/running_snakemake/examples/ex5.smk @@ -0,0 +1,16 @@ +rule all: + input: expand("bam/{sample}.bam", sample=["CHS.HG00512", "PUR.HG00731"]) + +rule bwa_mem_wildcard: + output: + "bam/{sample}.bam" + input: + "resources/ref.fa", + "data/{sample}_1.fastq.gz", + "data/{sample}_2.fastq.gz", + conda: + "envs/bwa.yaml" + singularity: + "docker://lh3lh3/bwa" + shell: + "bwa mem -t 1 {input} | samtools view -b -o {output}" diff --git a/lectures/running_snakemake/examples/ex6.smk b/lectures/running_snakemake/examples/ex6.smk new file mode 100644 index 0000000..c66eba3 --- /dev/null +++ b/lectures/running_snakemake/examples/ex6.smk @@ -0,0 +1,20 @@ +rule all: + input: expand("bam/{sample}.bam", sample=["CHS.HG00512", "PUR.HG00731"]) + +rule bwa_mem_wildcard: + output: + "bam/{sample}.bam" + input: + "resources/ref.fa", + "data/{sample}_1.fastq.gz", + "data/{sample}_2.fastq.gz", + conda: + "envs/bwa.yaml" + singularity: + "docker://lh3lh3/bwa" + envmodules: + "uppmax", + "bioinfo-tools", + "bwa" + shell: + "bwa mem -t 1 {input} | samtools view -b -o {output}" diff --git a/lectures/running_snakemake/examples/ex7.smk b/lectures/running_snakemake/examples/ex7.smk new file mode 100644 index 0000000..3d8cc41 --- /dev/null +++ b/lectures/running_snakemake/examples/ex7.smk @@ -0,0 +1,29 @@ +rule all: + input: "bam/merged.bam" + +rule bwa_mem_wildcard: + output: + "bam/{sample}.bam", + "bam/{sample}.bam.bai" + input: + "resources/ref.fa", + "data/{sample}_1.fastq.gz", + "data/{sample}_2.fastq.gz", + threads: 2 + log: "logs/bam/{sample}.bam.log" + shell: + "bwa mem -t {threads} {input} 2> {log} | samtools sort --write-index -@{threads} - -o 
{output[0]}##idx##{output[1]} 2>> {log}" + + +rule samtools_merge_bam: + output: + "bam/merged.bam", + input: + bam = expand("bam/{sample}.bam", sample=["CHS.HG00512", "PUR.HG00731"]), + bai = expand("bam/{sample}.bam.bai", sample=["CHS.HG00512", "PUR.HG00731"]), + threads: 2 + log: "logs/bam/merged.bam.log" + resources: + mem_mb=4000 + shell: + "samtools merge -@{threads} {output} {input.bam} 2> {log}" diff --git a/lectures/running_snakemake/examples/local/config.yaml b/lectures/running_snakemake/examples/local/config.yaml new file mode 100644 index 0000000..b44c593 --- /dev/null +++ b/lectures/running_snakemake/examples/local/config.yaml @@ -0,0 +1,16 @@ +rerun-incomplete: true +keep-going: true +use-conda: true +use-singularity: true +use-envmodules: true + +default-resources: + - runtime=100 + - mem_mb=6000 + - disk_mb=1000000 +set-threads: + - bwa_mem_wildcard=4 +set-resources: + - bwa_mem_wildcard:runtime=1000 + - bwa_mem_wildcard:mem_mb=6000 + - samtools_merge_bam:runtime=100 diff --git a/lectures/running_snakemake/examples/myprofile/CookieCutter.py b/lectures/running_snakemake/examples/myprofile/CookieCutter.py new file mode 100644 index 0000000..ba4fb70 --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/CookieCutter.py @@ -0,0 +1,39 @@ +# +# Based on lsf CookieCutter.py +# +import os +import json + +d = os.path.dirname(__file__) +with open(os.path.join(d, "settings.json")) as fh: + settings = json.load(fh) + + +def from_entry_or_env(values, key): + """Return value from ``values`` and override with environment variables.""" + if key in os.environ: + return os.environ[key] + else: + return values[key] + + +class CookieCutter: + + SBATCH_DEFAULTS = from_entry_or_env(settings, "SBATCH_DEFAULTS") + CLUSTER_NAME = from_entry_or_env(settings, "CLUSTER_NAME") + CLUSTER_CONFIG = from_entry_or_env(settings, "CLUSTER_CONFIG") + + @staticmethod + def get_cluster_option() -> str: + cluster = CookieCutter.CLUSTER_NAME + if cluster != "": + return f"--cluster={cluster}" + return "" + + @staticmethod + def get_cluster_logpath() -> str: + return "logs/slurm/%r/%j-%w" + + @staticmethod + def get_cluster_jobname() -> str: + return "%r_%w" diff --git a/lectures/running_snakemake/examples/myprofile/config.yaml b/lectures/running_snakemake/examples/myprofile/config.yaml new file mode 100644 index 0000000..b9aebae --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/config.yaml @@ -0,0 +1,29 @@ + +cluster-sidecar: "slurm-sidecar.py" +cluster-cancel: "scancel" +restart-times: "0" +jobscript: "slurm-jobscript.sh" +cluster: "slurm-submit.py" +cluster-status: "slurm-status.py" +max-jobs-per-second: "10" +max-status-checks-per-second: "10" +local-cores: 1 +latency-wait: "5" +use-conda: "False" +use-singularity: "False" +jobs: "500" +printshellcmds: "False" + +# Example resource configuration +# default-resources: +# - runtime=100 +# - mem_mb=6000 +# - disk_mb=1000000 +# # set-threads: map rule names to threads +# set-threads: +# - single_core_rule=1 +# - multi_core_rule=10 +# # set-resources: map rule names to resources in general +# set-resources: +# - high_memory_rule:mem_mb=12000 +# - long_running_rule:runtime=1200 diff --git a/lectures/running_snakemake/examples/myprofile/settings.json b/lectures/running_snakemake/examples/myprofile/settings.json new file mode 100644 index 0000000..2c63298 --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/settings.json @@ -0,0 +1,5 @@ +{ + "SBATCH_DEFAULTS": "--account=account", + "CLUSTER_NAME": "", + "CLUSTER_CONFIG": "" +} 
diff --git a/lectures/running_snakemake/examples/myprofile/slurm-jobscript.sh b/lectures/running_snakemake/examples/myprofile/slurm-jobscript.sh new file mode 100755 index 0000000..391741e --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/slurm-jobscript.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# properties = {properties} +{exec_job} diff --git a/lectures/running_snakemake/examples/myprofile/slurm-sidecar.py b/lectures/running_snakemake/examples/myprofile/slurm-sidecar.py new file mode 100755 index 0000000..9600d4b --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/slurm-sidecar.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +"""Run a Snakemake v7+ sidecar process for Slurm + +This sidecar process will poll ``squeue --me --format='%i,%T'`` every 60 +seconds by default (use environment variable ``SNAKEMAKE_SLURM_SQUEUE_WAIT`` +for adjusting this). + +Note that you have to adjust the value to fit to your ``MinJobAge`` Slurm +configuration. Jobs remain at least ``MinJobAge`` seconds known to the +Slurm controller (default of 300 seconds). If you query ``squeue`` every +60 seconds then this is plenty and you will observe all relevant job status +states as they are relevant for Snakemake. + +If the environment variable ``SNAKEMAKE_CLUSTER_SIDECAR_VARS`` is set then +the ``slurm-status.py`` of the slurm profile will attempt to query this +sidecar process via HTTP. As the sidecar process does not update its +cache in real-time, setting ``SNAKEMAKE_SLURM_SQUEUE_WAIT`` too large might +lead to Snakemake missing the "done" job state. The defaults of +``SNAKEMAKE_SLURM_SQUEUE_WAIT=60`` and Slurm's ``MinJobAge=600`` work well +together and you will see all relevant job statuses. + +If the sidecar is queried for a job ID that it has not seen yet then it will +perform a query to ``sacct`` such that it works well if Snakemake "resume +external job" feature. The ``slurm-submit.py`` script of the Snakemake profile +will register all jobs via POST with this sidecar. +""" + +import http.server +import json +import logging +import os +import subprocess +import sys +import signal +import time +import threading +import uuid + +from CookieCutter import CookieCutter + + +#: Enables debug messages for slurm sidecar. +DEBUG = bool(int(os.environ.get("SNAKEMAKE_SLURM_DEBUG", "0"))) +#: Enables HTTP request logging in sidecar. +LOG_REQUESTS = bool(int(os.environ.get("SNAKEMAKE_SLURM_LOG_REQUESTS", "0"))) +#: Command to call when calling squeue +SQUEUE_CMD = os.environ.get("SNAKEMAKE_SLURM_SQUEUE_CMD", "squeue") +#: Number of seconds to wait between ``squeue`` calls. +SQUEUE_WAIT = int(os.environ.get("SNAKEMAKE_SLURM_SQUEUE_WAIT", "60")) + +logger = logging.getLogger(__name__) +if DEBUG: + logging.basicConfig(level=logging.DEBUG) + logger.setLevel(logging.DEBUG) + + +class PollSqueueThread(threading.Thread): + """Thread that polls ``squeue`` until stopped by ``stop()``""" + + def __init__( + self, + squeue_wait, + squeue_cmd, + squeue_timeout=2, + sleep_time=0.01, + max_tries=3, + *args, + **kwargs + ): + super().__init__(target=self._work, *args, **kwargs) + #: Time to wait between squeue calls. + self.squeue_wait = squeue_wait + #: Command to call squeue with. + self.squeue_cmd = squeue_cmd + #: Whether or not the thread should stop. + self.stopped = threading.Event() + #: Previous call to ``squeue`` + self.prev_call = 0.0 + #: Time to sleep between iterations in seconds. Thread can only be + #: terminated after this interval when waiting. 
+ self.sleep_time = sleep_time + #: Maximal running time to accept for call to ``squeue``. + self.squeue_timeout = squeue_timeout + #: Maximal number of tries if call to ``squeue`` fails. + self.max_tries = max_tries + #: Dict mapping the job id to the job state string. + self.states = {} + #: Make at least one call to squeue, must not fail. + logger.debug("initializing trhead") + self._call_squeue(allow_failure=False) + self.prev_call = time.time() + + def _work(self): + """Execute the thread's action""" + while not self.stopped.is_set(): + now = time.time() + if now - self.prev_call > self.squeue_wait: + self._call_squeue() + self.prev_call = now + time.sleep(self.sleep_time) + + def get_state(self, jobid): + """Return the job state for the given jobid.""" + jobid = str(jobid) + if jobid not in self.states: + self.states[jobid] = self._get_state_sacct(jobid) + return self.states.get(jobid, "__not_seen_yet__") + + def register_job(self, jobid): + """Register job with the given ID.""" + self.states.setdefault(jobid, None) + + def _get_state_sacct(self, jobid): + """Implement retrieving state via sacct for resuming jobs.""" + cluster = CookieCutter.get_cluster_option() + cmd = ["sacct", "-P", "-b", "-j", jobid, "-n"] + if cluster: + cmd.append(cluster) + try_num = 0 + while try_num < self.max_tries: + try_num += 1 + try: + logger.debug("Calling %s (try %d)", cmd, try_num) + output = subprocess.check_output(cmd, timeout=self.squeue_timeout, text=True) + break + except subprocess.TimeoutExpired as e: + logger.debug("Call to %s timed out (try %d of %d)", cmd, try_num, self.max_tries) + except subprocess.CalledProcessError as e: + logger.debug("Call to %s failed (try %d of %d)", cmd, try_num, self.max_tries) + if try_num >= self.max_tries: + raise Exception("Problem with call to %s" % cmd) + else: + parsed = {x.split("|")[0]: x.split("|")[1] for x in output.strip().split("\n")} + logger.debug("Returning state of %s as %s", jobid, parsed[jobid]) + return parsed[jobid] + + def stop(self): + """Flag thread to stop execution""" + logger.debug("stopping thread") + self.stopped.set() + + def _call_squeue(self, allow_failure=True): + """Run the call to ``squeue``""" + cluster = CookieCutter.get_cluster_option() + try_num = 0 + cmd = [SQUEUE_CMD, "--me", "--format=%i,%T", "--state=all"] + if cluster: + cmd.append(cluster) + while try_num < self.max_tries: + try_num += 1 + try: + logger.debug("Calling %s (try %d)", cmd, try_num) + output = subprocess.check_output(cmd, timeout=self.squeue_timeout, text=True) + logger.debug("Output is:\n---\n%s\n---", output) + break + except subprocess.TimeoutExpired as e: + if not allow_failure: + raise + logger.debug("Call to %s timed out (try %d of %d)", cmd, try_num, self.max_tries) + except subprocess.CalledProcessError as e: + if not allow_failure: + raise + logger.debug("Call to %s failed (try %d of %d)", cmd, try_num, self.max_tries) + if try_num >= self.max_tries: + logger.debug("Giving up for this round") + else: + logger.debug("parsing output") + self._parse_output(output) + + def _parse_output(self, output): + """Parse output of ``squeue`` call.""" + header = None + for line in output.splitlines(): + line = line.strip() + arr = line.split(",") + if not header: + if not line.startswith("JOBID"): + continue # skip leader + header = arr + else: + logger.debug("Updating state of %s to %s", arr[0], arr[1]) + self.states[arr[0]] = arr[1] + + +class JobStateHttpHandler(http.server.BaseHTTPRequestHandler): + """HTTP handler class that responds to 
```/job/status/${jobid}/`` GET requests""" + + def do_GET(self): + """Only to ``/job/status/${job_id}/?``""" + logger.debug("--- BEGIN GET") + # Remove trailing slashes from path. + path = self.path + while path.endswith("/"): + path = path[:-1] + # Ensure that /job/status was requested + if not self.path.startswith("/job/status/"): + self.send_response(400) + self.end_headers() + return + # Ensure authentication bearer is correct + auth_required = "Bearer %s" % self.server.http_secret + auth_header = self.headers.get("Authorization") + logger.debug( + "Authorization header is %s, required: %s" % (repr(auth_header), repr(auth_required)) + ) + if auth_header != auth_required: + self.send_response(403) + self.end_headers() + return + # Otherwise, query job ID status + job_id = self.path[len("/job/status/") :] + logger.debug("Querying for job ID %s" % repr(job_id)) + status = self.server.poll_thread.get_state(job_id) + logger.debug("Status: %s" % status) + if not status: + self.send_response(404) + self.end_headers() + else: + self.send_response(200) + self.send_header("Content-type", "application/json") + self.end_headers() + output = json.dumps({"status": status}) + logger.debug("Sending %s" % repr(output)) + self.wfile.write(output.encode("utf-8")) + logger.debug("--- END GET") + + def do_POST(self): + """Handle POSTs (only to ``/job/register/${job_id}/?``)""" + logger.debug("--- BEGIN POST") + # Remove trailing slashes from path. + path = self.path + while path.endswith("/"): + path = path[:-1] + # Ensure that /job/register was requested + if not self.path.startswith("/job/register/"): + self.send_response(400) + self.end_headers() + return + # Ensure authentication bearer is correct + auth_required = "Bearer %s" % self.server.http_secret + auth_header = self.headers.get("Authorization") + logger.debug( + "Authorization header is %s, required: %s", repr(auth_header), repr(auth_required) + ) + # Otherwise, register job ID + job_id = self.path[len("/job/status/") :] + self.server.poll_thread.register_job(job_id) + self.send_response(200) + self.end_headers() + logger.debug("--- END POST") + + def log_request(self, *args, **kwargs): + if LOG_REQUESTS: + super().log_request(*args, **kwargs) + + +class JobStateHttpServer(http.server.HTTPServer): + """The HTTP server class""" + + allow_reuse_address = False + + def __init__(self, poll_thread): + """Initialize thread and print the ``SNAKEMAKE_CLUSTER_SIDECAR_VARS`` to stdout, then flush.""" + super().__init__(("0.0.0.0", 0), JobStateHttpHandler) + #: The ``PollSqueueThread`` with the state dictionary. + self.poll_thread = poll_thread + #: The secret to use. + self.http_secret = str(uuid.uuid4()) + sidecar_vars = { + "server_port": self.server_port, + "server_secret": self.http_secret, + "pid": os.getpid(), + } + logger.debug(json.dumps(sidecar_vars)) + sys.stdout.write(json.dumps(sidecar_vars) + "\n") + sys.stdout.flush() + + def log_message(self, *args, **kwargs): + """Log messages are printed if ``DEBUG`` is ``True``.""" + if DEBUG: + super().log_message(*args, **kwargs) + + +def main(): + # Start thread to poll ``squeue`` in a controlled fashion. + poll_thread = PollSqueueThread(SQUEUE_WAIT, SQUEUE_CMD, name="poll-squeue") + poll_thread.start() + + # Initialize HTTP server that makes available the output of ``squeue --me`` in a + # controlled fashion. 
+ http_server = JobStateHttpServer(poll_thread) + http_thread = threading.Thread(name="http-server", target=http_server.serve_forever) + http_thread.start() + + # Allow for graceful shutdown of poll thread and HTTP server. + def signal_handler(signum, frame): + """Handler for Unix signals. Shuts down http_server and poll_thread.""" + logger.info("Shutting down squeue poll thread and HTTP server...") + # from remote_pdb import set_trace + # set_trace() + poll_thread.stop() + http_server.shutdown() + logger.info("... HTTP server and poll thread shutdown complete.") + for thread in threading.enumerate(): + logger.info("ACTIVE %s", thread.name) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + # Actually run the server. + poll_thread.join() + logger.debug("poll_thread done") + http_thread.join() + logger.debug("http_thread done") + + +if __name__ == "__main__": + sys.exit(int(main() or 0)) diff --git a/lectures/running_snakemake/examples/myprofile/slurm-status.py b/lectures/running_snakemake/examples/myprofile/slurm-status.py new file mode 100755 index 0000000..7cc28d1 --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/slurm-status.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +import json +import os +import re +import requests +import subprocess as sp +import shlex +import sys +import time +import logging +from CookieCutter import CookieCutter + +logger = logging.getLogger(__name__) + +STATUS_ATTEMPTS = 20 +SIDECAR_VARS = os.environ.get("SNAKEMAKE_CLUSTER_SIDECAR_VARS", None) +DEBUG = bool(int(os.environ.get("SNAKEMAKE_SLURM_DEBUG", "0"))) + +if DEBUG: + logging.basicConfig(level=logging.DEBUG) + logger.setLevel(logging.DEBUG) + + +def get_status_direct(jobid): + """Get status directly from sacct/scontrol""" + cluster = CookieCutter.get_cluster_option() + for i in range(STATUS_ATTEMPTS): + try: + sacct_res = sp.check_output(shlex.split(f"sacct {cluster} -P -b -j {jobid} -n")) + res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} + break + except sp.CalledProcessError as e: + logger.error("sacct process error") + logger.error(e) + except IndexError as e: + logger.error(e) + pass + # Try getting job with scontrol instead in case sacct is misconfigured + try: + sctrl_res = sp.check_output(shlex.split(f"scontrol {cluster} -o show job {jobid}")) + m = re.search(r"JobState=(\w+)", sctrl_res.decode()) + res = {jobid: m.group(1)} + break + except sp.CalledProcessError as e: + logger.error("scontrol process error") + logger.error(e) + if i >= STATUS_ATTEMPTS - 1: + print("failed") + exit(0) + else: + time.sleep(1) + + return res[jobid] or "" + + +def get_status_sidecar(jobid): + """Get status from cluster sidecar""" + sidecar_vars = json.loads(SIDECAR_VARS) + url = "http://localhost:%d/job/status/%s" % (sidecar_vars["server_port"], jobid) + headers = {"Authorization": "Bearer %s" % sidecar_vars["server_secret"]} + try: + resp = requests.get(url, headers=headers) + if resp.status_code == 404: + return "" # not found yet + logger.debug("sidecar returned: %s" % resp.json()) + resp.raise_for_status() + return resp.json().get("status") or "" + except requests.exceptions.ConnectionError as e: + logger.warning("slurm-status.py: could not query side car: %s", e) + logger.info("slurm-status.py: falling back to direct query") + return get_status_direct(jobid) + + +jobid = sys.argv[1] + +if SIDECAR_VARS: + logger.debug("slurm-status.py: querying sidecar") + status = get_status_sidecar(jobid) +else: + 
logger.debug("slurm-status.py: direct query") + status = get_status_direct(jobid) + +logger.debug("job status: %s", repr(status)) + +if status == "BOOT_FAIL": + print("failed") +elif status == "OUT_OF_MEMORY": + print("failed") +elif status.startswith("CANCELLED"): + print("failed") +elif status == "COMPLETED": + print("success") +elif status == "DEADLINE": + print("failed") +elif status == "FAILED": + print("failed") +elif status == "NODE_FAIL": + print("failed") +elif status == "PREEMPTED": + print("failed") +elif status == "TIMEOUT": + print("failed") +elif status == "SUSPENDED": + print("running") +else: + print("running") diff --git a/lectures/running_snakemake/examples/myprofile/slurm-submit.py b/lectures/running_snakemake/examples/myprofile/slurm-submit.py new file mode 100755 index 0000000..c5544b4 --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/slurm-submit.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +""" +Snakemake SLURM submit script. +""" +import json +import logging +import os + +import requests +from snakemake.utils import read_job_properties + +import slurm_utils +from CookieCutter import CookieCutter + +logger = logging.getLogger(__name__) + +SIDECAR_VARS = os.environ.get("SNAKEMAKE_CLUSTER_SIDECAR_VARS", None) +DEBUG = bool(int(os.environ.get("SNAKEMAKE_SLURM_DEBUG", "0"))) + +if DEBUG: + logging.basicConfig(level=logging.DEBUG) + logger.setLevel(logging.DEBUG) + + +def register_with_sidecar(jobid): + if SIDECAR_VARS is None: + return + sidecar_vars = json.loads(SIDECAR_VARS) + url = "http://localhost:%d/job/register/%s" % (sidecar_vars["server_port"], jobid) + logger.debug("POST to %s", url) + headers = {"Authorization": "Bearer %s" % sidecar_vars["server_secret"]} + requests.post(url, headers=headers) + + +# cookiecutter arguments +SBATCH_DEFAULTS = CookieCutter.SBATCH_DEFAULTS +CLUSTER = CookieCutter.get_cluster_option() +CLUSTER_CONFIG = CookieCutter.CLUSTER_CONFIG + +RESOURCE_MAPPING = { + "time": ("time", "runtime", "walltime"), + "mem": ("mem", "mem_mb", "ram", "memory"), + "mem-per-cpu": ("mem-per-cpu", "mem_per_cpu", "mem_per_thread"), + "nodes": ("nodes", "nnodes"), + "partition": ("partition", "queue"), +} + +# parse job +jobscript = slurm_utils.parse_jobscript() +job_properties = read_job_properties(jobscript) + +sbatch_options = {} +cluster_config = slurm_utils.load_cluster_config(CLUSTER_CONFIG) + +# 1) sbatch default arguments and cluster +sbatch_options.update(slurm_utils.parse_sbatch_defaults(SBATCH_DEFAULTS)) +sbatch_options.update(slurm_utils.parse_sbatch_defaults(CLUSTER)) + +# 2) cluster_config defaults +sbatch_options.update(cluster_config["__default__"]) + +# 3) Convert resources (no unit conversion!) 
and threads +sbatch_options.update(slurm_utils.convert_job_properties(job_properties, RESOURCE_MAPPING)) + +# 4) cluster_config for particular rule +sbatch_options.update(cluster_config.get(job_properties.get("rule"), {})) + +# 5) cluster_config options +sbatch_options.update(job_properties.get("cluster", {})) + +# convert human-friendly time - leaves slurm format time as is +if "time" in sbatch_options: + duration = str(sbatch_options["time"]) + sbatch_options["time"] = str(slurm_utils.Time(duration)) + +# 6) Format pattern in snakemake style +sbatch_options = slurm_utils.format_values(sbatch_options, job_properties) + +# 7) create output and error filenames and paths +joblog = slurm_utils.JobLog(job_properties) +log = "" +if "output" not in sbatch_options and CookieCutter.get_cluster_logpath(): + outlog = joblog.outlog + log = outlog + sbatch_options["output"] = outlog + +if "error" not in sbatch_options and CookieCutter.get_cluster_logpath(): + errlog = joblog.errlog + log = errlog + sbatch_options["error"] = errlog + +# ensure sbatch output dirs exist +for o in ("output", "error"): + slurm_utils.ensure_dirs_exist(sbatch_options[o]) if o in sbatch_options else None + +# 9) Set slurm job name +if "job-name" not in sbatch_options and "job_name" not in sbatch_options: + sbatch_options["job-name"] = joblog.jobname + +# submit job and echo id back to Snakemake (must be the only stdout) +jobid = slurm_utils.submit_job(jobscript, **sbatch_options) +logger.debug("Registering %s with sidecar...", jobid) +register_with_sidecar(jobid) +logger.debug("... done registering with sidecar") +print(jobid) diff --git a/lectures/running_snakemake/examples/myprofile/slurm_utils.py b/lectures/running_snakemake/examples/myprofile/slurm_utils.py new file mode 100644 index 0000000..c420154 --- /dev/null +++ b/lectures/running_snakemake/examples/myprofile/slurm_utils.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +import argparse +import math +import os +import re +import subprocess as sp +import sys +from datetime import timedelta +from os.path import dirname +from time import time as unix_time +from typing import Union +from uuid import uuid4 +import shlex +from io import StringIO + +from CookieCutter import CookieCutter +from snakemake import io +from snakemake.exceptions import WorkflowError +from snakemake.io import Wildcards +from snakemake.logging import logger +from snakemake.utils import AlwaysQuotedFormatter +from snakemake.utils import QuotedFormatter +from snakemake.utils import SequenceFormatter + + +def _convert_units_to_mb(memory): + """If memory is specified with SI unit, convert to MB""" + if isinstance(memory, int) or isinstance(memory, float): + return int(memory) + siunits = {"K": 1e-3, "M": 1, "G": 1e3, "T": 1e6} + regex = re.compile(r"(\d+)({})$".format("|".join(siunits.keys()))) + m = regex.match(memory) + if m is None: + logger.error( + (f"unsupported memory specification '{memory}';" " allowed suffixes: [K|M|G|T]") + ) + sys.exit(1) + factor = siunits[m.group(2)] + return int(int(m.group(1)) * factor) + + +def parse_jobscript(): + """Minimal CLI to require/only accept single positional argument.""" + p = argparse.ArgumentParser(description="SLURM snakemake submit script") + p.add_argument("jobscript", help="Snakemake jobscript with job properties.") + return p.parse_args().jobscript + + +def parse_sbatch_defaults(parsed): + """Unpack SBATCH_DEFAULTS.""" + d = shlex.split(parsed) if type(parsed) == str else parsed + args = {} + for keyval in [a.split("=") for a in d]: + k = 
keyval[0].strip().strip("-") + v = keyval[1].strip() if len(keyval) == 2 else None + args[k] = v + return args + + +def load_cluster_config(path): + """Load config to dict + + Load configuration to dict either from absolute path or relative + to profile dir. + """ + if path: + path = os.path.join(dirname(__file__), os.path.expandvars(path)) + dcc = io.load_configfile(path) + else: + dcc = {} + if "__default__" not in dcc: + dcc["__default__"] = {} + return dcc + + +# adapted from format function in snakemake.utils +def format(_pattern, _quote_all=False, **kwargs): # noqa: A001 + """Format a pattern in Snakemake style. + This means that keywords embedded in braces are replaced by any variable + values that are available in the current namespace. + """ + fmt = SequenceFormatter(separator=" ") + if _quote_all: + fmt.element_formatter = AlwaysQuotedFormatter() + else: + fmt.element_formatter = QuotedFormatter() + try: + return fmt.format(_pattern, **kwargs) + except KeyError as ex: + raise NameError( + f"The name {ex} is unknown in this context. Please " + "make sure that you defined that variable. " + "Also note that braces not used for variable access " + "have to be escaped by repeating them " + ) + + +# adapted from Job.format_wildcards in snakemake.jobs +def format_wildcards(string, job_properties): + """Format a string with variables from the job.""" + + class Job(object): + def __init__(self, job_properties): + for key in job_properties: + setattr(self, key, job_properties[key]) + + job = Job(job_properties) + if "params" in job_properties: + job._format_params = Wildcards(fromdict=job_properties["params"]) + else: + job._format_params = None + if "wildcards" in job_properties: + job._format_wildcards = Wildcards(fromdict=job_properties["wildcards"]) + else: + job._format_wildcards = None + _variables = dict() + _variables.update(dict(params=job._format_params, wildcards=job._format_wildcards)) + if hasattr(job, "rule"): + _variables.update(dict(rule=job.rule)) + try: + return format(string, **_variables) + except NameError as ex: + raise WorkflowError("NameError with group job {}: {}".format(job.jobid, str(ex))) + except IndexError as ex: + raise WorkflowError("IndexError with group job {}: {}".format(job.jobid, str(ex))) + + +# adapted from ClusterExecutor.cluster_params function in snakemake.executor +def format_values(dictionary, job_properties): + formatted = dictionary.copy() + for key, value in list(formatted.items()): + if key == "mem": + value = str(_convert_units_to_mb(value)) + if isinstance(value, str): + try: + formatted[key] = format_wildcards(value, job_properties) + except NameError as e: + msg = "Failed to format cluster config " "entry for job {}.".format( + job_properties["rule"] + ) + raise WorkflowError(msg, e) + return formatted + + +def convert_job_properties(job_properties, resource_mapping=None): + options = {} + if resource_mapping is None: + resource_mapping = {} + resources = job_properties.get("resources", {}) + for k, v in resource_mapping.items(): + options.update({k: resources[i] for i in v if i in resources}) + + if "threads" in job_properties: + options["cpus-per-task"] = job_properties["threads"] + + slurm_opts = resources.get("slurm", "") + if not isinstance(slurm_opts, str): + raise ValueError( + "The `slurm` argument to resources must be a space-separated string" + ) + + for opt in slurm_opts.split(): + kv = opt.split("=", maxsplit=1) + k = kv[0] + v = None if len(kv) == 1 else kv[1] + options[k.lstrip("-").replace("_", "-")] = v + + return options 
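+
+# Example: a rule declaring `resources: slurm="--qos=short --constraint=fat"` is
+# converted by convert_job_properties() above into {"qos": "short", "constraint": "fat"},
+# which format_sbatch_options() further below renders back into the sbatch
+# flags "--qos=short --constraint=fat".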
+ + +def ensure_dirs_exist(path): + """Ensure output folder for Slurm log files exist.""" + di = dirname(path) + if di == "": + return + if not os.path.exists(di): + os.makedirs(di, exist_ok=True) + return + + +def format_sbatch_options(**sbatch_options): + """Format sbatch options""" + options = [] + for k, v in sbatch_options.items(): + val = "" + if v is not None: + val = f"={v}" + options.append(f"--{k}{val}") + return options + + +def submit_job(jobscript, **sbatch_options): + """Submit jobscript and return jobid.""" + options = format_sbatch_options(**sbatch_options) + try: + cmd = ["sbatch"] + ["--parsable"] + options + [jobscript] + res = sp.check_output(cmd) + except sp.CalledProcessError as e: + raise e + # Get jobid + res = res.decode() + try: + jobid = re.search(r"(\d+)", res).group(1) + except Exception as e: + raise e + return jobid + + +timeformats = [ + re.compile(r"^(?P\d+)-(?P\d+):(?P\d+):(?P\d+)$"), + re.compile(r"^(?P\d+)-(?P\d+):(?P\d+)$"), + re.compile(r"^(?P\d+)-(?P\d+)$"), + re.compile(r"^(?P\d+):(?P\d+):(?P\d+)$"), + re.compile(r"^(?P\d+):(?P\d+)$"), + re.compile(r"^(?P\d+)$"), +] + + +def time_to_minutes(time): + """Convert time string to minutes. + + According to slurm: + + Acceptable time formats include "minutes", "minutes:seconds", + "hours:minutes:seconds", "days-hours", "days-hours:minutes" + and "days-hours:minutes:seconds". + + """ + if not isinstance(time, str): + time = str(time) + d = {"days": 0, "hours": 0, "minutes": 0, "seconds": 0} + regex = list(filter(lambda regex: regex.match(time) is not None, timeformats)) + if len(regex) == 0: + return + assert len(regex) == 1, "multiple time formats match" + m = regex[0].match(time) + d.update(m.groupdict()) + minutes = ( + int(d["days"]) * 24 * 60 + + int(d["hours"]) * 60 + + int(d["minutes"]) + + math.ceil(int(d["seconds"]) / 60) + ) + assert minutes > 0, "minutes has to be greater than 0" + return minutes + + +class InvalidTimeUnitError(Exception): + pass + + +class Time: + _nanosecond_size = 1 + _microsecond_size = 1000 * _nanosecond_size + _millisecond_size = 1000 * _microsecond_size + _second_size = 1000 * _millisecond_size + _minute_size = 60 * _second_size + _hour_size = 60 * _minute_size + _day_size = 24 * _hour_size + _week_size = 7 * _day_size + units = { + "s": _second_size, + "m": _minute_size, + "h": _hour_size, + "d": _day_size, + "w": _week_size, + } + pattern = re.compile(rf"(?P\d+(\.\d*)?|\.\d+)(?P[a-zA-Z])") + + def __init__(self, duration: str): + self.duration = Time._from_str(duration) + + def __str__(self) -> str: + return Time._timedelta_to_slurm(self.duration) + + def __repr__(self): + return str(self) + + @staticmethod + def _timedelta_to_slurm(delta: Union[timedelta, str]) -> str: + if isinstance(delta, timedelta): + d = dict() + d["hours"], rem = divmod(delta.seconds, 3600) + d["minutes"], d["seconds"] = divmod(rem, 60) + d["hours"] += delta.days * 24 + return "{hours}:{minutes:02d}:{seconds:02d}".format(**d) + elif isinstance(delta, str): + return delta + else: + raise ValueError("Time is in an unknown format '{}'".format(delta)) + + @staticmethod + def _from_str(duration: str) -> Union[timedelta, str]: + """Parse a duration string to a datetime.timedelta""" + + matches = Time.pattern.finditer(duration) + + total = 0 + n_matches = 0 + for m in matches: + n_matches += 1 + value = m.group("val") + unit = m.group("unit").lower() + if unit not in Time.units: + raise InvalidTimeUnitError( + "Unknown unit '{}' in time {}".format(unit, duration) + ) + + total += float(value) * 
Time.units[unit] + + if n_matches == 0: + return duration + + microseconds = total / Time._microsecond_size + return timedelta(microseconds=microseconds) + + +class JobLog: + def __init__(self, job_props: dict): + self.job_properties = job_props + self.uid = str(uuid4()) + + @property + def wildcards(self) -> dict: + return self.job_properties.get("wildcards", dict()) + + @property + def wildcards_str(self) -> str: + return ( + ".".join("{}={}".format(k, v) for k, v in self.wildcards.items()) + or "unique" + ) + + @property + def rule_name(self) -> str: + if not self.is_group_jobtype: + return self.job_properties.get("rule", "nameless_rule") + return self.groupid + + @property + def groupid(self) -> str: + return self.job_properties.get("groupid", "group") + + @property + def is_group_jobtype(self) -> bool: + return self.job_properties.get("type", "") == "group" + + @property + def short_uid(self) -> str: + return self.uid.split("-")[0] + + def pattern_replace(self, s: str) -> str: + """ + %r - rule name. If group job, will use the group ID instead + %i - snakemake job ID + %w - wildcards. e.g., wildcards A and B will be concatenated as 'A=.B=' + %U - a random universally unique identifier + %S - shortened version od %U + %T - Unix time, aka seconds since epoch (rounded to an integer) + """ + replacement = { + "%r": self.rule_name, + "%i": self.jobid, + "%w": self.wildcards_str, + "%U": self.uid, + "%T": str(int(unix_time())), + "%S": self.short_uid, + } + for old, new in replacement.items(): + s = s.replace(old, new) + + return s + + @property + def jobname(self) -> str: + jobname_pattern = CookieCutter.get_cluster_jobname() + if not jobname_pattern: + return "" + + return self.pattern_replace(jobname_pattern) + + @property + def jobid(self) -> str: + """The snakemake jobid""" + if self.is_group_jobtype: + return self.job_properties.get("jobid", "").split("-")[0] + return str(self.job_properties.get("jobid")) + + @property + def logpath(self) -> str: + logpath_pattern = CookieCutter.get_cluster_logpath() + if not logpath_pattern: + return "" + + return self.pattern_replace(logpath_pattern) + + @property + def outlog(self) -> str: + return self.logpath + ".out" + + @property + def errlog(self) -> str: + return self.logpath + ".err" diff --git a/lectures/running_snakemake/examples/resources/ref.fa.fai b/lectures/running_snakemake/examples/resources/ref.fa.fai new file mode 100644 index 0000000..9e6f6f8 --- /dev/null +++ b/lectures/running_snakemake/examples/resources/ref.fa.fai @@ -0,0 +1,14 @@ +scaffold1 1050000 11 60 61 +scaffold2 340000 1067522 340000 340001 +scaffold3 230000 1407534 230000 230001 +scaffold4 130000 1637546 130000 130001 +scaffold5 20000 1767558 20000 20001 +scaffold6 100000 1787570 100000 100001 +scaffold7 40000 1887582 40000 40001 +scaffold8 30000 1927594 30000 30001 +scaffold9 20000 1957606 20000 20001 +scaffold10 10000 1977619 10000 10001 +scaffold11 10000 1987632 10000 10001 +scaffold12 10000 1997645 10000 10001 +scaffold13 10000 2007658 10000 10001 +scaffold14 30001 2017671 60 61 diff --git a/lectures/running_snakemake/running.Rmd b/lectures/running_snakemake/running.Rmd deleted file mode 100644 index 2de5c8a..0000000 --- a/lectures/running_snakemake/running.Rmd +++ /dev/null @@ -1,437 +0,0 @@ ---- -title: -author: Per Unneberg -date: "`r format(Sys.time(), '%d %B, %Y')`" -output: - revealjs::revealjs_presentation: - css: ../revealjs.css - includes: - in_header: ../footer.html - self_contained: true - highlight: tango - fig_width: 10 - fig_height: 8 - fig_caption: 
false - toc: true - toc_depth: 2 - slide_level: 2 - reveal_options: - slideNumber: true - previewLinks: true - minScale: 1 - maxScale: 1 - height: 1400 - width: 1200 ---- - - -```{r knitr, echo=FALSE, eval=TRUE, include=TRUE } -library(knitr) -knitr::opts_chunk$set(warning = FALSE, message = FALSE, - fig.width=12, fig.height=10, autodep=TRUE, echo=TRUE, - cache=FALSE, include=TRUE, eval=TRUE, tidy=FALSE, error=TRUE, - class.source = "numberLines", comment="", - class.output = c("numberLines chunkout")) -knitr::knit_hooks$set(inline = function(x) { - prettyNum(x, big.mark=" ") - }) -knitr::opts_knit$set(root.dir="snakemake_best_practice/.test") -``` - -```{r libs, echo=FALSE, eval=TRUE, cache=FALSE } -library(ggplot2) -library(viridis) -bw <- theme_bw(base_size=24) %+replace% theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) -theme_set(bw) -``` - -# - -
-
-

Snakemake BYOC 2021

-

Running snakemake, finetuning performance and setting resource usage

-

Per Unneberg

-

2021-10-01

-
-
- - - -# Contents - -
-

Basic execution

-

Resources

-

Profiles

-

Cluster execution

-
- -
-

Setup

- -- Very simple examples with snakefiles and code to run -- All snakefiles and code is available in code repository - https://github.com/NBISweden/snakemake_best_practice/ -- code has been run with Snakemake version 6.8.1 -- all commands have been executed in the snakemake_best_practice .test - directory - -
- -# Basic execution - -## Running locally - some commonly used options - -We first perform a dry run (option `--dry-run`, short option `-n`) to -print (`--printshellcmds (-p)`) the commands that would happen if we -were to force re-execution of all (`--forceall (-F)`) rules: - -```{bash snakemake-local, class.output="scroll-400" } -snakemake -s ../workflow/Snakefile --forceall --dry-run --printshellcmds -``` - -This is equivalent to - -```{bash snakemake-local-short, eval=FALSE } -snakemake -s ../workflow/Snakefile -F -n -p -``` - -To actually rerun the workflow, we simple drop the `-p` and `-n` flags -and add the number of cores (`--cores (-j)`) we want to utilize: - -```{bash snakemake-rerun-workflow, eval=FALSE } -snakemake -s ../workflow/Snakefile -F -j 2 -``` - -## More commonly used options - -One can also force regeneration of a single target with the `--force -(-f)` option: - -```{bash snakemake-rerun-target } -snakemake -s ../workflow/Snakefile -f -j 1 -n -p results/qc/fastqc/data/raw/CHS.HG00512_010101_AAABBB11XX_1.fastq.gz.html -``` - - -When resuming a workflow it can be handy to add the -`--rerun-incomplete (--ri)` and `--keep-going (-k)` options: - -```{bash snakemake-rerun-incomplete-workflow, eval=FALSE } -snakemake -s ../workflow/Snakefile -j 2 --ri -k -``` - - - -## Distribution and reproducibility - -

Isolated software environments

- -Use the `conda` directive to define an isolated software environment -and use option `--use-conda` to automate generation of the rule's -software environment: - -```{bash conda, eval=FALSE } -snakemake -s ../workflow/Snakefile -j 2 --use-conda -``` - -

Running in containers

- -Use the `container` directive to define a docker container to use, -which is activated with the `--use-singularity` option: - - -```{bash singularity, eval=FALSE } -snakemake -s ../workflow/Snakefile -j 2 --use-singularity -``` - -

Environment modules

- -Use the `envmodules` directive to define environment modules available -at your local HPC (uppmax) and activate with `--use-envmodules`: - -```{bash envmodules, eval=FALSE } -snakemake -s ../workflow/Snakefile -j 2 --use-envmodules -``` - -More on this later as we discuss uppmax and SLURM. - -NB: this is most likely **not** portable to other systems. - -## snakemake has many more options... - - -
-```{bash snakemake-options, class.output="scroll-1000" } -snakemake -h -``` - -
- -# Resources - -Resources are arbitrarily named keywords that can be defined with the -`resources` directive. The snakemake job scheduler can make use of the -resources information (e.g. disk usage or memory requirements) to make -decisions on what jobs to run at what point and so on. - -
- -```{python snakemake-resources-example, eval=FALSE, code=readLines("snakemake_best_practice/workflow/rules/mapping.smk")[34:73] } - -``` -
- -## Default resources - -Snakemake can define `default resources` that can be explicitely -listed (and set with arguments) with the `--default-resources` option: - -```{bash snakemake-default-resources } -snakemake -f -n -p -s ../workflow/Snakefile resources/scaffolds.fa.amb --default-resources -``` - -## Default resources - setting - -```{bash snakemake-set-default-resources } -snakemake -f -n -p -s ../workflow/Snakefile resources/scaffolds.fa.amb --default-resources runtime=200 mem_mb=2000 -``` - -NB: `bwa_index` runtime resource not modified because it has been -defined in the rule: - -```{python snakemake-bwa-index-runtime, code=readLines("snakemake_best_practice/workflow/rules/mapping.smk")[20:21], eval=FALSE, attr.source='startFrom="20"'} - -``` - - - -## Customizing resources and threads - -Default resources are one-size-fits-all settings that would apply to -all rules. However, in many workflows, there are certain rules that -require more specific resource tuning. - -Resource tuning can be achieved with the `--set-resources` option. -Similarly `--set-threads` allows setting rule-specific thread values: - -```{bash snakemake-set-resources } -snakemake -f -n -p -s ../workflow/Snakefile resources/scaffolds.fa.amb --default-resources --set-resources map_bwa_index:runtime=1000 map_bwa_index:mem_mb=6000 --set-threads map_bwa_index=4 -j 8 -``` - - -## Putting it all together - on the limits of the command line - -Putting everything together, we could now have a command line that -looks something like this: - -```{bash snakemake-long-command-line, eval=FALSE } -snakemake -s ../workflow/Snakefile -F --ri -k \ - --use-conda --use-singularity --use-envmodules \ - --default-resources \ - --set-resources map_bwa_index:runtime=1000 map_bwa_index:mem_mb=6000 \ - --set-threads map_bwa_index=4 -j 8 -``` - -This is getting illegible and it is tedious to write. What to do? - -
- -Profiles to the rescue! - -
- -# Profiles - -Profiles are configuration files that apply to specific compute -environments and analyses. They allow setting default options. - - -
- -At its simplest, a profile is simply a directory with a `config.yaml` -file that sets program options. Let's put our previous example in a -directory called `~/.config/snakemake/myprofile`. The minimum content of -that directory is then a file `config.yaml` with (in this case) the -following contents: - -```{r snakemake-local-profile, eval=FALSE, code=readLines("~/.config/snakemake/myprofile/config.yaml") } - -``` - -
- -## Running the profile - -Run with `--profile` (NB: profile argument can also be absolute or -relative path): - -
- -```{bash snakemake-local-profile-run, class.output="sourceCode scroll-1000" } -snakemake --profile myprofile -n -p -F -``` - -
- -# Cluster execution - -Sofar we have looked at local jobs. What if we want to submit jobs at -a HPC? Here we focus on SLURM. - -The following commands will be executed in the -snakemake_best_practices .test folder at uppmax. - -
- -

sbatch solution

- -Wrap workflow in sbatch script (e.g. workflow.sh): - -```{bash sbatch-script, eval=FALSE} -#!/bin/bash -l -#SBATCH -A account -#SBATCH -p core -#SBATCH -c 20 -#... other SBATCH arguments ... - -module load snakemake -snakemake -j 20 --use-conda --use-envmodules all -``` - -and submit with -```{bash sbatch-submit, eval=FALSE} -sbatch workflow.sh -``` - -
- -
-Downside: can only make use of one node at a time. -
- -## The snakemake job scheduler - -When running jobs locally using limited number of threads, snakemake -needs to decide what job to run when. These decisions are made by an -internal *job scheduler*. As we will see, the internal scheduler still -has this role when submitting jobs to a cluster scheduler. - -
- -

Background sessions

- -A workflow can take a long time to run. A workflow submitted in a -login shell will terminate once we logout. Therefore, it is advised to -submit a workflow in a *background session*, either -[screen](https://www.gnu.org/software/screen/manual/screen.html) or -[tmux](https://tmuxguide.readthedocs.io/en/latest/tmux/tmux.html). - -A named `tmux` session can be initiated as - -```{bash tmux, eval=FALSE } -tmux -s mysession -``` - -Inside the session, use a prefix (default `Ctrl-b`; many change to -`Ctrl-a` which is default in `screen`) with key to launch command. For -instance, `Ctrl-b d` will detach (exit) from the session. See the docs -for further info. - -
- -## Generic execution - -The `--cluster` option can be used to submit jobs to the cluster -scheduler: -```{bash snakemake-cluster-generic, eval=FALSE } -snakemake --cluster "sbatch -A account -p core -n 20 -t {resources.runtime}" \ - --use-conda --use-envmodules --ri -k --default-resources runtime=100 -j 100 -``` - -Note the use of the format string "{resources.runtime}" to set running -times individually. - -One drawback with this approach is that failed jobs or timeouts go -undetected, which means you have to monitor the outputs regularly. You -don't want to do that. - -
- -

Custom cluster commands

- -The argument to `--cluster` is a command (sbatch in example above), so -could be any wrapper script that submits jobs to a cluster scheduler. - -Furthermore, option `--cluster-status` takes as argument a command -(i.e. custom script) that checks jobs for their status. - -Also, option `--jobscript` takes as argument a script that submits -jobs to the cluster. - -We could write custom scripts for each of these options to fine-tune -job submission. If only there were such scripts already available! - -
- - -## snakemake-profiles - -[Snakemake Profiles](https://github.com/Snakemake-Profiles/doc) are -collections of reusable configuration profiles for various computing -environments. The [slurm snakemake -profile](https://github.com/Snakemake-Profiles/slurm) provides the -scripts we requested on the previous slide. - -
- -

Installation

- -The profiles are [cookiecutter -templates](https://cookiecutter.readthedocs.io/en/1.7.2/) and can be -installed as follows: - -```{bash cookicutter-profile-install, eval=FALSE} -$ cookiecutter https://github.com/Snakemake-Profiles/slurm.git -profile_name [slurm]: .myprofile -sbatch_defaults []: --account=account --error=logs/slurm/%x-%j.err --output=logs/slurm/%x-%j.out" -cluster_config []: -Select advanced_argument_conversion: -1 - no -2 - yes -Choose from 1, 2 [1]: 2 -cluster_name []: rackham - -``` - -
- -## snakemake slurm profile - -
-```{bash slurm-profile-tree, echo=FALSE} -tree .myprofile -``` - - -`.myprofile/settings.json`: -```{bash slurm-settings, eval=FALSE, code=readLines("snakemake_best_practice/.test/.myprofile/settings.json") } - -``` - -`.myprofile/config.yaml`: -```{bash slurm-config, eval=FALSE, code=readLines("snakemake_best_practice/.test/.myprofile/config.yaml") } - -``` -
- -

Job submission

- -Submit jobs with -```{bash slurm-profile-submit, eval=FALSE } -snakemake --profile .myprofile -j 10 --ri -k -F -``` - - - -# Questions? diff --git a/lectures/running_snakemake/running.qmd b/lectures/running_snakemake/running.qmd new file mode 100644 index 0000000..221bccd --- /dev/null +++ b/lectures/running_snakemake/running.qmd @@ -0,0 +1,1067 @@ +--- +title: Running snakemake +subtitle: Running snakemake locally and on the cluster, finetuning performance and setting resource usage +author: Per Unneberg +date: "1 September, 2022" +institute: NBIS +from: markdown+emoji +format: + revealjs: + theme: + - white + - ../custom.scss +# css: ../revealjs.css + self-contained: false + toc: true + toc-depth: 1 + slide-level: 2 + slide-number: true + preview-links: true + chalkboard: true + # Multiple logos not possible; would need to make custom logo combining both logos + footer: Snakemake BYOC 2022 - Running snakemake + logo: https://nbis.se/assets/img/logos/nbislogo-green.svg + smaller: true + highlight-style: gruvbox + fig-height: 3 + fig-width: 3 +execute: + echo: true + warning: false + cache: false + include: true + autodep: true + eval: true + error: true +knitr: + opts_chunk: + code-fold: false + tidy: false + fig-format: svg +--- + + +## Setup {.unnumbered .unlisted} + +```{r libs } +#| echo: false +#| eval: true +#| cache: false +# For some reason this is not applied to print statements +## knitr::knit_hooks$set(inline = function(x) { +## prettyNum(x, big.mark=",") +## }) +library(ggplot2) +library(viridis) +bw <- theme_bw(base_size=24) %+replace% theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +theme_set(bw) +## knitr::knit_engines$set(snakemake = function(options) { +## reticulate::eng_python(options) +## }) +snakemake_version <- system("snakemake --version", intern=TRUE) +``` + +- Examples with snakefiles and code to run in `examples` subdirectory +- Snakefiles are named `ex#.smk` +- Code has been run with Snakemake version `r snakemake_version` +- Rules run `bwa mem` to map two samples to a reference + +:::: {.columns} + +::: {.column width="50%"} + +::: {.fragment} + +Input data: + +```{bash } +#| echo: false +#| label: list-data-0 +tree -L 3 data resources +``` +::: + +::: + +::: {.column width="50%"} + +::: {.fragment} + +Snakefiles: + +```{bash } +#| label: list-snakefiles +#| echo: false +ls -1 *.smk +``` + +::: + +::: + +:::: + + +# Basic execution + +## Example 1 - ex1.smk + +Let's start by writing a snakefile that runs an alignment with `bwa`. 
The command we want to run is + +```{bash } +#| label: bwa-command +#| echo: true +#| eval: false +bwa mem -t 1 resources/ref.fa data/CHS.HG00512_1.fastq.gz data/CHS.HG00512_2.fastq.gz | samtools view -b -o bam/CHS.HG00512.bam +``` + +where we have + +::: {.incremental} + +output +: `bam/CHS.HG00512.bam` + +inputs +: `resources/ref.fa`, `data/CHS.HG00512_1.fastq.gz`, and `data/CHS.HG00512_2.fastq.gz` + +command +: `bwa mem -t 1 {inputs} | samtools view -b -o {output}` + +::: + +::: {.fragment} + +Putting these in a snakefile yields: + +``` {python code=readLines("ex1.smk") } +#| eval: false +#| label: bwa_mem_CHS_HG00512-1 +``` + +::: + + + +## Example 1 - ex1.smk + + +We first perform a dry run (option `--dry-run`, short option `-n`) to +print (`--printshellcmds/-p`) the default rule in the snakefile +`ex1.smk` which we point to using option `--snakefile/-s`: + + +::: {.fragment} + + +```{bash } +#| label: snakemake-bwa_mem_CHS_HG00512_f1-1 +#| eval: false +snakemake --snakefile ex1.smk --dry-run --printshellcmds +``` +::: + + +## Example 1 - ex1.smk + + +We first perform a dry run (option `--dry-run`, short option `-n`) to +print (`--printshellcmds/-p`) the default rule in the snakefile +`ex1.smk` which we point to using option `--snakefile/-s`: + +```{bash } +#| label: snakemake-bwa_mem_CHS_HG00512_f1-2 +#| eval: false +snakemake -s ex1.smk -n -p +``` + +::: {.fragment} + + +```{bash } +#| label: snakemake-bwa_mem_CHS_HG00512_f2 +#| echo: false +rm -f bam/CHS.HG00512.bam +snakemake -s ex1.smk -n -p +``` +::: + + + +## Example 1 - ex1.smk + + +We first perform a dry run (option `--dry-run`, short option `-n`) to +print (`--printshellcmds/-p`) the default rule in the snakefile +`ex1.smk` which we point to using option `--snakefile/-s`: + +```{bash } +#| label: snakemake-bwa_mem_CHS_HG00512_f1-3 +#| eval: false +snakemake -s ex1.smk -n -p +``` + +```{bash } +#| label: snakemake-bwa_mem_CHS_HG00512_f3 +#| echo: false +rm -f bam/CHS.HG00512.bam +snakemake -s ex1.smk -n -p +``` + +Note the reason the rule was run[^reason] + +[^reason]: for snakemake <7.12 use the `--reason/-r` option + +::: {.notes} + +Mention the reason the rule was rerun + +::: + +## Example 1 - ex1.smk + +To actually run the workflow, we simply drop the `-p` and `-n` flags +and add the number of cores (`--cores/-c`)[^cores] we want to utilize: + +```{bash } +#| label: snakemake-rerun-workflow +#| cache: true +#| eval: false +snakemake -s ex1.smk -c 1 +``` +```{bash } +#| label: snakemake-rerun-workflow-rm +#| cache: true +#| echo: false +rm -f bam/*.bam +snakemake -s ex1.smk -c 1 +``` + +[^cores]: Required for snakemake >= 5.11.0 + +## Example 2 - ex2.smk + +The current snakefile consists only of one rule that also is specific +to the `CHS.HG00512` sample. First, let's generalize the bwa rule +using wildcards: + +``` {python code=readLines("ex2.smk") } +#| label: bwa_mem_wildcard +#| eval: false +``` + +::: {.fragment} + +Now, running snakemake as before results in an error: + +::: + +::: {.fragment} + +``` {bash } +#| eval: true +#| echo: true +snakemake -s ex2.smk -n -p +``` +::: + +::: {.fragment} + +As the error indicates, we could specify a target (e.g. 
+`bam/PUR.HG00731.bam`; note the use of the `--quiet/-q` option to +suppress information): + +``` {bash } +snakemake -s ex2.smk -q -n bam/PUR.HG00731.bam +``` + +::: + +::: {.fragment} + +Alternatively - and better - add a *pseudo-rule* (typically called +`all`) at the *top* of the snakefile, since if no target is provided +at the command line, snakemake will use the first rule it encounters. + +::: + +## Example 3 - ex3.smk + +With the previous in mind, the new snakefile becomes + +``` {python code=readLines("ex3.smk") } +#| label: bwa_mem_wildcard_all +#| eval: false +#| code-line-numbers: "|1-2" +``` + +::: {.fragment} + +Running the default target (implicitly `all`) gives: + +``` {bash } +#| class-output: "scroll-300" +snakemake -s ex3.smk -n -p +``` + +::: + +::: {.fragment} + +Note that since sample `CHS.HG00512` had been processed previously, +only one job is run. + +::: + +## Example 3 - ex3.smk: forcing reruns + +::: {.fragment} + +One can force regeneration of a single target with the `--force +(-f)` option: + +```{bash } +#| label: snakemake-force-rerun-target +#| output-location: fragment +#| class-output: "scroll-200" +snakemake -s ex3.smk -f -c 1 -p -n bam/CHS.HG00512.bam +``` + +::: + +::: {.fragment} + +To rerun the entire workflow use `--forceall/-F`: + +``` {bash } +#| label: snakemake-force-rerun-all +#| output-location: fragment +#| class-output: "scroll-200" +snakemake -s ex3.smk -F -c 1 -q -n +``` + +::: + + +::: {.fragment} +::: {.callout-tip} + +Always first use the `--dry-run` together with `--forceall` so as not +to inadvertently rerun everything from scratch. + +::: +::: + +## Example 3 - ex3.smk: other handy options + +##### Rerun-incomplete and keep-going + +When resuming a workflow it can be handy to add the +`--rerun-incomplete (--ri)` and `--keep-going (-k)` options: + +```{bash } +#| label: snakemake-rerun-incomplete-workflow +#| eval: false +snakemake -s ex3.smk -c 2 --ri -k +``` + +`--rerun-incomplete` takes care of unfinished jobs, e.g. slurm +timeouts. If a job fails, `--keep-going` will try to finish as many +jobs as possible before terminating the workflow. + +::: {.fragment} + +##### Printing the workflow dag and rulegraph + +::: + +:::: {.columns} + +::: {.column width="30%"} + +::: {.fragment} + +`--rulegraph` is a convenient way of getting an overview of the workflow + +``` {bash } +#| eval: false +snakemake -s ex3.smk --rulegraph | dot | display +``` + +``` {bash } +#| fig-format: svg +#| output: asis +#| echo: false + snakemake -s ex3.smk --rulegraph | dot -T svg | grep -v " + + + + + + + +::: {.notes} + +Point out that this can cause jobs to fail if they exceed the specified resources + +::: + +## Adding threads and resources to workflow - ex7.smk ## + +Now that we will be fine-tuning resources and threads per rule we add +another rule `samtools_merge_bam` that will merge our bam files, the +keyword `threads` and set resources for one of the rules: + +``` {python code=readLines("ex7.smk") } +#| label: workflow-adding-threads +#| echo: true +#| eval: false +#| code-line-numbers: "|12,24|13,25|1,2|21-23|26,27" +``` +::: {.fragment} + +```{bash } +#| label: ex7-run +#| echo: true +#| eval: true +snakemake -s ex7.smk -n -q -F -c 10 +``` +::: + +::: {.fragment} + +Note that we changed the final pseudo-target name since the final +workflow output now is a merged bam file! 
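+
+As a side note, a custom resource value only constrains the scheduler if a
+corresponding global limit is given on the command line with `--resources`;
+a sketch (the limit of 4000 is an arbitrary example):
+
+```{bash }
+#| label: snakemake-resources-limit
+#| eval: false
+snakemake -s ex7.smk -c 10 --resources mem_mb=4000
+```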
+
+::: {.fragment}
+
+Running the default target (implicitly `all`) gives:
+
+``` {bash }
+#| class-output: "scroll-300"
+snakemake -s ex3.smk -n -p
+```
+
+:::
+
+::: {.fragment}
+
+Note that since sample `CHS.HG00512` had been processed previously,
+only one job is run.
+
+:::
+
+## Example 3 - ex3.smk: forcing reruns
+
+::: {.fragment}
+
+One can force regeneration of a single target with the `--force
+(-f)` option:
+
+```{bash }
+#| label: snakemake-force-rerun-target
+#| output-location: fragment
+#| class-output: "scroll-200"
+snakemake -s ex3.smk -f -c 1 -p -n bam/CHS.HG00512.bam
+```
+
+:::
+
+::: {.fragment}
+
+To rerun the entire workflow, use `--forceall/-F`:
+
+``` {bash }
+#| label: snakemake-force-rerun-all
+#| output-location: fragment
+#| class-output: "scroll-200"
+snakemake -s ex3.smk -F -c 1 -q -n
+```
+
+:::
+
+::: {.fragment}
+::: {.callout-tip}
+
+Always combine `--forceall` with `--dry-run` first so as not to
+inadvertently rerun everything from scratch.
+
+:::
+:::
+
+## Example 3 - ex3.smk: other handy options
+
+##### Rerun-incomplete and keep-going
+
+When resuming a workflow, it can be handy to add the
+`--rerun-incomplete (--ri)` and `--keep-going (-k)` options:
+
+```{bash }
+#| label: snakemake-rerun-incomplete-workflow
+#| eval: false
+snakemake -s ex3.smk -c 2 --ri -k
+```
+
+`--rerun-incomplete` takes care of unfinished jobs, e.g. after slurm
+timeouts. If a job fails, `--keep-going` will try to finish as many
+independent jobs as possible before terminating the workflow.
+
+::: {.fragment}
+
+##### Printing the workflow dag and rulegraph
+
+:::
+
+:::: {.columns}
+
+::: {.column width="30%"}
+
+::: {.fragment}
+
+`--rulegraph` is a convenient way of getting an overview of the workflow:
+
+``` {bash }
+#| eval: false
+snakemake -s ex3.smk --rulegraph | dot | display
+```
+
+``` {bash }
+#| fig-format: svg
+#| output: asis
+#| echo: false
+snakemake -s ex3.smk --rulegraph | dot -T svg | grep -v "<?xml"
+```
+
+:::
+
+:::
+
+::::
+
+## Adding threads and resources to workflow - ex7.smk ##
+
+Now that we will be fine-tuning threads and resources per rule, we add
+another rule, `samtools_merge_bam`, that merges our bam files,
+introduce the keyword `threads`, and set `resources` for one of the
+rules:
+
+``` {python code=readLines("ex7.smk") }
+#| label: workflow-adding-threads
+#| echo: true
+#| eval: false
+#| code-line-numbers: "|12,24|13,25|1,2|21-23|26,27"
+```
+
+::: {.fragment}
+
+```{bash }
+#| label: ex7-run
+#| echo: true
+#| eval: true
+snakemake -s ex7.smk -n -q -F -c 10
+```
+:::
+
+::: {.fragment}
+
+Note that we changed the final pseudo-target name, since the final
+workflow output now is a merged bam file!
+
+:::
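+
+::: {.fragment}
+
+As the file contents come from the repository, the sketch below shows
+the general shape only (assumed content - the pseudo-target name,
+thread counts, and resource values are placeholders; the highlighted
+line numbers above refer to the actual file):
+
+```python
+# Sketch of ex7.smk (assumed content): per-rule threads and resources,
+# plus a merge rule whose output is the input of the renamed
+# pseudo-target.
+rule all_merged:
+    input: "bam/merged.bam"
+
+rule bwa_mem_wildcard:
+    output:
+        "bam/{sample}.bam",
+    input:
+        ref="resources/ref.fa",
+        fastq=["data/{sample}_1.fastq.gz", "data/{sample}_2.fastq.gz"],
+    threads: 2
+    resources:
+        runtime=100,
+        mem_mb=4000,
+    shell:
+        "bwa mem -t {threads} {input.ref} {input.fastq} | samtools view -b -o {output}"
+
+rule samtools_merge_bam:
+    output:
+        "bam/merged.bam",
+    input:
+        expand("bam/{sample}.bam", sample=["CHS.HG00512", "PUR.HG00731"]),
+    threads: 1
+    shell:
+        "samtools merge -@ {threads} {output} {input}"
+```
+
+The `threads` value is substituted into the shell command, and the
+`resources` keywords are what `--set-resources` and the cluster
+submission discussed below pick up.
+
+:::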
+
+## Setting rule-specific resources ##
+
+Default resources are one-size-fits-all settings that apply to all
+rules. In many workflows, however, certain rules require more specific
+resource tuning.
+
+Rule-specific resources can be set with the `--set-resources` option.
+Similarly, `--set-threads` sets rule-specific thread values:
+
+::: {.fragment}
+
+```{bash }
+#| label: snakemake-set-resources
+snakemake -F -n -s ex7.smk --default-resources mem_mb=2000 --set-resources bwa_mem_wildcard:runtime=1000 \
+    bwa_mem_wildcard:mem_mb=6000 --set-threads bwa_mem_wildcard=4 -c 8
+```
+
+:::
+
+::: {.notes}
+
+Point out that this can cause jobs to fail if they exceed the
+specified resources
+
+:::
+
+## Putting it all together - on the limits of the command line
+
+Putting everything together, we could now have a command line that
+looks something like this:
+
+::: {.fragment}
+
+```{bash }
+#| label: snakemake-long-command-line
+#| eval: false
+snakemake -s ex7.smk -F --ri -k \
+    --use-conda --use-singularity --use-envmodules \
+    --default-resources mem_mb=2000 --set-resources bwa_mem_wildcard:runtime=1000 \
+    bwa_mem_wildcard:mem_mb=6000 samtools_merge_bam:runtime=100 \
+    --set-threads bwa_mem_wildcard=4 -c 8
+```
+
+:::
+
+::: {.fragment}
+
+This is getting illegible, and it is tedious to write. What to do?
+
+:::
+
+::: {.fragment}
+
+Snakemake profiles to the rescue!
+
+:::
+
+
+# Snakemake profiles #
+
+## About ##
+
+Profiles are configuration files that apply to specific compute
+environments and analyses. They allow setting default options.
+
+::: {.fragment}
+
+At its simplest, a profile is a directory with a `config.yaml` file
+that sets program options. Let's put our previous example in a
+directory called `local` to represent a local profile. The minimum
+content of that directory is then a file `config.yaml` with (in this
+case) the following contents:
+
+```{r code=readLines("local/config.yaml") }
+#| label: snakemake-local-profile
+#| eval: false
+```
+
+:::
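+
+::: {.fragment}
+
+The file itself is read from disk at render time; a sketch of what
+`local/config.yaml` would contain to replace the long command line
+above (assumed content, mirroring those options in long form):
+
+```yaml
+# Sketch of local/config.yaml (assumed content): snakemake options in
+# long form, as they would appear on the command line.
+rerun-incomplete: true
+keep-going: true
+use-conda: true
+use-singularity: true
+use-envmodules: true
+cores: 8
+default-resources:
+  - mem_mb=2000
+set-threads:
+  - bwa_mem_wildcard=4
+set-resources:
+  - bwa_mem_wildcard:runtime=1000
+  - bwa_mem_wildcard:mem_mb=6000
+  - samtools_merge_bam:runtime=100
+```
+
+With this in place, the whole command line essentially reduces to
+`snakemake -s ex7.smk --profile local`.
+
+:::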
+
+## Running the profile ##
+
+Run with `--profile` (NB: the profile argument can also be an absolute
+or relative path):
+
+```{bash }
+#| label: snakemake-local-profile-run
+snakemake -s ex7.smk --profile local -n -p -F -c 8
+```
+
+
+# Cluster execution
+
+## Working on uppmax ##
+
+
+So far we have looked at local jobs. What if we want to submit jobs on
+an HPC cluster? Here we focus on SLURM.
+
+::: {.fragment}
+
+##### sbatch solution #####
+
+Wrap the workflow in an sbatch script (e.g. workflow.sh):
+
+```{bash }
+#| label: sbatch-script
+#| eval: false
+#!/bin/bash -l
+#SBATCH -A account
+#SBATCH -p core
+#SBATCH -c 20
+#... other SBATCH arguments ...
+
+module load snakemake
+snakemake -j 20 --use-conda --use-envmodules all
+```
+
+and submit with
+```{bash }
+#| label: sbatch-submit
+#| eval: false
+sbatch workflow.sh
+```
+
+:::
+
+::: {.fragment}
+
+Downside: the workflow can only make use of one node at a time.
+
+:::
+
+## The snakemake job scheduler
+
+When running jobs locally with a limited number of threads, snakemake
+needs to decide which job to run when. These decisions are made by an
+internal *job scheduler*. As we will see, the internal scheduler still
+has this role when submitting jobs to a cluster scheduler.
+
+::: {.fragment}
+
+##### Background sessions #####
+
+A workflow can take a long time to run, and a workflow submitted in a
+login shell will terminate once we log out. Therefore, it is advised
+to run the workflow in a *background session*, using a so-called
+*terminal multiplexer* such as
+[screen](https://www.gnu.org/software/screen/manual/screen.html) or
+[tmux](https://tmuxguide.readthedocs.io/en/latest/tmux/tmux.html).
+
+A named `tmux` session can be initiated as
+
+```{bash }
+#| label: tmux
+#| eval: false
+tmux new -s mysession
+```
+
+Inside the session, `tmux` commands are issued with a prefix (default
+`Ctrl-b`; many users change it to `Ctrl-a`, the default in `screen`)
+followed by a key. For instance, `Ctrl-b d` will detach (exit) from
+the session. See the [tmux
+documentation](https://tmuxguide.readthedocs.io/en/latest/tmux/tmux.html)
+for further info.
+
+:::
+
+
+## Generic execution
+
+The `--cluster` option can be used to submit jobs to the cluster
+scheduler:
+
+```{bash }
+#| label: snakemake-cluster-generic
+#| eval: false
+snakemake --cluster "sbatch -A account -p core -n 20 -t {resources.runtime}" \
+    --use-conda --use-envmodules --ri -k --default-resources runtime=100 -j 100
+```
+
+Note the use of the format string `{resources.runtime}` to set running
+times individually.
+
+One drawback with this approach is that failed jobs or timeouts go
+undetected, which means you have to monitor the outputs regularly. You
+don't want to do that.
+
+::: {.fragment}
+
+##### Custom cluster commands #####
+
+The argument to `--cluster` is a command (`sbatch` in the example
+above), so it could equally be a custom wrapper script that submits
+jobs to a cluster scheduler.
+
+:::
+
+::: {.fragment}
+
+Furthermore, the option `--cluster-status` takes as its argument a
+command (i.e. a custom script) that checks submitted jobs for their
+status.
+
+:::
+
+::: {.fragment}
+
+Also, the option `--jobscript` takes as its argument a custom job
+script template that is used when submitting jobs to the cluster.
+
+:::
+
+::: {.fragment}
+
+We could write custom scripts for each of these options to fine-tune
+job submission. If only there were such scripts already available!
+
+:::
+
+## snakemake-profiles
+
+[Snakemake Profiles](https://github.com/Snakemake-Profiles/doc) are
+collections of reusable configuration profiles for various computing
+environments. The [slurm snakemake
+profile](https://github.com/Snakemake-Profiles/slurm) provides the
+scripts we requested on the previous slide.
+
+
+::: {.fragment}
+
+##### Installation #####
+
+The profiles are [cookiecutter
+templates](https://cookiecutter.readthedocs.io/en/1.7.2/) and can be
+installed as follows:
+
+:::
+
+:::: {.columns}
+
+::: {.column width="50%"}
+
+::: {.fragment}
+
+```{bash }
+#| label: cookiecutter-profile-install
+#| eval: false
+$ cookiecutter https://github.com/Snakemake-Profiles/slurm.git
+profile_name [slurm]: myprofile
+Select use_singularity:
+1 - False
+2 - True
+Choose from 1, 2 [1]:
+Select use_conda:
+1 - False
+2 - True
+Choose from 1, 2 [1]:
+jobs [500]:
+restart_times [0]:
+max_status_checks_per_second [10]:
+max_jobs_per_second [10]:
+latency_wait [5]:
+Select print_shell_commands:
+1 - False
+2 - True
+Choose from 1, 2 [1]:
+sbatch_defaults []: --account=account
+cluster_sidecar_help [Use cluster sidecar. NB! Requires snakemake >= 7.0! Enter
+to continue...]:
+Select cluster_sidecar:
+1 - yes
+2 - no
+Choose from 1, 2 [1]:
+cluster_name []:
+cluster_jobname [%r_%w]:
+cluster_logpath [logs/slurm/%r/%j-%w]:
+cluster_config_help [The use of cluster-config is discouraged. Rather, set
+snakemake CLI options in the profile configuration file (see snakemake
+documentation on best practices). Enter to continue...]:
+cluster_config []:
+```
+:::
+:::
+
+::: {.column width="50%"}
+
+::: {.fragment}
+
+Profile directory contents:
+```{bash }
+#| label: slurm-profile-tree
+#| echo: false
+tree myprofile | head -n -2
+```
+
+:::
+
+:::
+
+::::
+
+
+## snakemake slurm profile
+
+:::: {.columns}
+
+::: {.column width="50%"}
+```{r code=readLines("myprofile/settings.json") }
+#| filename: "myprofile/settings.json"
+#| label: slurm-settings
+#| eval: false
+#| cache: false
+```
+
+```{r code=readLines("myprofile/config.yaml") }
+#| filename: "myprofile/config.yaml"
+#| label: slurm-config
+#| eval: false
+```
+:::
+
+::: {.column width="50%"}
+
+```{python code=readLines("myprofile/CookieCutter.py")[4:39]}
+#| filename: "myprofile/CookieCutter.py"
+#| label: slurm-cookiecutter-class
+#| eval: false
+```
+
+:::
+
+::::
+
+::: {.fragment}
+
+##### Job submission #####
+
+Submit jobs with:
+```{bash }
+#| label: slurm-profile-submit
+#| eval: false
+snakemake -s ex7.smk --profile myprofile -j 10 --ri -k -F
+```
+
+:::
+
+## New features - time formatting and sbatch parameters
+
+Previously, one could set e.g. `partition` in a rule:
+```{python }
+#| label: set-partition-in-rule
+#| echo: true
+#| eval: false
+rule bwa_mem:
+    resources:
+        time = "00:10:00",
+        mem = 12000,
+        partition = "devel"
+```
+::: {.fragment}
+
+However, in many cases you would like to *constrain* jobs to features
+defined on the HPC using the SLURM `--constraint` option. For example,
+UPPMAX defines the following features:
+
+```{bash }
+#| label: uppmax-sinfo-features
+#| echo: true
+#| eval: false
+sinfo -e -o "%P %m %c %.5a %f" | grep "ibsw2,\|ibsw16\|PARTITION" | grep "PARTITION\|node"
+```
+```{bash }
+#| label: uppmax-sinfo-features-results
+#| echo: true
+#| eval: false
+PARTITION MEMORY CPUS AVAIL AVAIL_FEATURES
+node 256000 20 up fat,mem256GB,ibsw2,usage_mail
+node 128000 20 up thin,mem128GB,ibsw2,usage_mail
+node 1000000 20 up fat,mem1TB,mem256GB,mem512GB,ibsw16,usage_mail
+node 128000 20 up thin,mem128GB,ibsw16,usage_mail
+```
+
+:::
+::: {.fragment}
+
+With the latest version of the slurm profile, you can do the following:
+
+```{python }
+#| label: set-constraint-in-rule
+#| echo: true
+#| eval: false
+rule gpu_stuff:
+    resources:
+        time="12h30m",
+        partition="node",
+        slurm="constraint=fat qos=gpuqos gres=gpu:2 gpus=4"
+```
+
+:::
+
+
+
+# Questions? {.unnumbered .unlisted}