diff --git a/lectures/anatomy-of-a-rule/anatomy.Rmd b/lectures/anatomy-of-a-rule/anatomy.Rmd
index 84dc63f..4e98900 100644
--- a/lectures/anatomy-of-a-rule/anatomy.Rmd
+++ b/lectures/anatomy-of-a-rule/anatomy.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "Anatomy of a Snakemake rule"
+title: "Anatomy of a Snakefile"
 subtitle: "Snakemake BYOC NBIS course"
 date: "`r format(Sys.time(), '%d %B, %Y')`"
 output:
@@ -21,14 +21,13 @@ class: center, middle
 
 .HUGE[Anatomy of]
-.HUGE[a Snakemake rule]
+.HUGE[a Snakefile]
 ---
-# Basic structure of a rule
+
 ```python
-### Contents of Snakefile in cwd ##
 rule:
     output: "results/sample1.stats.txt"
     shell:
@@ -39,7 +38,7 @@ rule:
 --
 
 ```bash
-$ snakemake -j 1 results/sample1.stats.txt
+$ snakemake -c 1 results/sample1.stats.txt
 Building DAG of jobs...
 Using shell: /bin/bash
 Provided cores: 1 (use --cores to define parallelism)
@@ -64,13 +63,14 @@ sample1
 50%
 ```
 
 ---
+# Basic structure of a rule
 
 More commonly, rules are named and have both input and output:
 
 ```python
 rule generate_stats:
-    input: "results/sample1.bam"
     output: "results/sample1.stats.txt"
+    input: "results/sample1.bam"
     shell:
         """
         samtools flagstat {input} > {output}
@@ -78,12 +78,35 @@ rule generate_stats:
 ```
 
 ---
-Wildcards generalize a workflow:
+# Wildcards
+
+.green[Wildcards] generalize a workflow. Imagine you have not just sample1 but samples 1..100.
+
+--
+
+Instead of writing 100 rules...
+
+```python
+rule generate_stats_sample1:
+    output: "results/sample1.stats.txt"
+    input: "results/sample1.bam"
+...
+rule generate_stats_sample100:
+    output: "results/sample100.stats.txt"
+    input: "results/sample100.bam"
+```
+
+--
+
+...we can introduce one or more .green[wildcards] which Snakemake can match to several text strings using regular expressions.
+
+In our example, we replace the actual sample IDs with the
+wildcard `sample`:
 
 ```python
 rule generate_stats:
-    input: "results/{sample}.bam"
     output: "results/{sample}.stats.txt"
+    input: "results/{sample}.bam"
     shell:
         """
         samtools flagstat {input} > {output}
@@ -91,98 +114,130 @@ rule generate_stats:
 ```
 
 ---
-Input can also come directly from functions:
+# Wildcards
 
-```yaml
-### Contents of config file ###
-samples:
-  sample1: "results/genomeMap/sample1.bam"
-  sample2: "results/transcriptomeMap/sample2.bam"
-```
+Rules can have multiple wildcards...
 
 ```python
-def get_bamfile(wildcards):
-    return config["samples"][wildcards.sample]
-
 rule generate_stats:
-    input: get_bamfile
-    output: "results/{sample}.stats.txt"
+    output: "results/{sample}_{lane}.stats.txt"
+    input: "results/{sample}_{lane}.bam"
     shell:
-        """
-        samtools flagstat {input} > {output}
-        """
+        """
+        samtools flagstat {input} > {output}
+        """
 ```
 
 --
-.small[This can also be written in the form of .green[lambda expressions], _e.g._:]
+
+...but every wildcard used in a rule **must** also be present in the output section.
+
+**Will work:**
 
 ```python
-    input: lambda wildcards: config["samples"][wildcards.sample]
+rule generate_stats:
+    output: "results/{sample}_{lane}.stats.txt"
+    input: "results/{sample}.bam"
+```
+
+--
+
+**Won't work:**
+```python
+rule generate_stats:
+    output: "results/{sample}.stats.txt"
+    input: "results/{sample}_{lane}.bam"
+```
+```bash
+Wildcards in input files cannot be determined from output files: 'lane'
 ```
 
 ---
+# Rule ambiguities
 
 Ambiguities can arise when two rules produce the same output:
 
 ```python
 rule generate_stats:
-    input: "results/{sample}.bam"
     output: "results/{sample}.stats.txt"
+    input: "results/{sample}.bam"
     shell:
         """
         samtools flagstat {input} > {output}
         """
 
 rule print_stats:
-    input: "results/{sample}.log"
     output: "results/{sample}.stats.txt"
+    input: "results/{sample}.log"
     shell:
         """
         grep "% alignment" {input} > {output}
         """
 
 rule make_report:
-    input: "results/{sample}.stats.txt"
     output: "results/{sample}.report.pdf"
+    input: "results/{sample}.stats.txt"
 ```
 
 --
-.small[This can be handled either via the `ruleorder` directive:]
+```bash
+$ snakemake -c 1 -n results/sample1.report.pdf
+Building DAG of jobs...
+AmbiguousRuleException:
+Rules generate_stats and print_stats are ambiguous for the file results/sample1.stats.txt.
+```
+
+---
+# Rule ambiguities
+
+Ambiguities can arise when two rules produce the same output.
+This can be handled in a number of ways:
+
+- by changing the output file name of one of the rules
+
+--
+
+- or via the `ruleorder` directive:
 
 ```python
 ruleorder: generate_stats > print_stats
 ```
 
 ---
-.small[Or by specifically referring to the output of a certain rule:]
+--
+- or by specifically referring to the output of a certain rule:
 
 ```python
 rule make_report:
-    input: rules.generate_stats.output
     output: "results/{sample}.report.pdf"
+    input: rules.generate_stats.output
 ```
 
----
+---
+# Logging
 
-Messages and logfiles add descriptions and help with debugging:
+Logfiles and messages add descriptions and help with debugging:
 
 ```python
 rule generate_stats:
-    input: "results/{sample}.bam"
     output: "results/{sample}.stats.txt"
-    log: "results/{sample}.log"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
     message: "Generating stats for sample {wildcards.sample}"
     shell:
         """
         samtools flagstat {input} > {output} 2>{log}
         """
 ```
+
+Log files are not deleted by Snakemake if there's an error.
+
 ---
+# Resources
 
-Compute resources can be set with _e.g._ .green[threads] and .green[resources]:
+Compute resources can be set with .green[threads] and .green[resources]:
 
 ```python
 rule generate_stats:
-    input: "results/{sample}.bam"
     output: "results/{sample}.stats.txt"
-    log: "results/{sample}.log"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
     message: "Generating stats for sample {wildcards.sample}"
     threads: 4
     resources:
@@ -193,15 +248,42 @@ rule generate_stats:
         """
 ```
 
 ---
+# Resources
+
+Compute resources can be set with .green[threads] and .green[resources]:
 
-Rule parameters can be set with the .green[params] directive:
 ```python
 rule generate_stats:
+    output: "results/{sample}.stats.txt"
     input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
+    message: "Generating stats for sample {wildcards.sample}"
+    threads: workflow.cores * 0.5 # <---- threads as a function of workflow cores
+    resources:
+        mem_mb=100
+    shell:
+        """
+        samtools flagstat --threads {threads} {input} > {output} 2>{log}
+        """
+```
+
+It's also possible to set threads based on the cores given to Snakemake (_e.g._ `--cores 8` or `-c 8`).
+
+--
+
+More on resources in a lecture tomorrow.
+
+---
+# Parameters
+
+Rule parameters can be set with the .green[params] directive:
+```python
+rule generate_stats:
     output: "results/{sample}.stats.txt"
-    log: "results/{sample}.log"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
     message: "Generating stats for sample {wildcards.sample}"
-    threads: 4
+    threads: workflow.cores * 0.5
     resources:
         mem_mb=100
     params:
@@ -213,15 +295,17 @@ rule generate_stats:
         """
 ```
 
 ---
+# Software environments
 
 Software environments can be set for each rule using `conda:`:
+
 ```python
 rule generate_stats:
-    input: "results/{sample}.bam"
     output: "results/{sample}.stats.txt"
-    log: "results/{sample}.log"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
     message: "Generating stats for sample {wildcards.sample}"
-    threads: 4
+    threads: workflow.cores * 0.5
     resources:
         mem_mb=100
     params:
@@ -240,20 +324,23 @@ name: samtools
 channels:
   - bioconda
 dependencies:
-  - samtools
+  - samtools=1.15.1
 ```
+To make Snakemake use the conda environment, specify `--use-conda` on the command line.
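+
+For example, a dry-run followed by a real run of the rule above could look like this (a sketch, assuming the rule and the `envs/samtools.yml` file shown on this slide):
+
+```bash
+# check what would be executed, without running anything
+$ snakemake -c 1 --use-conda -n results/sample1.stats.txt
+# run for real: Snakemake builds the environment once and reuses it afterwards
+$ snakemake -c 1 --use-conda results/sample1.stats.txt
+```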
+
 ---
+# Software environments
 
 Or by using `envmodules`, _e.g._ in compute clusters:
 
 ```python
 rule generate_stats:
-    input: "results/{sample}.bam"
     output: "results/{sample}.stats.txt"
-    log: "results/{sample}.log"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
     message: "Generating stats for sample {wildcards.sample}"
-    threads: 4
+    threads: workflow.cores * 0.5
     resources:
         mem_mb=100
     params:
@@ -269,27 +356,102 @@ rule generate_stats:
         """
 ```
 
+To make Snakemake use envmodules, specify `--use-envmodules` on the command line.
+
+--
+
+More on conda and envmodules tomorrow.
+
 ---
+# Config files
+
+.green[Config files] allow you to configure workflows without having to change the underlying code.
 
-If no targets are given on the command line, Snakemake will run the first rule specified. By convention this rule is named `all` and is used as a 'pseudo-rule' to define what the workflow will generate.
+--
+
+.small[Config files should be in `YAML` or `JSON` format:]
+
+```yaml
+### Contents of config.yml ###
+samples: ["sample1", "sample2", "sample3"]
+verbosity: 2
+```
+
+```json
+### Contents of config.json ###
+{
+    "samples": [
+        "sample1",
+        "sample2",
+        "sample3"
+    ],
+    "verbosity": 2
+}
+```
+
+--
+
+.small[Specify one or more config files on the command line with:]
+```bash
+snakemake --configfile config.yml -c 1
+```
+--
+
+.small[Or directly in a snakefile, _e.g._:]
 ```python
+configfile: "config.yml"
+```
+
+---
+# Config files
+
+The config parameters are available as a dictionary inside your snakefiles:
 
-samples = ["sample1","sample2"]
+```yaml
+### Contents of config.yml ###
+samples: ["sample1", "sample2", "sample3"]
+verbosity: 2
+```
+
+```python
+configfile: "config.yml"
+print(config)
+{'samples': ['sample1', 'sample2', 'sample3'], 'verbosity': 2}
+```
+
+--
+
+This allows you to manipulate the config variable inside your Snakemake files and Python code.
+
+```python
+config["samples"].append("sample4")
+print(config)
+{'samples': ['sample1', 'sample2', 'sample3', 'sample4'], 'verbosity': 2}
+```
+
+---
+# Config files
+
+In our example rule, the verbosity level can be controlled via a config file like this:
+
+```python
+configfile: "config.yml"
 
 rule all:
-    input: expand("results/{sample}.stats.txt", sample = samples)
+    input:
+        expand("results/{sample}.stats.txt", sample = config["samples"])
 
 rule generate_stats:
-    input: "results/{sample}.bam"
     output: "results/{sample}.stats.txt"
-    log: "results/{sample}.log"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
     message: "Generating stats for sample {wildcards.sample}"
-    threads: 4
+    threads: workflow.cores * 0.5
     resources:
         mem_mb=100
     params:
-        verbosity = 2
+        verbosity = config["verbosity"] # <-----
     conda: "envs/samtools.yml"
     envmodules:
         "bioinfo-tools",
@@ -300,6 +462,79 @@ rule generate_stats:
         --threads {threads} {input} > {output} 2>{log}
         """
 ```
+---
+# Config files
+
+Config files are also convenient for defining what the workflow will do.
+
+--
+
+If no targets are given on the command line, Snakemake will run the first rule specified.
+
+By convention this rule is named `all` and is used as a 'pseudo-rule' to define what the workflow will generate.
+
+```python
+rule all:
+    input:
+        "results/sample1.stats.txt",
+        "results/sample2.stats.txt",
+        "results/sample3.stats.txt"
+
+rule generate_stats:
+    output: "results/{sample}.stats.txt"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
+...
+```
+
+---
+# Config files
+
+Config files are also convenient for defining what the workflow will do.
+
+If no targets are given on the command line, Snakemake will run the first rule specified.
+
+By convention this rule is named `all` and is used as a 'pseudo-rule' to define what the workflow will generate.
+
+```python
+samples = ["sample1", "sample2", "sample3"]
+rule all:
+    input:
+        expand("results/{sample}.stats.txt", sample = samples)
+
+rule generate_stats:
+    output: "results/{sample}.stats.txt"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
+...
+```
+
+If we define a list of samples, we can condense the input section of the `all` rule using the .green[expand] function.
+
+---
+# Config files
+
+By defining samples in the config file (either directly or via a sample list that is read and stored in the config dictionary), your workflow becomes much more flexible.
+
+```yaml
+### Contents of config.yml ###
+samples: ["sample1", "sample2", "sample3"]
+verbosity: 2
+```
+
+```python
+configfile: "config.yml"
+rule all:
+    input:
+        expand("results/{sample}.stats.txt", sample = config["samples"])
+
+rule generate_stats:
+    output: "results/{sample}.stats.txt"
+    input: "results/{sample}.bam"
+    log: "results/{sample}.flagstat.log"
+...
+```
+
 ---
 
 # What else?
diff --git a/lectures/anatomy-of-a-rule/anatomy.html b/lectures/anatomy-of-a-rule/anatomy.html
new file mode 100644
index 0000000..4bdadcc
--- /dev/null
+++ b/lectures/anatomy-of-a-rule/anatomy.html
@@ -0,0 +1,652 @@
+Anatomy of a Snakefile
diff --git a/lectures/anatomy-of-a-rule/anatomy.pdf b/lectures/anatomy-of-a-rule/anatomy.pdf
index d8a89f7..2cbd064 100644
Binary files a/lectures/anatomy-of-a-rule/anatomy.pdf and b/lectures/anatomy-of-a-rule/anatomy.pdf differ
diff --git a/lectures/bestpractices/bestpractices.Rmd b/lectures/bestpractices/bestpractices.Rmd
deleted file mode 100644
index 4d36662..0000000
--- a/lectures/bestpractices/bestpractices.Rmd
+++ /dev/null
@@ -1,298 +0,0 @@
----
-title: "Snakemake best-practices"
-subtitle: "Snakemake BYOC NBIS course"
-date: "`r format(Sys.time(), '%d %B, %Y')`"
-output:
-  xaringan::moon_reader:
-    self-contained: true
-    seal: false
-    css: ["default", "../template.css"]
-    nature:
-      slideNumberFormat: ""
----
-
-layout: true
-
-
-
----
-
-class: center, middle
-
-.HUGE[Snakemake workflows]
-.HUGE[guidelines and best-practices] -
- ---- - -# Structure of a snakemake workflow - -Workflows can be organized basically anyway you want - --- -.small[ -- Rules can be added to a single Snakefile, or split across several files -] --- -.small[ -- Files can be named anything you want (`Snakefile`, `rulefile.rules`, `myrules.smk` *etc.*) -] --- -.small[ -- You can mix and match snakemake rules with python code -] -```python -def trim_params(config): - return config["trim_settings"] - -rule trim_reads: - input: "data/reads.fastq.gz" - output: "results/trimmed.fastq.gz" - params: settings = trim_params(config) - shell: "read-trimmer {input} {output}" -``` - ---- - -# What are some best-practices and guidelines? - --- - -## 1. Snakemake documentation - -[Distribution and Reproducibility](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#distribution-and-reproducibility) - -> It is recommended to store each workflow in a dedicated git repository of the following structure - -``` -|-- .gitignore -|-- README.md -|-- LICENSE.md -|-- workflow -| |-- rules -| | |-- module1.smk -| | |-- module2.smk -| |-- envs -| | |-- tool1.yaml -| | |-- tool2.yaml -| |-- scripts -| | |-- script1.py -| | |-- script2.R -| |-- notebooks -| | |-- notebook1.py.ipynb -| | |-- notebook2.r.ipynb -| |-- report -| | |-- plot1.rst -| | |-- plot2.rst -| |-- Snakefile -|-- config -| |-- config.yaml -| |-- some-sheet.tsv -|-- results -|-- resources -``` ---- - -# What are some best-practices and guidelines? - -## 1. Snakemake documentation - -[Distribution and Reproducibility](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#distribution-and-reproducibility) - -- workflow code goes into a subfolder `workflow/` --- - -- `workflow/Snakefile` marks the entrypoint of the workflow --- - -- the (optional) rules can be stored in subfolder `workflow/rules/` (they should end in `.smk`) --- - -- scripts should be stored in `workflow/scripts/` -- notebooks should be stored in `workflow/notebooks/` --- - -- conda environments should be fine-grained and stored in `workflow/envs/` --- - -- report caption files should be stored in `workflow/report/` --- - -- all output files should be stored under `results/` -- all resource files should be stored under `resources/` --- - -- configuration is stored in a subfolder `config/` ---- - -# What are some best-practices and guidelines? - -## 2. The Snakemake-Workflows project - -> a joint effort to create workflows for common use cases of the Snakemake workflow management system - -[Guidelines](https://github.com/snakemake-workflows/docs#guidelines) --- - -- A workflow repository shall consist of .green[one Snakemake workflow]. --- - -- The workflow should be configurable via a well documented YAML-based .green[configuration file] and (when necessary) a .green[sample and a unit sheet]. --- - -- Whenever possible, Snakemake .green[wrappers] should be used. --- - -- The structure of the workflow should follow our .green[template]. - ---- - -## The snakemake workflows template - -- A [cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/) template available on [GitHub](https://github.com/snakemake-workflows/cookiecutter-snakemake-workflow) -- Can be used to initialize new workflows ---- -**How to use the template** - -Step 1. Install `cookiecutter` if you haven't already - -``` -$ conda create -n cc -c conda-forge cookiecutter -<...> -$ conda activate cc -``` ---- -**How to use the template** - -Step 2. 
Create a new workflow with cookiecutter - -``` -$ (cc) cookiecutter gh:snakemake-workflows/cookiecutter-snakemake-workflow -full_name [Johannes Köster]: John Sundh -``` - ---- -**How to use the template** - -Step 2. Create a new workflow with cookiecutter - -``` -$ (cc) cookiecutter gh:snakemake-workflows/cookiecutter-snakemake-workflow -full_name [Johannes Köster]: John Sundh -email [johannes.koester@protonmail.com]: john.sundh@scilifelab.se -``` - ---- -**How to use the template** - -Step 2. Create a new workflow with cookiecutter - -``` -$ (cc) cookiecutter gh:snakemake-workflows/cookiecutter-snakemake-workflow -full_name [Johannes Köster]: John Sundh -email [johannes.koester@protonmail.com]: john.sundh@scilifelab.se -username [johanneskoester]: johnne -``` - ---- -**How to use the template** - -Step 2. Create a new workflow with cookiecutter - -``` -$ (cc) cookiecutter gh:snakemake-workflows/cookiecutter-snakemake-workflow -full_name [Johannes Köster]: John Sundh -email [johannes.koester@protonmail.com]: john.sundh@scilifelab.se -username [johanneskoester]: johnne -project_name [RNA-Seq]: gut-microbiota -repo_name [rna-seq]: gut-microbiome-repo -min_snakemake_version [5.7.0]: 5.11.0 -``` - ---- -**How to use the template** - -Step 3. Inspect your new workflow - -``` -$ ls gut-microbiome-repo/ --rw-r--r-- 1 john staff 1.0K Oct 1 15:49 LICENSE --rw-r--r-- 1 john staff 4.9K Oct 1 15:49 README.md -drwxr-xr-x 4 john staff 128B Oct 1 15:49 config -drwxr-xr-x 3 john staff 96B Oct 1 15:49 resources -drwxr-xr-x 6 john staff 192B Oct 1 15:49 results -drwxr-xr-x 8 john staff 256B Oct 1 15:49 workflow -``` ---- -**How to use the template** - -Step 3. Inspect your new workflow - -.small[Your info has been inserted into the `README.md` file] - -```markdown -# Snakemake workflow: gut-microbiota - -[![Snakemake](https://img.shields.io/badge/snakemake->5.11.0-brightgreen.svg)](https://snakemake.bitbucket.io) -[![Build Status](https://travis-ci.org/snakemake-workflows/gut-microbiome-repo.svg?branch=master)](https://travis-ci.org/snakemake-workflows/gut-microbiome-repo) - -This is the template for a new Snakemake workflow. Replace this text with a comprehensive description covering the purpose and domain. -Insert your code into the respective folders, i.e. `scripts`, `rules`, and `envs`. Define the entry point of the workflow in the `Snakefile` and the main configuration in the `config.yaml` file. - -## Authors - -* John Sundh (@johnne) -``` ---- -**How to use the template** - -Step 3. Inspect your new workflow - -.small[Your info has been inserted into the `README.md` file] - -# Snakemake workflow: gut-microbiota - -[![Snakemake](https://img.shields.io/badge/snakemake->5.11.0-brightgreen.svg)](https://snakemake.bitbucket.io) -[![Build Status](https://travis-ci.org/snakemake-workflows/gut-microbiome-repo.svg?branch=master)](https://travis-ci.org/snakemake-workflows/gut-microbiome-repo) - -This is the template for a new Snakemake workflow. Replace this text with a comprehensive description covering the purpose and domain. -Insert your code into the respective folders, i.e. `scripts`, `rules`, and `envs`. Define the entry point of the workflow in the `Snakefile` and the main configuration in the `config.yaml` file. - -## Authors - -* John Sundh (@johnne) - ---- -# What are some best-practices and guidelines? - -## 3. Snakemake linting - -Since `v5.11` Snakemake comes with code quality checker that can be used on your workflow. 
- -To get a list of warnings for parts of the workflow that does not conform to best-practices, run `snakemake --lint` - ---- -# What are some best-practices and guidelines? - -## 3. Snakemake linting - -Examples of warnings: - -- *Mixed rules and functions in same snakefile.* -- *No log directive defined* -- *Migrate long run directives into scripts or notebooks* - -```bash -Lints for rule compose_sample_sheet (line 6, rna-seq-kallisto-sleuth/workflow/rules/diffexp.smk): - * No log directive defined: - Without a log directive, all output will be printed to the terminal. In distributed environments, this means that errors are harder to discover. In local environments, - output of concurrent jobs will be mixed and become unreadable. - Also see: - https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#log-files -``` - ---- - -class: center, middle -# Questions? diff --git a/lectures/bestpractices/bestpractices.pdf b/lectures/bestpractices/bestpractices.pdf deleted file mode 100644 index ab8cb04..0000000 Binary files a/lectures/bestpractices/bestpractices.pdf and /dev/null differ diff --git a/lectures/example-workflow/GenErode_illustration.jpg b/lectures/example-workflow/GenErode_illustration.jpg new file mode 100644 index 0000000..f1d48b1 Binary files /dev/null and b/lectures/example-workflow/GenErode_illustration.jpg differ diff --git a/lectures/example-workflow/example-workflow.Rmd b/lectures/example-workflow/example-workflow.Rmd index 49fe477..eff6a20 100644 --- a/lectures/example-workflow/example-workflow.Rmd +++ b/lectures/example-workflow/example-workflow.Rmd @@ -36,6 +36,14 @@ library("kableExtra") # The GenErode pipeline +.center[] +.center[https://github.com/NBISweden/GenErode] +.center[[Kutschera et al. 2022 (BMC Bioinformatics)](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-022-04757-0)] + +--- + +# The GenErode pipeline + * Developed in a NBIS project with Love Dalén's lab (Centre for Palaeogenetics, SU & NRM) -- @@ -52,7 +60,7 @@ library("kableExtra") # The GenErode pipeline -* Started at Snakemake version 3.10 (!), current pipeline runs with Snakemake version 5.22 +* Started at Snakemake version 3.10 (!), current pipeline runs with Snakemake version 6.12.1 (latest version: 7.13.0) -- @@ -70,52 +78,36 @@ library("kableExtra") # Analysis Tracks of the Workflow -* .green[Data processing track] - * Repeat element identification - * Fastq file processing - * _Optional:_ mapping to mitochondrial genomes - * Mapping to reference genome - * BAM file processing - * _Optional:_ base quality rescaling for historical samples - * _Optional:_ subsampling to target depth - * Genotyping - * _Optional:_ CpG site identification & removal - * VCF file processing & merging per dataset +.center[] --- # Analysis Tracks of the Workflow -* .green[BAM file track] - * _Optional:_ mlRho - --- - -* .green[VCF file track] - * _Optional:_ plink (PCA) - * _Optional:_ plink (runs of homozygosity) - * _Optional:_ snpEff - * _Optional:_ GERP++ & relative mutational load calculation +.center[] --- # The Workflow Structure * Rules with the actual analyses in separate Snakefiles (in `workflow/rules/`) + * `workflow/rules/common.smk` contains Python code to create sample and readgroup ID + dictionaries & lists from metadata tables and the config file + -- * `Snakefile` - * Python code to create sample and readgroup ID dictionaries & lists from metadata tables and a config file * `include` of rule Snakefiles * `all` rule collecting output files produced by the different rule 
Snakefiles + * Python and bash code to generate and edit the pipeline report with `snakemake --report` -- -* UPPMAX / slurm system: - * `cluster.yaml` file to set up slurm profile +* Cluster execution (e.g. UPPMAX) with slurm: + * `config/cluster.yaml` file to set up slurm profile --- @@ -149,6 +141,8 @@ metadata <- data.frame(samplename_index_lane = c("VK01_01_L2","VK01_02_L2"), create_table(metadata) ``` +* see `config/historical_samples_paths.txt` and `config/modern_samples_paths.txt` + --- # The Workflow Structure @@ -164,15 +158,17 @@ create_table(metadata) ```{python config file data, eval = FALSE} ################################################################# # 1) Full path to reference genome assembly. -# Reference genome has to be checked for short and concise fasta -# headers without special characters. -# File name extension can be ".fasta" or ".fa". +# Reference genome has to be checked for short and concise FASTA +# headers without special characters and has to be uncompressed. +# The file name will be reused by the pipeline and can have the file +# name extensions *.fasta, *.fa or *.fna. ref_path: "" ################################################################# ################################################################# -# 2) Relative paths (from the main snakemake directory) to metadata tables of samples. +# 2) Relative paths (from the main snakemake directory) to metadata +# files with sample information. # Example files can be found in "config/" historical_samples: "" # leave empty ("") if not run for historical samples. modern_samples: "" # leave empty ("") if not run for modern samples. @@ -189,7 +185,11 @@ modern_samples: "" # leave empty ("") if not run for modern samples. ```{python config file start, eval = FALSE} ##### -# FastQC on raw reads, adapter trimming, read merging (historical samples), FastQC on trimmed reads. +# FastQC on raw reads, adapter and quality trimming (incl. read merging +# for historical samples) using fastp, FastQC on trimmed reads. +# Adapter sequences are automatically detected. +# Automatic detection of NovaSeq or NextSeq samples and activation of +# poly-G tail trimming. fastq_processing: True [...] @@ -205,38 +205,31 @@ mapping: False # How to choose which steps to run -**Step 1**: Use booleans in the config file (`config/config.yaml`) as on/off switches +> For many analysis steps, parameters can be specified in the config file -* For many analysis steps, parameters can be specified in the config file: +> These parameters can be used in the workflow with the syntax `config["parameter_name"]`, + e.g. `config["hist_readlength"]` -- ```{python config file parameters, eval = FALSE} ##### -# FastQC on raw reads, adapter trimming, read merging (historical samples), FastQC on trimmed reads. +# FastQC on raw reads, adapter and quality trimming (incl. read merging +# for historical samples) using fastp, FastQC on trimmed reads. +# Adapter sequences are automatically detected. +# Automatic detection of NovaSeq or NextSeq samples and activation of +# poly-G tail trimming. 
fastq_processing: True -# Adapter sequences for trimming of historical samples using SeqPrep v1.1 (modified source code) -# Forward read primer/adapter sequence to trim historical samples in SeqPrep v1.1 (parameter "-A") -hist_F_adapter: "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNATCTCGTATGCCGTCTTCTGCTTG" # example from Meyer & Kircher 2010 -# Reverse read primer/adapter sequence to trim historical samples in SeqPrep v1.1 (parameter "-B") -# When using double indices, include full adapter length (replacing the index by "N") -hist_R_adapter: "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTT" # example from Meyer & Kircher 2010 +# Minimum read length. +# Historical samples (after trimming and read merging) +hist_readlength: "30" # recommended setting: 30 bp -[...] - -# Minimum read length allowed after trimming. -# Historical samples (SeqPrep v1.1 with modified source code) -hist_readlength: "30" # default setting: 30 bp - -# Modern samples (trim-galore) +# Modern samples (after trimming) mod_readlength: "30" ##### ``` --- - -* These parameters can be used in the workflow with the syntax `config["parameter_name"]` --- @@ -268,7 +261,7 @@ if config["fastq_processing"]: ```{python Snakefile all, eval = FALSE} rule all: - input: all_outputs + input: all_outputs, ``` --- @@ -286,15 +279,47 @@ rule all: import os if os.path.exists(config["historical_samples"]): all_outputs.append("data/raw_reads_symlinks/historical/stats/multiqc/multiqc_report.html") - all_outputs.append("results/historical/trimmed_merged_reads/stats/multiqc/multiqc_report.html") + all_outputs.append("results/historical/trimming/stats/multiqc/multiqc_report.html") if os.path.exists(config["modern_samples"]): all_outputs.append("data/raw_reads_symlinks/modern/stats/multiqc/multiqc_report.html") - all_outputs.append("results/modern/trimmed_reads/stats/multiqc/multiqc_report.html") + all_outputs.append("results/modern/trimming/stats/multiqc/multiqc_report.html") ``` -- +* `config["historical_samples"]` and `config["modern_samples"]` point to the config file where the paths to metadata files are specified: + +```{python config file data 2, eval = FALSE} +################################################################# +# 2) Relative paths (from the main snakemake directory) to metadata +# files with sample information. +# Example files can be found in "config/" +historical_samples: "" # leave empty ("") if not run for historical samples. +modern_samples: "" # leave empty ("") if not run for modern samples. 
+################################################################# +``` + +--- + +# How to choose which steps to run + +**Step 2**: Use some Python code, `include` and the rule `all` to figure out what the workflow will do + +* The rule Snakefile (`workflow/rules/1.1_fastq_processing.smk`) contains some Python code to add its + final output files to the list `all_outputs` in the main `Snakefile` + +```{python rules file fastq 2, eval = FALSE} +import os +if os.path.exists(config["historical_samples"]): + all_outputs.append("data/raw_reads_symlinks/historical/stats/multiqc/multiqc_report.html") + all_outputs.append("results/historical/trimming/stats/multiqc/multiqc_report.html") + +if os.path.exists(config["modern_samples"]): + all_outputs.append("data/raw_reads_symlinks/modern/stats/multiqc/multiqc_report.html") + all_outputs.append("results/modern/trimming/stats/multiqc/multiqc_report.html") +``` + * By using the `if` statement to check for the presence of historical or modern metadata files, the workflow can also be run only for historical or only for modern samples diff --git a/lectures/example-workflow/example-workflow.pdf b/lectures/example-workflow/example-workflow.pdf index 3f1b398..0813293 100644 Binary files a/lectures/example-workflow/example-workflow.pdf and b/lectures/example-workflow/example-workflow.pdf differ diff --git a/lectures/example-workflow/figure_1_generode_pipeline_v7.png b/lectures/example-workflow/figure_1_generode_pipeline_v7.png new file mode 100644 index 0000000..026c701 Binary files /dev/null and b/lectures/example-workflow/figure_1_generode_pipeline_v7.png differ diff --git a/lectures/example-workflow/figure_2_generode_pipeline_v7.png b/lectures/example-workflow/figure_2_generode_pipeline_v7.png new file mode 100644 index 0000000..e2355bf Binary files /dev/null and b/lectures/example-workflow/figure_2_generode_pipeline_v7.png differ diff --git a/lectures/pepfiles/pepfiles.Rmd b/lectures/pepfiles/pepfiles.Rmd deleted file mode 100644 index 33c6247..0000000 --- a/lectures/pepfiles/pepfiles.Rmd +++ /dev/null @@ -1,335 +0,0 @@ ---- -title: "Configuring workflows via PEPs" -subtitle: "Snakemake BYOC NBIS course" -date: "`r format(Sys.time(), '%d %B, %Y')`" -output: - xaringan::moon_reader: - self-contained: true - seal: false - css: ["default", "../template.css"] - nature: - slideNumberFormat: "" ---- -layout: true - - ---- - -class: center, middle - -.HUGE[Configuring workflows via PEPs] - ---- - -# Snakemake and PEPs - -.SMALL[Since version 5.23.0, Snakemake supports configuration via PEP-files] - -```python -pepfile: "pep/config.yaml" -pepschema: "schemas/pep.yaml" - -rule all: - input: expand("results/{sample}.stats.txt", sample = pep.sample_table["sample_name"]) -``` - ---- - -# So what are PEPs? - -**P**ortable **E**ncapsulated **P**rojects (or [PEPs](http://pep.databio.org/en/latest/)) -is an attempt to standardize metadata and how it's used. - --- -- Written in `yaml` and `csv` format --- - -- Can be loaded by `R` (via `pepr`) and `python` (via `peppy`) --- - -- Now supported by `Snakemake` (with Nextflow next in line?) - ---- - -# Ok ok, but what is it _really_?? - --- - -## Rationale - -- A lot of time in bioinformatics revolves around organizing sample data - --- -- Each dataset is annotated in a unique way - --- -- and tools often expect sample information in a unique format - --- -- This limits .green[portability] and .green[reusability] of datasets. ---- - -# How do PEPs make my life easier? - -The idea is that PEPs help to: - -1. 
Standardize metadata so that it works for several projects and with several tools -2. Collaborate on and share metadata with others by using a common interface ---- - -# How does it work with Snakemake? - --- -- .small[Make sure you install `peppy` in the environment containing snakemake] - -```bash -conda install -c conda-forge peppy -``` --- - -- .small[Create a project config file, _e.g._ `pep/config.yaml`] - -```yaml -pep_version: 2.0.0 -sample_table: "config/samples.csv" -``` --- - -- .small[Create the sample table (`config/samples.csv`)] - -```bash -"sample_name","read1","read2" -"sample1","/path/to/datadir/sample1_R1.fast1.gz","/path/to/datadir/sample1_R2.fastq.gz" -"sample2","/path/to/datadir/sample2_R1.fast1.gz","/path/to/datadir/sample2_R2.fastq.gz" -``` ---- - -# How does it work with Snakemake? - -- .small[In your `Snakefile`, add the `pepfile:` directive] - -```python -pepfile: "pep/config.yaml" - -rule all: - input: expand("results/{sample}.bam", sample = pep.sample_table["sample_name"]) - -rule align: - input: - R1 = lambda wildcards: pep.get_sample(wildcards.sample)["read1"], - R2 = lambda wildcards: pep.get_sample(wildcards.sample)["read2"], - ref = "resources/genome/genome.fasta" - output: - "results{sample}.bam" - shell: - """ - someAlignmentTool -1 {input.R1} -2 {input.R2} -x {input.ref} > {output} - """ -``` ---- - -# Difference between pepfiles and workflow configfiles - -> a PEP should describe everything needed about the data, while a workflow and its configuration should describe everything needed about the analysis that is applied to it. ---- - -# What else can PEPs do? - --- - -**Example 1: Remove hardcoded paths from the sample file** - -`config/samples.csv` -```bash -"sample_name","read1","read2" -"sample1","/path/to/datadir/sample1_R1.fast1.gz","/path/to/datadir/sample1_R2.fastq.gz" -"sample2","/path/to/datadir/sample2_R1.fast1.gz","/path/to/datadir/sample2_R2.fastq.gz" -``` ---- - -# What else can PEPs do? - -**Example 1: Remove hardcoded paths from the sample file** - -`config/samples.csv` -```bash -"sample_name","read1","read2" -"sample1","source1","source2" -"sample2","source1","source2" -``` - -`pep/config.yaml` -```yaml -pep_version: 2.0.0 -sample_table: "config/samples.csv" -sample_modifiers: - derive: - attributes: [read1, read2] - sources: - source1: "data/{sample_name}_R1.fastq.gz" - source2: "data/{sample_name}_R2.fastq.gz" -``` - -- .green[Benefit]: If data is moved or if you want to share/publish the project -you simply update the project config - ---- - -# What else can PEPs do? - -**Example 2: Multiple input files for each sample** - -Say you have samples sequenced on more than one lane and you want to include all -files in the workflow. -```bash -data/ -├── L001 - ├── sample1_L001_R1.fastq.gz - ├── sample1_L001_R2.fastq.gz - ├── sample2_L001_R1.fastq.gz - └── sample2_L001_R2.fastq.gz -├── L002 - ├── sample1_L002_R1.fastq.gz - ├── sample1_L002_R2.fastq.gz - ├── sample2_L002_R1.fastq.gz - └── sample2_L002_R2.fastq.gz -``` - ---- - -# What else can PEPs do? - -**Example 2: Multiple input files for each sample** - -- Option1: Use wildcards in project config - -`pep/config.yaml` -```yaml -pep_version: 2.0.0 -sample_table: "../config/samples.csv" -sample_modifiers: - derive: - attributes: [read1, read2] - sources: - source1: "data/L*/{sample_name}_L*_R1.fastq.gz" - source2: "data/L*/{sample_name}_L*_R2.fastq.gz" -``` ---- - -# What else can PEPs do? 
- -**Example 2: Multiple input files for each sample** - -- Option2: Use a subsample table - -`pep/config.yaml` -```yaml -pep_version: 2.0.0 -sample_table: "config/samples.csv" -subsample_table: "config/subsample_table.csv" -``` --- - -`config/sample_table.csv` -```yaml -"sample_name" -"sample1" -"sample2" -``` - -`config/subsample_table.csv` -```yaml -"sample_name","read1","read2" -"sample1","data/L001/sample1_L001_R1.fastq.gz","data/L001/sample1_L001_R2.fastq.gz" -"sample1","data/L002/sample1_L002_R1.fastq.gz","data/L002/sample1_L002_R2.fastq.gz" -"sample2","data/L001/sample2_L001_R1.fastq.gz","data/L001/sample2_L001_R2.fastq.gz" -"sample2","data/L002/sample2_L002_R1.fastq.gz","data/L002/sample2_L002_R2.fastq.gz" -``` - --- -.green[Benefit]: Your main sample table can still describe one sample per line. - ---- - -# What else can PEPs do? - -**Example 3: Modify your project on the fly** - -Say you have a Snakemake project set up for a particular dataset and genomic reference. -Further down the line you get a new dataset that should use another genome -reference. - --- - -With PEPs you can easily .green[amend] your current project to point to the new -dataset _and_ use the other genome reference. - ---- - -# What else can PEPs do? - -**Example 3: Modify your project on the fly** - -`pep/config.yaml` -```yaml -pep_version: 2.0.0 -sample_table: "../config/samples.csv" -genome: ref1 -sample_modifiers: - derive: - attributes: [read1, read2] - sources: - source1: "data/L*/{sample_name}_L*_R1.fastq.gz" - source2: "data/L*/{sample_name}_L*_R2.fastq.gz" -``` - ---- - -# What else can PEPs do? - -**Example 3: Modify your project on the fly** - -`pep/config.yaml` -```yaml -pep_version: 2.0.0 -sample_table: "../config/samples.csv" -genome: ref1 -sample_modifiers: - derive: - attributes: [read1, read2] - sources: - source1: "data/L*/{sample_name}_L*_R1.fastq.gz" - source2: "data/L*/{sample_name}_L*_R2.fastq.gz" -project_modifiers: - amend: - batch2: - sample_table: "../config/samples_batch2.csv" - genome: ref2 -``` - --- - -Then activate that amendment if you want to use it in your workflow: - -```python -pepfile: "pep/config.yaml" -pep.activate_amendments("batch2") -``` - --- - -.green[Benefit]: Quickly change setup with little duplication - ---- - -# More resources - -- [Snakemake docs](https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#configuring-scientific-experiments-via-peps) -- [PEP docs](http://pep.databio.org/en/latest/) -- [peppy package](http://peppy.databio.org/en/latest/) -- [pepr package](http://code.databio.org/pepr/) - ---- - -class: center, middle -# Questions? 
diff --git a/lectures/scatter-gather/Snakefile b/lectures/scatter-gather/Snakefile
new file mode 100644
index 0000000..c1b6ec4
--- /dev/null
+++ b/lectures/scatter-gather/Snakefile
@@ -0,0 +1,52 @@
+import os
+
+samples = ["sample1", "sample2"]
+
+splits = 5
+scatteritems = range(1, splits+1)
+
+wildcard_constraints:
+    scatteritem = r"\d+",
+    sample = r"[\w\d\-\.]+"
+
+rule all:
+    input:
+        expand("{sample}.rc.fastq", sample = samples)
+
+rule scatter:
+    output:
+        expand("splits/{{sample}}/{{sample}}.{scatteritem}.fastq", scatteritem = scatteritems)
+    input:
+        "data/{sample}.fastq"
+    log:
+        "logs/{sample}.scatter.log"
+    conda:
+        "envs/seqkit.yml"
+    params:
+        parts = splits,
+        outdir = lambda wildcards, output: os.path.dirname(output[0])
+    shell:
+        """
+        seqkit split -p {params.parts} -O {params.outdir} {input} > {log} 2>&1
+        rename 's/part_0*//' {params.outdir}/{wildcards.sample}.*.fastq
+        """
+
+rule reversecomplement:
+    output:
+        "rc/{sample}/{sample}.{scatteritem}.rc.fastq"
+    input:
+        "splits/{sample}/{sample}.{scatteritem}.fastq"
+    conda:
+        "envs/seqkit.yml"
+    shell:
+        """
+        seqkit seq --reverse --complement {input} > {output}
+        """
+
+rule gather:
+    output:
+        "{sample}.rc.fastq"
+    input:
+        expand("rc/{{sample}}/{{sample}}.{scatteritem}.rc.fastq", scatteritem = scatteritems)
+    shell:
+        "cat {input} > {output}"
\ No newline at end of file
diff --git a/lectures/scatter-gather/dag_scatter.png b/lectures/scatter-gather/dag_scatter.png
new file mode 100644
index 0000000..13cb16f
Binary files /dev/null and b/lectures/scatter-gather/dag_scatter.png differ
diff --git a/lectures/scatter-gather/data/sample1.fastq b/lectures/scatter-gather/data/sample1.fastq
new file mode 100644
index 0000000..6eea82b
--- /dev/null
+++ b/lectures/scatter-gather/data/sample1.fastq
@@ -0,0 +1,400 @@
+@ERR9875491.5.2 H8:1:H32YCBCX2:2:1101:17975:1993 length=151
+TTGTACACACCGCCCGTCATACCATGGAAGTGGATTGCACCAGAAGTAGATAGTCTAACCTTAGGGAGGGCGTTTACCACGGTGTGCTTCATGACTGGGGTAAAGTCGTAACAAGGTAGCCGTAGGTGAACCTGCAGAAGGAGATCGGAAG
++
+DDDDDIHIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHIIIIIIIIIIIIIIIIIIIIIIIIHIIIIIIIIIIIIIIIIIIIIIIIIIIHIGHIIIIIIIIIIIIIIIIIIIIIIIIIIHIIIIIIIIIIIIIHGIIIIIC
+@ERR9875491.71.1 H8:1:H32YCBCX2:2:1101:17861:2474 length=151
+CCTTCCGCAGGTTCACCTACGGAAACCTTGTTACAACTTCTCCTTCCTCTAAGCAATAAGGTTTACTTGACTATCCAAAACGATCTAAGTTGCCCTAGAAAGCTTCGGTCCGAAATATTCACCGGATTGCTCAATCGGTAGGAGCGACGGG
++
+@BDD@HFIICCC