Stubs and DAG file (#39)

* add nextflow schema * remove hidden fields * tweak the schema * format files using intellij * [FIX] Syncing stubs and scripts (#38) * Add a basic nextflow_schema file (#37) * add nextflow schema * remove hidden fields * tweak the schema * fixing all stubs as requested * adding stubs to rename_files module Co-authored-by: Abhinav Sharma <[email protected]> * update readme and trim config * update the readme * tweak readme * Further updates for the readme * update readme * updated the benefits for NF wrapper * [DEV] adding dag resources (#41) * adding dag as png * adding a topic to readme * fixing wrong format * fixing `dag-batch.png` Co-authored-by: Davi Marcon <[email protected]>
mycobactopia-org · Sep 9, 2021 · 59f7650 · 59f7650
1 parent 9d00768
commit 59f7650
Show file tree

Hide file tree

Showing 24 changed files with 265 additions and 208 deletions.
diff --git a/README.md b/README.md
@@ -1,11 +1,55 @@
-# Mtbseq-nf
+# mtbseq-nf
 
-**NOTE**: This still a Work in progress!
+**NOTE: This is still a work in progress. The code is functional; however, the [usage](#Usage) doc needs some love!**
 
-Project with aim to create a nextflow wrapper for mtbseq workflow.
+[MTBseq](https://github.com/ngs-fzb/MTBseq_source) made simple and easy using [Nextflow](https://www.nextflow.io/) workflow manager. 
 
-Heavily based on [ngs-fzb/MTBseq_source](https://github.com/ngs-fzb/MTBseq_source)
+# Benefits of the Nextflow wrapper
+
+- Fine-grained control over resource allocation (CPU/Memory/Storage)
+- Reliance on bioconda and biocontainers for installing packages, for reproducibility
+- Ease of use on a range of infrastructure (cloud/on-prem clusters/local machine)
+- Resumability for failed processes
+- Centralized locations for specifying 
+    - Hardware requirements (`conf/standard.config`)
+    - Software requirements (`conf/docker.config` or `conf/conda.config`)
+    - MTBseq parameters (`conf/global_parameters.config`)
+- Dedicated user interface for all parameters for wider audience (`nextflow_schema.json`)
+- Explicit rather than implicit options (`conf/global_parameters.config`)
+- Ability to analyze genomes in **parallel** as well as in **batch** 
+
+# Workflow example
+This pipeline has two execution types, batch and parallel; a DAG example of each is shown below.
+
+The execution type is determined by the `analysis_mode` parameter.
+
+## Batch
+![batch-workflow](./resources/dag-batch.png)
+
+## Parallel
+![parallel-workflow](./resources/dag-parallel.png)
+
+
+
+# Usage 
+
+## Nextflow command line
+
+
+## Nextflow Tower 
+
+
+# Contributions
 
 Contributions are warmly accepted!
 
-We would like to Thank the developers of MTBseq!
+
+# License
+
+
+The inspiration for this project, [MTBseq](https://github.com/ngs-fzb/MTBseq_source), has a GPL-3 license as of [v1.0.3](https://github.com/ngs-fzb/MTBseq_source/blob/v1.0.3/LICENSE.md).
+
+The components related to `mtbseq-nf` project itself (the Nextflow wrapper code) are licensed under the liberal MPL-2.0 license.
+
+We would like to thank the developers of MTBseq for putting in the initial effort!
+
diff --git a/conf/conda.config b/conf/conda.config
@@ -1,6 +1,6 @@
 process {
     withName:
-        ".*" {
-        conda = "mtbseq"
-            }
+    ".*" {
+        conda = 'bioconda::mtbseq=1.0.3' // conda spec syntax is channel::package=version
     }
+}
diff --git a/conf/docker.config b/conf/docker.config
@@ -3,6 +3,6 @@ docker.enabled = true
 process {
     withName:
     ".*" {
-    container = 'quay.io/biocontainers/mtbseq:1.0.3--pl526_1'
-            }
+        container = 'quay.io/biocontainers/mtbseq:1.0.3--pl526_1'
     }
+}
diff --git a/conf/global_params.config b/conf/global_params.config
@@ -53,7 +53,7 @@ TBVARIANTS {
     mincovr = 4
     minphred = 4
     minfreq = 75
- }
+}
 
 TBSTATS {
     project_name = params.project_name
@@ -71,7 +71,7 @@ TBSTRAINS {
     mincovr = 4
     minphred = 4
     minfreq = 75
- }
+}
 
 TBJOIN {
     results_dir = params.outdir
@@ -82,7 +82,7 @@ TBJOIN {
     mincovr = 4
     minphred = 4
     minfreq = 75
- }
+}
 
 TBAMEND {
     results_dir = params.outdir
@@ -96,7 +96,7 @@ TBAMEND {
     unambig = 95
     window = 12
     distance = 12
- }
+}
 
 TBGROUPS {
     results_dir = params.outdir
@@ -136,3 +136,76 @@ RENAME_FILES {
     save_mode = 'copy'
     should_publish = true
 }
+//
+//// If a module was chosen with the --step OPTION, the --continue OPTION ensures that the pipeline will continue the analysis with downstream modules. This is automatically set if the --step OPTION is set to the VALUE TBfull.
+//// continue
+//
+//// This OPTION requires a user supplied file specifying a set of datasets (e.g. samples.txt) as VALUE. The file must be a two-column, tab-separated file. Column 1 has to be your [SampleID]. Column 2 has to be your [LibID]. TBjoin requires this OPTION to be set.
+//samples = "samples.tsv"
+//
+//// This OPTION allows the user to set a project name for the steps TBjoin, TBamend and TBgroups. If you do not supply a project name, [NONE] is used as a default value.
+//project
+//
+//// This OPTION sets the reference genome for the read mapping. By default, the genome of Mycobacterium tuberculosis H37Rv (NC_000962.3) is set as reference. User supplied FASTA files for other reference genomes should be placed in the directory /MTBseq_source/var/ref/, and the respective name given without .fasta extension. Please be aware that for other reference genomes, you need to provide the respective annotation files as well or annotations will be skipped.
+//ref
+//
+//// This OPTION sets a list of known variant positions associated to drug resistance for resistance prediction. Give the full path to the file. The required structure of the file can be seen here: /MTBseq_source/var/res/MTB_Resistance_Mediating.txt
+//resilist
+//
+//// This OPTION sets a list of interesting regions to be used for annotation of detected variants. Give the full path to the file. The required structure of the file can be seen here: /MTBseq_source/var/res/MTB_Extended_Resistance_Mediating.txt
+//intregions
+//
+//// This OPTION specifies a gene categories file to annotate essential and non-essential genes as well as repetitive regions. SNPs in repetitive regions will be excluded for phylogenetic analysis. Give the full path to the file. The required structure of the file can be seen here: /MTBseq_source/var/cat/MTB_Gene_Categories.txt
+//categories
+//
+//// This OPTION specifies a file for base quality recalibration. The list must be in VCF format and should contain known SNPs. Give the full path to the file. The required structure of the file can be seen here: /MTBseq_source/var/res/MTB_Base_Calibration_List.vcf
+//basecalib
+//
+//// This OPTION is used in the modules TBvariants, TBstats, TBjoin, and TBstrains. By default, the OPTION is not active. Setting this OPTION will skip all filtering steps and report the calculated information for all positions in the input file.
+//all_vars = false
+//
+//// This OPTION is used in TBvariants, TBstats, TBjoin, and TBstrains. By default, the OPTION is not active. Setting this OPTION will add an additional filter that excludes all variants except SNPs.
+//snp_vars = false
+//
+//// This OPTION is used in TBvariants, TBstats, TBjoin, and TBstrains. By default, the OPTION is not active. Setting this OPTION has major implications on how the mapping data for each position is processed. By default, the majority allele is called and taken for further calculations. If the --lowfreq_vars OPTION is set, MTBseq will consider the majority allele distinct from wild type, if such an allele is present. This means that only in this detection mode, MTBseq will report variants present only in subpopulations, i.e. low frequency mutations. Of course, OPTIONS --mincovf, --mincovr, --minphred20, and --minfreq need to be set accordingly. Please be aware that output generated in this detection mode should not be used for phylogenetic analysis.
+//lowfreq_vars = false
+//
+//// This OPTION is used in TBlist. By default, the OPTION is set to 13. The OPTION sets a threshold for the sequence data quality to be used for the mpileup creation.
+//minbqual = 13
+//
+//// This OPTION is used in TBvariants, TBjoin, TBamend, and TBstrains. By default, the OPTION is set to 4. The OPTION sets a minimum forward read coverage threshold. Alleles must have a forward coverage of this VALUE or higher to be considered.
+//mincovf = 4
+//
+//// This OPTION is used in TBvariants, TBjoin, TBamend, and TBstrains. By default, the OPTION is set to 4. The OPTION sets a minimum reverse read coverage threshold. Alleles must have a reverse coverage of this VALUE or higher to be considered.
+//mincovr = 4
+//
+//// This OPTION is used in TBvariants, TBjoin, TBamend, and TBstrains. By default, the OPTION is set to 4. The OPTION sets a minimum number of reads indicating an allele with a phred score of at least 20.
+//minphred = 4
+//
+//// This OPTION is used in TBvariants, TBjoin, TBamend, and TBstrains. By default, the OPTION is set to 75. The OPTION sets a minimum frequency for an allele.
+//minfreq
+//
+//// This OPTION is used in TBamend. By default, the OPTION is set to 95. The option sets a minimum percentage of samples with unambiguous information for position.
+//unambig
+//
+//// This OPTION is used in TBamend. By default, the OPTION is set to 12. The OPTION sets a window size in which the algorithm scans for the occurrence of multiple variants within the same sample. If more than one variant occurs within this window in the same sample, the positions will be excluded.
+//window
+//
+//// This OPTION is used in TBgroups. By default, the OPTION is set to 12. The OPTION sets a SNP distance that is used to classify samples into groups of samples, using agglomerative clustering. If SNP distances between samples are less than or equal to this VALUE, they are grouped together.
+//distance
+//
+//// This OPTION turns off the display logging function and will report the logging only in a file, called "MTBseq_[DATE]_[USER].log".
+//quiet
+//
+//// This OPTION is used in TBbwa, TBmerge, TBrefine, TBpile and TBlist. By default, the OPTION is set to 1. The OPTION sets the maximum number of CPUs to use within the pipeline. You can use more than one core in order to execute the pipeline faster. 8 is the current maximum.
+//threads
+//
+//// This OPTION will show you all available OPTIONs and corresponding VALUEs used by MTBseq.
+//help
+//
+//// This OPTION will show you the current version of MTBseq.
+//version
+//
+//// This OPTION will check the dependencies of MTBseq.
+//check
+
diff --git a/conf/params.nf b/conf/params.nf
diff --git a/conf/standard.config b/conf/standard.config
@@ -2,76 +2,70 @@ params {
     outdir = "${baseDir}/results"
     gatk38_jar = "${baseDir}/resources/GenomeAnalysisTK.jar"
     reads = "${baseDir}/data/full_data/*_{R1,R2}*fastq.gz"
-    }
+}
 
 process {
     errorStrategy = 'retry'
     maxRetries = 3
 
     withName:
-        "TBREFINE" {
-            cpus = 8
-            memory = "15 GB"
-    }
-
-    withName:
-        "TBBWA" {
-            cpus = 8
-            memory = "15 GB"
+    "TBREFINE" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBVARIANTS" {
-            cpus = 8
-            memory = "15 GB"
+    "TBBWA" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBPILE" {
-            cpus = 8
-            memory = "15 GB"
+    "TBVARIANTS" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "MTBSEQ" {
-            cpus = 8
-            memory = "15 GB"
+    "TBPILE" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBSTRAINS" {
-            cpus = 8
-            memory = "15 GB"
+    "TBSTRAINS" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBLIST" {
-            cpus = 8
-            memory = "15 GB"
+    "TBLIST" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBSTATS" {
-            cpus = 8
-            memory = "15 GB"
+    "TBSTATS" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBAMEND" {
-            cpus = 8
-            memory = "15 GB"
+    "TBAMEND" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBGROUPS" {
-            cpus = 8
-            memory = "15 GB"
+    "TBGROUPS" {
+        cpus = 8
+        memory = "15 GB"
     }
 
     withName:
-        "TBJOIN" {
-            cpus = 8
-            memory = "15 GB"
+    "TBJOIN" {
+        cpus = 8
+        memory = "15 GB"
     }
 
 }