From a7e54aa5bdc6678bbb31830a03f36ee884382cc6 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 16 Aug 2018 11:59:02 +0200 Subject: [PATCH 01/75] removing spurious VEP directory --- annotate.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotate.nf b/annotate.nf index d3571a15cd..47bb8e4214 100644 --- a/annotate.nf +++ b/annotate.nf @@ -228,7 +228,7 @@ process RunVEP { finalannotator = annotator == "snpeff" ? 'merge' : 'vep' genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome """ - vep --dir /opt/vep/.vep/ \ + vep \ -i ${vcf} \ -o ${vcf.simpleName}_VEP.ann.vcf \ --assembly ${genome} \ From ae126aece4a96509b0d83b226337fefebee1e42e Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 4 Sep 2018 10:50:24 +0200 Subject: [PATCH 02/75] Strelka targeted is working fine --- germlineVC.nf | 43 +++++++++++++++++++++++---------------- lib/SarekUtils.groovy | 2 ++ somaticVC.nf | 47 ++++++++++++++++++++++++++----------------- 3 files changed, 57 insertions(+), 35 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index 4dd71fb528..50d37d5569 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -454,23 +454,32 @@ process RunSingleStrelka { when: 'strelka' in tools && !params.onlyQC script: - """ - configureStrelkaGermlineWorkflow.py \ - --bam ${bam} \ - --referenceFasta ${genomeFile} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - - mv Strelka/results/variants/genome.*.vcf.gz \ - Strelka_${idSample}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi \ - Strelka_${idSample}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz \ - Strelka_${idSample}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi \ - Strelka_${idSample}_variants.vcf.gz.tbi - """ + """ + if ![ -s "${params.targetBED}" ]; then + # do WGS + configureStrelkaGermlineWorkflow.py \ + --bam ${bam} \ + --referenceFasta ${genomeFile} \ + --runDir Strelka + else + # WES or targeted + bgzip 
--threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz + tabix call_targets.bed.gz + configureStrelkaGermlineWorkflow.py \ + --bam ${bam} \ + --referenceFasta ${genomeFile} \ + --exome \ + --callRegions call_targets.bed.gz \ + --runDir Strelka + fi + + # always run this part + python Strelka/runWorkflow.py -m local -j ${task.cpus} + mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz + mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi + mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz + mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi + """ } if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy index 2fa31868dd..3c921be757 100644 --- a/lib/SarekUtils.groovy +++ b/lib/SarekUtils.groovy @@ -88,6 +88,8 @@ class SarekUtils { 'strelka-BP', 'strelkaBP', 'tag', + 'target-BED', + 'targetBED', 'test', 'tools', 'total-memory', diff --git a/somaticVC.nf b/somaticVC.nf index 0e2347bb0e..98b05aae25 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -449,24 +449,35 @@ process RunStrelka { when: 'strelka' in tools && !params.onlyQC script: - """ - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${genomeFile} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - - mv Strelka/results/variants/somatic.indels.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz - mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi - mv Strelka/results/variants/somatic.snvs.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz - mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ + """ + if ![ -s 
"${params.targetBED}" ]; then + # do WGS + configureStrelkaSomaticWorkflow.py \ + --tumor ${bamTumor} \ + --normal ${bamNormal} \ + --referenceFasta ${genomeFile} \ + --runDir Strelka + else + # WES or targeted + bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz + tabix call_targets.bed.gz + configureStrelkaSomaticWorkflow.py \ + --tumor ${bamTumor} \ + --normal ${bamNormal} \ + --referenceFasta ${genomeFile} \ + --exome \ + --callRegions call_targets.bed.gz \ + --runDir Strelka + fi + + python Strelka/runWorkflow.py -m local -j ${task.cpus} + # always run this part + + mv Strelka/results/variants/somatic.indels.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz + mv Strelka/results/variants/somatic.indels.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi + mv Strelka/results/variants/somatic.snvs.vcf.gz Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz + mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi + """ } if (params.verbose) strelkaOutput = strelkaOutput.view { From 5082a7a8b6cace701ac06e6c962da78d8af4bba3 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 6 Sep 2018 10:00:26 +0200 Subject: [PATCH 03/75] changes to ConcatVCF to accomodate bcftools isec in germline targets --- germlineVC.nf | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index 50d37d5569..afaf562b8e 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -376,7 +376,8 @@ process ConcatVCF { file(genomeIndex) from Channel.value(referenceMap.genomeIndex) output: - set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated + // we have this funny *_* pattern to avoid copying the raw calls to publishdir + set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), 
file("*_*.vcf.gz.tbi") into vcfConcatenated when: ( 'haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC @@ -424,8 +425,17 @@ process ConcatVCF { tail -n +\$((L+1)) \${vcf} done done - ) | bgzip > ${outputFile}.gz - tabix ${outputFile}.gz + ) | bgzip -@${task.cpus} > rawcalls.vcf.gz + tabix rawcalls.vcf.gz + + # now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided + if [ -s "${params.targetBED}" ]; then + bcftools isec --targets-file ${params.targetBED} rawcalls.vcf.gz | bgzip -@${task.cpus} > ${outputFile}.gz + tabix ${outputFile}.gz + else + # simply rename the raw calls as WGS results + for f in rawcalls*; do mv -v \$f ${outputFile}\${f#rawcalls}; done + fi """ } From f980e11668e72eb957be8f4d10ea054e83eaa62e Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 6 Sep 2018 10:21:30 +0200 Subject: [PATCH 04/75] concatenateVCF.sh is now a separate script to avoid code duplication --- germlineVC.nf | 49 +------------------------------- scripts/concatenateVCFs.sh | 58 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 48 deletions(-) create mode 100755 scripts/concatenateVCFs.sh diff --git a/germlineVC.nf b/germlineVC.nf index afaf562b8e..f557884010 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -388,54 +388,7 @@ process ConcatVCF { else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" """ - set -euo pipefail - # first make a header from one of the VCF intervals - # get rid of interval information only from the GATK command-line, but leave the rest - FIRSTVCF=\$(ls *.vcf | head -n 1) - sed -n '/^[^#]/q;p' \$FIRSTVCF | \ - awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if(\$i!~/intervals=/ && \$i !~ /out=/){printf("%s ",\$i)}}printf("\\n")}' \ - > header - - # Get list of contigs from the FASTA index (.fai). 
We cannot use the ##contig - # header in the VCF as it is optional (FreeBayes does not save it, for example) - CONTIGS=(\$(cut -f1 ${genomeIndex})) - - # concatenate VCFs in the correct order - ( - cat header - - for chr in "\${CONTIGS[@]}"; do - # Skip if globbing would not match any file to avoid errors such as - # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 - # was not processed. - pattern="\${chr}_*.vcf" - if ! compgen -G "\${pattern}" > /dev/null; then continue; fi - - # ls -v sorts by numeric value ("version"), which means that chr1_100_ - # is sorted *after* chr1_99_. - for vcf in \$(ls -v \${pattern}); do - # Determine length of header. - # The 'q' command makes sed exit when it sees the first non-header - # line, which avoids reading in the entire file. - L=\$(sed -n '/^[^#]/q;p' \${vcf} | wc -l) - - # Then print all non-header lines. Since tail is very fast (nearly as - # fast as cat), this is way more efficient than using a single sed, - # awk or grep command. 
- tail -n +\$((L+1)) \${vcf} - done - done - ) | bgzip -@${task.cpus} > rawcalls.vcf.gz - tabix rawcalls.vcf.gz - - # now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided - if [ -s "${params.targetBED}" ]; then - bcftools isec --targets-file ${params.targetBED} rawcalls.vcf.gz | bgzip -@${task.cpus} > ${outputFile}.gz - tabix ${outputFile}.gz - else - # simply rename the raw calls as WGS results - for f in rawcalls*; do mv -v \$f ${outputFile}\${f#rawcalls}; done - fi + ${workflow.projectDir}/scripts/concatenateVCFs.sh ${genomeIndex} ${task.cpus} ${outputFile} ${params.targetBED} """ } diff --git a/scripts/concatenateVCFs.sh b/scripts/concatenateVCFs.sh new file mode 100755 index 0000000000..1b98be961c --- /dev/null +++ b/scripts/concatenateVCFs.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# this script concatenates all VCFs that are in the local directory: the +# purpose is to make a single VCF from all the VCFs that were created from different intervals + +set -euo pipefail + +genomeIndex=$1 +cpus=$2 +outputFile=$3 +targetBED=$4 +# first make a header from one of the VCF intervals +# get rid of interval information only from the GATK command-line, but leave the rest +FIRSTVCF=$(ls *.vcf | head -n 1) +sed -n '/^[^#]/q;p' $FIRSTVCF | \ +awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\\n")}' \ +> header + +# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig +# header in the VCF as it is optional (FreeBayes does not save it, for example) +CONTIGS=($(cut -f1 ${genomeIndex})) + +# concatenate VCFs in the correct order +( + cat header + + for chr in "${CONTIGS[@]}"; do + # Skip if globbing would not match any file to avoid errors such as + # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 + # was not processed. + pattern="${chr}_*.vcf" + if ! 
compgen -G "${pattern}" > /dev/null; then continue; fi + + # ls -v sorts by numeric value ("version"), which means that chr1_100_ + # is sorted *after* chr1_99_. + for vcf in $(ls -v ${pattern}); do + # Determine length of header. + # The 'q' command makes sed exit when it sees the first non-header + # line, which avoids reading in the entire file. + L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l) + + # Then print all non-header lines. Since tail is very fast (nearly as + # fast as cat), this is way more efficient than using a single sed, + # awk or grep command. + tail -n +$((L+1)) ${vcf} + done + done +) | bgzip -@${cpus} > rawcalls.vcf.gz +tabix rawcalls.vcf.gz + +# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided +if [ -s "${targetBED}" ]; then + bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz + tabix ${outputFile}.gz +else + # simply rename the raw calls as WGS results + for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls}; done +fi + From df4d8d1e207c6106283b7b9a08e970633e2c6d99 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 6 Sep 2018 13:02:47 +0200 Subject: [PATCH 05/75] added concatOptions and Strelka fix --- germlineVC.nf | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index f557884010..7a83f3c162 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -387,9 +387,14 @@ process ConcatVCF { else if (variantCaller == 'gvcf-hc') outputFile = "haplotypecaller_${idSampleNormal}.g.vcf" else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - """ - ${workflow.projectDir}/scripts/concatenateVCFs.sh ${genomeIndex} ${task.cpus} ${outputFile} ${params.targetBED} - """ + if(params.targetBED) // targeted + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}" + else // WGS + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o 
${outputFile} " + + """ + ${workflow.projectDir}/scripts/concatenateVCFs.sh ${concatOptions} + """ } if (params.verbose) vcfConcatenated = vcfConcatenated.view { @@ -418,7 +423,7 @@ process RunSingleStrelka { script: """ - if ![ -s "${params.targetBED}" ]; then + if [ ! -s "${params.targetBED}" ]; then # do WGS configureStrelkaGermlineWorkflow.py \ --bam ${bam} \ From 60d15cc6432cf82f940c5578dd0ff9f7cfad8ee8 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 6 Sep 2018 13:04:48 +0200 Subject: [PATCH 06/75] added getopts, fixed existence check and set +u --- scripts/concatenateVCFs.sh | 39 ++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/scripts/concatenateVCFs.sh b/scripts/concatenateVCFs.sh index 1b98be961c..6e3ff73d79 100755 --- a/scripts/concatenateVCFs.sh +++ b/scripts/concatenateVCFs.sh @@ -2,12 +2,35 @@ # this script concatenates all VCFs that are in the local directory: the # purpose is to make a single VCF from all the VCFs that were created from different intervals +usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; } + +while getopts "i:c:o:t:" p; do + case "${p}" in + i) + genomeIndex=${OPTARG} + ;; + c) + cpus=${OPTARG} + ;; + o) + outputFile=${OPTARG} + ;; + t) + targetBED=${OPTARG} + ;; + *) + usage + ;; + esac +done +shift $((OPTIND-1)) + +if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi +if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi +if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi + set -euo pipefail -genomeIndex=$1 -cpus=$2 -outputFile=$3 -targetBED=$4 # first make a header from one of the VCF intervals # get rid of interval information only from the GATK command-line, but leave the rest FIRSTVCF=$(ls *.vcf | head -n 1) @@ -47,12 +70,16 @@ CONTIGS=($(cut -f1 ${genomeIndex})) ) | bgzip -@${cpus} > rawcalls.vcf.gz tabix rawcalls.vcf.gz +set +u + # 
now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided -if [ -s "${targetBED}" ]; then +echo "target is $targetBED" +if [ ! -z ${targetBED+x} ]; then + echo "Selecting subset..." bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz tabix ${outputFile}.gz else # simply rename the raw calls as WGS results - for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls}; done + for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done fi From 9ca0c9341615ef160ae69211aab93e4ccfe2ac78 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 6 Sep 2018 15:09:04 +0200 Subject: [PATCH 07/75] Somatic ConcatVCF also simplified --- somaticVC.nf | 51 ++++++++++----------------------------------------- 1 file changed, 10 insertions(+), 41 deletions(-) diff --git a/somaticVC.nf b/somaticVC.nf index 98b05aae25..b5b2fbc9db 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -374,53 +374,22 @@ process ConcatVCF { file(genomeIndex) from Channel.value(referenceMap.genomeIndex) output: - set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated + // we have this funny *_* pattern to avoid copying the raw calls to publishdir + set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated + // TODO DRY with ConcatVCF when: ( 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC script: outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf" - """ - set -euo pipefail - # first make a header from one of the VCF intervals - # get rid of interval information only from the GATK command-line, but leave the rest - FIRSTVCF=\$(ls *.vcf | head -n 1) - sed -n '/^[^#]/q;p' \$FIRSTVCF | \ - awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if(\$i!~/intervals=/ && \$i !~ /out=/){printf("%s ",\$i)}}printf("\\n")}' \ - > 
header - - # Get list of contigs from the FASTA index (.fai). We cannot use the ##contig - # header in the VCF as it is optional (FreeBayes does not save it, for example) - CONTIGS=(\$(cut -f1 ${genomeIndex})) - - # concatenate VCFs in the correct order - ( - cat header - - for chr in "\${CONTIGS[@]}"; do - # Skip if globbing would not match any file to avoid errors such as - # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 - # was not processed. - pattern="\${chr}_*.vcf" - if ! compgen -G "\${pattern}" > /dev/null; then continue; fi - - # ls -v sorts by numeric value ("version"), which means that chr1_100_ - # is sorted *after* chr1_99_. - for vcf in \$(ls -v \${pattern}); do - # Determine length of header. - # The 'q' command makes sed exit when it sees the first non-header - # line, which avoids reading in the entire file. - L=\$(sed -n '/^[^#]/q;p' \${vcf} | wc -l) - - # Then print all non-header lines. Since tail is very fast (nearly as - # fast as cat), this is way more efficient than using a single sed, - # awk or grep command. 
- tail -n +\$((L+1)) \${vcf} - done - done - ) | bgzip > ${outputFile}.gz - tabix ${outputFile}.gz + if(params.targetBED) // targeted + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}" + else // WGS + concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " + + """ + ${workflow.projectDir}/scripts/concatenateVCFs.sh ${concatOptions} """ } From 25ecf7ae40db29e9b34330fd6fd31a48c9972b96 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:03:00 +0200 Subject: [PATCH 08/75] update Singularity with new link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f5301e456..6a1b912d12 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Previously known as the Cancer Analysis Workflow (CAW), Sarek is a workflow designed to run analyses on WGS data from regular samples or tumour / normal pairs, including relapse samples if required. It's built using [Nextflow][nextflow-link], a domain specific language for workflow building. -Software dependencies are handled using [Docker](https://www.docker.com) or [Singularity](http://singularity.lbl.gov) - container technologies that provide excellent reproducibility and ease of use. +Software dependencies are handled using [Docker](https://www.docker.com) or [Singularity](https://www.sylabs.io/singularity/) - container technologies that provide excellent reproducibility and ease of use. Singularity has been designed specifically for high-performance computing environments. This means that although Sarek has been primarily designed for use with the Swedish [UPPMAX HPC systems](https://www.uppmax.uu.se), it should be able to run on any system that supports these two tools. 
From 6566d364b4667dcd83aa1ec79338439e5f9b1075 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:04:48 +0200 Subject: [PATCH 09/75] add bio.tools to checklist [skip ci] --- .github/RELEASE_CHECKLIST.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md index 58ffe43635..56ecb2d715 100644 --- a/.github/RELEASE_CHECKLIST.md +++ b/.github/RELEASE_CHECKLIST.md @@ -18,6 +18,7 @@ This checklist is for our own reference 6. Commit and push version updates 7. Make a [release](https://github.com/SciLifeLab/Sarek/releases) on GitHub 8. Choose an appropriate codename for the release -9. Tweet that new version is released -10. Commit and push. Continue making more awesome :metal: -11. Have fika :cake: +9. Update [bio.tools](https://bio.tools/Sarek) with the new release +10. Tweet that new version is released +11. Commit and push. Continue making more awesome :metal: +12. Have fika :cake: From fa8aec89cf58ab3b005f955f8df33c4d7e04cf1c Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 6 Sep 2018 16:07:21 +0200 Subject: [PATCH 10/75] updated documentation --- CHANGELOG.md | 1 + README.md | 2 +- docs/USE_CASES.md | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52b7f981da..d3bdd76645 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation - [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule +- [#635](https://github.com/SciLifeLab/Sarek/pull/635) - To process targeted sequencing with a target BED ### `Changed` - [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version diff --git a/README.md b/README.md index 9f5301e456..ae7f5c3d3f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # [![Sarek](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/docs/images/Sarek_logo.png "Sarek")](http://opensource.scilifelab.se/projects/sarek/) -#### An open-source analysis pipeline to detect germline or somatic variants from whole genome sequencing +#### An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing [![Nextflow version][nextflow-badge]][nextflow-link] [![Travis build status][travis-badge]][travis-link] diff --git a/docs/USE_CASES.md b/docs/USE_CASES.md index b894d79498..c690be3a70 100644 --- a/docs/USE_CASES.md +++ b/docs/USE_CASES.md @@ -189,3 +189,13 @@ SUBJECT_ID XX 1 SAMPLEIDR /samples/SAMPLEIDR.bam /samples/SAMPLEIDR ``` If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file. This is the case if HaplotypeCaller was the only tool (recalibration is done on-the-fly with HaplotypeCaller to improve performance and save space). In this case, you need to start with `--step=recalibrate` (see previous section). + +## Processing targeted (whole exome or panel) sequencing data + +The recommended flow for targeted sequencing data is to use the whole genome workflow as it is, but also provide a BED file containing targets for variant calling. 
+The Strelka part of the workflow will pick up these intervals, and activate the `--exome` flag to process deeper coverage. It is advised to pad the variant calling +regions (exons or the target) to some extent before submitting to the workflow. To add the target BED file configure the flow like: + +```bash +nextflow run SciLifeLab/Sarek/germlineVC.nf --tools haplotypecaller,strelka,mutect2 --targetBED targets.bed --sample my_panel.tsv +``` From 6944fb14b31c031664056250fa18200ced21c9d4 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:11:44 +0200 Subject: [PATCH 11/75] update Singularity link + fix link to config file [fix ci] --- docs/INSTALL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/INSTALL.md b/docs/INSTALL.md index c07c442658..250ee0a16e 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -8,7 +8,7 @@ You can use a small reference genome as testing. - See the [Install Nextflow documentation](https://www.nextflow.io/docs/latest/getstarted.html#installation) - See the [Reference files documentation](REFERENCES.md) - See the [Install Docker documentation](https://docs.docker.com/engine/installation/linux/ubuntu/#install-docker) -- See the [Install Singularity documentation](http://singularity.lbl.gov/install-linux) +- See the [Install Singularity documentation](https://www.sylabs.io/guides/2.6/user-guide/quick_start.html#installation) ## Installation @@ -18,7 +18,7 @@ Nextflow will automatically fetch Sarek from GitHub when launched if `SciLifeLab Sarek use Singularity containers to package all the different tools. -If you plan to use the automatic pull of Singularity images, you can use the [`singularity.config`](../configuration/singularity.config) configuration file. You can also set up the Nextflow environnement variable `NXF_SINGULARITY_CACHEDIR` to choose where to store them. 
+If you plan to use the automatic pull of Singularity images, you can use the [`singularity.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/singularity.config) configuration file. You can also set up the Nextflow environnement variable `NXF_SINGULARITY_CACHEDIR` to choose where to store them. For example ```bash From e06b7e14354323e1e9e18a1a29dd9f0ac3fc6b29 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:16:34 +0200 Subject: [PATCH 12/75] remove old documentation + remove old comments --- buildReferences.nf | 1 - docs/REFERENCES.md | 12 ++---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/buildReferences.nf b/buildReferences.nf index 8403d3de06..ed26c2ae25 100644 --- a/buildReferences.nf +++ b/buildReferences.nf @@ -26,7 +26,6 @@ kate: syntax groovy; space-indent on; indent-width 2; https://github.com/SciLifeLab/Sarek/README.md -------------------------------------------------------------------------------- Processes overview - - ProcessReference - Download all references if needed - DecompressFile - Extract files if needed - BuildBWAindexes - Build indexes for BWA - BuildReferenceIndex - Build index for FASTA refs diff --git a/docs/REFERENCES.md b/docs/REFERENCES.md index 084eb14696..8e0ad12ce9 100644 --- a/docs/REFERENCES.md +++ b/docs/REFERENCES.md @@ -72,20 +72,12 @@ Use `--genome smallGRCh37` to map against a small reference genome based on GRCh The `buildReferences.nf` script can download and build the files needed for smallGRCh37, or build the references for GRCh37/smallGRCh37. -### `--download` - -Only with `--genome smallGRCh37`. If this option is specify, the [`smallRef`](https://github.com/szilvajuhos/smallRef) repository will be automatically downloaded from GitHub. Not to be used on UPPMAX cluster Bianca or on similarly secured clusters where such things are not working/allowed. 
- -``` -nextflow run buildReferences.nf --download --genome smallGRCh37 -``` - ### `--refDir` -Use `--refDir ` to specify where are the files to process. +Use `--refDir ` to specify where are the files to process. ``` -nextflow run buildReferences.nf --refDir --genome +nextflow run buildReferences.nf --refDir --genome ``` ### `--genome` From 4c4fe5e851ab4acf0182efa444be81a9a04d4a84 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:17:58 +0200 Subject: [PATCH 13/75] fix path to files [skip ci] --- docs/CONFIG.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/CONFIG.md b/docs/CONFIG.md index 01b6a6e4fe..85cae631a8 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -12,41 +12,41 @@ The standard ones are designed to work on a Swedish UPPMAX clusters, and can be Every configuration file can be modified for your own use. If you want you can specify the use of a config file using `-c ` -### [`containers.config`](https://github.com/SciLifeLab/Sarek/blob/master/configuration/containers.config) +### [`containers.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/containers.config) Define Containers for all process. Images will be pulled automatically. Use in your own profile if needed. -### [`docker.config`](https://github.com/SciLifeLab/Sarek/blob/master/configuration/docker.config) +### [`docker.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/docker.config) Define Docker Containers for all process. Images will be pulled automatically. Use in your own profile if needed. -### [`genomes.config`](https://github.com/SciLifeLab/Sarek/blob/master/configuration/genomes.config) +### [`genomes.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/genomes.config) Contain path to all references. Modify it if you want to change genome version, or the path to your references files. 
-### [`singularity-path.config`](https://github.com/SciLifeLab/Sarek/blob/master/configuration/singularity-path.config) +### [`singularity-path.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/singularity-path.config) Define path to Singularity Containers for all process. To be used when downloading Singularity Containers, like on a secure UPPMAX cluster. Images will not be pulled automatically. You need to set them up before. -### [`singularity.config`](https://github.com/SciLifeLab/Sarek/blob/master/configuration/singularity.config) +### [`singularity.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/singularity.config) Define Singularity Containers for all process. Images will be pulled automatically. Use in your own profile if needed. -### [`travis.config`](https://github.com/SciLifeLab/Sarek/blob/master/configuration/travis.config) +### [`travis.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/travis.config) To be used for Travis (2 cpus) or on small computer for testing purpose -### [`uppmax-slurm.config`](https://github.com/SciLifeLab/Sarek/blob/master/configuration/uppmax-slurm.config) +### [`uppmax-slurm.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/uppmax-slurm.config) Slurm configuration for a UPPMAX cluster Will run the workflow on `/scratch` using the Nextflow [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch) directive From 1f52da1487f24c23c95bee8bfd55aa653f71281e Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:28:49 +0200 Subject: [PATCH 14/75] update old links [skip ci] --- docs/USAGE.md | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/docs/USAGE.md b/docs/USAGE.md index b1474d0f31..d562ce2aff 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -93,7 +93,8 @@ Choose an output directory ### --project `ProjectID` -Specify a project number ID on a UPPMAX cluster. 
(optionnal if not on such a cluster) +Specify a project number ID on a UPPMAX cluster. +(optionnal if not on such a cluster) ### --sample `file.tsv` @@ -101,20 +102,24 @@ Use the given TSV file as sample (cf [TSV documentation](TSV.md)). ### --step `step` -Choose from wich step the workflow will start. Choose only one step. Possible values are: +Choose from wich step the workflow will start. +Choose only one step. +Possible values are: - mapping (default, will start workflow with FASTQ files) -- realign (will start workflow with BAM files (with T/N BAMs that were not realigned together)) -- recalibrate (will start workflow with BAM files and Recalibration Tables (Only with T/N BAMs that were realigned together)) +- recalibrate (will start workflow with BAM files and Recalibration Tables `--step` option is case insensitive to avoid easy introduction of errors when choosing a step. + ### --test Test run Sarek on a smaller dataset, that way you don't have to specify `--sample data/tsv/tiny.tsv` ### --tools `tool1[,tool2,tool3...]` -Choose which tools will be used in the workflow. Different tools to be separated by commas. Possible values are: +Choose which tools will be used in the workflow. +Different tools to be separated by commas. +Possible values are: - haplotypecaller (use `HaplotypeCaller` for VC) (germlineVC) - manta (use `Manta` for SV) (germlineVC,somaticVC) @@ -124,11 +129,14 @@ Choose which tools will be used in the workflow. Different tools to be separated - snpeff (use `snpEff` for Annotation) (annotate) - vep (use `VEP` for Annotation) (annotate) -`--tools` option is case insensitive to avoid easy introduction of errors when choosing tools. So you can write `--tools mutect2,ascat` or `--tools MuTect2,ASCAT` without worrying about case sensitivity. +`--tools` option is case insensitive to avoid easy introduction of errors when choosing tools. +So you can write `--tools mutect2,ascat` or `--tools MuTect2,ASCAT` without worrying about case sensitivity. 
### --annotateTools `tool1[,tool2,tool3...]` -Choose which tools to annotate. Different tools to be separated by commas. Possible values are: +Choose which tools to annotate. +Different tools to be separated by commas. +Possible values are: - haplotypecaller (Annotate `HaplotypeCaller` output) - manta (Annotate `Manta` output) - mutect2 (Annotate `MuTect2` output) @@ -136,7 +144,8 @@ Choose which tools to annotate. Different tools to be separated by commas. Possi ### --annotateVCF `file1[,file2,file3...]` -Choose vcf to annotate. Different vcfs to be separated by commas. +Choose vcf to annotate. +Different vcfs to be separated by commas. ### --verbose @@ -190,22 +199,21 @@ Simpler to specify in the configuration files, but it's still possible to specif ### --totalMemory `memory` -# Nextflow options - -See the [options documentation](https://github.com/SciLifeLab/NGI-NextflowDocs/blob/master/docs/OPTIONS.md) - -## Profiles +## Configuration and profiles -More informations on the [SciLifeLab Nextflow documentation](https://github.com/SciLifeLab/NGI-NextflowDocs/blob/master/docs/INSTALL.md). The default profile is `standard`. You can use your own profile: +More informations on the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html). +The default profile is `standard`. +You can use your own profile: ```bash nextflow run SciLifeLab/Sarek --sample mysample.tsv -profile myprofile ``` -A standard profile is defined in [`nextflow.config`](../nextflow.config). You can use the files in the [`configuration/`](../configuration) directory as a base to make a new `.config` file that you can specify directly (or add as a profile): +A standard profile is defined in [`nextflow.config`](https://github.com/SciLifeLab/Sarek/blob/master/nextflow.config). 
+You can use the files in the [`conf/`](https://github.com/SciLifeLab/Sarek/tree/master/conf) directory as a base to make a new `.config` file that you can specify directly (or add as a profile): ```bash -nextflow run SciLifeLab/Sarek --sample mysample.tsv -c config/milou.config +nextflow run SciLifeLab/Sarek --sample mysample.tsv -c conf/personnal.config ``` ## Update to latest version @@ -218,7 +226,8 @@ nextflow pull SciLifeLab/Sarek ## Run the latest version -If there is a feature or bugfix you want to use in a resumed or re-analyzed run, you have to update the workflow to the latest version. By default it is not updated automatically, so use something like: +If there is a feature or bugfix you want to use in a resumed or re-analyzed run, you have to update the workflow to the latest version. +By default it is not updated automatically, so use something like: ```bash nextflow run -latest SciLifeLab/Sarek/main.nf ... -resume From 17cf64334f856872b6689107da2c4a88d8ebc625 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:31:51 +0200 Subject: [PATCH 15/75] update link to Sarek website [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6a1b912d12..d479dbd4ea 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# [![Sarek](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/docs/images/Sarek_logo.png "Sarek")](http://opensource.scilifelab.se/projects/sarek/) +# [![Sarek](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/docs/images/Sarek_logo.png "Sarek")](http://sarek.scilifelab.se/) #### An open-source analysis pipeline to detect germline or somatic variants from whole genome sequencing From 95c25d900f10c3af599113e31bced8c194727e4a Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:40:39 +0200 Subject: [PATCH 16/75] fix link to config file [skip ci] --- docs/USE_CASES.md | 51 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 
insertions(+), 13 deletions(-) diff --git a/docs/USE_CASES.md b/docs/USE_CASES.md index b894d79498..51a09a9dc9 100644 --- a/docs/USE_CASES.md +++ b/docs/USE_CASES.md @@ -1,6 +1,9 @@ # Use cases -The workflow has three pre-processing options: `mapping`, `realign` and `recalibrate`. Using the `mapping` directive one will have a pair of mapped, deduplicated and recalibrated BAM files in the `Preprocessing/Recalibrated/` directory. Furthermore, during this process a deduplicated BAM file is created in the `Preprocessing/NonRealigned/` directory. This is the usual option you have to give when you are starting from raw FASTQ data: +The workflow has three pre-processing options: `mapping`, `realign` and `recalibrate`. +Using the `mapping` directive one will have a pair of mapped, deduplicated and recalibrated BAM files in the `Preprocessing/Recalibrated/` directory. +Furthermore, during this process a deduplicated BAM file is created in the `Preprocessing/NonRealigned/` directory. +This is the usual option you have to give when you are starting from raw FASTQ data: ```bash nextflow run SciLifeLab/Sarek/main.nf --sample mysample.tsv @@ -12,9 +15,14 @@ nextflow run SciLifeLab/Sarek/runMultiQC.nf `mapping` will start by default, you do not have to give any additional parameters, only the TSV file describing the sample (see below). -In the [genomes.config](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/configuration/genomes.config) configuration file we are defining the intervals file as well, this is used to define regions for variant call and realignment (in a scatter and gather fashion when possible). The intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs. We are ignoring the hs37d5 contig that contains concatenated decoy sequences. 
+In the [genomes.config](https://github.com/SciLifeLab/Sarek/blob/master/conf/genomes.config) configuration file we are defining the intervals file as well, this is used to define regions for variant call and realignment (in a scatter and gather fashion when possible). +The intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs. +We are ignoring the hs37d5 contig that contains concatenated decoy sequences. -During the execution of the workflow a `Sarek-trace.txt`, a `Sarek-timeline.html` and a `Sarek-report.html` files are generated automatically. These files contain statistics about resources used and processes finished. If you start a new workflow or restart/resume a sample, the previous version will be renamed as `Sarek-trace.txt.1`, `Sarek-timeline.html.1` and `Sarek-report.html.1` respectively. Also, older version are renamed with incremented numbers. +During the execution of the workflow a `Sarek-trace.txt`, a `Sarek-timeline.html` and a `Sarek-report.html` files are generated automatically. +These files contain statistics about resources used and processes finished. +If you start a new workflow or restart/resume a sample, the previous version will be renamed as `Sarek-trace.txt.1`, `Sarek-timeline.html.1` and `Sarek-report.html.1` respectively. +Also, older version are renamed with incremented numbers. ## Starting from raw FASTQ - pair of FASTQ files @@ -49,18 +57,26 @@ nextflow run SciLifeLab/Sarek/main.nf --sampleDir path/to/FASTQ/files nextflow run SciLifeLab/Sarek/germlineVC.nf --tools nextflow run SciLifeLab/Sarek/runMultiQC.nf ``` -The given directory is searched recursively for FASTQ files that are named `*_R1_*.fastq.gz`, and a matching pair with the same name except `_R2_` instead of `_R1_` is expected to exist alongside. All of the found FASTQ files are considered to belong to the sample. Each FASTQ file pair gets its own read group (`@RG`) in the resulting BAM file. 
+The given directory is searched recursively for FASTQ files that are named `*_R1_*.fastq.gz`, and a matching pair with the same name except `_R2_` instead of `_R1_` is expected to exist alongside. +All of the found FASTQ files are considered to belong to the sample. +Each FASTQ file pair gets its own read group (`@RG`) in the resulting BAM file. ### Metadata when using `--sampleDir` When using `--sampleDir`, the metadata about the sample that are written to the BAM header in the `@RG` tag are determined in the following way. -- The sample name (`SM`) is derived from the the last component of the path given to `--sampleDir`. That is, you should make sure that that directory has a meaningful name! For example, with `--sampleDir=/my/fastqs/sample123`, the sample name will be `sample123`. -- The read group id is set to *flowcell.samplename.lane*. The flowcell id and lane number are auto-detected from the name of the first read in the FASTQ file. +- The sample name (`SM`) is derived from the the last component of the path given to `--sampleDir`. +That is, you should make sure that that directory has a meaningful name! For example, with `--sampleDir=/my/fastqs/sample123`, the sample name will be `sample123`. +- The read group id is set to *flowcell.samplename.lane*. +The flowcell id and lane number are auto-detected from the name of the first read in the FASTQ file. ## Starting from raw FASTQ - having pair of FASTQ files for tumor/normal samples (one lane for each sample) -The workflow command line is just the same as before, but the TSV contains extra lines. You can see the second column is used to distinguish normal and tumor samples. You can add as many relapse samples as many you have, providing their name in the third column is different. Each will be compared to the normal one-by-one. Obviously, if you do not have relapse samples, you can leave out this last line. +The workflow command line is just the same as before, but the TSV contains extra lines. 
+You can see the second column is used to distinguish normal and tumor samples. +You can add as many relapse samples as many you have, providing their name in the third column is different. +Each will be compared to the normal one-by-one. +Obviously, if you do not have relapse samples, you can leave out this last line. ``` SUBJECT_ID XX 0 SAMPLEIDN 1 /samples/normal_1.fastq.gz /samples/normal_2.fastq.gz @@ -70,7 +86,8 @@ SUBJECT_ID XX 1 SAMPLEIDR 1 /samples/relapse_1.fastq.gz /samples ## Starting from raw FASTQ - having multiple lanes (reads groups) -Usually there are more read groups - sequencing lanes - for a single sequencing run, and in a flowcell different lanes have to be recalibrated separately. This is captured in the TSV file only in the following manner, adding read group numbers or IDs in the fourth column. +Usually there are more read groups - sequencing lanes - for a single sequencing run, and in a flowcell different lanes have to be recalibrated separately. +This is captured in the TSV file only in the following manner, adding read group numbers or IDs in the fourth column. ``` SUBJECT_ID XX 0 SAMPLEID 1 /samples/normal1_1.fastq.gz /samples/normal1_2.fastq.gz @@ -79,7 +96,9 @@ SUBJECT_ID XX 0 SAMPLEID 2 /samples/normal2_1.fastq.gz /samples/ ## Starting from raw FASTQ - having multiple lanes (reads groups) for tumor/normal samples -Usually there are more read groups - sequencing lanes - for a single sequencing run, and in a flowcell different lanes have to be recalibrated separately. This is captured in the TSV file only in the following manner, adding read group numbers or IDs in the fourth column. Obviously, if you do not have relapse samples, you can leave out those last two lines. +Usually there are more read groups - sequencing lanes - for a single sequencing run, and in a flowcell different lanes have to be recalibrated separately. 
+This is captured in the TSV file only in the following manner, adding read group numbers or IDs in the fourth column. +Obviously, if you do not have relapse samples, you can leave out those last two lines. ``` SUBJECT_ID XX 0 SAMPLEIDN 1 /samples/normal1_1.fastq.gz /samples/normal1_2.fastq.gz @@ -93,7 +112,8 @@ SUBJECT_ID XX 1 SAMPLEIDR 9 /samples/relapse9_1.fastq.gz /sample ## Starting from realignement -NGI Production in the previous years delivered many preprocessed samples; these BAM files are not recalibrated. To have BAMs suitable for variant calling, realignement of pairs is necessary: +NGI Production in the previous years delivered many preprocessed samples; these BAM files are not recalibrated. +To have BAMs suitable for variant calling, realignement of pairs is necessary: ```bash nextflow run SciLifeLab/Sarek/main.nf --sample mysample.tsv --step realign @@ -111,7 +131,8 @@ At the end of this step you should have recalibrated BAM files in the `Preproces ## Starting from realignement for tumor/normal samples -NGI Production in the previous years delivered many preprocessed samples; these BAM files are not recalibrated. To have BAMs suitable for variant calling, realignement of pairs is necessary: +NGI Production in the previous years delivered many preprocessed samples; these BAM files are not recalibrated. +To have BAMs suitable for variant calling, realignement of pairs is necessary: ```bash nextflow run SciLifeLab/Sarek/main.nf --sample mysample.tsv --step realign @@ -167,7 +188,9 @@ And the corresponding TSV file should be like: SUBJECT_ID XX 0 SAMPLEIDN /samples/SAMPLEIDN.bam /samples/SAMPLEIDN.bai ``` -If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file. This is the case if HaplotypeCaller was the only tool (recalibration is done on-the-fly with HaplotypeCaller to improve performance and save space). In this case, you need to start with `--step=recalibrate` (see previous section). 
+If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file. +This is the case if HaplotypeCaller was the only tool (recalibration is done on-the-fly with HaplotypeCaller to improve performance and save space). +In this case, you need to start with `--step=recalibrate` (see previous section). ## Starting from a recalibrated BAM file for tumor/normal samples @@ -188,4 +211,6 @@ SUBJECT_ID XX 1 SAMPLEIDT /samples/SAMPLEIDT.bam /samples/SAMPLEIDT SUBJECT_ID XX 1 SAMPLEIDR /samples/SAMPLEIDR.bam /samples/SAMPLEIDR.bai ``` -If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file. This is the case if HaplotypeCaller was the only tool (recalibration is done on-the-fly with HaplotypeCaller to improve performance and save space). In this case, you need to start with `--step=recalibrate` (see previous section). +If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file. +This is the case if HaplotypeCaller was the only tool (recalibration is done on-the-fly with HaplotypeCaller to improve performance and save space). +In this case, you need to start with `--step=recalibrate` (see previous section). 
From 4d936e162880bb7edc46c73976fa643f9be666de Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 16:54:18 +0200 Subject: [PATCH 17/75] update processes comments and docs --- annotate.nf | 8 +++++--- docs/PROCESS.md | 21 +++++++++++++-------- germlineVC.nf | 3 ++- main.nf | 1 - runMultiQC.nf | 2 +- somaticVC.nf | 4 ++++ 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/annotate.nf b/annotate.nf index 8fab6e3b0a..64882adae7 100644 --- a/annotate.nf +++ b/annotate.nf @@ -26,9 +26,13 @@ kate: syntax groovy; space-indent on; indent-width 2; https://github.com/SciLifeLab/Sarek/README.md -------------------------------------------------------------------------------- Processes overview - - RunBcftoolsStats - Run BCFTools stats on vcf before annotation + - RunBcftoolsStats - Run BCFTools stats on vcf files + - RunVcftools - Run VCFTools on vcf files - RunSnpeff - Run snpEff for annotation of vcf files - RunVEP - Run VEP for annotation of vcf files + - CompressVCF - Compress and index vcf files using tabix + - GetVersionSnpEFF - Get version of tools + - GetVersionVEP - Get version of tools ================================================================================ = C O N F I G U R A T I O N = ================================================================================ @@ -89,8 +93,6 @@ vcfNotToAnnotate.close() // as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any - - (vcfForBCFtools, vcfForVCFtools, vcfForSnpeff, vcfForVep) = vcfToAnnotate.into(4) vcfForVep = vcfForVep.map { diff --git a/docs/PROCESS.md b/docs/PROCESS.md index dbd045a4e3..6b334d6123 100644 --- a/docs/PROCESS.md +++ b/docs/PROCESS.md @@ -7,29 +7,27 @@ We divide them for the moment into 5 main steps: - MapReads - Map reads with BWA - MergeBams - Merge BAMs if multilane samples -- MarkDuplicates - Mark Duplicates with Picard -- RealignerTargetCreator - Create realignment target intervals -- IndelRealigner - 
Realign BAMs as T/N pair +- MarkDuplicates - Mark Duplicates with GATK4 - CreateRecalibrationTable - Create Recalibration Table with BaseRecalibrator - RecalibrateBam - Recalibrate Bam with PrintReads ## Germline Variant Calling: - CreateIntervalBeds - Create and sort intervals into bed files -- RunHaplotypecaller - Run HaplotypeCaller for GermLine Variant Calling (Parallelized processes) -- RunGenotypeGVCFs - Run HaplotypeCaller for GermLine Variant Calling (Parallelized processes) -- ConcatVCF - Merge results from HaplotypeCaller +- RunHaplotypecaller - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) +- RunGenotypeGVCFs - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) +- ConcatVCF - Merge results from paralellized callers - RunSingleStrelka - Run Strelka for Germline Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling ## Somatic Variant Calling: - CreateIntervalBeds - Create and sort intervals into bed files -- RunMutect1 - Run MuTect1 for Variant Calling (Parallelized processes) - RunMutect2 - Run MuTect2 for Variant Calling (Parallelized processes) - RunFreeBayes - Run FreeBayes for Variant Calling (Parallelized processes) -- ConcatVCF - Merge results from Freebayes, MuTect1 and MuTect2 +- ConcatVCF - Merge results from paralellized variant callers - RunStrelka - Run Strelka for Variant Calling +- RunStrelkaBP - Run Strelka Best Practices for Variant Calling - RunManta - Run Manta for Structural Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling - RunAlleleCount - Run AlleleCount to prepare for ASCAT @@ -43,8 +41,15 @@ We divide them for the moment into 5 main steps: - RunBamQC - Run qualimap BamQC on recalibrated BAM files - RunBcftoolsStats - Run BCFTools stats on vcf files - RunVcftools - Run VCFTools on vcf files +- GetVersionAlleleCount - Get version of tools +- GetVersionASCAT - Get version of tools +- GetVersionSnpEFF - Get version of tools 
+- GetVersionVEP - Get version of tools +- GetVersionAll - Get version of tools +- RunMultiQC - Run MultiQC on reports ## Annotation: - RunSnpeff - Run snpEff for annotation of vcf files - RunVEP - Run VEP for annotation of vcf files +- CompressVCF - Compress and index vcf files using tabix diff --git a/germlineVC.nf b/germlineVC.nf index 1fb96776cb..fde2af67f0 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -31,10 +31,11 @@ kate: syntax groovy; space-indent on; indent-width 2; - CreateIntervalBeds - Create and sort intervals into bed files - RunHaplotypecaller - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) - RunGenotypeGVCFs - Run HaplotypeCaller for Germline Variant Calling (Parallelized processes) - - ConcatVCF - Merge results from HaplotypeCaller, MuTect2 and other paralellized callers + - ConcatVCF - Merge results from paralellized callers - RunSingleStrelka - Run Strelka for Germline Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling - RunBcftoolsStats - Run BCFTools stats on vcf files + - RunVcftools - Run VCFTools on vcf files ================================================================================ = C O N F I G U R A T I O N = ================================================================================ diff --git a/main.nf b/main.nf index 67039d860a..4a6d6a3436 100644 --- a/main.nf +++ b/main.nf @@ -30,7 +30,6 @@ kate: syntax groovy; space-indent on; indent-width 2; - MapReads - Map reads with BWA - MergeBams - Merge BAMs if multilane samples - MarkDuplicates - Mark Duplicates with GATK4 - - IndelRealigner - Realign BAMs as T/N pair - CreateRecalibrationTable - Create Recalibration Table with BaseRecalibrator - RecalibrateBam - Recalibrate Bam with PrintReads - RunSamtoolsStats - Run Samtools stats on recalibrated BAM files diff --git a/runMultiQC.nf b/runMultiQC.nf index 1781cee5fb..eb4f99d602 100644 --- a/runMultiQC.nf +++ b/runMultiQC.nf @@ -26,7 +26,7 @@ kate: syntax groovy; 
space-indent on; indent-width 2; https://github.com/SciLifeLab/Sarek/README.md -------------------------------------------------------------------------------- Processes overview - - GenerateMultiQCconfig - Generate MultiQC configuration file + - GetVersionAll - Get version of tools - RunMultiQC - Run MultiQC on reports ================================================================================ = C O N F I G U R A T I O N = diff --git a/somaticVC.nf b/somaticVC.nf index 5801a7a3c3..3bc3e15082 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -33,12 +33,16 @@ kate: syntax groovy; space-indent on; indent-width 2; - RunFreeBayes - Run FreeBayes for Variant Calling (Parallelized processes) - ConcatVCF - Merge results from paralellized variant callers - RunStrelka - Run Strelka for Variant Calling + - RunStrelkaBP - Run Strelka Best Practices for Variant Calling - RunManta - Run Manta for Structural Variant Calling - RunSingleManta - Run Manta for Single Structural Variant Calling - RunAlleleCount - Run AlleleCount to prepare for ASCAT - RunConvertAlleleCounts - Run convertAlleleCounts to prepare for ASCAT - RunAscat - Run ASCAT for CNV - RunBcftoolsStats - Run BCFTools stats on vcf files + - RunVcftools - Run VCFTools on vcf files + - GetVersionAlleleCount - Get version of tools + - GetVersionASCAT - Get version of tools ================================================================================ = C O N F I G U R A T I O N = ================================================================================ From 776ced38c80b591911a1828920e6030e0dee6128 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 6 Sep 2018 17:06:09 +0200 Subject: [PATCH 18/75] fix some links --- docs/OUTPUT.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/OUTPUT.md b/docs/OUTPUT.md index 422ed59a1e..eb0040ca1f 100644 --- a/docs/OUTPUT.md +++ b/docs/OUTPUT.md @@ -99,11 +99,11 @@ Manta provides a candidate list for small indels also that can be fed to 
Strelka [fastqc]: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ [freebayes]: https://github.com/ekg/freebayes [GATK-BP]: https://software.broadinstitute.org/gatk/best-practices/bp_3step.php?case=GermShortWGS -[haplotypecaller]: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php +[haplotypecaller]: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_haplotypecaller_HaplotypeCaller.php [genomicvcf]: https://gatkforums.broadinstitute.org/gatk/discussion/4017/what-is-a-gvcf-and-how-is-it-different-from-a-regular-vcf [manta]: https://github.com/Illumina/manta/blob/master/docs/userGuide/README.md#structural-variant-predictions [multiqc-link]: http://multiqc.info/ -[mutect2]: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_cancer_m2_MuTect2.php +[mutect2]: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_Mutect2.php [ngi-link]: https://ngisweden.scilifelab.se/ [picard-md]: http://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates [polyphen-link]: http://genetics.bwh.harvard.edu/pph2/ @@ -116,5 +116,5 @@ Manta provides a candidate list for small indels also that can be fed to Strelka [speedseq]: https://github.com/SciLifeLab/Sarek/blob/master/scripts/speedseq.filter.awk [strelka2]: https://github.com/Illumina/strelka [vep-link]: http://www.ensembl.org/Tools/VEP -[VEP-predictions]: https://www.ensembl.org/info/genome/variation/predicted_data.html +[VEP-predictions]: https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html [logo]: https://img.shields.io/github/release/SciLifeLab/Sarek.svg From 521221742e957bf35409e05d9179abe0604d576a Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 7 Sep 2018 12:52:24 +0200 Subject: [PATCH 
19/75] fix #633 --- annotate.nf | 4 ++-- conf/containers.config | 27 --------------------------- conf/singularity-path.config | 33 --------------------------------- conf/uppmax-localhost.config | 7 ------- conf/uppmax-slurm.config | 9 --------- docs/PROCESS.md | 2 +- 6 files changed, 3 insertions(+), 79 deletions(-) diff --git a/annotate.nf b/annotate.nf index 64882adae7..c39def8503 100644 --- a/annotate.nf +++ b/annotate.nf @@ -31,7 +31,7 @@ kate: syntax groovy; space-indent on; indent-width 2; - RunSnpeff - Run snpEff for annotation of vcf files - RunVEP - Run VEP for annotation of vcf files - CompressVCF - Compress and index vcf files using tabix - - GetVersionSnpEFF - Get version of tools + - GetVersionSnpeff - Get version of tools - GetVersionVEP - Get version of tools ================================================================================ = C O N F I G U R A T I O N = @@ -267,7 +267,7 @@ if (params.verbose) vcfCompressedoutput = vcfCompressedoutput.view { "Index : ${it[3].fileName}" } -process GetVersionSnpEFF { +process GetVersionSnpeff { publishDir directoryMap.version, mode: 'link' output: file("v_*.txt") when: 'snpeff' in tools || 'merge' in tools diff --git a/conf/containers.config b/conf/containers.config index f30784f48c..6e0ab0a1ad 100644 --- a/conf/containers.config +++ b/conf/containers.config @@ -38,36 +38,9 @@ process { withName:GetVersionASCAT { container = "${params.repository}/r-base:${params.tag}" } - withName:GetVersionBamQC { - container = "${params.repository}/sarek:${params.tag}" - } - withName:GetVersionBCFtools { - container = "${params.repository}/sarek:${params.tag}" - } - withName:GetVersionBWAsamtools { - container = "${params.repository}/sarek:${params.tag}" - } - withName:GetVersionFastQC { - container = "${params.repository}/sarek:${params.tag}" - } - withName:GetVersionFreeBayes { - container = "${params.repository}/sarek:${params.tag}" - } - withName:GetVersionGATK { - container = 
"${params.repository}/sarek:${params.tag}" - } - withName:GetVersionManta { - container = "${params.repository}/sarek:${params.tag}" - } withName:GetVersionSnpeff { container = {params.genome == 'GRCh38' ? "${params.repository}/snpeffgrch38:${params.tag}" : "${params.repository}/snpeffgrch37:${params.tag}"} } - withName:GetVersionStrelka { - container = "${params.repository}/sarek:${params.tag}" - } - withName:GetVersionVCFtools { - container = "${params.repository}/sarek:${params.tag}" - } withName:GetVersionVEP { container = {params.genome == 'GRCh38' ? "${params.repository}/vepgrch38:${params.tag}" : "${params.repository}/vepgrch37:${params.tag}"} } diff --git a/conf/singularity-path.config b/conf/singularity-path.config index a8e1473191..938829dc77 100644 --- a/conf/singularity-path.config +++ b/conf/singularity-path.config @@ -43,42 +43,12 @@ process { withName:GetVersionASCAT { container = "${params.containerPath}/r-base-${params.tag}.img" } - withName:GetVersionBamQC { - container = "${params.containerPath}/sarek-${params.tag}.img" - } - withName:GetVersionBCFtools { - container = "${params.containerPath}/sarek-${params.tag}.img" - } - withName:GetVersionBWAsamtools { - container = "${params.containerPath}/sarek-${params.tag}.img" - } - withName:GetVersionFastQC { - container = "${params.containerPath}/sarek-${params.tag}.img" - } - withName:GetVersionFreeBayes { - container = "${params.containerPath}/sarek-${params.tag}.img" - } - withName:GetVersionGATK { - container = "${params.containerPath}/sarek-${params.tag}.img" - } - withName:GetVersionManta { - container = "${params.containerPath}/sarek-${params.tag}.img" - } withName:GetVersionSnpeff { container = {params.genome == 'GRCh38' ? 
"${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} } - withName:GetVersionStrelka { - container = "${params.containerPath}/sarek-${params.tag}.img" - } - withName:GetVersionVCFtools { - container = "${params.containerPath}/sarek-${params.tag}.img" - } withName:GetVersionVEP { container = {params.genome == 'GRCh38' ? "${params.containerPath}/vepgrch38-${params.tag}.img" : "${params.containerPath}/vepgrch37-${params.tag}.img"} } - withName:IndelRealigner { - container = "${params.containerPath}/sarek-${params.tag}.img" - } withName:MapReads { container = "${params.containerPath}/sarek-${params.tag}.img" } @@ -88,9 +58,6 @@ process { withName:MergeBams { container = "${params.containerPath}/sarek-${params.tag}.img" } - withName:RealignerTargetCreator { - container = "${params.containerPath}/sarek-${params.tag}.img" - } withName:RecalibrateBam { container = "${params.containerPath}/sarek-${params.tag}.img" } diff --git a/conf/uppmax-localhost.config b/conf/uppmax-localhost.config index 3536627ff7..08052cc188 100644 --- a/conf/uppmax-localhost.config +++ b/conf/uppmax-localhost.config @@ -62,9 +62,6 @@ process { cpus = 16 memory = {params.totalMemory} } - withName:IndelRealigner { - memory = {params.singleCPUMem * 2 * task.attempt} - } withName:MapReads { cpus = 16 memory = {params.totalMemory} @@ -78,10 +75,6 @@ process { cpus = 16 memory = {params.totalMemory} } - withName:RealignerTargetCreator { - cpus = 4 - memory = {params.singleCPUMem * 4 * task.attempt} - } withName:RecalibrateBam { memory = {params.singleCPUMem * task.attempt} } diff --git a/conf/uppmax-slurm.config b/conf/uppmax-slurm.config index 48005ebb4b..2f8f78ad79 100644 --- a/conf/uppmax-slurm.config +++ b/conf/uppmax-slurm.config @@ -42,12 +42,6 @@ process { queue = 'core' time = {params.runTime * task.attempt} } - withName:IndelRealigner { - cpus = 1 - memory = {params.singleCPUMem * task.attempt} - queue = 'core' - time = 
{params.runTime * task.attempt} - } withName:MapReads { time = {params.runTime * task.attempt} } @@ -61,9 +55,6 @@ process { queue = 'core' time = {params.runTime * task.attempt} } - withName:RealignerTargetCreator { - time = {params.runTime * task.attempt} - } withName:RecalibrateBam { cpus = 8 memory = {params.singleCPUMem * 8 * task.attempt} diff --git a/docs/PROCESS.md b/docs/PROCESS.md index 6b334d6123..396d9b7b62 100644 --- a/docs/PROCESS.md +++ b/docs/PROCESS.md @@ -43,7 +43,7 @@ We divide them for the moment into 5 main steps: - RunVcftools - Run VCFTools on vcf files - GetVersionAlleleCount - Get version of tools - GetVersionASCAT - Get version of tools -- GetVersionSnpEFF - Get version of tools +- GetVersionSnpeff - Get version of tools - GetVersionVEP - Get version of tools - GetVersionAll - Get version of tools - RunMultiQC - Run MultiQC on reports From 4bd45cfcbe294ab1b81a15ddb006b5ff1464ecf3 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Mon, 10 Sep 2018 09:50:07 +0200 Subject: [PATCH 20/75] \n or \n that is the question --- scripts/concatenateVCFs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/concatenateVCFs.sh b/scripts/concatenateVCFs.sh index 6e3ff73d79..89bf9d125e 100755 --- a/scripts/concatenateVCFs.sh +++ b/scripts/concatenateVCFs.sh @@ -35,7 +35,7 @@ set -euo pipefail # get rid of interval information only from the GATK command-line, but leave the rest FIRSTVCF=$(ls *.vcf | head -n 1) sed -n '/^[^#]/q;p' $FIRSTVCF | \ -awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\\n")}' \ +awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \ > header # Get list of contigs from the FASTA index (.fai). 
We cannot use the ##contig From 738c137ce8cdf8cd762dfcd185b396a590baed89 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Mon, 10 Sep 2018 10:04:39 +0200 Subject: [PATCH 21/75] exclamation misplaced --- somaticVC.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/somaticVC.nf b/somaticVC.nf index b25c29cb6d..919fe4c697 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -405,7 +405,7 @@ process RunStrelka { script: """ - if ![ -s "${params.targetBED}" ]; then + if [ ! -s "${params.targetBED}" ]; then # do WGS configureStrelkaSomaticWorkflow.py \ --tumor ${bamTumor} \ From 406f5f426cd8e451259c3fecae855ae497665046 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Mon, 10 Sep 2018 10:23:46 +0200 Subject: [PATCH 22/75] update CHANGELOG [skip ci] --- CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52b7f981da..b4115f195c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,17 +19,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Use `splitCsv` instead of `readlines` - [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Improve install script - [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Simplify tests -- [#627](https://github.com/SciLifeLab/Sarek/pull/627), [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Refactor docs +- [#627](https://github.com/SciLifeLab/Sarek/pull/627), [#629](https://github.com/SciLifeLab/Sarek/pull/629), [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Refactor docs - [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Refactor config - [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes +- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering ### `Removed` - [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Remove old Issue Template - [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Remove old Dockerfiles +- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Remove old comments ### `Fixed` - [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Fix VEP tests - +- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Fix links in MD files ## [2.1.0] - Ruotes - 2018-08-14 ### `Added` From aaf3a2b00db8118c75e884f0909d2d206710d384 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Mon, 10 Sep 2018 10:58:28 +0200 Subject: [PATCH 23/75] typo [skip ci] --- docs/USAGE.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/USAGE.md b/docs/USAGE.md index d562ce2aff..d2c9788f65 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -69,11 +69,11 @@ All parameters, options and variables can be specified with configuration files ### --callName `Name` -Specify a name for MultiQC report (optionnal) +Specify a name for MultiQC report (optional) ### --contactMail `email` -Specify an email for MultiQC report (optionnal) +Specify an email for MultiQC report (optional) ### --help @@ -94,7 
+94,7 @@ Choose an output directory ### --project `ProjectID` Specify a project number ID on a UPPMAX cluster. -(optionnal if not on such a cluster) +(optional if not on such a cluster) ### --sample `file.tsv` From 18c50713acf08bd85ded3748c3bdb66e6512ecd0 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Mon, 10 Sep 2018 10:58:55 +0200 Subject: [PATCH 24/75] remove more references to realign [skip ci] --- docs/USE_CASES.md | 50 ++--------------------------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/docs/USE_CASES.md b/docs/USE_CASES.md index 51a09a9dc9..f0d88a9c3c 100644 --- a/docs/USE_CASES.md +++ b/docs/USE_CASES.md @@ -1,8 +1,7 @@ # Use cases -The workflow has three pre-processing options: `mapping`, `realign` and `recalibrate`. +The workflow has two pre-processing options: `mapping` and `recalibrate`. Using the `mapping` directive one will have a pair of mapped, deduplicated and recalibrated BAM files in the `Preprocessing/Recalibrated/` directory. -Furthermore, during this process a deduplicated BAM file is created in the `Preprocessing/NonRealigned/` directory. This is the usual option you have to give when you are starting from raw FASTQ data: ```bash @@ -15,7 +14,7 @@ nextflow run SciLifeLab/Sarek/runMultiQC.nf `mapping` will start by default, you do not have to give any additional parameters, only the TSV file describing the sample (see below). -In the [genomes.config](https://github.com/SciLifeLab/Sarek/blob/master/conf/genomes.config) configuration file we are defining the intervals file as well, this is used to define regions for variant call and realignment (in a scatter and gather fashion when possible). +In the [genomes.config](https://github.com/SciLifeLab/Sarek/blob/master/conf/genomes.config) configuration file we are defining the intervals file as well, this is used to define regions for variant calling (in a scatter and gather fashion when possible). 
The intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs. We are ignoring the hs37d5 contig that contains concatenated decoy sequences. @@ -110,53 +109,8 @@ SUBJECT_ID XX 1 SAMPLEIDR 7 /samples/relapse7_1.fastq.gz /sample SUBJECT_ID XX 1 SAMPLEIDR 9 /samples/relapse9_1.fastq.gz /samples/relapse9_2.fastq.gz ``` -## Starting from realignement - -NGI Production in the previous years delivered many preprocessed samples; these BAM files are not recalibrated. -To have BAMs suitable for variant calling, realignement of pairs is necessary: - -```bash -nextflow run SciLifeLab/Sarek/main.nf --sample mysample.tsv --step realign -nextflow run SciLifeLab/Sarek/germlineVC.nf --tools -nextflow run SciLifeLab/Sarek/runMultiQC.nf -``` - -And the corresponding TSV file should be like: - -``` -SUBJECT_ID XX 0 SAMPLEID /samples/SAMPLEIDN.bam /samples/SAMPLEIDN.bai -``` - -At the end of this step you should have recalibrated BAM files in the `Preprocessing/Recalibrated/` directory. - -## Starting from realignement for tumor/normal samples - -NGI Production in the previous years delivered many preprocessed samples; these BAM files are not recalibrated. 
-To have BAMs suitable for variant calling, realignement of pairs is necessary: - -```bash -nextflow run SciLifeLab/Sarek/main.nf --sample mysample.tsv --step realign -nextflow run SciLifeLab/Sarek/germlineVC.nf --tools -nextflow run SciLifeLab/Sarek/somaticVC.nf --tools -nextflow run SciLifeLab/Sarek/annotate.nf --tool --annotateVCF myfile.vcf -nextflow run SciLifeLab/Sarek/runMultiQC.nf - -``` - -And the corresponding TSV file should be like (obviously, if you do not have relapse samples, you can leave out this last line): - -``` -SUBJECT_ID XX 0 SAMPLEIDN /samples/SAMPLEIDN.bam /samples/SAMPLEIDN.bai -SUBJECT_ID XX 1 SAMPLEIDT /samples/SAMPLEIDT.bam /samples/SAMPLEIDT.bai -SUBJECT_ID XX 1 SAMPLEIDR /samples/SAMPLEIDT.bam /samples/SAMPLEIDR.bai -``` - -At the end of this step you should have recalibrated BAM files in the `Preprocessing/Recalibrated/` directory. - ## Starting from recalibration for tumor/normal samples -If the BAM files were realigned together, you can start from recalibration: - ```bash nextflow run SciLifeLab/Sarek/main.nf --sample mysample.tsv --step recalibrate nextflow run SciLifeLab/Sarek/germlineVC.nf --tools From eaaaa3882b2ebec38019c488bbf6b8b57864489a Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Mon, 10 Sep 2018 10:59:03 +0200 Subject: [PATCH 25/75] spacing --- buildReferences.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildReferences.nf b/buildReferences.nf index ed26c2ae25..0a19ac3bcb 100644 --- a/buildReferences.nf +++ b/buildReferences.nf @@ -143,7 +143,7 @@ process BuildReferenceIndex { } if (params.verbose) ch_referenceIndex.view { - "Reference index : ${it.fileName}" + "Reference index : ${it.fileName}" } process BuildSAMToolsIndex { From f68f5aa00a57a53269738f6f580d2b8dc7893a7f Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 10 Sep 2018 13:30:31 +0200 Subject: [PATCH 26/75] Renamed tsv to input --- docs/{TSV.md => INPUT.md} | 41 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 
insertions(+) rename docs/{TSV.md => INPUT.md} (73%) diff --git a/docs/TSV.md b/docs/INPUT.md similarity index 73% rename from docs/TSV.md rename to docs/INPUT.md index 084adb1c93..4468f02320 100644 --- a/docs/TSV.md +++ b/docs/INPUT.md @@ -57,3 +57,44 @@ All the files will be in he Preprocessing/Recalibrated/ directory, and by defaul ```bash nextflow run SciLifeLab/Sarek/somaticVC.nf --sample Preprocessing/Recalibrated/mysample.tsv --tools Mutect2,Strelka ``` + +## Input FASTQ file name best practices + +The input folder, containing the FASTQ files for one individual (ID) should be organized into one subfolder for every sample. +All fastq files for that sample should be collected here. + +``` +ID ++--sample1 ++------sample1_lib_flowcell-index_lane_R1_1000.fastq.gz ++------sample1_lib_flowcell-index_lane_R2_1000.fastq.gz ++------sample1_lib_flowcell-index_lane_R1_1000.fastq.gz ++------sample1_lib_flowcell-index_lane_R2_1000.fastq.gz ++--sample2 ++------sample2_lib_flowcell-index_lane_R1_1000.fastq.gz ++------sample2_lib_flowcell-index_lane_R2_1000.fastq.gz ++--sample3 ++------sample3_lib_flowcell-index_lane_R1_1000.fastq.gz ++------sample3_lib_flowcell-index_lane_R2_1000.fastq.gz ++------sample3_lib_flowcell-index_lane_R1_1000.fastq.gz ++------sample3_lib_flowcell-index_lane_R2_1000.fastq.gz +``` + +Fastq filename structure: + +- `sample_lib_flowcell-index_lane_R1_1000.fastq.gz` and +- `sample_lib_flowcell-index_lane_R2_1000.fastq.gz` + +Where: + +- `sample` = sample id +- `lib` = indentifier of libaray preparation +- `flowcell` = identifyer of flow cell for the sequencing run +- `lane` = identifier of the lane of the sequencing run + +Read group information will be parsed from fastq file names according to this: + +- `RGID` = "sample_lib_flowcell_index_lane" +- `RGPL` = "Illumina" +- `PU` = sample +- `RGLB` = lib From 9e224081a0817c10d6083a23b8fa578a814cae50 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 10 Sep 2018 13:34:03 +0200 Subject: [PATCH 
27/75] Started with the beginners docs --- docs/USAGE.md | 41 ++++------------------------------------- 1 file changed, 4 insertions(+), 37 deletions(-) diff --git a/docs/USAGE.md b/docs/USAGE.md index b1474d0f31..e7ab0b66f7 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -9,47 +9,14 @@ The workflow is started for a sample, or a set of samples from the same Individu Each different physical samples is identified by its own ID. For example in a Tumour/Normal settings, this ID could correspond to "Normal", "Tumour_1", "Tumour_2" etc. corresponding to all physical samples from the same patient. -## Input FASTQ file name best practices - -The input folder, containing the FASTQ files for one individual (ID) should be organized into one subfolder for every sample. -All fastq files for that sample should be collected here. +## Preparing to run Sarek +Sarek will start the analysis by parsing a supplied input file in tsv format. +This file contains all the necessary information about the data and should have at least one tab-separated line: ``` -ID -+--sample1 -+------sample1_lib_flowcell-index_lane_R1_1000.fastq.gz -+------sample1_lib_flowcell-index_lane_R2_1000.fastq.gz -+------sample1_lib_flowcell-index_lane_R1_1000.fastq.gz -+------sample1_lib_flowcell-index_lane_R2_1000.fastq.gz -+--sample2 -+------sample2_lib_flowcell-index_lane_R1_1000.fastq.gz -+------sample2_lib_flowcell-index_lane_R2_1000.fastq.gz -+--sample3 -+------sample3_lib_flowcell-index_lane_R1_1000.fastq.gz -+------sample3_lib_flowcell-index_lane_R2_1000.fastq.gz -+------sample3_lib_flowcell-index_lane_R1_1000.fastq.gz -+------sample3_lib_flowcell-index_lane_R2_1000.fastq.gz +SUBJECT_ID XX 0 SAMPLEID 1 /samples/normal_1.fastq.gz /samples/normal_2.fastq.gz ``` -Fastq filename structure: - -- `sample_lib_flowcell-index_lane_R1_1000.fastq.gz` and -- `sample_lib_flowcell-index_lane_R2_1000.fastq.gz` - -Where: - -- `sample` = sample id -- `lib` = indentifier of libaray preparation -- `flowcell` = identifyer of 
flow cell for the sequencing run -- `lane` = identifier of the lane of the sequencing run - -Read group information will be parsed from fastq file names according to this: - -- `RGID` = "sample_lib_flowcell_index_lane" -- `RGPL` = "Illumina" -- `PU` = sample -- `RGLB` = lib - ## Scripts Sarek uses several scripts, a wrapper is currently being made to simplify the command lines. From 2f69352b9433b0c81a4cf933ea456a62807ea17c Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Mon, 10 Sep 2018 16:15:19 +0200 Subject: [PATCH 28/75] killing me softly with a VEP line --- annotate.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotate.nf b/annotate.nf index 8e3c862b6f..fcb4b31ddb 100644 --- a/annotate.nf +++ b/annotate.nf @@ -214,7 +214,7 @@ process RunVEP { finalannotator = annotator == "snpeff" ? 'merge' : 'vep' genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome """ - vep \ + vep --dir /opt/vep/.vep/ \ -i ${vcf} \ -o ${vcf.simpleName}_VEP.ann.vcf \ --assembly ${genome} \ From 70b40d20c4f9a617270a659ce580d2c805c5e980 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 11 Sep 2018 10:27:50 +0200 Subject: [PATCH 29/75] Sarek-data updated --- Sarek-data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sarek-data b/Sarek-data index ba299d76c8..c2da0d2a8a 160000 --- a/Sarek-data +++ b/Sarek-data @@ -1 +1 @@ -Subproject commit ba299d76c851dc916c051ef5f77d7a4ab39dcc9f +Subproject commit c2da0d2a8a1c1a8e9b9b0930b84e34073ea43d03 From fba72dfa58b16fd4f9656f8ac84e09f312e81913 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 11 Sep 2018 10:28:26 +0200 Subject: [PATCH 30/75] concatVCF.sh moved to bin --- germlineVC.nf | 2 +- somaticVC.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/germlineVC.nf b/germlineVC.nf index e273516af7..fe5a41370e 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -380,7 +380,7 @@ process ConcatVCF { concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o 
${outputFile} " """ - ${workflow.projectDir}/scripts/concatenateVCFs.sh ${concatOptions} + concatenateVCFs.sh ${concatOptions} """ } diff --git a/somaticVC.nf b/somaticVC.nf index 26ebd46676..455cfa0b78 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -379,7 +379,7 @@ process ConcatVCF { concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} " """ - ${workflow.projectDir}/scripts/concatenateVCFs.sh ${concatOptions} + concatenateVCFs.sh ${concatOptions} """ } From ce331eab0735a5a4f78762c8be90075eb5d8c7c7 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Tue, 11 Sep 2018 10:30:52 +0200 Subject: [PATCH 31/75] Added --cpus directive --- scripts/test.sh | 14 +++++++++++--- scripts/wrapper.sh | 9 +++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/scripts/test.sh b/scripts/test.sh index bcd017d1f1..fba3fdda21 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -8,6 +8,7 @@ PROFILE=singularity SAMPLE=Sarek-data/testdata/tsv/tiny.tsv TEST=ALL TRAVIS=${TRAVIS:-false} +CPUS=2 TMPDIR=`pwd`/tmp mkdir -p $TMPDIR @@ -53,6 +54,10 @@ do BUILD=true shift # past value ;; + -c|--cpus) + CPUS=$2 + shift # past value + ;; *) # unknown option shift # past argument ;; @@ -60,7 +65,7 @@ do done function run_wrapper() { - ./scripts/wrapper.sh $@ --profile $PROFILE --genome $GENOME --genomeBase $PWD/References/$GENOME --verbose + ./scripts/wrapper.sh $@ --profile $PROFILE --genome $GENOME --genomeBase $PWD/References/$GENOME --verbose --cpus ${CPUS} } function clean_repo() { @@ -110,12 +115,15 @@ fi if [[ ALL,GERMLINE =~ $TEST ]] then run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller + run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller --targetBED Sarek-data/testdata/target.bed + clean_repo fi if [[ ALL,TOOLS =~ $TEST ]] then run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2 + run_wrapper 
--somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --targetBED Sarek-data/testdata/target.bed fi if [[ ALL,MANTA =~ $TEST ]] @@ -126,7 +134,7 @@ then fi -if [[ ALL,ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]] +if [[ ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]] then if [[ $TEST = ANNOTATESNPEFF ]] then @@ -152,7 +160,7 @@ then clean_repo fi -if [[ ALL,BUILDCONTAINERS =~ $TEST ]] && [[ $PROFILE == docker ]] +if [[ BUILDCONTAINERS =~ $TEST ]] && [[ $PROFILE == docker ]] then ./scripts/do_all.sh --genome $GENOME fi diff --git a/scripts/wrapper.sh b/scripts/wrapper.sh index b96b63024b..71c0950290 100755 --- a/scripts/wrapper.sh +++ b/scripts/wrapper.sh @@ -15,6 +15,7 @@ STEP='mapping' TAG='latest' TOOLS='haplotypecaller,strelka,manta' VARIANTCALLING=false +CPUS=2 while [[ $# -gt 0 ]] do @@ -85,6 +86,10 @@ do VARIANTCALLING=true shift # past argument ;; + -c|--cpus) + CPUS=$2 + shift # past value + ;; *) # unknown option shift # past argument ;; @@ -92,8 +97,8 @@ do done function run_sarek() { - echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0)" - nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose + echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0) --max_cpus ${CPUS}" + nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose --max_cpus ${CPUS} } if [[ $GERMLINE == true ]] && [[ $SOMATIC == true ]] From c07d8aa0cde0a1677c7e9ed70a0a36cc08d139c2 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 12:47:13 +0200 Subject: [PATCH 32/75] better testing --- .travis.yml | 10 ++-------- scripts/test.sh | 24 +++--------------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3e18eb1d1d..2ab8e7780f 100644 --- 
a/.travis.yml +++ b/.travis.yml @@ -11,19 +11,13 @@ env: global: - NXF_VER=0.31.0 SGT_VER=2.5.1 matrix: - - CE=singularity TEST=TOOLS - - CE=singularity TEST=MANTA - - CE=docker TEST=MANTA - - CE=docker TEST=TOOLS + - CE=singularity TEST=SOMATIC + - CE=docker TEST=SOMATIC - CE=docker TEST=ANNOTATEVEP - CE=singularity TEST=ANNOTATESNPEFF - - CE=singularity TEST=STEP - CE=singularity TEST=GERMLINE - - CE=singularity TEST=DIR - CE=docker TEST=ANNOTATESNPEFF - - CE=docker TEST=STEP - CE=docker TEST=GERMLINE - - CE=docker TEST=DIR install: # Install Nextflow (and Singularity if needed) diff --git a/scripts/test.sh b/scripts/test.sh index bcd017d1f1..13492a34f6 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -94,38 +94,20 @@ then fi fi -if [[ ALL,DIR =~ $TEST ]] -then - run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal - clean_repo -fi - -if [[ ALL,STEP =~ $TEST ]] -then - run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal - run_wrapper --germline --step recalibrate --noReports - clean_repo -fi - if [[ ALL,GERMLINE =~ $TEST ]] then run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller + run_wrapper --germline --step recalibrate --noReports clean_repo fi -if [[ ALL,TOOLS =~ $TEST ]] +if [[ ALL,SOMATIC =~ $TEST ]] then - run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2 -fi - -if [[ ALL,MANTA =~ $TEST ]] -then - run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta --noReports + run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Manta,Mutect2 --noReports run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP clean_repo fi - if [[ ALL,ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]] then if [[ $TEST = ANNOTATESNPEFF ]] From 
f11106eb18804c784e8a38c06e321600f143a8b6 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 13:56:01 +0200 Subject: [PATCH 33/75] sort tests by amount of time needed --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2ab8e7780f..3d1dd30eb8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,8 +15,8 @@ env: - CE=docker TEST=SOMATIC - CE=docker TEST=ANNOTATEVEP - CE=singularity TEST=ANNOTATESNPEFF - - CE=singularity TEST=GERMLINE - CE=docker TEST=ANNOTATESNPEFF + - CE=singularity TEST=GERMLINE - CE=docker TEST=GERMLINE install: From 3fe164fce49322416d0af03a511b31f314bb5dde Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 13:56:51 +0200 Subject: [PATCH 34/75] sort exports --- scripts/test.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/test.sh b/scripts/test.sh index 13492a34f6..37bf133dde 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -13,10 +13,8 @@ TMPDIR=`pwd`/tmp mkdir -p $TMPDIR export NXF_SINGULARITY_CACHEDIR=$TMPDIR export NXF_TEMP=$TMPDIR - -export SINGULARITY_TMPDIR=$TMPDIR export SINGULARITY_CACHEDIR=$TMPDIR - +export SINGULARITY_TMPDIR=$TMPDIR # remove Reference directory rm -rf References From 46e187731f80aeb50e8ceea856be31d8695bf376 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 13:57:26 +0200 Subject: [PATCH 35/75] update depreciated syntax for singularity --- buildContainers.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildContainers.nf b/buildContainers.nf index 8aa546c657..845c54f9ff 100644 --- a/buildContainers.nf +++ b/buildContainers.nf @@ -98,7 +98,7 @@ process PullSingularityContainers { script: """ - singularity pull --name ${container}-${params.tag}.img docker://${params.repository}/${container}:${params.tag} + singularity build --name ${container}-${params.tag}.img docker://${params.repository}/${container}:${params.tag} """ } From 
b432dd7eb8b52fa6b15edc581c49b761d29b3c10 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 13:57:54 +0200 Subject: [PATCH 36/75] add pulling other big containers --- scripts/containers.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/containers.sh b/scripts/containers.sh index fc5dc12a31..f7d41565b8 100755 --- a/scripts/containers.sh +++ b/scripts/containers.sh @@ -25,7 +25,19 @@ do esac done -if [[ $TEST = ANNOTATEVEP ]] && [[ $PROFILE = docker ]] && [[ $TRAVIS == true ]] +if [[ $PROFILE = docker ]] && [[ $TRAVIS == true ]] then - docker pull maxulysse/vepgrch37:latest + if [[ $TEST = ANNOTATEVEP ]] + then + docker pull maxulysse/vepgrch37:latest + else + docker pull maxulysse/snpeffgrch37:latest + fi +fi + +if [[ $TEST = ANNOTATESNPEFF ]] && [[ $PROFILE = singularity ]] && [[ $TRAVIS == true ]] +then + cd tmp + singularity build --name maxulysse-snpeffgrch37-latest.img docker://maxulysse/snpeffgrch37:latest + cd .. fi From ce93c115fbb184e8acb779463750700f40b13500 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 13:58:04 +0200 Subject: [PATCH 37/75] update docs --- docs/TESTS.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/docs/TESTS.md b/docs/TESTS.md index 458ffbad82..3e059cc5eb 100644 --- a/docs/TESTS.md +++ b/docs/TESTS.md @@ -24,11 +24,6 @@ nextflow run main.nf --sampleDir Sarek-data/testdata/manta/normal \ --step mapping --genome smallGRCh37 --genome_base References/smallGRCh37 \ --tag latest -profile singularity -# Testing to restart from `realign` -nextflow run main.nf --step realign \ - --genome smallGRCh37 --genome_base References/smallGRCh37 \ - --tag latest -profile singularity - # Testing to restart from `recalibrate` nextflow run main.nf --step recalibrate \ --genome smallGRCh37 --genome_base References/smallGRCh37 \ @@ -93,11 +88,8 @@ Four optional arguments are supported: - `-s` || `--sample`: Use to change the test sample 
(default=`Sarek-data/testdata/tsv/tiny.tsv`) - `-t` || `--test`: - - `DIR`: test `mapping` with an input directory - - `STEP`: test `mapping`, `realign` and `recalibrate` - - `GERMLINE`: test `mapping` and Variant Calling with `HaplotypeCaller` - - `TOOLS`: test `mapping` and Variant Calling with `FreeBayes`, `HaplotypeCaller`, `MuTect1`, `MuTect2`, `Strelka` - - `MANTA`: test `mapping` and Variant Calling with `Manta` + - `GERMLINE`: test `mapping`, `recalibrate` and Variant Calling with `HaplotypeCaller` + - `SOMATIC`: test `mapping` and Variant Calling with `FreeBayes`, `HaplotypeCaller`, `MuTect1`, `MuTect2`, `Strelka` and `Manta` - `ANNOTATESNPEFF`: test annotation using `snpEFF` - `ANNOTATEVEP`: test annotation using `VEP` - `BUILDCONTAINERS`: test building all containers except `snpeffgrch37`, `snpeffgrch38`, `vepgrch37` and `vepgrch38` From d910c134b35434039d3cf7e5c180a1f54f9d3a92 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 14:02:52 +0200 Subject: [PATCH 38/75] fix path to TMPDIR --- scripts/containers.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/containers.sh b/scripts/containers.sh index f7d41565b8..3f9f5c5343 100755 --- a/scripts/containers.sh +++ b/scripts/containers.sh @@ -5,6 +5,9 @@ PROFILE=singularity TEST=ALL TRAVIS=${TRAVIS:-false} +TMPDIR=`pwd`/tmp +mkdir -p $TMPDIR + while [[ $# -gt 0 ]] do key=$1 @@ -37,7 +40,7 @@ fi if [[ $TEST = ANNOTATESNPEFF ]] && [[ $PROFILE = singularity ]] && [[ $TRAVIS == true ]] then - cd tmp + cd $TMPDIR singularity build --name maxulysse-snpeffgrch37-latest.img docker://maxulysse/snpeffgrch37:latest cd .. 
fi From 365876d2e40ec6c513afee74ca1843d1184de4bb Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 14:09:01 +0200 Subject: [PATCH 39/75] update with .simg extension for singularity image + remove --name option --- buildContainers.nf | 2 +- conf/singularity-path.config | 72 ++++++++++++++++++------------------ scripts/containers.sh | 4 +- 3 files changed, 38 insertions(+), 40 deletions(-) diff --git a/buildContainers.nf b/buildContainers.nf index 845c54f9ff..69e869d646 100644 --- a/buildContainers.nf +++ b/buildContainers.nf @@ -98,7 +98,7 @@ process PullSingularityContainers { script: """ - singularity build --name ${container}-${params.tag}.img docker://${params.repository}/${container}:${params.tag} + singularity build ${container}-${params.tag}.simg docker://${params.repository}/${container}:${params.tag} """ } diff --git a/conf/singularity-path.config b/conf/singularity-path.config index 938829dc77..448e7d1432 100644 --- a/conf/singularity-path.config +++ b/conf/singularity-path.config @@ -14,111 +14,111 @@ singularity { process { withName:BuildBWAindexes { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:BuildReferenceIndex { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:BuildSAMToolsIndex { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:BuildVCFIndex { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:CompressVCF { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:ConcatVCF { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = 
"${params.containerPath}/sarek-${params.tag}.simg" } withName:CreateRecalibrationTable { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:GetVersionAll { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:GetVersionAlleleCount { - container = "${params.containerPath}/runallelecount-${params.tag}.img" + container = "${params.containerPath}/runallelecount-${params.tag}.simg" } withName:GetVersionASCAT { - container = "${params.containerPath}/r-base-${params.tag}.img" + container = "${params.containerPath}/r-base-${params.tag}.simg" } withName:GetVersionSnpeff { - container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} + container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.simg" : "${params.containerPath}/snpeffgrch37-${params.tag}.simg"} } withName:GetVersionVEP { - container = {params.genome == 'GRCh38' ? "${params.containerPath}/vepgrch38-${params.tag}.img" : "${params.containerPath}/vepgrch37-${params.tag}.img"} + container = {params.genome == 'GRCh38' ? 
"${params.containerPath}/vepgrch38-${params.tag}.simg" : "${params.containerPath}/vepgrch37-${params.tag}.simg"} } withName:MapReads { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:MarkDuplicates { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:MergeBams { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RecalibrateBam { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunAlleleCount { - container = "${params.containerPath}/runallelecount-${params.tag}.img" + container = "${params.containerPath}/runallelecount-${params.tag}.simg" } withName:RunAscat { - container = "${params.containerPath}/r-base-${params.tag}.img" + container = "${params.containerPath}/r-base-${params.tag}.simg" } withName:RunBamQC { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunBcftoolsStats { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunConvertAlleleCounts { - container = "${params.containerPath}/r-base-${params.tag}.img" + container = "${params.containerPath}/r-base-${params.tag}.simg" } withName:RunFastQC { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunFreeBayes { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunGenotypeGVCFs { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = 
"${params.containerPath}/sarek-${params.tag}.simg" } withName:RunHaplotypecaller { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunManta { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunMultiQC { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunMutect2 { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunSamtoolsStats { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunSingleManta { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunSingleStrelka { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunSnpeff { - container = {params.genome == 'GRCh38' ? "${params.containerPath}/snpeffgrch38-${params.tag}.img" : "${params.containerPath}/snpeffgrch37-${params.tag}.img"} + container = {params.genome == 'GRCh38' ? 
"${params.containerPath}/snpeffgrch38-${params.tag}.simg" : "${params.containerPath}/snpeffgrch37-${params.tag}.simg"} } withName:RunStrelka { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunStrelkaBP { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunVcftools { - container = "${params.containerPath}/sarek-${params.tag}.img" + container = "${params.containerPath}/sarek-${params.tag}.simg" } withName:RunVEP { - container = {params.genome == 'GRCh38' ? "${params.containerPath}/vepgrch38-${params.tag}.img" : "${params.containerPath}/vepgrch37-${params.tag}.img"} + container = {params.genome == 'GRCh38' ? "${params.containerPath}/vepgrch38-${params.tag}.simg" : "${params.containerPath}/vepgrch37-${params.tag}.simg"} } } diff --git a/scripts/containers.sh b/scripts/containers.sh index 3f9f5c5343..da6c2fd1ba 100755 --- a/scripts/containers.sh +++ b/scripts/containers.sh @@ -40,7 +40,5 @@ fi if [[ $TEST = ANNOTATESNPEFF ]] && [[ $PROFILE = singularity ]] && [[ $TRAVIS == true ]] then - cd $TMPDIR - singularity build --name maxulysse-snpeffgrch37-latest.img docker://maxulysse/snpeffgrch37:latest - cd .. 
+ singularity build $TMPDIR/maxulysse-snpeffgrch37-latest.simg docker://maxulysse/snpeffgrch37:latest fi From f88e4c3499b772d5bb6b3aac01c58a3f284496c9 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 14:33:33 +0200 Subject: [PATCH 40/75] clean up test.sh script --- scripts/test.sh | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/scripts/test.sh b/scripts/test.sh index 37bf133dde..a170eb1803 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -82,14 +82,6 @@ then echo "$(tput setaf 1)Building references$(tput sgr0)" nextflow run buildReferences.nf --refDir Sarek-data/reference --outDir References/$GENOME -profile $PROFILE --genome $GENOME --verbose fi - # Remove images only on TRAVIS - if [[ $PROFILE == docker ]] && [[ $TRAVIS == true ]] - then - docker rmi -f maxulysse/igvtools:latest - elif [[ $PROFILE == singularity ]] && [[ $TRAVIS == true ]] - then - rm -rf work/singularity/igvtools-latest.img - fi fi if [[ ALL,GERMLINE =~ $TEST ]] @@ -118,15 +110,6 @@ then then ANNOTATOR=merge,snpEFF,VEP fi - if [[ $PROFILE == docker ]] && [[ $TRAVIS == true ]] - then - docker rmi -f maxulysse/sarek:latest - docker rmi -f maxulysse/picard:latest - elif [[ $PROFILE == singularity ]] && [[ $TRAVIS == true ]] - then - rm -rf work/singularity/sarek-latest.img - rm -rf work/singularity/picard-latest.img - fi run_wrapper --annotate --tools ${ANNOTATOR} --annotateVCF Sarek-data/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports run_wrapper --annotate --tools ${ANNOTATOR} --annotateVCF Sarek-data/testdata/vcf/Strelka_1234N_variants.vcf.gz,Sarek-data/testdata/vcf/Strelka_9876T_variants.vcf.gz clean_repo From 1558b0707a1d31f41e95e8a65b8306821b099061 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 14:55:33 +0200 Subject: [PATCH 41/75] update CHANGELOG --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4115f195c..9b7750e652 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -17,12 +17,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version - [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update CHANGELOG - [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Use `splitCsv` instead of `readlines` -- [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Improve install script -- [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Simplify tests +- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Improve install script +- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Simplify tests - [#627](https://github.com/SciLifeLab/Sarek/pull/627), [#629](https://github.com/SciLifeLab/Sarek/pull/629), [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Refactor docs - [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Refactor config - [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes - [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering +- [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images ### `Removed` - [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Remove old Issue Template From ab70d0e8b79134a37ff1b9393ab392c797daee27 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Tue, 11 Sep 2018 15:36:43 +0200 Subject: [PATCH 42/75] forgot one MuTecT1 [skip ci] --- docs/TESTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/TESTS.md b/docs/TESTS.md index 3e059cc5eb..1e160bd9c9 100644 --- a/docs/TESTS.md +++ b/docs/TESTS.md @@ -89,7 +89,7 @@ Four optional arguments are supported: Use to change the test sample (default=`Sarek-data/testdata/tsv/tiny.tsv`) - `-t` || `--test`: - `GERMLINE`: test `mapping`, 
`recalibrate` and Variant Calling with `HaplotypeCaller` - - `SOMATIC`: test `mapping` and Variant Calling with `FreeBayes`, `HaplotypeCaller`, `MuTect1`, `MuTect2`, `Strelka` and `Manta` + - `SOMATIC`: test `mapping` and Variant Calling with `FreeBayes`, `HaplotypeCaller`, `MuTect2`, `Strelka` and `Manta` - `ANNOTATESNPEFF`: test annotation using `snpEFF` - `ANNOTATEVEP`: test annotation using `VEP` - `BUILDCONTAINERS`: test building all containers except `snpeffgrch37`, `snpeffgrch38`, `vepgrch37` and `vepgrch38` From 43ea430958c9f7dd5a3a1c18983f824b27f6c5c2 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 12 Sep 2018 08:39:27 +0200 Subject: [PATCH 43/75] putting concatenateVCF.sh to bin --- bin/concatenateVCFs.sh | 85 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100755 bin/concatenateVCFs.sh diff --git a/bin/concatenateVCFs.sh b/bin/concatenateVCFs.sh new file mode 100755 index 0000000000..89bf9d125e --- /dev/null +++ b/bin/concatenateVCFs.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# this script concatenates all VCFs that are in the local directory: the +# purpose is to make a single VCF from all the VCFs that were created from different intervals + +usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; } + +while getopts "i:c:o:t:" p; do + case "${p}" in + i) + genomeIndex=${OPTARG} + ;; + c) + cpus=${OPTARG} + ;; + o) + outputFile=${OPTARG} + ;; + t) + targetBED=${OPTARG} + ;; + *) + usage + ;; + esac +done +shift $((OPTIND-1)) + +if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi +if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi +if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi + +set -euo pipefail + +# first make a header from one of the VCF intervals +# get rid of interval information only from the GATK command-line, but leave the rest +FIRSTVCF=$(ls *.vcf | head -n 1) 
+sed -n '/^[^#]/q;p' $FIRSTVCF | \ +awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \ +> header + +# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig +# header in the VCF as it is optional (FreeBayes does not save it, for example) +CONTIGS=($(cut -f1 ${genomeIndex})) + +# concatenate VCFs in the correct order +( + cat header + + for chr in "${CONTIGS[@]}"; do + # Skip if globbing would not match any file to avoid errors such as + # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 + # was not processed. + pattern="${chr}_*.vcf" + if ! compgen -G "${pattern}" > /dev/null; then continue; fi + + # ls -v sorts by numeric value ("version"), which means that chr1_100_ + # is sorted *after* chr1_99_. + for vcf in $(ls -v ${pattern}); do + # Determine length of header. + # The 'q' command makes sed exit when it sees the first non-header + # line, which avoids reading in the entire file. + L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l) + + # Then print all non-header lines. Since tail is very fast (nearly as + # fast as cat), this is way more efficient than using a single sed, + # awk or grep command. + tail -n +$((L+1)) ${vcf} + done + done +) | bgzip -@${cpus} > rawcalls.vcf.gz +tabix rawcalls.vcf.gz + +set +u + +# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided +echo "target is $targetBED" +if [ ! -z ${targetBED+x} ]; then + echo "Selecting subset..." 
+ bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz + tabix ${outputFile}.gz +else + # simply rename the raw calls as WGS results + for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done +fi + From d237b8f993900dbc09e785d17b3316dba88a9c6c Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 12 Sep 2018 08:40:51 +0200 Subject: [PATCH 44/75] adding targetBED to tests and wrapper --- scripts/test.sh | 4 ++-- scripts/wrapper.sh | 22 ++++++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/scripts/test.sh b/scripts/test.sh index fba3fdda21..8481b93315 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -115,7 +115,7 @@ fi if [[ ALL,GERMLINE =~ $TEST ]] then run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller - run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller --targetBED Sarek-data/testdata/target.bed + run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller --bed `pwd`/Sarek-data/testdata/target.bed clean_repo fi @@ -123,7 +123,7 @@ fi if [[ ALL,TOOLS =~ $TEST ]] then run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2 - run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --targetBED Sarek-data/testdata/target.bed + run_wrapper --somatic --sample $SAMPLE --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --bed `pwd`/Sarek-data/testdata/target.bed fi if [[ ALL,MANTA =~ $TEST ]] diff --git a/scripts/wrapper.sh b/scripts/wrapper.sh index 71c0950290..31f587c7e8 100755 --- a/scripts/wrapper.sh +++ b/scripts/wrapper.sh @@ -34,11 +34,19 @@ do SOMATIC=true shift # past argument ;; + -c|--cpus) + CPUS=$2 + shift # past value + ;; -d|--sampleDir) SAMPLEDIR=$2 shift # past argument shift # past value ;; + 
-e|--bed) + TARGETBED=$2 + shift # past value + ;; -f|--annotateVCF) ANNOTATEVCF=$2 shift # past argument @@ -86,10 +94,6 @@ do VARIANTCALLING=true shift # past argument ;; - -c|--cpus) - CPUS=$2 - shift # past value - ;; *) # unknown option shift # past argument ;; @@ -97,8 +101,14 @@ do done function run_sarek() { - echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0) --max_cpus ${CPUS}" - nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose --max_cpus ${CPUS} + # https://stackoverflow.com/questions/3601515/how-to-check-if-a-variable-is-set-in-bash + if [ -z ${TARGETBED+x} ]; then # variable unset + echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0) --max_cpus ${CPUS}" + nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose --max_cpus ${CPUS} + else + echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0) --max_cpus ${CPUS}" --targetBED ${TARGETBED} + nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose --max_cpus ${CPUS} --targetBED ${TARGETBED} + fi } if [[ $GERMLINE == true ]] && [[ $SOMATIC == true ]] From 5d5407aede6eb80c60b534132b289ad0972a5311 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 12 Sep 2018 08:41:47 +0200 Subject: [PATCH 45/75] temporary fix for vepgrch37 container path problem --- annotate.nf | 2 +- lib/QC.groovy | 2 +- scripts/concatenateVCFs.sh | 85 -------------------------------------- 3 files changed, 2 insertions(+), 87 deletions(-) delete mode 100755 scripts/concatenateVCFs.sh diff --git a/annotate.nf b/annotate.nf index 60b975bd32..493d63fa5a 100644 --- a/annotate.nf +++ b/annotate.nf @@ -216,7 +216,7 @@ process RunVEP { finalannotator = annotator == "snpeff" ? 
'merge' : 'vep' genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome """ - vep --dir /opt/vep/.vep/ \ + /opt/vep/src/ensembl-vep/vep --dir /opt/vep/.vep/ \ -i ${vcf} \ -o ${vcf.simpleName}_VEP.ann.vcf \ --assembly ${genome} \ diff --git a/lib/QC.groovy b/lib/QC.groovy index 6e6ef83eb9..2e1c20f820 100644 --- a/lib/QC.groovy +++ b/lib/QC.groovy @@ -60,7 +60,7 @@ class QC { // Get VEP version static def getVersionVEP() { """ - vep --help > v_vep.txt + /opt/vep/src/ensembl-vep/vep --help > v_vep.txt """ } } diff --git a/scripts/concatenateVCFs.sh b/scripts/concatenateVCFs.sh deleted file mode 100755 index 89bf9d125e..0000000000 --- a/scripts/concatenateVCFs.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env bash -# this script concatenates all VCFs that are in the local directory: the -# purpose is to make a single VCF from all the VCFs that were created from different intervals - -usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; } - -while getopts "i:c:o:t:" p; do - case "${p}" in - i) - genomeIndex=${OPTARG} - ;; - c) - cpus=${OPTARG} - ;; - o) - outputFile=${OPTARG} - ;; - t) - targetBED=${OPTARG} - ;; - *) - usage - ;; - esac -done -shift $((OPTIND-1)) - -if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi -if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi -if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi - -set -euo pipefail - -# first make a header from one of the VCF intervals -# get rid of interval information only from the GATK command-line, but leave the rest -FIRSTVCF=$(ls *.vcf | head -n 1) -sed -n '/^[^#]/q;p' $FIRSTVCF | \ -awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \ -> header - -# Get list of contigs from the FASTA index (.fai). 
We cannot use the ##contig -# header in the VCF as it is optional (FreeBayes does not save it, for example) -CONTIGS=($(cut -f1 ${genomeIndex})) - -# concatenate VCFs in the correct order -( - cat header - - for chr in "${CONTIGS[@]}"; do - # Skip if globbing would not match any file to avoid errors such as - # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3 - # was not processed. - pattern="${chr}_*.vcf" - if ! compgen -G "${pattern}" > /dev/null; then continue; fi - - # ls -v sorts by numeric value ("version"), which means that chr1_100_ - # is sorted *after* chr1_99_. - for vcf in $(ls -v ${pattern}); do - # Determine length of header. - # The 'q' command makes sed exit when it sees the first non-header - # line, which avoids reading in the entire file. - L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l) - - # Then print all non-header lines. Since tail is very fast (nearly as - # fast as cat), this is way more efficient than using a single sed, - # awk or grep command. - tail -n +$((L+1)) ${vcf} - done - done -) | bgzip -@${cpus} > rawcalls.vcf.gz -tabix rawcalls.vcf.gz - -set +u - -# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided -echo "target is $targetBED" -if [ ! -z ${targetBED+x} ]; then - echo "Selecting subset..." 
- bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz - tabix ${outputFile}.gz -else - # simply rename the raw calls as WGS results - for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done -fi - From d046b858bb89caf1c34f57873230b1e0d1c566bd Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 12 Sep 2018 11:58:46 +0200 Subject: [PATCH 46/75] update Dockerfile --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 340e699e71..08425200ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,5 +6,5 @@ LABEL \ maintainer="Maxime Garcia , Szilveszter Juhos " COPY environment.yml / -RUN conda env update -n root -f /environment.yml && conda clean -a -ENV PATH /opt/conda/bin:$PATH +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/sarek-2.1.0/bin:$PATH From 433c0078dd47cf54273d9a8579d8feae3c1eb650 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 12 Sep 2018 12:00:18 +0200 Subject: [PATCH 47/75] add Sarek version into bug_report --- .github/ISSUE_TEMPLATE/bug_report.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index e32e0c2f68..b4a3147f40 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -31,5 +31,8 @@ A clear and concise description of what you expected to happen. **Container (please complete the following information):** - tag: [e.g. 1.0.0] +**Sarek (please complete the following information):** + - version: [e.g. 2.1.0] + **Additional context** Add any other context about the problem here. 
From e7c1d3a4fe724b7c9e773ac17839f119e8a4ebb1 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 12 Sep 2018 12:24:59 +0200 Subject: [PATCH 48/75] Beautify CHANGELOG --- CHANGELOG.md | 315 ++++++++++++++++++++++++++++----------------------- 1 file changed, 173 insertions(+), 142 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b7750e652..5a866d012e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,223 +1,254 @@ # Changelog + All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [Unreleased] + ### `Added` -- [#613](https://github.com/SciLifeLab/Sarek/pull/613) - Add Issue Templates (bug report and feature request) -- [#614](https://github.com/SciLifeLab/Sarek/pull/614) - Add PR Template -- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Add presentation -- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation -- [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` -- [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule + +- [#613](https://github.com/SciLifeLab/Sarek/pull/613) - Add Issue Templates (bug report and feature request) +- [#614](https://github.com/SciLifeLab/Sarek/pull/614) - Add PR Template +- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Add presentation +- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation +- [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` +- [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule ### `Changed` -- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version -- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update 
CHANGELOG -- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Use `splitCsv` instead of `readlines` -- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Improve install script -- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Simplify tests -- [#627](https://github.com/SciLifeLab/Sarek/pull/627), [#629](https://github.com/SciLifeLab/Sarek/pull/629), [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Refactor docs -- [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Refactor config -- [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes -- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering -- [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images + +- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version +- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update CHANGELOG +- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Use `splitCsv` instead of `readlines` +- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Improve install script +- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Simplify tests +- [#627](https://github.com/SciLifeLab/Sarek/pull/627), [#629](https://github.com/SciLifeLab/Sarek/pull/629), [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Refactor docs +- [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Refactor config +- [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes +- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering +- [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images 
### `Removed` -- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Remove old Issue Template -- [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Remove old Dockerfiles -- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Remove old comments + +- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Remove old Issue Template +- [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Remove old Dockerfiles +- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Remove old comments ### `Fixed` -- [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Fix VEP tests -- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Fix links in MD files + +- [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Fix VEP tests +- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Fix links in MD files + ## [2.1.0] - Ruotes - 2018-08-14 ### `Added` -- [#555](https://github.com/SciLifeLab/Sarek/pull/555) - `snpEff` output into `VEP` -- [#556](https://github.com/SciLifeLab/Sarek/pull/556) - `Strelka` Best Practices -- [#563](https://github.com/SciLifeLab/Sarek/pull/563) - Use `SnpEFF` reports in `MultiQC` -- [#568](https://github.com/SciLifeLab/Sarek/pull/568) - `VCFTools` process `RunVcftools` for QC -- [#574](https://github.com/SciLifeLab/Sarek/pull/574), [#580](https://github.com/SciLifeLab/Sarek/pull/580) - Abstracts for NPMI, JOBIM and EACR25 -- [#577](https://github.com/SciLifeLab/Sarek/pull/577) - New repository for testing: [Sarek-data](https://github.com/SciLifeLab/Sarek-data) -- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New library `QC` for functions `bamQC`, `bcftools`, `samtoolsStats`, `vcftools`, `getVersionBCFtools`, `getVersionGATK`, `getVersionManta`, `getVersionSnpEFF`, `getVersionStrelka`, `getVersionVCFtools`, `getVersionVEP` -- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New Processes `GetVersionBCFtools`, `GetVersionGATK`, `GetVersionManta`, `GetVersionSnpEFF`, `GetVersionStrelka`, `GetVersionVCFtools`, 
`GetVersionVEP` -- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - new Python script `bin/scrape_tool_versions.py` inspired by @ewels and @apeltzer -- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New QC Process `RunVcftools` -- [#596](https://github.com/SciLifeLab/Sarek/pull/596) - New profile for BinAC cluster -- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - New function `sarek_ascii()` in `SarekUtils` -- [#599](https://github.com/SciLifeLab/Sarek/pull/599), [#602](https://github.com/SciLifeLab/Sarek/pull/602) - New Process `CompressVCF` -- [#601](https://github.com/SciLifeLab/Sarek/pull/601), [#603](https://github.com/SciLifeLab/Sarek/pull/603) - Container for GATK4 -- [#606](https://github.com/SciLifeLab/Sarek/pull/606) - Add test data as a submodule from [`Sarek-data`](https://github.com/SciLifeLab/Sarek-data) -- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Add documentation on how to install Nextflow on `bianca` + +- [#555](https://github.com/SciLifeLab/Sarek/pull/555) - `snpEff` output into `VEP` +- [#556](https://github.com/SciLifeLab/Sarek/pull/556) - `Strelka` Best Practices +- [#563](https://github.com/SciLifeLab/Sarek/pull/563) - Use `SnpEFF` reports in `MultiQC` +- [#568](https://github.com/SciLifeLab/Sarek/pull/568) - `VCFTools` process `RunVcftools` for QC +- [#574](https://github.com/SciLifeLab/Sarek/pull/574), [#580](https://github.com/SciLifeLab/Sarek/pull/580) - Abstracts for NPMI, JOBIM and EACR25 +- [#577](https://github.com/SciLifeLab/Sarek/pull/577) - New repository for testing: [Sarek-data](https://github.com/SciLifeLab/Sarek-data) +- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New library `QC` for functions `bamQC`, `bcftools`, `samtoolsStats`, `vcftools`, `getVersionBCFtools`, `getVersionGATK`, `getVersionManta`, `getVersionSnpEFF`, `getVersionStrelka`, `getVersionVCFtools`, `getVersionVEP` +- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New Processes `GetVersionBCFtools`, 
`GetVersionGATK`, `GetVersionManta`, `GetVersionSnpEFF`, `GetVersionStrelka`, `GetVersionVCFtools`, `GetVersionVEP` +- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - new Python script `bin/scrape_tool_versions.py` inspired by @ewels and @apeltzer +- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New QC Process `RunVcftools` +- [#596](https://github.com/SciLifeLab/Sarek/pull/596) - New profile for BinAC cluster +- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - New function `sarek_ascii()` in `SarekUtils` +- [#599](https://github.com/SciLifeLab/Sarek/pull/599), [#602](https://github.com/SciLifeLab/Sarek/pull/602) - New Process `CompressVCF` +- [#601](https://github.com/SciLifeLab/Sarek/pull/601), [#603](https://github.com/SciLifeLab/Sarek/pull/603) - Container for GATK4 +- [#606](https://github.com/SciLifeLab/Sarek/pull/606) - Add test data as a submodule from [`Sarek-data`](https://github.com/SciLifeLab/Sarek-data) +- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Add documentation on how to install Nextflow on `bianca` ### `Changed` -- [#557](https://github.com/SciLifeLab/Sarek/pull/557), [#583](https://github.com/SciLifeLab/Sarek/pull/583), [#585](https://github.com/SciLifeLab/Sarek/pull/585), [#588](https://github.com/SciLifeLab/Sarek/pull/588) - Update help -- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - GitHub langage for the repository is now `Nextflow` -- [#561](https://github.com/SciLifeLab/Sarek/pull/561) - `do_all.sh` build only containers for one genome reference (default `GRCh38`) only -- [#571](https://github.com/SciLifeLab/Sarek/pull/571) - Only one container for all QC tools -- [#582](https://github.com/SciLifeLab/Sarek/pull/582), [#587](https://github.com/SciLifeLab/Sarek/pull/587) - Update figures -- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - Function `defineDirectoryMap()` is now part of `SarekUtils` -- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - Process `GenerateMultiQCconfig` 
replace by function `createMultiQCconfig()` -- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Move `checkFileExtension()`, `checkParameterExistence()`, `checkParameterList()`, `checkReferenceMap()`, `checkRefExistence()`, `extractBams()`, `extractGenders()`, `returnFile()`, `returnStatus()` and `returnTSV()` functions to `SarekUtils` -- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - `extractBams()` now takes an extra parameter. -- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Replace depreciated operator `phase` by `join`. -- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Reduce data footprint for Process `CreateRecalibrationTable` -- [#599](https://github.com/SciLifeLab/Sarek/pull/599) - Merge is tested with `ANNOTATEALL` -- [#604](https://github.com/SciLifeLab/Sarek/pull/604) - Synching `GRCh38` `wgs_calling_regions` bedfiles -- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Update to GATK4 -- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - One container approach -- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version -- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update CHANGELOG -- [#617](https://github.com/SciLifeLab/Sarek/pull/617) - Replace depreciated $name syntax with withName + +- [#557](https://github.com/SciLifeLab/Sarek/pull/557), [#583](https://github.com/SciLifeLab/Sarek/pull/583), [#585](https://github.com/SciLifeLab/Sarek/pull/585), [#588](https://github.com/SciLifeLab/Sarek/pull/588) - Update help +- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - GitHub language for the repository is now `Nextflow` +- [#561](https://github.com/SciLifeLab/Sarek/pull/561) - `do_all.sh` build only containers for one genome reference (default `GRCh38`) only +- [#571](https://github.com/SciLifeLab/Sarek/pull/571) - Only one container for all QC tools +- [#582](https://github.com/SciLifeLab/Sarek/pull/582), [#587](https://github.com/SciLifeLab/Sarek/pull/587) - Update 
figures +- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - Function `defineDirectoryMap()` is now part of `SarekUtils` +- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - Process `GenerateMultiQCconfig` replaced by function `createMultiQCconfig()` +- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Move `checkFileExtension()`, `checkParameterExistence()`, `checkParameterList()`, `checkReferenceMap()`, `checkRefExistence()`, `extractBams()`, `extractGenders()`, `returnFile()`, `returnStatus()` and `returnTSV()` functions to `SarekUtils` +- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - `extractBams()` now takes an extra parameter. +- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Replace deprecated operator `phase` by `join`. +- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Reduce data footprint for Process `CreateRecalibrationTable` +- [#599](https://github.com/SciLifeLab/Sarek/pull/599) - Merge is tested with `ANNOTATEALL` +- [#604](https://github.com/SciLifeLab/Sarek/pull/604) - Synching `GRCh38` `wgs_calling_regions` bedfiles +- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Update to GATK4 +- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - One container approach +- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version +- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update CHANGELOG +- [#617](https://github.com/SciLifeLab/Sarek/pull/617) - Replace deprecated $name syntax with withName ### `Fixed` -- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - Display message for `repository` and `containerPath` -- [#566](https://github.com/SciLifeLab/Sarek/pull/566) - `slurmDownload` profile -- [#579](https://github.com/SciLifeLab/Sarek/pull/579), [#584](https://github.com/SciLifeLab/Sarek/pull/584) - `Manta` output reorganized after modification for `Strelka Best Practices` process -- [#585](https://github.com/SciLifeLab/Sarek/pull/583) - Trace file is plain txt -- 
[#590](https://github.com/SciLifeLab/Sarek/pull/590), [#593](https://github.com/SciLifeLab/Sarek/pull/593) - Fix Singularity installation in Travis CI testing -- [#598](https://github.com/SciLifeLab/Sarek/pull/598), [#601](https://github.com/SciLifeLab/Sarek/pull/601) - Fixes for Python script `selectROI.py` to work with CLC viewer + +- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - Display message for `repository` and `containerPath` +- [#566](https://github.com/SciLifeLab/Sarek/pull/566) - `slurmDownload` profile +- [#579](https://github.com/SciLifeLab/Sarek/pull/579), [#584](https://github.com/SciLifeLab/Sarek/pull/584) - `Manta` output reorganized after modification for `Strelka Best Practices` process +- [#585](https://github.com/SciLifeLab/Sarek/pull/583) - Trace file is plain txt +- [#590](https://github.com/SciLifeLab/Sarek/pull/590), [#593](https://github.com/SciLifeLab/Sarek/pull/593) - Fix Singularity installation in Travis CI testing +- [#598](https://github.com/SciLifeLab/Sarek/pull/598), [#601](https://github.com/SciLifeLab/Sarek/pull/601) - Fixes for Python script `selectROI.py` to work with CLC viewer ### `Removed` -- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Remove Mutect1 + +- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Remove Mutect1 ## [2.0.0] - 2018-03-23 + ### `Added` -- basic wrapper script -- Abstract, posters and figures -- ROI selector and FreeBayes sanitizer scripts -- New logo and icon for the project -- check for existing tumor/normal channel -- `SarekUtils` with `checkParams()`, `checkParameterList()`, `checkParameterExistence()` and `isAllowedParams()` functions -- some `runOptions` for `docker` (prevent some user right problem) -- This `CHANGELOG` + +- basic wrapper script +- Abstract, posters and figures +- ROI selector and FreeBayes sanitizer scripts +- New logo and icon for the project +- check for existing tumor/normal channel +- `SarekUtils` with `checkParams()`, `checkParameterList()`, 
`checkParameterExistence()` and `isAllowedParams()` functions +- some `runOptions` for `docker` (prevent some user right problem) +- This `CHANGELOG` ### `Changed` -- `CAW` is now `Sarek` -- Dissect Workflow in 5 new scripts: `annotate.nf`, `main.nf`, `germlineVC.nf`, `runMultiQC.nf` and `somaticVC.nf` -- `report.html`, `timeline.html` and `trace.html` are generated in `Reports/` -- `--version` is now used to define the workflow version -- most params are now defined in the base.config file instead of in the scripts -- update RELEASE_CHECKLIST.md -- `checkParams()`, `checkParameterList()`, `checkParameterExistence()` and `isAllowedParams()` in script functions are now called within `SarekUtils` -- `nf_required_version` is now `params.nfRequiredVersion` -- in `buildReferences.nf` script, channels now begin by `ch_`, and files by `f_` -- use `PublishDir mode: 'link'` instead of `copy` -- `directoryMap` now contains `params.outDir` -- [#539](https://github.com/SciLifeLab/Sarek/issues/539) - use Nextflow support of scratch -- reordered Travis CI tests -- update documentation -- `MultiQC` version in container from v`1.4` to v`1.5` -- `vepgrch37` container base image from `release_90.6` to `release_92` -- `vepgrch38` container base image from `release_90.6` to `release_92` -- `VEP` version in containers from v`90` to v`91` -- `nucleotidesPerSecond` is now `params.nucleotidesPerSecond` -- default `params.tag` is now `latest` instead of current version, so --tag needs to be specified with the right version to be sure of using the `containers` corresponding + +- `CAW` is now `Sarek` +- Dissect Workflow in 5 new scripts: `annotate.nf`, `main.nf`, `germlineVC.nf`, `runMultiQC.nf` and `somaticVC.nf` +- `report.html`, `timeline.html` and `trace.html` are generated in `Reports/` +- `--version` is now used to define the workflow version +- most params are now defined in the base.config file instead of in the scripts +- update RELEASE_CHECKLIST.md +- `checkParams()`, 
`checkParameterList()`, `checkParameterExistence()` and `isAllowedParams()` in script functions are now called within `SarekUtils` +- `nf_required_version` is now `params.nfRequiredVersion` +- in `buildReferences.nf` script, channels now begin by `ch_`, and files by `f_` +- use `PublishDir mode: 'link'` instead of `copy` +- `directoryMap` now contains `params.outDir` +- [#539](https://github.com/SciLifeLab/Sarek/issues/539) - use Nextflow support of scratch +- reordered Travis CI tests +- update documentation +- `MultiQC` version in container from v`1.4` to v`1.5` +- `vepgrch37` container base image from `release_90.6` to `release_92` +- `vepgrch38` container base image from `release_90.6` to `release_92` +- `VEP` version in containers from v`90` to v`91` +- `nucleotidesPerSecond` is now `params.nucleotidesPerSecond` +- default `params.tag` is now `latest` instead of current version, so --tag needs to be specified with the right version to be sure of using the `containers` corresponding ### `Deprecated` -- `standard` profile -- `uppmax-localhost.config` file + +- `standard` profile +- `uppmax-localhost.config` file ### `Removed` -- `scripts/skeleton_batch.sh` -- old data and tsv files -- UPPMAX directories from containers -- `--step` in `annotate.nf`, `germlineVC.nf` and `somatic.nf` -- some `runOptions` for Singularity (binding not needed anymore on UPPMAX) -- `download` profile + +- `scripts/skeleton_batch.sh` +- old data and tsv files +- UPPMAX directories from containers +- `--step` in `annotate.nf`, `germlineVC.nf` and `somatic.nf` +- some `runOptions` for Singularity (binding not needed anymore on UPPMAX) +- `download` profile ### `Fixed` -- [#533](https://github.com/SciLifeLab/Sarek/issues/533) - Replace `VEP` `--pick` option by `--per_gene` -- [#530](https://github.com/SciLifeLab/Sarek/issues/530) - use `$PWD` for default `outDir` + +- [#533](https://github.com/SciLifeLab/Sarek/issues/533) - Replace `VEP` `--pick` option by `--per_gene` +- 
[#530](https://github.com/SciLifeLab/Sarek/issues/530) - use `$PWD` for default `outDir` ## [1.2.5] - 2018-01-18 ### `Added` -- Zenodo for DOI -- Delivery README -- Document use of the `--sampleDir` option -- Contributing Guidelines -- Issue Templates -- Release Checklist -- `--outDir` -- `awsbatch` profile -- `aws-batch.config` config file -- `--noBAMQC` params (failing sometimes on Bianca) + +- Zenodo for DOI +- Delivery README +- Document use of the `--sampleDir` option +- Contributing Guidelines +- Issue Templates +- Release Checklist +- `--outDir` +- `awsbatch` profile +- `aws-batch.config` config file +- `--noBAMQC` params (failing sometimes on Bianca) ### `Changed` -- Update `Nextflow` to `0.26.0` (new fancy report + AWS Batch) -- Extra time on Travis CI testing -- Replace `bundleDir` by `params.genome_base` -- Update `MultiQC` to `1.3` (MEGAQC FTW) -- Move and rename some test files + +- Update `Nextflow` to `0.26.0` (new fancy report + AWS Batch) +- Extra time on Travis CI testing +- Replace `bundleDir` by `params.genome_base` +- Update `MultiQC` to `1.3` (MEGAQC FTW) +- Move and rename some test files ### `Fixed` -- Version of COSMIC GRCh37 v83 -- Write an error message when `--sampleDir` does not find any FASTQ files -- `base.config` for ConcatVCF process -- File specification for recalibrationReport in RecalibrateBam process (got error on AWS Batch) + +- Version of COSMIC GRCh37 v83 +- Write an error message when `--sampleDir` does not find any FASTQ files +- `base.config` for ConcatVCF process +- File specification for recalibrationReport in RecalibrateBam process (got error on AWS Batch) ## [1.2.4] - 2017-10-27 ### `Fixed` -- [#488](https://github.com/SciLifeLab/Sarek/issues/488) - Better CPU requirements for `ConcatVCF` -- [#489](https://github.com/SciLifeLab/Sarek/issues/489) - Exception handling for `ASCAT` -- [#490](https://github.com/SciLifeLab/Sarek/issues/490) - CPU requirements for `runSingleStrelka` and `runSingleManta` + +- 
[#488](https://github.com/SciLifeLab/Sarek/issues/488) - Better CPU requirements for `ConcatVCF` +- [#489](https://github.com/SciLifeLab/Sarek/issues/489) - Exception handling for `ASCAT` +- [#490](https://github.com/SciLifeLab/Sarek/issues/490) - CPU requirements for `runSingleStrelka` and `runSingleManta` ## [1.2.3] - 2017-10-18 ### `Fixed` -- [#475](https://github.com/SciLifeLab/Sarek/issues/475) - 16 cpus for local executor -- [#357](https://github.com/SciLifeLab/Sarek/issues/357) - `ASCAT` works for GRCh38 -- [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on `/scratch` -- [#480](https://github.com/SciLifeLab/Sarek/issues/480) - No `tsv` file needed for step `annotate` + +- [#475](https://github.com/SciLifeLab/Sarek/issues/475) - 16 cpus for local executor +- [#357](https://github.com/SciLifeLab/Sarek/issues/357) - `ASCAT` works for GRCh38 +- [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on `/scratch` +- [#480](https://github.com/SciLifeLab/Sarek/issues/480) - No `tsv` file needed for step `annotate` ## [1.2.2] - 2017-10-06 ### `Fixed` -- [#479](https://github.com/SciLifeLab/Sarek/issues/479) - Typo in `uppmax-localhost.config` + +- [#479](https://github.com/SciLifeLab/Sarek/issues/479) - Typo in `uppmax-localhost.config` ## [1.2.1] - 2017-10-06 ### `Changed` -- `runascat` and `runconvertallelecounts` containers are now replaced by `r-base` -- `willmclaren/ensembl-vep:release_90.5` is now base for `vepgrch37` and `vepgrch38` + +- `runascat` and `runconvertallelecounts` containers are now replaced by `r-base` +- `willmclaren/ensembl-vep:release_90.5` is now base for `vepgrch37` and `vepgrch38` ### `Removed` -- `vep` container -- `strelka_config.ini` file + +- `vep` container +- `strelka_config.ini` file ### `Fixed` -- [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on /scratch -- [#472](https://github.com/SciLifeLab/Sarek/issues/472) - Update function to check 
Nextflow version -- [#473](https://github.com/SciLifeLab/Sarek/issues/473) - Remove `returnMin()` function + +- [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on /scratch +- [#472](https://github.com/SciLifeLab/Sarek/issues/472) - Update function to check Nextflow version +- [#473](https://github.com/SciLifeLab/Sarek/issues/473) - Remove `returnMin()` function ## [1.2.0] - 2017-10-02 ### `Changed` -- Fix version for Manuscript + +- Fix version for Manuscript ## [1.1] - 2017-09-15 ### `Added` -- Singularity possibilities + +- Singularity possibilities ### `Changed` -- Reports made by default -- Intervals file can be a bed file -- Normal sample preprocessing + HaplotypeCaller is possible -- Better Travis CI tests + +- Reports made by default +- Intervals file can be a bed file +- Normal sample preprocessing + HaplotypeCaller is possible +- Better Travis CI tests ### `Fixed` -- Memory requirements + +- Memory requirements ## [1.0] - 2017-02-16 ### `Added` -- Docker possibilities + +- Docker possibilities ## [0.9] - 2016-11-16 From def35d82405bd01838b5593c7767b506e28ec944 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 12 Sep 2018 14:01:07 +0200 Subject: [PATCH 49/75] added target report at the end, targetBED=false in base.config --- conf/base.config | 1 + germlineVC.nf | 1 + somaticVC.nf | 1 + 3 files changed, 3 insertions(+) diff --git a/conf/base.config b/conf/base.config index db4b0891f6..bb2513f26e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -33,6 +33,7 @@ params { step = 'mapping' // Default step is mapping strelkaBP = false // Don't use Manta's candidate indels as input to Strelka tag = 'latest' // Default tag is latest, to be overwritten by --tag + targetBED = false // no targets by default test = false // Not testing by default verbose = false // Enable for more verbose information version = '2.1.0' // Workflow version diff --git a/germlineVC.nf b/germlineVC.nf index fe5a41370e..ef9481832a 100644 --- 
a/germlineVC.nf +++ b/germlineVC.nf @@ -652,6 +652,7 @@ def minimalInformationMessage() { log.info "TSV file : " + tsvFile log.info "Genome : " + params.genome log.info "Genome_base : " + params.genome_base + log.info "Target BED : " + params.targetBED log.info "Tools : " + tools.join(', ') log.info "Containers" if (params.repository != "") log.info " Repository : " + params.repository diff --git a/somaticVC.nf b/somaticVC.nf index 455cfa0b78..8ef338217d 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -915,6 +915,7 @@ def minimalInformationMessage() { log.info "TSV file : " + tsvFile log.info "Genome : " + params.genome log.info "Genome_base : " + params.genome_base + log.info "Target BED : " + params.targetBED log.info "Tools : " + tools.join(', ') log.info "Containers" if (params.repository != "") log.info " Repository : " + params.repository From cfffbf90679d7936f48f9aafa1610358b9e40117 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 12 Sep 2018 16:31:02 +0200 Subject: [PATCH 50/75] Updated the config docs --- docs/CONFIG.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/docs/CONFIG.md b/docs/CONFIG.md index 85cae631a8..3f973ee90e 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -5,7 +5,8 @@ For more informations on how to use configuration files, have a look at the [Nex For more informations about profiles, have a look at the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html#config-profiles) We provides several configuration files and profiles for Sarek. -The standard ones are designed to work on a Swedish UPPMAX clusters, and can be modified and tailored to your own need. +The standard ones are designed to work on a Swedish UPPMAX cluster, but can be modified and tailored to your own need. 
+ ## Configuration files @@ -51,10 +52,14 @@ To be used for Travis (2 cpus) or on small computer for testing purpose Slurm configuration for a UPPMAX cluster Will run the workflow on `/scratch` using the Nextflow [`scratch`](https://www.nextflow.io/docs/latest/process.html#scratch) directive -## profiles +## Profiles +A profile is a convenient way of specifying which set of configuration files to use. +The default profile is `standard`, but Sarek has multiple predefined profiles which are listed below that can be specified by specifying `-profile `: + +```bash +nextflow run SciLifeLab/Sarek --sample mysample.tsv -profile myprofile +``` -Every profile can be modified for your own use. -To use a profile, you'll need to specify `-profile ` ### `docker` @@ -82,3 +87,14 @@ Singularity images will be pulled automatically. This is the profile for Singularity testing on a small machine, or on Travis CI. Singularity images will be pulled automatically. + +## Customisation +The recommended way to use custom settings is to supply Sarek with an additional configuration file. You can use the files in the [`conf/`](https://github.com/SciLifeLab/Sarek/tree/master/conf) directory as an inspiration to make this new `.config` file and specify it using the `-c` flag: + +```bash +nextflow run SciLifeLab/Sarek --sample mysample.tsv -c conf/personal.config +``` + +Any configuration field specified in this file has precedence over the predefined configurations but any field left out from the file will be set by the normal configuration files included in the specified (or `standard`) profile. + +Furthermore, to find out which configuration files take action for the different profiles, the profiles are defined in the file [`nextflow.config`](https://github.com/SciLifeLab/Sarek/blob/master/nextflow.config). 
From 1258ad029e39eeadcd02adad85af0d72d64116b2 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 12 Sep 2018 16:32:06 +0200 Subject: [PATCH 51/75] Whitespace change on INPUT docs --- docs/INPUT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/INPUT.md b/docs/INPUT.md index 4468f02320..edab568bff 100644 --- a/docs/INPUT.md +++ b/docs/INPUT.md @@ -3,7 +3,7 @@ Input files for Sarek can be specified using a tsv file given to the `--sample` parameter. The tsv file is a Tab Separated Value file with columns: `subject gender status sample lane fastq1 fastq2` or `subject gender status sample bam bai`. The content of these columns should be quite straight-forward: -- `subject` designate the subject, it should be the ID of the Patient, or if you don't have one, il could be the Normal ID Sample. +- `subject` designate the subject, it should be the ID of the Patient, or if you don't have one, it could be the Normal ID Sample. - `gender` is the gender of the Patient, (XX or XY) - `status` is the status of the Patient, (0 for Normal or 1 for Tumor) - `sample` designate the Sample, it should be the ID of the Sample (it is possible to have more than one tumor sample for each patient) From d888fb9994f0fdc02e854385ee37969822f9aa1f Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 12 Sep 2018 16:32:57 +0200 Subject: [PATCH 52/75] Major rewrite of the usage docs, from a beginners perspective --- docs/PARAMETERS.md | 139 ++++++++++++++++++++ docs/USAGE.md | 315 +++++++++++++++++++++------------------------ 2 files changed, 288 insertions(+), 166 deletions(-) create mode 100644 docs/PARAMETERS.md diff --git a/docs/PARAMETERS.md b/docs/PARAMETERS.md new file mode 100644 index 0000000000..399fade15c --- /dev/null +++ b/docs/PARAMETERS.md @@ -0,0 +1,139 @@ +# Parameters + +A list of all possible parameters that can be used for the different scripts included in Sarek. 
+ +## Common for all scripts + +### --help + +Display help + +### --noReports + +Disable all QC tools and MultiQC. + +### --outDir + +Choose an output directory + +### --project `ProjectID` + +Specify a project number ID on a UPPMAX cluster. +(optional if not on such a cluster) + +### --sample `file.tsv` + +Use the given TSV file as sample (cf [TSV documentation](TSV.md)). +Is not used for `annotate.nf` and `runMultiQC.nf`. + +### --tools `tool1[,tool2,tool3...]` + +Choose which tools will be used in the workflow. +Different tools to be separated by commas. +Possible values are: + +- haplotypecaller (use `HaplotypeCaller` for VC) (germlineVC.nf) +- manta (use `Manta` for SV) (germlineVC.nf,somaticVC.nf) +- strelka (use `Strelka` for VC) (germlineVC.nf,somaticVC.nf) +- ascat (use `ASCAT` for CNV) (somaticVC.nf) +- mutect2 (use `MuTect2` for VC) (somaticVC.nf) +- snpeff (use `snpEff` for Annotation) (annotate.nf) +- vep (use `VEP` for Annotation) (annotate.nf) + +`--tools` option is case insensitive to avoid easy introduction of errors when choosing tools. +So you can write `--tools mutect2,ascat` or `--tools MuTect2,ASCAT` without worrying about case sensitivity. + +### --verbose + +Display more information about files being processed. + +## Preprocessing script (`main.nf`) +### --step `step` + +Choose from which step the workflow will start. +Choose only one step. +Possible values are: + +- mapping (default, will start workflow with FASTQ files) +- recalibrate (will start workflow with BAM files and Recalibration Tables) + +`--step` option is case insensitive to avoid easy introduction of errors when choosing a step. + +### --test + +Test run Sarek on a smaller dataset, that way you don't have to specify `--sample Sarek-data/testdata/tsv/tiny.tsv` + +### --onlyQC + +Run only QC tools and MultiQC to generate a HTML report. + + +## Annotate script (`annotate.nf`) + +### --annotateTools `tool1[,tool2,tool3...]` + +Choose which tools to annotate. 
+Different tools to be separated by commas. +Possible values are: +- haplotypecaller (Annotate `HaplotypeCaller` output) +- manta (Annotate `Manta` output) +- mutect2 (Annotate `MuTect2` output) +- strelka (Annotate `Strelka` output) + +### --annotateVCF `file1[,file2,file3...]` + +Choose vcf to annotate. +Different vcfs to be separated by commas. + + +## MultiQC script (`runMultiQC.nf`) +### --callName `Name` + +Specify a name for MultiQC report (optional) + +### --contactMail `email` + +Specify an email for MultiQC report (optional) + + +## References + +For most use cases, the reference information is already in the configuration file [`conf/genomes.config`](https://github.com/SciLifeLab/Sarek/blob/master/conf/genomes.config). +However, if needed, you can specify any reference file at the command line. + +### --acLoci `acLoci file` + +### --bwaIndex `bwaIndex file` + +### --cosmic `cosmic file` + +### --cosmicIndex `cosmicIndex file` + +### --dbsnp `dbsnp file` + +### --dbsnpIndex `dbsnpIndex file` + +### --genomeDict `genomeDict file` + +### --genomeFile `genomeFile file` + +### --genomeIndex `genomeIndex file` + +### --intervals `intervals file` + +### --knownIndels `knownIndels file` + +### --knownIndelsIndex `knownIndelsIndex file` + +### --snpeffDb `snpeffDb file` + +## Hardware Parameters + +For most use cases, the reference information is already in the appropriate [configuration files](https://github.com/SciLifeLab/Sarek/blob/master/conf/). +However, it is still possible to specify these parameters at the command line as well. + +### --runTime `time` + +### --singleCPUMem `memory` + +### --totalMemory `memory` diff --git a/docs/USAGE.md b/docs/USAGE.md index c4e32bd9c4..4a4ff7c5a3 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -1,201 +1,184 @@ -# Usage +# How to run Sarek -I would recommend to run Nextflow within a [screen](https://www.gnu.org/software/screen/) or [tmux](https://tmux.github.io/) session. 
+This guide will take you through your first run of Sarek. +It is divided into two steps corresponding to the two main types of analysis offered by Sarek: + - Run a Germline Analysis + - Run a Somatic Analysis -## Project folder structure +This guide assumes you have internet access on the server where the analysis will take place. If you do not have that, please look into the [installation instructions](INSTALL_BIANCA.md) for the restricted access server Bianca at Uppmax, which should give an idea on how to adjust the following examples accordingly. -The workflow is started for a sample, or a set of samples from the same Individual. -Each different physical samples is identified by its own ID. -For example in a Tumour/Normal settings, this ID could correspond to "Normal", "Tumour_1", "Tumour_2" etc. corresponding to all physical samples from the same patient. +It is recommended to run Sarek within a [screen](https://www.gnu.org/software/screen/) or [tmux](https://tmux.github.io/) session. +This helps Sarek run uninterrupted until the analysis has finished. +Furthermore, Sarek is designed to be run on a single sample for a germline analysis or a set of samples from the same individual for a somatic analysis. +If more than one individual will be analysed, it is recommended that this is done in separate directories which are analysed separately. -## Preparing to run Sarek -Sarek will start the analysis by parsing a supplied input file in tsv format. -This file contains all the necessary information about the data and should have at least one tab-separated line: -``` -SUBJECT_ID XX 0 SAMPLEID 1 /samples/normal_1.fastq.gz /samples/normal_2.fastq.gz -``` - -## Scripts +## Update to latest version -Sarek uses several scripts, a wrapper is currently being made to simplify the command lines. 
-Currently the typical reduced command lines are: +To make sure that you have the latest version of Sarek, use: ```bash -nextflow run SciLifeLab/Sarek/main.nf --sample --step -nextflow run SciLifeLab/Sarek/germlineVC.nf --sample --tools -nextflow run SciLifeLab/Sarek/somaticVC.nf --sample --tools -nextflow run SciLifeLab/Sarek/annotate.nf --tools (--annotateTools ||--annotateVCF ) -nextflow run SciLifeLab/Sarek/runMultiQC.nf +nextflow pull SciLifeLab/Sarek ``` -All parameters, options and variables can be specified with configuration files and profile (cf [configuration documentation](#profiles)). - -## Options - -### --callName `Name` - -Specify a name for MultiQC report (optional) - -### --contactMail `email` - -Specify an email for MultiQC report (optional) - -### --help - -Display help - -### --noReports - -Disable all QC tools and MultiQC to generate a HTML report. - -### --onlyQC - -Run only QC tools and MultiQC to generate a HTML report. - -### --outDir - -Choose an output directory - -### --project `ProjectID` - -Specify a project number ID on a UPPMAX cluster. -(optional if not on such a cluster) - -### --sample `file.tsv` - -Use the given TSV file as sample (cf [TSV documentation](TSV.md)). - -### --step `step` - -Choose from wich step the workflow will start. -Choose only one step. -Possible values are: - -- mapping (default, will start workflow with FASTQ files) -- recalibrate (will start workflow with BAM files and Recalibration Tables - -`--step` option is case insensitive to avoid easy introduction of errors when choosing a step. - -### --test - -Test run Sarek on a smaller dataset, that way you don't have to specify `--sample data/tsv/tiny.tsv` - -### --tools `tool1[,tool2,tool3...]` - -Choose which tools will be used in the workflow. -Different tools to be separated by commas. 
-Possible values are: - -- haplotypecaller (use `HaplotypeCaller` for VC) (germlineVC) -- manta (use `Manta` for SV) (germlineVC,somaticVC) -- strelka (use `Strelka` for VC) (germlineVC,somaticVC) -- ascat (use `ASCAT` for CNV) (somaticVC) -- mutect2 (use `MuTect2` for VC) (somaticVC) -- snpeff (use `snpEff` for Annotation) (annotate) -- vep (use `VEP` for Annotation) (annotate) - -`--tools` option is case insensitive to avoid easy introduction of errors when choosing tools. -So you can write `--tools mutect2,ascat` or `--tools MuTect2,ASCAT` without worrying about case sensitivity. - -### --annotateTools `tool1[,tool2,tool3...]` - -Choose which tools to annotate. -Different tools to be separated by commas. -Possible values are: -- haplotypecaller (Annotate `HaplotypeCaller` output) -- manta (Annotate `Manta` output) -- mutect2 (Annotate `MuTect2` output) -- strelka (Annotate `Strelka` output) - -### --annotateVCF `file1[,file2,file3...]` - -Choose vcf to annotate. -Different vcfs to be separated by commas. - -### --verbose - -Display more information about files being processed. - -## Containers - -### --containerPath `Path to the singularity containers (default=containers/)` - -### --repository `Docker-hub repository (default=maxulysse)` - -### --tag `tag of the containers to use (default=current version)` - -## References - -If needed, you can specify each reference file by command line. - -### --acLoci `acLoci file` - -### --bwaIndex `bwaIndex file` - -### --cosmic `cosmic file` - -### --cosmicIndex `cosmicIndex file` - -### --dbsnp `dbsnp file` +## Run the latest version -### --dbsnpIndex `dbsnpIndex file` +If there is a feature or bugfix you want to use in a resumed or re-analyzed run, you have to update the workflow to the latest version. +By default it is not updated automatically, so use something like: -### --genomeDict `genomeDict file` +```bash +nextflow run -latest SciLifeLab/Sarek/main.nf ... 
-resume +``` -### --genomeFile `genomeFile file` +## Not on Uppmax +The commands used in this guide are suitable for running on a cluster at Uppmax. +To run these examples on a different infrastructure, there are a few things that need to be changed. -### --genomeIndex `genomeIndex file` + - Most likely, the `slurm` profile is not suitable to use. + Find a more suitable one (or design your own) using the [configuration documentation](CONFIG.md) + - The path for where reference genomes are located (specified in the `--genome_base` parameter) needs to be modified. + Use the instructions in the [reference documentation](REFERENCES.md) to make sure all the reference files are available. -### --intervals `intervals file` -### --knownIndels `knownIndels file` +## Run a Germline Analysis +This section presents a complete instruction to run a germline analysis using Sarek on a single sample. +Sarek will start the analysis by parsing a supplied input file in TSV format. +This file contains all the necessary information about the data and for the germline analysis it should have at least one line. +For more detailed information about how to construct TSV files for custom data, see [input documentation](INPUT.md). -### --knownIndelsIndex `knownIndelsIndex file` +For example, the file can be called `samples_germline.tsv` with the content (corresponding to columns: `subject gender status sample lane fastq1 fastq2`): -### --snpeffDb `snpeffDb file` +``` +SUBJECT_ID XX 0 SAMPLEID 1 /samples/normal_1.fastq.gz /samples/normal_2.fastq.gz +``` -## Parameters +The first workflow that will be run is contained in the `main.nf` file and performs the preprocessing step consisting of mapping, marking of duplicates and base recalibration. Running this command will launch a nextflow process in the terminal which in turn submits jobs (processes) to the SLURM queue. 
+``` +nextflow run SciLifeLab/Sarek/main.nf \ +--sample samples_germline.tsv \ +-profile slurm \ +--project \ +--genome_base /sw/data/uppnex/ToolBox/hg38bundle \ +--genome GRCh38 +``` -Simpler to specify in the configuration files, but it's still possible to specify every thing in the command line. +When the workflow has finished successfully it should print something similar to this: +``` +Completed at: Fri Aug 31 05:10:07 CEST 2018 +Duration : 1d 13h 24m 51s +Success : true +Exit status : 0 +``` +Make sure to check that the output states `Success : true` and not `Success : false`. +The results of the first step is located in the `Preprocessing` directory. +These files will be used in the next step, where the actual variant calling takes place. +Among other things, the preprocessing step should have created a new TSV file which is intended to be used as input for the variant calling step: +``` +nextflow run SciLifeLab/Sarek/germlineVC.nf \ +--sample Preprocessing/Recalibrated/recalibrated.tsv \ +-profile slurm \ +--project \ +--genome_base /sw/data/uppnex/ToolBox/hg38bundle \ +--genome GRCh38 \ +--tools HaplotypeCaller +``` +When successful (`Success : true`), this step should produce vcf file(s) within a `VariantCalling` directory. +The next workflow will annotate the found variants. +It is possible to specify the tools used for annotation (here VEP) and the variant-calling tools to use as input for annotation (here HaplotypeCaller). +``` +nextflow run SciLifeLab/Sarek/annotate.nf \ +--annotateTools HaplotypeCaller \ +-profile slurm \ +--project \ +--genome_base ~/Sarek/References/smallGRCh37 \ +--tools VEP +``` -### --runTime `time` +Finally, run MultiQC to get an easily accessible report of all your analysis. 
+``` +nextflow run SciLifeLab/Sarek/runMultiQC.nf \ +-profile slurm \ +--project +``` +## Run a Somatic Analysis -### --singleCPUMem `memory` +This section presents a complete instruction on how to run a somatic analysis using Sarek on two samples from the same individual. In this case one normal sample and one tumour sample will be used. However, Sarek can also accept more than one tumour sample (i.e. relapses) for the same individual. -### --totalMemory `memory` +Note: Four out of five of the steps included in this example are identical or very similar to the steps included in the germline analysis example. Therefore, much of the information in this example is redundant compared to the first example. -## Configuration and profiles +Sarek will start the analysis by parsing a supplied input file in TSV format. +This file contains all the necessary information about the data and for the somatic analysis it should have at least two lines. +These lines have columns corresponding to `subject gender status sample lane fastq1 fastq2`. +For more detailed information about how to construct TSV files for custom data, see [input documentation](INPUT.md). -More informations on the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html). -The default profile is `standard`. -You can use your own profile: +For example, the file can be called `samples_somatic.tsv` with the content: -```bash -nextflow run SciLifeLab/Sarek --sample mysample.tsv -profile myprofile ``` - -A standard profile is defined in [`nextflow.config`](https://github.com/SciLifeLab/Sarek/blob/master/nextflow.config). 
-You can use the files in the [`conf/`](https://github.com/SciLifeLab/Sarek/tree/master/conf) directory as a base to make a new `.config` file that you can specify directly (or add as a profile): - -```bash -nextflow run SciLifeLab/Sarek --sample mysample.tsv -c conf/personnal.config +SUBJECT_ID XX 0 SAMPLEID1 1 /samples/normal_1.fastq.gz /samples/normal_2.fastq.gz +SUBJECT_ID XX 1 SAMPLEID2 1 /samples/tumour_1.fastq.gz /samples/tumour_2.fastq.gz +``` +The first workflow that will be run is contained in the `main.nf` file and performs the preprocessing step consisting of mapping, marking of duplicates and base recalibration. Running this command will launch a nextflow process in the terminal which in turn submits jobs (processes) to the SLURM queue. +``` +nextflow run SciLifeLab/Sarek/main.nf \ +--sample samples_somatic.tsv \ +-profile slurm \ +--project \ +--genome_base /sw/data/uppnex/ToolBox/hg38bundle \ +--genome GRCh38 ``` -## Update to latest version - -To update workflow to the latest version use: - -```bash -nextflow pull SciLifeLab/Sarek +When the workflow has finished successfully it should print something similar to this: +``` +Completed at: Fri Aug 31 05:10:07 CEST 2018 +Duration : 1d 13h 24m 51s +Success : true +Exit status : 0 ``` -## Run the latest version +Make sure to check that the output states `Success : true` and not `Success : false`. +The results of the first step is located in the `Preprocessing` directory. +These files will be used in the next two steps, where the actual variant calling takes place. +Among other things, the preprocessing step should have created a new TSV file which is intended to be used as input for the variant calling steps: -If there is a feature or bugfix you want to use in a resumed or re-analyzed run, you have to update the workflow to the latest version. 
-By default it is not updated automatically, so use something like: +``` +nextflow run SciLifeLab/Sarek/germlineVC.nf \ +--sample Preprocessing/Recalibrated/recalibrated.tsv \ +-profile slurm \ +--project \ +--genome_base /sw/data/uppnex/ToolBox/hg38bundle \ +--genome GRCh38 \ +--tools HaplotypeCaller +``` +When successful (`Success : true`), this step should produce vcf file(s) within a `VariantCalling` directory. +The first variant calling step is actually the one from the germline analysis. +This is included here since information regarding germline variants is still useful for analysis of somatic variants. +The next variant calling step is the somatic specific analysis: +``` +nextflow run SciLifeLab/Sarek/somaticVC.nf \ +--sample Preprocessing/Recalibrated/recalibrated.tsv \ +-profile slurm \ +--project \ +--genome_base /sw/data/uppnex/ToolBox/hg38bundle \ +--genome GRCh38 \ +--tools Strelka +``` +When successful (`Success : true`), this step should produce vcf file(s) within the `VariantCalling` directory separate from the germline vcf file. +The next workflow will annotate the found variants. +It is possible to specify the tools used for annotation (here VEP) and the variant-calling tools to use as input for annotation (here HaplotypeCaller and Strelka). +``` +nextflow run SciLifeLab/Sarek/annotate.nf \ +--annotateTools HaplotypeCaller,Strelka \ +-profile slurm \ +--project \ +--genome_base ~/Sarek/References/smallGRCh37 \ +--containerPath \ +--tools VEP +``` -```bash -nextflow run -latest SciLifeLab/Sarek/main.nf ... -resume +Finally, run MultiQC to get an easily accessible report of all your analysis. 
+``` +nextflow run SciLifeLab/Sarek/runMultiQC.nf \ +-profile slurm +--project \ ``` From c54ec51efe90dedbaf9357a8b42fd10aa7293d40 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 12 Sep 2018 16:43:15 +0200 Subject: [PATCH 53/75] update RELEASE_CHECKLIST + add helper script to change version number in files --- .github/RELEASE_CHECKLIST.md | 52 +++++++++++++++++++++--------------- scripts/do_release.sh | 43 +++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 21 deletions(-) create mode 100755 scripts/do_release.sh diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md index 56ecb2d715..1606ca4c48 100644 --- a/.github/RELEASE_CHECKLIST.md +++ b/.github/RELEASE_CHECKLIST.md @@ -1,24 +1,34 @@ # Release checklist + This checklist is for our own reference -1. Check that everything is up to date and ready to go - - Travis tests are passing - - Manual tests on Bianca are passing -2. Increase version numbers -3. Update version numbers in code: `configuration/base.config` -4. Build, and get the containers. - - `./scripts/do_all.sh --push --tag ` - - `./scripts/do_all.sh --pull --tag ` -5. Test against sample data. - - Check for any command line errors - - Check version numbers are printed correctly - - `./scripts/test.sh -p docker --tag ` - - `./scripts/test.sh -p singularity --tag ` - - `./scripts/test.sh -p singularityPath --tag ` -6. Commit and push version updates -7. Make a [release](https://github.com/SciLifeLab/Sarek/releases) on GitHub -8. Choose an appropriate codename for the release -9. Update [bio.tools](https://bio.tools/Sarek) with the new release -10. Tweet that new version is released -11. Commit and push. Continue making more awesome :metal: -12. Have fika :cake: +1. Check that everything is ready to go + + - [PR](https://github.com/SciLifeLab/Sarek/pull) are merged + - [Travis tests](https://travis-ci.org/SciLifeLab/Sarek/branches) are passing on `dev` + +2. 
Increase version number following [semantic versioning](http://semver.org/spec/v2.0.0.html) +3. Choose an appropriate codename for the release + - i.e. Peaks in [Sarek National Park](https://en.wikipedia.org/wiki/Sarek_National_Park#Topography) +4. Build docker containers. + + - `./scripts/do_all.sh --tag ` + +5. Test against sample data. + + - `./scripts/test.sh -p docker --tag ` + - Check for any command line errors + +6. Use script to update version in files: + + - `./scripts/do_release.sh -r "" -c ""` + +7. Push latest updates +8. Make a PR against `dev` +9. Merge said PR +10. Make a [release](https://github.com/SciLifeLab/Sarek/releases) on GitHub +11. Update [bio.tools](https://bio.tools/Sarek) with the new release details +12. Tweet that a new version is released +13. Add a new `Unreleased` section in `CHANGELOG.md` for the `dev` version +14. Commit and push. Continue making more awesome :metal: +15. Have fika :cake: diff --git a/scripts/do_release.sh b/scripts/do_release.sh new file mode 100755 index 0000000000..88caf682a5 --- /dev/null +++ b/scripts/do_release.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -xeuo pipefail + +CODENAME='' +RELEASE='' + +while [[ $# -gt 0 ]] +do + key=$1 + case $key in + -c|--codename) + CODENAME=$2 + shift # past argument + shift # past value + ;; + -r|--release) + RELEASE=$2 + shift # past argument + shift # past value + ;; + esac +done + +if [[ $CODENAME == "" ]] +then + echo "No codename specified" + exit +fi + +if [[ $RELEASE == "" ]] +then + echo "No release specified" + exit +fi + +echo "Preparing release $RELEASE - $CODENAME" + +sed -i "s/\[Unreleased\]/[$RELEASE] - $CODENAME - $(date +'%Y-%m-%d')/g" CHANGELOG.md +sed -i "s/sarek-[0-9\.]\+/sarek-$RELEASE/g" Dockerfile +sed -i "s/sarek-[0-9\.]\+/sarek-$RELEASE/g" Singularity +sed -i "s/version = '[0-9\.]\+'/version = '$RELEASE'/g" conf/base.config + +git commit CHANGELOG.md Dockerfile Singularity conf/base.config -m "preparing release $RELEASE [skip ci]" From 
20f86f02a28b64b123c989789836e2e3f8eb1bc1 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 12 Sep 2018 16:45:22 +0200 Subject: [PATCH 54/75] update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a866d012e..577e970bc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation - [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule +- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Add helper script for changing version number ### `Changed` @@ -28,6 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes - [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering - [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images +- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Update RELEASE_CHECKLIST ### `Removed` From cc2bd7c6b21b2abe9797aaf26bd79c6c0684afab Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 12 Sep 2018 16:49:04 +0200 Subject: [PATCH 55/75] fix PR number [skip ci] --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 577e970bc2..106d5224c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation - [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule -- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Add helper script for changing version number +- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Add helper script for changing version number ### `Changed` @@ -29,7 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes - [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering - [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images -- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Update RELEASE_CHECKLIST +- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST ### `Removed` From afabfcc767d1460c3f4e4bcf1cc63c8ae8737f9c Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 12 Sep 2018 16:52:30 +0200 Subject: [PATCH 56/75] Updated changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b7750e652..93585f8f9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation - [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule +- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Add a complete example analysis to docs ### `Changed` - [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update Nextflow required version @@ -24,6 +25,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus FastQC processes - [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering - [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images +- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs ### `Removed` - [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Remove old Issue Template From 6d6c4d3a1798a91a214a45e2204355bd949cf5c4 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 12 Sep 2018 16:52:43 +0200 Subject: [PATCH 57/75] Updated README --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d479dbd4ea..a6f1fccf1a 100644 --- a/README.md +++ b/README.md @@ -82,12 +82,13 @@ The Sarek pipeline comes with documentation in the `docs/` directory: 06. [Configuration and profiles documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/CONFIG.md) 07. [Intervals documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/INTERVALS.md) 08. [Running the pipeline](https://github.com/SciLifeLab/Sarek/blob/master/docs/USAGE.md) -09. [Examples](https://github.com/SciLifeLab/Sarek/blob/master/docs/USE_CASES.md) -10. [TSV file documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/TSV.md) -11. 
[Processes documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/PROCESS.md) -12. [Documentation about containers](https://github.com/SciLifeLab/Sarek/blob/master/docs/CONTAINERS.md) -13. [More information about ASCAT](https://github.com/SciLifeLab/Sarek/blob/master/docs/ASCAT.md) -14. [Output documentation structure](https://github.com/SciLifeLab/Sarek/blob/master/docs/OUTPUT.md) +09. [Command line parameters](https://github.com/SciLifeLab/Sarek/blob/master/docs/PARAMETERS.md) +10. [Examples](https://github.com/SciLifeLab/Sarek/blob/master/docs/USE_CASES.md) +11. [Input files documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/INPUT.md) +12. [Processes documentation](https://github.com/SciLifeLab/Sarek/blob/master/docs/PROCESS.md) +13. [Documentation about containers](https://github.com/SciLifeLab/Sarek/blob/master/docs/CONTAINERS.md) +14. [More information about ASCAT](https://github.com/SciLifeLab/Sarek/blob/master/docs/ASCAT.md) +15. [Output documentation structure](https://github.com/SciLifeLab/Sarek/blob/master/docs/OUTPUT.md) ## Contributions & Support From 39fdfd4fed9aebcf59bd0b2eb0c652c9aea60c52 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Wed, 12 Sep 2018 20:23:22 +0200 Subject: [PATCH 58/75] falling back to tiny.tsv --- scripts/test.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/test.sh b/scripts/test.sh index 466b2bbed1..e78302eda5 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -92,9 +92,12 @@ fi if [[ ALL,GERMLINE =~ $TEST ]] then # Added Strelka to germline test (no Strelka best practices test for this small data) and not asking for reports + echo "########################### TESTING GERMLINE WGS ########################################" run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --noReports # testing targeted calls + echo "########################### TESTING GERMLINE TARGETED 
########################################" run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --bed `pwd`/Sarek-data/testdata/target.bed --noReports + echo "########################### TESTING GERMLINE RECALIBRATION ########################################" run_wrapper --germline --step recalibrate --noReports clean_repo fi @@ -102,10 +105,13 @@ fi if [[ ALL,SOMATIC =~ $TEST ]] then # Do we need HaplotypeCaller in the somatic test? + echo "########################### TESTING SOMATIC FROM SCRATCH ########################################" run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Manta,Mutect2 --noReports + echo "########################### TESTING SOMATIC STRELKA BEST PRACTICE FROM SCRATCH ########################################" run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP # run targeted tests with tiny set - run_wrapper --somatic --sample Preprocessing/Recalibrated/recalibrated.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --bed `pwd`/Sarek-data/testdata/target.bed + echo "########################### TESTING SOMATIC TARGETED ########################################" + run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --bed `pwd`/Sarek-data/testdata/target.bed clean_repo fi From 5f7318671293fb8ef9d4ddfbbb1ac0aac77c3767 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 13 Sep 2018 10:58:07 +0200 Subject: [PATCH 59/75] +s [skip ci] --- .github/RELEASE_CHECKLIST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md index 1606ca4c48..153832c351 100644 --- a/.github/RELEASE_CHECKLIST.md +++ b/.github/RELEASE_CHECKLIST.md @@ -4,7 +4,7 @@ This checklist is for 
our own reference 1. Check that everything is ready to go - - [PR](https://github.com/SciLifeLab/Sarek/pull) are merged + - [PRs](https://github.com/SciLifeLab/Sarek/pull) are merged - [Travis tests](https://travis-ci.org/SciLifeLab/Sarek/branches) are passing on `dev` 2. Increase version number following [semantic versioning](http://semver.org/spec/v2.0.0.html) From f00f8c96378d509aa647892c733206fd0884924f Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 13 Sep 2018 12:03:16 +0200 Subject: [PATCH 60/75] add documentation about making release --- .github/RELEASE_CHECKLIST.md | 2 +- docs/RELEASE.md | 38 ++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 docs/RELEASE.md diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md index 153832c351..7d1324dc6a 100644 --- a/.github/RELEASE_CHECKLIST.md +++ b/.github/RELEASE_CHECKLIST.md @@ -1,6 +1,6 @@ # Release checklist -This checklist is for our own reference +> This checklist is for our own reference, to help us prepare a new release 1. Check that everything is ready to go diff --git a/docs/RELEASE.md b/docs/RELEASE.md new file mode 100644 index 0000000000..5b74da67ff --- /dev/null +++ b/docs/RELEASE.md @@ -0,0 +1,38 @@ +# RELEASE + +> This document is for helping Sarek core developers and anyone joining the team to prepare a new release + +## [CHECKLIST](https://github.com/SciLifeLab/Sarek/blob/master/.github/RELEASE_CHECKLIST.md) + +This checklist is for our own reference, to help us prepare a new release. +Just follow it and be sure to check every item on the list. 
+ +## [Helper script](https://github.com/SciLifeLab/Sarek/blob/master/scripts/do_release.sh) + +This script will update the version number in the following files: + +- [CHANGELOG.md](https://github.com/SciLifeLab/Sarek/blob/master/CHANGELOG.md) + - Will change Unreleased to correct version number and add codename and date +- [Dockerfile](https://github.com/SciLifeLab/Sarek/blob/master/Dockerfile) + - Will update to correct version number +- [Singularity](https://github.com/SciLifeLab/Sarek/blob/master/Singularity) + - Will update to correct version number +- [conf/base.config](https://github.com/SciLifeLab/Sarek/blob/master/conf/base.config) + - Will update to correct version number + +### Usage + +### Usage + +```bash +./scripts/do_release.sh -r "" -c "" +``` + +- `-r|--release` specify the new version number +- `-c|--codename` specify the codename + +### Example + +```bash +./scripts/do_release.sh -r "2.2.0" -c "Skårki" +``` From 65770565a842bd3ddd8b12342d763a85094e5a62 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 13 Sep 2018 13:20:23 +0200 Subject: [PATCH 61/75] +s [skip ci] --- .github/RELEASE_CHECKLIST.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md index 7d1324dc6a..6721a7a797 100644 --- a/.github/RELEASE_CHECKLIST.md +++ b/.github/RELEASE_CHECKLIST.md @@ -4,7 +4,7 @@ 1. Check that everything is ready to go - - [PRs](https://github.com/SciLifeLab/Sarek/pull) are merged + - [PRs](https://github.com/SciLifeLab/Sarek/pulls) are merged - [Travis tests](https://travis-ci.org/SciLifeLab/Sarek/branches) are passing on `dev` 2. 
Increase version number following [semantic versioning](http://semver.org/spec/v2.0.0.html) From 72c0c2cba14998afc72039e01fe0072dbe12d7b4 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 13 Sep 2018 15:28:26 +0200 Subject: [PATCH 62/75] simplified tests without singularity --- .travis.yml | 4 +--- scripts/test.sh | 25 ++++++++----------------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3d1dd30eb8..56ffee9308 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,14 +11,12 @@ env: global: - NXF_VER=0.31.0 SGT_VER=2.5.1 matrix: - - CE=singularity TEST=SOMATIC - CE=docker TEST=SOMATIC - CE=docker TEST=ANNOTATEVEP - - CE=singularity TEST=ANNOTATESNPEFF - CE=docker TEST=ANNOTATESNPEFF - - CE=singularity TEST=GERMLINE - CE=docker TEST=GERMLINE + install: # Install Nextflow (and Singularity if needed) - "./scripts/install.sh --engine $CE" diff --git a/scripts/test.sh b/scripts/test.sh index e78302eda5..772e610989 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -89,33 +89,24 @@ then fi fi + if [[ ALL,GERMLINE =~ $TEST ]] then # Added Strelka to germline test (no Strelka best practices test for this small data) and not asking for reports - echo "########################### TESTING GERMLINE WGS ########################################" - run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --noReports - # testing targeted calls - echo "########################### TESTING GERMLINE TARGETED ########################################" - run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --bed `pwd`/Sarek-data/testdata/target.bed --noReports - echo "########################### TESTING GERMLINE RECALIBRATION ########################################" - run_wrapper --germline --step recalibrate --noReports - clean_repo + run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal 
--variantCalling --tools HaplotypeCaller,Strelka + run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --bed `pwd`/Sarek-data/testdata/target.bed --noReports + run_wrapper --germline --step recalibrate --noReports + clean_repo fi if [[ ALL,SOMATIC =~ $TEST ]] then - # Do we need HaplotypeCaller in the somatic test? - echo "########################### TESTING SOMATIC FROM SCRATCH ########################################" run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Manta,Mutect2 --noReports - echo "########################### TESTING SOMATIC STRELKA BEST PRACTICE FROM SCRATCH ########################################" - run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP - # run targeted tests with tiny set - echo "########################### TESTING SOMATIC TARGETED ########################################" - run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --bed `pwd`/Sarek-data/testdata/target.bed - clean_repo + run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP + run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --bed `pwd`/Sarek-data/testdata/target.bed + clean_repo fi - if [[ ALL,ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]] then if [[ $TEST = ANNOTATESNPEFF ]] From be3cc7f327b1b1b0062ea08433adcf69f73bea57 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 13 Sep 2018 15:41:11 +0200 Subject: [PATCH 63/75] typo fix --- scripts/wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/wrapper.sh b/scripts/wrapper.sh index 31f587c7e8..f3f65171bf 100755 --- 
a/scripts/wrapper.sh +++ b/scripts/wrapper.sh @@ -106,7 +106,7 @@ function run_sarek() { echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0) --max_cpus ${CPUS}" nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose --max_cpus ${CPUS} else - echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0) --max_cpus ${CPUS}" --targetBED ${TARGETBED} + echo "$(tput setaf 1)nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose$(tput sgr0) --max_cpus ${CPUS} --targetBED ${TARGETBED}" nextflow run $@ -profile $PROFILE --genome $GENOME --genome_base $GENOMEBASE --tag $TAG --verbose --max_cpus ${CPUS} --targetBED ${TARGETBED} fi } From 813ba520f337d81370df0856012119556d64a9ed Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 13 Sep 2018 15:44:38 +0200 Subject: [PATCH 64/75] CHANGELOG changes changed --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e5984c086..93033dd65a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule - [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Add a complete example analysis to docs +- [#635](https://github.com/SciLifeLab/Sarek/pull/635) - To process targeted sequencing with a target BED - [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Add helper script for changing version number ### `Changed` @@ -31,7 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering - [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images - [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs -- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST +- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE\_CHECKLIST ### `Removed` From 8204c7a1d84777ac9938e51027bfdab70a1c92a4 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 13 Sep 2018 16:59:21 +0200 Subject: [PATCH 65/75] even less somatic test --- scripts/test.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/test.sh b/scripts/test.sh index 772e610989..ae591fd2c9 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -93,7 +93,7 @@ fi if [[ ALL,GERMLINE =~ $TEST ]] then # Added Strelka to germline test (no Strelka best practices test for this small data) and not asking for reports - run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka + run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --noReports run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --bed `pwd`/Sarek-data/testdata/target.bed --noReports run_wrapper --germline --step recalibrate --noReports clean_repo @@ -103,7 +103,8 @@ if [[ ALL,SOMATIC =~ $TEST ]] then run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Manta,Mutect2 --noReports run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP - run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Mutect2,Strelka --bed 
`pwd`/Sarek-data/testdata/target.bed + # Disabling targeted somatic as it is practically the same as the germline, and takes aaaages + #run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny.tsv --variantCalling --tools Mutect2,Strelka --bed `pwd`/Sarek-data/testdata/target.bed clean_repo fi From aa056c9c5159a4df234c58f54fd162452fd02770 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 13 Sep 2018 16:59:59 +0200 Subject: [PATCH 66/75] fixed spacing --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index bb2513f26e..532e3483ee 100644 --- a/conf/base.config +++ b/conf/base.config @@ -33,7 +33,7 @@ params { step = 'mapping' // Default step is mapping strelkaBP = false // Don't use Manta's candidate indels as input to Strelka tag = 'latest' // Default tag is latest, to be overwritten by --tag - targetBED = false // no targets by default + targetBED = false // no targets by default test = false // Not testing by default verbose = false // Enable for more verbose information version = '2.1.0' // Workflow version From d8c35d5742e474154a2708112f3fa1241cec4fe3 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Thu, 13 Sep 2018 17:01:21 +0200 Subject: [PATCH 67/75] Fixing fixed fixes in CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93033dd65a..ba6861a7ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update documentation - [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule -- [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Add a complete example analysis to docs +- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Add a complete example analysis to docs - [#635](https://github.com/SciLifeLab/Sarek/pull/635) - To process targeted sequencing with a target BED - [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Add helper script for changing version number From fef3c1fab0b309984ecd5a81db6e4308f8694d84 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Fri, 14 Sep 2018 18:28:28 +0200 Subject: [PATCH 68/75] Zenodo REST API to upload data --- scripts/zenodo_metadata.json | 12 +++++++ scripts/zenodo_upload.py | 69 ++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 scripts/zenodo_metadata.json create mode 100755 scripts/zenodo_upload.py diff --git a/scripts/zenodo_metadata.json b/scripts/zenodo_metadata.json new file mode 100644 index 0000000000..050b1e13ca --- /dev/null +++ b/scripts/zenodo_metadata.json @@ -0,0 +1,12 @@ +{ + "metadata": { + "upload_type": "dataset", + "publication_type": "other", + "publication_date": "2018-09-14", + "title": "Eljen II Rakoczi Feco", + "creators": [ { "name": "Szilveszter, Juhos", "affiliation": "SciLifeLab" }], + "description": "This is my first upload", + "access_right": "open", + "license": { "id":"cc-by" } + } +} diff --git a/scripts/zenodo_upload.py b/scripts/zenodo_upload.py new file mode 100755 index 0000000000..a06372b6b3 --- /dev/null +++ b/scripts/zenodo_upload.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# some part are shamelessly stolen from https://github.com/darvasd/upload-to-zenodo + +import os +import requests +import click +import json +import 
codecs + +@click.command(context_settings = dict( help_option_names = ['-h', '--help'] )) +@click.option('--filename', '-f', type=str, help='Files to upload to Zenodo.org', required=True) +@click.option('--token', '-t', type=str, help='Access token', required=True) +@click.option('--metadata', '-m', type=str, help='Metadata file', required=True) +@click.option('--deposition', '-d', type=str, help='Deposition ID', required=False) +@click.option('--sandbox', '-s', is_flag=True, help='Use sandbox', required=False, default=True) + +def uploadToZenodo(filename,token,metadata,sandbox,deposition): + print "Processing file ",filename + zenodoHost='zenodo.org' + if sandbox: + zenodoHost='sandbox.' + zenodoHost + print "Using host: " + zenodoHost + + # reading metadata: + with codecs.open(metadata, 'r', 'utf-8') as f: + metadataStr = f.read() + if not _is_valid_json(metadataStr): + return + else: + metadataJSON = json.loads(metadataStr) + print metadataJSON + + zenodoURL='https://' + zenodoHost + '/api/deposit/depositions' + headers = {"Content-Type": "application/json"} +# r = requests.post( zenodoURL, +# params={'access_token': token}, +# json={}, +# headers=headers) + +# deposition_id = r.json()['id'] +# data = {'filename': os.path.basename(filename)} +# files = {'file': open(filename, 'rb')} +# r = requests.post(zenodoURL + '/%s/files' % deposition_id, +# params={'access_token': token}, +# data=data, +# files=files) + +# print r.json() + + deposition_id = deposition + + r = requests.put(zenodoURL + '/%s' % deposition_id, + params={'access_token': token}, + json=metadataJSON, + headers=headers) + + print r.json() + + +def _is_valid_json(text): + try: + json.loads(text) + return True + except ValueError as e: + print('Invalid json: %s' % e) + return False + +if __name__ == "__main__": + uploadToZenodo() From 7457fcedd4717dfecdc647a07510fd6c2f6c78d8 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 19 Sep 2018 15:54:18 +0200 Subject: [PATCH 69/75] fix #641 --- 
environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index af3ea825f4..ce23859ff0 100644 --- a/environment.yml +++ b/environment.yml @@ -2,8 +2,8 @@ # conda env create -f environment.yml name: sarek channels: - - bioconda - conda-forge + - bioconda - defaults dependencies: From 916e5c5d33e6ea3065af7a9c736c5db554ff3bea Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 19 Sep 2018 15:55:35 +0200 Subject: [PATCH 70/75] update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50c2eeb05a..74aa7f32e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images - [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs - [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Update conda channel order priorities ### `Removed` From ad6088c67ea33dc34b55f201b73ebf2efdf82e8f Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 20 Sep 2018 10:06:21 +0200 Subject: [PATCH 71/75] include environement.yml in the version number update --- environment.yml | 2 +- scripts/do_release.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index ce23859ff0..ff44323e39 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: sarek +name: sarek-2.1.0 channels: - conda-forge - bioconda diff --git a/scripts/do_release.sh b/scripts/do_release.sh index 88caf682a5..23f59e21c4 100755 --- a/scripts/do_release.sh +++ b/scripts/do_release.sh @@ -37,7 +37,8 @@ echo "Preparing release $RELEASE 
- $CODENAME" sed -i "s/\[Unreleased\]/[$RELEASE] - $CODENAME - $(date +'%Y-%m-%d')/g" CHANGELOG.md sed -i "s/sarek-[0-9\.]\+/sarek-$RELEASE/g" Dockerfile +sed -i "s/sarek-[0-9\.]\+/sarek-$RELEASE/g" environment.yml sed -i "s/sarek-[0-9\.]\+/sarek-$RELEASE/g" Singularity sed -i "s/version = '[0-9\.]\+'/version = '$RELEASE'/g" conf/base.config -git commit CHANGELOG.md Dockerfile Singularity conf/base.config -m "preparing release $RELEASE [skip ci]" +git commit CHANGELOG.md Dockerfile environment.yml Singularity conf/base.config -m "preparing release $RELEASE [skip ci]" From 5718c0f24d60579d381cc8e6cbbe174ad4dd663b Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 20 Sep 2018 13:28:16 +0200 Subject: [PATCH 72/75] update environnement.yml --- environment.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index ff44323e39..9a9ce78db8 100644 --- a/environment.yml +++ b/environment.yml @@ -9,16 +9,15 @@ channels: dependencies: - bcftools=1.8 - bwa=0.7.17 - - conda-forge::openjdk=8.0.144 # Needed for FastQC docker - see bioconda/bioconda-recipes#5026 - fastqc=0.11.7 - freebayes=1.2.0 - gatk4=4.0.6.0 - htslib=1.9 - igvtools=2.3.93 - manta=1.4.0 - - multiqc=1.5 - - qualimap=2.2.2a + - multiqc=1.6 + - qualimap=2.2.2b - samtools=1.8 - strelka=2.9.3 - - vcfanno=0.2.8 - - vcftools=0.1.15 + - vcfanno=0.3.0 + - vcftools=0.1.16 From e58bc8f9344efc3054c819e87f3dd56475ec4a49 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 20 Sep 2018 13:32:24 +0200 Subject: [PATCH 73/75] update docs and CHANGELOG --- CHANGELOG.md | 6 +++++- docs/CONTAINERS.md | 10 +++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74aa7f32e8..3e04d3ff75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. 
- [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore` - [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule - [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Add a complete example analysis to docs -- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Add helper script for changing version number +- [#640](https://github.com/SciLifeLab/Sarek/pull/640), [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Add helper script for changing version number ### `Changed` @@ -33,6 +33,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs - [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST - [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Update conda channel order priorities +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - MultiQC 1.5 -> 1.6 +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Qualimap 2.2.2a -> 2.2.2b +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - VCFanno 0.2.8 -> 0.3.0 +- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - VCFtools 0.1.15 -> 0.1.16 ### `Removed` diff --git a/docs/CONTAINERS.md b/docs/CONTAINERS.md index 9766a4419c..f223972127 100644 --- a/docs/CONTAINERS.md +++ b/docs/CONTAINERS.md @@ -79,7 +79,7 @@ We provide script to build/push or pull all containers ### sarek [![sarek-docker status][sarek-docker-badge]][sarek-docker-link] -- Based on `debian:8.9` +- Based on `nfcore/base:latest` - Contain **[BCFTools][bcftools-link]** 1.8 - Contain **[BWA][bwa-link]** 0.7.17 - Contain **[FastQC][fastqc-link]** 0.11.7 @@ -88,12 +88,12 @@ We provide script to build/push or pull all containers - Contain **[HTSlib][htslib-link]** 1.9 - Contain **[IGVtools][igvtools-link]** 2.3.93 - Contain **[Manta][manta-link]** 1.4.0 -- Contain 
**[MultiQC][multiqc-link]** 1.5 -- Contain **[Qualimap][qualimap-link]** 2.2.2a +- Contain **[MultiQC][multiqc-link]** 1.6 +- Contain **[Qualimap][qualimap-link]** 2.2.2b - Contain **[samtools][samtools-link]** 1.8 - Contain **[Strelka2][strelka-link]** 2.9.3 -- Contain **[VCFanno][vcfanno-link]** 0.2.8 -- Contain **[VCFtools][vcftools-link]** 0.1.15 +- Contain **[VCFanno][vcfanno-link]** 0.3.0 +- Contain **[VCFtools][vcftools-link]** 0.1.16 ### snpeff [![snpeff-docker status][snpeff-docker-badge]][snpeff-docker-link] From 60da87c8f9f5e0eb4c7eb3420c70d23aaef73154 Mon Sep 17 00:00:00 2001 From: Szilveszter Juhos Date: Fri, 21 Sep 2018 09:59:56 +0200 Subject: [PATCH 74/75] Zenodo tests --- scripts/zenodo_metadata.json | 6 +-- scripts/zenodo_upload.py | 77 ++++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/scripts/zenodo_metadata.json b/scripts/zenodo_metadata.json index 050b1e13ca..6c0e40395f 100644 --- a/scripts/zenodo_metadata.json +++ b/scripts/zenodo_metadata.json @@ -3,9 +3,9 @@ "upload_type": "dataset", "publication_type": "other", "publication_date": "2018-09-14", - "title": "Eljen II Rakoczi Feco", - "creators": [ { "name": "Szilveszter, Juhos", "affiliation": "SciLifeLab" }], - "description": "This is my first upload", + "title": "References and test data for Sarek", + "creators": [ { "name": "Maxime Garcia", "affiliation": "SciLifeLab" },{ "name": "Szilveszter, Juhos", "affiliation": "SciLifeLab"}], + "description": "Reference files necessary to run somatic/germline variation discovery for https://github.com/SciLifeLab/Sarek", "access_right": "open", "license": { "id":"cc-by" } } diff --git a/scripts/zenodo_upload.py b/scripts/zenodo_upload.py index a06372b6b3..896c271c77 100755 --- a/scripts/zenodo_upload.py +++ b/scripts/zenodo_upload.py @@ -7,13 +7,16 @@ import json import codecs +# global static header +headers = {"Content-Type": "application/json"} + @click.command(context_settings = dict( 
help_option_names = ['-h', '--help'] )) -@click.option('--filename', '-f', type=str, help='Files to upload to Zenodo.org', required=True) +@click.option('--filename', '-f', type=str, help='Files to upload to Zenodo.org', required=False) @click.option('--token', '-t', type=str, help='Access token', required=True) -@click.option('--metadata', '-m', type=str, help='Metadata file', required=True) +@click.option('--metadata', '-m', type=str, help='Metadata file', required=False) @click.option('--deposition', '-d', type=str, help='Deposition ID', required=False) @click.option('--sandbox', '-s', is_flag=True, help='Use sandbox', required=False, default=True) - +# This is the surrogate for main(): everything happens here def uploadToZenodo(filename,token,metadata,sandbox,deposition): print "Processing file ",filename zenodoHost='zenodo.org' @@ -21,42 +24,58 @@ def uploadToZenodo(filename,token,metadata,sandbox,deposition): zenodoHost='sandbox.' + zenodoHost print "Using host: " + zenodoHost - # reading metadata: - with codecs.open(metadata, 'r', 'utf-8') as f: + # The URL for the actual upload and a global + zenodoURL='https://' + zenodoHost + '/api/deposit/depositions' + + # if there is no deposition ID, create one + newDeposition = False + if not deposition: + deposition = new_deposition(zenodoURL, token) + newDeposition = True + # add file - in some cases we are only updating metadata, but + # to create a new deposition you must have a file + if not newDeposition and filename is not None: + upload_file(filename, deposition, zenodoURL, token) + + if metadata: + print "Uploading metadata %s" % metadata + r = upload_metadata(zenodoURL, metadata, deposition, token) + +def new_deposition(zURL,token): + r = requests.post( zURL, + params={'access_token': token}, + json={}, + headers=headers) + depo_id = r.json()['id'] + print "New deposition created with ID %s at %s" % (depo_id, zURL +"/"+str(depo_id)) + print "Refer to this at further file uploads" + return depo_id + +def 
upload_file(filename, deposition_id, zURL, token): + data = {'filename': os.path.basename(filename)} + files = {'file': open(filename, 'rb')} + r = requests.post(zURL + '/%s/files' % deposition_id, + params={'access_token': token}, + data=data, + files=files) + # TODO: actually it is not true: if the filename is already there, it is not uploaded, the JSON + # returns with a warning and does nothing + print "New file %s uploaded" % filename + +def upload_metadata(zURL, mdFile, deposition_id, token): + # reading metadata: + with codecs.open(mdFile, 'r', 'utf-8') as f: metadataStr = f.read() if not _is_valid_json(metadataStr): return else: metadataJSON = json.loads(metadataStr) - print metadataJSON - - zenodoURL='https://' + zenodoHost + '/api/deposit/depositions' - headers = {"Content-Type": "application/json"} -# r = requests.post( zenodoURL, -# params={'access_token': token}, -# json={}, -# headers=headers) - -# deposition_id = r.json()['id'] -# data = {'filename': os.path.basename(filename)} -# files = {'file': open(filename, 'rb')} -# r = requests.post(zenodoURL + '/%s/files' % deposition_id, -# params={'access_token': token}, -# data=data, -# files=files) -# print r.json() - - deposition_id = deposition - - r = requests.put(zenodoURL + '/%s' % deposition_id, + return requests.put(zURL + '/%s' % deposition_id, params={'access_token': token}, json=metadataJSON, headers=headers) - print r.json() - - def _is_valid_json(text): try: json.loads(text) From 9f80f0203711e00044d04fe05b622dfb92088b87 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 21 Sep 2018 11:47:02 +0200 Subject: [PATCH 75/75] preparing release 2.2.0 --- CHANGELOG.md | 4 +++- Dockerfile | 2 +- Singularity | 2 +- conf/base.config | 2 +- environment.yml | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 481a5afff1..841cff7a49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic
Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +## [2.2.0] - Skårki - 2018-09-21 + ### `Added` - [#613](https://github.com/SciLifeLab/Sarek/pull/613) - Add Issue Templates (bug report and feature request) @@ -32,7 +34,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering - [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images - [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs -- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE\_CHECKLIST +- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST - [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Update conda channel order priorities - [#642](https://github.com/SciLifeLab/Sarek/pull/642) - MultiQC 1.5 -> 1.6 - [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Qualimap 2.2.2a -> 2.2.2b diff --git a/Dockerfile b/Dockerfile index 08425200ac..bf22d333b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,4 +7,4 @@ LABEL \ COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/sarek-2.1.0/bin:$PATH +ENV PATH /opt/conda/envs/sarek-2.2.0/bin:$PATH diff --git a/Singularity b/Singularity index a22fcaa520..6d7cb5552e 100644 --- a/Singularity +++ b/Singularity @@ -7,7 +7,7 @@ Bootstrap:docker VERSION 2.1.0 %environment - PATH=/opt/conda/envs/sarek-2.1.0/bin:$PATH + PATH=/opt/conda/envs/sarek-2.2.0/bin:$PATH export PATH %files diff --git a/conf/base.config b/conf/base.config index 532e3483ee..1d20a8ad7f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -36,7 +36,7 @@ params { targetBED = false // no targets by default test = false // Not testing by default verbose = false // Enable for more verbose information - version = '2.1.0' // Workflow version + version = '2.2.0' // Workflow version } process { 
diff --git a/environment.yml b/environment.yml index 9a9ce78db8..05852b7f61 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: sarek-2.1.0 +name: sarek-2.2.0 channels: - conda-forge - bioconda