From 126c0b6f5c6f2bfc062382c2623030c3c2ce1224 Mon Sep 17 00:00:00 2001 From: Patrick Sorn Date: Wed, 14 Jun 2023 10:11:05 +0200 Subject: [PATCH 1/3] Added ONT modules and scripts - not yet functional --- bin/optimus_no_prime.py | 65 ++++++++++++ bin/optimus_no_prime.sh | 23 ++++ modules/09_ont_qc.nf | 43 ++++++++ modules/10_ont_preprocess.nf | 132 +++++++++++++++++++++++ modules/11_ont_primer_removal.nf | 61 +++++++++++ modules/12_ont_variant_calling.nf | 66 ++++++++++++ modules/13_ont_annotate.nf | 169 ++++++++++++++++++++++++++++++ 7 files changed, 559 insertions(+) create mode 100755 bin/optimus_no_prime.py create mode 100755 bin/optimus_no_prime.sh create mode 100755 modules/09_ont_qc.nf create mode 100755 modules/10_ont_preprocess.nf create mode 100755 modules/11_ont_primer_removal.nf create mode 100755 modules/12_ont_variant_calling.nf create mode 100755 modules/13_ont_annotate.nf diff --git a/bin/optimus_no_prime.py b/bin/optimus_no_prime.py new file mode 100755 index 0000000..71259ae --- /dev/null +++ b/bin/optimus_no_prime.py @@ -0,0 +1,65 @@ +import os +import sys +import re + +def list_primersets(path): + ignore = ("5p_cov",".txt",".bedGraph") + filenames = os.listdir(path) + filtered_filenames = [f for f in filenames if not any(j in f for j in ignore) and ".bed" in f] + return filtered_filenames + +def get_libsize(path): + with open(path, 'r') as file: + sum = 0 + for line in file: + columns = line.split() + if len(columns) >= 5: + sum += float(columns[4]) + return sum + +def get_score(path, T, G): + A = 0 + S = 0 + with open(path, 'r') as file: + for line in file: + columns = line.split() + if len(columns) >= 7: + S += float(columns[2]) - float(columns[1]) + A += float(columns[6]) + S = S / G + P = 1 - (A/T) + score = A / (T * S * P) + return score + + +# Run +G = 29903 # SARS CoV2 Genomelength + +# Path where to find primerset coverages generated by bedtools (optimus_no_prime.sh) +path = sys.argv[1] + +sampleId = sys.argv[2] + +total_cov_file = [f for f in os.listdir(path) if "5p_cov" in f and "Graph" not in f] +primersets = list_primersets(path) + +max = 0.0 +likely_primerset = primersets[0] +for set in primersets: + libsize = get_libsize(path + total_cov_file[0]) + score = get_score(path + set, libsize, G) + if score >= max: + likely_primerset = set + max = score + +# Remove sampleId from primer-set name +likely_primerset = re.sub(sampleId + "_","",likely_primerset,1) + +# If score is too low, it could either mean the used primer-set is not in the library or primers have already been trimmed. +# In this case return NA and primer clipping will be skipped. +if max < 5: + likely_primerset = "NA" + +sys.stdout.write(likely_primerset) +sys.stdout.flush() +sys.exit(0) diff --git a/bin/optimus_no_prime.sh b/bin/optimus_no_prime.sh new file mode 100755 index 0000000..5380724 --- /dev/null +++ b/bin/optimus_no_prime.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +primerDir=${1} +sampleId=${2} +bam=${3} + +# Get coverage of read-ends across genome + bedtools genomecov -5 -strand + -ibam ${bam} -dz \ + | awk '{OFS="\t"}{x=$2+1}{print $1,$2,x,$1":"$2"-"x":-",$3,"+"}' > temp.bed + bedtools genomecov -5 -strand - -ibam ${bam} -dz \ + | awk '{OFS="\t"}{x=$2+1}{print $1,$2,x,$1":"$2"-"x":-",$3,"-"}' >> temp.bed + sort -k1,1 -k2,2n temp.bed > ${sampleId}_5p_cov.bed + cut -f1,2,3,5 temp.bed > ${sampleId}_5p_cov.bedGraph + rm temp.bed + +# Overlap read-end coverage with primer BED files + primerList=($(find ${primerDir} -name "*.bed" -printf "%f\n")) + for pset in ${primerList[@]} + do + bedtools map -a ${primerDir}/${pset} -b ${sampleId}_5p_cov.bed -s -c 5 -o sum -null 0 \ + | awk -F'\t' '{OFS="\t"}{print $1,$2,$3,$4,$5,$6,$(NF)}' \ + > ${sampleId}_${pset} + done diff --git a/modules/09_ont_qc.nf b/modules/09_ont_qc.nf new file mode 100755 index 0000000..192623b --- /dev/null +++ b/modules/09_ont_qc.nf @@ -0,0 +1,43 @@ +process FASTQC { + cpus 2 + memory "3 GB" + publishDir "${params.outDir}/${sampleId}/qc", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null) + + input: + tuple val(sampleId), path(fq) + + output: + path("*fastqc*") + + """ + fastqc \ + --threads 2 \ + ${fq} + """ +} + +process NANOPLOT { + cpus 2 + memory "3 GB" + publishDir "${params.outDir}/${sampleId}/qc", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::nanoplot=1.40.2" : null) + + input: + tuple val(sampleId), path(fq) + + output: + path("*${sampleId}_*") + + """ + NanoPlot \ + --fastq ${fq} \ + -p ${sampleId}_ \ + -t 1 \ + --tsv_stats + """ +} diff --git a/modules/10_ont_preprocess.nf b/modules/10_ont_preprocess.nf new file mode 100755 index 0000000..b4bb818 --- /dev/null +++ b/modules/10_ont_preprocess.nf @@ -0,0 +1,132 @@ +process NANOFILT { + cpus 3 + memory params.memory + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::nanofilt=2.8.0" : null) + + input: + tuple val(sampleId), path(inputFq) + + output: + tuple val(sampleId), path("${sampleId}_qctrim.fq.gz"), emit: fq + + """ + gunzip -c ${inputFq} | NanoFilt \ + --quality 10 \ + --length 300 \ + --logfile ${sampleId}_qctrim.log \ + | gzip > ${sampleId}_qctrim.fq.gz + """ +} + +process PORECHOP { + cpus params.cpus + memory params.memory + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null) + + input: + tuple val(sampleId), path(inputFq) + + output: + tuple val(sampleId), path("${sampleId}_noAdapter.fq.gz"), emit: fq + path("${sampleId}_porechop.log") + + """ + porechop \ + --input ${inputFq} \ + --output ${sampleId}_noAdapter.fq.gz \ + --threads ${params.cpus} \ + --format "fastq" \ + --verbosity 1 > ${sampleId}_porechop.log + """ +} + +process CHOPPER { + cpus params.cpus+1 + memory params.memory + publishDir "${params.outDir}/${sampleId}", mode: "copy", pattern: "*.log" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::chopper=0.5.0" : null) + + input: + tuple val(sampleId), path(inputFq) + + output: + tuple val(sampleId), path("${sampleId}_qctrim.fq.gz"), emit: fq + path("${sampleId}_chopper.log") + + """ + cat ${inputFq} \ + | chopper \ + --quality 10 \ + --minlength 300 \ + --threads ${params.cpus} \ + 2> ${sampleId}_chopper.log \ + | gzip > ${sampleId}_qctrim.fq.gz + """ +} + +process PORECHOP_ABI { + cpus params.cpus + memory params.memory + publishDir "${params.outDir}/${sampleId}", mode: "copy", pattern: "*.log" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::porechop_abi=0.5.0" : null) + + input: + tuple val(sampleId), path(inputFq) + + output: + tuple val(sampleId), path("${sampleId}_noAdapter.fq.gz"), emit: fq + path("${sampleId}_porechop_abi.log") + + """ + porechop_abi \ + --ab_initio \ + --input ${inputFq} \ + --output ${sampleId}_noAdapter.fq.gz \ + --threads ${params.cpus} \ + --format "fastq.gz" \ + --verbosity 1 > "${sampleId}_porechop_abi.log" + """ +} + +process MINIMAP2 { + cpus params.cpus + memory params.memory + publishDir "${params.outDir}/${sampleId}", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::minimap2=2.24 bioconda::samtools=1.16.1" : null) + + input: + tuple val(sampleId), path(fq) + + output: + tuple val(sampleId), path("${sampleId}.bam"), path("${sampleId}.bam.bai"), emit: bam + + """ + cores=\$((${params.cpus} - 2)) + + minimap2 \ + -x map-ont \ + -a ${params.sarscov2_reference} \ + -t \${cores} \ + ${fq} \ + | samtools view \ + -b \ + - \ + | samtools sort \ + -o ${sampleId}.bam \ + --threads 1 \ + -T ${sampleId} \ + - + + samtools index ${sampleId}.bam + """ +} \ No newline at end of file diff --git a/modules/11_ont_primer_removal.nf b/modules/11_ont_primer_removal.nf new file mode 100755 index 0000000..448525e --- /dev/null +++ b/modules/11_ont_primer_removal.nf @@ -0,0 +1,61 @@ +process OPTIMUS_NO_PRIME { + cpus 2 + memory params.memory + publishDir "${params.outDir}/${sampleId}/optimus_no_prime", mode: "copy", pattern: "*bed*" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::bedtools=2.30.0 bioconda::samtools=1.16.1" : null) + + input: + tuple val(sampleId), path(bam), path(bai) + + output: + tuple val(sampleId), path(bam), path(bai), env(primerset), emit: prediction + path("${sampleId}_*") + + """ + ${moduleDir}/optimus_no_prime.sh ${params.primerDir} ${sampleId} ${bam} + primerset=\$(python ${moduleDir}/optimus_no_prime.py ./ ${sampleId}) + """ +} + +process PRIMER_SOFTCLIP { + cpus 2 + memory params.memory + publishDir "${params.outDir}/${sampleId}", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::samtools=1.16.1" : null) + + input: + tuple val(sampleId), path(bam), path(bai), val(bed) + + output: + tuple val(sampleId), path("*_noprime.bam"), path("*_noprime.bam.bai"), emit: bam + path("${sampleId}_*"), optional: true + + script: + if( bed == 'NA' ){ + """ + mv ${bam} ${sampleId}_noprime.bam + mv ${bai} ${sampleId}_noprime.bam.bai + """ + } else { + """ + samtools ampliconclip \ + -b ${params.primerDir}/${bed} \ + -f ${sampleId}_ampliconclip_stats.txt \ + --both-ends \ + --strand \ + --tolerance 5 \ + ${bam} \ + | samtools sort \ + -o ${sampleId}_noprime.bam \ + -T ${sampleId} \ + - + + samtools index ${sampleId}_noprime.bam + """ + } +} + diff --git a/modules/12_ont_variant_calling.nf b/modules/12_ont_variant_calling.nf new file mode 100755 index 0000000..aabbffa --- /dev/null +++ b/modules/12_ont_variant_calling.nf @@ -0,0 +1,66 @@ +process NANOCALLER { + cpus params.cpus + memory params.memory + publishDir "${params.outDir}/${sampleId}/variant_calls/nanocaller", mode: "copy", pattern: "*vcf*" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::nanocaller=3.0.1" : null) + + input: + tuple val(sampleId), path(bam), path(bai) + + output: + tuple val(sampleId), path(bam), path(bai), path("${sampleId}.vcf.gz"), val("nanocaller"), emit: vcf + path("${sampleId}*") + + """ + NanoCaller \ + --bam ${bam} \ + --ref ${params.reference} \ + --preset ont \ + --maxcov 1000 \ + --mincov 10 \ + --cpu ${params.cpus} \ + --min_allele_freq 0.05 \ + --ins_threshold 0.05 \ + --del_threshold 0.05 \ + --sample ${sampleId} \ + --prefix ${sampleId} + """ +} + +process CLAIR3 { + cpus params.cpus + memory params.memory + publishDir "${params.outDir}/${sampleId}/variant_calls", mode: "copy", pattern: "clair3" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::clair3=1.0.0" : null) + + input: + tuple val(sampleId), path(bam), path(bai) + + output: + tuple val(sampleId), path(bam), path(bai), path("clair3/merge_output.vcf.gz"), val("clair3"), emit: vcf + path("clair3") + + """ + run_clair3.sh \ + --bam_fn=${bam} \ + --ref_fn=${params.reference} \ + --threads=${params.cpus} \ + --platform="ont" \ + --model_path=${params.clair3model} \ + --include_all_ctgs \ + --min_coverage=10 \ + --snp_min_af=0.05 \ + --indel_min_af=0.05 \ + --output=clair3 \ + --chunk_size=5000 + + # Quick fix for RefCalls from Clair3 which mess up Vafator run_clair3 + # Remove RefCalls from VCF + mv clair3/merge_output.vcf.gz clair3/temp.vcf.gz + zcat clair3/temp.vcf.gz | grep -v RefCall | gzip > clair3/merge_output.vcf.gz + """ +} diff --git a/modules/13_ont_annotate.nf b/modules/13_ont_annotate.nf new file mode 100755 index 0000000..63b32b1 --- /dev/null +++ b/modules/13_ont_annotate.nf @@ -0,0 +1,169 @@ +process VAFATOR { + cpus 1 + memory "3 GB" + tag "${sampleId}" + publishDir "${params.outDir}/${sampleId}/variant_calls/${caller}", mode: "copy" + + conda (params.enable_conda ? "bioconda::vafator=1.2.5" : null) + + input: + tuple val(sampleId), path(bam), path(bai), path(vcf), val(caller) + + output: + tuple val(sampleId), path("${sampleId}_${caller}_vaf.vcf"), val(caller), emit: annotated_vcf + + script: + mq_param = params.vafator_min_mapping_quality != false ? "--mapping-quality " + params.vafator_min_mapping_quality : "" + bq_param = params.vafator_min_base_quality != false ? "--base-call-quality " + params.vafator_min_base_quality : "" + """ + vafator \ + --input-vcf ${vcf} \ + --output-vcf ${sampleId}_${caller}_vaf.vcf \ + --bam vafator ${bam} ${mq_param} ${bq_param} + """ +} + +process SNPEFF { + cpus 1 + memory params.memory + publishDir "${params.outDir}/${sampleId}/variant_calls/${caller}", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::snpeff=5.0 bioconda::samtools=1.16.1" : null) + + input: + tuple val(sampleId), path(vcf), val(caller) + val(snpeff_data) + val(snpeff_config) + val(snpeff_organism) + + output: + tuple val(sampleId), path("${sampleId}_${caller}_snpeff.vcf.gz"), emit: annotated_vcf + path("${sampleId}_${caller}_snpeff.vcf.gz.tbi") + + """ + # for some reason the snpEff.config file needs to be in the folder where snpeff runs... + cp ${snpeff_config} . + + snpEff eff -Xmx${params.memory} -dataDir ${snpeff_data} \ + -noStats -no-downstream -no-upstream -no-intergenic -no-intron -onlyProtein -hgvs1LetterAa -noShiftHgvs \ + ${snpeff_organism} ${vcf} | bgzip -c > ${sampleId}_${caller}_snpeff.vcf.gz + + tabix -p vcf ${sampleId}_${caller}_snpeff.vcf.gz + """ +} + + +process VARIANT_VAF_ANNOTATION { + cpus 1 + memory "3 GB" + publishDir "${params.outDir}", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "conda-forge::gsl=2.7 bioconda::bcftools=1.14" : null) + + input: + tuple val(sampleId), path(vcf) + + output: + tuple val(sampleId), path("${sampleId}.lowfreq_subclonal.vcf"), emit: vaf_annotated + + """ + tabix -p vcf ${vcf} + + # annotates low frequency and subclonal variants + bcftools view -Ob ${vcf} | \ + bcftools filter \ + --exclude 'INFO/vafator_af < ${params.low_frequency_variant_threshold}' \ + --soft-filter LOW_FREQUENCY - | \ + bcftools filter \ + --exclude 'INFO/vafator_af >= ${params.low_frequency_variant_threshold} && INFO/vafator_af < ${params.subclonal_variant_threshold}' \ + --soft-filter SUBCLONAL \ + --output-type v - | \ + bcftools filter \ + --exclude 'INFO/vafator_af >= ${params.subclonal_variant_threshold} && INFO/vafator_af < ${params.lq_clonal_variant_threshold}' \ + --soft-filter LOW_QUALITY_CLONAL \ + --output-type v - > ${sampleId}.lowfreq_subclonal.vcf + """ +} + +/** +Add SARS-CoV-2 specific annotations: conservation, Pfam domains and problematic sites. +Also, according to problematic sites described in DeMaio et al. (2020) we filter out any variants at the beginning and +end of the genome. +*/ +process VARIANT_SARSCOV2_ANNOTATION { + cpus 1 + memory "3 GB" + publishDir "${params.output}", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "conda-forge::gsl=2.7 bioconda::bcftools=1.14" : null) + + input: + tuple val(sampleId), path(vcf) + + output: + tuple val(sampleId), path("${sampleId}.annotated_sarscov2.vcf"), emit: annotated_vcfs + + """ + bcftools annotate \ + --annotations ${params.conservation_sarscov2} \ + --header-lines ${params.conservation_sarscov2_header} \ + -c CHROM,FROM,TO,CONS_HMM_SARS_COV_2 \ + --output-type z ${vcf} | \ + bcftools annotate \ + --annotations ${params.conservation_sarbecovirus} \ + --header-lines ${params.conservation_sarbecovirus_header} \ + -c CHROM,FROM,TO,CONS_HMM_SARBECOVIRUS \ + --output-type z - | \ + bcftools annotate \ + --annotations ${params.conservation_vertebrate} \ + --header-lines ${params.conservation_vertebrate_header} \ + -c CHROM,FROM,TO,CONS_HMM_VERTEBRATE_COV \ + --output-type z - | \ + bcftools annotate \ + --annotations ${params.pfam_names} \ + --header-lines ${params.pfam_names_header} \ + -c CHROM,FROM,TO,PFAM_NAME \ + --output-type z - | \ + bcftools annotate \ + --annotations ${params.pfam_descriptions} \ + --header-lines ${params.pfam_descriptions_header} \ + -c CHROM,FROM,TO,PFAM_DESCRIPTION - | \ + bcftools filter \ + --exclude 'POS <= 55 | POS >= 29804' \ + --output-type z - > annotated_sarscov2.vcf.gz + + tabix -p vcf annotated_sarscov2.vcf.gz + + bcftools annotate \ + --annotations ${params.problematic_sites} \ + --columns INFO/problematic:=FILTER annotated_sarscov2.vcf.gz > ${sampleId}.annotated_sarscov2.vcf + """ +} + +process PANGOLIN { + cpus 1 + memory "3 GB" + publishDir "${params.outDir}", mode: "copy" + tag "${sampleId}" + + conda (params.enable_conda ? "bioconda::pangolin=4.1.2" : null) + + input: + tuple val(sampleId), path(fasta) + + output: + tuple val(name), path("*pangolin.csv") + + """ + mkdir tmp + #--decompress-model + pangolin \ + ${fasta} \ + --outfile ${sampleId}.pangolin.csv \ + --tempdir ./tmp \ + --threads 1 + """ +} \ No newline at end of file From a90879238e61d02d6a4871d91ca53c6bb1873343 Mon Sep 17 00:00:00 2001 From: Patrick Sorn Date: Wed, 14 Jun 2023 10:46:02 +0200 Subject: [PATCH 2/3] Updated main.nf with ONT pipeline calls --- main.nf | 54 ++++++++---- modules/13_ont_annotate.nf | 169 ------------------------------------- 2 files changed, 39 insertions(+), 184 deletions(-) delete mode 100755 modules/13_ont_annotate.nf diff --git a/main.nf b/main.nf index 93beb95..34fd95f 100755 --- a/main.nf +++ b/main.nf @@ -13,6 +13,11 @@ include { VARIANT_ANNOTATION; VARIANT_SARSCOV2_ANNOTATION; VARIANT_VAF_ANNOTATION; VAFATOR } from './modules/06_variant_annotation' include { PANGOLIN_LINEAGE; VCF2FASTA } from './modules/07_lineage_annotation' include { BGZIP } from './modules/08_compress_vcf' +include { FASTQC; NANOPLOT } from './modules/09_ont_qc' +include { NANOFILT; PORECHOP; CHOPPER; PORECHOP_ABI; MINIMAP2 } from './modules/10_ont_preprocess' +include { OPTIMUS_NO_PRIME; PRIMER_SOFTCLIP } from './modules/11_ont_primer_removal' +include { NANOCALLER; CLAIR3 } from './modules/12_ont_variant_calling' + params.help= false @@ -60,6 +65,8 @@ params.input_fastqs_list = false params.input_fastas_list = false params.input_vcfs_list = false params.input_bams_list = false +params.input_ont = false + if (params.help) { log.info params.help_message @@ -239,42 +246,55 @@ workflow { bam_files = ALIGNMENT_PAIRED_END.out } else { - READ_TRIMMING_SINGLE_END(input_fastqs) - ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) - bam_files = ALIGNMENT_SINGLE_END.out - } - BAM_PREPROCESSING(bam_files, reference) - preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams - - if (primers) { - PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) - preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam + if (params.input_ont) { + FASTQC(input_fastqs) + PORECHOP(input_fastqs) + CHOPPER(PORECHOP.out.fq) + NANOPLOT(CHOPPER.out.fq) + input_bams = MINIMAP2(CHOPPER.out.fq).bam + } + else { + READ_TRIMMING_SINGLE_END(input_fastqs) + ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) + bam_files = ALIGNMENT_SINGLE_END.out } - COVERAGE_ANALYSIS(preprocessed_bams) + if (params.input_ont) { + preprocessed_bams = PRIMER_SOFTCLIP(OPTIMUS_NO_PRIME(input_bams).prediction) + } + else { + BAM_PREPROCESSING(bam_files, reference) + preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams + + if (primers) { + PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) + preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam + } + COVERAGE_ANALYSIS(preprocessed_bams) // variant calling vcfs_to_normalize = null - if (!params.skip_bcftools) { + if (!params.skip_bcftools && !params.input_ont) { VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out) } - if (!params.skip_lofreq) { + if (!params.skip_lofreq && !params.input_ont) { VARIANT_CALLING_LOFREQ(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out) } - if (!params.skip_gatk) { + if (!params.skip_gatk && !params.input_ont) { VARIANT_CALLING_GATK(preprocessed_bams, reference) vcfs_to_normalize = vcfs_to_normalize == null? VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out) } - if (!params.skip_ivar && gff) { + if (!params.skip_ivar && gff && !params.input_ont) { VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff) IVAR2VCF(VARIANT_CALLING_IVAR.out, reference) vcfs_to_normalize = vcfs_to_normalize == null? IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out) } + } else if (input_fastas) { if (!params.skip_pangolin) { @@ -313,6 +333,10 @@ workflow { } if (preprocessed_bams) { + if (params.input_ont) { + CLAIR3(preprocessed_bams.bam) + NANOCALLER(preprocessed_bams.bam) + } // we can only add technical annotations when we have the reads VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0)) VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf) diff --git a/modules/13_ont_annotate.nf b/modules/13_ont_annotate.nf deleted file mode 100755 index 63b32b1..0000000 --- a/modules/13_ont_annotate.nf +++ /dev/null @@ -1,169 +0,0 @@ -process VAFATOR { - cpus 1 - memory "3 GB" - tag "${sampleId}" - publishDir "${params.outDir}/${sampleId}/variant_calls/${caller}", mode: "copy" - - conda (params.enable_conda ? "bioconda::vafator=1.2.5" : null) - - input: - tuple val(sampleId), path(bam), path(bai), path(vcf), val(caller) - - output: - tuple val(sampleId), path("${sampleId}_${caller}_vaf.vcf"), val(caller), emit: annotated_vcf - - script: - mq_param = params.vafator_min_mapping_quality != false ? "--mapping-quality " + params.vafator_min_mapping_quality : "" - bq_param = params.vafator_min_base_quality != false ? "--base-call-quality " + params.vafator_min_base_quality : "" - """ - vafator \ - --input-vcf ${vcf} \ - --output-vcf ${sampleId}_${caller}_vaf.vcf \ - --bam vafator ${bam} ${mq_param} ${bq_param} - """ -} - -process SNPEFF { - cpus 1 - memory params.memory - publishDir "${params.outDir}/${sampleId}/variant_calls/${caller}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "bioconda::snpeff=5.0 bioconda::samtools=1.16.1" : null) - - input: - tuple val(sampleId), path(vcf), val(caller) - val(snpeff_data) - val(snpeff_config) - val(snpeff_organism) - - output: - tuple val(sampleId), path("${sampleId}_${caller}_snpeff.vcf.gz"), emit: annotated_vcf - path("${sampleId}_${caller}_snpeff.vcf.gz.tbi") - - """ - # for some reason the snpEff.config file needs to be in the folder where snpeff runs... - cp ${snpeff_config} . - - snpEff eff -Xmx${params.memory} -dataDir ${snpeff_data} \ - -noStats -no-downstream -no-upstream -no-intergenic -no-intron -onlyProtein -hgvs1LetterAa -noShiftHgvs \ - ${snpeff_organism} ${vcf} | bgzip -c > ${sampleId}_${caller}_snpeff.vcf.gz - - tabix -p vcf ${sampleId}_${caller}_snpeff.vcf.gz - """ -} - - -process VARIANT_VAF_ANNOTATION { - cpus 1 - memory "3 GB" - publishDir "${params.outDir}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "conda-forge::gsl=2.7 bioconda::bcftools=1.14" : null) - - input: - tuple val(sampleId), path(vcf) - - output: - tuple val(sampleId), path("${sampleId}.lowfreq_subclonal.vcf"), emit: vaf_annotated - - """ - tabix -p vcf ${vcf} - - # annotates low frequency and subclonal variants - bcftools view -Ob ${vcf} | \ - bcftools filter \ - --exclude 'INFO/vafator_af < ${params.low_frequency_variant_threshold}' \ - --soft-filter LOW_FREQUENCY - | \ - bcftools filter \ - --exclude 'INFO/vafator_af >= ${params.low_frequency_variant_threshold} && INFO/vafator_af < ${params.subclonal_variant_threshold}' \ - --soft-filter SUBCLONAL \ - --output-type v - | \ - bcftools filter \ - --exclude 'INFO/vafator_af >= ${params.subclonal_variant_threshold} && INFO/vafator_af < ${params.lq_clonal_variant_threshold}' \ - --soft-filter LOW_QUALITY_CLONAL \ - --output-type v - > ${sampleId}.lowfreq_subclonal.vcf - """ -} - -/** -Add SARS-CoV-2 specific annotations: conservation, Pfam domains and problematic sites. -Also, according to problematic sites described in DeMaio et al. (2020) we filter out any variants at the beginning and -end of the genome. -*/ -process VARIANT_SARSCOV2_ANNOTATION { - cpus 1 - memory "3 GB" - publishDir "${params.output}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "conda-forge::gsl=2.7 bioconda::bcftools=1.14" : null) - - input: - tuple val(sampleId), path(vcf) - - output: - tuple val(sampleId), path("${sampleId}.annotated_sarscov2.vcf"), emit: annotated_vcfs - - """ - bcftools annotate \ - --annotations ${params.conservation_sarscov2} \ - --header-lines ${params.conservation_sarscov2_header} \ - -c CHROM,FROM,TO,CONS_HMM_SARS_COV_2 \ - --output-type z ${vcf} | \ - bcftools annotate \ - --annotations ${params.conservation_sarbecovirus} \ - --header-lines ${params.conservation_sarbecovirus_header} \ - -c CHROM,FROM,TO,CONS_HMM_SARBECOVIRUS \ - --output-type z - | \ - bcftools annotate \ - --annotations ${params.conservation_vertebrate} \ - --header-lines ${params.conservation_vertebrate_header} \ - -c CHROM,FROM,TO,CONS_HMM_VERTEBRATE_COV \ - --output-type z - | \ - bcftools annotate \ - --annotations ${params.pfam_names} \ - --header-lines ${params.pfam_names_header} \ - -c CHROM,FROM,TO,PFAM_NAME \ - --output-type z - | \ - bcftools annotate \ - --annotations ${params.pfam_descriptions} \ - --header-lines ${params.pfam_descriptions_header} \ - -c CHROM,FROM,TO,PFAM_DESCRIPTION - | \ - bcftools filter \ - --exclude 'POS <= 55 | POS >= 29804' \ - --output-type z - > annotated_sarscov2.vcf.gz - - tabix -p vcf annotated_sarscov2.vcf.gz - - bcftools annotate \ - --annotations ${params.problematic_sites} \ - --columns INFO/problematic:=FILTER annotated_sarscov2.vcf.gz > ${sampleId}.annotated_sarscov2.vcf - """ -} - -process PANGOLIN { - cpus 1 - memory "3 GB" - publishDir "${params.outDir}", mode: "copy" - tag "${sampleId}" - - conda (params.enable_conda ? "bioconda::pangolin=4.1.2" : null) - - input: - tuple val(sampleId), path(fasta) - - output: - tuple val(name), path("*pangolin.csv") - - """ - mkdir tmp - #--decompress-model - pangolin \ - ${fasta} \ - --outfile ${sampleId}.pangolin.csv \ - --tempdir ./tmp \ - --threads 1 - """ -} \ No newline at end of file From 177bae94c1aaed97ab4d685f78334a14be378ebd Mon Sep 17 00:00:00 2001 From: Pablo Riesgo-Ferreiro Date: Thu, 25 Jan 2024 12:40:25 +0000 Subject: [PATCH 3/3] fix indentation --- main.nf | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/main.nf b/main.nf index 34fd95f..1c1b3a3 100755 --- a/main.nf +++ b/main.nf @@ -248,28 +248,28 @@ workflow { else { if (params.input_ont) { FASTQC(input_fastqs) - PORECHOP(input_fastqs) - CHOPPER(PORECHOP.out.fq) - NANOPLOT(CHOPPER.out.fq) - input_bams = MINIMAP2(CHOPPER.out.fq).bam + PORECHOP(input_fastqs) + CHOPPER(PORECHOP.out.fq) + NANOPLOT(CHOPPER.out.fq) + input_bams = MINIMAP2(CHOPPER.out.fq).bam } else { - READ_TRIMMING_SINGLE_END(input_fastqs) - ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) - bam_files = ALIGNMENT_SINGLE_END.out + READ_TRIMMING_SINGLE_END(input_fastqs) + ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) + bam_files = ALIGNMENT_SINGLE_END.out } if (params.input_ont) { preprocessed_bams = PRIMER_SOFTCLIP(OPTIMUS_NO_PRIME(input_bams).prediction) } else { - BAM_PREPROCESSING(bam_files, reference) - preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams - - if (primers) { - PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) - preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam - } - COVERAGE_ANALYSIS(preprocessed_bams) + BAM_PREPROCESSING(bam_files, reference) + preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams + + if (primers) { + PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) + preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam + } + COVERAGE_ANALYSIS(preprocessed_bams) // variant calling vcfs_to_normalize = null