Integrate ont pipeline #64

Open · wants to merge 3 commits into base: master
65 changes: 65 additions & 0 deletions bin/optimus_no_prime.py
@@ -0,0 +1,65 @@
import os
import sys
import re

def list_primersets(path):
    """Return the primer-set coverage files in path: BED files that are not the 5' coverage outputs."""
    ignore = ("5p_cov", ".txt", ".bedGraph")
    filenames = os.listdir(path)
    filtered_filenames = [f for f in filenames if not any(j in f for j in ignore) and ".bed" in f]
    return filtered_filenames

def get_libsize(path):
    """Sum the 5' coverage column (column 5) of a BED file, i.e. the total number of read ends."""
    total = 0
    with open(path, 'r') as file:
        for line in file:
            columns = line.split()
            if len(columns) >= 5:
                total += float(columns[4])
    return total

def get_score(path, T, G):
    """Score how well one primer set explains the observed 5' read-end coverage.

    T -- total 5' coverage across the genome (library size)
    G -- genome length
    A -- 5' coverage summed over the primer intervals of this set
    S -- fraction of the genome spanned by the primer intervals
    P -- fraction of the 5' coverage falling outside the primer intervals
    """
    A = 0
    S = 0
    with open(path, 'r') as file:
        for line in file:
            columns = line.split()
            if len(columns) >= 7:
                S += float(columns[2]) - float(columns[1])
                A += float(columns[6])
    S = S / G
    P = 1 - (A / T)
    score = A / (T * S * P)
    return score


# Run
G = 29903  # SARS-CoV-2 genome length

# Directory with the per-primer-set coverage files generated by bedtools (optimus_no_prime.sh)
path = sys.argv[1]

sampleId = sys.argv[2]

# Genome-wide 5' read-end coverage written by optimus_no_prime.sh
total_cov_file = [f for f in os.listdir(path) if "5p_cov" in f and "Graph" not in f]
primersets = list_primersets(path)

# The library size is the same for every primer set, so compute it once
libsize = get_libsize(os.path.join(path, total_cov_file[0]))

best_score = 0.0
likely_primerset = primersets[0]
for primerset in primersets:
    score = get_score(os.path.join(path, primerset), libsize, G)
    if score >= best_score:
        likely_primerset = primerset
        best_score = score

# Remove sampleId from the primer-set name
likely_primerset = re.sub(sampleId + "_", "", likely_primerset, 1)

# If the best score is too low, either the primer set used is not in the library or the primers
# have already been trimmed. In that case return NA and primer clipping will be skipped.
if best_score < 5:
    likely_primerset = "NA"

sys.stdout.write(likely_primerset)
sys.stdout.flush()
sys.exit(0)
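
A toy illustration of the scoring heuristic (all numbers invented, not taken from this PR): when most 5' read ends fall inside the intervals of a small primer set the score is large, and it drops to roughly 1 when the read ends are spread uniformly across the genome.

# Hypothetical values only, to show how get_score combines its terms.
T = 100000.0   # total 5' read-end coverage (library size)
G = 29903      # genome length
A = 90000.0    # 5' coverage inside this primer set's intervals
span = 3000.0  # summed length of the primer intervals

S = span / G              # fraction of the genome occupied by primers
P = 1 - (A / T)           # fraction of read ends outside primer sites
score = A / (T * S * P)
print(round(score, 1))    # ~89.7, well above the cut-off of 5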
23 changes: 23 additions & 0 deletions bin/optimus_no_prime.sh
@@ -0,0 +1,23 @@
#!/bin/bash

primerDir=${1}   # directory with candidate primer-set BED files
sampleId=${2}
bam=${3}

# Get 5' coverage of read ends across the genome, separately per strand
bedtools genomecov -5 -strand + -ibam ${bam} -dz \
    | awk '{OFS="\t"}{x=$2+1}{print $1,$2,x,$1":"$2"-"x":+",$3,"+"}' > temp.bed
bedtools genomecov -5 -strand - -ibam ${bam} -dz \
    | awk '{OFS="\t"}{x=$2+1}{print $1,$2,x,$1":"$2"-"x":-",$3,"-"}' >> temp.bed
sort -k1,1 -k2,2n temp.bed > ${sampleId}_5p_cov.bed
cut -f1,2,3,5 ${sampleId}_5p_cov.bed > ${sampleId}_5p_cov.bedGraph
rm temp.bed

# Overlap the read-end coverage with each primer BED file and sum the 5' coverage per primer interval
primerList=($(find ${primerDir} -name "*.bed" -printf "%f\n"))
for pset in "${primerList[@]}"
do
    bedtools map -a ${primerDir}/${pset} -b ${sampleId}_5p_cov.bed -s -c 5 -o sum -null 0 \
        | awk -F'\t' '{OFS="\t"}{print $1,$2,$3,$4,$5,$6,$(NF)}' \
        > ${sampleId}_${pset}
done
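
Each ${sampleId}_${pset} file written by the loop above is the primer BED plus one extra final column holding the summed 5' read-end coverage over that primer interval; this seventh column is what optimus_no_prime.py reads. A minimal sketch of the field layout (the record below, including the contig name, is made up for illustration):

# Hypothetical line from a <sampleId>_<primerset> file produced above:
# chrom        start  end  name           score  strand  summed 5' coverage
line = "MN908947.3\t30\t54\tprimer_1_LEFT\t60\t+\t1523"

columns = line.split()
primer_span = float(columns[2]) - float(columns[1])  # contributes to S in get_score
end_coverage = float(columns[6])                     # contributes to A in get_score
print(primer_span, end_coverage)                     # 24.0 1523.0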
32 changes: 28 additions & 4 deletions main.nf
@@ -13,6 +13,11 @@ include { VARIANT_ANNOTATION; VARIANT_SARSCOV2_ANNOTATION;
VARIANT_VAF_ANNOTATION; VAFATOR } from './modules/06_variant_annotation'
include { PANGOLIN_LINEAGE; VCF2FASTA } from './modules/07_lineage_annotation'
include { BGZIP } from './modules/08_compress_vcf'
include { FASTQC; NANOPLOT } from './modules/09_ont_qc'
include { NANOFILT; PORECHOP; CHOPPER; PORECHOP_ABI; MINIMAP2 } from './modules/10_ont_preprocess'
include { OPTIMUS_NO_PRIME; PRIMER_SOFTCLIP } from './modules/11_ont_primer_removal'
include { NANOCALLER; CLAIR3 } from './modules/12_ont_variant_calling'



params.help= false
@@ -60,6 +65,8 @@ params.input_fastqs_list = false
params.input_fastas_list = false
params.input_vcfs_list = false
params.input_bams_list = false
params.input_ont = false


if (params.help) {
log.info params.help_message
@@ -239,10 +246,22 @@ workflow {
bam_files = ALIGNMENT_PAIRED_END.out
}
else {
if (params.input_ont) {
FASTQC(input_fastqs)
PORECHOP(input_fastqs)
CHOPPER(PORECHOP.out.fq)
NANOPLOT(CHOPPER.out.fq)
input_bams = MINIMAP2(CHOPPER.out.fq).bam
}
else {
READ_TRIMMING_SINGLE_END(input_fastqs)
ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference)
bam_files = ALIGNMENT_SINGLE_END.out
}
if (params.input_ont) {
preprocessed_bams = PRIMER_SOFTCLIP(OPTIMUS_NO_PRIME(input_bams).prediction)
}
else {
BAM_PREPROCESSING(bam_files, reference)
preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams

@@ -254,27 +273,28 @@

// variant calling
vcfs_to_normalize = null
if (!params.skip_bcftools) {
if (!params.skip_bcftools && !params.input_ont) {
VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out)
}
if (!params.skip_lofreq) {
if (!params.skip_lofreq && !params.input_ont) {
VARIANT_CALLING_LOFREQ(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out)
}
if (!params.skip_gatk) {
if (!params.skip_gatk && !params.input_ont) {
VARIANT_CALLING_GATK(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out)
}
if (!params.skip_ivar && gff) {
if (!params.skip_ivar && gff && !params.input_ont) {
VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff)
IVAR2VCF(VARIANT_CALLING_IVAR.out, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out)
}

}
else if (input_fastas) {
if (!params.skip_pangolin) {
@@ -313,6 +333,10 @@ workflow {
}

if (preprocessed_bams) {
if (params.input_ont) {
CLAIR3(preprocessed_bams.bam)
NANOCALLER(preprocessed_bams.bam)
}
// we can only add technical annotations when we have the reads
VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0))
VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf)
43 changes: 43 additions & 0 deletions modules/09_ont_qc.nf
@@ -0,0 +1,43 @@
process FASTQC {
cpus 2
memory "3 GB"
publishDir "${params.outDir}/${sampleId}/qc", mode: "copy"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null)

input:
tuple val(sampleId), path(fq)

output:
path("*fastqc*")

"""
fastqc \
--threads 2 \
${fq}
"""
}

process NANOPLOT {
cpus 2
memory "3 GB"
publishDir "${params.outDir}/${sampleId}/qc", mode: "copy"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::nanoplot=1.40.2" : null)

input:
tuple val(sampleId), path(fq)

output:
path("*${sampleId}_*")

"""
NanoPlot \
--fastq ${fq} \
-p ${sampleId}_ \
-t 1 \
--tsv_stats
"""
}
132 changes: 132 additions & 0 deletions modules/10_ont_preprocess.nf
@@ -0,0 +1,132 @@
process NANOFILT {
cpus 3
memory params.memory
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::nanofilt=2.8.0" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_qctrim.fq.gz"), emit: fq

"""
gunzip -c ${inputFq} | NanoFilt \
--quality 10 \
--length 300 \
--logfile ${sampleId}_qctrim.log \
| gzip > ${sampleId}_qctrim.fq.gz
"""
}

process PORECHOP {
cpus params.cpus
memory params.memory
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_noAdapter.fq.gz"), emit: fq
path("${sampleId}_porechop.log")

"""
porechop \
--input ${inputFq} \
--output ${sampleId}_noAdapter.fq.gz \
--threads ${params.cpus} \
--format "fastq" \
--verbosity 1 > ${sampleId}_porechop.log
"""
}

process CHOPPER {
cpus params.cpus+1
memory params.memory
publishDir "${params.outDir}/${sampleId}", mode: "copy", pattern: "*.log"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::chopper=0.5.0" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_qctrim.fq.gz"), emit: fq
path("${sampleId}_chopper.log")

"""
cat ${inputFq} \
| chopper \
--quality 10 \
--minlength 300 \
--threads ${params.cpus} \
2> ${sampleId}_chopper.log \
| gzip > ${sampleId}_qctrim.fq.gz
"""
}

process PORECHOP_ABI {
cpus params.cpus
memory params.memory
publishDir "${params.outDir}/${sampleId}", mode: "copy", pattern: "*.log"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::porechop_abi=0.5.0" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_noAdapter.fq.gz"), emit: fq
path("${sampleId}_porechop_abi.log")

"""
porechop_abi \
--ab_initio \
--input ${inputFq} \
--output ${sampleId}_noAdapter.fq.gz \
--threads ${params.cpus} \
--format "fastq.gz" \
--verbosity 1 > "${sampleId}_porechop_abi.log"
"""
}

process MINIMAP2 {
cpus params.cpus
memory params.memory
publishDir "${params.outDir}/${sampleId}", mode: "copy"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::minimap2=2.24 bioconda::samtools=1.16.1" : null)

input:
tuple val(sampleId), path(fq)

output:
tuple val(sampleId), path("${sampleId}.bam"), path("${sampleId}.bam.bai"), emit: bam

"""
cores=\$((${params.cpus} - 2))

minimap2 \
-x map-ont \
-a ${params.sarscov2_reference} \
-t \${cores} \
${fq} \
| samtools view \
-b \
- \
| samtools sort \
-o ${sampleId}.bam \
--threads 1 \
-T ${sampleId} \
-

samtools index ${sampleId}.bam
"""
}