Integrate ont pipeline #64

Open · wants to merge 3 commits into base: master
65 changes: 65 additions & 0 deletions bin/optimus_no_prime.py
@@ -0,0 +1,65 @@
import os
import sys
import re

def list_primersets(path):
    """Return the primer-set coverage files in path: BED files that are not the 5' coverage outputs."""
    ignore = ("5p_cov", ".txt", ".bedGraph")
    filenames = os.listdir(path)
    filtered_filenames = [f for f in filenames if not any(j in f for j in ignore) and ".bed" in f]
    return filtered_filenames

def get_libsize(path):
    """Sum the 5' coverage column (column 5) of a BED file, i.e. the total number of read ends."""
    total = 0
    with open(path, 'r') as file:
        for line in file:
            columns = line.split()
            if len(columns) >= 5:
                total += float(columns[4])
    return total

def get_score(path, T, G):
    """Score how well one primer set explains the observed 5' read-end coverage.

    T -- total 5' coverage across the genome (library size)
    G -- genome length
    A -- 5' coverage summed over the primer intervals of this set
    S -- fraction of the genome spanned by the primer intervals
    P -- fraction of the 5' coverage falling outside the primer intervals
    """
    A = 0
    S = 0
    with open(path, 'r') as file:
        for line in file:
            columns = line.split()
            if len(columns) >= 7:
                S += float(columns[2]) - float(columns[1])
                A += float(columns[6])
    S = S / G
    P = 1 - (A / T)
    score = A / (T * S * P)
    return score


# Run
G = 29903  # SARS-CoV-2 genome length

# Directory with the per-primer-set coverage files generated by bedtools (optimus_no_prime.sh)
path = sys.argv[1]

sampleId = sys.argv[2]

# Genome-wide 5' read-end coverage written by optimus_no_prime.sh
total_cov_file = [f for f in os.listdir(path) if "5p_cov" in f and "Graph" not in f]
primersets = list_primersets(path)

# The library size is the same for every primer set, so compute it once
libsize = get_libsize(os.path.join(path, total_cov_file[0]))

best_score = 0.0
likely_primerset = primersets[0]
for primerset in primersets:
    score = get_score(os.path.join(path, primerset), libsize, G)
    if score >= best_score:
        likely_primerset = primerset
        best_score = score

# Remove sampleId from the primer-set name
likely_primerset = re.sub(sampleId + "_", "", likely_primerset, 1)

# If the best score is too low, either the primer set used is not in the library or the primers
# have already been trimmed. In that case return NA and primer clipping will be skipped.
if best_score < 5:
    likely_primerset = "NA"

sys.stdout.write(likely_primerset)
sys.stdout.flush()
sys.exit(0)
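
A toy illustration of the scoring heuristic (all numbers invented, not taken from this PR): when most 5' read ends fall inside the intervals of a small primer set the score is large, and it drops to roughly 1 when the read ends are spread uniformly across the genome.

# Hypothetical values only, to show how get_score combines its terms.
T = 100000.0   # total 5' read-end coverage (library size)
G = 29903      # genome length
A = 90000.0    # 5' coverage inside this primer set's intervals
span = 3000.0  # summed length of the primer intervals

S = span / G              # fraction of the genome occupied by primers
P = 1 - (A / T)           # fraction of read ends outside primer sites
score = A / (T * S * P)
print(round(score, 1))    # ~89.7, well above the cut-off of 5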
23 changes: 23 additions & 0 deletions bin/optimus_no_prime.sh
@@ -0,0 +1,23 @@
#!/bin/bash

primerDir=${1}   # directory with candidate primer-set BED files
sampleId=${2}
bam=${3}

# Get 5' coverage of read ends across the genome, separately per strand
bedtools genomecov -5 -strand + -ibam ${bam} -dz \
    | awk '{OFS="\t"}{x=$2+1}{print $1,$2,x,$1":"$2"-"x":+",$3,"+"}' > temp.bed
bedtools genomecov -5 -strand - -ibam ${bam} -dz \
    | awk '{OFS="\t"}{x=$2+1}{print $1,$2,x,$1":"$2"-"x":-",$3,"-"}' >> temp.bed
sort -k1,1 -k2,2n temp.bed > ${sampleId}_5p_cov.bed
cut -f1,2,3,5 ${sampleId}_5p_cov.bed > ${sampleId}_5p_cov.bedGraph
rm temp.bed

# Overlap the read-end coverage with each primer BED file and sum the 5' coverage per primer interval
primerList=($(find ${primerDir} -name "*.bed" -printf "%f\n"))
for pset in "${primerList[@]}"
do
    bedtools map -a ${primerDir}/${pset} -b ${sampleId}_5p_cov.bed -s -c 5 -o sum -null 0 \
        | awk -F'\t' '{OFS="\t"}{print $1,$2,$3,$4,$5,$6,$(NF)}' \
        > ${sampleId}_${pset}
done
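
Each ${sampleId}_${pset} file written by the loop above is the primer BED plus one extra final column holding the summed 5' read-end coverage over that primer interval; this seventh column is what optimus_no_prime.py reads. A minimal sketch of the field layout (the record below, including the contig name, is made up for illustration):

# Hypothetical line from a <sampleId>_<primerset> file produced above:
# chrom        start  end  name           score  strand  summed 5' coverage
line = "MN908947.3\t30\t54\tprimer_1_LEFT\t60\t+\t1523"

columns = line.split()
primer_span = float(columns[2]) - float(columns[1])  # contributes to S in get_score
end_coverage = float(columns[6])                     # contributes to A in get_score
print(primer_span, end_coverage)                     # 24.0 1523.0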
32 changes: 28 additions & 4 deletions main.nf
@@ -13,6 +13,11 @@ include { VARIANT_ANNOTATION; VARIANT_SARSCOV2_ANNOTATION;
VARIANT_VAF_ANNOTATION; VAFATOR } from './modules/06_variant_annotation'
include { PANGOLIN_LINEAGE; VCF2FASTA } from './modules/07_lineage_annotation'
include { BGZIP } from './modules/08_compress_vcf'
include { FASTQC; NANOPLOT } from './modules/09_ont_qc'
include { NANOFILT; PORECHOP; CHOPPER; PORECHOP_ABI; MINIMAP2 } from './modules/10_ont_preprocess'
include { OPTIMUS_NO_PRIME; PRIMER_SOFTCLIP } from './modules/11_ont_primer_removal'
include { NANOCALLER; CLAIR3 } from './modules/12_ont_variant_calling'



params.help= false
@@ -60,6 +65,8 @@ params.input_fastqs_list = false
params.input_fastas_list = false
params.input_vcfs_list = false
params.input_bams_list = false
params.input_ont = false


if (params.help) {
log.info params.help_message
@@ -239,10 +246,22 @@ workflow {
bam_files = ALIGNMENT_PAIRED_END.out
}
else {
if (params.input_ont) {
FASTQC(input_fastqs)
PORECHOP(input_fastqs)
CHOPPER(PORECHOP.out.fq)
NANOPLOT(CHOPPER.out.fq)
input_bams = MINIMAP2(CHOPPER.out.fq).bam
}
else {
READ_TRIMMING_SINGLE_END(input_fastqs)
ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference)
bam_files = ALIGNMENT_SINGLE_END.out
}
if (params.input_ont) {
preprocessed_bams = PRIMER_SOFTCLIP(OPTIMUS_NO_PRIME(input_bams).prediction)
}
else {
BAM_PREPROCESSING(bam_files, reference)
preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams

@@ -254,27 +273,28 @@

// variant calling
vcfs_to_normalize = null
if (!params.skip_bcftools) {
if (!params.skip_bcftools && !params.input_ont) {
VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out)
}
if (!params.skip_lofreq) {
if (!params.skip_lofreq && !params.input_ont) {
VARIANT_CALLING_LOFREQ(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out)
}
if (!params.skip_gatk) {
if (!params.skip_gatk && !params.input_ont) {
VARIANT_CALLING_GATK(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out)
}
if (!params.skip_ivar && gff) {
if (!params.skip_ivar && gff && !params.input_ont) {
VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff)
IVAR2VCF(VARIANT_CALLING_IVAR.out, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out)
}

}
else if (input_fastas) {
if (!params.skip_pangolin) {
@@ -313,6 +333,10 @@ workflow {
}

if (preprocessed_bams) {
if (params.input_ont) {
CLAIR3(preprocessed_bams.bam)
NANOCALLER(preprocessed_bams.bam)
}
// we can only add technical annotations when we have the reads
VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0))
VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf)
43 changes: 43 additions & 0 deletions modules/09_ont_qc.nf
@@ -0,0 +1,43 @@
process FASTQC {
cpus 2
memory "3 GB"
publishDir "${params.outDir}/${sampleId}/qc", mode: "copy"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null)

input:
tuple val(sampleId), path(fq)

output:
path("*fastqc*")

"""
fastqc \
--threads 2 \
${fq}
"""
}

process NANOPLOT {
cpus 2
memory "3 GB"
publishDir "${params.outDir}/${sampleId}/qc", mode: "copy"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::nanoplot=1.40.2" : null)

input:
tuple val(sampleId), path(fq)

output:
path("*${sampleId}_*")

"""
NanoPlot \
--fastq ${fq} \
-p ${sampleId}_ \
-t 1 \
--tsv_stats
"""
}
132 changes: 132 additions & 0 deletions modules/10_ont_preprocess.nf
@@ -0,0 +1,132 @@
process NANOFILT {
cpus 3
memory params.memory
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::nanofilt=2.8.0" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_qctrim.fq.gz"), emit: fq

"""
gunzip -c ${inputFq} | NanoFilt \
--quality 10 \
--length 300 \
--logfile ${sampleId}_qctrim.log \
| gzip > ${sampleId}_qctrim.fq.gz
"""
}

process PORECHOP {
cpus params.cpus
memory params.memory
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::porechop=0.2.4" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_noAdapter.fq.gz"), emit: fq
path("${sampleId}_porechop.log")

"""
porechop \
--input ${inputFq} \
--output ${sampleId}_noAdapter.fq.gz \
--threads ${params.cpus} \
--format "fastq" \
--verbosity 1 > ${sampleId}_porechop.log
"""
}

process CHOPPER {
cpus params.cpus+1
memory params.memory
publishDir "${params.outDir}/${sampleId}", mode: "copy", pattern: "*.log"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::chopper=0.5.0" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_qctrim.fq.gz"), emit: fq
path("${sampleId}_chopper.log")

"""
cat ${inputFq} \
| chopper \
--quality 10 \
--minlength 300 \
--threads ${params.cpus} \
2> ${sampleId}_chopper.log \
| gzip > ${sampleId}_qctrim.fq.gz
"""
}

process PORECHOP_ABI {
cpus params.cpus
memory params.memory
publishDir "${params.outDir}/${sampleId}", mode: "copy", pattern: "*.log"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::porechop_abi=0.5.0" : null)

input:
tuple val(sampleId), path(inputFq)

output:
tuple val(sampleId), path("${sampleId}_noAdapter.fq.gz"), emit: fq
path("${sampleId}_porechop_abi.log")

"""
porechop_abi \
--ab_initio \
--input ${inputFq} \
--output ${sampleId}_noAdapter.fq.gz \
--threads ${params.cpus} \
--format "fastq.gz" \
--verbosity 1 > "${sampleId}_porechop_abi.log"
"""
}

process MINIMAP2 {
cpus params.cpus
memory params.memory
publishDir "${params.outDir}/${sampleId}", mode: "copy"
tag "${sampleId}"

conda (params.enable_conda ? "bioconda::minimap2=2.24 bioconda::samtools=1.16.1" : null)

input:
tuple val(sampleId), path(fq)

output:
tuple val(sampleId), path("${sampleId}.bam"), path("${sampleId}.bam.bai"), emit: bam

"""
cores=\$((${params.cpus} - 2))

minimap2 \
-x map-ont \
-a ${params.sarscov2_reference} \
-t \${cores} \
${fq} \
| samtools view \
-b \
- \
| samtools sort \
-o ${sampleId}.bam \
--threads 1 \
-T ${sampleId} \
-

samtools index ${sampleId}.bam
"""
}