Skip to content

Commit

Permalink
Merge pull request #54 from TRON-Bioinformatics/51_pangolin_only_mode
Browse files Browse the repository at this point in the history
Implemented lineage only mode
  • Loading branch information
priesgo authored Oct 10, 2023
2 parents 6d95e04 + 16011f7 commit 6a52c63
Show file tree
Hide file tree
Showing 8 changed files with 218 additions and 98 deletions.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,7 @@ test:
bash tests/scripts/test_11.sh
bash tests/scripts/test_12.sh
bash tests/scripts/test_13.sh
bash tests/scripts/test_14.sh
bash tests/scripts/test_15.sh
bash tests/scripts/test_16.sh
bash tests/scripts/test_python_unit_tests.sh
235 changes: 138 additions & 97 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ params.skip_bcftools = false
params.skip_gatk = false
params.skip_pangolin = false
params.skip_normalization = false
params.lineage_mode = false

// references
params.reference = false
Expand All @@ -39,7 +40,7 @@ params.gff = false
//params.snpeff_config = false
params.snpeff_organism = false
params.primers = false
params.different_virus = false
params.reference_generate = false

params.output = "."
params.min_mapping_quality = 20
Expand Down Expand Up @@ -206,6 +207,11 @@ else if (params.input_vcfs_list != false || params.vcf != false) {
.set { input_vcfs }
}
}
else if(params.reference_generate) {
genome = params.reference
annotation = gff
annotation_name = snpeff_organism
}
else {
log.error "missing some input data"
exit 1
Expand Down Expand Up @@ -240,119 +246,154 @@ if (params.skip_bcftools && params.skip_gatk && params.skip_ivar && params.skip_
exit 1
}

// Lineage mode should only be used when running with assembly or VCF input
// For FASTQ input we can run pangolin as regular rule
if (params.lineage_mode && input_fastqs) {
log.error "lineage mode is only supported with fasta or VCF input"
exit 1
}

workflow {
if (params.reference) {
if (! skip_snpeff && gff) {
SNPEFF_DATABASE(reference, gff, snpeff_organism)
snpeff_data = SNPEFF_DATABASE.out.snpeff_data
snpeff_config = SNPEFF_DATABASE.out.snpeff_config
}

}
if (input_fastqs) {
if (params.reference) {
BWA_INDEX(reference)
reference = BWA_INDEX.out.reference
if (params.lineage_mode) {
log.info "Running lineage mode. In this mode only Pangolin is executed on either fasta or VCF"
if (input_fastas) {
PANGOLIN_LINEAGE(input_fastas)
}
if (library == "paired") {
READ_TRIMMING_PAIRED_END(input_fastqs)
ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference)
bam_files = ALIGNMENT_PAIRED_END.out
else if (input_vcfs) {
if (! params.skip_normalization) {
VARIANT_NORMALIZATION(input_vcfs, reference)
normalized_vcfs = VARIANT_NORMALIZATION.out
}
else {
normalized_vcfs = input_vcfs
}
VCF2FASTA(normalized_vcfs, reference)
PANGOLIN_LINEAGE(VCF2FASTA.out)
}
else {
READ_TRIMMING_SINGLE_END(input_fastqs)
ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference)
bam_files = ALIGNMENT_SINGLE_END.out
}
else if (params.reference_generate) {
log.info "Running genome/annotation generate mode. This step generates only bwa and snpEff databases..."
// Custom reference mode. This mode is added to prepare the reference just once when processing many samples
if (genome) {
BWA_INDEX(genome)
if (!skip_snpeff) {
SNPEFF_DATABASE(genome, gff, snpeff_organism)
}
}
BAM_PREPROCESSING(bam_files, reference)
preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams
}
else {
// Run genome and annotation steps for single sample
if (params.reference) {
if (! skip_snpeff && gff) {
SNPEFF_DATABASE(reference, gff, snpeff_organism)
snpeff_data = SNPEFF_DATABASE.out.snpeff_data
snpeff_config = SNPEFF_DATABASE.out.snpeff_config
}

if (primers) {
PRIMER_TRIMMING_IVAR(preprocessed_bams, primers)
preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam
}
COVERAGE_ANALYSIS(preprocessed_bams)

// variant calling
vcfs_to_normalize = null
if (!params.skip_bcftools) {
VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out)
if (input_fastqs) {
if (params.reference) {
BWA_INDEX(reference)
reference = BWA_INDEX.out.reference
}
if (library == "paired") {
READ_TRIMMING_PAIRED_END(input_fastqs)
ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference)
bam_files = ALIGNMENT_PAIRED_END.out
}
else {
READ_TRIMMING_SINGLE_END(input_fastqs)
ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference)
bam_files = ALIGNMENT_SINGLE_END.out
}
BAM_PREPROCESSING(bam_files, reference)
preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams

if (primers) {
PRIMER_TRIMMING_IVAR(preprocessed_bams, primers)
preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam
}
COVERAGE_ANALYSIS(preprocessed_bams)

// variant calling
vcfs_to_normalize = null
if (!params.skip_bcftools) {
VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out)
}
if (!params.skip_lofreq) {
VARIANT_CALLING_LOFREQ(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out)
}
if (!params.skip_gatk) {
VARIANT_CALLING_GATK(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out)
}
if (!params.skip_ivar && gff) {
VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff)
IVAR2VCF(VARIANT_CALLING_IVAR.out, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out)
}
}
if (!params.skip_lofreq) {
VARIANT_CALLING_LOFREQ(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out)
else if (input_fastas) {
if (!params.skip_pangolin) {
// pangolin from fasta
PANGOLIN_LINEAGE(input_fastas)
}
// assembly variant calling
VARIANT_CALLING_ASSEMBLY(input_fastas, reference)
vcfs_to_normalize = VARIANT_CALLING_ASSEMBLY.out
}
if (!params.skip_gatk) {
VARIANT_CALLING_GATK(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out)
else if (input_vcfs) {
vcfs_to_normalize = input_vcfs
}
if (!params.skip_ivar && gff) {
VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff)
IVAR2VCF(VARIANT_CALLING_IVAR.out, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out)

if (! params.skip_normalization) {
VARIANT_NORMALIZATION(vcfs_to_normalize, reference)
normalized_vcfs = VARIANT_NORMALIZATION.out
}
}
else if (input_fastas) {
if (!params.skip_pangolin) {
// pangolin from fasta
PANGOLIN_LINEAGE(input_fastas)
else {
normalized_vcfs = vcfs_to_normalize
}

// assembly variant calling
VARIANT_CALLING_ASSEMBLY(input_fastas, reference)
vcfs_to_normalize = VARIANT_CALLING_ASSEMBLY.out
}
else if (input_vcfs) {
vcfs_to_normalize = input_vcfs
}

if (! params.skip_normalization) {
VARIANT_NORMALIZATION(vcfs_to_normalize, reference)
normalized_vcfs = VARIANT_NORMALIZATION.out
}
else {
normalized_vcfs = vcfs_to_normalize
}

if (input_fastqs || input_vcfs) {
// pangolin from VCF on the normalized VCFs
if (!params.skip_pangolin) {
VCF2FASTA(normalized_vcfs, reference)
PANGOLIN_LINEAGE(VCF2FASTA.out)
if (input_fastqs || input_vcfs) {
// pangolin from VCF on the normalized VCFs
if (!params.skip_pangolin) {
VCF2FASTA(normalized_vcfs, reference)
PANGOLIN_LINEAGE(VCF2FASTA.out)
}
}
}

if (! skip_sarscov2_annotations) {
// only optionally add SARS-CoV-2 specific annotations
VARIANT_SARSCOV2_ANNOTATION(normalized_vcfs)
normalized_vcfs = VARIANT_SARSCOV2_ANNOTATION.out.annotated_vcfs
}
if (! skip_sarscov2_annotations) {
// only optionally add SARS-CoV-2 specific annotations
VARIANT_SARSCOV2_ANNOTATION(normalized_vcfs)
normalized_vcfs = VARIANT_SARSCOV2_ANNOTATION.out.annotated_vcfs
}

if (preprocessed_bams) {
// we can only add technical annotations when we have the reads
VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0))
VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf)
normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated
}
if (preprocessed_bams) {
// we can only add technical annotations when we have the reads
VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0))
VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf)
normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated
}

// NOTE: phasing has to happen before SnpEff annotation for MNVs to be annotated correctly
if (gff) {
PHASING(normalized_vcfs, reference, gff)
normalized_vcfs = PHASING.out
}
// NOTE: phasing has to happen before SnpEff annotation for MNVs to be annotated correctly
if (gff) {
PHASING(normalized_vcfs, reference, gff)
normalized_vcfs = PHASING.out
}

if (! skip_snpeff) {
// only when configured we run SnpEff
VARIANT_ANNOTATION(normalized_vcfs, snpeff_data, snpeff_config, snpeff_organism)
normalized_vcfs = VARIANT_ANNOTATION.out.annotated_vcfs
}
else {
BGZIP(normalized_vcfs)
if (! skip_snpeff) {
// only when configured we run SnpEff
VARIANT_ANNOTATION(normalized_vcfs, snpeff_data, snpeff_config, snpeff_organism)
normalized_vcfs = VARIANT_ANNOTATION.out.annotated_vcfs
}
else {
BGZIP(normalized_vcfs)
}
}
}
5 changes: 5 additions & 0 deletions modules/00_prepare_annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ process BWA_INDEX {
path("reference/sequences.fa"), emit: reference
path("reference/sequences.fa.fai"), emit: fai
path("reference/sequences.dict"), emit: gatk_dict
path("reference/sequences.fa.0123")
path("reference/sequences.fa.amb")
path("reference/sequences.fa.ann")
path("reference/sequences.fa.bwt.2bit.64")
path("reference/sequences.fa.pac")

script:
memory = "${params.memory}".replaceAll(" ", "").toLowerCase()
Expand Down
3 changes: 2 additions & 1 deletion modules/07_lineage_annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ process PANGOLIN_LINEAGE {

when:
// only runs pangolin on LoFreq and the assembly results
caller == "lofreq" || caller == "assembly"
// JoHa: added input to run lineage mode for vcf input
caller == "lofreq" || caller == "assembly" || caller == "input"

shell:
"""
Expand Down
32 changes: 32 additions & 0 deletions tests/scripts/test_14.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

##################################################################################
# Genome generate
##################################################################################
echo "Running CoVigator pipeline test 14"
source bin/assert.sh
output=tests/output/test14
nextflow main.nf -profile conda --name test_data \
--output $output \
--reference_generate \
--reference $(pwd)/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa \
--gff $(pwd)/reference/Sars_cov_2.ASM985889v3.101.gff3 \
--snpeff_organism Sars_cov_2

# Test reference genome related output
test -s $output/reference/sequences.fa.fai || { echo "Missing fasta index file!"; exit 1; }
test -s $output/reference/sequences.dict || { echo "Missing GATK dict file!"; exit 1; }

# Test bwa index files are present
test -s $output/reference/sequences.fa.0123 || { echo "Missing bwa 0123 index file!"; exit 1; }
test -s $output/reference/sequences.fa.amb || { echo "Missing bwa amb index file!"; exit 1; }
test -s $output/reference/sequences.fa.ann || { echo "Missing bwa ann index file!"; exit 1; }
test -s $output/reference/sequences.fa.bwt.2bit.64 || { echo "Missing bwa bwt.2bit.64 index file!"; exit 1; }
test -s $output/reference/sequences.fa.pac || { echo "Missing bwa pac index file!"; exit 1; }


# Test snpEff output
test -s $output/snpeff/snpEff.config || { echo "Missing snpEff config file!"; exit 1; }
test -s $output/snpeff/Sars_cov_2/snpEffectPredictor.bin || { echo "Missing snpEff predictor bin file!"; exit 1; }
test -s $output/snpeff/Sars_cov_2/sequences.fa || { echo "Missing snpEff reference genome!"; exit 1; }
test -s $output/snpeff/Sars_cov_2/genes.gff || { echo "Missing snpEff reference annotation!"; exit 1; }
22 changes: 22 additions & 0 deletions tests/scripts/test_15.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

##################################################################################
# Lineage only mode
##################################################################################

echo "Running CoVigator pipeline test 15"
source bin/assert.sh
output=tests/output/test15
nextflow main.nf -profile conda --name test_data \
--output $output \
--vcf $(pwd)/tests/test_data/test_data.assembly.lineage_mode.vcf.gz \
--lineage_mode \
--skip_normalization

test -s $output/test_data.input.fasta || { echo "Missing VCF2FASTA fasta file (lineage mode with vcf input)!"; exit 1; }
test -s $output/test_data.input.pangolin.csv || { echo "Missing pangolin output file (lineage mode with vcf input)!"; exit 1; }
assert_eq $(wc -l $output/test_data.input.pangolin.csv | cut -d ' ' -f1) 2 "Wrong number of pangolin results"




16 changes: 16 additions & 0 deletions tests/scripts/test_16.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

##################################################################################
# Lineage only mode
##################################################################################

echo "Running CoVigator pipeline test 16"
source bin/assert.sh
output=tests/output/test15
nextflow main.nf -profile conda --name test_data \
--output $output \
--fasta $(pwd)/tests/test_data/test_data.fasta \
--lineage_mode

test -s $output/test_data.assembly.pangolin.csv || { echo "Missing pangolin output file (lineage mode with vcf input)!"; exit 1; }
assert_eq $(wc -l $output/test_data.assembly.pangolin.csv | cut -d ' ' -f1) 2 "Wrong number of pangolin results"
Binary file not shown.

0 comments on commit 6a52c63

Please sign in to comment.