Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implemented lineage only mode #54

Merged
merged 10 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,7 @@ test:
bash tests/scripts/test_11.sh
bash tests/scripts/test_12.sh
bash tests/scripts/test_13.sh
bash tests/scripts/test_14.sh
bash tests/scripts/test_15.sh
bash tests/scripts/test_16.sh
bash tests/scripts/test_python_unit_tests.sh
235 changes: 138 additions & 97 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ params.skip_bcftools = false
params.skip_gatk = false
params.skip_pangolin = false
params.skip_normalization = false
params.lineage_mode = false

// references
params.reference = false
Expand All @@ -39,7 +40,7 @@ params.gff = false
//params.snpeff_config = false
params.snpeff_organism = false
params.primers = false
params.different_virus = false
params.reference_generate = false

params.output = "."
params.min_mapping_quality = 20
Expand Down Expand Up @@ -206,6 +207,11 @@ else if (params.input_vcfs_list != false || params.vcf != false) {
.set { input_vcfs }
}
}
else if(params.reference_generate) {
genome = params.reference
annotation = gff
annotation_name = snpeff_organism
}
else {
log.error "missing some input data"
exit 1
Expand Down Expand Up @@ -240,119 +246,154 @@ if (params.skip_bcftools && params.skip_gatk && params.skip_ivar && params.skip_
exit 1
}

// Lineage mode should only be used when running with assembly or VCF input
// For FASTQ input we can run pangolin as regular rule
if (params.lineage_mode && input_fastqs) {
johausmann marked this conversation as resolved.
Show resolved Hide resolved
log.error "lineage mode is only supported with fasta or VCF input"
exit 1
}

workflow {
if (params.reference) {
if (! skip_snpeff && gff) {
SNPEFF_DATABASE(reference, gff, snpeff_organism)
snpeff_data = SNPEFF_DATABASE.out.snpeff_data
snpeff_config = SNPEFF_DATABASE.out.snpeff_config
}

}
if (input_fastqs) {
if (params.reference) {
BWA_INDEX(reference)
reference = BWA_INDEX.out.reference
if (params.lineage_mode) {
log.info "Running lineage mode. In this mode only Pangolin is executed on either fasta or VCF"
if (input_fastas) {
PANGOLIN_LINEAGE(input_fastas)
}
if (library == "paired") {
READ_TRIMMING_PAIRED_END(input_fastqs)
ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference)
bam_files = ALIGNMENT_PAIRED_END.out
else if (input_vcfs) {
if (! params.skip_normalization) {
VARIANT_NORMALIZATION(input_vcfs, reference)
normalized_vcfs = VARIANT_NORMALIZATION.out
}
else {
normalized_vcfs = input_vcfs
}
VCF2FASTA(normalized_vcfs, reference)
PANGOLIN_LINEAGE(VCF2FASTA.out)
}
else {
READ_TRIMMING_SINGLE_END(input_fastqs)
ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference)
bam_files = ALIGNMENT_SINGLE_END.out
}
else if (params.reference_generate) {
log.info "Running genome/annotation generate mode. This step generates only bwa and snpEff databases..."
// Custom reference mode. This mode is added to prepare the reference just once when processing many samples
if (genome) {
BWA_INDEX(genome)
if (!skip_snpeff) {
SNPEFF_DATABASE(genome, gff, snpeff_organism)
}
}
BAM_PREPROCESSING(bam_files, reference)
preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams
}
else {
// Run genome and annotation steps for single sample
if (params.reference) {
if (! skip_snpeff && gff) {
SNPEFF_DATABASE(reference, gff, snpeff_organism)
snpeff_data = SNPEFF_DATABASE.out.snpeff_data
snpeff_config = SNPEFF_DATABASE.out.snpeff_config
}

if (primers) {
PRIMER_TRIMMING_IVAR(preprocessed_bams, primers)
preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam
}
COVERAGE_ANALYSIS(preprocessed_bams)

// variant calling
vcfs_to_normalize = null
if (!params.skip_bcftools) {
VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out)
if (input_fastqs) {
if (params.reference) {
BWA_INDEX(reference)
reference = BWA_INDEX.out.reference
}
if (library == "paired") {
READ_TRIMMING_PAIRED_END(input_fastqs)
ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference)
bam_files = ALIGNMENT_PAIRED_END.out
}
else {
READ_TRIMMING_SINGLE_END(input_fastqs)
ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference)
bam_files = ALIGNMENT_SINGLE_END.out
}
BAM_PREPROCESSING(bam_files, reference)
preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams

if (primers) {
PRIMER_TRIMMING_IVAR(preprocessed_bams, primers)
preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam
}
COVERAGE_ANALYSIS(preprocessed_bams)

// variant calling
vcfs_to_normalize = null
if (!params.skip_bcftools) {
VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out)
}
if (!params.skip_lofreq) {
VARIANT_CALLING_LOFREQ(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out)
}
if (!params.skip_gatk) {
VARIANT_CALLING_GATK(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out)
}
if (!params.skip_ivar && gff) {
VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff)
IVAR2VCF(VARIANT_CALLING_IVAR.out, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out)
}
}
if (!params.skip_lofreq) {
VARIANT_CALLING_LOFREQ(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out)
else if (input_fastas) {
if (!params.skip_pangolin) {
// pangolin from fasta
PANGOLIN_LINEAGE(input_fastas)
}
// assembly variant calling
VARIANT_CALLING_ASSEMBLY(input_fastas, reference)
vcfs_to_normalize = VARIANT_CALLING_ASSEMBLY.out
}
if (!params.skip_gatk) {
VARIANT_CALLING_GATK(preprocessed_bams, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out)
else if (input_vcfs) {
vcfs_to_normalize = input_vcfs
}
if (!params.skip_ivar && gff) {
VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff)
IVAR2VCF(VARIANT_CALLING_IVAR.out, reference)
vcfs_to_normalize = vcfs_to_normalize == null?
IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out)

if (! params.skip_normalization) {
VARIANT_NORMALIZATION(vcfs_to_normalize, reference)
normalized_vcfs = VARIANT_NORMALIZATION.out
}
}
else if (input_fastas) {
if (!params.skip_pangolin) {
// pangolin from fasta
PANGOLIN_LINEAGE(input_fastas)
else {
normalized_vcfs = vcfs_to_normalize
}

// assembly variant calling
VARIANT_CALLING_ASSEMBLY(input_fastas, reference)
vcfs_to_normalize = VARIANT_CALLING_ASSEMBLY.out
}
else if (input_vcfs) {
vcfs_to_normalize = input_vcfs
}

if (! params.skip_normalization) {
VARIANT_NORMALIZATION(vcfs_to_normalize, reference)
normalized_vcfs = VARIANT_NORMALIZATION.out
}
else {
normalized_vcfs = vcfs_to_normalize
}

if (input_fastqs || input_vcfs) {
// pangolin from VCF on the normalized VCFs
if (!params.skip_pangolin) {
VCF2FASTA(normalized_vcfs, reference)
PANGOLIN_LINEAGE(VCF2FASTA.out)
if (input_fastqs || input_vcfs) {
// pangolin from VCF on the normalized VCFs
if (!params.skip_pangolin) {
VCF2FASTA(normalized_vcfs, reference)
PANGOLIN_LINEAGE(VCF2FASTA.out)
}
}
}

if (! skip_sarscov2_annotations) {
// only optionally add SARS-CoV-2 specific annotations
VARIANT_SARSCOV2_ANNOTATION(normalized_vcfs)
normalized_vcfs = VARIANT_SARSCOV2_ANNOTATION.out.annotated_vcfs
}
if (! skip_sarscov2_annotations) {
// only optionally add SARS-CoV-2 specific annotations
VARIANT_SARSCOV2_ANNOTATION(normalized_vcfs)
normalized_vcfs = VARIANT_SARSCOV2_ANNOTATION.out.annotated_vcfs
}

if (preprocessed_bams) {
// we can only add technical annotations when we have the reads
VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0))
VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf)
normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated
}
if (preprocessed_bams) {
// we can only add technical annotations when we have the reads
VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0))
VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf)
normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated
}

// NOTE: phasing has to happen before SnpEff annotation for MNVs to be annotated correctly
if (gff) {
PHASING(normalized_vcfs, reference, gff)
normalized_vcfs = PHASING.out
}
// NOTE: phasing has to happen before SnpEff annotation for MNVs to be annotated correctly
if (gff) {
PHASING(normalized_vcfs, reference, gff)
normalized_vcfs = PHASING.out
}

if (! skip_snpeff) {
// only when configured we run SnpEff
VARIANT_ANNOTATION(normalized_vcfs, snpeff_data, snpeff_config, snpeff_organism)
normalized_vcfs = VARIANT_ANNOTATION.out.annotated_vcfs
}
else {
BGZIP(normalized_vcfs)
if (! skip_snpeff) {
// only when configured we run SnpEff
VARIANT_ANNOTATION(normalized_vcfs, snpeff_data, snpeff_config, snpeff_organism)
normalized_vcfs = VARIANT_ANNOTATION.out.annotated_vcfs
}
else {
BGZIP(normalized_vcfs)
}
}
}
5 changes: 5 additions & 0 deletions modules/00_prepare_annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ process BWA_INDEX {
path("reference/sequences.fa"), emit: reference
path("reference/sequences.fa.fai"), emit: fai
path("reference/sequences.dict"), emit: gatk_dict
path("reference/sequences.fa.0123")
path("reference/sequences.fa.amb")
path("reference/sequences.fa.ann")
path("reference/sequences.fa.bwt.2bit.64")
path("reference/sequences.fa.pac")

script:
memory = "${params.memory}".replaceAll(" ", "").toLowerCase()
Expand Down
3 changes: 2 additions & 1 deletion modules/07_lineage_annotation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ process PANGOLIN_LINEAGE {

when:
// only runs pangolin on LoFreq and the assembly results
caller == "lofreq" || caller == "assembly"
// JoHa: added input to run lineage mode for vcf input
caller == "lofreq" || caller == "assembly" || caller == "input"

shell:
"""
Expand Down
5 changes: 4 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ params.pfam_names_header = "$baseDir/reference/pfam_names.header.txt"
params.pfam_descriptions_header = "$baseDir/reference/pfam_descriptions.header.txt"

profiles {
conda { params.enable_conda = true }
conda {
params.enable_conda = true
conda.enabled = true
}
debug { process.beforeScript = 'echo $HOSTNAME' }
test {
params.cpus = 1
Expand Down
32 changes: 32 additions & 0 deletions tests/scripts/test_14.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

##################################################################################
# Genome generate
##################################################################################
echo "Running CoVigator pipeline test 14"
source bin/assert.sh
output=tests/output/test14
nextflow main.nf -profile conda --name test_data \
--output $output \
--reference_generate \
--reference $(pwd)/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa \
--gff $(pwd)/reference/Sars_cov_2.ASM985889v3.101.gff3 \
--snpeff_organism Sars_cov_2

# Test reference genome related output
test -s $output/reference/sequences.fa.fai || { echo "Missing fasta index file!"; exit 1; }
test -s $output/reference/sequences.dict || { echo "Missing GATK dict file!"; exit 1; }

# Test bwa index files are present
test -s $output/reference/sequences.fa.0123 || { echo "Missing bwa 0123 index file!"; exit 1; }
test -s $output/reference/sequences.fa.amb || { echo "Missing bwa amb index file!"; exit 1; }
test -s $output/reference/sequences.fa.ann || { echo "Missing bwa ann index file!"; exit 1; }
test -s $output/reference/sequences.fa.bwt.2bit.64 || { echo "Missing bwa bwt.2bit.64 index file!"; exit 1; }
test -s $output/reference/sequences.fa.pac || { echo "Missing bwa pac index file!"; exit 1; }


# Test snpEff output
test -s $output/snpeff/snpEff.config || { echo "Missing snpEff config file!"; exit 1; }
test -s $output/snpeff/Sars_cov_2/snpEffectPredictor.bin || { echo "Missing snpEff predictor bin file!"; exit 1; }
test -s $output/snpeff/Sars_cov_2/sequences.fa || { echo "Missing snpEff reference genome!"; exit 1; }
test -s $output/snpeff/Sars_cov_2/genes.gff || { echo "Missing snpEff reference annotation!"; exit 1; }
22 changes: 22 additions & 0 deletions tests/scripts/test_15.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

##################################################################################
# Lineage only mode
##################################################################################

echo "Running CoVigator pipeline test 15"
source bin/assert.sh
output=tests/output/test15
nextflow main.nf -profile conda --name test_data \
--output $output \
--vcf $(pwd)/tests/test_data/test_data.assembly.lineage_mode.vcf.gz \
--lineage_mode \
--skip_normalization

test -s $output/test_data.input.fasta || { echo "Missing VCF2FASTA fasta file (lineage mode with vcf input)!"; exit 1; }
test -s $output/test_data.input.pangolin.csv || { echo "Missing pangolin output file (lineage mode with vcf input)!"; exit 1; }
assert_eq $(wc -l $output/test_data.input.pangolin.csv | cut -d ' ' -f1) 2 "Wrong number of pangolin results"




16 changes: 16 additions & 0 deletions tests/scripts/test_16.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

##################################################################################
# Lineage only mode
##################################################################################

echo "Running CoVigator pipeline test 16"
source bin/assert.sh
output=tests/output/test15
nextflow main.nf -profile conda --name test_data \
--output $output \
--fasta $(pwd)/tests/test_data/test_data.fasta \
--lineage_mode

test -s $output/test_data.assembly.pangolin.csv || { echo "Missing pangolin output file (lineage mode with vcf input)!"; exit 1; }
assert_eq $(wc -l $output/test_data.assembly.pangolin.csv | cut -d ' ' -f1) 2 "Wrong number of pangolin results"
Binary file not shown.
Loading