diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2e74c7a3..002b66dc7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ jobs: - "-profile test,docker --mapping_tool bowtie2 --damagecalculation_tool mapdamage --damagecalculation_mapdamage_downsample 100 --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw'" - "-profile test,docker --skip_preprocessing" - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw'" - - "-profile test_humanbam,docker --run_sexdeterrmine" + - "-profile test_humanbam,docker --run_sexdeterrmine --run_genotyping --genotyping_tool 'angsd' --genotyping_source 'raw'" - "-profile test_multiref,docker" ## TODO add damage manipulation here instead once it goes multiref steps: - name: Check out pipeline code diff --git a/conf/modules.config b/conf/modules.config index efdc216a4..479d699e3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1152,4 +1152,24 @@ process { pattern: '*.txt' ] } + + withName: ANGSD_GL { + tag = { "${meta.reference}" } + ext.args = { + gl_model = params.genotyping_angsd_glmodel == 'samtools' ? 1 : params.genotyping_angsd_glmodel == 'gatk' ? 2 : params.genotyping_angsd_glmodel == 'soapsnp' ? 3 : 4 + gl_format = params.genotyping_angsd_glformat == 'binary' ? 1 : params.genotyping_angsd_glformat == 'beagle_binary' ? 2 : params.genotyping_angsd_glformat == 'binary_three' ? 3 : 4 + [ + ( gl_format == 2 || gl_format == 3 ) ? '-doMajorMinor 1': '', + "-GL ${gl_model}", + "-doGlf ${gl_format}", + ].join(' ').trim() + } + ext.prefix = { "angsd_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.{glf,beagle}.gz' + ] + } } diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 41aa2defa..9ecfc2568 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -877,3 +877,29 @@ nextflow run main.nf -profile test_humanbam,docker --outdir ./results -w work/ - ## Specifically, no geno/snp/ind for the reference that has no bed/snp file (Mammoth). Only data for "human" reference. nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels ``` + +## ANGSD + +```bash +## ANGSD on raw reads. Default parameters. (--genotyping_angsd_glmodel 'samtools' --genotyping_angsd_glformat 'binary' ) +## Expect: One `glf.gz` file in binary format per reference. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'angsd' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## ANGSD on raw reads. gatk model, beagle binary format. +## Expect: One `beagle.gz` file in beagle format per reference. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'angsd' --genotyping_angsd_glmodel 'gatk' --genotyping_angsd_glformat 'beagle_binary' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## ANGSD on raw reads. soapSNP model, binary_three format. +## Expect: One `glf.gz` file in binary_three format per reference. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'angsd' --genotyping_angsd_glmodel 'soapsnp' --genotyping_angsd_glformat 'binary_three' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## ANGSD on raw reads. syk model, text format. +## Expect: One `glf.gz` file in binary_three format per reference. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'angsd' --genotyping_angsd_glmodel 'syk' --genotyping_angsd_glformat 'text' --genotyping_source 'raw' -ansi-log false -dump-channels +``` diff --git a/docs/output.md b/docs/output.md index 691e4db12..9237f0c55 100644 --- a/docs/output.md +++ b/docs/output.md @@ -621,3 +621,16 @@ When using pileupCaller for genotyping, single-stranded and double-stranded libr [FreeBayes](https://github.com/freebayes/freebayes) is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs (single-nucleotide polymorphisms), indels (insertions and deletions), MNPs (multi-nucleotide polymorphisms), and complex events (composite insertion and substitution events) smaller than the length of a short-read sequencing alignment. It calls variants based on the literal sequences of reads aligned to a particular target, not their precise alignment. This model is a straightforward generalization of previous ones (e.g. PolyBayes, samtools, GATK) which detect or report variants based on alignments. This method avoids one of the core problems with alignment-based variant detection - that identical sequences may have multiple possible alignments. The output provided is a bgzipped VCF file for containing the genotype calls of each sample, it's index file, as well as the statistics of the VCF file generated by `bcftools stats`. + +#### ANGSD + +
+Output files + +- `genotyping/` + + - `*.{glf,beagle}.gz`: Genotype likelihood file, containing likelihoods across all samples per reference. + +
+ +[ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) is a software for analyzing next generation sequencing data. It can estimate genotype likelihoods and allele frequencies from next-generation sequencing data. The output provided is a bgzipped genotype likelihood file, containing likelihoods across all samples per reference. Users can specify the model used for genotype likelihood estimation, as well as the output format. For more information on the available options, see the [ANGSD](https://www.popgen.dk/angsd/index.php/Genotype_Likelihoods). diff --git a/modules.json b/modules.json index 1b9488483..e6100c773 100644 --- a/modules.json +++ b/modules.json @@ -20,6 +20,11 @@ "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["bam_docounts_contamination_angsd"] }, + "angsd/gl": { + "branch": "master", + "git_sha": "c22aa6082716bd372cbb8f7ccf7c83220f180864", + "installed_by": ["modules"] + }, "bamutil/trimbam": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", diff --git a/modules/nf-core/angsd/gl/environment.yml b/modules/nf-core/angsd/gl/environment.yml new file mode 100644 index 000000000..aac5db41f --- /dev/null +++ b/modules/nf-core/angsd/gl/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "angsd_gl" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::angsd=0.940" + - "bioconda::htslib=1.17" diff --git a/modules/nf-core/angsd/gl/main.nf b/modules/nf-core/angsd/gl/main.nf new file mode 100644 index 000000000..9a098b7e4 --- /dev/null +++ b/modules/nf-core/angsd/gl/main.nf @@ -0,0 +1,112 @@ +process ANGSD_GL { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/angsd:0.940--hce60e53_2': + 'biocontainers/angsd:0.940--hce60e53_2' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) //Optionally + tuple val(meta3), path(error_file) //Optionally. Used for SYK model only. + + output: + tuple val(meta), path("*.{glf,beagle}.gz"), emit: genotype_likelihood + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def GL_model = args.contains("-GL 1") ? 1 : args.contains("-GL 2") ? 2 : args.contains("-GL 3") ? 3 : args.contains("-GL 4") ? 4 : 0 + def ref = fasta ? "-ref ${fasta}" : '' // Use reference fasta if provided + def errors = error_file ? "-errors ${error_file}" : '' // Only applies to SYK model + def output_mode = args.contains("-doGlf") ? "" : '-doGlf 1' // Default to outputting binary glf (10 log likelihoods) if not set in args + // NOTE: GL is specified within args, so is not provided as a separate argument + + if (GL_model != 3 && GL_model != 4) { + """ + ls -1 *.bam > bamlist.txt + + angsd \\ + -nThreads ${task.cpus} \\ + -bam bamlist.txt \\ + $args \\ + $ref \\ + $output_mode \\ + -out ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + angsd: \$(echo \$(angsd 2>&1) | grep 'angsd version' | head -n 1 | sed 's/.*version: //g;s/ .*//g') + END_VERSIONS + """ + } else if (GL_model == 3) { + // No args for this part. + // GL is hardcoded to 3 here to avoid passing all other arguments to the calibration step + """ + ls -1 *.bam > bamlist.txt + + ## SOAPsnp model + ## First get the calibration matrix. minQ MUST be 0 for this step. Will create the directory angsd_tmpdir/ with the required files for the next step. + angsd \\ + -nThreads ${task.cpus} \\ + -bam bamlist.txt \\ + -minQ 0 \\ + -GL 3 \\ + $ref \\ + -out ${prefix} + + ## Then run the model + angsd \\ + -nThreads ${task.cpus} \\ + -bam bamlist.txt \\ + $args \\ + $ref \\ + $output_mode \\ + -out ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + angsd: \$(echo \$(angsd 2>&1) | grep 'angsd version' | head -n 1 | sed 's/.*version: //g;s/ .*//g') + END_VERSIONS + """ + } else if (GL_model == 4) { + """ + ls -1 *.bam > bamlist.txt + + ## SYK model + angsd \\ + -nThreads ${task.cpus} \\ + -bam bamlist.txt \\ + $args \\ + $ref \\ + $output_mode \\ + $errors \\ + -doCounts 1 \\ + -out ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + angsd: \$(echo \$(angsd 2>&1) | grep 'angsd version' | head -n 1 | sed 's/.*version: //g;s/ .*//g') + END_VERSIONS + """ + } + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.glf + gzip ${prefix}.glf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + angsd: \$(echo \$(angsd 2>&1) | grep 'angsd version' | head -n 1 | sed 's/.*version: //g;s/ .*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/angsd/gl/meta.yml b/modules/nf-core/angsd/gl/meta.yml new file mode 100644 index 000000000..168f615d5 --- /dev/null +++ b/modules/nf-core/angsd/gl/meta.yml @@ -0,0 +1,65 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "angsd_gl" +description: Calculated genotype likelihoods from BAM files. +keywords: + - angsd + - genotype likelihood + - genomics +tools: + - "angsd": + description: "ANGSD: Analysis of next generation Sequencing Data" + homepage: "http://www.popgen.dk/angsd/" + documentation: "http://www.popgen.dk/angsd/" + tool_dev_url: "https://github.com/ANGSD/angsd" + doi: "10.1186/s12859-014-0356-4" + licence: ["GPL v3", "MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: A list of BAM or CRAM files + pattern: "*.{bam,cram}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: A reference genome in FASTA format + pattern: "*.fasta" + - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - error_file: + type: file + description: A file containing information about type specific errors. + pattern: "*.error.chunkunordered" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - genotype_likelihood: + type: file + description: File containing genotype likelihoods per sample + pattern: "*.{glf,beagle}.gz" + +authors: + - "@apeltzer" + - "@TCLamnidis" +maintainers: + - "@apeltzer" + - "@TCLamnidis" diff --git a/nextflow.config b/nextflow.config index 5865a2943..a3e6454fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -227,6 +227,8 @@ params { genotyping_gatk_hc_emitrefconf = 'GVCF' genotyping_freebayes_min_alternate_count = 1 genotyping_freebayes_skip_coverage = 0 + genotyping_angsd_glmodel = 'samtools' + genotyping_angsd_glformat = 'binary' } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index 11348bfd2..edda1ae6e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1025,6 +1025,22 @@ "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified.", "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than the specified value. Setting to 0 (the default) deactivates this behaviour.\n\n> Modifies freebayes parameter: `-g`", "fa_icon": "fab fa-think-peaks" + }, + "genotyping_angsd_glmodel": { + "type": "string", + "default": "samtools", + "fa_icon": "fas fa-project-diagram", + "description": "Specify which ANGSD genotyping likelihood model to use.", + "help_text": "Specify which genotype likelihood model to use.\n\n> Modifies angsd parameter: `-GL`", + "enum": ["samtools", "gatk", "soapsnp", "syk"] + }, + "genotyping_angsd_glformat": { + "type": "string", + "default": "binary", + "fa_icon": "fas fa-text-height", + "description": "Specify the formatting of the output VCF for ANGSD genotype likelihood results.", + "help_text": "Specifies what type of genotyping likelihood file format will be output.\n\nThe options refer to the following descriptions respectively:\n\n- `binary`: binary output of all 10 log genotype likelihood\n- `beagle_binary`: beagle likelihood file\n- `binary_three`: binary 3 times likelihood\n- `text`: text output of all 10 log genotype likelihoods.\n\nSee the [ANGSD documentation](http://www.popgen.dk/angsd/) for more information on which to select for your downstream applications.\n\n> Modifies angsd parameter: `-doGlf`", + "enum": ["binary", "beagle_binary", "binary_three", "text"] } }, "fa_icon": "fas fa-sliders-h", diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 58bebf64c..a674bf85f 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -11,6 +11,7 @@ include { GATK_INDELREALIGNER } from '../../module include { GATK_UNIFIEDGENOTYPER } from '../../modules/nf-core/gatk/unifiedgenotyper/main' include { GATK4_HAPLOTYPECALLER } from '../../modules/nf-core/gatk4/haplotypecaller/main' include { FREEBAYES } from '../../modules/nf-core/freebayes/main' +include { ANGSD_GL } from '../../modules/nf-core/angsd/gl/main' include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_UG } from '../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_FREEBAYES } from '../../modules/nf-core/bcftools/index/main' @@ -28,6 +29,7 @@ workflow GENOTYPE { ch_multiqc_files = Channel.empty() ch_pileupcaller_genotypes = Channel.empty() ch_eigenstrat_coverage_stats = Channel.empty() + ch_angsd_genotype_likelihoods = Channel.empty() ch_genotypes_vcf = Channel.empty() ch_bcftools_stats = Channel.empty() @@ -357,7 +359,62 @@ workflow GENOTYPE { } if ( params.genotyping_tool == 'angsd' ) { - log.warn("[nf-core/eager] Genotyping with ANGSD is not yet implemented .") + ch_bams_for_multimap = ch_bam_bai + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .groupTuple() + .map { + combo_meta, metas, bams, bais -> + def new_map = [:] + def ids = metas.collect { meta -> meta.sample_id } + def strandedness = metas.collect { meta -> meta.strandedness } + def single_ends = metas.collect { meta -> meta.single_end } + def reference = combo_meta.reference + new_meta = [ sample_id: ids, strandedness: strandedness, single_end: single_ends, reference: reference ] + + [ combo_meta, new_meta, bams, bais ] // Drop bais + } // Collect all IDs into a list in meta.sample_id. + + ch_fasta_for_multimap = ch_fasta_plus + // Because dbsnp is optional, the channel can be [[],[]]. remainder:true will output both the empty list and the fasta_plus channel with an added 'null'. + .join( ch_dbsnp, remainder:true ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + // Also filter out the empty list dbsnp (meta == []) + .filter { it[0] != [] } + // convert added null dbsnp into an empty list + .map { + ref_meta, fasta, fai, dict, dbsnp -> + def final_dbsnp = dbsnp != null ? dbsnp : [] + [ ref_meta, fasta, fai, dict, final_dbsnp ] + } + .map { + // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute + addNewMetaFromAttributes( it, "id" , "reference" , false ) + } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] + + ch_input_for_angsd = ch_bams_for_multimap + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> + bam: [ meta, bam ] + fasta: [ ref_meta, fasta ] + } + + ANGSD_GL( + ch_input_for_angsd.bam, + ch_input_for_angsd.fasta, + [[], []], // No errors file + ) + ch_angsd_genotype_likelihoods = ANGSD_GL.out.genotype_likelihood + ch_versions = ch_versions.mix( ANGSD_GL.out.versions.first() ) + + // Add genotyper info to the meta + ch_angsd_genotype_likelihoods = ch_angsd_genotype_likelihoods + .map { + meta, glf -> + [ meta + [ genotyper: "angsd" ], glf ] + } } // Run BCFTOOLS_STATS on output from GATK UG, HC and Freebayes @@ -387,10 +444,11 @@ workflow GENOTYPE { } emit: - eigenstrat = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] - vcf = ch_genotypes_vcf // [ [ meta ], vcf ] ] - vcf_stats = ch_bcftools_stats // [ [ meta ], stats ] - eigenstrat_coverage = ch_eigenstrat_coverage_stats // [ [ meta ], stats ] + eigenstrat = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] + vcf = ch_genotypes_vcf // [ [ meta ], vcf ] ] + vcf_stats = ch_bcftools_stats // [ [ meta ], stats ] + glf = ch_angsd_genotype_likelihoods // [ [ meta ], glf ] + eigenstrat_coverage = ch_eigenstrat_coverage_stats // [ [ meta ], stats ] versions = ch_versions mqc = ch_multiqc_files }