diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4df3b5a51..5437003e0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,7 @@ jobs: - "-profile test,docker --mapping_tool bwamem --run_mapdamage_rescaling --run_pmd_filtering --run_trim_bam" - "-profile test,docker --mapping_tool bowtie2" - "-profile test,docker --skip_preprocessing" - - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd" + - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz'" - "-profile test_multiref,docker" ## TODO add damage manipulation here instead once it goes multiref steps: - name: Check out pipeline code diff --git a/CITATIONS.md b/CITATIONS.md index 94ffee430..63f3adda3 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -94,6 +94,10 @@ > Jun, G., Wing, M. K., Abecasis, G. R., & Kang, H. M. (2015). An efficient and scalable analysis framework for variant extraction and refinement from population-scale DNA sequence data. Genome Research, 25(6), 918–925. doi: [10.1101/gr.176552.114](https://doi.org/10.1101/gr.176552.114) +- [QualiMap](https://doi.org/10.1093/bioinformatics/btv566) + + > QualiMap Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. Download: http://qualimap.bioinfo.cipf.es/ + - [DamageProfiler](https://doi.org/10.1093/bioinformatics/btab190) > DamageProfiler Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). doi: [10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). Download: https://github.com/Integrative-Transcriptomics/DamageProfiler diff --git a/conf/modules.config b/conf/modules.config index 39722408f..904e63981 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -819,6 +819,15 @@ process { ] } + withName: "QUALIMAP_BAMQC" { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + publishDir = [ + path: { "${params.outdir}/mapstats/qualimap/${meta.reference}/${meta.sample_id}/}" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + // // DAMAGE CALCULATION // diff --git a/docs/output.md b/docs/output.md index 0be0f510d..d84c9ec02 100644 --- a/docs/output.md +++ b/docs/output.md @@ -379,6 +379,33 @@ These curves will be displayed in the pipeline run's MultiQC report, however you ### Mapping Statistics +#### QualiMap + +
+Output files + +- `qualimap/` + + - `/` + - `*.html`: in-depth report including percent coverage, depth coverage, GC content, etc. of mapped reads + - `genome_results.txt` + - `css/`: HTML CSS styling used for the report + - `images_qualimapReport/`: PNG version of images from the HTML report. + - `raw_data_qualimapReport/`: The raw data used to render the HTML report as TXT files. + +
+ +[QualiMap](http://qualimap.bioinfo.cipf.es/) +is a tool which provides statistics on the quality of the mapping of your reads to your reference genome. It allows you to assess how well covered your reference genome is by your data, both in 'fold' depth (average number of times a given base on the reference is covered by a read) and 'percentage' (the percentage of all bases on the reference genome that is covered at a given fold depth). These outputs allow you to make decision if you have enough quality data for downstream applications like genotyping, and how to adjust the parameters for those tools accordingly. + +> NB: Neither fold coverage nor percent coverage on its own is sufficient to assess whether you have a high quality mapping. Abnormally high fold coverages of a smaller region such as highly conserved genes or un-removed-adapter-containing reference genomes can artificially inflate the mean coverage, yet a high percent coverage is not useful if all bases of the genome are covered at just 1x coverage. + +**Note that many of the statistics from this module are displayed in the General Stats table, as they represent single values that are not plottable.** + +You will receive output for each sample. This means you will statistics of deduplicated values of all types of libraries combined in a single value (i.e. non-UDG treated, full-UDG, paired-end, single-end all together). + +> ⚠️ Warning: If your library has no reads mapping to the reference, this will result in an empty BAM file. Qualimap will therefore not produce any output even if a BAM exists! + #### Bedtools
diff --git a/modules.json b/modules.json index 13ff8adbc..25e1ecc9a 100644 --- a/modules.json +++ b/modules.json @@ -160,6 +160,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "qualimap/bamqc": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "samtools/faidx": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/nf-core/qualimap/bamqc/main.nf b/modules/nf-core/qualimap/bamqc/main.nf new file mode 100644 index 000000000..fef7307a0 --- /dev/null +++ b/modules/nf-core/qualimap/bamqc/main.nf @@ -0,0 +1,123 @@ +process QUALIMAP_BAMQC { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::qualimap=2.2.2d" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/qualimap:2.2.2d--1' : + 'biocontainers/qualimap:2.2.2d--1' }" + + input: + tuple val(meta), path(bam) + path gff + + output: + tuple val(meta), path("${prefix}"), emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + + def collect_pairs = meta.single_end ? '' : '--collect-overlap-pairs' + def memory = (task.memory.mega*0.8).intValue() + 'M' + def regions = gff ? "--gff $gff" : '' + + def strandedness = 'non-strand-specific' + if (meta.strandedness == 'forward') { + strandedness = 'strand-specific-forward' + } else if (meta.strandedness == 'reverse') { + strandedness = 'strand-specific-reverse' + } + """ + unset DISPLAY + mkdir -p tmp + export _JAVA_OPTIONS=-Djava.io.tmpdir=./tmp + qualimap \\ + --java-mem-size=$memory \\ + bamqc \\ + $args \\ + -bam $bam \\ + $regions \\ + -p $strandedness \\ + $collect_pairs \\ + -outdir $prefix \\ + -nt $task.cpus + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qualimap: \$(echo \$(qualimap 2>&1) | sed 's/^.*QualiMap v.//; s/Built.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + """ + mkdir -p $prefix/css + mkdir $prefix/images_qualimapReport + mkdir $prefix/raw_data_qualimapReport + cd $prefix/css + touch agogo.css + touch basic.css + touch bgtop.png + touch comment-close.png + touch doctools.js + touch down-pressed.png + touch jquery.js + touch plus.png + touch qualimap_logo_small.png + touch searchtools.js + touch up.png + touch websupport.js + touch ajax-loader.gif + touch bgfooter.png + touch comment-bright.png + touch comment.png + touch down.png + touch file.png + touch minus.png + touch pygments.css + touch report.css + touch underscore.js + touch up-pressed.png + cd ../images_qualimapReport/ + touch genome_coverage_0to50_histogram.png + touch genome_coverage_quotes.png + touch genome_insert_size_across_reference.png + touch genome_mapping_quality_histogram.png + touch genome_uniq_read_starts_histogram.png + touch genome_coverage_across_reference.png + touch genome_gc_content_per_window.png + touch genome_insert_size_histogram.png + touch genome_reads_clipping_profile.png + touch genome_coverage_histogram.png + touch genome_homopolymer_indels.png + touch genome_mapping_quality_across_reference.png + touch genome_reads_content_per_read_position.png + cd ../raw_data_qualimapReport + touch coverage_across_reference.txt + touch genome_fraction_coverage.txt + touch insert_size_histogram.txt + touch mapped_reads_nucleotide_content.txt + touch coverage_histogram.txt + touch homopolymer_indels.txt + touch mapped_reads_clipping_profile.txt + touch mapping_quality_across_reference.txt + touch duplication_rate_histogram.txt + touch insert_size_across_reference.txt + touch mapped_reads_gc-content_distribution.txt + touch mapping_quality_histogram.txt + cd ../ + touch genome_results.txt + touch qualimapReport.html + cd ../ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qualimap: \$(echo \$(qualimap 2>&1) | sed 's/^.*QualiMap v.//; s/Built.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/qualimap/bamqc/meta.yml b/modules/nf-core/qualimap/bamqc/meta.yml new file mode 100644 index 000000000..303532eb7 --- /dev/null +++ b/modules/nf-core/qualimap/bamqc/meta.yml @@ -0,0 +1,47 @@ +name: qualimap_bamqc +description: Evaluate alignment data +keywords: + - quality control + - qc + - bam +tools: + - qualimap: + description: | + Qualimap 2 is a platform-independent application written in + Java and R that provides both a Graphical User Interface and + a command-line interface to facilitate the quality control of + alignment sequencing data and its derivatives like feature counts. + homepage: http://qualimap.bioinfo.cipf.es/ + documentation: http://qualimap.conesalab.org/doc_html/index.html + doi: 10.1093/bioinformatics/bts503 + licence: ["GPL-2.0-only"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - gff: + type: file + description: Feature file with regions of interest + pattern: "*.{gff,gtf,bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - results: + type: dir + description: Qualimap results dir + pattern: "*/*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@phue" diff --git a/nextflow.config b/nextflow.config index 31845eba4..a20ae5a5e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -141,6 +141,10 @@ params { skip_deduplication = false deduplication_tool = 'markduplicates' + // Qualimap + skip_qualimap = false + snpcapture_bed = null + // Contamination estimation run_contamination_estimation_angsd = false contamination_estimation_angsd_chrom_name = 'X' diff --git a/nextflow_schema.json b/nextflow_schema.json index ae00f094a..904c7ef61 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -958,6 +958,14 @@ "description": "Turns on defects mode to extrapolate without testing for defects (lc_extrap mode only).", "help_text": "Activates defects mode of `lc_extrap`, which does the extrapolation without testing for defects.\n\n> Modifies preseq lc_extrap parameter: `-D`", "fa_icon": "fab fa-creative-commons-sampling-plus" + }, + "skip_qualimap": { + "type": "boolean", + "default": "false" + }, + "snpcapture_bed": { + "type": "string", + "description": "Path to snp capture in BED format. Provided file can also be gzipped." } }, "fa_icon": "fas fa-search" diff --git a/workflows/eager.nf b/workflows/eager.nf index 4da7fd1d7..8024f354b 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -98,8 +98,10 @@ include { MTNUCRATIO } from '../modules/n include { HOST_REMOVAL } from '../modules/local/host_removal' include { ENDORSPY } from '../modules/nf-core/endorspy/main' include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTATS_BAM_INPUT } from '../modules/nf-core/samtools/flagstat/main' -include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_DEPTH ; BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_BREADTH } from '../modules/nf-core/bedtools/coverage/main' -include { SAMTOOLS_VIEW_GENOME } from '../modules/local/samtools_view_genome.nf' +include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_DEPTH ; BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_BREADTH } from '../modules/nf-core/bedtools/coverage/main' +include { SAMTOOLS_VIEW_GENOME } from '../modules/local/samtools_view_genome.nf' +include { QUALIMAP_BAMQC } from '../modules/nf-core/qualimap/bamqc/main' +include { GUNZIP as GUNZIP_SNPBED } from '../modules/nf-core/gunzip/main.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,6 +138,26 @@ workflow EAGER { if ( params.preprocessing_tool == 'fastp' && !adapterlist.extension.matches(".*(fa|fasta|fna|fas)") ) error "[nf-core/eager] ERROR: fastp adapter list requires a `.fasta` format and extension (or fa, fas, fna). Check input: --preprocessing_adapterlist ${params.preprocessing_adapterlist}" } + // QualiMap + if ( params.snpcapture_bed ) { + ch_snpcapture_bed_gunzip = Channel.fromPath( params.snpcapture_bed, checkIfExists: true ) + .collect() + .map { + file -> + meta = file.simpleName + [meta,file] + } + .branch { + meta, bed -> + forgunzip: bed[0].extension == "gz" + skip: true + } + ch_snpcapture_bed = GUNZIP_SNPBED(ch_snpcapture_bed_gunzip.forgunzip).gunzip.mix(ch_snpcapture_bed_gunzip.skip).map{it[1]} + + } else { + ch_snpcapture_bed = [] + } + // Contamination estimation hapmap_file = file(params.contamination_estimation_angsd_hapmap, checkIfExists:true) @@ -147,11 +169,10 @@ workflow EAGER { file(params.input) ) ch_versions = ch_versions.mix( INPUT_CHECK.out.versions ) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ // ! There is currently no tooling to help you write a sample sheet schema - + // // SUBWORKFLOW: Indexing of reference files // @@ -270,6 +291,20 @@ workflow EAGER { ch_dedupped_flagstat = Channel.empty() } + // + // MODULE QUALIMAP + // + + if ( !params.skip_qualimap ) { + ch_qualimap_input = ch_dedupped_bams + .map { + meta, bam, bai -> + [ meta, bam ] + } + QUALIMAP_BAMQC(ch_qualimap_input, ch_snpcapture_bed) + ch_versions = ch_versions.mix( QUALIMAP_BAMQC.out.versions ) + } + // // MODULE: remove reads mapping to the host from the raw fastq // @@ -395,9 +430,9 @@ workflow EAGER { // - // MODULE: Bedtools coverage + // MODULE: Bedtools coverage // - + if ( params.run_bedtools_coverage ) { ch_anno_for_bedtools = Channel.fromPath(params.mapstats_bedtools_featurefile, checkIfExists: true).collect() @@ -412,7 +447,7 @@ workflow EAGER { SAMTOOLS_VIEW_GENOME(ch_dedupped_bams) ch_genome_for_bedtools = SAMTOOLS_VIEW_GENOME.out.genome - + BEDTOOLS_COVERAGE_BREADTH(ch_dedupped_for_bedtools, ch_genome_for_bedtools) BEDTOOLS_COVERAGE_DEPTH(ch_dedupped_for_bedtools, ch_genome_for_bedtools) @@ -420,7 +455,6 @@ workflow EAGER { ch_versions = ch_versions.mix( BEDTOOLS_COVERAGE_BREADTH.out.versions ) ch_versions = ch_versions.mix( BEDTOOLS_COVERAGE_DEPTH.out.versions ) } - // // SUBWORKFLOW: Calculate Damage @@ -491,6 +525,10 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) //ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) // Replaced with custom mixing + if ( !params.skip_qualimap ) { + ch_multiqc_files = ch_multiqc_files.mix( QUALIMAP_BAMQC.out.results.collect{it[1]}.ifEmpty([]) ) + } + MULTIQC ( ch_multiqc_files.collect(), ch_multiqc_config.toList(),