From b976ac7f8e8368282242548a8da4aff0216985dd Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 20 Mar 2024 13:32:18 +0100 Subject: [PATCH 01/12] port over stuff from dsl2-bam-to-fq --- docs/usage.md | 15 ++++++++++++++- nextflow.config | 3 +++ nextflow_schema.json | 6 ++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index be8646cc3..58da8c26f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -27,6 +27,18 @@ CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz ``` +### Supplying BAM input + +It is possible to also supply BAM files as input to nf-core/eager. This can allow you to skip earlier steps of the pipeline (preprocessing and mapping) when desired - e.g. when re-processing public data. You can also convert input BAM files back to FASTQ files to re-undergo preprocessing and mapping. This may be desired when you want to standardise the mapping parameters between your own and previously published data. + +You will still need to fill the `pairment` column in the input TSV sheet for the BAM files. If you do not convert the BAM files back to FASTQ, you must specify the column as `single`. If you do convert them, you must specify the type of reads the BAM file contains, i.e.: + +- If the mapped reads in the BAM file are single-end, then specify `single` +- If the mapped reads in the BAM file are paired-end _but merged pairs_ (i.e. overlapping pairs collapsed to a single read), then you must also supply `single` +- If the mapped reads in the BAM file are paired-end and are _not_ merged (i.e., paired-end mapping was originally performed), then you must specify `paired` + +Note that if you do not request merging of the paired-end FASTQs converted from BAMs (i.e., you request paired-end mapping), only forward and reverse pairs will be used - singletons in the BAMs will be discarded! 
+ ### Full samplesheet The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. @@ -261,6 +273,7 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): -```bash +``` +bash NXF_OPTS='-Xms1g -Xmx4g' ``` diff --git a/nextflow.config b/nextflow.config index 5865a2943..039659fd1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,9 @@ params { // Input options input = null + // Input BAM conversion + convert_inputbam = false + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' diff --git a/nextflow_schema.json b/nextflow_schema.json index 11348bfd2..228055bf7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -23,6 +23,12 @@ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/eager/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, + "convert_inputbam": { + "type": "boolean", + "default": false, + "description": "Specify to convert input BAM files back to FASTQ for remapping", + "help_text": "This parameter tells the pipeline to convert the BAM files listed in the `--input` TSV or CSV sheet back to FASTQ format to allow re-preprocessing and mapping\n\nCan be useful when you want to ensure consistent mapping parameters across all libraries when incorporating public data, however be careful of biases that may come from re-processing again (the BAM files may already be clipped, or only mapped reads with different settings are included so you may not have all reads from the original publication)." + }, "outdir": { "type": "string", "format": "directory-path", From 8cb6674a35994c726a5f3cb0eaedd01eda40ef06 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Apr 2024 11:10:28 +0200 Subject: [PATCH 02/12] install samtools/collatefastq --- modules.json | 5 ++ .../samtools/collatefastq/environment.yml | 8 ++ modules/nf-core/samtools/collatefastq/main.nf | 55 ++++++++++++++ .../nf-core/samtools/collatefastq/meta.yml | 76 +++++++++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 modules/nf-core/samtools/collatefastq/environment.yml create mode 100644 modules/nf-core/samtools/collatefastq/main.nf create mode 100644 modules/nf-core/samtools/collatefastq/meta.yml diff --git a/modules.json b/modules.json index 1b9488483..0d1073ca2 100644 --- a/modules.json +++ b/modules.json @@ -205,6 +205,11 @@ "git_sha": "6b0e4fe14ca1b12e131f64608f0bbaf36fd11451", "installed_by": ["modules"] }, + "samtools/collatefastq": { + "branch": "master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["modules"] + }, "samtools/depth": { "branch": "master", "git_sha": "a1ffbc1fd87bd5a829e956cc26ec9cc53af3e817", diff --git 
a/modules/nf-core/samtools/collatefastq/environment.yml b/modules/nf-core/samtools/collatefastq/environment.yml new file mode 100644 index 000000000..d1ce1a801 --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/environment.yml @@ -0,0 +1,8 @@ +name: samtools_collatefastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/collatefastq/main.nf b/modules/nf-core/samtools/collatefastq/main.nf new file mode 100644 index 000000000..1e431c6e5 --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/main.nf @@ -0,0 +1,55 @@ +process SAMTOOLS_COLLATEFASTQ { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fq.gz") , optional:true, emit: fastq_interleaved + tuple val(meta), path("*_other.fq.gz") , emit: fastq_other + tuple val(meta), path("*_singleton.fq.gz") , optional:true, emit: fastq_singleton + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def output = (interleave && ! meta.single_end) ? "> ${prefix}_interleaved.fq.gz" : + meta.single_end ? 
"-1 ${prefix}_1.fq.gz -s ${prefix}_singleton.fq.gz" : + "-1 ${prefix}_1.fq.gz -2 ${prefix}_2.fq.gz -s ${prefix}_singleton.fq.gz" + + """ + samtools collate \\ + $args \\ + --threads $task.cpus \\ + ${reference} \\ + -O \\ + $input \\ + . | + + samtools fastq \\ + $args2 \\ + --threads $task.cpus \\ + ${reference} \\ + -0 ${prefix}_other.fq.gz \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/collatefastq/meta.yml b/modules/nf-core/samtools/collatefastq/meta.yml new file mode 100644 index 000000000..898cdbdad --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/meta.yml @@ -0,0 +1,76 @@ +name: samtools_collatefastq +description: | + The module uses collate and then fastq methods from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + documentation: http://www.htslib.org/doc/1.1/samtools.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - interleave: + type: boolean + description: | + If true, the output is a single interleaved paired-end FASTQ + If false, the output split paired-end FASTQ + default: false +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fastq: + type: file + description: | + R1 and R2 FASTQ files + pattern: "*_{1,2}.fq.gz" + - fastq_interleaved: + type: file + description: | + Interleaved paired end FASTQ files + pattern: "*_interleaved.fq.gz" + - fastq_other: + type: file + description: | + FASTQ files with reads where the READ1 and READ2 FLAG bits set are either both set or both unset. + pattern: "*_other.fq.gz" + - fastq_singleton: + type: file + description: | + FASTQ files with singleton reads. + pattern: "*_singleton.fq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@lescai" + - "@maxulysse" + - "@matthdsm" +maintainers: + - "@lescai" + - "@maxulysse" + - "@matthdsm" From c0fcaeae26fcd2c705179af922381148960bab10 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Apr 2024 11:11:04 +0200 Subject: [PATCH 03/12] add input BAM conversion to FASTQ --- conf/modules.config | 19 ++++++++++ workflows/eager.nf | 89 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 88 insertions(+), 20 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index efdc216a4..5c5f2f3e0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,6 +34,25 @@ process { ] } + // + // CONVERT INPUT BAM + // + withName: SAMTOOLS_CONVERT_INPUT_BAM { + tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" } + publishDir = [ + enabled: false + ] + } + + withName: CAT_FASTQ_CONVERTED_BAM { + tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" } + publishDir = [ + enabled: false + ] + } + // // READ PREPROCESSING // diff --git a/workflows/eager.nf b/workflows/eager.nf index d2e25ce20..65a4d7a28 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -44,21 +44,23 @@ include { GENOTYPE } from 
'../subworkflows/local/genotype' // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAM_INPUT } from '../modules/nf-core/samtools/index/main' -include { PRESEQ_CCURVE } from '../modules/nf-core/preseq/ccurve/main' -include { PRESEQ_LCEXTRAP } from '../modules/nf-core/preseq/lcextrap/main' -include { FALCO } from '../modules/nf-core/falco/main' -include { MTNUCRATIO } from '../modules/nf-core/mtnucratio/main' -include { HOST_REMOVAL } from '../modules/local/host_removal' -include { ENDORSPY } from '../modules/nf-core/endorspy/main' -include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTATS_BAM_INPUT } from '../modules/nf-core/samtools/flagstat/main' -include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_DEPTH } from '../modules/nf-core/bedtools/coverage/main' -include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_BREADTH } from '../modules/nf-core/bedtools/coverage/main' -include { SAMTOOLS_VIEW_GENOME } from '../modules/local/samtools_view_genome.nf' -include { QUALIMAP_BAMQC as QUALIMAP_BAMQC_NOBED } from '../modules/nf-core/qualimap/bamqc/main' -include { QUALIMAP_BAMQC as QUALIMAP_BAMQC_WITHBED } from '../modules/nf-core/qualimap/bamqc/main' +include { FASTQC } from '../modules/nf-core/fastqc/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { SAMTOOLS_COLLATEFASTQ as SAMTOOLS_CONVERT_INPUT_BAM } from '../modules/nf-core/samtools/collatefastq/main' +include { CAT_FASTQ as CAT_FASTQ_CONVERTED_BAM } from '../modules/nf-core/cat/fastq/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAM_INPUT } from '../modules/nf-core/samtools/index/main' +include { PRESEQ_CCURVE } from '../modules/nf-core/preseq/ccurve/main' +include { PRESEQ_LCEXTRAP } from '../modules/nf-core/preseq/lcextrap/main' +include { FALCO } from '../modules/nf-core/falco/main' +include { MTNUCRATIO } from 
'../modules/nf-core/mtnucratio/main' +include { HOST_REMOVAL } from '../modules/local/host_removal' +include { ENDORSPY } from '../modules/nf-core/endorspy/main' +include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTATS_BAM_INPUT } from '../modules/nf-core/samtools/flagstat/main' +include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_DEPTH } from '../modules/nf-core/bedtools/coverage/main' +include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_BREADTH } from '../modules/nf-core/bedtools/coverage/main' +include { SAMTOOLS_VIEW_GENOME } from '../modules/local/samtools_view_genome.nf' +include { QUALIMAP_BAMQC as QUALIMAP_BAMQC_NOBED } from '../modules/nf-core/qualimap/bamqc/main' +include { QUALIMAP_BAMQC as QUALIMAP_BAMQC_WITHBED } from '../modules/nf-core/qualimap/bamqc/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -94,6 +96,53 @@ workflow EAGER { if ( params.preprocessing_tool == 'fastp' && !adapterlist.extension.matches(".*(fa|fasta|fna|fas)") ) error "[nf-core/eager] ERROR: fastp adapter list requires a `.fasta` format and extension (or fa, fas, fna). Check input: --preprocessing_adapterlist ${params.preprocessing_adapterlist}" } + // + // MODULE: Convert input BAMs back to FastQ + // + + if (params.convert_inputbam) { + // Convert input BAMs back to FastQ with non-interleaved output. 
+ SAMTOOLS_CONVERT_INPUT_BAM ( ch_samplesheet_bams, [ [], [] ], false ) + + // if BAM is single-end, pull R1 output as well as 'other' output and merge (in case collapsed reads have their R1 and R2 flags both set to 0 or 1) + ch_single_end_reads = SAMTOOLS_CONVERT_INPUT_BAM.out.fastq + .filter { + meta, reads -> + meta.single_end + } + .join(SAMTOOLS_CONVERT_INPUT_BAM.out.fastq_other) + .map { + meta, read1, fastq_other -> + [meta, [read1, fastq_other] ] + } + .dump(tag: "converted_se_and_other", pretty: true) + + //if BAM is paired-end, pull R1 and R2 outputs, discarding 'other' output and singletons + ch_paired_end_reads = SAMTOOLS_CONVERT_INPUT_BAM.out.fastq + .filter { + meta, reads -> + ! meta.single_end + } + .dump(tag: "pe-bam", pretty: true) + + // Merge the R1 and other reads for single-end BAMs + CAT_FASTQ_CONVERTED_BAM( ch_single_end_reads ) + ch_fastqs_from_converted_bams = CAT_FASTQ_CONVERTED_BAM.out.reads + .mix(ch_paired_end_reads) + // drop reference and id_index from meta + .map { + meta, reads -> + [ meta - meta.subMap('reference', 'id_index'), reads ] + } + + // Mix the converted fastqs with the original fastqs + ch_fastqs_for_preprocessing = ch_fastqs_from_converted_bams + .mix( ch_samplesheet_fastqs ) + } else { + // If BAM conversion is not activated, just use the original fastqs + ch_fastqs_for_preprocessing = ch_samplesheet_fastqs + } + // // SUBWORKFLOW: Indexing of reference files // @@ -106,11 +155,11 @@ workflow EAGER { // if ( params.sequencing_qc_tool == "falco" ) { - FALCO ( ch_samplesheet_fastqs ) + FALCO ( ch_fastqs_for_preprocessing ) ch_versions = ch_versions.mix( FALCO.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( FALCO.out.txt.collect{it[1]}.ifEmpty([]) ) } else { - FASTQC ( ch_samplesheet_fastqs ) + FASTQC ( ch_fastqs_for_preprocessing ) ch_versions = ch_versions.mix( FASTQC.out.versions.first() ) ch_multiqc_files = ch_multiqc_files.mix( FASTQC.out.zip.collect{it[1]}.ifEmpty([]) ) } @@ -120,12 +169,12 @@ 
workflow EAGER { // if ( !params.skip_preprocessing ) { - PREPROCESSING ( ch_samplesheet_fastqs, adapterlist ) + PREPROCESSING ( ch_fastqs_for_preprocessing, adapterlist ) ch_reads_for_mapping = PREPROCESSING.out.reads ch_versions = ch_versions.mix( PREPROCESSING.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) } else { - ch_reads_for_mapping = ch_samplesheet_fastqs + ch_reads_for_mapping = ch_fastqs_for_preprocessing } // @@ -273,7 +322,7 @@ workflow EAGER { // Preparing fastq channel for host removal to be combined with the bam channel // The meta of the fastq channel contains additional fields when compared to the meta from the bam channel: lane, colour_chemistry, // and not necessarily matching single_end. Those fields are dropped of the meta in the map and stored in new_meta - ch_fastqs_for_host_removal= ch_samplesheet_fastqs.map{ + ch_fastqs_for_host_removal= ch_fastqs_for_preprocessing.map{ meta, fastqs -> new_meta = meta.clone().findAll{ it.key !in [ 'lane', 'colour_chemistry', 'single_end' ] } [ new_meta, meta, fastqs ] From 0e29de07fd97f7274458d3eac80e04f420e0f6f3 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Apr 2024 11:26:29 +0200 Subject: [PATCH 04/12] update schema --- nextflow_schema.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 304a3126e..d3766068a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -25,9 +25,9 @@ }, "convert_inputbam": { "type": "boolean", - "default": false, "description": "Specify to convert input BAM files back to FASTQ for remapping", - "help_text": "This parameter tells the pipeline to convert the BAM files listed in the `--input` TSV or CSV sheet back to FASTQ format to allow re-preprocessing and mapping\n\nCan be useful when you want to ensure consistent mapping parameters across all libraries when incorporating public data, however be careful of biases 
that may come from re-processing again (the BAM files may already be clipped, or only mapped reads with different settings are included so you may not have all reads from the original publication)." + "help_text": "This parameter tells the pipeline to convert the BAM files listed in the `--input` TSV or CSV sheet back to FASTQ format to allow re-preprocessing and mapping.\n\nThis can be useful when you want to ensure consistent mapping parameters across all libraries when incorporating public data. However, be careful of biases that may come from re-processing (the BAM files may already be clipped, or only mapped reads with different settings are included, so you may not have all reads from the original publication).", + "fa_icon": "fas fa-undo-alt" }, "outdir": { "type": "string", @@ -1307,7 +1307,6 @@ "properties": { "run_sexdeterrmine": { "type": "boolean", - "default": false, "fa_icon": "fas fa-transgender-alt", "description": "Turn on sex determination for human reference genomes. This will run on single- and double-stranded variants of a library separately.", "help_text": "Specify to run the optional process of sex determination." From 99f2f0f5e676626abdf4995ee2fdb1b647f88be1 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Apr 2024 11:32:59 +0200 Subject: [PATCH 05/12] typo --- docs/usage.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 58da8c26f..fef8a5213 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -273,7 +273,6 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. 
We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): -``` -bash +```bash NXF_OPTS='-Xms1g -Xmx4g' ``` From 973b5805760aa292422d089656f9d9126429d05b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Apr 2024 11:56:00 +0200 Subject: [PATCH 06/12] consistent module name. skip input bam index/flagstat/filtering if converting. --- workflows/eager.nf | 47 ++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 65a4d7a28..bde6c329f 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -46,7 +46,7 @@ include { GENOTYPE } from '../subworkflows/local/genotype' include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { SAMTOOLS_COLLATEFASTQ as SAMTOOLS_CONVERT_INPUT_BAM } from '../modules/nf-core/samtools/collatefastq/main' +include { SAMTOOLS_COLLATEFASTQ as SAMTOOLS_CONVERT_BAM_INPUT } from '../modules/nf-core/samtools/collatefastq/main' include { CAT_FASTQ as CAT_FASTQ_CONVERTED_BAM } from '../modules/nf-core/cat/fastq/main' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAM_INPUT } from '../modules/nf-core/samtools/index/main' include { PRESEQ_CCURVE } from '../modules/nf-core/preseq/ccurve/main' @@ -102,15 +102,15 @@ workflow EAGER { if (params.convert_inputbam) { // Convert input BAMs back to FastQ with non-interleaved output. 
- SAMTOOLS_CONVERT_INPUT_BAM ( ch_samplesheet_bams, [ [], [] ], false ) + SAMTOOLS_CONVERT_BAM_INPUT ( ch_samplesheet_bams, [ [], [] ], false ) // if BAM is single-end, pull R1 output as well as 'other' output and merge (in case collapsed reads have their R1 and R2 flags both set to 0 or 1) - ch_single_end_reads = SAMTOOLS_CONVERT_INPUT_BAM.out.fastq + ch_single_end_reads = SAMTOOLS_CONVERT_BAM_INPUT.out.fastq .filter { meta, reads -> meta.single_end } - .join(SAMTOOLS_CONVERT_INPUT_BAM.out.fastq_other) + .join(SAMTOOLS_CONVERT_BAM_INPUT.out.fastq_other) .map { meta, read1, fastq_other -> [meta, [read1, fastq_other] ] @@ -118,7 +118,7 @@ workflow EAGER { .dump(tag: "converted_se_and_other", pretty: true) //if BAM is paired-end, pull R1 and R2 outputs, discarding 'other' output and singletons - ch_paired_end_reads = SAMTOOLS_CONVERT_INPUT_BAM.out.fastq + ch_paired_end_reads = SAMTOOLS_CONVERT_BAM_INPUT.out.fastq .filter { meta, reads -> ! meta.single_end @@ -195,24 +195,31 @@ workflow EAGER { // MODULE: indexing of user supplied input BAMs // - SAMTOOLS_INDEX_BAM_INPUT ( ch_samplesheet_bams ) - ch_versions = ch_versions.mix( SAMTOOLS_INDEX_BAM_INPUT.out.versions ) + if ( !params.convert_inputbam ){ + SAMTOOLS_INDEX_BAM_INPUT ( ch_samplesheet_bams ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX_BAM_INPUT.out.versions ) - if ( params.fasta_largeref ) - ch_bams_from_input = ch_samplesheet_bams.join( SAMTOOLS_INDEX_BAM_INPUT.out.csi ) - else { - ch_bams_from_input = ch_samplesheet_bams.join( SAMTOOLS_INDEX_BAM_INPUT.out.bai ) - } + if ( params.fasta_largeref ) { + ch_bams_from_input = ch_samplesheet_bams.join( SAMTOOLS_INDEX_BAM_INPUT.out.csi ) + } else { + ch_bams_from_input = ch_samplesheet_bams.join( SAMTOOLS_INDEX_BAM_INPUT.out.bai ) + } + // + // MODULE: flagstats of user supplied input BAMs + // + ch_bam_bai_input = ch_samplesheet_bams + .join(SAMTOOLS_INDEX_BAM_INPUT.out.bai) + + SAMTOOLS_FLAGSTATS_BAM_INPUT ( ch_bam_bai_input ) + ch_versions = ch_versions.mix( 
SAMTOOLS_FLAGSTATS_BAM_INPUT.out.versions ) + ch_flagstat_input_bam = SAMTOOLS_FLAGSTATS_BAM_INPUT.out.flagstat // For endorspy - // - // MODULE: flagstats of user supplied input BAMs - // - ch_bam_bai_input = ch_samplesheet_bams - .join(SAMTOOLS_INDEX_BAM_INPUT.out.bai) - SAMTOOLS_FLAGSTATS_BAM_INPUT ( ch_bam_bai_input ) - ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTATS_BAM_INPUT.out.versions ) + } else { + ch_bams_from_input = Channel.empty() + ch_flagstat_input_bam = Channel.empty() + } // // SUBWORKFLOW: bam filtering (length, mapped/unmapped, quality etc.) @@ -394,7 +401,7 @@ workflow EAGER { // ch_flagstat_for_endorspy_raw = MAP.out.flagstat - .mix( SAMTOOLS_FLAGSTATS_BAM_INPUT.out.flagstat ) + .mix( ch_flagstat_input_bam ) if ( params.run_bamfiltering & !params.skip_deduplication ) { ch_for_endorspy = ch_flagstat_for_endorspy_raw From 70822e3b65553541b4912480da258d2cda856efa Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Apr 2024 11:56:30 +0200 Subject: [PATCH 07/12] remove dumps --- workflows/eager.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index bde6c329f..578e02189 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -115,7 +115,6 @@ workflow EAGER { meta, read1, fastq_other -> [meta, [read1, fastq_other] ] } - .dump(tag: "converted_se_and_other", pretty: true) //if BAM is paired-end, pull R1 and R2 outputs, discarding 'other' output and singletons ch_paired_end_reads = SAMTOOLS_CONVERT_BAM_INPUT.out.fastq @@ -123,7 +122,6 @@ workflow EAGER { meta, reads -> ! 
meta.single_end } - .dump(tag: "pe-bam", pretty: true) // Merge the R1 and other reads for single-end BAMs CAT_FASTQ_CONVERTED_BAM( ch_single_end_reads ) From 861844f7b3c96ca2a3bf4e164e63853cd74a257d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 5 Apr 2024 11:57:51 +0200 Subject: [PATCH 08/12] add CI --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 002b66dc7..5a011c5b9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: - "-profile test,docker --preprocessing_tool adapterremoval --preprocessing_adapterlist 'https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/adapterremoval/adapterremoval_adapterlist.txt' --sequencing_qc_tool falco --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw'" - "-profile test,docker --mapping_tool bwamem --run_mapdamage_rescaling --run_pmd_filtering --run_trim_bam --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed'" - "-profile test,docker --mapping_tool bowtie2 --damagecalculation_tool mapdamage --damagecalculation_mapdamage_downsample 100 --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw'" - - "-profile test,docker --skip_preprocessing" + - "-profile test,docker --skip_preprocessing --convert_inputbam" - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw'" - "-profile test_humanbam,docker --run_sexdeterrmine --run_genotyping --genotyping_tool 'angsd' --genotyping_source 'raw'" - "-profile test_multiref,docker" ## TODO add damage manipulation here instead once it goes multiref From 1b70a0beb32dbab2f87bfda43e64d75485424e79 Mon Sep 17 00:00:00 2001 From: Thiseas Christos 
Lamnidis Date: Fri, 26 Apr 2024 11:59:09 +0300 Subject: [PATCH 09/12] typo in module name --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 71922d3ac..00dbd9bb4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -37,7 +37,7 @@ process { // // CONVERT INPUT BAM // - withName: SAMTOOLS_CONVERT_INPUT_BAM { + withName: SAMTOOLS_CONVERT_BAM_INPUT { tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" } publishDir = [ From 8a73d3b1438c21a308d7324b08d34c9166e9493c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Apr 2024 12:18:04 +0300 Subject: [PATCH 10/12] document manual test --- docs/development/manual_tests.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 9ecfc2568..eea6cab14 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -903,3 +903,11 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Expect: One `glf.gz` file in binary_three format per reference. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'angsd' --genotyping_angsd_glmodel 'syk' --genotyping_angsd_glformat 'text' --genotyping_source 'raw' -ansi-log false -dump-channels ``` + +# CONVERT BAM INPUT + +```bash +## BAM input converted to FastQ and remapped. +## Expect: BAM input shows up in FastQC -> mapping results. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ --convert_inputbam --skip_deduplication -resume -ansi-log false -dump-channels +``` From 3721e1e41989fad0ac6d2574e1f4bf972c8433fb Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 10 May 2024 11:59:12 +0200 Subject: [PATCH 11/12] Apply suggestions from code review Co-authored-by: James A. 
Fellows Yates --- nextflow.config | 2 +- workflows/eager.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index 1cbfb1867..bb1052669 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,7 +14,7 @@ params { input = null // Input BAM conversion - convert_inputbam = false + convert_inputbam = false // References genome = null diff --git a/workflows/eager.nf b/workflows/eager.nf index 578e02189..91770032b 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -123,7 +123,7 @@ workflow EAGER { ! meta.single_end } - // Merge the R1 and other reads for single-end BAMs + // Put all the converted FASTQs back together again CAT_FASTQ_CONVERTED_BAM( ch_single_end_reads ) ch_fastqs_from_converted_bams = CAT_FASTQ_CONVERTED_BAM.out.reads .mix(ch_paired_end_reads) @@ -190,7 +190,7 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix( MAP.out.mqc.collect{it[1]}.ifEmpty([]) ) // - // MODULE: indexing of user supplied input BAMs + // MODULE: indexing of user supplied unconverted input BAMs // if ( !params.convert_inputbam ){ From 47edd9a8e6448c193f872b2971c44a457d7ce0b9 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 10 May 2024 13:03:40 +0300 Subject: [PATCH 12/12] move CAT_FASTQ_CONVERTED_BAM --- workflows/eager.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 91770032b..b3daed622 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -116,6 +116,9 @@ workflow EAGER { [meta, [read1, fastq_other] ] } + // Put all the converted FASTQs with single-end reads back together again + CAT_FASTQ_CONVERTED_BAM( ch_single_end_reads ) + //if BAM is paired-end, pull R1 and R2 outputs, discarding 'other' output and singletons ch_paired_end_reads = SAMTOOLS_CONVERT_BAM_INPUT.out.fastq .filter { @@ -123,8 +126,6 @@ workflow EAGER { ! 
meta.single_end } - // Put all the converted FASTQs back together again - CAT_FASTQ_CONVERTED_BAM( ch_single_end_reads ) ch_fastqs_from_converted_bams = CAT_FASTQ_CONVERTED_BAM.out.reads .mix(ch_paired_end_reads) // drop reference and id_index from meta