nf-core · JustBioinfo · Jun 1, 2024 · Aug 5, 2024 · Aug 5, 2024 · Aug 6, 2024
diff --git a/conf/modules.config b/conf/modules.config
@@ -138,6 +138,16 @@ process {
         ]
     }
 
+    withName: PRESTO_MASKPRIMERS_ALIGN_TRIM {
+        publishDir = [
+            path: { "${params.outdir}/presto/trim_upstream_umi_linker/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename } // versions.yml maps to null, every other file keeps its name
+        ]
+        ext.args = '--skiprc --pf UMILINK' // read as task.ext.args and appended to the `MaskPrimers.py align` call
+        ext.args2 = '-f ID PRIMER ERROR' // read as task.ext.args2 and appended to the `ParseLog.py` call
+    }
+
     withName: PRESTO_MASKPRIMERS_ALIGN {
         publishDir = [
             path: { "${params.outdir}/presto/02-maskprimers/${meta.id}" },
@@ -246,9 +256,19 @@ process {
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
-        ext.args = '--coord presto --rc tail --1f CONSCOUNT PRCONS --2f CONSCOUNT PRCONS'
+        ext.args = '--coord presto --rc tail --1f CONSCOUNT PRCONS --2f CONSCOUNT PRCONS --failed'
         ext.args2 = '-f ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT LENGTH OVERLAP ERROR PVALUE'
     }
+
+    withName: PRESTO_ASSEMBLEPAIRS_JOIN {
+        publishDir = [
+            path: { "${params.outdir}/presto/08-assemble-pairs-join/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename } // versions.yml maps to null, every other file keeps its name
+        ]
+        ext.args = '--coord presto --rc tail --1f CONSCOUNT PRCONS --2f CONSCOUNT PRCONS' // read as task.ext.args and appended to the `AssemblePairs.py join` call
+        ext.args2 = '-f ID LENGTH' // read as task.ext.args2 and appended to the `ParseLog.py` call
+    }
 
     withName: PRESTO_ASSEMBLEPAIRS_SEQUENTIAL {
         publishDir = [

diff --git a/docs/usage.md b/docs/usage.md
@@ -444,6 +444,27 @@ The UMI barcodes are typically read from an index file but sometimes can be prov
 
 - No UMIs in R1 or R2 reads: if no UMIs are present in the samples, specify `--umi_length 0` to use the sans-UMI subworkflow.
 
+### 5'-RACE where R1 does not start directly with the UMI
+
+This sequencing type requires setting `--library_generation_method specific_5p_race_umi`.
+
+A fasta file containing the UMI + race linker sequence pattern is required to locate and trim the sequence upstream of the UMI.
+
+```bash
+nextflow run nf-core/airrflow -profile docker \
+--input samplesheet.tsv \
+--library_generation_method specific_5p_race_umi \
+--cprimers Cprimers.fasta \
+--race_linker linker.fasta \
+--umi_linker umi_pattern.fasta \
+--umi_position R1 \
+--umi_length 18 \
+--cprimer_start 0 \
+--cprimer_position R2 \
+--outdir ./results
+```
+
+
 ## Supported single cell library generation methods (protocols)
 
 When processing single cell sequencing data departing from raw `fastq` reads, currently only a `--library_generation_method` to support 10xGenomics data is available.

diff --git a/modules/local/presto/presto_assemblepairs.nf b/modules/local/presto/presto_assemblepairs.nf
@@ -13,6 +13,7 @@ process PRESTO_ASSEMBLEPAIRS {
 
     output:
     tuple val(meta), path("*_assemble-pass.fastq"), emit: reads
+    tuple val(meta), path("*_assemble-fail.fastq"),emit: reads_fail, optional: true
     path("*_command_log.txt"), emit: logs
     path("*.log")
     path("*_table.tab")

diff --git a/modules/local/presto/presto_assemblepairs_join.nf b/modules/local/presto/presto_assemblepairs_join.nf
@@ -0,0 +1,37 @@
+process PRESTO_ASSEMBLEPAIRS_JOIN {
+    tag "$meta.id"
+    label 'process_long_parallelized'
+    label 'immcantation'
+    // Runs `AssemblePairs.py join` on the given R1/R2 reads and emits `reads_pass` followed by the joined reads.
+    conda "bioconda::presto=0.7.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/presto:0.7.1--pyhdfd78af_0' :
+        'biocontainers/presto:0.7.1--pyhdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(R1), path(R2), path(reads_pass) // R1/R2: read pairs to join; reads_pass: fastq the joined reads are appended to
+
+    output:
+    tuple val(meta), path("*_assemblejoin-pass.fastq"), emit: reads // content of reads_pass followed by the joined reads
+    path("*_command_log.txt"), emit: logs
+    path("*.log")
+    path("*_table.tab")
+    path "versions.yml" , emit: versions
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    // The merge reads the staged `reads_pass` path instead of rebuilding its file name from meta.id.
+    """
+    AssemblePairs.py join -1 $R1 -2 $R2 --nproc ${task.cpus} \\
+        $args \\
+        --outname ${meta.id}_join --log ${meta.id}_join.log > ${meta.id}_join_command_log.txt
+    ParseLog.py -l ${meta.id}_join.log $args2
+    cat ${reads_pass} ${meta.id}_join_assemble-pass.fastq > ${meta.id}_assemblejoin-pass.fastq
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        presto: \$( AssemblePairs.py --version | awk -F' '  '{print \$2}' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/presto/presto_maskprimers_align_trim.nf b/modules/local/presto/presto_maskprimers_align_trim.nf
@@ -0,0 +1,40 @@
+process PRESTO_MASKPRIMERS_ALIGN_TRIM {
+    tag "$meta.id"
+    label "process_high"
+    label 'immcantation'
+    // Runs `MaskPrimers.py align --mode trim` on R1 against the `umi_linker` file, then parses the resulting log with `ParseLog.py`.
+    conda "bioconda::presto=0.7.1"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/presto:0.7.1--pyhdfd78af_0' :
+        'biocontainers/presto:0.7.1--pyhdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(R1) // sample meta map and the R1 file passed to `-s`
+    path(umi_linker) // file passed to `-p`
+
+    output:
+    tuple val(meta), path("*_trim_R1_primers-pass.fastq") , emit: reads
+    path "*_command_log_R1.txt", emit: logs // captured stdout of the MaskPrimers.py call
+    path "*_R1.log" // file given to `--log`
+    path "*.tab", emit: log_tab
+    path "versions.yml" , emit: versions
+
+    script:
+    def args = task.ext.args?: '' // extra MaskPrimers.py options from the config, empty when unset
+    def args2 = task.ext.args2?: '' // extra ParseLog.py options from the config, empty when unset
+    """
+    MaskPrimers.py align --nproc ${task.cpus} \\
+    -s $R1 \\
+    -p ${umi_linker} \\
+    --mode trim \\
+    $args \\
+    --outname ${meta.id}_trim_R1 \\
+    --log ${meta.id}_R1.log > ${meta.id}_command_log_R1.txt
+    ParseLog.py -l ${meta.id}_R1.log $args2
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        presto: \$( MaskPrimers.py --version | awk -F' '  '{print \$2}' )
+    END_VERSIONS
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -68,6 +68,7 @@ params {
 
     // Assemble pairs
     assemblepairs_sequential = false
+    assemblepairs_join = false
 
     // internal cregion
     align_cregion = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -10,7 +10,10 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["input", "outdir"],
+            "required": [
+                "input",
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -27,7 +30,10 @@
                     "type": "string",
                     "default": "fastq",
                     "description": "Specify the processing mode for the pipeline. Available options are \"fastq\" and \"assembled\".",
-                    "enum": ["fastq", "assembled"],
+                    "enum": [
+                        "fastq",
+                        "assembled"
+                    ],
                     "fa_icon": "fas fa-terminal"
                 },
                 "outdir": {
@@ -75,7 +81,12 @@
                     "type": "string",
                     "description": "Path to fasta file containing the linker sequence, if no V-region primers were used but a linker sequence is present (e.g. 5' RACE SMARTer TAKARA protocol).",
                     "fa_icon": "fas fa-dna"
-                }
+                },
+                "umi_linker": {
+                    "type": "string",
+                    "description": "Path to fasta file containing umi-linker motifs, if no V-region primer has been used but a linker sequence is present with a residual sequence upstream of the UMI.",
+                    "fa_icon": "fas fa-dna"
+                }
             },
             "fa_icon": "fas fa-flask"
         },
@@ -112,7 +123,10 @@
                     "default": "R1",
                     "fa_icon": "fas fa-dna",
                     "description": "Indicate if C region primers are in the R1 or R2 reads.",
-                    "enum": ["R1", "R2"]
+                    "enum": [
+                        "R1",
+                        "R2"
+                    ]
                 },
                 "primer_revpr": {
                     "type": "boolean",
@@ -133,7 +147,10 @@
                     "default": "R1",
                     "description": "Indicate if UMI indices are recorded in the R1 (default) or R1 fastq file.",
                     "help_text": "The pipeline requires UMI barcodes for identifying unique transcripts. These barcodes are typically read from an index file but sometimes can be provided merged with the start of the R1 or R2 reads. If provided in an additional index file, set the `--index_file` parameter, if provided merged with the R1 or R2 reads, set the `--umi_position` parameter to R1 or R2, respectively.",
-                    "enum": ["R1", "R2"],
+                    "enum": [
+                        "R1",
+                        "R2"
+                    ],
                     "fa_icon": "fas fa-barcode"
                 },
                 "umi_length": {
@@ -235,7 +252,12 @@
                     "type": "string",
                     "default": "cut",
                     "description": "Masking mode for the pRESTO MaskPrimer step. Available: cut, mask, trim, tag.",
-                    "enum": ["cut", "mask", "tag", "trim"],
+                    "enum": [
+                        "cut",
+                        "mask",
+                        "tag",
+                        "trim"
+                    ],
                     "help_text": "The primer masking modes will perform the following actions:\n\n* `cut`: remove both the primer region and the preceding sequence.\n* `mask`: replace the primer region with Ns and remove the preceding sequence.\n* `trim`: remove the region preceding the primer, but leave the primer region intact.\n* `tag`: leave the input sequence unmodified.",
                     "fa_icon": "fas fa-mask"
                 },
@@ -291,6 +313,12 @@
                     "fa_icon": "fas fa-align-center",
                     "description": "Use AssemblePairs sequential instead of AssemblePairs align when assembling read pairs."
                 },
+                "assemblepairs_join": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-align-center",
+                    "default": false,
+                    "description": "Use AssemblePairs join after AssemblePairs align to rescue non-overlapping reads by concatenating them end to end."
+                },
                 "align_cregion": {
                     "type": "boolean",
                     "fa_icon": "fas fa-align-center",
@@ -410,14 +438,19 @@
                     "oneOf": [
                         {
                             "type": "string",
-                            "enum": ["auto"]
+                            "enum": [
+                                "auto"
+                            ]
                         },
                         {
                             "type": "number",
                             "minimum": 0
                         }
                     ],
-                    "type": ["string", "number"],
+                    "type": [
+                        "string",
+                        "number"
+                    ],
                     "default": "auto",
                     "fa_icon": "fab fa-pagelines",
                     "description": "Set the clustering threshold Hamming distance value. Default: 'auto'"
@@ -443,7 +476,10 @@
                     "type": "string",
                     "default": "raxml",
                     "description": "Lineage tree software to use to build trees within Dowser. If you change the default, also set the `lineage_tree_exec` parameter.",
-                    "enum": ["raxml", "igphyml"],
+                    "enum": [
+                        "raxml",
+                        "igphyml"
+                    ],
                     "fa_icon": "fas fa-pagelines"
                 },
                 "lineage_tree_exec": {
@@ -694,7 +730,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {

diff --git a/subworkflows/local/presto_umi.nf b/subworkflows/local/presto_umi.nf
@@ -10,6 +10,7 @@ include { FASTP                                          } from '../../modules/n
 include { PRESTO_FILTERSEQ      as  PRESTO_FILTERSEQ_UMI     }    from '../../modules/local/presto/presto_filterseq'
 include { PRESTO_MASKPRIMERS    as  PRESTO_MASKPRIMERS_UMI   }    from '../../modules/local/presto/presto_maskprimers'
 include { PRESTO_MASKPRIMERS_ALIGN as PRESTO_ALIGN_PRIMERS   }    from '../../modules/local/presto/presto_maskprimers_align'
+include { PRESTO_MASKPRIMERS_ALIGN_TRIM as PRESTO_ALIGN_TRIM   }    from '../../modules/local/presto/presto_maskprimers_align_trim'
 include { PRESTO_MASKPRIMERS_EXTRACT                         }    from '../../modules/local/presto/presto_maskprimers_extract'
 include { PRESTO_MASKPRIMERS_ALIGN as PRESTO_ALIGN_CREGION   }    from '../../modules/local/presto/presto_maskprimers_align'
 include { PRESTO_PAIRSEQ        as  PRESTO_PAIRSEQ_UMI       }    from '../../modules/local/presto/presto_pairseq'
@@ -20,6 +21,7 @@ include { PRESTO_BUILDCONSENSUS as  PRESTO_BUILDCONSENSUS_UMI}    from '../../mo
 include { PRESTO_BUILDCONSENSUS as PRESTO_BUILDCONSENSUS_ALIGN }  from '../../modules/local/presto/presto_buildconsensus'
 include { PRESTO_POSTCONSENSUS_PAIRSEQ as PRESTO_POSTCONSENSUS_PAIRSEQ_UMI }    from '../../modules/local/presto/presto_postconsensus_pairseq'
 include { PRESTO_ASSEMBLEPAIRS  as  PRESTO_ASSEMBLEPAIRS_UMI }    from '../../modules/local/presto/presto_assemblepairs'
+include { PRESTO_ASSEMBLEPAIRS_JOIN  as  PRESTO_ASSEMBLEPAIRS_JOIN_UMI }    from '../../modules/local/presto/presto_assemblepairs_join'
 include { PRESTO_ASSEMBLEPAIRS_SEQUENTIAL                    }    from '../../modules/local/presto/presto_assemblepairs_sequential'
 include { PRESTO_PARSEHEADERS   as  PRESTO_PARSEHEADERS_COLLAPSE_UMI } from '../../modules/local/presto/presto_parseheaders'
 include { PRESTO_PARSEHEADERS   as  PRESTO_PARSEHEADERS_CREGION } from '../../modules/local/presto/presto_parseheaders'
@@ -36,6 +38,7 @@ workflow PRESTO_UMI {
     ch_reads       // channel: [ val(meta), [ reads ] ]
     ch_cprimers    // channel: [ cprimers.fasta ]
     ch_vprimers    // channel: [ vprimers.fasta ]
+    ch_umilinker   // channel: [ umi_linker.fasta ]
     ch_adapter_fasta // channel: [ adapters.fasta ]
     ch_internal_cregion // channel: [ internal_cregions.fasta ]
     ch_igblast
@@ -135,6 +138,40 @@ workflow PRESTO_UMI {
         ch_for_clustersets = PRESTO_PAIRSEQ_ALIGN.out.reads
         ch_pairseq_logs = PRESTO_PAIRSEQ_ALIGN.out.logs
 
+    } else if (params.library_generation_method == 'specific_5p_race_umi') {
+
+        ch_reads_R1 = PRESTO_FILTERSEQ_UMI.out.reads
+                                            .map{ reads -> [reads[0], reads[1]] }.dump(tag: 'ch_reads_R1')
+
+        // Trim any sequence in R1 that precedes the UMI pattern + RACE linker sequence.
+        PRESTO_ALIGN_TRIM(
+            ch_reads_R1,
+            ch_umilinker.collect()
+        )
+        ch_versions = ch_versions.mix(PRESTO_ALIGN_TRIM.out.versions)
+
+        // Re-pair the trimmed R1 with the filtered R2 of the same sample ID.
+        ch_maskprimers_trim_reads_R1 = PRESTO_ALIGN_TRIM.out.reads.map{ reads -> [reads[0].id, reads[0], reads[1]]}.dump(tag: 'ch_maskprimers_trim_reads_R1')
+        ch_filterseq_umi_reads_R2 = PRESTO_FILTERSEQ_UMI.out.reads.map{ reads -> [reads[0].id, reads[0], reads[2]]}.dump(tag: 'ch_filterseq_umi_reads_R2')
+        ch_reads_for_maskprimers_umi = ch_maskprimers_trim_reads_R1.join(ch_filterseq_umi_reads_R2)
+                                                        .map{ it -> [it[1], it[2], it[4]] }.dump(tag: 'ch_reads_for_maskprimers_umi')
+
+        PRESTO_MASKPRIMERS_UMI (
+            ch_reads_for_maskprimers_umi,
+            ch_cprimers.collect(),
+            ch_vprimers.collect()
+        )
+        ch_versions = ch_versions.mix(PRESTO_MASKPRIMERS_UMI.out.versions)
+        ch_maskprimers_logs = PRESTO_MASKPRIMERS_UMI.out.logs
+
+        // Pre-consensus pair
+        PRESTO_PAIRSEQ_UMI (
+            PRESTO_MASKPRIMERS_UMI.out.reads
+        )
+        ch_versions = ch_versions.mix(PRESTO_PAIRSEQ_UMI.out.versions)
+        ch_for_clustersets = PRESTO_PAIRSEQ_UMI.out.reads
+        ch_pairseq_logs = PRESTO_PAIRSEQ_UMI.out.logs
+
     } else {
 
         PRESTO_MASKPRIMERS_UMI (
@@ -208,6 +245,33 @@ workflow PRESTO_UMI {
         ch_versions = ch_versions.mix(PRESTO_ASSEMBLEPAIRS_SEQUENTIAL.out.versions)
         ch_assemblepairs_reads = PRESTO_ASSEMBLEPAIRS_SEQUENTIAL.out.reads
         ch_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_SEQUENTIAL.out.logs
+    } else if (params.assemblepairs_join) {
+        // Assemble read pairs with `align`; the failed reads are taken from the `reads_fail` output.
+        PRESTO_ASSEMBLEPAIRS_UMI (
+            PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.reads
+        )
+        ch_versions = ch_versions.mix(PRESTO_ASSEMBLEPAIRS_UMI.out.versions)
+
+        // Key the failed R1/R2 reads and the assembled reads by sample ID so they can be joined.
+        ch_assemblepairs_fail_reads = PRESTO_ASSEMBLEPAIRS_UMI.out.reads_fail.map{ reads -> [reads[0].id, reads[0], reads[1]]}.dump(tag: 'ch_assemblepairs_fail_reads')
+        ch_assemblepairs_pass_reads = PRESTO_ASSEMBLEPAIRS_UMI.out.reads.map{ reads -> [reads[0].id, reads[0], reads[1]]}.dump(tag: 'ch_assemblepairs_pass_reads')
+
+        // NOTE(review): `reads_fail` is an optional output, so a sample without failed reads has no
+        // partner in this join and is dropped from the channel -- confirm this is intended.
+        ch_reads_for_assemblepairs_join_umi = ch_assemblepairs_fail_reads.join(ch_assemblepairs_pass_reads)
+                                                        .map{ it -> [it[1], it[2][0], it[2][1], it[4]] }.dump(tag: 'ch_reads_for_assemblepairs_join_umi')
+
+        // Rescue non-overlapping read pairs with AssemblePairs join.
+        PRESTO_ASSEMBLEPAIRS_JOIN_UMI (
+            ch_reads_for_assemblepairs_join_umi
+        )
+
+        ch_versions = ch_versions.mix(PRESTO_ASSEMBLEPAIRS_JOIN_UMI.out.versions)
+        ch_assemblepairs_reads = PRESTO_ASSEMBLEPAIRS_JOIN_UMI.out.reads
+
+        // Only the align step logs are forwarded, so they do not include the number of rescued reads.
+        ch_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_UMI.out.logs
+
     } else {
         // Assemble read pairs align
         PRESTO_ASSEMBLEPAIRS_UMI (