nf-core · yuukiiwa · Oct 17, 2023 · Oct 17, 2023 · Oct 18, 2023 · Oct 18, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -49,10 +49,12 @@ jobs:
     strategy:
       matrix:
         profiles:
-          - "test_nodx_vc"
-          - "test_nodx_stringtie"
-          - "test_nodx_noaln"
-          - "test_nodx_rnamod"
+          - "test_bc_nodx"
+          - "test_nobc_dx"
+          - "test_nobc_nodx_vc"
+          - "test_nobc_nodx_stringtie"
+          - "test_nobc_nodx_noaln"
+          - "test_nobc_nodx_rnamod"
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v3

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -101,7 +101,7 @@ def check_samplesheet(file_in, updated_path, file_out):
                     barcode = "barcode%s" % (barcode.zfill(2))
 
             ## Check input file extension
-            nanopolish_fast5 = ""
+            fast5 = ""
             if input_file:
                 if input_file.find(" ") != -1:
                     print_error("Input file contains spaces!", "Line", line)
@@ -115,12 +115,12 @@ def check_samplesheet(file_in, updated_path, file_out):
                     if updated_path != "not_changed":
                         input_file = "/".join([updated_path, input_file.split("/")[-1]])
                     list_dir = os.listdir(input_file)
-                    nanopolish_fast5 = input_file
-                    if not (all(fname.endswith(".fast5") for fname in list_dir)):
+                    fast5 = input_file
+                    if not (all(fname.endswith(".fast5") for fname in list_dir)) and not (all(fname.endswith(".pod5") for fname in list_dir)):
                         if "fast5" in list_dir and "fastq" in list_dir:
-                            nanopolish_fast5 = input_file + "/fast5"
+                            fast5 = input_file + "/fast5"
                             ## CHECK FAST5 DIRECTORY
-                            if not (all(fname.endswith(".fast5") for fname in os.listdir(nanopolish_fast5))):
+                            if not (all(fname.endswith(".fast5") for fname in os.listdir(fast5))):
                                 print_error("fast5 directory contains non-fast5 files.")
                             ## CHECK PROVIDED BASECALLED FASTQ
                             fastq_path = input_file + "/fastq"
@@ -139,8 +139,8 @@ def check_samplesheet(file_in, updated_path, file_out):
                                 '{input_file} path does not end with ".fastq.gz", ".fq.gz", or ".bam" and is not an existing directory with correct fast5 and/or fastq inputs.'
                             )
 
-            ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, nanopolish_fast5 ]}}
-            sample_info = [barcode, input_file, nanopolish_fast5]
+            ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, fast5 ]}}
+            sample_info = [barcode, input_file, fast5]
             if group not in sample_info_dict:
                 sample_info_dict[group] = {}
             if replicate not in sample_info_dict[group]:
@@ -161,7 +161,7 @@ def check_samplesheet(file_in, updated_path, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "barcode", "reads", "nanopolish_fast5"]) + "\n")
+            fout.write(",".join(["sample", "barcode", "reads", "fast5"]) + "\n")
             for sample in sorted(sample_info_dict.keys()):
                 ## Check that replicate ids are in format 1..<NUM_REPS>
                 uniq_rep_ids = set(sample_info_dict[sample].keys())

diff --git a/conf/test.config b/conf/test.config
@@ -1,33 +1,40 @@
 /*
- * -------------------------------------------------
- *  Nextflow config file for running tests
- * -------------------------------------------------
- * Defines bundled input files and everything required
- * to run a fast and simple test. Use as follows:
- *   nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
- */
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
 
 params {
     config_profile_name        = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
-    // Limit resources
-    max_cpus            = 2
-    max_memory          = 6.GB
-    max_time            = 12.h
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '12.h'
 
-    // Input data to perform demultipexing
-    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
-    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
-    gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
-    run_nanolyse        = true
-    protocol            = 'DNA'
+    // Input data to perform both basecalling and demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS109'
     barcode_kit         = 'NBD103/NBD104'
-    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
-    skip_bigwig         = true
-    skip_bigbed         = true
+    trim_barcodes       = true
+    dorado_model        = '[email protected]'
+    dorado_device       = 'cpu'
+    run_nanolyse        = true
     skip_quantification = true
     skip_fusion_analysis= true
     skip_modification_analysis=true
-    aligner             = 'graphmap2'
+
+    // This variable is just for reference and isn't actually required for the tests
+    // Files are downloaded and staged using the "GetTestData" process
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded_multi/'
 }
diff --git a/conf/test_bc_nodx.config b/conf/test_bc_nodx.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_bc_nodx,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus            = 2
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to perform basecalling and to skip demultiplexing
+    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS108'
+    dorado_model        = '[email protected]'
+    dorado_device       = 'cpu'
+    skip_bigbed         = true
+    skip_bigwig         = true
+    skip_demultiplexing = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+}
diff --git a/conf/test_bc_nodx_dnamod.config b/conf/test_bc_nodx_dnamod.config
@@ -0,0 +1,36 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_bc_nodx_dnamod,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus            = 2
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to perform basecalling and to skip demultiplexing
+    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nanoseq/3.2/samplesheet/samplesheet_bc_nodx_dnamod.csv'
+    input_path_file_type= 'pod5'
+    bedmethyl_out       = true
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS108'
+    dorado_model        = '[email protected]'
+    dorado_modification = '5mCG_5hmCG'
+    dorado_device       = 'cpu'
+    skip_bigbed         = true
+    skip_bigwig         = true
+    skip_demultiplexing = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+}
diff --git a/conf/test_nobc_dx.config b/conf/test_nobc_dx.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources
+    max_cpus            = 2
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to perform demultiplexing
+    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
+    gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
+    skip_basecalling    = true
+    run_nanolyse        = true
+    protocol            = 'DNA'
+    barcode_kit         = 'NBD103/NBD104'
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
+    skip_bigwig         = true
+    skip_bigbed         = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+}
diff --git a/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config b/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config
@@ -20,6 +20,7 @@ params {
     input                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17550000.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17500000.gtf'
+    skip_basecalling    = true
     protocol              = 'directRNA'
     skip_demultiplexing   = true
     skip_alignment        = true

diff --git a/conf/test_nobc_nodx_noaln_vc.config b/conf/test_nobc_nodx_noaln_vc.config
@@ -0,0 +1,30 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_nobc_nodx_noaln_vc,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check variant calling functions'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus            = 2
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to skip demultiplexing and alignment, and to call variants
+    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln_vc.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    skip_basecalling    = true
+    protocol            = 'DNA'
+    skip_alignment      = true
+    skip_quantification = true
+    skip_demultiplexing = true
+    call_variants       = true
+    variant_caller      = 'clair3'
+    structural_variant_caller = 'sniffles'
+}
diff --git a/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config b/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config
@@ -20,6 +20,7 @@ params {
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_rnamod.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.gtf'
+    skip_basecalling    = true
     protocol            = 'directRNA'
     run_nanolyse        = true
     skip_bigbed         = true

diff --git a/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config b/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config
@@ -21,6 +21,7 @@ params {
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
     protocol            = 'directRNA'
+    skip_basecalling    = true
     skip_demultiplexing = true
     skip_fusion_analysis= true
     skip_modification_analysis=true

diff --git a/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config b/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config
@@ -19,6 +19,7 @@ params {
     // Input data to skip demultiplexing and variant call
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_vc.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    skip_basecalling    = true
     protocol            = 'DNA'
     skip_quantification = true
     skip_demultiplexing = true

diff --git a/conf/test_withpull.config b/conf/test_withpull.config
@@ -0,0 +1,39 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/nanoseq -profile test_withpull,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data to perform both basecalling and demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS109'
+    barcode_kit         = 'EXP-NBD103'
+    trim_barcodes=true
+    output_demultiplex_fast5 = true
+    run_nanolyse        = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+
+    // This variable is just for reference and isn't actually required for the tests
+    // Files are downloaded and staged using the "GetTestData" process
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded/'
+}
diff --git a/modules/local/bambu.nf b/modules/local/bambu.nf
@@ -3,8 +3,8 @@ process BAMBU {
 
     conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/bioconductor-bambu:3.0.8--r42hc247a5b_0' :
-        'quay.io/biocontainers/bioconductor-bambu:3.0.8--r42hc247a5b_0' }"
+        'https://depot.galaxyproject.org/singularity/bioconductor-bambu:3.4.0--r43hf17093f_0' :
+        'quay.io/biocontainers/bioconductor-bambu:3.4.0--r43hf17093f_0' }"
 
     input:
     tuple path(fasta), path(gtf)

diff --git a/modules/local/blue-crab.nf b/modules/local/blue-crab.nf
@@ -0,0 +1,37 @@
+// Converts the POD5 input of one sample to BLOW5 with `blue-crab p2s` and
+// re-emits the sample's other files unchanged next to the new BLOW5 file.
+process BLUE_CRAB {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // NOTE(review): this environment is declared as slow5tools while the script
+    // invokes `blue-crab` -- confirm the image really ships that executable.
+    conda "bioconda::slow5tools==1.2.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/slow5tools:1.2.0--h56e2c18_1' :
+        'quay.io/biocontainers/slow5tools:1.2.0--h56e2c18_1' }"
+
+    input:
+    tuple val(meta), path(genome), path(gtf), path(fastq), path(bam), path(bai), path(pod5)
+
+    output:
+    // Same tuple layout as the input, with the POD5 slot replaced by the BLOW5 file
+    tuple val(meta), path(genome), path(gtf), path(fastq), path(bam), path(bai), path("${meta.id}.blow5"), emit: nanopolish_outputs
+    path "versions.yml"                                                                                   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // The output name has to be defined explicitly: no input or script variable
+    // called `blow5` exists, so it is derived from the sample id
+    def blow5 = "${meta.id}.blow5"
+    """
+    blue-crab p2s $pod5 -o $blow5
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        blue-crab: \$( blue-crab -V | tail -c 6 )
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/dorado_aligner.nf b/modules/local/dorado_aligner.nf
@@ -0,0 +1,31 @@
+// Runs `dorado aligner` on the input BAM against the reference FASTA, then
+// sorts and indexes the resulting alignment with samtools.
+process DORADO_ALIGNER {
+    tag "$meta.id"
+    label 'process_medium'
+
+    // NOTE(review): the image is referenced without a tag -- consider pinning a version
+    container "docker.io/ontresearch/dorado"
+
+    input:
+    tuple val(meta), path(mod_bam)
+    path fasta
+
+    output:
+    tuple val(meta), path("aligned_sorted.bam"), path("*.bai")  , emit: aligned_bam
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    dorado aligner --mm2-preset map-ont $fasta $mod_bam > aligned.bam && samtools sort aligned.bam -o aligned_sorted.bam && samtools index aligned_sorted.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//')
+    END_VERSIONS
+    """
+}
+