diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5235d1ff..963b3273 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,10 +49,12 @@ jobs: strategy: matrix: profiles: - - "test_nodx_vc" - - "test_nodx_stringtie" - - "test_nodx_noaln" - - "test_nodx_rnamod" + - "test_bc_nodx" + - "test_nobc_dx" + - "test_nobc_nodx_vc" + - "test_nobc_nodx_stringtie" + - "test_nobc_nodx_noaln" + - "test_nobc_nodx_rnamod" steps: - name: Check out pipeline code uses: actions/checkout@v3 diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 93b6b2d0..ec19351e 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -101,7 +101,7 @@ def check_samplesheet(file_in, updated_path, file_out): barcode = "barcode%s" % (barcode.zfill(2)) ## Check input file extension - nanopolish_fast5 = "" + fast5 = "" if input_file: if input_file.find(" ") != -1: print_error("Input file contains spaces!", "Line", line) @@ -115,12 +115,12 @@ def check_samplesheet(file_in, updated_path, file_out): if updated_path != "not_changed": input_file = "/".join([updated_path, input_file.split("/")[-1]]) list_dir = os.listdir(input_file) - nanopolish_fast5 = input_file - if not (all(fname.endswith(".fast5") for fname in list_dir)): + fast5 = input_file + if not (all(fname.endswith(".fast5") for fname in list_dir)) and not (all(fname.endswith(".pod5") for fname in list_dir)): if "fast5" in list_dir and "fastq" in list_dir: - nanopolish_fast5 = input_file + "/fast5" + fast5 = input_file + "/fast5" ## CHECK FAST5 DIRECTORY - if not (all(fname.endswith(".fast5") for fname in os.listdir(nanopolish_fast5))): + if not (all(fname.endswith(".fast5") for fname in os.listdir(fast5))): print_error("fast5 directory contains non-fast5 files.") ## CHECK PROVIDED BASECALLED FASTQ fastq_path = input_file + "/fastq" @@ -139,8 +139,8 @@ def check_samplesheet(file_in, updated_path, file_out): '{input_file} path does not end with ".fastq.gz", ".fq.gz", or ".bam" and 
is not an existing directory with correct fast5 and/or fastq inputs.' ) - ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, nanopolish_fast5 ]}} - sample_info = [barcode, input_file, nanopolish_fast5] + ## Create sample mapping dictionary = {group: {replicate : [ barcode, input_file, fast5 ]}} + sample_info = [barcode, input_file, fast5] if group not in sample_info_dict: sample_info_dict[group] = {} if replicate not in sample_info_dict[group]: @@ -161,7 +161,7 @@ def check_samplesheet(file_in, updated_path, file_out): out_dir = os.path.dirname(file_out) make_dir(out_dir) with open(file_out, "w") as fout: - fout.write(",".join(["sample", "barcode", "reads", "nanopolish_fast5"]) + "\n") + fout.write(",".join(["sample", "barcode", "reads", "fast5"]) + "\n") for sample in sorted(sample_info_dict.keys()): ## Check that replicate ids are in format 1.. uniq_rep_ids = set(sample_info_dict[sample].keys()) diff --git a/conf/test.config b/conf/test.config index 8aa0238e..759f67e1 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,33 +1,40 @@ /* - * ------------------------------------------------- - * Nextflow config file for running tests - * ------------------------------------------------- - * Defines bundled input files and everything required - * to run a fast and simple test. Use as follows: - * nextflow run nf-core/nanoseq -profile test_nobc_dx, - */ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources - max_cpus = 2 - max_memory = 6.GB - max_time = 12.h + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '12.h' - // Input data to perform demultipexing - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv' - fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa' - gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf' - run_nanolyse = true - protocol = 'DNA' + // Input data to perform both basecalling and demultiplexing + input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv' + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' + protocol = 'cDNA' + flowcell = 'FLO-MIN106' + kit = 'SQK-DCS109' barcode_kit = 'NBD103/NBD104' - input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz' - skip_bigwig = true - skip_bigbed = true + trim_barcodes = true + dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' + dorado_device = 'cpu' + run_nanolyse = true skip_quantification = true skip_fusion_analysis= true skip_modification_analysis=true - aligner = 'graphmap2' + + // This variable is just for reference and isn't actually required for the tests + // Files are downloaded and staged using the GET_TEST_DATA process + input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded_multi/' } diff --git a/conf/test_bc_nodx.config
b/conf/test_bc_nodx.config new file mode 100644 index 00000000..743ceb65 --- /dev/null +++ b/conf/test_bc_nodx.config @@ -0,0 +1,33 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/nanoseq -profile test_bc_nodx,<docker/singularity> + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 12.h + + // Input data to perform basecalling and to skip demultiplexing + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv' + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' + protocol = 'cDNA' + flowcell = 'FLO-MIN106' + kit = 'SQK-DCS108' + dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' + dorado_device = 'cpu' + skip_bigbed = true + skip_bigwig = true + skip_demultiplexing = true + skip_quantification = true + skip_fusion_analysis= true + skip_modification_analysis=true +} diff --git a/conf/test_bc_nodx_dnamod.config b/conf/test_bc_nodx_dnamod.config new file mode 100644 index 00000000..a3cdd729 --- /dev/null +++ b/conf/test_bc_nodx_dnamod.config @@ -0,0 +1,36 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test.
Use as follows: + * nextflow run nf-core/nanoseq -profile test_bc_nodx_dnamod,<docker/singularity> + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 12.h + + // Input data to perform basecalling and to skip demultiplexing + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nanoseq/3.2/samplesheet/samplesheet_bc_nodx_dnamod.csv' + input_path_file_type= 'pod5' + bedmethyl_out = true + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' + protocol = 'cDNA' + flowcell = 'FLO-MIN106' + kit = 'SQK-DCS108' + dorado_model = 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0' + dorado_modification = '5mCG_5hmCG' + dorado_device = 'cpu' + skip_bigbed = true + skip_bigwig = true + skip_demultiplexing = true + skip_quantification = true + skip_fusion_analysis= true + skip_modification_analysis=true +} diff --git a/conf/test_nobc_dx.config b/conf/test_nobc_dx.config new file mode 100644 index 00000000..a5cc717e --- /dev/null +++ b/conf/test_nobc_dx.config @@ -0,0 +1,33 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test.
Use as follows: + * nextflow run nf-core/nanoseq -profile test_nobc_dx, + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources + max_cpus = 2 + max_memory = 6.GB + max_time = 12.h + + // Input data to perform demultipexing + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv' + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa' + gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf' + skip_basecalling = true + run_nanolyse = true + protocol = 'DNA' + barcode_kit = 'NBD103/NBD104' + input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz' + skip_bigwig = true + skip_bigbed = true + skip_quantification = true + skip_fusion_analysis= true + skip_modification_analysis=true +} diff --git a/conf/test_nodx_noaln.config b/conf/test_nobc_nodx_noaln.config similarity index 97% rename from conf/test_nodx_noaln.config rename to conf/test_nobc_nodx_noaln.config index 4d757357..2e36ee44 100644 --- a/conf/test_nodx_noaln.config +++ b/conf/test_nobc_nodx_noaln.config @@ -20,6 +20,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln.csv' fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17550000.fa' gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17500000.gtf' + skip_basecalling = true protocol = 'directRNA' skip_demultiplexing = true skip_alignment = true diff --git a/conf/test_nobc_nodx_noaln_vc.config b/conf/test_nobc_nodx_noaln_vc.config new file mode 100644 index 00000000..a5f6a3a7 --- /dev/null +++ b/conf/test_nobc_nodx_noaln_vc.config @@ -0,0 +1,30 @@ +/* + * 
------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/nanoseq -profile test_nobc_nodx_noaln_vc,<docker/singularity> + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check variant calling functions' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 12.h + + // Input data to skip demultiplexing and alignment, and to call variants + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln_vc.csv' + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' + skip_basecalling = true + protocol = 'DNA' + skip_alignment = true + skip_quantification = true + skip_demultiplexing = true + call_variants = true + variant_caller = 'clair3' + structural_variant_caller = 'sniffles' +} diff --git a/conf/test_nodx_rnamod.config b/conf/test_nobc_nodx_rnamod.config similarity index 97% rename from conf/test_nodx_rnamod.config rename to conf/test_nobc_nodx_rnamod.config index 8f3e51e4..f8e5b86b 100644 --- a/conf/test_nodx_rnamod.config +++ b/conf/test_nobc_nodx_rnamod.config @@ -20,6 +20,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_rnamod.csv' fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.fa' gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.gtf' + skip_basecalling = true protocol = 'directRNA' run_nanolyse = true skip_bigbed = true diff --git a/conf/test_nodx_stringtie.config b/conf/test_nobc_nodx_stringtie.config similarity index 97% rename from
conf/test_nodx_stringtie.config rename to conf/test_nobc_nodx_stringtie.config index 0c6aa263..d81c9ada 100644 --- a/conf/test_nodx_stringtie.config +++ b/conf/test_nobc_nodx_stringtie.config @@ -21,6 +21,7 @@ params { fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa' gtf = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf' protocol = 'directRNA' + skip_basecalling = true skip_demultiplexing = true skip_fusion_analysis= true skip_modification_analysis=true diff --git a/conf/test_nodx_vc.config b/conf/test_nobc_nodx_vc.config similarity index 97% rename from conf/test_nodx_vc.config rename to conf/test_nobc_nodx_vc.config index f347c293..cd9f4e10 100644 --- a/conf/test_nodx_vc.config +++ b/conf/test_nobc_nodx_vc.config @@ -19,6 +19,7 @@ params { // Input data to skip demultiplexing and variant call input = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_vc.csv' fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' + skip_basecalling = true protocol = 'DNA' skip_quantification = true skip_demultiplexing = true diff --git a/conf/test_withpull.config b/conf/test_withpull.config new file mode 100644 index 00000000..93c97ba5 --- /dev/null +++ b/conf/test_withpull.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/nanoseq -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data to perform both basecalling and demultiplexing + input = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv' + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa' + protocol = 'cDNA' + flowcell = 'FLO-MIN106' + kit = 'SQK-DCS109' + barcode_kit = 'EXP-NBD103' + trim_barcodes=true + output_demultiplex_fast5 = true + run_nanolyse = true + skip_quantification = true + skip_fusion_analysis= true + skip_modification_analysis=true + + // This variable is just for reference and isnt actually required for the tests + // Files are downloaded and staged using the "GetTestData" process + input_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded/' +} diff --git a/modules/local/bambu.nf b/modules/local/bambu.nf index 777b0e11..d757c274 100644 --- a/modules/local/bambu.nf +++ b/modules/local/bambu.nf @@ -3,8 +3,8 @@ process BAMBU { conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bioconductor-bambu:3.0.8--r42hc247a5b_0' : - 'quay.io/biocontainers/bioconductor-bambu:3.0.8--r42hc247a5b_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-bambu:3.4.0--r43hf17093f_0' : + 'quay.io/biocontainers/bioconductor-bambu:3.4.0--r43hf17093f_0' }" input: tuple path(fasta), path(gtf) diff --git a/modules/local/blue-crab.nf b/modules/local/blue-crab.nf new file mode 100644 index 00000000..258b4698 --- /dev/null +++ b/modules/local/blue-crab.nf @@ -0,0 +1,29 @@ +process BLUE_CRAB { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::slow5tools==1.2.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/slow5tools:1.2.0--h56e2c18_1' : + 'quay.io/biocontainers/slow5tools:1.2.0--h56e2c18_1' }" + + input: + tuple val(meta), path(genome), path(gtf), path(fastq), path(bam), path(bai), path(pod5) + + output: + tuple val(meta), path(genome), path(gtf), path(fastq), path(bam), path(bai), path(blow5), emit: nanopolish_outputs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + blue-crab p2s $pod5 -o $blow5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + blue-crab: \$( blue-crab -V | tail -c 6 ) + END_VERSIONS + """ +} diff --git a/modules/local/dorado_aligner.nf b/modules/local/dorado_aligner.nf new file mode 100644 index 00000000..3fef036d --- /dev/null +++ b/modules/local/dorado_aligner.nf @@ -0,0 +1,25 @@ +process DORADO_ALIGNER { + tag "$meta.id" + label 'process_medium' + + container "docker.io/ontresearch/dorado" + + input: + tuple val(meta), path(mod_bam) + path fasta + + output: + tuple val(meta), path("aligned_sorted.bam"), path("*.bai") , emit: aligned_bam + path "versions.yml" , emit: versions + + script: + """ + dorado aligner --mm2-preset map-ont $fasta $mod_bam > aligned.bam && samtools sort aligned.bam -o 
aligned_sorted.bam && samtools index aligned_sorted.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//') + END_VERSIONS + """ +} + diff --git a/modules/local/dorado_basecaller.nf b/modules/local/dorado_basecaller.nf new file mode 100644 index 00000000..200e7bb1 --- /dev/null +++ b/modules/local/dorado_basecaller.nf @@ -0,0 +1,28 @@ +process DORADO_BASECALLER { + tag "$meta.id" + label 'process_medium' + + container "docker.io/ontresearch/dorado" + + input: + tuple val(meta), path(pod5_path) + val dorado_device + val dorado_model + + output: + tuple val(meta), path("basecall*") , emit: dorado_out + path "versions.yml" , emit: versions + + script: + def emit_args = (params.dorado_modification == null) ? " --emit-fastq > basecall.fastq && gzip basecall.fastq" : " --modified-bases $params.dorado_modification > basecall.bam" + """ + dorado download --model $dorado_model + dorado basecaller $dorado_model $pod5_path --device $dorado_device $emit_args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//') + END_VERSIONS + """ +} + diff --git a/modules/local/f5c_index_eventalign.nf b/modules/local/f5c_index_eventalign.nf new file mode 100644 index 00000000..dc241cc3 --- /dev/null +++ b/modules/local/f5c_index_eventalign.nf @@ -0,0 +1,37 @@ +process F5C_INDEX_EVENTALIGN { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::nanopolish==0.13.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/f5c:1.5--h56e2c18_1' : + 'quay.io/biocontainers/f5c:1.5--h56e2c18_1' }" + + input: + tuple val(meta), path(genome), path(gtf), path(fastqgz), path(bam), path(bai) + + output: + tuple val(meta), path(genome), path(gtf), path("*eventalign.txt"), path("*summary.txt"), emit: f5c_outputs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + sample_summary = "$meta.id" +"_summary.txt" + sample_eventalign = "$meta.id" +"_eventalign.txt" + fastq="$meta.id"+".fastq" + fast5 = "$meta.fast5" + fastqi="$fastq"+"*" + """ + gunzip -c $fastqgz > $fastq + f5c index -d $fast5 $fastq + echo $fastqi + f5c eventalign --reads $fastq --bam $bam --genome $genome --scale-events --signal-index --rna --min-mapq 0 --min-recalib-events 1 --summary $sample_summary --threads $task.cpus > $sample_eventalign + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanopolish: \$( f5c -V | tail -c 4) + END_VERSIONS + """ +} diff --git a/modules/local/fast5_to_pod5.nf b/modules/local/fast5_to_pod5.nf new file mode 100644 index 00000000..1b2e054d --- /dev/null +++ b/modules/local/fast5_to_pod5.nf @@ -0,0 +1,27 @@ +process FAST5_TO_POD5 { + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0" + container "docker.io/yuukiiwa/pod5:0.2.4" + + input: + tuple val(meta), path(input_path) + + output: + tuple val(meta), path("pod5/"), emit: pod5 + + when: + task.ext.when == null || task.ext.when + + script: + output_name = "pod5/converted.pod5" + """ + pod5 convert fast5 $input_path --output $output_name + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pod5: \$(echo \$(pod5 --version 2>&1) | sed -r 's/..............//') + END_VERSIONS + """ +} diff --git a/modules/local/get_test_data.nf b/modules/local/get_test_data.nf index b922af75..d2aa4d3c 100644 --- a/modules/local/get_test_data.nf +++ 
b/modules/local/get_test_data.nf @@ -4,15 +4,16 @@ process GET_TEST_DATA { container "docker.io/yuukiiwa/git:latest" output: - path "test-datasets/fast5/$barcoded/*" , emit: ch_input_fast5s_path + path "test-datasets/fast5/$barcoded/" , emit: ch_input_fast5_dir_path path "test-datasets/modification_fast5_fastq/", emit: ch_input_dir_path + path "test-datasets/pod5/" , emit: ch_pod5_dir_path path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded" : "barcoded" + barcoded = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ? "nonbarcoded_multi" : "barcoded_multi" """ git clone https://github.com/nf-core/test-datasets.git --branch nanoseq --single-branch diff --git a/modules/local/gtf2bed.nf b/modules/local/gtf2bed.nf index 6ae37e61..7aa63e54 100644 --- a/modules/local/gtf2bed.nf +++ b/modules/local/gtf2bed.nf @@ -7,7 +7,7 @@ process GTF2BED { 'quay.io/biocontainers/perl:5.26.2' }" input: - tuple path(gtf), val(name) + tuple val(name), path(gtf) output: tuple path('*.bed'), val(name), emit: gtf_bed diff --git a/modules/local/modkit_pileup.nf b/modules/local/modkit_pileup.nf new file mode 100644 index 00000000..02a5d2e8 --- /dev/null +++ b/modules/local/modkit_pileup.nf @@ -0,0 +1,29 @@ +process MODKIT_PILEUP { + tag "$meta.id" + label 'process_high_memory' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ont-modkit:0.4.0--h5c23e0d_0' : + 'quay.io/biocontainers/ont-modkit:0.4.0--h5c23e0d_0' }" + + input: + tuple val(meta), path(aligned_mod_bam), path(bai) + + output: + tuple val(meta), path(bedmethyl) , emit: bedmethyl + path "versions.yml" , emit: versions + + script: + bedmethyl = "$meta.id" +".bed" + """ + modkit pileup $aligned_mod_bam $bedmethyl --threads $task.cpus + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + modkit: \$(echo \$(modkit --version 2>&1) | sed -r 's/.{81}//') + END_VERSIONS + + + """ +} + diff --git a/modules/local/nanopolish_index_eventalign.nf b/modules/local/nanopolish_index_eventalign.nf index 0057ffb2..8379bc9c 100644 --- a/modules/local/nanopolish_index_eventalign.nf +++ b/modules/local/nanopolish_index_eventalign.nf @@ -20,7 +20,7 @@ process NANOPOLISH_INDEX_EVENTALIGN { script: sample_summary = "$meta.id" +"_summary.txt" sample_eventalign = "$meta.id" +"_eventalign.txt" - fast5 = "$meta.nanopolish_fast5" + fast5 = "$meta.fast5" """ nanopolish index -d $fast5 $fastq nanopolish eventalign --reads $fastq --bam $bam --genome $genome --scale-events --signal-index --summary $sample_summary --threads $task.cpus > $sample_eventalign diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 4eac940f..24048222 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -19,7 +19,7 @@ process SAMPLESHEET_CHECK { task.ext.when == null || task.ext.when script: // This script is bundled with the pipeline, in nf-core/nanoseq/bin/ - updated_path = workflow.profile.contains('test_nodx_rnamod') ? "$input_path" : "not_changed" + updated_path = (workflow.profile.contains('test_bc_nodx') || workflow.profile.contains('rnamod')) ?
"$input_path" : "not_changed" """ check_samplesheet.py \\ $samplesheet \\ diff --git a/modules/nf-core/nanoplot/main.nf b/modules/nf-core/nanoplot/main.nf index 9706bb87..92211089 100644 --- a/modules/nf-core/nanoplot/main.nf +++ b/modules/nf-core/nanoplot/main.nf @@ -24,10 +24,12 @@ process NANOPLOT { def args = task.ext.args ?: '' def input_file = ("$ontfile".endsWith(".fastq.gz")) ? "--fastq ${ontfile}" : ("$ontfile".endsWith(".txt")) ? "--summary ${ontfile}" : '' + def prefix = task.ext.prefix ?: "${meta.id}" """ NanoPlot \\ $args \\ -t $task.cpus \\ + -p $prefix \\ $input_file cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 8b5f1c0d..1bb89fc4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,15 +19,21 @@ params { gtf = null - // Options: Demultiplexing + // Options: Basecalling and Demultiplexing input_path = null + input_path_file_type = 'fast5' + bedmethyl_out = false + flowcell = null + kit = null barcode_kit = null barcode_both_ends = false trim_barcodes = false - gpu_device = 'auto' - gpu_cluster_options = null + dorado_model = null + dorado_modification = null + dorado_device = 'cuda:all' qcat_min_score = 60 qcat_detect_middle = false + skip_basecalling = false skip_demultiplexing = false // Options: Raw read cleaning @@ -221,12 +227,16 @@ profiles { executor.cpus = 16 executor.memory = 60.GB } + test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } - test { includeConfig 'conf/test.config' } - test_nodx_stringtie { includeConfig 'conf/test_nodx_stringtie.config' } - test_nodx_noaln { includeConfig 'conf/test_nodx_noaln.config' } - test_nodx_vc { includeConfig 'conf/test_nodx_vc.config' } - test_nodx_rnamod { includeConfig 'conf/test_nodx_rnamod.config' } + test_bc_nodx { includeConfig 'conf/test_bc_nodx.config' } + test_bc_nodx_dnamod { includeConfig 'conf/test_bc_nodx_dnamod.config' } + test_nobc_dx { includeConfig 'conf/test_nobc_dx.config' } +
test_nobc_nodx_stringtie { includeConfig 'conf/test_nobc_nodx_stringtie.config' } + test_nobc_nodx_noaln { includeConfig 'conf/test_nobc_nodx_noaln.config' } + test_nobc_nodx_vc { includeConfig 'conf/test_nobc_nodx_vc.config' } + test_nobc_nodx_noaln_vc { includeConfig 'conf/test_nobc_nodx_noaln_vc.config' } + test_nobc_nodx_rnamod { includeConfig 'conf/test_nobc_nodx_rnamod.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index aa9e1bb2..86494af5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -81,17 +81,35 @@ "fa_icon": "fas fa-barcode", "description": "Trim barcodes from the output sequences in the FastQ files from basecaller." }, - "gpu_device": { + "kit": { "type": "string", - "default": "auto", - "description": "Device specified in GPU mode using '--device'.", + "default": "", + "description": "sequencing kit.", "fa_icon": "fas fa-fish" }, - "gpu_cluster_options": { + "flowcell": { "type": "string", - "description": "Cluster options required to use GPU resources (e.g. 
'--part=gpu --gres=gpu:1').", + "default": "", + "description": "Flowcell used for sequencing (e.g. 'FLO-MIN106').", "fa_icon": "fas fa-fish" }, + "dorado_device": { + "type": "string", + "default": "cuda:all", + "description": "Device specified using '--device'.", + "fa_icon": "fas fa-fish" + }, + "dorado_model": { + "type": "string", + "default": "", + "description": "Dorado model used for basecalling (e.g. 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0').", + "fa_icon": "fas fa-fish" + }, + "skip_basecalling": { + "type": "boolean", + "description": "Skip basecalling with dorado.", + "fa_icon": "fas fa-fast-forward" + }, "qcat_min_score": { "type": "integer", "default": 60, diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 16560c5f..8d6e76de 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -28,7 +28,7 @@ def get_sample_info(LinkedHashMap row) { def meta = [:] meta.id = row.sample meta.barcode = row.barcode - meta.nanopolish_fast5 = row.nanopolish_fast5 + meta.fast5 = row.fast5 input_file = row.reads //? file(row.reads, checkIfExists: true) : null fastq_meta = [ meta, input_file ] diff --git a/subworkflows/local/rna_modifications_xpore_m6anet.nf b/subworkflows/local/rna_modifications_xpore_m6anet.nf index 5c35cc73..6ded8793 100644 --- a/subworkflows/local/rna_modifications_xpore_m6anet.nf +++ b/subworkflows/local/rna_modifications_xpore_m6anet.nf @@ -2,7 +2,9 @@ * RNA MODIFICATION DETECTION WITH XPORE AND M6ANET */ -include { NANOPOLISH_INDEX_EVENTALIGN } from '../../modules/local/nanopolish_index_eventalign' + +// include { F5C_INDEX_EVENTALIGN } from '../../modules/local/f5c_index_eventalign' // Disabled: f5c works in another pipeline but fails in this one; the cause has not been identified yet +include { NANOPOLISH_INDEX_EVENTALIGN } from '../../modules/local/nanopolish_index_eventalign' // nanopolish is used for eventalign instead include { XPORE_DATAPREP } from '../../modules/local/xpore_dataprep' include { XPORE_DIFFMOD } from '../../modules/local/xpore_diffmod' include { M6ANET_DATAPREP } from
'../../modules/local/m6anet_dataprep' @@ -10,14 +12,14 @@ include { M6ANET_INFERENCE } from '../../modules/local/m6anet_inference' workflow RNA_MODIFICATION_XPORE_M6ANET { take: - ch_nanopolish_bam_fast5 + ch_bam_fast5 main: /* * Align current signals to reference with Nanopolish */ - NANOPOLISH_INDEX_EVENTALIGN { ch_nanopolish_bam_fast5 } + NANOPOLISH_INDEX_EVENTALIGN { ch_bam_fast5 } ch_nanopolish_outputs = NANOPOLISH_INDEX_EVENTALIGN.out.nanopolish_outputs nanopolish_version = NANOPOLISH_INDEX_EVENTALIGN.out.versions diff --git a/workflows/nanoseq.nf b/workflows/nanoseq.nf index 6303f6f7..0369dd27 100644 --- a/workflows/nanoseq.nf +++ b/workflows/nanoseq.nf @@ -64,7 +64,11 @@ if (!params.skip_demultiplexing) { if (params.barcode_kit && qcatBarcodeKitList.contains(params.barcode_kit)) { if (params.input_path) { - ch_input_path = Channel.fromPath(params.input_path, checkIfExists: true) + if (workflow.profile.contains('test')){ + ch_input_path = params.input_path + } else { + ch_input_path = Channel.fromPath(params.input_path, checkIfExists: true) + } } else { exit 1, "Please specify a valid input fastq file to perform demultiplexing!" } @@ -120,6 +124,11 @@ ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multi include { GET_TEST_DATA } from '../modules/local/get_test_data' include { GET_NANOLYSE_FASTA } from '../modules/local/get_nanolyse_fasta' +include { FAST5_TO_POD5 } from '../modules/local/fast5_to_pod5' +include { DORADO_BASECALLER } from '../modules/local/dorado_basecaller' +include { DORADO_ALIGNER } from '../modules/local/dorado_aligner' +include { MODKIT_PILEUP } from '../modules/local/modkit_pileup' +include { GTF2BED } from '../modules/local/gtf2bed' include { BAM_RENAME } from '../modules/local/bam_rename' include { BAMBU } from '../modules/local/bambu' include { MULTIQC } from '../modules/local/multiqc' @@ -146,6 +155,7 @@ include { QCAT } from '../modules/nf-core/qcat/main' include { NANOLYSE } from '../modules/nf-core/nanolyse/main' include { CUSTOM_GETCHROMSIZES } from '../modules/nf-core/custom/getchromsizes/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' /* * SUBWORKFLOW: Consisting entirely of nf-core/modules @@ -169,11 +179,21 @@ workflow NANOSEQ{ // Pre-download test-dataset to get files for '--input_path' parameter // Nextflow is unable to recursively download directories via HTTPS if (workflow.profile.contains('test') && !workflow.profile.contains('vc')) { - if (!params.skip_modification_analysis) { + if (!params.skip_basecalling || !params.skip_modification_analysis) { if (!isOffline()) { GET_TEST_DATA () - GET_TEST_DATA.out.ch_input_dir_path - .set { ch_input_path } + if (params.skip_modification_analysis) { + if (params.bedmethyl_out){ + GET_TEST_DATA.out.ch_pod5_dir_path + .set { ch_input_path } + } else { + GET_TEST_DATA.out.ch_input_fast5_dir_path + .set { ch_input_path } + } + } else { + GET_TEST_DATA.out.ch_input_dir_path + .set { ch_input_path } + } } else { exit 1, "NXF_OFFLINE=true or -offline has been set so cannot download and run any test dataset!" 
} @@ -192,6 +212,7 @@ workflow NANOSEQ{ } } + // Create empty software versions channel to mix ch_software_versions = Channel.empty() @@ -201,38 +222,110 @@ workflow NANOSEQ{ INPUT_CHECK ( ch_input, ch_input_path ) .set { ch_sample } - if (!params.skip_demultiplexing) { - - // Create barcode channel - ch_barcode_kit = Channel.from(params.barcode_kit) + if (!params.skip_basecalling) { + if (params.input_path_file_type == 'fast5'){ + if (!params.skip_demultiplexing) { + ch_input_path + .map { it -> [ [id:'undemultiplexed'], it ] } + .set { ch_fast5_dir } + } else { + ch_sample + .set { ch_fast5_dir } + } - // Map ch_undemultiplexed_fastq - ch_input_path - .map { it -> [ [id:'undemultiplexed'], it ] } - .set { ch_undemultiplexed_fastq } + /* + * MODULE: Convert fast5 to pod5 + */ + FAST5_TO_POD5 ( ch_fast5_dir ) + ch_pod5 = FAST5_TO_POD5.out.pod5 + } else { + if (!params.skip_demultiplexing) { + ch_input_path + .map { it -> [ [id:'undemultiplexed'], it ] } + .set { ch_pod5 } + } else { + ch_sample + .set { ch_pod5 } + } + } /* - * MODULE: Demultipexing using qcat + * MODULE: Basecalling and demultiplexing using Dorado */ - QCAT ( ch_undemultiplexed_fastq , ch_barcode_kit ) - QCAT.out.reads - .map { it -> it[1] } - .flatten() - .map { it -> [ it.baseName.substring(0,it.baseName.lastIndexOf('.')), it ] } - .join(ch_sample.map{ meta, empty -> [meta.barcode, meta] }, by: [0] ) - .map { it -> [ it[2], it[1] ] } - .set { ch_fastq } // [ meta, .fastq.qz ] - ch_software_versions = ch_software_versions.mix(QCAT.out.versions.ifEmpty(null)) - } else { - if (!params.skip_alignment || !params.skip_fusion_analysis) { - ch_sample - .map { it -> if (it[1].toString().endsWith('.gz')) [ it[0], it[1] ] } - .set { ch_fastq } + + DORADO_BASECALLER ( ch_pod5, params.dorado_device, params.dorado_model ) + ch_software_versions = ch_software_versions.mix(DORADO_BASECALLER.out.versions.ifEmpty(null)) + if (!params.bedmethyl_out) { + if (!params.skip_demultiplexing) { + + /* + * MODULE: 
Demultiplexing using qcat + */ + ch_barcode_kit = Channel.from(params.barcode_kit) + + /* + * MODULE: Demultiplexing using qcat + */ + QCAT ( DORADO_BASECALLER.out.dorado_out , ch_barcode_kit ) + QCAT.out.reads + .map { it -> it[1] } + .flatten() + .map { it -> [ it.baseName.substring(0,it.baseName.lastIndexOf('.')), it ] } + .join(ch_sample.map{ meta, empty -> [meta.barcode, meta] }, by: [0] ) + .map { it -> [ it[2], it[1] ] } + .set { ch_fastq } // [ meta, .fastq.gz ] + ch_software_versions = ch_software_versions.mix(QCAT.out.versions.ifEmpty(null)) + + } else { + DORADO_BASECALLER.out.dorado_out + .set { ch_fastq } + } } else { ch_fastq = Channel.empty() + ch_fasta = Channel.empty() + DORADO_ALIGNER( DORADO_BASECALLER.out.dorado_out, params.fasta ) + MODKIT_PILEUP ( DORADO_ALIGNER.out.aligned_bam ) + } + + } else { + + if (!params.skip_demultiplexing) { + + /* + * MODULE: Demultiplexing using qcat + */ + ch_barcode_kit = Channel.from(params.barcode_kit) + + // Map ch_undemultiplexed_fastq + ch_input_path + .map { it -> [ [id:'undemultiplexed'], it ] } + .set { ch_undemultiplexed_fastq } + + /* + * MODULE: Demultiplexing using qcat + */ + QCAT ( ch_undemultiplexed_fastq , ch_barcode_kit ) + QCAT.out.reads + .map { it -> it[1] } + .flatten() + .map { it -> [ it.baseName.substring(0,it.baseName.lastIndexOf('.')), it ] } + .join(ch_sample.map{ meta, empty -> [meta.barcode, meta] }, by: [0] ) + .map { it -> [ it[2], it[1] ] } + .set { ch_fastq } // [ meta, .fastq.gz ] + ch_software_versions = ch_software_versions.mix(QCAT.out.versions.ifEmpty(null)) + + } else { + if (!params.skip_alignment || !params.skip_fusion_analysis) { + ch_sample + .map { it -> if (it[1].toString().endsWith('.gz')) [ it[0], it[1] ] } + .set { ch_fastq } + } else { + ch_fastq = Channel.empty() + } } } + if (params.run_nanolyse) { if (!params.nanolyse_fasta) { if (!isOffline()) { @@ -269,19 +362,26 @@ workflow NANOSEQ{ ch_fastqc_multiqc = QCFASTQ_NANOPLOT_FASTQC.out.fastqc_multiqc.ifEmpty([]) } - 
ch_samtools_multiqc = Channel.empty() - if (!params.skip_alignment) { - + if (!params.bedmethyl_out){ ch_fasta = Channel.from( [id:'reference'], fasta ).collect() - /* - * SUBWORKFLOW: Make chromosome size file and covert GTF to BED12 - */ + /* + * SUBWORKFLOW: Make chromosome size file and convert GTF to BED12 + */ CUSTOM_GETCHROMSIZES( ch_fasta ) ch_chr_sizes = CUSTOM_GETCHROMSIZES.out.sizes ch_fai = CUSTOM_GETCHROMSIZES.out.fai ch_software_versions = ch_software_versions.mix(CUSTOM_GETCHROMSIZES.out.versions.first().ifEmpty(null)) + // will add the following in when nf-core/modules/minimap2/align supports junction bed input + //GTF2BED ( ch_chr_sizes ) + //ch_gtf_bed = GTF2BED.out.gtf_bed + //gtf2bed_version = GTF2BED.out.versions + } + + ch_samtools_multiqc = Channel.empty() + if (!params.skip_alignment) { + if (params.aligner == 'minimap2') { /* @@ -304,25 +404,6 @@ workflow NANOSEQ{ ch_software_versions = ch_software_versions.mix(ALIGN_GRAPHMAP2.out.samtools_version.first().ifEmpty(null)) } - if (params.call_variants && params.protocol == 'DNA') { - - /* - * SUBWORKFLOW: Short variant calling - */ - if (!params.skip_vc) { - SHORT_VARIANT_CALLING ( ch_sorted_bam, ch_sorted_bai, ch_fasta, ch_fai ) - ch_software_versions = ch_software_versions.mix(SHORT_VARIANT_CALLING.out.ch_versions.first().ifEmpty(null)) - } - - /* - * SUBWORKFLOW: Structural variant calling - */ - if (!params.skip_sv) { - STRUCTURAL_VARIANT_CALLING ( ch_sorted_bam, ch_sorted_bai, ch_fasta, ch_fai ) - ch_software_versions = ch_software_versions.mix(STRUCTURAL_VARIANT_CALLING.out.ch_versions.first().ifEmpty(null)) - } - } - ch_bedtools_version = Channel.empty() if (!params.skip_bigwig) { @@ -354,6 +435,29 @@ workflow NANOSEQ{ ch_sorted_bam = BAM_RENAME.out.bam } + if (params.call_variants && params.protocol == 'DNA') { + + SAMTOOLS_INDEX ( ch_sorted_bam ) + ch_sorted_bai = SAMTOOLS_INDEX.out.bai + samtools_version = SAMTOOLS_INDEX.out.versions + + /* + * SUBWORKFLOW: Short variant calling + */ 
+ if (!params.skip_vc) { + SHORT_VARIANT_CALLING ( ch_sorted_bam, ch_sorted_bai, ch_fasta, ch_fai ) + ch_software_versions = ch_software_versions.mix(SHORT_VARIANT_CALLING.out.ch_versions.first().ifEmpty(null)) + } + + /* + * SUBWORKFLOW: Structural variant calling + */ + if (!params.skip_sv) { + STRUCTURAL_VARIANT_CALLING ( ch_sorted_bam, ch_sorted_bai, ch_fasta, ch_fai ) + ch_software_versions = ch_software_versions.mix(STRUCTURAL_VARIANT_CALLING.out.ch_versions.first().ifEmpty(null)) + } + } + ch_featurecounts_gene_multiqc = Channel.empty() ch_featurecounts_transcript_multiqc = Channel.empty() if (!params.skip_quantification && (params.protocol == 'cDNA' || params.protocol == 'directRNA')) {