From 88bb65d602ab1434a82b0b9406556cab674194fe Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 13:16:54 +0000 Subject: [PATCH 01/19] add subsampling --- modules.json | 5 + modules/nf-core/seqtk/sample/environment.yml | 5 + modules/nf-core/seqtk/sample/main.nf | 58 +++++++++++ modules/nf-core/seqtk/sample/meta.yml | 52 ++++++++++ .../nf-core/seqtk/sample/tests/main.nf.test | 80 ++++++++++++++++ .../seqtk/sample/tests/main.nf.test.snap | 95 +++++++++++++++++++ .../seqtk/sample/tests/standard.config | 6 ++ modules/nf-core/seqtk/sample/tests/tags.yml | 2 + nextflow.config | 2 +- nextflow_schema.json | 7 ++ workflows/seqinspector.nf | 17 +++- 11 files changed, 327 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/seqtk/sample/environment.yml create mode 100644 modules/nf-core/seqtk/sample/main.nf create mode 100644 modules/nf-core/seqtk/sample/meta.yml create mode 100644 modules/nf-core/seqtk/sample/tests/main.nf.test create mode 100644 modules/nf-core/seqtk/sample/tests/main.nf.test.snap create mode 100644 modules/nf-core/seqtk/sample/tests/standard.config create mode 100644 modules/nf-core/seqtk/sample/tests/tags.yml diff --git a/modules.json b/modules.json index 8e632d5..f01f2a6 100644 --- a/modules.json +++ b/modules.json @@ -14,6 +14,11 @@ "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] + }, + "seqtk/sample": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/seqtk/sample/environment.yml b/modules/nf-core/seqtk/sample/environment.yml new file mode 100644 index 0000000..693aa5c --- /dev/null +++ b/modules/nf-core/seqtk/sample/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/sample/main.nf b/modules/nf-core/seqtk/sample/main.nf new file mode 100644 index 0000000..ea9b839 --- /dev/null +++ 
b/modules/nf-core/seqtk/sample/main.nf @@ -0,0 +1,58 @@ +process SEQTK_SAMPLE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : + 'biocontainers/seqtk:1.4--he4a0461_1' }" + + input: + tuple val(meta), path(reads), val(sample_size) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (!(args ==~ /.*-s[0-9]+.*/)) { + args += " -s100" + } + if ( !sample_size ) { + error "SEQTK/SAMPLE must have a sample_size value included" + } + """ + printf "%s\\n" $reads | while read f; + do + seqtk \\ + sample \\ + $args \\ + \$f \\ + $sample_size \\ + | gzip --no-name > ${prefix}_\$(basename \$f) + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + echo "" | gzip > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/seqtk/sample/meta.yml b/modules/nf-core/seqtk/sample/meta.yml new file mode 100644 index 0000000..42f67d8 --- /dev/null +++ b/modules/nf-core/seqtk/sample/meta.yml @@ -0,0 +1,52 @@ +name: seqtk_sample +description: Subsample reads from FASTQ files +keywords: + - sample + - fastx + - reads +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in + the FASTA or FASTQ format. Seqtk sample command subsamples sequences. 
+ homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] + identifier: biotools:seqtk +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files + pattern: "*.{fastq.gz}" + - sample_size: + type: integer + description: Number of reads to sample. +output: + - reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fastq.gz": + type: file + description: Subsampled FastQ files + pattern: "*.{fastq.gz}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kaurravneet4123" + - "@sidorov-si" + - "@adamrtalbot" +maintainers: + - "@kaurravneet4123" + - "@sidorov-si" + - "@adamrtalbot" diff --git a/modules/nf-core/seqtk/sample/tests/main.nf.test b/modules/nf-core/seqtk/sample/tests/main.nf.test new file mode 100644 index 0000000..c121c9d --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/main.nf.test @@ -0,0 +1,80 @@ +nextflow_process { + + name "Test Process SEQTK_SAMPLE" + script "modules/nf-core/seqtk/sample/main.nf" + process "SEQTK_SAMPLE" + config "./standard.config" + + tag "modules" + tag "modules_nfcore" + tag "seqtk" + tag "seqtk/sample" + + test("sarscov2_sample_singleend_fqgz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2_sample_pairedend_fqgz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2_sample_singlend_fqgz_stub") { + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 50 + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/seqtk/sample/tests/main.nf.test.snap b/modules/nf-core/seqtk/sample/tests/main.nf.test.snap new file mode 100644 index 0000000..a9fec3c --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "sarscov2_sample_singlend_fqgz_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:45.902956" + }, + "sarscov2_sample_pairedend_fqgz": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:37.679954" + }, + "sarscov2_sample_singleend_fqgz": { + "content": [ + { + "0": [ + [ 
+ { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "1": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.sampled_test_1.fastq.gz:md5,e5f44fafd7351c5abb9925a075132941" + ] + ], + "versions": [ + "versions.yml:md5,0529f2d163df9e2cd2ae8254dfb63806" + ] + } + ], + "timestamp": "2024-02-22T15:58:29.474491" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/sample/tests/standard.config b/modules/nf-core/seqtk/sample/tests/standard.config new file mode 100644 index 0000000..b2dd4b9 --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/standard.config @@ -0,0 +1,6 @@ +process { + withName: SEQTK_SAMPLE { + ext.args = '-s100' + ext.prefix = { "${meta.id}.sampled" } + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/sample/tests/tags.yml b/modules/nf-core/seqtk/sample/tests/tags.yml new file mode 100644 index 0000000..e5d113b --- /dev/null +++ b/modules/nf-core/seqtk/sample/tests/tags.yml @@ -0,0 +1,2 @@ +seqtk/sample: + - "modules/nf-core/seqtk/sample/**" diff --git a/nextflow.config b/nextflow.config index 50c1ecb..10bcb89 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - + sample_size = null // References genome = null fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 88fd607..f87e525 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -23,8 +23,15 @@ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/seqinspector/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, + "sample_size": { + "type": "integer", + "default": null, + "description": "Subset this number of reads.", + "help_text": "Samples will be subsetted to this number of reads. If null, no subsampling will be performed." + }, "outdir": { "type": "string", + "default": null, "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index ea62811..70b8967 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -3,6 +3,8 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ + +include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/sample/main' include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' @@ -30,11 +32,24 @@ workflow SEQINSPECTOR { ch_multiqc_extra_files = Channel.empty() ch_multiqc_reports = Channel.empty() + // + // MODULE: Run Seqkit sample to perform subsampling + // + if (params.sample_size) { + ch_sample_sized = SEQTK_SAMPLE(ch_samplesheet.map { + meta, reads -> [meta, reads, params.sample_size] + }).reads + ch_versions = ch_versions.mix(SEQTK_SAMPLE.out.versions.first()) + } else { + // No do subsample + ch_sample_sized = ch_samplesheet + } + // // MODULE: Run FastQC // FASTQC ( - ch_samplesheet + ch_sample_sized ) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) From dbd64bfa4f311a9001f48a89df56a84581193d20 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 14:03:06 +0000 Subject: [PATCH 02/19] add subsampling nf-test --- tests/NovaSeq6000.main_subsample.nf.test | 50 +++++++++++++++++++ 
.../NovaSeq6000.main_subsample.nf.test.config | 8 +++ 2 files changed, 58 insertions(+) create mode 100644 tests/NovaSeq6000.main_subsample.nf.test create mode 100644 tests/NovaSeq6000.main_subsample.nf.test.config diff --git a/tests/NovaSeq6000.main_subsample.nf.test b/tests/NovaSeq6000.main_subsample.nf.test new file mode 100644 index 0000000..0919bc2 --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test @@ -0,0 +1,50 @@ +nextflow_pipeline { + + name "Test Workflow main.nf on NovaSeq6000 data" + script "../main.nf" + tag "seqinspector" + tag "PIPELINE" + + test("NovaSeq6000 data test") { + + when { + config "./NovaSeq6000.main_subsample.nf.test.config" + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_software_versions.txt"), + + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_general_stats.txt"), + path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_software_versions.txt"), + + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_general_stats.txt"), + path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_software_versions.txt"), + + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_fastqc.txt"), + 
path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_general_stats.txt"), + path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_software_versions.txt"), + + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_general_stats.txt"), + path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_software_versions.txt"), + ).match() + }, + ) + } + } +} diff --git a/tests/NovaSeq6000.main_subsample.nf.test.config b/tests/NovaSeq6000.main_subsample.nf.test.config new file mode 100644 index 0000000..acda74d --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test.config @@ -0,0 +1,8 @@ +// Load the basic test config +includeConfig 'nextflow.config' + +// Load the correct samplesheet for that test +params { + input = params.pipelines_testdata_base_path + 'seqinspector/testdata/NovaSeq6000/samplesheet.csv' + sample_size = 90 +} From 44fb55455961c1d1cc56f04341a7d2a382dfbc42 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 14:09:57 +0000 Subject: [PATCH 03/19] add snapshots --- tests/NovaSeq6000.main_subsample.nf.test.snap | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/NovaSeq6000.main_subsample.nf.test.snap diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap new file mode 100644 index 0000000..d1dbe4a --- /dev/null +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -0,0 +1,31 @@ +{ + "NovaSeq6000 data test": { + "content": [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,aba942d1e6996b579f19798e5673f514", + "multiqc_general_stats.txt:md5,c4f40f2313aadc38619e7487226e8d93", + "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + 
"multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", + "multiqc_general_stats.txt:md5,d5d73d2888cd9895e5f116e5b869e73c", + "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,ff996e1d3dc4a46e0c9535e54d51ccab", + "multiqc_general_stats.txt:md5,d01ec30a262b69bc5749b0ed108a950a", + "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,3df36ecfe76b25b0c22dcda84bce2b3b", + "multiqc_general_stats.txt:md5,4dffc0d1169c49adde819d4467ffb775", + "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_fastqc.txt:md5,ce61b4ce4b1d76ec3f20de3bf0c9ec7f", + "multiqc_general_stats.txt:md5,05f8dfeea9fca7f4c16ba9d553af4c69", + "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686" + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-28T14:05:44.868455454" + } +} \ No newline at end of file From d781db1c98312bafc9419955aa43a26faa9dd076 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 14:20:03 +0000 Subject: [PATCH 04/19] better name --- tests/NovaSeq6000.main_subsample.nf.test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/NovaSeq6000.main_subsample.nf.test b/tests/NovaSeq6000.main_subsample.nf.test index 0919bc2..72851f9 100644 --- a/tests/NovaSeq6000.main_subsample.nf.test +++ b/tests/NovaSeq6000.main_subsample.nf.test @@ -1,11 +1,11 @@ nextflow_pipeline { - name "Test Workflow main.nf on NovaSeq6000 data" + name "Test Workflow main.nf on NovaSeq6000 data sample size 90" script "../main.nf" tag "seqinspector" tag "PIPELINE" - test("NovaSeq6000 data test") { + test("NovaSeq6000 data test sample size") { when { config "./NovaSeq6000.main_subsample.nf.test.config" From 
ab30119cb07a2df13f5c6606747485796116434c Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 14:52:09 +0000 Subject: [PATCH 05/19] update docs --- docs/output.md | 13 +++++++++++++ docs/usage.md | 5 +++++ nextflow.config | 2 +- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index e14c3ad..a825751 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,10 +10,23 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- [seqkit](#seqkit) - Subsample a specific number of reads per sample - [FastQC](#fastqc) - Raw read QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +### Seqkit + +
+Output files +- `seqtk/` + - `*.fastq.gz`: FastQ files after being subsampled to the `sample_size` value. +
+ +[SeqKit](https://bioinf.shenwei.me/seqkit/) samples sequences by number. For further reading and documentation see the [FastQC help pages](https://bioinf.shenwei.me/seqkit/usage/#sample). + ### FastQC
diff --git a/docs/usage.md b/docs/usage.md index d75e0fb..c11b270 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,6 +93,11 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +Optionally, the `sample_size` parameter allows you to subset a random number of reads to be analysed. +```bash +nextflow run nf-core/seqinspector --input ./samplesheet.csv --outdir ./results --sample_size 90 -profile docker +``` + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: diff --git a/nextflow.config b/nextflow.config index 10bcb89..38eb312 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null - sample_size = null + sample_size = 0 // References genome = null fasta = null From b8befe4b35f94132418c81640af710ab4af486fb Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 14:52:21 +0000 Subject: [PATCH 06/19] change default to 0 --- nextflow_schema.json | 4 ++-- workflows/seqinspector.nf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f87e525..edb18ea 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -25,9 +25,9 @@ }, "sample_size": { "type": "integer", - "default": null, + "default": 0, "description": "Subset this number of reads.", - "help_text": "Samples will be subsetted to this number of reads. If null, no subsampling will be performed." + "help_text": "Samples will be subsetted to this number of reads. 
If 0 (default), no subsampling will be performed." }, "outdir": { "type": "string", diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index 70b8967..91a699d 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -35,7 +35,7 @@ workflow SEQINSPECTOR { // // MODULE: Run Seqkit sample to perform subsampling // - if (params.sample_size) { + if (params.sample_size > 0 ) { ch_sample_sized = SEQTK_SAMPLE(ch_samplesheet.map { meta, reads -> [meta, reads, params.sample_size] }).reads From 135d6a35c15f0338a77dad2eaf5a7b3c9a144daa Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 15:15:17 +0000 Subject: [PATCH 07/19] update snapshot --- tests/MiSeq.main.nf.test.snap | 8 ++++---- tests/NovaSeq6000.main.nf.test.snap | 16 ++++++++-------- tests/NovaSeq6000.main_subsample.nf.test.snap | 14 +++++++------- tests/PromethION.main.nf.test.snap | 8 ++++---- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/MiSeq.main.nf.test.snap b/tests/MiSeq.main.nf.test.snap index 4613d52..d60f627 100644 --- a/tests/MiSeq.main.nf.test.snap +++ b/tests/MiSeq.main.nf.test.snap @@ -4,12 +4,12 @@ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,7b1b7fd457b60404768045b148d4c0a8", "multiqc_general_stats.txt:md5,5b28a83b14cb2fe88d084d08900ebdbf", - "multiqc_software_versions.txt:md5,a3698a2d32e8695c38d50e3d17de5fe3" + "multiqc_software_versions.txt:md5,49e3596d49ee49d967d3b6c363b486d5" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:18:10.3675973" + "timestamp": "2024-10-28T15:06:30.86052858" } } \ No newline at end of file diff --git a/tests/NovaSeq6000.main.nf.test.snap b/tests/NovaSeq6000.main.nf.test.snap index ee3c22b..5ac1e09 100644 --- a/tests/NovaSeq6000.main.nf.test.snap +++ b/tests/NovaSeq6000.main.nf.test.snap @@ -4,28 +4,28 @@ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", 
"multiqc_fastqc.txt:md5,3730f9046b20ac5c17a86db0a33f8d5d", "multiqc_general_stats.txt:md5,25abe0f6a35eb4a3b056fc3cf5c13732", - "multiqc_software_versions.txt:md5,a3698a2d32e8695c38d50e3d17de5fe3", + "multiqc_software_versions.txt:md5,49e3596d49ee49d967d3b6c363b486d5", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,8284e25ccc21041cf3b5a32eb6a51e78", "multiqc_general_stats.txt:md5,90ee35137492b80aab36ef67f72d8921", - "multiqc_software_versions.txt:md5,a3698a2d32e8695c38d50e3d17de5fe3", + "multiqc_software_versions.txt:md5,49e3596d49ee49d967d3b6c363b486d5", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,f38ffdc112c73af3a41ed15848a3761f", "multiqc_general_stats.txt:md5,d62a2fc39e674d98783d408791803148", - "multiqc_software_versions.txt:md5,a3698a2d32e8695c38d50e3d17de5fe3", + "multiqc_software_versions.txt:md5,49e3596d49ee49d967d3b6c363b486d5", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,7ff71ceb8ecdf086331047f8860c3347", "multiqc_general_stats.txt:md5,2f09b8f199ac40cf67ba50843cebd29c", - "multiqc_software_versions.txt:md5,a3698a2d32e8695c38d50e3d17de5fe3", + "multiqc_software_versions.txt:md5,49e3596d49ee49d967d3b6c363b486d5", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,519ff344a896ac369bba4d5c5b8be7b5", "multiqc_general_stats.txt:md5,6a1c16f068d7ba3a9225a17eb570ed9a", - "multiqc_software_versions.txt:md5,a3698a2d32e8695c38d50e3d17de5fe3" + "multiqc_software_versions.txt:md5,49e3596d49ee49d967d3b6c363b486d5" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:19:13.226135825" + "timestamp": "2024-10-28T15:07:59.897711748" } } \ No newline at end of file diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap index d1dbe4a..c37e51c 100644 --- 
a/tests/NovaSeq6000.main_subsample.nf.test.snap +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -1,31 +1,31 @@ { - "NovaSeq6000 data test": { + "NovaSeq6000 data test sample size": { "content": [ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,aba942d1e6996b579f19798e5673f514", - "multiqc_general_stats.txt:md5,c4f40f2313aadc38619e7487226e8d93", + "multiqc_general_stats.txt:md5,ad1ec9c64cbdb1131a26aeb6de51e31c", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", - "multiqc_general_stats.txt:md5,d5d73d2888cd9895e5f116e5b869e73c", + "multiqc_general_stats.txt:md5,c73c8d10568a56f6534d280fff701e60", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,ff996e1d3dc4a46e0c9535e54d51ccab", - "multiqc_general_stats.txt:md5,d01ec30a262b69bc5749b0ed108a950a", + "multiqc_general_stats.txt:md5,834e1868b887171cfda72029bbbe2d3f", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,3df36ecfe76b25b0c22dcda84bce2b3b", - "multiqc_general_stats.txt:md5,4dffc0d1169c49adde819d4467ffb775", + "multiqc_general_stats.txt:md5,274a001b007521970f14d68bd176e5be", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,ce61b4ce4b1d76ec3f20de3bf0c9ec7f", - "multiqc_general_stats.txt:md5,05f8dfeea9fca7f4c16ba9d553af4c69", + "multiqc_general_stats.txt:md5,d476ad59458a035a329605d5284b6012", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686" ], "meta": { "nf-test": "0.9.1", "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T14:05:44.868455454" + "timestamp": "2024-10-28T15:09:41.503580136" } } \ No 
newline at end of file diff --git a/tests/PromethION.main.nf.test.snap b/tests/PromethION.main.nf.test.snap index 026a8cd..1232678 100644 --- a/tests/PromethION.main.nf.test.snap +++ b/tests/PromethION.main.nf.test.snap @@ -4,12 +4,12 @@ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,1a4b472e13cadc770832b0e20d1de7b0", "multiqc_general_stats.txt:md5,409cefc7f17f95d176ced6032bf8fb32", - "multiqc_software_versions.txt:md5,a3698a2d32e8695c38d50e3d17de5fe3" + "multiqc_software_versions.txt:md5,49e3596d49ee49d967d3b6c363b486d5" ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.10.0" + "nf-test": "0.9.1", + "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T13:19:57.261730412" + "timestamp": "2024-10-28T15:10:20.186073387" } } \ No newline at end of file From 36d80d33da43045c2175ed376bf32d79906ec3ae Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 15:43:30 +0000 Subject: [PATCH 08/19] prettify docs --- docs/usage.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/usage.md b/docs/usage.md index c11b270..0cbd8f5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -94,6 +94,7 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). Optionally, the `sample_size` parameter allows you to subset a random number of reads to be analysed. 
+ ```bash nextflow run nf-core/seqinspector --input ./samplesheet.csv --outdir ./results --sample_size 90 -profile docker ``` From 96dc9f73b55dd448ec35cb16765d0a06034b7343 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 15:53:19 +0000 Subject: [PATCH 09/19] update changelog --- CHANGELOG.md | 1 + docs/output.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0b12de..bad5a09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#20](https://github.com/nf-core/seqinspector/pull/20) Use tags to generate group reports - [#13](https://github.com/nf-core/seqinspector/pull/13) Generate reports per run, per project and per lane. - [#49](https://github.com/nf-core/seqinspector/pull/49) Merge with template 3.0.2. +- [#50](https://github.com/nf-core/seqinspector/pull/50) Add an optional subsampling step. ### `Fixed` diff --git a/docs/output.md b/docs/output.md index a825751..aef433b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -25,7 +25,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-[SeqKit](https://bioinf.shenwei.me/seqkit/) samples sequences by number. For further reading and documentation see the [FastQC help pages](https://bioinf.shenwei.me/seqkit/usage/#sample). +[SeqKit](https://bioinf.shenwei.me/seqkit/) samples sequences by number. For further reading and documentation see the [Sekit sample](https://bioinf.shenwei.me/seqkit/usage/#sample). ### FastQC From 0fd6a95e677dd8beac4805875a12b2a0b464221d Mon Sep 17 00:00:00 2001 From: ngarcia Date: Mon, 28 Oct 2024 15:54:54 +0000 Subject: [PATCH 10/19] typo --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index aef433b..f2048f0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -25,7 +25,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d -[SeqKit](https://bioinf.shenwei.me/seqkit/) samples sequences by number. For further reading and documentation see the [Sekit sample](https://bioinf.shenwei.me/seqkit/usage/#sample). +[SeqKit](https://bioinf.shenwei.me/seqkit/) samples sequences by number. For further reading and documentation see the [Seqkit sample](https://bioinf.shenwei.me/seqkit/usage/#sample). 
### FastQC From a417ad39a0b972e5aec2c8b57570ddaa235f4bdb Mon Sep 17 00:00:00 2001 From: ngarcia Date: Tue, 29 Oct 2024 08:47:48 +0000 Subject: [PATCH 11/19] suggestions --- docs/output.md | 6 +++--- docs/usage.md | 4 ++-- nextflow_schema.json | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/output.md b/docs/output.md index f2048f0..2d4efb0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,12 +10,12 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [seqkit](#seqkit) - Subsample a specific number of reads per sample +- [Seqtk](#seqtk) - Subsample a specific number of reads per sample - [FastQC](#fastqc) - Raw read QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### Seqkit +### Seqtk
Output files @@ -25,7 +25,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-[SeqKit](https://bioinf.shenwei.me/seqkit/) samples sequences by number. For further reading and documentation see the [Seqkit sample](https://bioinf.shenwei.me/seqkit/usage/#sample). +[Seqtk](https://github.com/lh3/seqtk) samples sequences by number. ### FastQC diff --git a/docs/usage.md b/docs/usage.md index 0cbd8f5..31ab91e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -93,10 +93,10 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -Optionally, the `sample_size` parameter allows you to subset a random number of reads to be analysed. +Optionally, the `sample_size` parameter allows you to randomly subsample a fixed number of reads to be analysed. Note that it refers to an absolute number of reads. ```bash -nextflow run nf-core/seqinspector --input ./samplesheet.csv --outdir ./results --sample_size 90 -profile docker +nextflow run nf-core/seqinspector --input ./samplesheet.csv --outdir ./results --sample_size 1000000 -profile docker ``` ### Updating the pipeline diff --git a/nextflow_schema.json b/nextflow_schema.json index edb18ea..49742b2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -26,8 +26,8 @@ "sample_size": { "type": "integer", "default": 0, - "description": "Subset this number of reads.", - "help_text": "Samples will be subsetted to this number of reads. If 0 (default), no subsampling will be performed." + "description": "Take this number of reads as a subset.", + "help_text": "Choose the size of the subset, or 0 if no subsampling shall be performed. Note that it refers to an absolute number of reads."
}, "outdir": { "type": "string", From 801220f5f936a4be2a9fb6391e7e4cffa7c63b57 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Tue, 29 Oct 2024 09:14:24 +0000 Subject: [PATCH 12/19] switch to original reads for fastqc --- workflows/seqinspector.nf | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index 91a699d..a89bd5c 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -36,20 +36,28 @@ workflow SEQINSPECTOR { // MODULE: Run Seqkit sample to perform subsampling // if (params.sample_size > 0 ) { - ch_sample_sized = SEQTK_SAMPLE(ch_samplesheet.map { - meta, reads -> [meta, reads, params.sample_size] - }).reads + ch_sample_sized = ch_samplesheet.join( + SEQTK_SAMPLE( + ch_samplesheet.map { + meta, reads -> [meta, reads, params.sample_size] + } + ).reads, by: [0] + ) ch_versions = ch_versions.mix(SEQTK_SAMPLE.out.versions.first()) } else { // No do subsample - ch_sample_sized = ch_samplesheet + ch_sample_sized = ch_samplesheet.map { + meta, reads -> [meta, reads, []] + } } // // MODULE: Run FastQC // FASTQC ( - ch_sample_sized + ch_sample_sized.map { + meta, reads, subsampled -> [meta, reads] + } ) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) From e9ae94cb797751ca97f3f86a199fa19bfe74d3f0 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Tue, 29 Oct 2024 09:15:38 +0000 Subject: [PATCH 13/19] update snapshot --- tests/NovaSeq6000.main_subsample.nf.test.snap | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap index c37e51c..ee118c9 100644 --- a/tests/NovaSeq6000.main_subsample.nf.test.snap +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -2,30 +2,30 @@ "NovaSeq6000 data test sample size": { "content": [ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - 
"multiqc_fastqc.txt:md5,aba942d1e6996b579f19798e5673f514", - "multiqc_general_stats.txt:md5,ad1ec9c64cbdb1131a26aeb6de51e31c", + "multiqc_fastqc.txt:md5,3730f9046b20ac5c17a86db0a33f8d5d", + "multiqc_general_stats.txt:md5,25abe0f6a35eb4a3b056fc3cf5c13732", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", - "multiqc_general_stats.txt:md5,c73c8d10568a56f6534d280fff701e60", + "multiqc_fastqc.txt:md5,8284e25ccc21041cf3b5a32eb6a51e78", + "multiqc_general_stats.txt:md5,90ee35137492b80aab36ef67f72d8921", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,ff996e1d3dc4a46e0c9535e54d51ccab", - "multiqc_general_stats.txt:md5,834e1868b887171cfda72029bbbe2d3f", + "multiqc_fastqc.txt:md5,f38ffdc112c73af3a41ed15848a3761f", + "multiqc_general_stats.txt:md5,d62a2fc39e674d98783d408791803148", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,3df36ecfe76b25b0c22dcda84bce2b3b", - "multiqc_general_stats.txt:md5,274a001b007521970f14d68bd176e5be", + "multiqc_fastqc.txt:md5,7ff71ceb8ecdf086331047f8860c3347", + "multiqc_general_stats.txt:md5,2f09b8f199ac40cf67ba50843cebd29c", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,ce61b4ce4b1d76ec3f20de3bf0c9ec7f", - "multiqc_general_stats.txt:md5,d476ad59458a035a329605d5284b6012", + "multiqc_fastqc.txt:md5,519ff344a896ac369bba4d5c5b8be7b5", + "multiqc_general_stats.txt:md5,6a1c16f068d7ba3a9225a17eb570ed9a", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686" ], "meta": { "nf-test": "0.9.1", "nextflow": "24.04.4" }, - "timestamp": "2024-10-28T15:09:41.503580136" + 
"timestamp": "2024-10-29T09:15:03.224117679" } } \ No newline at end of file From afb2a61fc15b1f0cbefb3aa72b3e58f25b18b6f8 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Tue, 29 Oct 2024 11:14:12 +0000 Subject: [PATCH 14/19] back to the past --- workflows/seqinspector.nf | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index a89bd5c..972b2f3 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -36,18 +36,16 @@ workflow SEQINSPECTOR { // MODULE: Run Seqkit sample to perform subsampling // if (params.sample_size > 0 ) { - ch_sample_sized = ch_samplesheet.join( - SEQTK_SAMPLE( - ch_samplesheet.map { - meta, reads -> [meta, reads, params.sample_size] - } - ).reads, by: [0] - ) + ch_sample_sized = SEQTK_SAMPLE( + ch_samplesheet.map { + meta, reads -> [meta, reads, params.sample_size] + } + ).reads ch_versions = ch_versions.mix(SEQTK_SAMPLE.out.versions.first()) } else { // No do subsample ch_sample_sized = ch_samplesheet.map { - meta, reads -> [meta, reads, []] + meta, reads -> [meta, reads] } } @@ -56,7 +54,7 @@ workflow SEQINSPECTOR { // FASTQC ( ch_sample_sized.map { - meta, reads, subsampled -> [meta, reads] + meta, subsampled -> [meta, subsampled] } ) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip) From 80e49aa4ca09ea17483a7df007e6416fe868c13c Mon Sep 17 00:00:00 2001 From: ngarcia Date: Tue, 29 Oct 2024 13:35:36 +0000 Subject: [PATCH 15/19] update README --- README.md | 5 +++-- workflows/seqinspector.nf | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index f647af0..6cf36dc 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,9 @@ workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. --> -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. 
Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Subsample reads ([`Seqtk`](https://github.com/lh3/seqtk)) +2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +3. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index 972b2f3..2fbb234 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -33,7 +33,7 @@ workflow SEQINSPECTOR { ch_multiqc_reports = Channel.empty() // - // MODULE: Run Seqkit sample to perform subsampling + // MODULE: Run Seqtk sample to perform subsampling // if (params.sample_size > 0 ) { ch_sample_sized = SEQTK_SAMPLE( @@ -44,9 +44,7 @@ workflow SEQINSPECTOR { ch_versions = ch_versions.mix(SEQTK_SAMPLE.out.versions.first()) } else { // No do subsample - ch_sample_sized = ch_samplesheet.map { - meta, reads -> [meta, reads] - } + ch_sample_sized = ch_samplesheet } // From 3b2949609c67489610ebbfce498529ccb8842cdc Mon Sep 17 00:00:00 2001 From: ngarcia Date: Tue, 29 Oct 2024 14:09:35 +0000 Subject: [PATCH 16/19] add module config --- conf/modules.config | 4 ++++ tests/NovaSeq6000.main_subsample.nf.test.snap | 22 +++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c883822..d3c597b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,10 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] + withName: SEQTK_SAMPLE { + ext.args = '-s100' + } + withName: FASTQC { ext.args = '--quiet' } diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap index ee118c9..27277ff 100644 --- a/tests/NovaSeq6000.main_subsample.nf.test.snap +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -2,30 +2,30 @@ "NovaSeq6000 data test sample size": { "content": [ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,3730f9046b20ac5c17a86db0a33f8d5d", - "multiqc_general_stats.txt:md5,25abe0f6a35eb4a3b056fc3cf5c13732", + "multiqc_fastqc.txt:md5,aba942d1e6996b579f19798e5673f514", + "multiqc_general_stats.txt:md5,ad1ec9c64cbdb1131a26aeb6de51e31c", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,8284e25ccc21041cf3b5a32eb6a51e78", - "multiqc_general_stats.txt:md5,90ee35137492b80aab36ef67f72d8921", + "multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", + "multiqc_general_stats.txt:md5,c73c8d10568a56f6534d280fff701e60", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,f38ffdc112c73af3a41ed15848a3761f", - "multiqc_general_stats.txt:md5,d62a2fc39e674d98783d408791803148", + "multiqc_fastqc.txt:md5,ff996e1d3dc4a46e0c9535e54d51ccab", + "multiqc_general_stats.txt:md5,834e1868b887171cfda72029bbbe2d3f", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,7ff71ceb8ecdf086331047f8860c3347", - "multiqc_general_stats.txt:md5,2f09b8f199ac40cf67ba50843cebd29c", + "multiqc_fastqc.txt:md5,3df36ecfe76b25b0c22dcda84bce2b3b", + "multiqc_general_stats.txt:md5,274a001b007521970f14d68bd176e5be", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", 
"multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", - "multiqc_fastqc.txt:md5,519ff344a896ac369bba4d5c5b8be7b5", - "multiqc_general_stats.txt:md5,6a1c16f068d7ba3a9225a17eb570ed9a", + "multiqc_fastqc.txt:md5,ce61b4ce4b1d76ec3f20de3bf0c9ec7f", + "multiqc_general_stats.txt:md5,d476ad59458a035a329605d5284b6012", "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686" ], "meta": { "nf-test": "0.9.1", "nextflow": "24.04.4" }, - "timestamp": "2024-10-29T09:15:03.224117679" + "timestamp": "2024-10-29T13:44:43.105472321" } } \ No newline at end of file From 88fa0dcedfe8d5411a01a54c5e9a093aa2d52048 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Wed, 30 Oct 2024 09:26:52 +0000 Subject: [PATCH 17/19] fix test --- tests/NovaSeq6000.main_subsample.nf.test | 1 - tests/NovaSeq6000.main_subsample.nf.test.snap | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/NovaSeq6000.main_subsample.nf.test b/tests/NovaSeq6000.main_subsample.nf.test index 72851f9..efdc574 100644 --- a/tests/NovaSeq6000.main_subsample.nf.test +++ b/tests/NovaSeq6000.main_subsample.nf.test @@ -21,7 +21,6 @@ nextflow_pipeline { path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/global_report/multiqc_data/multiqc_software_versions.txt"), path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_citations.txt"), path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_fastqc.txt"), diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap index fdb7c8c..8e9ada0 100644 --- a/tests/NovaSeq6000.main_subsample.nf.test.snap +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -4,7 +4,6 @@ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,aba942d1e6996b579f19798e5673f514", 
"multiqc_general_stats.txt:md5,ad1ec9c64cbdb1131a26aeb6de51e31c", - "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", "multiqc_general_stats.txt:md5,c73c8d10568a56f6534d280fff701e60", @@ -26,6 +25,6 @@ "nf-test": "0.9.1", "nextflow": "24.04.4" }, - "timestamp": "2024-10-30T09:11:23.734754433" + "timestamp": "2024-10-30T09:25:16.689719475" } } \ No newline at end of file From 5079047a7adf95729591a3f52b3771ff7c01a1e3 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Wed, 30 Oct 2024 09:38:34 +0000 Subject: [PATCH 18/19] add citation --- CITATIONS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index ecbfb16..8a4e350 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,8 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. 
+- [Seqtk](https://github.com/lh3/seqtk) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) From 5a0a3e8c2e5d565f4d0b8ec1200d2f5ede017e67 Mon Sep 17 00:00:00 2001 From: ngarcia Date: Wed, 30 Oct 2024 09:38:51 +0000 Subject: [PATCH 19/19] fix test --- tests/NovaSeq6000.main_subsample.nf.test | 4 ---- tests/NovaSeq6000.main_subsample.nf.test.snap | 8 ++------ 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/NovaSeq6000.main_subsample.nf.test b/tests/NovaSeq6000.main_subsample.nf.test index efdc574..fe2b068 100644 --- a/tests/NovaSeq6000.main_subsample.nf.test +++ b/tests/NovaSeq6000.main_subsample.nf.test @@ -25,22 +25,18 @@ nextflow_pipeline { path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_citations.txt"), path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/group_reports/lane1/multiqc_data/multiqc_software_versions.txt"), path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_citations.txt"), path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/group_reports/group1/multiqc_data/multiqc_software_versions.txt"), path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_citations.txt"), path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_general_stats.txt"), - path("$outputDir/multiqc/group_reports/group2/multiqc_data/multiqc_software_versions.txt"), path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_citations.txt"), path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_fastqc.txt"), path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_general_stats.txt"), - 
path("$outputDir/multiqc/group_reports/test/multiqc_data/multiqc_software_versions.txt"), ).match() }, ) diff --git a/tests/NovaSeq6000.main_subsample.nf.test.snap b/tests/NovaSeq6000.main_subsample.nf.test.snap index 8e9ada0..651973b 100644 --- a/tests/NovaSeq6000.main_subsample.nf.test.snap +++ b/tests/NovaSeq6000.main_subsample.nf.test.snap @@ -7,24 +7,20 @@ "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,aa1b8d6adae86005ea7a8b2e901099b8", "multiqc_general_stats.txt:md5,c73c8d10568a56f6534d280fff701e60", - "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,ff996e1d3dc4a46e0c9535e54d51ccab", "multiqc_general_stats.txt:md5,834e1868b887171cfda72029bbbe2d3f", - "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,3df36ecfe76b25b0c22dcda84bce2b3b", "multiqc_general_stats.txt:md5,274a001b007521970f14d68bd176e5be", - "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686", "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", "multiqc_fastqc.txt:md5,ce61b4ce4b1d76ec3f20de3bf0c9ec7f", - "multiqc_general_stats.txt:md5,d476ad59458a035a329605d5284b6012", - "multiqc_software_versions.txt:md5,b7d1ca14785a9361f0a39ce1b6a02686" + "multiqc_general_stats.txt:md5,d476ad59458a035a329605d5284b6012" ], "meta": { "nf-test": "0.9.1", "nextflow": "24.04.4" }, - "timestamp": "2024-10-30T09:25:16.689719475" + "timestamp": "2024-10-30T09:37:46.182191597" } } \ No newline at end of file