Merge remote-tracking branch 'origin/main' into gitpod_edits
chriswyatt1 committed Apr 5, 2024
2 parents 9426586 + 05ef2b5 commit c64a17c
Showing 21 changed files with 639 additions and 22 deletions.
16 changes: 10 additions & 6 deletions README.md
@@ -27,14 +27,14 @@ This will produce a directory in the current directory called `synteny-VERSION`

### Required

- * `--input` - Path to a comma-separated file containing sample ID and path to the fastq(s). Each row contains information on a single sample.
+ * `--input` - Path to a comma-separated file containing sample ID and either path to the fastq(s) or an SRA ID. Each row contains information on a single sample.
* `--database` - Path to database fasta file to be used in vsearch sintax module.
* `--outdir` - Path to the output directory where the results will be saved (you have to use absolute paths to storage on cloud infrastructure) **[default: results]**.
* `--FW_primer` - Sequence of the forward primer.
* `--RV_primer` - Sequence of the reverse primer.

### Optional
- * `--single_end` - Tells pipeline whether to expect single-end or paired-end data **[default: false]**.
+ * `--single_end` - Tells pipeline whether to expect single-end or paired-end SRA data **[default: false]**.
* `--custom_config` - A path/url to a custom configuration file.
* `--publish_dir_mode` - Method used to save pipeline results to output directory. (accepted: symlink, rellink, link, copy, copyNoFollow, move) **[default: copy]**.
* `--clean` - Enable cleanup function **[default: true]**.
@@ -49,7 +49,6 @@ This will produce a directory in the current directory called `synteny-VERSION`
* `--cutadapt_max_error_rate` - Cutadapt max error rate parameter **[default: 0.1]**.
* `--pacbio` - Cutadapt pacbio parameter.
* `--iontorrent` - Cutadapt iontorrent parameter.
- * `--retain_untrimmed` - Cutadapt retain untrimmed parameter.
* `--pear_p_value` - Pear p-value parameter.
* `--pear_min_overlap` - Pear min overlap parameter.
* `--pear_max_len` - Pear max length parameter.
@@ -69,6 +68,8 @@ This will produce a directory in the current directory called `synteny-VERSION`
* `--sintax_cutoff` - vsearch sintax cutoff parameter.
* `--sintax_strand` - vsearch sintax strand parameter.
* `--seed` - vsearch sintax random seed parameter **[default: 1312]**.
+ * `--ncbi_settings` - Path to NCBI settings folder.
+ * `--certificate` - Path to certificate file.
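
For example, a run that pulls reads for SRA IDs might pass these as follows (a sketch only; all paths are illustrative):

```
nextflow run main.nf -profile docker -resume \
    --input data/input_mixed.csv \
    --database /path/to/database.sintax.fa \
    --FW_primer "ATGCGATACTTGGTGTGAAT" --RV_primer "GCATATCAATAAGCGGAGGA" \
    --ncbi_settings /path/to/ncbi-settings-folder \
    --certificate /path/to/certificate.ngc
```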

#### AWS parameters (ensure these match the infrastructure you have access to if using AWS)
* `--awsqueue` - AWS queue to use with AWS Batch.
@@ -108,6 +109,8 @@ results
│   │   ├── genus.pdf
│   │   └── order.pdf
│   └── summary.tsv
+ ├── sratools_fasterq-dump
+ │   └── sample
├── usearch
│   └── sintax_summary
│   ├── sample
@@ -148,6 +151,8 @@ results
2. `pie_charts` - PDFs of top predicted species at different taxonomic levels
3. `summary.tsv` - tsv containing summary statistics.

+ `sratools_fasterq-dump` - fastqs obtained from SRA IDs.

`usearch` - text files containing the name, number of reads, percentage of reads and cumulative percentage of reads for each taxonomic level.

`vsearch`
@@ -211,19 +216,18 @@ nextflow run main.nf -profile singularity,test_small -resume
* Running the pipeline with additional parameters:
```
nextflow run main.nf -profile apptainer,local -resume \
- --input data/input_full-s3.csv \
+ --input data/input_small-s3.csv \
--database "s3://pollen-metabarcoding-test-data/data/viridiplantae_all_2014.sintax.fa" \
--FW_primer "ATGCGATACTTGGTGTGAAT" --RV_primer "GCATATCAATAAGCGGAGGA" \
--clean false \
- --retain_untrimmed true \
--fastq_maxee 0.5 --fastq_minlen 250 --fastq_maxns 0 --fasta_width 0 \
--derep_strand "plus" \
--sintax_strand "both" --sintax_cutoff 0.95
```

* Running the pipeline with a custom config file:
```
- nextflow run main.nf -profile docker,aws_batch -resume --input data/input_full-s3.csv --database "s3://pollen-metabarcoding-test-data/data/viridiplantae_all_2014.sintax.fa" --FW_primer "ATGCGATACTTGGTGTGAAT" --RV_primer "GCATATCAATAAGCGGAGGA" --custom_config /path/to/custom/config
+ nextflow run main.nf -profile docker,aws_batch -resume --input data/input_manual.csv --database "s3://pollen-metabarcoding-test-data/data/viridiplantae_all_2014.sintax.fa" --FW_primer "ATGCGATACTTGGTGTGAAT" --RV_primer "GCATATCAATAAGCGGAGGA" --custom_config /path/to/custom/config
```

* Running on Gitpod:
9 changes: 8 additions & 1 deletion conf/modules.config
@@ -33,7 +33,6 @@ process {
params.pacbio ? "--rc -g ${params.FW_primer}...${RV_primer_RevComp}" :
params.iontorrent ? "--rc -g ${params.FW_primer}...${RV_primer_RevComp}" :
"-g ${params.FW_primer} -G ${RV_primer_RevComp}",
- params.retain_untrimmed ? '' : "--discard-untrimmed",
].join(' ').trim()
ext.prefix = { "${meta.id}" }
publishDir = [ [
@@ -161,6 +160,14 @@
]
}

+ withName: 'SRATOOLS_FASTERQDUMP' {
+     publishDir = [
+         path: { "${params.outdir}/sratools_fasterq-dump/${meta.id}" },
+         mode: params.publish_dir_mode,
+         pattern: '*.fastq.gz'
+     ]
+ }

withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
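Extra `fasterq-dump` flags can be threaded through to the new process in the usual nf-core way via `ext.args` in a custom config; a minimal sketch (the flag shown is only an example):

```
process {
    withName: 'SRATOOLS_FASTERQDUMP' {
        // Any fasterq-dump option can go here; '--include-technical' is illustrative
        ext.args = '--include-technical'
    }
}
```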
5 changes: 5 additions & 0 deletions data/input_mixed.csv
@@ -0,0 +1,5 @@
ERR2537811,s3://pollen-metabarcoding-test-data/data/ERR2537811_1.fastq.gz,s3://pollen-metabarcoding-test-data/data/ERR2537811_2.fastq.gz
ERR2537812,s3://pollen-metabarcoding-test-data/data/ERR2537812_1.fastq.gz,s3://pollen-metabarcoding-test-data/data/ERR2537812_2.fastq.gz
ERR2537813,s3://pollen-metabarcoding-test-data/data/ERR2537813_1.fastq.gz,s3://pollen-metabarcoding-test-data/data/ERR2537813_2.fastq.gz
ERR2537816,ERR2537816
ERR2537814,ERR2537814
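Rows with three fields are treated as paired-end fastqs; a two-field row whose second entry is a fastq path is a single-end sample, while two-field rows with a bare accession (here `ERR2537816` and `ERR2537814`) are treated as SRA IDs and fetched with prefetch/fasterq-dump (see the workflow logic in `main.nf` below).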
42 changes: 32 additions & 10 deletions main.nf
@@ -17,8 +17,10 @@ log.info """\
=========================================""".stripIndent()

+ include { PEAR } from './modules/nf-core/pear/main'
+ include { SRATOOLS_PREFETCH } from './modules/nf-core/sratools/prefetch/main'
+ include { SRATOOLS_FASTERQDUMP } from './modules/nf-core/sratools/fasterqdump/main'
include { CUTADAPT } from './modules/nf-core/cutadapt/main'
- include { PEAR } from './modules/nf-core/pear/main'
include { VSEARCH_FASTQ_FILTER } from './modules/local/vsearch_fastq_filter.nf'
include { VSEARCH_DEREP_FULL_LENGTH } from './modules/local/vsearch_derep.nf'
include { VSEARCH_SINTAX } from './modules/nf-core/vsearch/sintax/main'
@@ -44,14 +46,35 @@ workflow {
//Make a channel for version outputs:
ch_versions = Channel.empty()

- //Input to cutadapt depends on whether a single-end fastq or a set of paired-end fastqs is provided
- if (params.single_end == true) {
- Channel.fromPath(params.input) | flatMap{ it.readLines() } | map{ csv -> [ [ "id":csv.split(",")[0], "single_end": true ], [ csv.split(",")[1] ] ] } | CUTADAPT
- }
- else {
- Channel.fromPath(params.input) | flatMap{ it.readLines() } | map{ csv -> [ [ "id":csv.split(",")[0], "single_end": false ], [ csv.split(",")[1], csv.split(",")[2] ] ] } | CUTADAPT
- }
- ch_versions = ch_versions.mix(CUTADAPT.out.versions.first())
+ //Split processing depending on whether two or three elements are provided on each row of the input file
+ Channel.fromPath(params.input) | splitCsv | branch { two: it.size() == 2; three: it.size() == 3 } | set { input_type }

+ //If three elements are provided, the data is paired-end, so organise it accordingly
+ input_type.three | map{ csv -> [ [ "id":csv[0], "single_end": false ], [ csv[1], csv[2] ] ] } | set { three_tuple }

+ //If a row has two elements it could be a single-end fastq or an SRA ID, so filter on the file extension (anything that does not look like a fastq is routed to the sra branch)
+ input_type.two | branch { fastq: it[1] =~ /\.f.*q\.gz$/ || it[1] =~ /\.f.*q$/; sra: true } | set { ch_extension }

+ //Combine the single-end fastq channel with the paired-end fastq channel
+ ch_extension.fastq | map{ csv -> [ [ "id":csv[0], "single_end": true ], [ csv[1] ] ] } | mix(three_tuple) | set { fastqs_combined }

+ //The "--single_end" input parameter determines whether SRA IDs are assumed to be single-end or not
+ ch_sra = params.single_end == true ? ch_extension.sra | map{ csv -> [ [ "id":csv[0], "single_end": true ], csv[1] ] } : ch_extension.sra | map{ csv -> [ [ "id":csv[0], "single_end": false ], csv[1] ] }

+ //Create channels for ncbi_settings and certificate if provided
+ ch_ncbi_settings = params.ncbi_settings != null ? Channel.fromPath(params.ncbi_settings) : []
+ ch_certificate = params.certificate != null ? Channel.fromPath(params.certificate) : []

+ //Ensure ncbi_settings and certificate are supplied alongside each SRA
+ ch_sra | multiMap { it -> sra: [it[0], it[1]]; ncbi: ch_ncbi_settings; cert: ch_certificate } | set { ch_prefetch }

+ SRATOOLS_PREFETCH(ch_prefetch.sra, ch_prefetch.ncbi, ch_prefetch.cert)
+ SRATOOLS_FASTERQDUMP(SRATOOLS_PREFETCH.out.sra, ch_prefetch.ncbi, ch_prefetch.cert)

+ //Combine the provided fastqs with the SRA-derived fastqs
+ fastqs_combined | mix(SRATOOLS_FASTERQDUMP.out.reads) | set { all_fqs }

+ CUTADAPT(all_fqs)

CUTADAPT.out.reads | PEAR
ch_versions = ch_versions.mix(PEAR.out.versions.first())
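
For reference, the row-classification logic above can be exercised on its own; a minimal standalone sketch (all sample IDs and file names are illustrative):

```
// Standalone sketch of the input classification used above (illustrative values).
workflow {
    def rows = Channel.of(
            ['s1', 's1_1.fastq.gz', 's1_2.fastq.gz'],  // three fields: paired-end fastqs
            ['s2', 's2.fastq.gz'],                     // two fields, fastq path: single-end fastq
            ['s3', 'ERR2537816']                       // two fields, bare accession: SRA ID
        )
        .branch {
            two: it.size() == 2
            three: it.size() == 3
        }

    // Two-field rows that look like a fastq are reads; everything else is an SRA ID.
    def classified = rows.two.branch {
        fastq: it[1] =~ /\.f.*q(\.gz)?$/
        sra: true
    }

    rows.three       | view { "paired-end fastqs: ${it}" }
    classified.fastq | view { "single-end fastq: ${it}" }
    classified.sra   | view { "SRA accession: ${it}" }
}
```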
@@ -83,7 +106,6 @@
CUSTOM_DUMPSOFTWAREVERSIONS (
ch_versions.collectFile(name: 'collated_versions.yml')
)

}

workflow.onComplete {
10 changes: 10 additions & 0 deletions modules.json
@@ -20,6 +20,16 @@
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
"installed_by": ["modules"]
},
"sratools/fasterqdump": {
"branch": "master",
"git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
"installed_by": ["modules"]
},
"sratools/prefetch": {
"branch": "master",
"git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
"installed_by": ["modules"]
},
"vsearch/sintax": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
8 changes: 8 additions & 0 deletions modules/nf-core/sratools/fasterqdump/environment.yml
@@ -0,0 +1,8 @@
name: sratools_fasterqdump
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bioconda::sra-tools=3.0.8
- conda-forge::pigz=2.6
58 changes: 58 additions & 0 deletions modules/nf-core/sratools/fasterqdump/main.nf
@@ -0,0 +1,58 @@
process SRATOOLS_FASTERQDUMP {
tag "$meta.id"
label 'process_medium'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' :
'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' }"

input:
tuple val(meta), path(sra)
path ncbi_settings
path certificate

output:
tuple val(meta), path('*.fastq.gz'), emit: reads
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args2 = task.ext.args2 ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
    def outfile = prefix // '.fastq' is appended in the script below, avoiding a doubled '.fastq' extension for single-end runs
def key_file = ''
if (certificate.toString().endsWith('.jwt')) {
key_file += " --perm ${certificate}"
} else if (certificate.toString().endsWith('.ngc')) {
key_file += " --ngc ${certificate}"
}
    // An 'sh' shebang is added so the script is interpreted by the right shell; otherwise it fails with a "bash not found" error
    // ${outfile} is written as "${outfile}.fastq" to stop an error when writing to the same place as the input file
"""
#!/bin/sh
export NCBI_SETTINGS="\$PWD/${ncbi_settings}"
fasterq-dump \\
$args \\
--threads $task.cpus \\
--outfile "${outfile}.fastq" \\
${key_file} \\
${sra}
pigz \\
$args2 \\
--no-name \\
--processes $task.cpus \\
*.fastq
cat <<-END_VERSIONS > versions.yml
"${task.process}":
sratools: \$(fasterq-dump --version 2>&1 | grep -Eo '[0-9.]+')
pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
END_VERSIONS
"""
}
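
A minimal sketch of calling this process on its own (the accession is illustrative, and the empty lists skip the optional settings and certificate inputs, as in the tests further below):

```
include { SRATOOLS_FASTERQDUMP } from './modules/nf-core/sratools/fasterqdump/main'

workflow {
    // Meta map plus a prefetched SRA folder, as produced by SRATOOLS_PREFETCH (values illustrative).
    ch_sra = Channel.of([ [ id: 'ERR2537816', single_end: true ], file('ERR2537816') ])

    // Empty lists stand in for the optional ncbi_settings and certificate inputs.
    SRATOOLS_FASTERQDUMP(ch_sra, [], [])
    SRATOOLS_FASTERQDUMP.out.reads | view
}
```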
53 changes: 53 additions & 0 deletions modules/nf-core/sratools/fasterqdump/meta.yml
@@ -0,0 +1,53 @@
name: sratools_fasterqdump
description: Extract sequencing reads in FASTQ format from a given NCBI Sequence Read Archive (SRA).
keywords:
- sequencing
- FASTQ
- dump
tools:
- sratools:
description: SRA Toolkit and SDK from NCBI
homepage: https://github.com/ncbi/sra-tools
documentation: https://github.com/ncbi/sra-tools/wiki
tool_dev_url: https://github.com/ncbi/sra-tools
licence: ["Public Domain"]
input:
- meta:
type: map
description: >
Groovy Map containing sample information e.g. [ id:'test', single_end:false ]
- sra:
type: directory
description: Directory containing ETL data for the given SRA.
pattern: "*/*.sra"
- ncbi_settings:
type: file
description: >
An NCBI user settings file.
pattern: "*.mkfg"
- certificate:
type: file
description: >
Path to a JWT cart file used to access protected dbGaP data on SRA using the sra-toolkit
pattern: "*.cart"
output:
- meta:
type: map
description: >
Groovy Map containing sample information e.g. [ id:'test', single_end:false ]
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
- reads:
type: file
description: Extracted FASTQ file or files if the sequencing reads are paired-end.
pattern: "*.fastq.gz"
authors:
- "@Midnighter"
maintainers:
- "@Midnighter"
73 changes: 73 additions & 0 deletions modules/nf-core/sratools/fasterqdump/tests/main.nf.test
@@ -0,0 +1,73 @@
nextflow_process {
name "Test Process SRATOOLS_FASTERQDUMP"
script "../main.nf"
config "./nextflow.config"
process "SRATOOLS_FASTERQDUMP"
tag "modules"
tag "modules_nfcore"
tag "untar"
tag "sratools"
tag "sratools/fasterqdump"

test("Single-end") {

setup {
run("UNTAR") {
script "modules/nf-core/untar/main.nf"
process {
"""
input[0] = Channel.of([ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/sra/SRR13255544.tar.gz', checkIfExists: true) ])
"""
}
}
}

when {
process {
"""
input[0] = UNTAR.out.untar.map{ meta, files -> [ [ id:'test_single_end', single_end:true ], files]}
input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true)
input[2] = []
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}
}

test("Paired-end") {

setup {
run("UNTAR") {
script "modules/nf-core/untar/main.nf"
process {
"""
input[0] = Channel.of([ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/sra/SRR11140744.tar.gz', checkIfExists: true) ])
"""
}
}
}

when {
process {
"""
input[0] = UNTAR.out.untar.map{ meta, files -> [ [ id:'test_paired_end', single_end:false ], files]}
input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true)
input[2] = []
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}
}
}
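
Assuming nf-test is installed, these cases can be run from the repository root with `nf-test test modules/nf-core/sratools/fasterqdump/tests/main.nf.test --profile docker`.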
