From 7ad57cb09985e93b5e5e5b63f9b348f258c9337f Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 30 Jun 2022 17:16:34 +0100 Subject: [PATCH 01/10] Add ffq to fetch metadata --- CITATIONS.md | 4 ++++ conf/modules.config | 8 ++++++++ modules/local/ffq.nf | 34 ++++++++++++++++++++++++++++++++++ workflows/sra.nf | 9 +++++++++ 4 files changed, 55 insertions(+) create mode 100644 modules/local/ffq.nf diff --git a/CITATIONS.md b/CITATIONS.md index 3826bfe7..852085ca 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,10 @@ ## Pipeline tools +- [ffq](https://www.biorxiv.org/content/10.1101/2022.05.18.492548v2) + + > Gálvez-Merchán A, Min, KHJ, Pachter L, Booeshaghi SA. Metadata retrieval from sequence databases with ffq. bioRxiv, 19 May 2022. doi: 10.1101/2022.05.18.492548. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/conf/modules.config b/conf/modules.config index 82a5f75c..ccf12423 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -38,6 +38,14 @@ if (params.input_type == 'sra') { process { + withName: FFQ { + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: SRA_IDS_TO_RUNINFO { publishDir = [ path: { "${params.outdir}/metadata" }, diff --git a/modules/local/ffq.nf b/modules/local/ffq.nf new file mode 100644 index 00000000..2e6e572c --- /dev/null +++ b/modules/local/ffq.nf @@ -0,0 +1,34 @@ +process FFQ { + tag "$id" + label 'process_low' + + conda (params.enable_conda ? "bioconda::ffq=0.2.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ffq:0.2.1--pyhdfd78af_0': + 'quay.io/biocontainers/ffq:0.2.1--pyhdfd78af_0' }" + + input: + val id + + output: + path "*.json" , emit: json + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "$id" + """ + ffq \\ + $id \\ + $args \\ + > ${prefix}.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ffq: \$(echo \$(ffq --help 2>&1) | sed 's/^.*ffq //; s/: A command.*\$//' ) + END_VERSIONS + """ +} diff --git a/workflows/sra.nf b/workflows/sra.nf index 2b3cb498..ccd0edfb 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -19,6 +19,7 @@ WorkflowSra.initialise(params, log, valid_params) ======================================================================================== */ +include { FFQ } from '../modules/local/ffq' include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' @@ -50,6 +51,14 @@ workflow SRA { main: ch_versions = Channel.empty() + // // + // // MODULE: Get id metadata from ffq + // // + // FFQ ( + // ids + // ) + // ch_versions = ch_versions.mix(FFQ.out.versions.first()) + // // MODULE: Get SRA run information for public database ids // From c73ce9e8346579ad6577fa809d884ff9d782e292 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 30 Jun 2022 21:36:47 +0100 Subject: [PATCH 02/10] Fix linting --- CITATIONS.md | 4 +- modules/local/ffq.nf | 10 ++- workflows/sra.nf | 181 ++++++++++++++++++++++--------------------- 3 files changed, 99 insertions(+), 96 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 852085ca..e9a6d19e 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -11,8 +11,8 @@ ## Pipeline tools - [ffq](https://www.biorxiv.org/content/10.1101/2022.05.18.492548v2) - - > Gálvez-Merchán A, Min, KHJ, Pachter L, Booeshaghi SA. Metadata retrieval from sequence databases with ffq. bioRxiv, 19 May 2022. doi: 10.1101/2022.05.18.492548. + +> Gálvez-Merchán A, Min, KHJ, Pachter L, Booeshaghi SA. Metadata retrieval from sequence databases with ffq. bioRxiv, 19 May 2022. doi: 10.1101/2022.05.18.492548. - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) diff --git a/modules/local/ffq.nf b/modules/local/ffq.nf index 2e6e572c..be7c56c2 100644 --- a/modules/local/ffq.nf +++ b/modules/local/ffq.nf @@ -1,5 +1,5 @@ process FFQ { - tag "$id" + tag "${ids.size() == 1 ? ids[0] : "${ids[0]+'..'+ids[-1]}"}" label 'process_low' conda (params.enable_conda ? "bioconda::ffq=0.2.1" : null) @@ -8,7 +8,7 @@ process FFQ { 'quay.io/biocontainers/ffq:0.2.1--pyhdfd78af_0' }" input: - val id + val ids output: path "*.json" , emit: json @@ -19,10 +19,12 @@ process FFQ { script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "$id" + def id_list = ids.sort() + def name = id_list.size() == 1 ? ids[0] : 'metadata' + def prefix = task.ext.prefix ?: "${name}" """ ffq \\ - $id \\ + ${id_list.join(' ')} \\ $args \\ > ${prefix}.json diff --git a/workflows/sra.nf b/workflows/sra.nf index ccd0edfb..907e62f7 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -51,101 +51,102 @@ workflow SRA { main: ch_versions = Channel.empty() - // // - // // MODULE: Get id metadata from ffq - // // - // FFQ ( - // ids - // ) - // ch_versions = ch_versions.mix(FFQ.out.versions.first()) - // - // MODULE: Get SRA run information for public database ids + // MODULE: Get id metadata from ffq // - SRA_IDS_TO_RUNINFO ( - ids, - params.ena_metadata_fields ?: '' + FFQ ( + //ids.map { [ it ] } + ids ) - ch_versions = ch_versions.mix(SRA_IDS_TO_RUNINFO.out.versions.first()) + ch_versions = ch_versions.mix(FFQ.out.versions.first()) - // - // MODULE: Parse SRA run information, create file containing FTP links and read into workflow as [ meta, [reads] ] - // - SRA_RUNINFO_TO_FTP ( - SRA_IDS_TO_RUNINFO.out.tsv - ) - ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) - - SRA_RUNINFO_TO_FTP - .out - .tsv - .splitCsv(header:true, sep:'\t') - .map { - meta -> - meta.single_end = meta.single_end.toBoolean() - [ meta, [ meta.fastq_1, meta.fastq_2 ] ] - } - .unique() - .branch { - ftp: it[0].fastq_1 && !params.force_sratools_download - sra: !it[0].fastq_1 || params.force_sratools_download - } - .set { ch_sra_reads } - ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) - - if (!params.skip_fastq_download) { - - // - // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums - // - SRA_FASTQ_FTP ( - ch_sra_reads.ftp - ) - ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) - - // - // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. - // - SRAFASTQ ( - ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } - ) - ch_versions = ch_versions.mix(SRAFASTQ.out.versions.first()) - - // - // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet - // - SRA_TO_SAMPLESHEET ( - SRA_FASTQ_FTP.out.fastq.mix(SRAFASTQ.out.reads), - params.nf_core_pipeline ?: '', - params.sample_mapping_fields - ) - - // - // MODULE: Create a merged samplesheet across all samples for the pipeline - // - SRA_MERGE_SAMPLESHEET ( - SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]}, - SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]} - ) - ch_versions = ch_versions.mix(SRA_MERGE_SAMPLESHEET.out.versions) - - // - // MODULE: Create a MutiQC config file with sample name mappings - // - if (params.sample_mapping_fields) { - MULTIQC_MAPPINGS_CONFIG ( - SRA_MERGE_SAMPLESHEET.out.mappings - ) - ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions) - } - } + // // + // // MODULE: Get SRA run information for public database ids + // // + // SRA_IDS_TO_RUNINFO ( + // ids, + // params.ena_metadata_fields ?: '' + // ) + // ch_versions = ch_versions.mix(SRA_IDS_TO_RUNINFO.out.versions.first()) - // - // MODULE: Dump software versions for all tools used in the workflow - // - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) + // // + // // MODULE: Parse SRA run information, create file containing FTP links and read into workflow as [ meta, [reads] ] + // // + // SRA_RUNINFO_TO_FTP ( + // SRA_IDS_TO_RUNINFO.out.tsv + // ) + // ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) + + // SRA_RUNINFO_TO_FTP + // .out + // .tsv + // .splitCsv(header:true, sep:'\t') + // .map { + // meta -> + // meta.single_end = meta.single_end.toBoolean() + // [ meta, [ meta.fastq_1, meta.fastq_2 ] ] + // } + // .unique() + // .branch { + // ftp: it[0].fastq_1 && !params.force_sratools_download + // sra: !it[0].fastq_1 || params.force_sratools_download + // } + // .set { ch_sra_reads } + // ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) + + // if (!params.skip_fastq_download) { + + // // + // // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums + // // + // SRA_FASTQ_FTP ( + // ch_sra_reads.ftp + // ) + // ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) + + // // + // // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. + // // + // SRAFASTQ ( + // ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } + // ) + // ch_versions = ch_versions.mix(SRAFASTQ.out.versions.first()) + + // // + // // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet + // // + // SRA_TO_SAMPLESHEET ( + // SRA_FASTQ_FTP.out.fastq.mix(SRAFASTQ.out.reads), + // params.nf_core_pipeline ?: '', + // params.sample_mapping_fields + // ) + + // // + // // MODULE: Create a merged samplesheet across all samples for the pipeline + // // + // SRA_MERGE_SAMPLESHEET ( + // SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]}, + // SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]} + // ) + // ch_versions = ch_versions.mix(SRA_MERGE_SAMPLESHEET.out.versions) + + // // + // // MODULE: Create a MutiQC config file with sample name mappings + // // + // if (params.sample_mapping_fields) { + // MULTIQC_MAPPINGS_CONFIG ( + // SRA_MERGE_SAMPLESHEET.out.mappings + // ) + // ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions) + // } + // } + + // // + // // MODULE: Dump software versions for all tools used in the workflow + // // + // CUSTOM_DUMPSOFTWAREVERSIONS ( + // ch_versions.unique().collectFile(name: 'collated_versions.yml') + // ) } /* From ba6099da4643998e1ec1bfbeda4a330a21ec8fea Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Thu, 30 Jun 2022 21:44:39 +0100 Subject: [PATCH 03/10] Revert comments --- workflows/sra.nf | 181 +++++++++++++++++++++++------------------------ 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/workflows/sra.nf b/workflows/sra.nf index 907e62f7..ccd0edfb 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -51,102 +51,101 @@ workflow SRA { main: ch_versions = Channel.empty() - // - // MODULE: Get id metadata from ffq - // - FFQ ( - //ids.map { [ it ] } - ids - ) - ch_versions = ch_versions.mix(FFQ.out.versions.first()) - // // - // // MODULE: Get SRA run information for public database ids + // // MODULE: Get id metadata from ffq // // - // SRA_IDS_TO_RUNINFO ( - // ids, - // params.ena_metadata_fields ?: '' + // FFQ ( + // ids // ) - // ch_versions = ch_versions.mix(SRA_IDS_TO_RUNINFO.out.versions.first()) + // ch_versions = ch_versions.mix(FFQ.out.versions.first()) - // // - // // MODULE: Parse SRA run information, create file containing FTP links and read into workflow as [ meta, [reads] ] - // // - // SRA_RUNINFO_TO_FTP ( - // SRA_IDS_TO_RUNINFO.out.tsv - // ) - // ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) - - // SRA_RUNINFO_TO_FTP - // .out - // .tsv - // .splitCsv(header:true, sep:'\t') - // .map { - // meta -> - // meta.single_end = meta.single_end.toBoolean() - // [ meta, [ meta.fastq_1, meta.fastq_2 ] ] - // } - // .unique() - // .branch { - // ftp: it[0].fastq_1 && !params.force_sratools_download - // sra: !it[0].fastq_1 || params.force_sratools_download - // } - // .set { ch_sra_reads } - // ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) - - // if (!params.skip_fastq_download) { - - // // - // // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums - // // - // SRA_FASTQ_FTP ( - // ch_sra_reads.ftp - // ) - // ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) - - // // - // // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. - // // - // SRAFASTQ ( - // ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } - // ) - // ch_versions = ch_versions.mix(SRAFASTQ.out.versions.first()) - - // // - // // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet - // // - // SRA_TO_SAMPLESHEET ( - // SRA_FASTQ_FTP.out.fastq.mix(SRAFASTQ.out.reads), - // params.nf_core_pipeline ?: '', - // params.sample_mapping_fields - // ) - - // // - // // MODULE: Create a merged samplesheet across all samples for the pipeline - // // - // SRA_MERGE_SAMPLESHEET ( - // SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]}, - // SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]} - // ) - // ch_versions = ch_versions.mix(SRA_MERGE_SAMPLESHEET.out.versions) - - // // - // // MODULE: Create a MutiQC config file with sample name mappings - // // - // if (params.sample_mapping_fields) { - // MULTIQC_MAPPINGS_CONFIG ( - // SRA_MERGE_SAMPLESHEET.out.mappings - // ) - // ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions) - // } - // } + // + // MODULE: Get SRA run information for public database ids + // + SRA_IDS_TO_RUNINFO ( + ids, + params.ena_metadata_fields ?: '' + ) + ch_versions = ch_versions.mix(SRA_IDS_TO_RUNINFO.out.versions.first()) - // // - // // MODULE: Dump software versions for all tools used in the workflow - // // - // CUSTOM_DUMPSOFTWAREVERSIONS ( - // ch_versions.unique().collectFile(name: 'collated_versions.yml') - // ) + // + // MODULE: Parse SRA run information, create file containing FTP links and read into workflow as [ meta, [reads] ] + // + SRA_RUNINFO_TO_FTP ( + SRA_IDS_TO_RUNINFO.out.tsv + ) + ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) + + SRA_RUNINFO_TO_FTP + .out + .tsv + .splitCsv(header:true, sep:'\t') + .map { + meta -> + meta.single_end = meta.single_end.toBoolean() + [ meta, [ meta.fastq_1, meta.fastq_2 ] ] + } + .unique() + .branch { + ftp: it[0].fastq_1 && !params.force_sratools_download + sra: !it[0].fastq_1 || params.force_sratools_download + } + .set { ch_sra_reads } + ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first()) + + if (!params.skip_fastq_download) { + + // + // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums + // + SRA_FASTQ_FTP ( + ch_sra_reads.ftp + ) + ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first()) + + // + // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. + // + SRAFASTQ ( + ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] } + ) + ch_versions = ch_versions.mix(SRAFASTQ.out.versions.first()) + + // + // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet + // + SRA_TO_SAMPLESHEET ( + SRA_FASTQ_FTP.out.fastq.mix(SRAFASTQ.out.reads), + params.nf_core_pipeline ?: '', + params.sample_mapping_fields + ) + + // + // MODULE: Create a merged samplesheet across all samples for the pipeline + // + SRA_MERGE_SAMPLESHEET ( + SRA_TO_SAMPLESHEET.out.samplesheet.collect{it[1]}, + SRA_TO_SAMPLESHEET.out.mappings.collect{it[1]} + ) + ch_versions = ch_versions.mix(SRA_MERGE_SAMPLESHEET.out.versions) + + // + // MODULE: Create a MutiQC config file with sample name mappings + // + if (params.sample_mapping_fields) { + MULTIQC_MAPPINGS_CONFIG ( + SRA_MERGE_SAMPLESHEET.out.mappings + ) + ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions) + } + } + + // + // MODULE: Dump software versions for all tools used in the workflow + // + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) } /* From e38275384ea7d9d5ea993cbd44fd6bb30ee10898 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 1 Jul 2022 09:11:10 +0100 Subject: [PATCH 04/10] Install ffq module from nf-core modules --- modules.json | 3 ++ .../ffq.nf => nf-core/modules/ffq/main.nf} | 0 modules/nf-core/modules/ffq/meta.yml | 36 +++++++++++++++++++ workflows/sra.nf | 4 +-- 4 files changed, 41 insertions(+), 2 deletions(-) rename modules/{local/ffq.nf => nf-core/modules/ffq/main.nf} (100%) create mode 100644 modules/nf-core/modules/ffq/meta.yml diff --git a/modules.json b/modules.json index fefca2b7..0a8c4bb2 100644 --- a/modules.json +++ b/modules.json @@ -9,6 +9,9 @@ "custom/sratoolsncbisettings": { "git_sha": "b2dbaa99309a2057efc32ef9d029ed91140068df" }, + "ffq": { + "git_sha": "b96066565a52fdd42901f62e03c4ff5551980b43" + }, "sratools/fasterqdump": { "git_sha": "0cdf7767a79faf424645beeff83ecfa5528b6a7c" }, diff --git a/modules/local/ffq.nf b/modules/nf-core/modules/ffq/main.nf similarity index 100% rename from modules/local/ffq.nf rename to modules/nf-core/modules/ffq/main.nf diff --git a/modules/nf-core/modules/ffq/meta.yml b/modules/nf-core/modules/ffq/meta.yml new file mode 100644 index 00000000..611e0adc --- /dev/null +++ b/modules/nf-core/modules/ffq/meta.yml @@ -0,0 +1,36 @@ +name: "ffq" +description: A command line tool that makes it easier to find sequencing data from the SRA / GEO / ENA. +keywords: + - SRA + - ENA + - GEO + - metadata + - fetch + - public + - databases +tools: + - "ffq": + description: "A command line tool that makes it easier to find sequencing data from the SRA / GEO / ENA." + homepage: https://github.com/pachterlab/ffq + documentation: https://github.com/pachterlab/ffq#usage + tool_dev_url: https://github.com/pachterlab/ffq + doi: "10.1101/2022.05.18.492548" + licence: "['MIT']" + +input: + - ids: + type: list + description: List of supported database ids e.g. SRA / GEO / ENA + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - json: + type: file + description: JSON file containing metadata for ids + pattern: "*.{json}" + +authors: + - "@drpatelh" diff --git a/workflows/sra.nf b/workflows/sra.nf index ccd0edfb..fe7b8d4c 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -19,7 +19,6 @@ WorkflowSra.initialise(params, log, valid_params) ======================================================================================== */ -include { FFQ } from '../modules/local/ffq' include { SRA_IDS_TO_RUNINFO } from '../modules/local/sra_ids_to_runinfo' include { SRA_RUNINFO_TO_FTP } from '../modules/local/sra_runinfo_to_ftp' include { SRA_FASTQ_FTP } from '../modules/local/sra_fastq_ftp' @@ -35,6 +34,7 @@ include { SRAFASTQ } from '../subworkflows/nf-core/srafastq/main' ======================================================================================== */ +include { FFQ } from '../modules/nf-core/modules/ffq/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' /* @@ -55,7 +55,7 @@ workflow SRA { // // MODULE: Get id metadata from ffq // // // FFQ ( - // ids + // ids.map { [it] } // ) // ch_versions = ch_versions.mix(FFQ.out.versions.first()) From 7caf7628943480920dc23bd11277804154bc8fd4 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 1 Jul 2022 11:56:37 +0100 Subject: [PATCH 05/10] Fix API call for SRA ids --- bin/sra_ids_to_runinfo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index bbd1e6f0..f802da9e 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -246,13 +246,13 @@ def _content_check(cls, response, identifier): def _id_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" params = { - "save": "efetch", + "id": identifier, "db": "sra", "rettype": "runinfo", - "term": identifier, + "retmode": "text" } response = fetch_url( - f"https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}" + f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" ) cls._content_check(response, identifier) return [row["Experiment"] for row in open_table(response, delimiter=",")] From a6c11e04c73b842f06febec66d3c7f247741ee5d Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 1 Jul 2022 11:57:37 +0100 Subject: [PATCH 06/10] Update API call for GEO ids - still broken --- bin/sra_ids_to_runinfo.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index f802da9e..0f85d2fa 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -261,9 +261,14 @@ def _id_to_srx(cls, identifier): def _gse_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" ids = [] - params = {"acc": identifier, "targ": "gsm", "view": "data", "form": "text"} + params = { + "id": identifier, + "db": "gds", + "rettype": "runinfo", + "retmode": "text" + } response = fetch_url( - f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}" + f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" ) cls._content_check(response, identifier) gsm_ids = [ From 2cb54e3e390049c7d0b7637e2b3636eaff41728b Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 1 Jul 2022 13:21:33 +0100 Subject: [PATCH 07/10] Change default function to resolve GSM ids --- bin/sra_ids_to_runinfo.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index 0f85d2fa..21c7225d 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -193,9 +193,11 @@ def is_valid(cls, identifier): class DatabaseResolver: """Define a service class for resolving various identifiers to experiments.""" - _GEO_PREFIXES = {"GSE"} + _GEO_PREFIXES = { + "GSE", + "GSM" + } _SRA_PREFIXES = { - "GSM", "PRJNA", "SAMN", "SRR", @@ -207,7 +209,9 @@ class DatabaseResolver: "PRJDB", "SAMD", } - _ENA_PREFIXES = {"ERR"} + _ENA_PREFIXES = { + "ERR" + } @classmethod def expand_identifier(cls, identifier): From edf3dafcc4585d879eca35687b9833bdf2279962 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 1 Jul 2022 16:20:17 +0100 Subject: [PATCH 08/10] Bump pipeline version to 1.7 --- CHANGELOG.md | 7 ++++++- nextflow.config | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e635ea4..dada8a72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unpublished Version / DEV] +## [[1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] - 2022-07-01 + +### :warning: Major enhancements + +Support for GEO ids has been dropped in this release due to breaking changes introduced in the NCBI API. Please see [this PR](https://github.com/nf-core/fetchngs/pull/102). ### Enhancements & fixes - [#97](https://github.com/nf-core/fetchngs/pull/97) - Add support for generating nf-core/taxprofiler compatible samplesheets. +- [#99](https://github.com/nf-core/fetchngs/issues/99) - SRA_IDS_TO_RUNINFO fails due to bad request - Add `enum` field for `--nf_core_pipeline` to parameter schema so only accept supported pipelines are accepted ## [[1.6](https://github.com/nf-core/fetchngs/releases/tag/1.6)] - 2022-05-17 diff --git a/nextflow.config b/nextflow.config index 667f15e1..80ed6507 100644 --- a/nextflow.config +++ b/nextflow.config @@ -158,7 +158,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' - version = '1.7dev' + version = '1.7' } // Load modules.config for DSL2 module specific options From d1e07712666a224b44a5326594d56c5ec31cab48 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 1 Jul 2022 16:28:18 +0100 Subject: [PATCH 09/10] Strip out mention of GEO ids everywhere --- CHANGELOG.md | 10 +++++++++- README.md | 16 ++++++++++++++-- assets/schema_input.json | 2 +- docs/output.md | 6 +++--- docs/usage.md | 18 +++++++++--------- lib/WorkflowMain.groovy | 4 ++-- lib/WorkflowSra.groovy | 17 +++++++++++++++++ main.nf | 4 ++-- nextflow_schema.json | 2 +- workflows/sra.nf | 7 +++++++ 10 files changed, 65 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dada8a72..f3bbf9a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### :warning: Major enhancements -Support for GEO ids has been dropped in this release due to breaking changes introduced in the NCBI API. Please see [this PR](https://github.com/nf-core/fetchngs/pull/102). +Support for GEO ids has been dropped in this release due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). + +As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. ### Enhancements & fixes diff --git a/README.md b/README.md index 7806362e..1151db3d 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. @@ -27,7 +27,7 @@ On release, automated continuous integration tests run the pipeline on a full-si Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt)) the pipeline performs the following steps: -### SRA / ENA / DDBJ / GEO ids +### SRA / ENA / DDBJ ids 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata via ENA API @@ -36,6 +36,18 @@ Via a single file of ids, provided one-per-line (see [example input file](https: - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet +### GEO ids + +Support for GEO ids was dropped in [[v1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). + +As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline instead: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. + ### Synapse ids 1. Resolve Synapse directory ids to their corresponding FastQ files ids via the `synapse list` command. diff --git a/assets/schema_input.json b/assets/schema_input.json index 9a800216..8bf05fd8 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,7 +9,7 @@ "items": { "type": "string", "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$", - "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier" + "errorMessage": "Please provide a valid SRA, ENA, DDBJ identifier" } } } diff --git a/docs/output.md b/docs/output.md index 7402976c..daaca914 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,19 +9,19 @@ This document describes the output produced by the pipeline. The directories lis The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided: - Download FastQ files and create samplesheet from: - 1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) + 1. [SRA / ENA / DDBJ ids](#sra--ena--ddbj-ids) 2. [Synapse ids](#synapse-ids) - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. -### SRA / ENA / DDBJ / GEO ids +### SRA / ENA / DDBJ ids
Output files - `fastq/` - - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. + - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ. - `fastq/md5/` - `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. - `samplesheet/` diff --git a/docs/usage.md b/docs/usage.md index 0cb6a976..ba4de1b2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,15 +8,15 @@ The pipeline has been set-up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported: -| `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | -| ------------ | ------------ | ------------ | ---------- | ----------- | -| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | syn26240435 | -| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | | -| SRS6531847 | ERS4399630 | DRS090921 | | | -| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | | -| SRP256957 | ERP120836 | DRP004793 | | | -| SRA1068758 | ERA2420837 | DRA008156 | | | -| PRJNA625551 | PRJEB37513 | PRJDB4176 | | | +| `SRA` | `ENA` | `DDBJ` | `Synapse` | +| ------------ | ------------ | ------------ | ----------- | +| SRR11605097 | ERR4007730 | DRR171822 | syn26240435 | +| SRX8171613 | ERX4009132 | DRX162434 | | +| SRS6531847 | ERS4399630 | DRS090921 | | +| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | +| SRP256957 | ERP120836 | DRP004793 | | +| SRA1068758 | ERA2420837 | DRA008156 | | +| PRJNA625551 | PRJEB37513 | PRJDB4176 | | ### SRR / ERR / DRR ids diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index f64fa80f..77b7ffde 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -104,7 +104,7 @@ class WorkflowMain { if (num_match == total_ids) { is_sra = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ / GEO or Synapse ids!" + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" System.exit(1) } } @@ -129,7 +129,7 @@ class WorkflowMain { if (num_match == total_ids) { is_synapse = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ / GEO or Synapse ids!" + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" System.exit(1) } } diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy index a2c16219..90d86f1c 100755 --- a/lib/WorkflowSra.groovy +++ b/lib/WorkflowSra.groovy @@ -29,4 +29,21 @@ class WorkflowSra { " running nf-core/other pipelines.\n" + "===================================================================================" } + + // Fail pipeline if input ids are from the GEO + public static void isGeoFail(ids, log) { + def pattern = /^(GS[EM])(\d+)$/ + for (id in ids) { + if (id =~ pattern) { + log.error "===================================================================================\n" + + " GEO id detected: ${id}\n" + + " Support for GEO ids was dropped in v1.7 due to breaking changes in the NCBI API.\n" + + " Please remove any GEO ids from the input samplesheet.\n\n" + + " Please see:\n" + + " https://github.com/nf-core/fetchngs/pull/102\n" + + "===================================================================================" + System.exit(1) + } + } + } } diff --git a/main.nf b/main.nf index c6303a41..2c4b52f2 100644 --- a/main.nf +++ b/main.nf @@ -44,7 +44,7 @@ if (WorkflowMain.isSraId(ch_input, log)) { } else if (WorkflowMain.isSynapseId(ch_input, log)) { input_type = 'synapse' } else { - exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ / GEO or Synapse ids!' + exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ or Synapse ids!' } if (params.input_type == input_type) { @@ -63,7 +63,7 @@ if (params.input_type == input_type) { workflow NFCORE_FETCHNGS { // - // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ / GEO ids + // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ ids // if (params.input_type == 'sra') { SRA ( ch_ids ) diff --git a/nextflow_schema.json b/nextflow_schema.json index a31b9396..a51dc45a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ "pattern": "^\\S+\\.txt$", "schema": "assets/schema_input.json", "fa_icon": "fas fa-file-excel", - "description": "File containing SRA/ENA/DDBJ/GEO identifiers one per line to download their associated metadata and FastQ files." + "description": "File containing SRA/ENA/DDBJ identifiers one per line to download their associated metadata and FastQ files." }, "input_type": { "type": "string", diff --git a/workflows/sra.nf b/workflows/sra.nf index 2b3cb498..0dde4588 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -50,6 +50,13 @@ workflow SRA { main: ch_versions = Channel.empty() + // + // Fail the pipeline if GEO ids detected + // + ids + .collect() + .map { WorkflowSra.isGeoFail(it, log) } + // // MODULE: Get SRA run information for public database ids // From 3696f40cf5bcd4ec3266275b4cb77e4359d76c06 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Fri, 1 Jul 2022 16:35:45 +0100 Subject: [PATCH 10/10] Strip out regex for GEO from input schema --- assets/schema_input.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 8bf05fd8..71f0f976 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -8,7 +8,7 @@ "type": "array", "items": { "type": "string", - "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$", + "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(syn))(\\d+)$", "errorMessage": "Please provide a valid SRA, ENA, DDBJ identifier" } }