diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e635ea4..f3bbf9a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,24 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unpublished Version / DEV] +## [[1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] - 2022-07-01 + +### :warning: Major enhancements + +Support for GEO ids has been dropped in this release due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). + +As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. ### Enhancements & fixes - [#97](https://github.com/nf-core/fetchngs/pull/97) - Add support for generating nf-core/taxprofiler compatible samplesheets. +- [#99](https://github.com/nf-core/fetchngs/issues/99) - SRA_IDS_TO_RUNINFO fails due to bad request - Add `enum` field for `--nf_core_pipeline` to parameter schema so only accept supported pipelines are accepted ## [[1.6](https://github.com/nf-core/fetchngs/releases/tag/1.6)] - 2022-05-17 diff --git a/CITATIONS.md b/CITATIONS.md index 3826bfe7..e9a6d19e 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,10 @@ ## Pipeline tools +- [ffq](https://www.biorxiv.org/content/10.1101/2022.05.18.492548v2) + +> Gálvez-Merchán A, Min KHJ, Pachter L, Booeshaghi SA. Metadata retrieval from sequence databases with ffq. bioRxiv, 19 May 2022. doi: 10.1101/2022.05.18.492548. 
+ - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/README.md b/README.md index 7806362e..1151db3d 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from both public and private databases. At present, the pipeline supports SRA / ENA / DDBJ / Synapse ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. @@ -27,7 +27,7 @@ On release, automated continuous integration tests run the pipeline on a full-si Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.txt)) the pipeline performs the following steps: -### SRA / ENA / DDBJ / GEO ids +### SRA / ENA / DDBJ ids 1. 
Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata via ENA API @@ -36,6 +36,18 @@ Via a single file of ids, provided one-per-line (see [example input file](https: - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ 4. Collate id metadata and paths to FastQ files in a single samplesheet +### GEO ids + +Support for GEO ids was dropped in [[v1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). + +As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline instead: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. + ### Synapse ids 1. Resolve Synapse directory ids to their corresponding FastQ files ids via the `synapse list` command. 
diff --git a/assets/schema_input.json b/assets/schema_input.json index 9a800216..71f0f976 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -8,8 +8,8 @@ "type": "array", "items": { "type": "string", - "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM])|(syn))(\\d+)$", - "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO identifier" + "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(syn))(\\d+)$", + "errorMessage": "Please provide a valid SRA, ENA or DDBJ identifier" } } } diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py index bbd1e6f0..21c7225d 100755 --- a/bin/sra_ids_to_runinfo.py +++ b/bin/sra_ids_to_runinfo.py @@ -193,9 +193,11 @@ def is_valid(cls, identifier): class DatabaseResolver: """Define a service class for resolving various identifiers to experiments.""" - _GEO_PREFIXES = {"GSE"} + _GEO_PREFIXES = { + "GSE", + "GSM" + } _SRA_PREFIXES = { - "GSM", "PRJNA", "SAMN", "SRR", @@ -207,7 +209,9 @@ class DatabaseResolver: "PRJDB", "SAMD", } - _ENA_PREFIXES = {"ERR"} + _ENA_PREFIXES = { + "ERR" + } @classmethod def expand_identifier(cls, identifier): @@ -246,13 +250,13 @@ def _content_check(cls, response, identifier): def _id_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" params = { - "save": "efetch", + "id": identifier, "db": "sra", "rettype": "runinfo", - "term": identifier, + "retmode": "text" } response = fetch_url( - f"https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?{urlencode(params)}" + f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" ) cls._content_check(response, identifier) return [row["Experiment"] for row in open_table(response, delimiter=",")] @@ -261,9 +265,14 @@ def _id_to_srx(cls, identifier): def _gse_to_srx(cls, identifier): """Resolve the identifier to SRA experiments.""" ids = [] - params = {"acc": identifier, "targ": "gsm", "view": "data", "form": "text"} + params = { + "id": identifier, + "db": 
"gds", + "rettype": "runinfo", + "retmode": "text" + } response = fetch_url( - f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?{urlencode(params)}" + f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}" ) cls._content_check(response, identifier) gsm_ids = [ diff --git a/conf/modules.config b/conf/modules.config index 82a5f75c..ccf12423 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -38,6 +38,14 @@ if (params.input_type == 'sra') { process { + withName: FFQ { + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: SRA_IDS_TO_RUNINFO { publishDir = [ path: { "${params.outdir}/metadata" }, diff --git a/docs/output.md b/docs/output.md index 7402976c..daaca914 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,19 +9,19 @@ This document describes the output produced by the pipeline. The directories lis The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided: - Download FastQ files and create samplesheet from: - 1. [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) + 1. [SRA / ENA / DDBJ ids](#sra--ena--ddbj-ids) 2. [Synapse ids](#synapse-ids) - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. -### SRA / ENA / DDBJ / GEO ids +### SRA / ENA / DDBJ ids
Output files - `fastq/` - - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. + - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ. - `fastq/md5/` - `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. - `samplesheet/` diff --git a/docs/usage.md b/docs/usage.md index 0cb6a976..ba4de1b2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,15 +8,15 @@ The pipeline has been set-up to automatically download and process the raw FastQ files from both public and private repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. Currently, the following types of example identifiers are supported: -| `SRA` | `ENA` | `DDBJ` | `GEO` | `Synapse` | -| ------------ | ------------ | ------------ | ---------- | ----------- | -| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | syn26240435 | -| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | | -| SRS6531847 | ERS4399630 | DRS090921 | | | -| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | | -| SRP256957 | ERP120836 | DRP004793 | | | -| SRA1068758 | ERA2420837 | DRA008156 | | | -| PRJNA625551 | PRJEB37513 | PRJDB4176 | | | +| `SRA` | `ENA` | `DDBJ` | `Synapse` | +| ------------ | ------------ | ------------ | ----------- | +| SRR11605097 | ERR4007730 | DRR171822 | syn26240435 | +| SRX8171613 | ERX4009132 | DRX162434 | | +| SRS6531847 | ERS4399630 | DRS090921 | | +| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | +| SRP256957 | ERP120836 | DRP004793 | | +| SRA1068758 | ERA2420837 | DRA008156 | | +| PRJNA625551 | PRJEB37513 | PRJDB4176 | | ### SRR / ERR / DRR ids diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index f64fa80f..77b7ffde 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -104,7 +104,7 @@ class WorkflowMain { if (num_match == total_ids) { is_sra = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA 
/ ENA / DDBJ / GEO or Synapse ids!" + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" System.exit(1) } } @@ -129,7 +129,7 @@ class WorkflowMain { if (num_match == total_ids) { is_synapse = true } else { - log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ / GEO or Synapse ids!" + log.error "Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / DDBJ or Synapse ids!" System.exit(1) } } diff --git a/lib/WorkflowSra.groovy b/lib/WorkflowSra.groovy index a2c16219..90d86f1c 100755 --- a/lib/WorkflowSra.groovy +++ b/lib/WorkflowSra.groovy @@ -29,4 +29,21 @@ class WorkflowSra { " running nf-core/other pipelines.\n" + "===================================================================================" } + + // Fail pipeline if input ids are from the GEO + public static void isGeoFail(ids, log) { + def pattern = /^(GS[EM])(\d+)$/ + for (id in ids) { + if (id =~ pattern) { + log.error "===================================================================================\n" + + " GEO id detected: ${id}\n" + + " Support for GEO ids was dropped in v1.7 due to breaking changes in the NCBI API.\n" + + " Please remove any GEO ids from the input samplesheet.\n\n" + + " Please see:\n" + + " https://github.com/nf-core/fetchngs/pull/102\n" + + "===================================================================================" + System.exit(1) + } + } + } } diff --git a/main.nf b/main.nf index c6303a41..2c4b52f2 100644 --- a/main.nf +++ b/main.nf @@ -44,7 +44,7 @@ if (WorkflowMain.isSraId(ch_input, log)) { } else if (WorkflowMain.isSynapseId(ch_input, log)) { input_type = 'synapse' } else { - exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ / GEO or Synapse ids!' 
+ exit 1, 'Ids provided via --input not recognised please make sure they are either SRA / ENA / DDBJ or Synapse ids!' } if (params.input_type == input_type) { @@ -63,7 +63,7 @@ if (params.input_type == input_type) { workflow NFCORE_FETCHNGS { // - // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ / GEO ids + // WORKFLOW: Download FastQ files for SRA / ENA / DDBJ ids // if (params.input_type == 'sra') { SRA ( ch_ids ) diff --git a/modules.json b/modules.json index fefca2b7..0a8c4bb2 100644 --- a/modules.json +++ b/modules.json @@ -9,6 +9,9 @@ "custom/sratoolsncbisettings": { "git_sha": "b2dbaa99309a2057efc32ef9d029ed91140068df" }, + "ffq": { + "git_sha": "b96066565a52fdd42901f62e03c4ff5551980b43" + }, "sratools/fasterqdump": { "git_sha": "0cdf7767a79faf424645beeff83ecfa5528b6a7c" }, diff --git a/modules/nf-core/modules/ffq/main.nf b/modules/nf-core/modules/ffq/main.nf new file mode 100644 index 00000000..be7c56c2 --- /dev/null +++ b/modules/nf-core/modules/ffq/main.nf @@ -0,0 +1,36 @@ +process FFQ { + tag "${ids.size() == 1 ? ids[0] : "${ids[0]+'..'+ids[-1]}"}" + label 'process_low' + + conda (params.enable_conda ? "bioconda::ffq=0.2.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ffq:0.2.1--pyhdfd78af_0': + 'quay.io/biocontainers/ffq:0.2.1--pyhdfd78af_0' }" + + input: + val ids + + output: + path "*.json" , emit: json + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def id_list = ids.sort() + def name = id_list.size() == 1 ? 
ids[0] : 'metadata' + def prefix = task.ext.prefix ?: "${name}" + """ + ffq \\ + ${id_list.join(' ')} \\ + $args \\ + > ${prefix}.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ffq: \$(echo \$(ffq --help 2>&1) | sed 's/^.*ffq //; s/: A command.*\$//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/modules/ffq/meta.yml b/modules/nf-core/modules/ffq/meta.yml new file mode 100644 index 00000000..611e0adc --- /dev/null +++ b/modules/nf-core/modules/ffq/meta.yml @@ -0,0 +1,36 @@ +name: "ffq" +description: A command line tool that makes it easier to find sequencing data from the SRA / GEO / ENA. +keywords: + - SRA + - ENA + - GEO + - metadata + - fetch + - public + - databases +tools: + - "ffq": + description: "A command line tool that makes it easier to find sequencing data from the SRA / GEO / ENA." + homepage: https://github.com/pachterlab/ffq + documentation: https://github.com/pachterlab/ffq#usage + tool_dev_url: https://github.com/pachterlab/ffq + doi: "10.1101/2022.05.18.492548" + licence: "['MIT']" + +input: + - ids: + type: list + description: List of supported database ids e.g. 
SRA / GEO / ENA + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - json: + type: file + description: JSON file containing metadata for ids + pattern: "*.{json}" + +authors: + - "@drpatelh" diff --git a/nextflow.config b/nextflow.config index 667f15e1..80ed6507 100644 --- a/nextflow.config +++ b/nextflow.config @@ -158,7 +158,7 @@ manifest { description = 'Pipeline to fetch metadata and raw FastQ files from public databases' mainScript = 'main.nf' nextflowVersion = '!>=21.10.3' - version = '1.7dev' + version = '1.7' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index a31b9396..a51dc45a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -19,7 +19,7 @@ "pattern": "^\\S+\\.txt$", "schema": "assets/schema_input.json", "fa_icon": "fas fa-file-excel", - "description": "File containing SRA/ENA/DDBJ/GEO identifiers one per line to download their associated metadata and FastQ files." + "description": "File containing SRA/ENA/DDBJ identifiers one per line to download their associated metadata and FastQ files." 
}, "input_type": { "type": "string", diff --git a/workflows/sra.nf b/workflows/sra.nf index 2b3cb498..043ee0af 100644 --- a/workflows/sra.nf +++ b/workflows/sra.nf @@ -34,6 +34,7 @@ include { SRAFASTQ } from '../subworkflows/nf-core/srafastq/main' ======================================================================================== */ +include { FFQ } from '../modules/nf-core/modules/ffq/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' /* @@ -50,6 +51,21 @@ workflow SRA { main: ch_versions = Channel.empty() + // // + // // MODULE: Get id metadata from ffq + // // + // FFQ ( + // ids.map { [it] } + // ) + // ch_versions = ch_versions.mix(FFQ.out.versions.first()) + + // + // Fail the pipeline if GEO ids detected + // + ids + .collect() + .map { WorkflowSra.isGeoFail(it, log) } + // // MODULE: Get SRA run information for public database ids //