From f5a339d9325e22e360732c704146f409989ea883 Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Thu, 23 Dec 2021 21:56:47 +0100 Subject: [PATCH 1/4] Fix generation of dict_proteinId_assemblyIds for download --- bin/download_proteins_entrez.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bin/download_proteins_entrez.py b/bin/download_proteins_entrez.py index 7a2e93dc..620c9e01 100755 --- a/bin/download_proteins_entrez.py +++ b/bin/download_proteins_entrez.py @@ -204,18 +204,13 @@ def main(args=None): sys.exit("Entrez elink download failed!") ### for each nucleotide sequence get list of protein ids - dict_proteinId_assemblyIds = {} + dict_proteinId_assemblyIds = defaultdict(lambda : set()) for nucleotide_record in protein_results: seqId = nucleotide_record["IdList"][0] assemblyIds = dict_seqId_assemblyIds[seqId] if len(nucleotide_record["LinkSetDb"]) > 0: for protein_record in nucleotide_record["LinkSetDb"][0]["Link"]: - if protein_record["Id"] not in dict_proteinId_assemblyIds: - dict_proteinId_assemblyIds[protein_record["Id"]] = assemblyIds - else: - for i in assemblyIds: - if i not in dict_proteinId_assemblyIds[protein_record["Id"]]: - dict_proteinId_assemblyIds[protein_record["Id"]].append(i) + dict_proteinId_assemblyIds[protein_record["Id"]].update(assemblyIds) # NOTE: # some proteins, such as 487413233, occur within multiple sequences of the assembly! From cdb275ccd4296997537a8bfdfe9529d2a5ba9f65 Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Thu, 23 Dec 2021 22:35:49 +0100 Subject: [PATCH 2/4] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1027232e..cf2a43b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ Initial release of nf-core/metapep, created with the [nf-core](https://nf-co.re/ - [#41](https://github.com/skrakau/metapep/pull/41) - Allow `assembly` input without weights - [#53](https://github.com/skrakau/metapep/pull/53) - Add buffering of predictions and chunk-wise merging to avoid sbatch error due to too many input files [#52](https://github.com/skrakau/metapep/issues/52) +- [#3](https://github.com/nf-core/metapep/pull/3) - Fix generation of `entities_proteins.entrez.tsv` for Entrez download ### `Dependencies` From 46d7a3ed10dd08b351089548721e869b86fdc895 Mon Sep 17 00:00:00 2001 From: AntoniaSchuster <56543138+AntoniaSchuster@users.noreply.github.com> Date: Mon, 10 Jan 2022 13:12:59 +0100 Subject: [PATCH 3/4] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf2a43b8..731394eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v0.9dev - [date] +## v1.0.0dev - [date] Initial release of nf-core/metapep, created with the [nf-core](https://nf-co.re/) template. From 725cff4a09f18b67a53afd7b659e903c5b767811 Mon Sep 17 00:00:00 2001 From: Antonia Schuster <56543138+AntoniaSchuster@users.noreply.github.com> Date: Thu, 27 Jan 2022 10:05:28 +0100 Subject: [PATCH 4/4] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db6b9741..be366500 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - ## v1.0dev - [2022-01-20] Pipeline has been re-implemented in [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html)