nf-core · AntoniaSchuster · Jan 27, 2022 · Dec 23, 2021 · Dec 23, 2021 · Jan 10, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -28,6 +28,7 @@ Initial release of nf-core/metapep, created with the [nf-core](https://nf-co.re/
 
 - [#41](https://github.com/skrakau/metapep/pull/41) - Allow `assembly` input without weights
 - [#53](https://github.com/skrakau/metapep/pull/53) - Add buffering of predictions and chunk-wise merging to avoid sbatch error due to too many input files [#52](https://github.com/skrakau/metapep/issues/52)
+- [#3](https://github.com/nf-core/metapep/pull/3) - Fix generation of `entities_proteins.entrez.tsv` for Entrez download
 
 ### `Dependencies`
 

diff --git a/bin/download_proteins_entrez.py b/bin/download_proteins_entrez.py
@@ -204,18 +204,13 @@ def main(args=None):
             sys.exit("Entrez elink download failed!")
 
     ### for each nucleotide sequence get list of protein ids
-    dict_proteinId_assemblyIds = {}
+    dict_proteinId_assemblyIds = defaultdict(lambda : set())
     for nucleotide_record in protein_results:
         seqId = nucleotide_record["IdList"][0]
         assemblyIds = dict_seqId_assemblyIds[seqId]
         if len(nucleotide_record["LinkSetDb"]) > 0:
             for protein_record in nucleotide_record["LinkSetDb"][0]["Link"]:
-                if protein_record["Id"] not in dict_proteinId_assemblyIds:
-                    dict_proteinId_assemblyIds[protein_record["Id"]] = assemblyIds
-                else:
-                    for i in assemblyIds:
-                        if i not in dict_proteinId_assemblyIds[protein_record["Id"]]:
-                            dict_proteinId_assemblyIds[protein_record["Id"]].append(i)
+                dict_proteinId_assemblyIds[protein_record["Id"]].update(assemblyIds)
 
     # NOTE:
     # some proteins, such as 487413233, occur within multiple sequences of the assembly!