diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a7dd2f5..be366500 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,7 @@ Initial release of nf-core/metapep, created with the [nf-core](https://nf-co.re/ - [#41](https://github.com/skrakau/metapep/pull/41) - Allow `assembly` input without weights - [#53](https://github.com/skrakau/metapep/pull/53) - Add buffering of predictions and chunk-wise merging to avoid sbatch error due to too many input files [#52](https://github.com/skrakau/metapep/issues/52) +- [#3](https://github.com/nf-core/metapep/pull/3) - Fix generation of `entities_proteins.entrez.tsv` for Entrez download ### `Dependencies` diff --git a/bin/download_proteins_entrez.py b/bin/download_proteins_entrez.py index 6c74072c..08527f1a 100755 --- a/bin/download_proteins_entrez.py +++ b/bin/download_proteins_entrez.py @@ -204,18 +204,13 @@ def main(args=None): sys.exit("Entrez elink download failed!") ### for each nucleotide sequence get list of protein ids - dict_proteinId_assemblyIds = {} + dict_proteinId_assemblyIds = defaultdict(lambda : set()) for nucleotide_record in protein_results: seqId = nucleotide_record["IdList"][0] assemblyIds = dict_seqId_assemblyIds[seqId] if len(nucleotide_record["LinkSetDb"]) > 0: for protein_record in nucleotide_record["LinkSetDb"][0]["Link"]: - if protein_record["Id"] not in dict_proteinId_assemblyIds: - dict_proteinId_assemblyIds[protein_record["Id"]] = assemblyIds - else: - for i in assemblyIds: - if i not in dict_proteinId_assemblyIds[protein_record["Id"]]: - dict_proteinId_assemblyIds[protein_record["Id"]].append(i) + dict_proteinId_assemblyIds[protein_record["Id"]].update(assemblyIds) # NOTE: # some proteins, such as 487413233, occur within multiple sequences of the assembly!