From 8e7945d3318a41de213a113ac2fb7c259f1002f5 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 16 Oct 2024 18:41:57 +0200
Subject: [PATCH] change the output format for gnps_annotations in metabolomics
 results file, improve docstrings

---
 src/nplinker/genomics/bgc.py          | 26 +++++++++++------------
 src/nplinker/metabolomics/spectrum.py | 30 +++++++++++----------------
 src/nplinker/scoring/link_graph.py    | 17 +++++++--------
 3 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 880a3710..2624cfae 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -194,22 +194,22 @@ def aa_predictions(self) -> list:
         return [self._aa_predictions]
 
     def to_dict(self) -> dict[str, any]:
-        """Convert the BGC object to a dictionary that can be used to export the results.
+        """Convert the BGC object to a dictionary for exporting results.
 
-        This method gathers relevant information from the BGC object and formats it into a dictionary
-        where each key-value pair represents a specific attribute of the BGC.
+        This method compiles relevant information from the BGC object and formats it into a dictionary.
+        Each key-value pair in the dictionary represents a specific attribute of the BGC.
 
         Returns:
-            dict[str, str]: A dictionary containing relevant information about the BGC object, including:
-                - GCF_id: A comma-separated string of GCF IDs or "-" if none.
-                - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none.
-                - BGC_name: The name of the BGC.
-                - strain_id: The ID of the strain.
-                - description: A description of the BGC.
-                - antismash_id: The antiSMASH ID.
-                - antismash_region: The antiSMASH region.
-                - antismash_cluster_type: A comma-separated string of product predictions.
-                - mibig_bgc_class: The MiBIG BGC class or "-" if none.
+            A dictionary containing the following key-value pairs:
+            - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available.
+            - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available.
+            - BGC_name (str): The name of the BGC.
+            - strain_id (str): The ID of the strain.
+            - description (str): A description of the BGC.
+            - antismash_id (str): The antiSMASH ID.
+            - antismash_region (str): The antiSMASH region.
+            - antismash_cluster_type (str): A comma-separated string of product predictions.
+            - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available.
         """
         gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None}
         gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None}
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index 3dc6b3ed..5c929a13 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -98,25 +98,21 @@ def has_strain(self, strain: Strain) -> bool:
         """
         return strain in self.strains
 
-    def _formatted_gnps_annotations(self) -> str:
-        """Format GNPS annotations dictionary into a string."""
-        return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items())
-
     def to_dict(self) -> dict[str, any]:
-        """Convert the Spectrum object to a dictionary that can be used to export the results.
+        """Convert the Spectrum object to a dictionary for exporting results.
 
-        This method gathers relevant information from the Spectrum object and formats it into a dictionary
-        where each key-value pair represents a specific attribute of the Spectrum.
+        This method compiles relevant information from the Spectrum object into a dictionary format.
+        Each key-value pair in the dictionary represents a specific attribute of the Spectrum object.
 
         Returns:
-            dict[str, str]: A dictionary containing relevant information about the Spectrum object, including:
-                - "spectrum_id": The unique identifier of the spectrum.
-                - "num_strains_with_spectrum": The number of strains associated with the spectrum.
-                - "precursor_mz": The precursor m/z value formatted to four decimal places.
-                - "rt": The retention time formatted to three decimal places.
-                - "molecular_family": The identifier of the molecular family, or "-" if not available.
-                - "gnps_id": The GNPS identifier, or "-" if not available.
-                - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available.
+            A dictionary containing the following key-value pairs:
+                - "spectrum_id" (str): The unique identifier of the spectrum.
+                - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum.
+                - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places.
+                - "rt" (float): The retention time, rounded to three decimal places.
+                - "molecular_family" (str): The identifier of the molecular family, or "-" if not available.
+                - "gnps_id" (str): The GNPS identifier, or "-" if not available.
+                - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available.
         """
         return {
             "spectrum_id": self.id,
@@ -125,7 +121,5 @@ def to_dict(self) -> dict[str, any]:
             "rt": round(self.rt, 3),
             "molecular_family": self.family.id if self.family else "-",
             "gnps_id": self.gnps_id if self.gnps_id else "-",
-            "gnps_annotations": self._formatted_gnps_annotations()
-            if self.gnps_annotations
-            else "-",
+            "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-",
         }
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 278f3941..4f7753b9 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -299,15 +299,14 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any
                 table. If None, all rows are included.
 
         Returns:
-            list: A list of dictionaries, where each dictionary contains
-                the following keys:
-                - Index (int)
-                - Genomic Object Type (str)
-                - Genomic Object ID (str or int)
-                - Metabolomic Object Type (str)
-                - Metabolomic Object ID (str or int)
-                - Metcalf Score (str, formatted to 2 decimal places, or "-")
-                - Rosetta Score (str, formatted to 2 decimal places, or "-")
+            A list of dictionaries, where each dictionary contains the following keys:
+                - index (int): The index of the link.
+                - genomic_object_type (str): The type of the genomic object.
+                - genomic_object_id (str or int): The ID of the genomic object.
+                - metabolomic_object_type (str): The type of the metabolomic object.
+                - metabolomic_object_id (str or int): The ID of the metabolomic object.
+                - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-".
+                - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-".
         """
         genomic_object_classes = (GCF,)