From 6b8d3d38163595a3e3ebf98e85567a004f4affd3 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 15 Oct 2024 16:19:38 +0200
Subject: [PATCH 01/24] add print links method to LinkGraph, improve LinkGraph
 string representation

---
 src/nplinker/scoring/link_graph.py    | 102 ++++++++++++++++++++------
 tests/unit/scoring/test_link_graph.py |  17 +++++
 2 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 50151997..90336635 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from collections.abc import Sequence
 from functools import wraps
+from os import PathLike
 from typing import Union
 from networkx import Graph
 from tabulate import tabulate
@@ -76,17 +77,17 @@ def __init__(self) -> None:
 
             Display the empty LinkGraph object:
             >>> lg
-            |    |   Object 1 |   Object 2 |   Metcalf Score |   Rosetta Score |
-            |----|------------|------------|-----------------|-----------------|
+            |    | Genomic Object Type   | Genomic Object ID   | Metabolomic Object Type   | Metabolomic Object ID   | Metcalf Score   | Rosetta Score   |
+            |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------|
 
             Add a link between a GCF and a Spectrum object:
             >>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5}))
 
             Display all links in LinkGraph object:
             >>> lg
-            |    |     Object 1 |               Object 2 |   Metcalf Score |   Rosetta Score |
-            |----|--------------|------------------------|-----------------|-----------------|
-            |  1 | GCF(id=gcf1) | Spectrum(id=spectrum1) |               1 |               - |
+            |    | Genomic Object Type   | Genomic Object ID   | Metabolomic Object Type   | Metabolomic Object ID   | Metcalf Score   | Rosetta Score   |
+            |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------|
+            |  1 | GCF                   | 1                   | Spectrum                  | 1                       | 1.00            | -               |
 
             Get all links for a given object:
             >>> lg[gcf]
@@ -285,35 +286,92 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
         if link_data is not None:
             lg.add_link(u, v, **link_data)
 
-    def _get_table_repr(self) -> str:
-        """Generate a table representation of the LinkGraph.
+    def get_table_data(self, display_limit: int | None = None) -> list[dict]:
+        """Generate the table data for the LinkGraph.
+
+        This method iterates over the links in the LinkGraph and constructs a table
+        containing information about genomic and metabolomic objects, as well as their
+        associated scores. Each row in the table represents a link between a genomic
+        object and a metabolomic object.
 
-        The table is truncated to 60 links.
+        Args:
+            display_limit (int | None): The maximum number of rows to include in the
+                table. If None, all rows are included.
+
+        Returns:
+            list: A list of dictionaries, where each dictionary contains
+                the following keys:
+                - Index (int)
+                - Genomic Object Type (str)
+                - Genomic Object ID (str or int)
+                - Metabolomic Object Type (str)
+                - Metabolomic Object ID (str or int)
+                - Metcalf Score (str, formatted to 2 decimal places, or "-")
+                - Rosetta Score (str, formatted to 2 decimal places, or "-")
         """
-        headers = ["", "Object 1", "Object 2", "Metcalf Score", "Rosetta Score"]
+        genomic_object_classes = (GCF,)
+
         table_data = []
-        display_limit = 60
 
         for index, (u, v, data) in enumerate(self.links, start=1):
+            genomic_object = u if isinstance(u, genomic_object_classes) else v
+            metabolomic_object = v if isinstance(u, genomic_object_classes) else u
             metcalf_score = data.get("metcalf")
             rosetta_score = data.get("rosetta")
 
-            row = [
-                index,
-                str(u if isinstance(u, GCF) else v),
-                str(v if isinstance(u, GCF) else u),
-                f"{metcalf_score.value:.2f}" if metcalf_score else "-",
-                f"{rosetta_score.value:.2f}" if rosetta_score else "-",
-            ]
-            table_data.append(row)
-
-            if index == display_limit:
+            table_data.append(
+                {
+                    "Index": index,
+                    "Genomic Object Type": genomic_object.__class__.__name__,
+                    "Genomic Object ID": genomic_object.id,
+                    "Metabolomic Object Type": metabolomic_object.__class__.__name__,
+                    "Metabolomic Object ID": metabolomic_object.id,
+                    "Metcalf Score": f"{metcalf_score.value:.2f}" if metcalf_score else "-",
+                    "Rosetta Score": f"{rosetta_score.value:.2f}" if rosetta_score else "-",
+                }
+            )
+
+            if display_limit is not None and index == display_limit:
                 break
 
-        table = tabulate(table_data, headers=headers, tablefmt="github", stralign="right")
+        return table_data
+
+    def _get_table_repr(self, display_limit: int | None = 60) -> str:
+        """Generate a table representation of the LinkGraph.
+
+        Args:
+            display_limit: The maximum number of links to display in the table. Defaults to 60.
+
+        Returns:
+            str: A string representation of the table in GitHub-flavored markdown format. If the number of links
+            exceeds the display limit, the table is truncated and an additional line indicating the total number
+            of links is appended.
+        """
+        table = tabulate(
+            self.get_table_data(display_limit),
+            headers="keys",
+            tablefmt="github",
+            stralign="right",
+        )
 
         if len(self.links) > display_limit:
             truncated_info = f"...\n[ {len(self.links)} links ]"
-            return f"{table}\n{truncated_info}"
+            table += f"\n{truncated_info}"
 
         return table
+
+    def print_links(self, file: str | PathLike) -> None:
+        """Print the links in the LinkGraph to a file.
+
+        Args:
+            file: the file to write the links to.
+
+        Examples:
+            >>> lg.print_links("links.tsv")
+        """
+        table_data = self.get_table_data()
+        headers = table_data[0].keys()
+        with open(file, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for row in table_data:
+                f.write("\t".join(str(row[h]) for h in headers) + "\n")
diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py
index 9f7c9d7d..4745c856 100644
--- a/tests/unit/scoring/test_link_graph.py
+++ b/tests/unit/scoring/test_link_graph.py
@@ -112,3 +112,20 @@ def test_filter(gcfs, spectra, score):
     # test filtering with GCFs and Spectra
     lg_filtered = lg.filter(u_nodes, v_nodes)
     assert len(lg_filtered) == 4
+
+
+def test_get_table_data(lg, gcfs, spectra, score):
+    table_data = lg.get_table_data()
+    assert type(table_data) is list
+    assert type(table_data[0]) is dict
+    assert table_data == [
+        {
+            "Index": 1,
+            "Genomic Object Type": gcfs[0].__class__.__name__,
+            "Genomic Object ID": gcfs[0].id,
+            "Metabolomic Object Type": spectra[0].__class__.__name__,
+            "Metabolomic Object ID": spectra[0].id,
+            "Metcalf Score": f"{score.value:.2f}",
+            "Rosetta Score": "-",
+        },
+    ]

From cdd26c3330c6867c1be09b1a5ed90e3a16088fa3 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 16 Oct 2024 17:17:38 +0200
Subject: [PATCH 02/24] feat: add a method to print tabular results files

---
 src/nplinker/genomics/bgc.py          | 21 ++++++++++++
 src/nplinker/metabolomics/spectrum.py | 16 +++++++++
 src/nplinker/nplinker.py              | 49 +++++++++++++++++++++++++++
 src/nplinker/scoring/link_graph.py    | 14 ++++----
 4 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 08978587..57161d07 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -192,3 +192,24 @@ def aa_predictions(self) -> list:
                 for p in predict_aa(self.antismash_file):
                     self._aa_predictions[p[0]] = p[1]
         return [self._aa_predictions]
+
+    def to_dict(self) -> dict:
+        """Convert the BGC object to a dictionary that can be used to export the results.
+
+        Returns:
+            A dictionary containing relavant information about the BGC object.
+        """
+        gcf_ids = [gcf.id for gcf in self.parents if gcf.id is not None]
+        gcf_bsc = [gcf.bigscape_class for gcf in self.parents if gcf.bigscape_class is not None]
+
+        return {
+            "GCF_id": ", ".join(gcf_ids) if gcf_ids else None,
+            "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else None,
+            "BGC_name": self.id,
+            "strain_id": self.strain.id,
+            "description": self.description,
+            "antismash_id": self.antismash_id,
+            "antismash_region": self.antismash_region,
+            "antismash_cluster_type": ", ".join(self.product_prediction),
+            "mibig_bgc_class": self.mibig_bgc_class,
+        }
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index 61d8d421..a2891a2b 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -97,3 +97,19 @@ def has_strain(self, strain: Strain) -> bool:
             True when the given strain exist in the spectrum.
         """
         return strain in self.strains
+
+    def to_dict(self) -> dict:
+        """Convert the Spectrum object to a dictionary that can be used to export the results.
+
+        Returns:
+            A dictionary containing relavant information about the Spectrum object.
+        """
+        return {
+            "spectrum_id": self.id,
+            "num_strains_with_spectrum": len(self.strains),
+            "precursor_mz": self.precursor_mz,
+            "rt": self.rt,
+            "molecular_family": self.family.id if self.family else None,
+            "gnps_id": self.gnps_id,
+            "gnps_annotations": self.gnps_annotations,
+        }
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index a7146dcc..f15ee1dd 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -355,3 +355,52 @@ def save_data(
         data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
         with open(file, "wb") as f:
             pickle.dump(data, f)
+
+    def print_bgcs(self, file: str | PathLike) -> None:
+        """Prints the BGC data to a specified file in tab-separated format.
+
+        Args:
+            file: The path to the file where the BGC data will be printed.
+        """
+        headers = self.bgcs[0].to_dict().keys()
+
+        with open(file, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for bgc in self.bgcs:
+                row_data = bgc.to_dict()
+                f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
+
+    def print_gcfs(self, file: str | PathLike) -> None:
+        """Prints the GCF data to a specified file in tab-separated format.
+
+        Args:
+            file: The path to the file where the GCF data will be printed.
+        """
+        headers = self.gcfs[0].to_dict().keys()
+
+        with open(file, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for gcf in self.gcfs:
+                row_data = gcf.to_dict()
+                f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
+
+    def print_spectra(self, file: str | PathLike) -> None:
+        """Prints the Spectrum data to a specified file in tab-separated format.
+
+        Args:
+            file: The path to the file where the Spectrum data will be printed.
+        """
+        headers = self.spectra[0].to_dict().keys()
+
+        with open(file, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for spectrum in self.spectra:
+                row_data = spectrum.to_dict()
+                f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
+
+    def print_results(self, lg: LinkGraph | None = None) -> None:
+        """Prints the results to the output directory in tab-separated format."""
+        self.print_bgcs(self._output_dir / "genomics_data.tsv")
+        self.print_spectra(self._output_dir / "metabolomics_data.tsv")
+        if lg is not None:
+            lg.print_links(self._output_dir / "links.tsv")
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 90336635..fd1db438 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -321,13 +321,13 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict]:
 
             table_data.append(
                 {
-                    "Index": index,
-                    "Genomic Object Type": genomic_object.__class__.__name__,
-                    "Genomic Object ID": genomic_object.id,
-                    "Metabolomic Object Type": metabolomic_object.__class__.__name__,
-                    "Metabolomic Object ID": metabolomic_object.id,
-                    "Metcalf Score": f"{metcalf_score.value:.2f}" if metcalf_score else "-",
-                    "Rosetta Score": f"{rosetta_score.value:.2f}" if rosetta_score else "-",
+                    "index": index,
+                    "genomic_object_type": genomic_object.__class__.__name__,
+                    "genomic_object_id": genomic_object.id,
+                    "metabolomic_object_type": metabolomic_object.__class__.__name__,
+                    "metabolomic_object_id": metabolomic_object.id,
+                    "metcalf_score": f"{metcalf_score.value:.2f}" if metcalf_score else "-",
+                    "rosetta_score": f"{rosetta_score.value:.2f}" if rosetta_score else "-",
                 }
             )
 

From ec8b8ae0a12885d6ddbc28ebb5b3c90b156e1140 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 16 Oct 2024 17:26:58 +0200
Subject: [PATCH 03/24] improve method names and docstrings, remove unused
 method to export gcf file

---
 src/nplinker/nplinker.py           | 47 +++++++++++++++---------------
 src/nplinker/scoring/link_graph.py |  4 +--
 2 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index f15ee1dd..9e87ed92 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -356,11 +356,13 @@ def save_data(
         with open(file, "wb") as f:
             pickle.dump(data, f)
 
-    def print_bgcs(self, file: str | PathLike) -> None:
-        """Prints the BGC data to a specified file in tab-separated format.
+    def export_genomics_data(self, file: str | PathLike) -> None:
+        """Exports the genomics data to a specified file in tab-separated format.
+
+        Each row in the file corresponds to a BGC object.
 
         Args:
-            file: The path to the file where the BGC data will be printed.
+            file: The path to the file where the genomics data will be printed.
         """
         headers = self.bgcs[0].to_dict().keys()
 
@@ -370,25 +372,13 @@ def print_bgcs(self, file: str | PathLike) -> None:
                 row_data = bgc.to_dict()
                 f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
 
-    def print_gcfs(self, file: str | PathLike) -> None:
-        """Prints the GCF data to a specified file in tab-separated format.
-
-        Args:
-            file: The path to the file where the GCF data will be printed.
-        """
-        headers = self.gcfs[0].to_dict().keys()
-
-        with open(file, "w") as f:
-            f.write("\t".join(headers) + "\n")
-            for gcf in self.gcfs:
-                row_data = gcf.to_dict()
-                f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
+    def export_metabolomics_data(self, file: str | PathLike) -> None:
+        """Exports the metabolomics data to a specified file in tab-separated format.
 
-    def print_spectra(self, file: str | PathLike) -> None:
-        """Prints the Spectrum data to a specified file in tab-separated format.
+        Each row in the file corresponds to a Spectrum object.
 
         Args:
-            file: The path to the file where the Spectrum data will be printed.
+            file: The path to the file where the metabolomics data will be printed.
         """
         headers = self.spectra[0].to_dict().keys()
 
@@ -398,9 +388,18 @@ def print_spectra(self, file: str | PathLike) -> None:
                 row_data = spectrum.to_dict()
                 f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
 
-    def print_results(self, lg: LinkGraph | None = None) -> None:
-        """Prints the results to the output directory in tab-separated format."""
-        self.print_bgcs(self._output_dir / "genomics_data.tsv")
-        self.print_spectra(self._output_dir / "metabolomics_data.tsv")
+    def export_results(self, lg: LinkGraph | None = None) -> None:
+        """Exports the results to the output directory in tab-separated format.
+
+        This method exports genomics and metabolomics data to their respective
+        TSV files in the specified output directory. If a LinkGraph object is
+        provided, it also exports the links data to a TSV file.
+
+        Args:
+            lg (LinkGraph | None): An optional LinkGraph object. If provided,
+                       the links data will be exported to 'links.tsv'.
+        """
+        self.export_genomics_data(self._output_dir / "genomics_data.tsv")
+        self.export_metabolomics_data(self._output_dir / "metabolomics_data.tsv")
         if lg is not None:
-            lg.print_links(self._output_dir / "links.tsv")
+            lg.export_links(self._output_dir / "links.tsv")
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index fd1db438..86a9ca6e 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -360,8 +360,8 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str:
 
         return table
 
-    def print_links(self, file: str | PathLike) -> None:
-        """Print the links in the LinkGraph to a file.
+    def export_links(self, file: str | PathLike) -> None:
+        """Exports the links in the LinkGraph to a file.
 
         Args:
             file: the file to write the links to.

From 2207df1eb1a5cc7a5df8d801a103dfe31a162f68 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 16 Oct 2024 18:07:22 +0200
Subject: [PATCH 04/24] improve docstring and typing

---
 src/nplinker/genomics/bgc.py          | 26 +++++++++++++++++-------
 src/nplinker/metabolomics/spectrum.py | 29 ++++++++++++++++++++-------
 src/nplinker/scoring/link_graph.py    |  2 +-
 3 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 57161d07..880a3710 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -193,23 +193,35 @@ def aa_predictions(self) -> list:
                     self._aa_predictions[p[0]] = p[1]
         return [self._aa_predictions]
 
-    def to_dict(self) -> dict:
+    def to_dict(self) -> dict[str, any]:
         """Convert the BGC object to a dictionary that can be used to export the results.
 
+        This method gathers relevant information from the BGC object and formats it into a dictionary
+        where each key-value pair represents a specific attribute of the BGC.
+
         Returns:
-            A dictionary containing relavant information about the BGC object.
+            dict[str, str]: A dictionary containing relevant information about the BGC object, including:
+                - GCF_id: A comma-separated string of GCF IDs or "-" if none.
+                - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none.
+                - BGC_name: The name of the BGC.
+                - strain_id: The ID of the strain.
+                - description: A description of the BGC.
+                - antismash_id: The antiSMASH ID.
+                - antismash_region: The antiSMASH region.
+                - antismash_cluster_type: A comma-separated string of product predictions.
+                - mibig_bgc_class: The MiBIG BGC class or "-" if none.
         """
-        gcf_ids = [gcf.id for gcf in self.parents if gcf.id is not None]
-        gcf_bsc = [gcf.bigscape_class for gcf in self.parents if gcf.bigscape_class is not None]
+        gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None}
+        gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None}
 
         return {
-            "GCF_id": ", ".join(gcf_ids) if gcf_ids else None,
-            "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else None,
+            "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-",
+            "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else "-",
             "BGC_name": self.id,
             "strain_id": self.strain.id,
             "description": self.description,
             "antismash_id": self.antismash_id,
             "antismash_region": self.antismash_region,
             "antismash_cluster_type": ", ".join(self.product_prediction),
-            "mibig_bgc_class": self.mibig_bgc_class,
+            "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "-",
         }
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index a2891a2b..20f64c9f 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -98,18 +98,33 @@ def has_strain(self, strain: Strain) -> bool:
         """
         return strain in self.strains
 
-    def to_dict(self) -> dict:
+    def to_dict(self) -> dict[str, any]:
         """Convert the Spectrum object to a dictionary that can be used to export the results.
 
+        This method gathers relevant information from the Spectrum object and formats it into a dictionary
+        where each key-value pair represents a specific attribute of the Spectrum.
+
         Returns:
-            A dictionary containing relavant information about the Spectrum object.
+            dict[str, str]: A dictionary containing relevant information about the Spectrum object, including:
+                - "spectrum_id": The unique identifier of the spectrum.
+                - "num_strains_with_spectrum": The number of strains associated with the spectrum.
+                - "precursor_mz": The precursor m/z value formatted to four decimal places.
+                - "rt": The retention time formatted to three decimal places.
+                - "molecular_family": The identifier of the molecular family, or "-" if not available.
+                - "gnps_id": The GNPS identifier, or "-" if not available.
+                - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available.
         """
+
+        def format_gnps_annotations(annotations: dict) -> str:
+            """Format GNPS annotations dictionary into a string."""
+            return "; ".join(f"{k}: {v}" for k, v in annotations.items())
+
         return {
             "spectrum_id": self.id,
             "num_strains_with_spectrum": len(self.strains),
-            "precursor_mz": self.precursor_mz,
-            "rt": self.rt,
-            "molecular_family": self.family.id if self.family else None,
-            "gnps_id": self.gnps_id,
-            "gnps_annotations": self.gnps_annotations,
+            "precursor_mz": round(self.precursor_mz, 4),
+            "rt": round(self.rt, 3),
+            "molecular_family": self.family.id if self.family else "-",
+            "gnps_id": self.gnps_id if self.gnps_id else "-",
+            "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-",
         }
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 86a9ca6e..278f3941 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -286,7 +286,7 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
         if link_data is not None:
             lg.add_link(u, v, **link_data)
 
-    def get_table_data(self, display_limit: int | None = None) -> list[dict]:
+    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]:
         """Generate the table data for the LinkGraph.
 
         This method iterates over the links in the LinkGraph and constructs a table

From c6e166a04647876cefd276036f1f9f799e7ecbbb Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 16 Oct 2024 18:15:04 +0200
Subject: [PATCH 05/24] fix a failing test

---
 tests/unit/scoring/test_link_graph.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py
index 4745c856..4c7e68b3 100644
--- a/tests/unit/scoring/test_link_graph.py
+++ b/tests/unit/scoring/test_link_graph.py
@@ -118,14 +118,11 @@ def test_get_table_data(lg, gcfs, spectra, score):
     table_data = lg.get_table_data()
     assert type(table_data) is list
     assert type(table_data[0]) is dict
-    assert table_data == [
-        {
-            "Index": 1,
-            "Genomic Object Type": gcfs[0].__class__.__name__,
-            "Genomic Object ID": gcfs[0].id,
-            "Metabolomic Object Type": spectra[0].__class__.__name__,
-            "Metabolomic Object ID": spectra[0].id,
-            "Metcalf Score": f"{score.value:.2f}",
-            "Rosetta Score": "-",
-        },
-    ]
+    assert len(table_data) == 1
+    assert table_data[0]["index"] == 1
+    assert table_data[0]["genomic_object_type"] == gcfs[0].__class__.__name__
+    assert table_data[0]["genomic_object_id"] == gcfs[0].id
+    assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__
+    assert table_data[0]["metabolomic_object_id"] == spectra[0].id
+    assert table_data[0]["metcalf_score"] == f"{score.value:.2f}"
+    assert table_data[0]["rosetta_score"] == "-"

From 32ca3ddd534c23cceede4c6318b82d5bd42c1ba2 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 16 Oct 2024 18:21:43 +0200
Subject: [PATCH 06/24] refactor a little bit the spectrum method to convert to
 dict

---
 src/nplinker/metabolomics/spectrum.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index 20f64c9f..3dc6b3ed 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -98,6 +98,10 @@ def has_strain(self, strain: Strain) -> bool:
         """
         return strain in self.strains
 
+    def _formatted_gnps_annotations(self) -> str:
+        """Format GNPS annotations dictionary into a string."""
+        return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items())
+
     def to_dict(self) -> dict[str, any]:
         """Convert the Spectrum object to a dictionary that can be used to export the results.
 
@@ -114,11 +118,6 @@ def to_dict(self) -> dict[str, any]:
                 - "gnps_id": The GNPS identifier, or "-" if not available.
                 - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available.
         """
-
-        def format_gnps_annotations(annotations: dict) -> str:
-            """Format GNPS annotations dictionary into a string."""
-            return "; ".join(f"{k}: {v}" for k, v in annotations.items())
-
         return {
             "spectrum_id": self.id,
             "num_strains_with_spectrum": len(self.strains),
@@ -126,5 +125,7 @@ def format_gnps_annotations(annotations: dict) -> str:
             "rt": round(self.rt, 3),
             "molecular_family": self.family.id if self.family else "-",
             "gnps_id": self.gnps_id if self.gnps_id else "-",
-            "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-",
+            "gnps_annotations": self._formatted_gnps_annotations()
+            if self.gnps_annotations
+            else "-",
         }

From 8e7945d3318a41de213a113ac2fb7c259f1002f5 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Wed, 16 Oct 2024 18:41:57 +0200
Subject: [PATCH 07/24] change the output format for gnps_annotations in
 metabolomics results file, improve docstrings

---
 src/nplinker/genomics/bgc.py          | 26 +++++++++++------------
 src/nplinker/metabolomics/spectrum.py | 30 +++++++++++----------------
 src/nplinker/scoring/link_graph.py    | 17 +++++++--------
 3 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 880a3710..2624cfae 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -194,22 +194,22 @@ def aa_predictions(self) -> list:
         return [self._aa_predictions]
 
     def to_dict(self) -> dict[str, any]:
-        """Convert the BGC object to a dictionary that can be used to export the results.
+        """Convert the BGC object to a dictionary for exporting results.
 
-        This method gathers relevant information from the BGC object and formats it into a dictionary
-        where each key-value pair represents a specific attribute of the BGC.
+        This method compiles relevant information from the BGC object and formats it into a dictionary.
+        Each key-value pair in the dictionary represents a specific attribute of the BGC.
 
         Returns:
-            dict[str, str]: A dictionary containing relevant information about the BGC object, including:
-                - GCF_id: A comma-separated string of GCF IDs or "-" if none.
-                - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none.
-                - BGC_name: The name of the BGC.
-                - strain_id: The ID of the strain.
-                - description: A description of the BGC.
-                - antismash_id: The antiSMASH ID.
-                - antismash_region: The antiSMASH region.
-                - antismash_cluster_type: A comma-separated string of product predictions.
-                - mibig_bgc_class: The MiBIG BGC class or "-" if none.
+            A dictionary containing the following key-value pairs:
+            - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available.
+            - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available.
+            - BGC_name (str): The name of the BGC.
+            - strain_id (str): The ID of the strain.
+            - description (str): A description of the BGC.
+            - antismash_id (str): The antiSMASH ID.
+            - antismash_region (str): The antiSMASH region.
+            - antismash_cluster_type (str): A comma-separated string of product predictions.
+            - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available.
         """
         gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None}
         gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None}
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index 3dc6b3ed..5c929a13 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -98,25 +98,21 @@ def has_strain(self, strain: Strain) -> bool:
         """
         return strain in self.strains
 
-    def _formatted_gnps_annotations(self) -> str:
-        """Format GNPS annotations dictionary into a string."""
-        return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items())
-
     def to_dict(self) -> dict[str, any]:
-        """Convert the Spectrum object to a dictionary that can be used to export the results.
+        """Convert the Spectrum object to a dictionary for exporting results.
 
-        This method gathers relevant information from the Spectrum object and formats it into a dictionary
-        where each key-value pair represents a specific attribute of the Spectrum.
+        This method compiles relevant information from the Spectrum object into a dictionary format.
+        Each key-value pair in the dictionary represents a specific attribute of the Spectrum Object.
 
         Returns:
-            dict[str, str]: A dictionary containing relevant information about the Spectrum object, including:
-                - "spectrum_id": The unique identifier of the spectrum.
-                - "num_strains_with_spectrum": The number of strains associated with the spectrum.
-                - "precursor_mz": The precursor m/z value formatted to four decimal places.
-                - "rt": The retention time formatted to three decimal places.
-                - "molecular_family": The identifier of the molecular family, or "-" if not available.
-                - "gnps_id": The GNPS identifier, or "-" if not available.
-                - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available.
+            A dictionary containing the following key-value pairs:
+                - "spectrum_id" (str): The unique identifier of the spectrum.
+                - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum.
+                - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places.
+                - "rt" (float): The retention time, rounded to three decimal places.
+                - "molecular_family" (str): The identifier of the molecular family, or "-" if not available.
+                - "gnps_id" (str): The GNPS identifier, or "-" if not available.
+                - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available.
         """
         return {
             "spectrum_id": self.id,
@@ -125,7 +121,5 @@ def to_dict(self) -> dict[str, any]:
             "rt": round(self.rt, 3),
             "molecular_family": self.family.id if self.family else "-",
             "gnps_id": self.gnps_id if self.gnps_id else "-",
-            "gnps_annotations": self._formatted_gnps_annotations()
-            if self.gnps_annotations
-            else "-",
+            "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-",
         }
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 278f3941..4f7753b9 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -299,15 +299,14 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any
                 table. If None, all rows are included.
 
         Returns:
-            list: A list of dictionaries, where each dictionary contains
-                the following keys:
-                - Index (int)
-                - Genomic Object Type (str)
-                - Genomic Object ID (str or int)
-                - Metabolomic Object Type (str)
-                - Metabolomic Object ID (str or int)
-                - Metcalf Score (str, formatted to 2 decimal places, or "-")
-                - Rosetta Score (str, formatted to 2 decimal places, or "-")
+            A list of dictionaries, where each dictionary contains
+                - index (int): The index of the link.
+                - genomic_object_type (str): The type of the genomic object.
+                - genomic_object_id (str or int): The ID of the genomic object.
+                - metabolomic_object_type (str): The type of the metabolomic object.
+                - metabolomic_object_id (str or int): The ID of the metabolomic object.
+                - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-".
+                - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-".
         """
         genomic_object_classes = (GCF,)
 

From 25928100a34bc0e0d49b706895e43fab50b0cee7 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Thu, 17 Oct 2024 14:47:36 +0200
Subject: [PATCH 08/24] fix: convert int to str before using join

---
 src/nplinker/genomics/bgc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 2624cfae..d9787a38 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -211,8 +211,8 @@ def to_dict(self) -> dict[str, any]:
             - antismash_cluster_type (str): A comma-separated string of product predictions.
             - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available.
         """
-        gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None}
-        gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None}
+        gcf_ids = {str(gcf.id) for gcf in self.parents if gcf.id is not None}
+        gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None}
 
         return {
             "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-",

From 7f53de8456cd999c456bdd28fad07b2aca541c8a Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Thu, 17 Oct 2024 15:40:07 +0200
Subject: [PATCH 09/24] change representation of empty values in output files
 for improved integration to excel

---
 src/nplinker/genomics/bgc.py          |  6 +++---
 src/nplinker/metabolomics/spectrum.py |  6 +++---
 src/nplinker/scoring/link_graph.py    | 24 ++++++++++++++++++------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index d9787a38..902ba5f2 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -215,13 +215,13 @@ def to_dict(self) -> dict[str, any]:
         gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None}
 
         return {
-            "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-",
-            "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else "-",
+            "GCF_id": ", ".join(gcf_ids),
+            "GCF_bigscape_class": ", ".join(gcf_bsc),
             "BGC_name": self.id,
             "strain_id": self.strain.id,
             "description": self.description,
             "antismash_id": self.antismash_id,
             "antismash_region": self.antismash_region,
             "antismash_cluster_type": ", ".join(self.product_prediction),
-            "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "-",
+            "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "",
         }
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index 5c929a13..2b89dddc 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -119,7 +119,7 @@ def to_dict(self) -> dict[str, any]:
             "num_strains_with_spectrum": len(self.strains),
             "precursor_mz": round(self.precursor_mz, 4),
             "rt": round(self.rt, 3),
-            "molecular_family": self.family.id if self.family else "-",
-            "gnps_id": self.gnps_id if self.gnps_id else "-",
-            "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-",
+            "molecular_family": self.family.id if self.family else "",
+            "gnps_id": self.gnps_id if self.gnps_id else "",
+            "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "",
         }
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 4f7753b9..bd715723 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -104,6 +104,18 @@ def __init__(self) -> None:
             Get the link data between two objects:
             >>> lg.get_link_data(gcf, spectrum)
             {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})}
+
+            Filter the links for `gcf1` and `gcf2`:
+            >>> new_lg = lg.filter([gcf1, gcf2])
+
+            Filter the links for `spectrum1` and `spectrum2`:
+            >>> new_lg = lg.filter([spectrum1, spectrum2])
+
+            Filter the links between two lists of objects:
+            >>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2])
+
+            Export the links to a file:
+            >>> lg.export_links("links.tsv")
         """
         self._g: Graph = Graph()
 
@@ -305,8 +317,8 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any
                 - genomic_object_id (str or int): The ID of the genomic object.
                 - metabolomic_object_type (str): The type of the metabolomic object.
                 - metabolomic_object_id (str or int): The ID of the metabolomic object.
-                - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-".
-                - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-".
+                - metcalf_score (float): The Metcalf score, rounded to 2 decimal places.
+                - rosetta_score (float): The Rosetta score, rounded to 2 decimal places.
         """
         genomic_object_classes = (GCF,)
 
@@ -321,12 +333,12 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any
             table_data.append(
                 {
                     "index": index,
-                    "genomic_object_type": genomic_object.__class__.__name__,
                     "genomic_object_id": genomic_object.id,
-                    "metabolomic_object_type": metabolomic_object.__class__.__name__,
+                    "genomic_object_type": genomic_object.__class__.__name__,
                     "metabolomic_object_id": metabolomic_object.id,
-                    "metcalf_score": f"{metcalf_score.value:.2f}" if metcalf_score else "-",
-                    "rosetta_score": f"{rosetta_score.value:.2f}" if rosetta_score else "-",
+                    "metabolomic_object_type": metabolomic_object.__class__.__name__,
+                    "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "",
+                    "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
                 }
             )
 

From ad049c843384c68dfa24dbee5ab99d00f6726c27 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Thu, 17 Oct 2024 17:00:59 +0200
Subject: [PATCH 10/24] refactoring the export methods

---
 src/nplinker/genomics/bgc.py          |  9 ++--
 src/nplinker/metabolomics/spectrum.py |  6 +--
 src/nplinker/nplinker.py              | 55 +++++++++++-------------
 src/nplinker/scoring/link_graph.py    | 62 +++++++++++++++------------
 tests/unit/scoring/test_link_graph.py |  4 +-
 5 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 902ba5f2..486b9861 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -211,17 +211,14 @@ def to_dict(self) -> dict[str, any]:
             - antismash_cluster_type (str): A comma-separated string of product predictions.
             - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available.
         """
-        gcf_ids = {str(gcf.id) for gcf in self.parents if gcf.id is not None}
-        gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None}
-
         return {
-            "GCF_id": ", ".join(gcf_ids),
-            "GCF_bigscape_class": ", ".join(gcf_bsc),
+            "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None},
+            "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None},
             "BGC_name": self.id,
             "strain_id": self.strain.id,
             "description": self.description,
             "antismash_id": self.antismash_id,
             "antismash_region": self.antismash_region,
-            "antismash_cluster_type": ", ".join(self.product_prediction),
+            "antismash_cluster_type": self.product_prediction,
             "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "",
         }
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index 2b89dddc..aa008579 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -119,7 +119,7 @@ def to_dict(self) -> dict[str, any]:
             "num_strains_with_spectrum": len(self.strains),
             "precursor_mz": round(self.precursor_mz, 4),
             "rt": round(self.rt, 3),
-            "molecular_family": self.family.id if self.family else "",
-            "gnps_id": self.gnps_id if self.gnps_id else "",
-            "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "",
+            "molecular_family": self.family.id if self.family else None,
+            "gnps_id": self.gnps_id,
+            "gnps_annotations": self.gnps_annotations,
         }
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index 9e87ed92..52599957 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -356,37 +356,34 @@ def save_data(
         with open(file, "wb") as f:
             pickle.dump(data, f)
 
-    def export_genomics_data(self, file: str | PathLike) -> None:
-        """Exports the genomics data to a specified file in tab-separated format.
-
-        Each row in the file corresponds to a BGC object.
+    def export_objects(self, objects: BGC | Spectrum, filename: str) -> None:
+        """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format.
 
         Args:
-            file: The path to the file where the genomics data will be printed.
+            objects (BGC | Spectrum): A list of BGC or Spectrum objects to be exported.
+            filename (str): The name of the file where the data will be saved.
         """
-        headers = self.bgcs[0].to_dict().keys()
-
-        with open(file, "w") as f:
-            f.write("\t".join(headers) + "\n")
-            for bgc in self.bgcs:
-                row_data = bgc.to_dict()
-                f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
-
-    def export_metabolomics_data(self, file: str | PathLike) -> None:
-        """Exports the metabolomics data to a specified file in tab-separated format.
-
-        Each row in the file corresponds to a Spectrum object.
-
-        Args:
-            file: The path to the file where the metabolomics data will be printed.
-        """
-        headers = self.spectra[0].to_dict().keys()
-
-        with open(file, "w") as f:
+        headers = objects[0].to_dict().keys()
+        with open(self._output_dir / filename, "w") as f:
             f.write("\t".join(headers) + "\n")
-            for spectrum in self.spectra:
-                row_data = spectrum.to_dict()
-                f.write("\t".join(str(row_data[h]) for h in headers) + "\n")
+            for obj in objects:
+                row_data = obj.to_dict()
+                formatted_row = []
+                for header in headers:
+                    item = row_data.get(header, "")
+                    # Convert list, tuple, set to comma-separated string
+                    if isinstance(item, (list, tuple, set)):
+                        formatted_row.append(", ".join(map(str, item)))
+                    # Convert dict to comma-separated string
+                    elif isinstance(item, dict):
+                        formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()]))
+                    # Convert non-empty value to string
+                    elif item:
+                        formatted_row.append(str(item))
+                    # Convert empty value to empty string
+                    else:
+                        formatted_row.append("")
+                f.write("\t".join(formatted_row) + "\n")
 
     def export_results(self, lg: LinkGraph | None = None) -> None:
         """Exports the results to the output directory in tab-separated format.
@@ -399,7 +396,7 @@ def export_results(self, lg: LinkGraph | None = None) -> None:
             lg (LinkGraph | None): An optional LinkGraph object. If provided,
                        the links data will be exported to 'links.tsv'.
         """
-        self.export_genomics_data(self._output_dir / "genomics_data.tsv")
-        self.export_metabolomics_data(self._output_dir / "metabolomics_data.tsv")
+        self.export_objects(self.bgcs, "genomics_data.tsv")
+        self.export_objects(self.spectra, "metabolomics_data.tsv")
         if lg is not None:
             lg.export_links(self._output_dir / "links.tsv")
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index bd715723..0d6f4074 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -311,41 +311,47 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any
                 table. If None, all rows are included.
 
         Returns:
-            A list of dictionaries, where each dictionary contains
+            A list of dictionaries containing the table data.
+        """
+        table_data = []
+        for index, link in enumerate(self.links, start=1):
+            table_data.append(self.link_to_dict(link, index))
+            if display_limit is not None and index == display_limit:
+                break
+        return table_data
+
+    def link_to_dict(self, link: LINK, index: int) -> dict[str, any]:
+        """Convert a link to a dictionary representation.
+
+        Args:
+            link: A tuple containing the link information (u, v, data).
+            index: The index of the link.
+
+        Returns:
+            A dictionary containing the link information with the following keys:
                 - index (int): The index of the link.
-                - genomic_object_type (str): The type of the genomic object.
                 - genomic_object_id (str or int): The ID of the genomic object.
-                - metabolomic_object_type (str): The type of the metabolomic object.
+                - genomic_object_type (str): The type of the genomic object.
                 - metabolomic_object_id (str or int): The ID of the metabolomic object.
+                - metabolomic_object_type (str): The type of the metabolomic object.
                 - metcalf_score (float): The Metcalf score, rounded to 2 decimal places.
                 - rosetta_score (float): The Rosetta score, rounded to 2 decimal places.
         """
+        u, v, data = link
         genomic_object_classes = (GCF,)
-
-        table_data = []
-
-        for index, (u, v, data) in enumerate(self.links, start=1):
-            genomic_object = u if isinstance(u, genomic_object_classes) else v
-            metabolomic_object = v if isinstance(u, genomic_object_classes) else u
-            metcalf_score = data.get("metcalf")
-            rosetta_score = data.get("rosetta")
-
-            table_data.append(
-                {
-                    "index": index,
-                    "genomic_object_id": genomic_object.id,
-                    "genomic_object_type": genomic_object.__class__.__name__,
-                    "metabolomic_object_id": metabolomic_object.id,
-                    "metabolomic_object_type": metabolomic_object.__class__.__name__,
-                    "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "",
-                    "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
-                }
-            )
-
-            if display_limit is not None and index == display_limit:
-                break
-
-        return table_data
+        genomic_object = u if isinstance(u, genomic_object_classes) else v
+        metabolomic_object = v if isinstance(u, genomic_object_classes) else u
+        metcalf_score = data.get("metcalf")
+        rosetta_score = data.get("rosetta")
+        return {
+            "index": index,
+            "genomic_object_id": genomic_object.id,
+            "genomic_object_type": genomic_object.__class__.__name__,
+            "metabolomic_object_id": metabolomic_object.id,
+            "metabolomic_object_type": metabolomic_object.__class__.__name__,
+            "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "",
+            "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
+        }
 
     def _get_table_repr(self, display_limit: int | None = 60) -> str:
         """Generate a table representation of the LinkGraph.
diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py
index 4c7e68b3..5a4e7197 100644
--- a/tests/unit/scoring/test_link_graph.py
+++ b/tests/unit/scoring/test_link_graph.py
@@ -124,5 +124,5 @@ def test_get_table_data(lg, gcfs, spectra, score):
     assert table_data[0]["genomic_object_id"] == gcfs[0].id
     assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__
     assert table_data[0]["metabolomic_object_id"] == spectra[0].id
-    assert table_data[0]["metcalf_score"] == f"{score.value:.2f}"
-    assert table_data[0]["rosetta_score"] == "-"
+    assert table_data[0]["metcalf_score"] == round(score.value, 2)
+    assert table_data[0]["rosetta_score"] == ""

From b220fb024af7be8479fe9facc362c3de83c9520f Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Fri, 18 Oct 2024 17:07:21 +0200
Subject: [PATCH 11/24] small refactor: specify staticmethod

---
 src/nplinker/scoring/link_graph.py | 47 +++++++++++++++---------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 0d6f4074..091474e5 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -298,29 +298,8 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
         if link_data is not None:
             lg.add_link(u, v, **link_data)
 
-    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]:
-        """Generate the table data for the LinkGraph.
-
-        This method iterates over the links in the LinkGraph and constructs a table
-        containing information about genomic and metabolomic objects, as well as their
-        associated scores. Each row in the table represents a link between a genomic
-        object and a metabolomic object.
-
-        Args:
-            display_limit (int | None): The maximum number of rows to include in the
-                table. If None, all rows are included.
-
-        Returns:
-            A list of dictionaries containing the table data.
-        """
-        table_data = []
-        for index, link in enumerate(self.links, start=1):
-            table_data.append(self.link_to_dict(link, index))
-            if display_limit is not None and index == display_limit:
-                break
-        return table_data
-
-    def link_to_dict(self, link: LINK, index: int) -> dict[str, any]:
+    @staticmethod
+    def link_to_dict(link: LINK, index: int) -> dict[str, any]:
         """Convert a link to a dictionary representation.
 
         Args:
@@ -353,6 +332,28 @@ def link_to_dict(self, link: LINK, index: int) -> dict[str, any]:
             "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
         }
 
+    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]:
+        """Generate the table data for the LinkGraph.
+
+        This method iterates over the links in the LinkGraph and constructs a table
+        containing information about genomic and metabolomic objects, as well as their
+        associated scores. Each row in the table represents a link between a genomic
+        object and a metabolomic object.
+
+        Args:
+            display_limit (int | None): The maximum number of rows to include in the
+                table. If None, all rows are included.
+
+        Returns:
+            A list of dictionaries containing the table data.
+        """
+        table_data = []
+        for index, link in enumerate(self.links, start=1):
+            table_data.append(self.link_to_dict(link, index))
+            if display_limit is not None and index == display_limit:
+                break
+        return table_data
+
     def _get_table_repr(self, display_limit: int | None = 60) -> str:
         """Generate a table representation of the LinkGraph.
 

From f98fa98097c6785925382a95a4bb43922bec0e71 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Fri, 18 Oct 2024 17:10:01 +0200
Subject: [PATCH 12/24] add more tests

---
 src/nplinker/genomics/bgc.py             |  6 ++---
 tests/unit/genomics/test_bgc.py          | 28 +++++++++++++++++++++
 tests/unit/metabolomics/test_spectrum.py | 32 ++++++++++++++++++++++++
 tests/unit/scoring/test_link_graph.py    | 28 +++++++++++++++------
 4 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 486b9861..7d606fe0 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -215,10 +215,10 @@ def to_dict(self) -> dict[str, any]:
             "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None},
             "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None},
             "BGC_name": self.id,
-            "strain_id": self.strain.id,
+            "product_prediction": self.product_prediction,
+            "mibig_bgc_class": self.mibig_bgc_class,
             "description": self.description,
+            "strain_id": self.strain.id,
             "antismash_id": self.antismash_id,
             "antismash_region": self.antismash_region,
-            "antismash_cluster_type": self.product_prediction,
-            "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "",
         }
diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py
index 1cf3f401..9706e961 100644
--- a/tests/unit/genomics/test_bgc.py
+++ b/tests/unit/genomics/test_bgc.py
@@ -24,3 +24,31 @@ def test_add_and_detach_parent():
     assert bgc.parents == {gcf}
     bgc.detach_parent(gcf)
     assert bgc.parents == set()
+
+
+def test_to_dict():
+    bgc = BGC("BGC0000001", "Polyketide", "NRP")
+    bgc.strain = Strain("sample_strain")
+    bgc.description = "Sample description"
+
+    dict_repr = bgc.to_dict()
+    assert dict_repr["GCF_id"] == set()
+    assert dict_repr["GCF_bigscape_class"] == set()
+    assert dict_repr["BGC_name"] == "BGC0000001"
+    assert dict_repr["product_prediction"] == ("Polyketide", "NRP")
+    assert dict_repr["mibig_bgc_class"] is None
+    assert dict_repr["description"] == "Sample description"
+    assert dict_repr["strain_id"] == "sample_strain"
+    assert dict_repr["antismash_id"] is None
+    assert dict_repr["antismash_region"] is None
+
+    bgc.add_parent(GCF("1"))
+    bgc.mibig_bgc_class = ("NRP",)
+    bgc.antismash_id = "ABC_0001"
+    bgc.antismash_region = 1
+    dict_repr = bgc.to_dict()
+    assert dict_repr["GCF_id"] == {"1"}
+    assert dict_repr["GCF_bigscape_class"] == set()
+    assert dict_repr["mibig_bgc_class"] == ("NRP",)
+    assert dict_repr["antismash_id"] == "ABC_0001"
+    assert dict_repr["antismash_region"] == 1
diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py
index e984eaba..d77ea0d4 100644
--- a/tests/unit/metabolomics/test_spectrum.py
+++ b/tests/unit/metabolomics/test_spectrum.py
@@ -68,3 +68,35 @@ def test_has_strain():
     spec.strains.add(strain1)
     assert spec.has_strain(strain1)
     assert not spec.has_strain(strain2)
+
+
+def test_to_dict():
+    """Test the to_dict method."""
+    spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"})
+    spec.strains.add(Strain("strain1"))
+    spec.strains.add(Strain("strain2"))
+
+    dict_repr = spec.to_dict()
+    assert dict_repr["spectrum_id"] == "spec1"
+    assert dict_repr["num_strains_with_spectrum"] == 2
+    assert dict_repr["precursor_mz"] == 150.0
+    assert dict_repr["rt"] == 0.0
+    assert dict_repr["molecular_family"] is None
+    assert dict_repr["gnps_id"] is None
+    assert dict_repr["gnps_annotations"] == dict()
+
+    # Test with gnps information
+    spec.gnps_id = "GNPS0001"
+    spec.gnps_annotations = {"annotation1": "value1"}
+
+    # Test with molecular family
+    class MockMolecularFamily:
+        def __init__(self, id):
+            self.id = id
+
+    spec.family = MockMolecularFamily("family1")
+
+    dict_repr = spec.to_dict()
+    assert dict_repr["molecular_family"] == "family1"
+    assert dict_repr["gnps_id"] == "GNPS0001"
+    assert dict_repr["gnps_annotations"] == {"annotation1": "value1"}
diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py
index 5a4e7197..f1542338 100644
--- a/tests/unit/scoring/test_link_graph.py
+++ b/tests/unit/scoring/test_link_graph.py
@@ -114,15 +114,29 @@ def test_filter(gcfs, spectra, score):
     assert len(lg_filtered) == 4
 
 
+def test_link_to_dict(lg, gcfs, spectra, score):
+    link = lg.links[0]
+    index = 1
+    dict_repr = lg.link_to_dict(link, index)
+    assert type(dict_repr) is dict
+    assert dict_repr["index"] == 1
+    assert dict_repr["genomic_object_type"] == gcfs[0].__class__.__name__
+    assert dict_repr["genomic_object_id"] == gcfs[0].id
+    assert dict_repr["metabolomic_object_type"] == spectra[0].__class__.__name__
+    assert dict_repr["metabolomic_object_id"] == spectra[0].id
+    assert dict_repr["metcalf_score"] == round(score.value, 2)
+    assert dict_repr["rosetta_score"] == ""
+
+
 def test_get_table_data(lg, gcfs, spectra, score):
+    # add a second link
+    lg.add_link(gcfs[1], spectra[1], metcalf=score)
+
     table_data = lg.get_table_data()
     assert type(table_data) is list
     assert type(table_data[0]) is dict
+    assert len(table_data) == 2
+
+    display_limit = 1
+    table_data = lg.get_table_data(display_limit)
     assert len(table_data) == 1
-    assert table_data[0]["index"] == 1
-    assert table_data[0]["genomic_object_type"] == gcfs[0].__class__.__name__
-    assert table_data[0]["genomic_object_id"] == gcfs[0].id
-    assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__
-    assert table_data[0]["metabolomic_object_id"] == spectra[0].id
-    assert table_data[0]["metcalf_score"] == round(score.value, 2)
-    assert table_data[0]["rosetta_score"] == ""

From a8a83290b2a0980ab85cd1655af4f585aa7a2140 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Fri, 18 Oct 2024 17:28:46 +0200
Subject: [PATCH 13/24] correct typing in docstrings

---
 src/nplinker/genomics/bgc.py          | 20 ++++++++++----------
 src/nplinker/metabolomics/spectrum.py |  6 +++---
 src/nplinker/scoring/link_graph.py    |  8 ++++----
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 7d606fe0..2ea0c3b9 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -201,24 +201,24 @@ def to_dict(self) -> dict[str, any]:
 
         Returns:
             A dictionary containing the following key-value pairs:
-            - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available.
-            - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available.
-            - BGC_name (str): The name of the BGC.
+            - GCF_id (set): A set of GCF IDs.
+            - GCF_bigscape_class (set): A set of BiG-SCAPE classes.
             - strain_id (str): The ID of the strain.
-            - description (str): A description of the BGC.
-            - antismash_id (str): The antiSMASH ID.
-            - antismash_region (str): The antiSMASH region.
-            - antismash_cluster_type (str): A comma-separated string of product predictions.
-            - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available.
+            - description (str | None): A description of the BGC.
+            - BGC_name (str): The name of the BGC.
+            - product_prediction (tuple): (predicted) natural products or product classes of the BGC.
+            - mibig_bgc_class (tuple[str] | None):  MIBiG biosynthetic classes to which the BGC belongs.
+            - antismash_id (str | None): The antiSMASH ID.
+            - antismash_region (int | None): The antiSMASH region.
         """
         return {
             "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None},
             "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None},
+            "strain_id": self.strain.id,
+            "description": self.description,
             "BGC_name": self.id,
             "product_prediction": self.product_prediction,
             "mibig_bgc_class": self.mibig_bgc_class,
-            "description": self.description,
-            "strain_id": self.strain.id,
             "antismash_id": self.antismash_id,
             "antismash_region": self.antismash_region,
         }
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index aa008579..4842f9b0 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -110,9 +110,9 @@ def to_dict(self) -> dict[str, any]:
                 - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum.
                 - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places.
                 - "rt" (float): The retention time, rounded to three decimal places.
-                - "molecular_family" (str): The identifier of the molecular family, or "-" if not available.
-                - "gnps_id" (str): The GNPS identifier, or "-" if not available.
-                - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available.
+                - "molecular_family" (str | None ): The identifier of the molecular family.
+                - "gnps_id" (str | None ): The GNPS identifier.
+                - "gnps_annotations" (dict): A dictionary of GNPS annotations.
         """
         return {
             "spectrum_id": self.id,
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 091474e5..4fc5c23b 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -309,12 +309,12 @@ def link_to_dict(link: LINK, index: int) -> dict[str, any]:
         Returns:
             A dictionary containing the link information with the following keys:
                 - index (int): The index of the link.
-                - genomic_object_id (str or int): The ID of the genomic object.
+                - genomic_object_id (str): The ID of the genomic object.
                 - genomic_object_type (str): The type of the genomic object.
-                - metabolomic_object_id (str or int): The ID of the metabolomic object.
+                - metabolomic_object_id (str): The ID of the metabolomic object.
                 - metabolomic_object_type (str): The type of the metabolomic object.
-                - metcalf_score (float): The Metcalf score, rounded to 2 decimal places.
-                - rosetta_score (float): The Rosetta score, rounded to 2 decimal places.
+                - metcalf_score (float | str): The Metcalf score, rounded to 2 decimal places.
+                - rosetta_score (float | str): The Rosetta score, rounded to 2 decimal places.
         """
         u, v, data = link
         genomic_object_classes = (GCF,)

From c6c33e6d1dbbcdf60bfaeaee3e5b83305ea36382 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 22 Oct 2024 18:25:26 +0200
Subject: [PATCH 14/24] typing: changed typings to pass mypy static typing
 checks

---
 .github/workflows/format-typing-check.yml | 2 +-
 pyproject.toml                            | 1 +
 src/nplinker/genomics/bgc.py              | 7 ++++---
 src/nplinker/metabolomics/spectrum.py     | 3 ++-
 src/nplinker/nplinker.py                  | 4 ++--
 src/nplinker/scoring/link_graph.py        | 7 ++++---
 6 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml
index a5def2b9..10ea0990 100644
--- a/.github/workflows/format-typing-check.yml
+++ b/.github/workflows/format-typing-check.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Install ruff and mypy
         run: |
           pip install ruff mypy typing_extensions \
-            types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs
+            types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs 
       - name: Get all changed python files
         id: changed-python-files
         uses: tj-actions/changed-files@v44
diff --git a/pyproject.toml b/pyproject.toml
index 4ab04c75..c627f6ca 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,6 +63,7 @@ dev = [
     "types-beautifulsoup4",
     "types-jsonschema",
     "types-networkx",
+    "types-tabulate",
     "pandas-stubs",
     # docs
     "black",
diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 2ea0c3b9..e59d29ae 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import logging
 from typing import TYPE_CHECKING
+from typing import Any
 from deprecated import deprecated
 from nplinker.strain import Strain
 from .aa_pred import predict_aa
@@ -193,7 +194,7 @@ def aa_predictions(self) -> list:
                     self._aa_predictions[p[0]] = p[1]
         return [self._aa_predictions]
 
-    def to_dict(self) -> dict[str, any]:
+    def to_dict(self) -> dict[str, Any]:
         """Convert the BGC object to a dictionary for exporting results.
 
         This method compiles relevant information from the BGC object and formats it into a dictionary.
@@ -203,7 +204,7 @@ def to_dict(self) -> dict[str, any]:
             A dictionary containing the following key-value pairs:
             - GCF_id (set): A set of GCF IDs.
             - GCF_bigscape_class (set): A set of BiG-SCAPE classes.
-            - strain_id (str): The ID of the strain.
+            - strain_id (str | None): The ID of the strain.
             - description (str | None): A description of the BGC.
             - BGC_name (str): The name of the BGC.
             - product_prediction (tuple): (predicted) natural products or product classes of the BGC.
@@ -214,7 +215,7 @@ def to_dict(self) -> dict[str, any]:
         return {
             "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None},
             "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None},
-            "strain_id": self.strain.id,
+            "strain_id": self.strain.id if self.strain is not None else None,
             "description": self.description,
             "BGC_name": self.id,
             "product_prediction": self.product_prediction,
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index 4842f9b0..e0e10e6d 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from functools import cached_property
 from typing import TYPE_CHECKING
+from typing import Any
 import numpy as np
 from nplinker.strain import Strain
 from nplinker.strain import StrainCollection
@@ -98,7 +99,7 @@ def has_strain(self, strain: Strain) -> bool:
         """
         return strain in self.strains
 
-    def to_dict(self) -> dict[str, any]:
+    def to_dict(self) -> dict[str, Any]:
         """Convert the Spectrum object to a dictionary for exporting results.
 
         This method compiles relevant information from the Spectrum object into a dictionary format.
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index 52599957..99e139bf 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -356,11 +356,11 @@ def save_data(
         with open(file, "wb") as f:
             pickle.dump(data, f)
 
-    def export_objects(self, objects: BGC | Spectrum, filename: str) -> None:
+    def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None:
         """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format.
 
         Args:
-            objects (BGC | Spectrum): A list of BGC or Spectrum objects to be exported.
+            objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported.
             filename (str): The name of the file where the data will be saved.
         """
         headers = objects[0].to_dict().keys()
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 4fc5c23b..8da29912 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -2,6 +2,7 @@
 from collections.abc import Sequence
 from functools import wraps
 from os import PathLike
+from typing import Any
 from typing import Union
 from networkx import Graph
 from tabulate import tabulate
@@ -299,7 +300,7 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
             lg.add_link(u, v, **link_data)
 
     @staticmethod
-    def link_to_dict(link: LINK, index: int) -> dict[str, any]:
+    def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
         """Convert a link to a dictionary representation.
 
         Args:
@@ -332,7 +333,7 @@ def link_to_dict(link: LINK, index: int) -> dict[str, any]:
             "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
         }
 
-    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]:
+    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]:
         """Generate the table data for the LinkGraph.
 
         This method iterates over the links in the LinkGraph and constructs a table
@@ -372,7 +373,7 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str:
             stralign="right",
         )
 
-        if len(self.links) > display_limit:
+        if display_limit is not None and len(self.links) > display_limit:
             truncated_info = f"...\n[ {len(self.links)} links ]"
             table += f"\n{truncated_info}"
 

From a2603381a6161574751cce26f3cebade34ce530b Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Tue, 22 Oct 2024 18:46:48 +0200
Subject: [PATCH 15/24] refactor: change the order of methods/functions

---
 src/nplinker/genomics/bgc.py       |  64 +++++------
 src/nplinker/nplinker.py           | 138 ++++++++++++------------
 src/nplinker/scoring/link_graph.py | 168 ++++++++++++++---------------
 3 files changed, 185 insertions(+), 185 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index e59d29ae..6dfd6c66 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -117,18 +117,6 @@ def __reduce__(self) -> tuple:
         """Reduce function for pickling."""
         return (self.__class__, (self.id, *self.product_prediction), self.__dict__)
 
-    def add_parent(self, gcf: GCF) -> None:
-        """Add a parent GCF to the BGC.
-
-        Args:
-            gcf: gene cluster family
-        """
-        gcf.add_bgc(self)
-
-    def detach_parent(self, gcf: GCF) -> None:
-        """Remove a parent GCF."""
-        gcf.detach_bgc(self)
-
     @property
     def strain(self) -> Strain | None:
         """Get the strain of the BGC."""
@@ -162,6 +150,18 @@ def bigscape_classes(self) -> set[str | None]:
         """
         return {p.bigscape_class for p in self.parents}
 
+    def add_parent(self, gcf: GCF) -> None:
+        """Add a parent GCF to the BGC.
+
+        Args:
+            gcf: gene cluster family
+        """
+        gcf.add_bgc(self)
+
+    def detach_parent(self, gcf: GCF) -> None:
+        """Remove a parent GCF."""
+        gcf.detach_bgc(self)
+
     def is_mibig(self) -> bool:
         """Check if the BGC is a MIBiG reference BGC or not.
 
@@ -174,26 +174,6 @@ def is_mibig(self) -> bool:
         """
         return self.id.startswith("BGC")
 
-    # CG: why not providing whole product but only amino acid as product monomer?
-    # this property is not used in NPLinker core business.
-    @property
-    @deprecated(version="2.0.0", reason="This method will be removed soon")
-    def aa_predictions(self) -> list:
-        """Amino acids as predicted monomers of product.
-
-        Returns:
-            list of dicts with key as amino acid and value as prediction
-            probability.
-        """
-        # Load aa predictions and cache them
-        self._aa_predictions = None
-        if self._aa_predictions is None:
-            self._aa_predictions = {}
-            if self.antismash_file is not None:
-                for p in predict_aa(self.antismash_file):
-                    self._aa_predictions[p[0]] = p[1]
-        return [self._aa_predictions]
-
     def to_dict(self) -> dict[str, Any]:
         """Convert the BGC object to a dictionary for exporting results.
 
@@ -223,3 +203,23 @@ def to_dict(self) -> dict[str, Any]:
             "antismash_id": self.antismash_id,
             "antismash_region": self.antismash_region,
         }
+
+    # CG: why not providing whole product but only amino acid as product monomer?
+    # this property is not used in NPLinker core business.
+    @property
+    @deprecated(version="2.0.0", reason="This method will be removed soon")
+    def aa_predictions(self) -> list:
+        """Amino acids as predicted monomers of product.
+
+        Returns:
+            list of dicts with key as amino acid and value as prediction
+            probability.
+        """
+        # Load aa predictions and cache them
+        self._aa_predictions = None
+        if self._aa_predictions is None:
+            self._aa_predictions = {}
+            if self.antismash_file is not None:
+                for p in predict_aa(self.antismash_file):
+                    self._aa_predictions[p[0]] = p[1]
+        return [self._aa_predictions]
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index 99e139bf..79dffcbe 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -168,34 +168,50 @@ def scoring_methods(self) -> list[str]:
         """Get names of all valid scoring methods."""
         return list(self._valid_scoring_methods.keys())
 
-    def load_data(self):
-        """Load all data from files into memory.
-
-        This method is a convenience function that calls the
-        [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files
-        (download, generate and/or validate data) in the [correct directory structure][working-directory-structure],
-        and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data
-        from the files into memory.
+    def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None:
+        """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format.
 
-        The loaded data is stored in various data containers for easy access, e.g.
-        [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects,
-        [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc.
+        Args:
+            objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported.
+            filename (str): The name of the file where the data will be saved.
         """
-        arranger = DatasetArranger(self.config)
-        arranger.arrange()
-        loader = DatasetLoader(self.config)
-        loader.load()
+        headers = objects[0].to_dict().keys()
+        with open(self._output_dir / filename, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for obj in objects:
+                row_data = obj.to_dict()
+                formatted_row = []
+                for header in headers:
+                    item = row_data.get(header, "")
+                    # Convert list, tuple, set to comma-separated string
+                    if isinstance(item, (list, tuple, set)):
+                        formatted_row.append(", ".join(map(str, item)))
+                    # Convert dict to comma-separated string
+                    elif isinstance(item, dict):
+                        formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()]))
+                    # Convert non-empty value to string
+                    elif item:
+                        formatted_row.append(str(item))
+                    # Convert empty value to empty string
+                    else:
+                        formatted_row.append("")
+                f.write("\t".join(formatted_row) + "\n")
 
-        self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs}
-        self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs}
-        self._spec_dict = {spec.id: spec for spec in loader.spectra}
-        self._mf_dict = {mf.id: mf for mf in loader.mfs}
+    def export_results(self, lg: LinkGraph | None = None) -> None:
+        """Exports the results to the output directory in tab-separated format.
 
-        self._mibig_bgcs = loader.mibig_bgcs
-        self._strains = loader.strains
-        self._product_types = loader.product_types
-        self._chem_classes = loader.chem_classes
-        self._class_matches = loader.class_matches
+        This method exports genomics and metabolomics data to their respective
+        TSV files in the specified output directory. If a LinkGraph object is
+        provided, it also exports the links data to a TSV file.
+
+        Args:
+            lg (LinkGraph | None): An optional LinkGraph object. If provided,
+                       the links data will be exported to 'links.tsv'.
+        """
+        self.export_objects(self.bgcs, "genomics_data.tsv")
+        self.export_objects(self.spectra, "metabolomics_data.tsv")
+        if lg is not None:
+            lg.export_links(self._output_dir / "links.tsv")
 
     @overload
     def get_links(
@@ -281,6 +297,35 @@ def get_links(
 
         return scoring.get_links(*objects, **scoring_params)
 
+    def load_data(self):
+        """Load all data from files into memory.
+
+        This method is a convenience function that calls the
+        [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files
+        (download, generate and/or validate data) in the [correct directory structure][working-directory-structure],
+        and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data
+        from the files into memory.
+
+        The loaded data is stored in various data containers for easy access, e.g.
+        [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects,
+        [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc.
+        """
+        arranger = DatasetArranger(self.config)
+        arranger.arrange()
+        loader = DatasetLoader(self.config)
+        loader.load()
+
+        self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs}
+        self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs}
+        self._spec_dict = {spec.id: spec for spec in loader.spectra}
+        self._mf_dict = {mf.id: mf for mf in loader.mfs}
+
+        self._mibig_bgcs = loader.mibig_bgcs
+        self._strains = loader.strains
+        self._product_types = loader.product_types
+        self._chem_classes = loader.chem_classes
+        self._class_matches = loader.class_matches
+
     def lookup_bgc(self, id: str) -> BGC | None:
         """Get the BGC object with the given ID.
 
@@ -355,48 +400,3 @@ def save_data(
         data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
         with open(file, "wb") as f:
             pickle.dump(data, f)
-
-    def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None:
-        """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format.
-
-        Args:
-            objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported.
-            filename (str): The name of the file where the data will be saved.
-        """
-        headers = objects[0].to_dict().keys()
-        with open(self._output_dir / filename, "w") as f:
-            f.write("\t".join(headers) + "\n")
-            for obj in objects:
-                row_data = obj.to_dict()
-                formatted_row = []
-                for header in headers:
-                    item = row_data.get(header, "")
-                    # Convert list, tuple, set to comma-separated string
-                    if isinstance(item, (list, tuple, set)):
-                        formatted_row.append(", ".join(map(str, item)))
-                    # Convert dict to comma-separated string
-                    elif isinstance(item, dict):
-                        formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()]))
-                    # Convert non-empty value to string
-                    elif item:
-                        formatted_row.append(str(item))
-                    # Convert empty value to empty string
-                    else:
-                        formatted_row.append("")
-                f.write("\t".join(formatted_row) + "\n")
-
-    def export_results(self, lg: LinkGraph | None = None) -> None:
-        """Exports the results to the output directory in tab-separated format.
-
-        This method exports genomics and metabolomics data to their respective
-        TSV files in the specified output directory. If a LinkGraph object is
-        provided, it also exports the links data to a TSV file.
-
-        Args:
-            lg (LinkGraph | None): An optional LinkGraph object. If provided,
-                       the links data will be exported to 'links.tsv'.
-        """
-        self.export_objects(self.bgcs, "genomics_data.tsv")
-        self.export_objects(self.spectra, "metabolomics_data.tsv")
-        if lg is not None:
-            lg.export_links(self._output_dir / "links.tsv")
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 8da29912..e01dbc59 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -198,44 +198,21 @@ def add_link(
 
         self._g.add_edge(u, v, **data)
 
-    @validate_uv
-    def has_link(self, u: Entity, v: Entity) -> bool:
-        """Check if there is a link between two objects.
-
-        Args:
-            u: the first object, either a GCF, Spectrum, or MolecularFamily
-            v: the second object, either a GCF, Spectrum, or MolecularFamily
-
-        Returns:
-            True if there is a link between the two objects, False otherwise
-
-        Examples:
-            >>> lg.has_link(gcf, spectrum)
-            True
-        """
-        return self._g.has_edge(u, v)
-
-    @validate_uv
-    def get_link_data(
-        self,
-        u: Entity,
-        v: Entity,
-    ) -> LINK_DATA | None:
-        """Get the data for a link between two objects.
+    def export_links(self, file: str | PathLike) -> None:
+        """Exports the links in the LinkGraph to a file.
 
         Args:
-            u: the first object, either a GCF, Spectrum, or MolecularFamily
-            v: the second object, either a GCF, Spectrum, or MolecularFamily
-
-        Returns:
-            A dictionary of scoring methods and their data for the link between the two objects, or
-            None if there is no link between the two objects.
+            file: the file to write the links to.
 
         Examples:
-            >>> lg.get_link_data(gcf, spectrum)
-            {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})}
+            >>> lg.print_links("links.tsv")
         """
-        return self._g.get_edge_data(u, v)  # type: ignore
+        table_data = self.get_table_data()
+        headers = table_data[0].keys()
+        with open(file, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for row in table_data:
+                f.write("\t".join(str(row[h]) for h in headers) + "\n")
 
     def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -> LinkGraph:
         """Return a new LinkGraph object with the filtered links between the given objects.
@@ -281,23 +258,66 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -
 
         return lg
 
-    @validate_u
-    def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None:
-        """Filter the links for a given object and add them to the new LinkGraph object."""
-        try:
-            links = self[u]
-        except KeyError:
-            pass
-        else:
-            for node2, value in links.items():
-                lg.add_link(u, node2, **value)
+    @validate_uv
+    def get_link_data(
+        self,
+        u: Entity,
+        v: Entity,
+    ) -> LINK_DATA | None:
+        """Get the data for a link between two objects.
+
+        Args:
+            u: the first object, either a GCF, Spectrum, or MolecularFamily
+            v: the second object, either a GCF, Spectrum, or MolecularFamily
+
+        Returns:
+            A dictionary of scoring methods and their data for the link between the two objects, or
+            None if there is no link between the two objects.
+
+        Examples:
+            >>> lg.get_link_data(gcf, spectrum)
+            {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})}
+        """
+        return self._g.get_edge_data(u, v)  # type: ignore
+
+    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]:
+        """Generate the table data for the LinkGraph.
+
+        This method iterates over the links in the LinkGraph and constructs a table
+        containing information about genomic and metabolomic objects, as well as their
+        associated scores. Each row in the table represents a link between a genomic
+        object and a metabolomic object.
+
+        Args:
+            display_limit (int | None): The maximum number of rows to include in the
+                table. If None, all rows are included.
+
+        Returns:
+            A list of dictionaries containing the table data.
+        """
+        table_data = []
+        for index, link in enumerate(self.links, start=1):
+            table_data.append(self.link_to_dict(link, index))
+            if display_limit is not None and index == display_limit:
+                break
+        return table_data
 
     @validate_uv
-    def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
-        """Filter the links between two objects and add them to the new LinkGraph object."""
-        link_data = self.get_link_data(u, v)
-        if link_data is not None:
-            lg.add_link(u, v, **link_data)
+    def has_link(self, u: Entity, v: Entity) -> bool:
+        """Check if there is a link between two objects.
+
+        Args:
+            u: the first object, either a GCF, Spectrum, or MolecularFamily
+            v: the second object, either a GCF, Spectrum, or MolecularFamily
+
+        Returns:
+            True if there is a link between the two objects, False otherwise
+
+        Examples:
+            >>> lg.has_link(gcf, spectrum)
+            True
+        """
+        return self._g.has_edge(u, v)
 
     @staticmethod
     def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
@@ -333,27 +353,23 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
             "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
         }
 
-    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]:
-        """Generate the table data for the LinkGraph.
-
-        This method iterates over the links in the LinkGraph and constructs a table
-        containing information about genomic and metabolomic objects, as well as their
-        associated scores. Each row in the table represents a link between a genomic
-        object and a metabolomic object.
-
-        Args:
-            display_limit (int | None): The maximum number of rows to include in the
-                table. If None, all rows are included.
+    @validate_u
+    def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None:
+        """Filter the links for a given object and add them to the new LinkGraph object."""
+        try:
+            links = self[u]
+        except KeyError:
+            pass
+        else:
+            for node2, value in links.items():
+                lg.add_link(u, node2, **value)
 
-        Returns:
-            A list of dictionaries containing the table data.
-        """
-        table_data = []
-        for index, link in enumerate(self.links, start=1):
-            table_data.append(self.link_to_dict(link, index))
-            if display_limit is not None and index == display_limit:
-                break
-        return table_data
+    @validate_uv
+    def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
+        """Filter the links between two objects and add them to the new LinkGraph object."""
+        link_data = self.get_link_data(u, v)
+        if link_data is not None:
+            lg.add_link(u, v, **link_data)
 
     def _get_table_repr(self, display_limit: int | None = 60) -> str:
         """Generate a table representation of the LinkGraph.
@@ -378,19 +394,3 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str:
             table += f"\n{truncated_info}"
 
         return table
-
-    def export_links(self, file: str | PathLike) -> None:
-        """Exports the links in the LinkGraph to a file.
-
-        Args:
-            file: the file to write the links to.
-
-        Examples:
-            >>> lg.print_links("links.tsv")
-        """
-        table_data = self.get_table_data()
-        headers = table_data[0].keys()
-        with open(file, "w") as f:
-            f.write("\t".join(headers) + "\n")
-            for row in table_data:
-                f.write("\t".join(str(row[h]) for h in headers) + "\n")

From 328968358e4c12ceb38b1d8fdbbd3b699857144d Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 19:22:41 +0100
Subject: [PATCH 16/24] restore the order of already existing functions and
 methods

---
 src/nplinker/genomics/bgc.py       |  24 ++---
 src/nplinker/nplinker.py           | 138 ++++++++++++++---------------
 src/nplinker/scoring/link_graph.py |  78 ++++++++--------
 3 files changed, 120 insertions(+), 120 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 6dfd6c66..9b544160 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -117,6 +117,18 @@ def __reduce__(self) -> tuple:
         """Reduce function for pickling."""
         return (self.__class__, (self.id, *self.product_prediction), self.__dict__)
 
+    def add_parent(self, gcf: GCF) -> None:
+        """Add a parent GCF to the BGC.
+
+        Args:
+            gcf: gene cluster family
+        """
+        gcf.add_bgc(self)
+
+    def detach_parent(self, gcf: GCF) -> None:
+        """Remove a parent GCF."""
+        gcf.detach_bgc(self)
+
     @property
     def strain(self) -> Strain | None:
         """Get the strain of the BGC."""
@@ -150,18 +162,6 @@ def bigscape_classes(self) -> set[str | None]:
         """
         return {p.bigscape_class for p in self.parents}
 
-    def add_parent(self, gcf: GCF) -> None:
-        """Add a parent GCF to the BGC.
-
-        Args:
-            gcf: gene cluster family
-        """
-        gcf.add_bgc(self)
-
-    def detach_parent(self, gcf: GCF) -> None:
-        """Remove a parent GCF."""
-        gcf.detach_bgc(self)
-
     def is_mibig(self) -> bool:
         """Check if the BGC is a MIBiG reference BGC or not.
 
diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index 79dffcbe..99e139bf 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -168,50 +168,34 @@ def scoring_methods(self) -> list[str]:
         """Get names of all valid scoring methods."""
         return list(self._valid_scoring_methods.keys())
 
-    def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None:
-        """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format.
+    def load_data(self):
+        """Load all data from files into memory.
 
-        Args:
-            objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported.
-            filename (str): The name of the file where the data will be saved.
-        """
-        headers = objects[0].to_dict().keys()
-        with open(self._output_dir / filename, "w") as f:
-            f.write("\t".join(headers) + "\n")
-            for obj in objects:
-                row_data = obj.to_dict()
-                formatted_row = []
-                for header in headers:
-                    item = row_data.get(header, "")
-                    # Convert list, tuple, set to comma-separated string
-                    if isinstance(item, (list, tuple, set)):
-                        formatted_row.append(", ".join(map(str, item)))
-                    # Convert dict to comma-separated string
-                    elif isinstance(item, dict):
-                        formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()]))
-                    # Convert non-empty value to string
-                    elif item:
-                        formatted_row.append(str(item))
-                    # Convert empty value to empty string
-                    else:
-                        formatted_row.append("")
-                f.write("\t".join(formatted_row) + "\n")
+        This method is a convenience function that calls the
+        [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files
+        (download, generate and/or validate data) in the [correct directory structure][working-directory-structure],
+        and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data
+        from the files into memory.
 
-    def export_results(self, lg: LinkGraph | None = None) -> None:
-        """Exports the results to the output directory in tab-separated format.
+        The loaded data is stored in various data containers for easy access, e.g.
+        [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects,
+        [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc.
+        """
+        arranger = DatasetArranger(self.config)
+        arranger.arrange()
+        loader = DatasetLoader(self.config)
+        loader.load()
 
-        This method exports genomics and metabolomics data to their respective
-        TSV files in the specified output directory. If a LinkGraph object is
-        provided, it also exports the links data to a TSV file.
+        self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs}
+        self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs}
+        self._spec_dict = {spec.id: spec for spec in loader.spectra}
+        self._mf_dict = {mf.id: mf for mf in loader.mfs}
 
-        Args:
-            lg (LinkGraph | None): An optional LinkGraph object. If provided,
-                       the links data will be exported to 'links.tsv'.
-        """
-        self.export_objects(self.bgcs, "genomics_data.tsv")
-        self.export_objects(self.spectra, "metabolomics_data.tsv")
-        if lg is not None:
-            lg.export_links(self._output_dir / "links.tsv")
+        self._mibig_bgcs = loader.mibig_bgcs
+        self._strains = loader.strains
+        self._product_types = loader.product_types
+        self._chem_classes = loader.chem_classes
+        self._class_matches = loader.class_matches
 
     @overload
     def get_links(
@@ -297,35 +281,6 @@ def get_links(
 
         return scoring.get_links(*objects, **scoring_params)
 
-    def load_data(self):
-        """Load all data from files into memory.
-
-        This method is a convenience function that calls the
-        [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files
-        (download, generate and/or validate data) in the [correct directory structure][working-directory-structure],
-        and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data
-        from the files into memory.
-
-        The loaded data is stored in various data containers for easy access, e.g.
-        [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects,
-        [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc.
-        """
-        arranger = DatasetArranger(self.config)
-        arranger.arrange()
-        loader = DatasetLoader(self.config)
-        loader.load()
-
-        self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs}
-        self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs}
-        self._spec_dict = {spec.id: spec for spec in loader.spectra}
-        self._mf_dict = {mf.id: mf for mf in loader.mfs}
-
-        self._mibig_bgcs = loader.mibig_bgcs
-        self._strains = loader.strains
-        self._product_types = loader.product_types
-        self._chem_classes = loader.chem_classes
-        self._class_matches = loader.class_matches
-
     def lookup_bgc(self, id: str) -> BGC | None:
         """Get the BGC object with the given ID.
 
@@ -400,3 +355,48 @@ def save_data(
         data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
         with open(file, "wb") as f:
             pickle.dump(data, f)
+
+    def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None:
+        """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format.
+
+        Args:
+            objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported.
+            filename (str): The name of the file where the data will be saved.
+        """
+        headers = objects[0].to_dict().keys()
+        with open(self._output_dir / filename, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for obj in objects:
+                row_data = obj.to_dict()
+                formatted_row = []
+                for header in headers:
+                    item = row_data.get(header, "")
+                    # Convert list, tuple, set to comma-separated string
+                    if isinstance(item, (list, tuple, set)):
+                        formatted_row.append(", ".join(map(str, item)))
+                    # Convert dict to comma-separated string
+                    elif isinstance(item, dict):
+                        formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()]))
+                    # Convert non-empty value to string
+                    elif item:
+                        formatted_row.append(str(item))
+                    # Convert empty value to empty string
+                    else:
+                        formatted_row.append("")
+                f.write("\t".join(formatted_row) + "\n")
+
+    def export_results(self, lg: LinkGraph | None = None) -> None:
+        """Exports the results to the output directory in tab-separated format.
+
+        This method exports genomics and metabolomics data to their respective
+        TSV files in the specified output directory. If a LinkGraph object is
+        provided, it also exports the links data to a TSV file.
+
+        Args:
+            lg (LinkGraph | None): An optional LinkGraph object. If provided,
+                       the links data will be exported to 'links.tsv'.
+        """
+        self.export_objects(self.bgcs, "genomics_data.tsv")
+        self.export_objects(self.spectra, "metabolomics_data.tsv")
+        if lg is not None:
+            lg.export_links(self._output_dir / "links.tsv")
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index e01dbc59..f7690013 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -198,6 +198,45 @@ def add_link(
 
         self._g.add_edge(u, v, **data)
 
+    @validate_uv
+    def has_link(self, u: Entity, v: Entity) -> bool:
+        """Check if there is a link between two objects.
+
+        Args:
+            u: the first object, either a GCF, Spectrum, or MolecularFamily
+            v: the second object, either a GCF, Spectrum, or MolecularFamily
+
+        Returns:
+            True if there is a link between the two objects, False otherwise
+
+        Examples:
+            >>> lg.has_link(gcf, spectrum)
+            True
+        """
+        return self._g.has_edge(u, v)
+
+    @validate_uv
+    def get_link_data(
+        self,
+        u: Entity,
+        v: Entity,
+    ) -> LINK_DATA | None:
+        """Get the data for a link between two objects.
+
+        Args:
+            u: the first object, either a GCF, Spectrum, or MolecularFamily
+            v: the second object, either a GCF, Spectrum, or MolecularFamily
+
+        Returns:
+            A dictionary of scoring methods and their data for the link between the two objects, or
+            None if there is no link between the two objects.
+
+        Examples:
+            >>> lg.get_link_data(gcf, spectrum)
+            {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})}
+        """
+        return self._g.get_edge_data(u, v)  # type: ignore
+
     def export_links(self, file: str | PathLike) -> None:
         """Exports the links in the LinkGraph to a file.
 
@@ -258,28 +297,6 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -
 
         return lg
 
-    @validate_uv
-    def get_link_data(
-        self,
-        u: Entity,
-        v: Entity,
-    ) -> LINK_DATA | None:
-        """Get the data for a link between two objects.
-
-        Args:
-            u: the first object, either a GCF, Spectrum, or MolecularFamily
-            v: the second object, either a GCF, Spectrum, or MolecularFamily
-
-        Returns:
-            A dictionary of scoring methods and their data for the link between the two objects, or
-            None if there is no link between the two objects.
-
-        Examples:
-            >>> lg.get_link_data(gcf, spectrum)
-            {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})}
-        """
-        return self._g.get_edge_data(u, v)  # type: ignore
-
     def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]:
         """Generate the table data for the LinkGraph.
 
@@ -302,23 +319,6 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any
                 break
         return table_data
 
-    @validate_uv
-    def has_link(self, u: Entity, v: Entity) -> bool:
-        """Check if there is a link between two objects.
-
-        Args:
-            u: the first object, either a GCF, Spectrum, or MolecularFamily
-            v: the second object, either a GCF, Spectrum, or MolecularFamily
-
-        Returns:
-            True if there is a link between the two objects, False otherwise
-
-        Examples:
-            >>> lg.has_link(gcf, spectrum)
-            True
-        """
-        return self._g.has_edge(u, v)
-
     @staticmethod
     def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
         """Convert a link to a dictionary representation.

From d2272e2ffe7ad32738fc04affd4612536364efc5 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 20:10:01 +0100
Subject: [PATCH 17/24] make dicts json compatible

---
 src/nplinker/genomics/bgc.py          | 18 +++++++++---------
 src/nplinker/metabolomics/spectrum.py |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py
index 9b544160..c61d7942 100644
--- a/src/nplinker/genomics/bgc.py
+++ b/src/nplinker/genomics/bgc.py
@@ -177,28 +177,28 @@ def is_mibig(self) -> bool:
     def to_dict(self) -> dict[str, Any]:
         """Convert the BGC object to a dictionary for exporting results.
 
-        This method compiles relevant information from the BGC object and formats it into a dictionary.
+        Compiles relevant information from the BGC object and formats it into a dictionary.
         Each key-value pair in the dictionary represents a specific attribute of the BGC.
 
         Returns:
             A dictionary containing the following key-value pairs:
-            - GCF_id (set): A set of GCF IDs.
-            - GCF_bigscape_class (set): A set of BiG-SCAPE classes.
+            - GCF_id (list[str]): A list of GCF IDs.
+            - GCF_bigscape_class (list[str | None]): A list of BiG-SCAPE classes.
             - strain_id (str | None): The ID of the strain.
             - description (str | None): A description of the BGC.
             - BGC_name (str): The name of the BGC.
-            - product_prediction (tuple): (predicted) natural products or product classes of the BGC.
-            - mibig_bgc_class (tuple[str] | None):  MIBiG biosynthetic classes to which the BGC belongs.
+            - product_prediction (list[str]): (predicted) products or product classes of the BGC.
+            - mibig_bgc_class (list[str] | None): MIBiG biosynthetic classes.
             - antismash_id (str | None): The antiSMASH ID.
-            - antismash_region (int | None): The antiSMASH region.
+            - antismash_region (int | None): The antiSMASH region number.
         """
         return {
-            "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None},
-            "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None},
+            "GCF_id": [gcf.id for gcf in self.parents if gcf.id is not None],
+            "GCF_bigscape_class": [bsc for bsc in self.bigscape_classes if bsc is not None],
             "strain_id": self.strain.id if self.strain is not None else None,
             "description": self.description,
             "BGC_name": self.id,
-            "product_prediction": self.product_prediction,
+            "product_prediction": list(self.product_prediction),
             "mibig_bgc_class": self.mibig_bgc_class,
             "antismash_id": self.antismash_id,
             "antismash_region": self.antismash_region,
diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py
index e0e10e6d..6fccf47b 100644
--- a/src/nplinker/metabolomics/spectrum.py
+++ b/src/nplinker/metabolomics/spectrum.py
@@ -113,7 +113,7 @@ def to_dict(self) -> dict[str, Any]:
                 - "rt" (float): The retention time, rounded to three decimal places.
                 - "molecular_family" (str | None ): The identifier of the molecular family.
                 - "gnps_id" (str | None ): The GNPS identifier.
-                - "gnps_annotations" (dict): A dictionary of GNPS annotations.
+                - "gnps_annotations" (dict[str, str]): A dictionary of GNPS annotations.
         """
         return {
             "spectrum_id": self.id,

From cb49209bbba18e47c1d05b5a65b2a325de29404c Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 20:16:57 +0100
Subject: [PATCH 18/24] rename functions and variables

---
 src/nplinker/nplinker.py           | 12 ++---
 src/nplinker/scoring/link_graph.py | 84 +++++++++++++++---------------
 2 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index 99e139bf..bc03fde7 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -356,8 +356,8 @@ def save_data(
         with open(file, "wb") as f:
             pickle.dump(data, f)
 
-    def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None:
-        """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format.
+    def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[BGC], filename: str) -> None:
+        """Exports a list of BGC or Spectrum objects to a specified file in tab-separated format.
 
         Args:
             objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported.
@@ -385,7 +385,7 @@ def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> No
                         formatted_row.append("")
                 f.write("\t".join(formatted_row) + "\n")
 
-    def export_results(self, lg: LinkGraph | None = None) -> None:
+    def to_tsv(self, lg: LinkGraph | None = None) -> None:
         """Exports the results to the output directory in tab-separated format.
 
         This method exports genomics and metabolomics data to their respective
@@ -396,7 +396,7 @@ def export_results(self, lg: LinkGraph | None = None) -> None:
             lg (LinkGraph | None): An optional LinkGraph object. If provided,
                        the links data will be exported to 'links.tsv'.
         """
-        self.export_objects(self.bgcs, "genomics_data.tsv")
-        self.export_objects(self.spectra, "metabolomics_data.tsv")
+        self.objects_to_tsv(self.bgcs, "genomics_data.tsv")
+        self.objects_to_tsv(self.spectra, "metabolomics_data.tsv")
         if lg is not None:
-            lg.export_links(self._output_dir / "links.tsv")
+            lg.to_tsv(self._output_dir / "links.tsv")
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index f7690013..32ed290a 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -237,22 +237,6 @@ def get_link_data(
         """
         return self._g.get_edge_data(u, v)  # type: ignore
 
-    def export_links(self, file: str | PathLike) -> None:
-        """Exports the links in the LinkGraph to a file.
-
-        Args:
-            file: the file to write the links to.
-
-        Examples:
-            >>> lg.print_links("links.tsv")
-        """
-        table_data = self.get_table_data()
-        headers = table_data[0].keys()
-        with open(file, "w") as f:
-            f.write("\t".join(headers) + "\n")
-            for row in table_data:
-                f.write("\t".join(str(row[h]) for h in headers) + "\n")
-
     def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -> LinkGraph:
         """Return a new LinkGraph object with the filtered links between the given objects.
 
@@ -297,28 +281,6 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -
 
         return lg
 
-    def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]:
-        """Generate the table data for the LinkGraph.
-
-        This method iterates over the links in the LinkGraph and constructs a table
-        containing information about genomic and metabolomic objects, as well as their
-        associated scores. Each row in the table represents a link between a genomic
-        object and a metabolomic object.
-
-        Args:
-            display_limit (int | None): The maximum number of rows to include in the
-                table. If None, all rows are included.
-
-        Returns:
-            A list of dictionaries containing the table data.
-        """
-        table_data = []
-        for index, link in enumerate(self.links, start=1):
-            table_data.append(self.link_to_dict(link, index))
-            if display_limit is not None and index == display_limit:
-                break
-        return table_data
-
     @staticmethod
     def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
         """Convert a link to a dictionary representation.
@@ -338,9 +300,9 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
                 - rosetta_score (float | str): The Rosetta score, rounded to 2 decimal places.
         """
         u, v, data = link
-        genomic_object_classes = (GCF,)
-        genomic_object = u if isinstance(u, genomic_object_classes) else v
-        metabolomic_object = v if isinstance(u, genomic_object_classes) else u
+        genomic_types = (GCF,)
+        genomic_object = u if isinstance(u, genomic_types) else v
+        metabolomic_object = v if isinstance(u, genomic_types) else u
         metcalf_score = data.get("metcalf")
         rosetta_score = data.get("rosetta")
         return {
@@ -353,6 +315,22 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
             "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
         }
 
+    def to_tsv(self, file: str | PathLike) -> None:
+        """Exports the links in the LinkGraph to a file  in tab-separated format.
+
+        Args:
+            file: the file to write the links to.
+
+        Examples:
+            >>> lg.print_links("links.tsv")
+        """
+        table_data = self._links_to_dicts()
+        headers = table_data[0].keys()
+        with open(file, "w") as f:
+            f.write("\t".join(headers) + "\n")
+            for row in table_data:
+                f.write("\t".join(str(row[h]) for h in headers) + "\n")
+
     @validate_u
     def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None:
         """Filter the links for a given object and add them to the new LinkGraph object."""
@@ -383,7 +361,7 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str:
             of links is appended.
         """
         table = tabulate(
-            self.get_table_data(display_limit),
+            self._links_to_dicts(display_limit),
             headers="keys",
             tablefmt="github",
             stralign="right",
@@ -394,3 +372,25 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str:
             table += f"\n{truncated_info}"
 
         return table
+
+    def _links_to_dicts(self, display_limit: int | None = None) -> list[dict[str, Any]]:
+        """Generate the table data for the LinkGraph.
+
+        This method iterates over the links in the LinkGraph and constructs a table
+        containing information about genomic and metabolomic objects, as well as their
+        associated scores. Each row in the table represents a link between a genomic
+        object and a metabolomic object.
+
+        Args:
+            display_limit (int | None): The maximum number of rows to include in the
+                table. If None, all rows are included.
+
+        Returns:
+            A list of dictionaries containing the table data.
+        """
+        link_dicts = []
+        for index, link in enumerate(self.links, start=1):
+            link_dicts.append(self.link_to_dict(link, index))
+            if display_limit is not None and index == display_limit:
+                break
+        return link_dicts

From 6a4da5f0761a8b4388d9b4ae5e62a8af9b7c79d2 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 20:31:36 +0100
Subject: [PATCH 19/24] refactor: change where the index is added to the
 link dict

---
 src/nplinker/scoring/link_graph.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 32ed290a..45ba5b30 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -282,16 +282,14 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -
         return lg
 
     @staticmethod
-    def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
+    def link_to_dict(link: LINK) -> dict[str, Any]:
         """Convert a link to a dictionary representation.
 
         Args:
             link: A tuple containing the link information (u, v, data).
-            index: The index of the link.
 
         Returns:
             A dictionary containing the link information with the following keys:
-                - index (int): The index of the link.
                 - genomic_object_id (str): The ID of the genomic object.
                 - genomic_object_type (str): The type of the genomic object.
                 - metabolomic_object_id (str): The ID of the metabolomic object.
@@ -306,7 +304,6 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]:
         metcalf_score = data.get("metcalf")
         rosetta_score = data.get("rosetta")
         return {
-            "index": index,
             "genomic_object_id": genomic_object.id,
             "genomic_object_type": genomic_object.__class__.__name__,
             "metabolomic_object_id": metabolomic_object.id,
@@ -388,9 +385,8 @@ def _links_to_dicts(self, display_limit: int | None = None) -> list[dict[str, An
         Returns:
             A list of dictionaries containing the table data.
         """
+        links = self.links[:display_limit] if display_limit else self.links
         link_dicts = []
-        for index, link in enumerate(self.links, start=1):
-            link_dicts.append(self.link_to_dict(link, index))
-            if display_limit is not None and index == display_limit:
-                break
+        for idx, link in enumerate(links):
+            link_dicts.append({"index": idx + 1, **self.link_to_dict(link)})
         return link_dicts

From edcc7db0d7b97be459d14b77f2768191db54a9cc Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 21:08:04 +0100
Subject: [PATCH 20/24] use csv package to write the tabular output files

---
 src/nplinker/nplinker.py           | 37 ++++++++++++++++--------------
 src/nplinker/scoring/link_graph.py | 13 ++++++-----
 2 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index bc03fde7..16713f40 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import csv
 import logging
 import pickle
 from collections.abc import Sequence
@@ -356,34 +357,36 @@ def save_data(
         with open(file, "wb") as f:
             pickle.dump(data, f)
 
-    def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[BGC], filename: str) -> None:
+    def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None:
         """Exports a list of BGC or Spectrum objects to a specified file in tab-separated format.
 
         Args:
-            objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported.
+            objects (list): A list of BGC or a list of Spectrum objects to be exported.
             filename (str): The name of the file where the data will be saved.
         """
+        if not objects:
+            raise ValueError("No objects provided to export")
+
         headers = objects[0].to_dict().keys()
-        with open(self._output_dir / filename, "w") as f:
-            f.write("\t".join(headers) + "\n")
+        with open(self._output_dir / filename, "w", newline="") as outfile:
+            writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t")
+            writer.writeheader()
             for obj in objects:
-                row_data = obj.to_dict()
-                formatted_row = []
+                row = obj.to_dict()
                 for header in headers:
-                    item = row_data.get(header, "")
+                    value = row[header]
                     # Convert list, tuple, set to comma-separated string
-                    if isinstance(item, (list, tuple, set)):
-                        formatted_row.append(", ".join(map(str, item)))
+                    if isinstance(value, (list, tuple, set)):
+                        row[header] = ", ".join(map(str, value))
                     # Convert dict to comma-separated string
-                    elif isinstance(item, dict):
-                        formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()]))
-                    # Convert non-empty value to string
-                    elif item:
-                        formatted_row.append(str(item))
-                    # Convert empty value to empty string
+                    elif isinstance(value, dict):
+                        row[header] = ", ".join([f"{k}:{v}" for k, v in value.items()])
+                    # Convert anything else to string
                     else:
-                        formatted_row.append("")
-                f.write("\t".join(formatted_row) + "\n")
+                        row[header] = str(value) if value else ""
+                    # Replace tabs with 4 spaces
+                    row[header] = row[header].replace("\t", "    ")
+                writer.writerow(row)
 
     def to_tsv(self, lg: LinkGraph | None = None) -> None:
         """Exports the results to the output directory in tab-separated format.
diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index 45ba5b30..d1f3cf4b 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import csv
 from collections.abc import Sequence
 from functools import wraps
 from os import PathLike
@@ -313,20 +314,20 @@ def link_to_dict(link: LINK) -> dict[str, Any]:
         }
 
     def to_tsv(self, file: str | PathLike) -> None:
-        """Exports the links in the LinkGraph to a file  in tab-separated format.
+        """Exports the links in the LinkGraph to a file in tab-separated format.
 
         Args:
             file: the file to write the links to.
 
         Examples:
-            >>> lg.print_links("links.tsv")
+            >>> lg.to_tsv("links.tsv")
         """
         table_data = self._links_to_dicts()
         headers = table_data[0].keys()
-        with open(file, "w") as f:
-            f.write("\t".join(headers) + "\n")
-            for row in table_data:
-                f.write("\t".join(str(row[h]) for h in headers) + "\n")
+        with open(file, "w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=headers, delimiter="\t")
+            writer.writeheader()
+            writer.writerows(table_data)
 
     @validate_u
     def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None:

From 05f9f76ef26847b54554fd582ca27caa7c424245 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 21:12:54 +0100
Subject: [PATCH 21/24] make sure all elements of the input list have the same
 type

---
 src/nplinker/nplinker.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py
index 16713f40..55450a24 100644
--- a/src/nplinker/nplinker.py
+++ b/src/nplinker/nplinker.py
@@ -367,6 +367,11 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename:
         if not objects:
             raise ValueError("No objects provided to export")
 
+        # Ensure all elements in the list are of the same type
+        obj_type = type(objects[0])
+        if not all(isinstance(obj, obj_type) for obj in objects):
+            raise TypeError("All objects in the list must be of the same type")
+
         headers = objects[0].to_dict().keys()
         with open(self._output_dir / filename, "w", newline="") as outfile:
             writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t")

From bff7731c7ae1593091897d6590b4789bdaec9067 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 21:20:01 +0100
Subject: [PATCH 22/24] shorten too long doc string lines, correct some doc
 strings

---
 src/nplinker/scoring/link_graph.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py
index d1f3cf4b..5ee61aa7 100644
--- a/src/nplinker/scoring/link_graph.py
+++ b/src/nplinker/scoring/link_graph.py
@@ -79,17 +79,17 @@ def __init__(self) -> None:
 
             Display the empty LinkGraph object:
             >>> lg
-            |    | Genomic Object Type   | Genomic Object ID   | Metabolomic Object Type   | Metabolomic Object ID   | Metcalf Score   | Rosetta Score   |
-            |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------|
+            |   index |   genomic_object_id |   genomic_object_type |   metabolomic_object_id |   metabolomic_object_type |   metcalf_score |   rosetta_score |
+            |---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------|
 
             Add a link between a GCF and a Spectrum object:
             >>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5}))
 
             Display all links in LinkGraph object:
             >>> lg
-            |    | Genomic Object Type   | Genomic Object ID   | Metabolomic Object Type   | Metabolomic Object ID   | Metcalf Score   | Rosetta Score   |
-            |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------|
-            |  1 | GCF                   | 1                   | Spectrum                  | 1                       | 1.00            | -               |
+            |   index |   genomic_object_id |   genomic_object_type |   metabolomic_object_id |   metabolomic_object_type |   metcalf_score |   rosetta_score |
+            |---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------|
+            |       1 |                   1 |                   GCF |                       1 |                  Spectrum |            1.00 |                 |
 
             Get all links for a given object:
             >>> lg[gcf]
@@ -117,7 +117,7 @@ def __init__(self) -> None:
             >>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2])
 
             Export the links to a file:
-            >>> lg.export_links("links.tsv")
+            >>> lg.to_tsv("links.tsv")
         """
         self._g: Graph = Graph()
 
@@ -354,9 +354,9 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str:
             display_limit: The maximum number of links to display in the table. Defaults to 60.
 
         Returns:
-            str: A string representation of the table in GitHub-flavored markdown format. If the number of links
-            exceeds the display limit, the table is truncated and an additional line indicating the total number
-            of links is appended.
+            str: A string representation of the table in GitHub-flavored markdown format. If the
+            number of links exceeds the display limit, the table is truncated and an additional
+            line indicating the total number of links is appended.
         """
         table = tabulate(
             self._links_to_dicts(display_limit),

From d4bf9fb2d277424faa52181db9750a85d000c322 Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 21:32:05 +0100
Subject: [PATCH 23/24] tests: adapted the test to the changes

---
 tests/unit/data/justafile.ipynb       | 131 ++++++++++++++++++++++++++
 tests/unit/genomics/test_bgc.py       |  20 ++--
 tests/unit/scoring/test_link_graph.py |  12 +--
 3 files changed, 150 insertions(+), 13 deletions(-)
 create mode 100644 tests/unit/data/justafile.ipynb

diff --git a/tests/unit/data/justafile.ipynb b/tests/unit/data/justafile.ipynb
new file mode 100644
index 00000000..43a5453b
--- /dev/null
+++ b/tests/unit/data/justafile.ipynb
@@ -0,0 +1,131 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nplinker.genomics.antismash import AntismashBGCLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = AntismashBGCLoader(\"antismash\")\n",
+    "mapping = loader.get_genome_bgcs_mapping()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "isinstance(mapping, dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(mapping)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "20"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(mapping[\"GCF_000514515.1\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'NZ_AZWB01000006.region001'"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "mapping[\"GCF_000514515.1\"][-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "npl_dev",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py
index 9706e961..71f173ba 100644
--- a/tests/unit/genomics/test_bgc.py
+++ b/tests/unit/genomics/test_bgc.py
@@ -32,10 +32,10 @@ def test_to_dict():
     bgc.description = "Sample description"
 
     dict_repr = bgc.to_dict()
-    assert dict_repr["GCF_id"] == set()
-    assert dict_repr["GCF_bigscape_class"] == set()
+    assert dict_repr["GCF_id"] == list()
+    assert dict_repr["GCF_bigscape_class"] == list()
     assert dict_repr["BGC_name"] == "BGC0000001"
-    assert dict_repr["product_prediction"] == ("Polyketide", "NRP")
+    assert dict_repr["product_prediction"] == ["Polyketide", "NRP"]
     assert dict_repr["mibig_bgc_class"] is None
     assert dict_repr["description"] == "Sample description"
     assert dict_repr["strain_id"] == "sample_strain"
@@ -43,12 +43,18 @@ def test_to_dict():
     assert dict_repr["antismash_region"] is None
 
     bgc.add_parent(GCF("1"))
-    bgc.mibig_bgc_class = ("NRP",)
+    bgc.mibig_bgc_class = [
+        "NRP",
+    ]
     bgc.antismash_id = "ABC_0001"
     bgc.antismash_region = 1
     dict_repr = bgc.to_dict()
-    assert dict_repr["GCF_id"] == {"1"}
-    assert dict_repr["GCF_bigscape_class"] == set()
-    assert dict_repr["mibig_bgc_class"] == ("NRP",)
+    assert dict_repr["GCF_id"] == [
+        "1",
+    ]
+    assert dict_repr["GCF_bigscape_class"] == list()
+    assert dict_repr["mibig_bgc_class"] == [
+        "NRP",
+    ]
     assert dict_repr["antismash_id"] == "ABC_0001"
     assert dict_repr["antismash_region"] == 1
diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py
index f1542338..32e73f7f 100644
--- a/tests/unit/scoring/test_link_graph.py
+++ b/tests/unit/scoring/test_link_graph.py
@@ -116,10 +116,8 @@ def test_filter(gcfs, spectra, score):
 
 def test_link_to_dict(lg, gcfs, spectra, score):
     link = lg.links[0]
-    index = 1
-    dict_repr = lg.link_to_dict(link, index)
+    dict_repr = lg.link_to_dict(link)
     assert type(dict_repr) is dict
-    assert dict_repr["index"] == 1
     assert dict_repr["genomic_object_type"] == gcfs[0].__class__.__name__
     assert dict_repr["genomic_object_id"] == gcfs[0].id
     assert dict_repr["metabolomic_object_type"] == spectra[0].__class__.__name__
@@ -128,15 +126,17 @@ def test_link_to_dict(lg, gcfs, spectra, score):
     assert dict_repr["rosetta_score"] == ""
 
 
-def test_get_table_data(lg, gcfs, spectra, score):
+def test__links_to_dicts(lg, gcfs, spectra, score):
     # add a second link
     lg.add_link(gcfs[1], spectra[1], metcalf=score)
 
-    table_data = lg.get_table_data()
+    table_data = lg._links_to_dicts()
     assert type(table_data) is list
     assert type(table_data[0]) is dict
     assert len(table_data) == 2
+    assert table_data[0]["index"] == 1
+    assert table_data[1]["index"] == 2
 
     display_limit = 1
-    table_data = lg.get_table_data(display_limit)
+    table_data = lg._links_to_dicts(display_limit)
     assert len(table_data) == 1

From 2c05efbbc0b7c511beec865241e7cfe8024c5cab Mon Sep 17 00:00:00 2001
From: Annette Lien <a.lien@posteo.de>
Date: Mon, 4 Nov 2024 21:38:40 +0100
Subject: [PATCH 24/24] remove a file that was committed by accident

---
 tests/unit/data/justafile.ipynb | 131 --------------------------------
 1 file changed, 131 deletions(-)
 delete mode 100644 tests/unit/data/justafile.ipynb

diff --git a/tests/unit/data/justafile.ipynb b/tests/unit/data/justafile.ipynb
deleted file mode 100644
index 43a5453b..00000000
--- a/tests/unit/data/justafile.ipynb
+++ /dev/null
@@ -1,131 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from nplinker.genomics.antismash import AntismashBGCLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "loader = AntismashBGCLoader(\"antismash\")\n",
-    "mapping = loader.get_genome_bgcs_mapping()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "isinstance(mapping, dict)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "2"
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(mapping)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "20"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(mapping[\"GCF_000514515.1\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'NZ_AZWB01000006.region001'"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "mapping[\"GCF_000514515.1\"][-1]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "npl_dev",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}