add methods to export results in tabular format #280

Open
wants to merge 25 commits into base: dev
Changes from 24 commits
Commits (25)
6b8d3d3
add print links method to LinkGraph, improve LinkGraph string represe…
liannette Oct 15, 2024
cdd26c3
feat: add a method to print tabular results files
liannette Oct 16, 2024
ec8b8ae
improve method names and docstrings, remove unused method to export g…
liannette Oct 16, 2024
2207df1
improve doctring and typing
liannette Oct 16, 2024
c6e166a
fix a failing test
liannette Oct 16, 2024
32ca3dd
refactor a little bit the spectrum method to covert to dict
liannette Oct 16, 2024
8e7945d
change the output format for gnps_annotations in metabolomics results…
liannette Oct 16, 2024
2592810
fix: convert int to str before using join
liannette Oct 17, 2024
7f53de8
change representation of empty values in output files for improved in…
liannette Oct 17, 2024
ad049c8
refactoring the export methods
liannette Oct 17, 2024
b220fb0
small refactor: specify staticmethod
liannette Oct 18, 2024
f98fa98
add more tests
liannette Oct 18, 2024
a8a8329
correct typing in doctrings
liannette Oct 18, 2024
c6c33e6
typing: changed typings to pass mypy static typing checks
liannette Oct 22, 2024
a260338
refactor: change the order of methods/functions
liannette Oct 22, 2024
3289683
restore the order of already existing functions and methods
liannette Nov 4, 2024
d2272e2
make dicts json compatible
liannette Nov 4, 2024
cb49209
rename functions and variables
liannette Nov 4, 2024
6a4da5f
refactor: changed the place when the index is added to the link dict
liannette Nov 4, 2024
edcc7db
use csv package to write the tabular output files
liannette Nov 4, 2024
05f9f76
make sure all elements of the input list have the same type of data.
liannette Nov 4, 2024
bff7731
shorten to long doc string lines, correct some doc strings
liannette Nov 4, 2024
d4bf9fb
tests: adapted the test to the changes
liannette Nov 4, 2024
2c05efb
remove a file that was committed by accident
liannette Nov 4, 2024
229a11d
Merge branch 'NPLinker:dev' into output_files
liannette Nov 5, 2024
2 changes: 1 addition & 1 deletion .github/workflows/format-typing-check.yml
@@ -37,7 +37,7 @@ jobs:
- name: Install ruff and mypy
run: |
pip install ruff mypy typing_extensions \
types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs
types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs
- name: Get all changed python files
id: changed-python-files
uses: tj-actions/changed-files@v44
1 change: 1 addition & 0 deletions pyproject.toml
@@ -63,6 +63,7 @@ dev = [
"types-beautifulsoup4",
"types-jsonschema",
"types-networkx",
"types-tabulate",
"pandas-stubs",
# docs
"black",
31 changes: 31 additions & 0 deletions src/nplinker/genomics/bgc.py
@@ -1,6 +1,7 @@
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from typing import Any
from deprecated import deprecated
from nplinker.strain import Strain
from .aa_pred import predict_aa
@@ -173,6 +174,36 @@ def is_mibig(self) -> bool:
"""
return self.id.startswith("BGC")

def to_dict(self) -> dict[str, Any]:
"""Convert the BGC object to a dictionary for exporting results.

Compiles relevant information from the BGC object and formats it into a dictionary.
Each key-value pair in the dictionary represents a specific attribute of the BGC.

Returns:
A dictionary containing the following key-value pairs:
- GCF_id (list[str]): A list of GCF IDs.
- GCF_bigscape_class (list[str | None]): A list of BiG-SCAPE classes.
- strain_id (str | None): The ID of the strain.
- description (str | None): A description of the BGC.
- BGC_name (str): The name of the BGC.
- product_prediction (list[str]): The predicted products or product classes of the BGC.
- mibig_bgc_class (list[str] | None): MIBiG biosynthetic classes.
- antismash_id (str | None): The antiSMASH ID.
- antismash_region (int | None): The antiSMASH region number.
"""
return {
"GCF_id": [gcf.id for gcf in self.parents if gcf.id is not None],
"GCF_bigscape_class": [bsc for bsc in self.bigscape_classes if bsc is not None],
"strain_id": self.strain.id if self.strain is not None else None,
"description": self.description,
"BGC_name": self.id,
"product_prediction": list(self.product_prediction),
"mibig_bgc_class": self.mibig_bgc_class,
"antismash_id": self.antismash_id,
"antismash_region": self.antismash_region,
}

# CG: why not providing whole product but only amino acid as product monomer?
# this property is not used in NPLinker core business.
@property
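For orientation, a minimal usage sketch of the new BGC.to_dict() method, modelled on the constructor calls used in this PR's tests/unit/genomics/test_bgc.py; the import paths are assumptions based on the nplinker package layout and are not part of the diff.

# Sketch only: exercising BGC.to_dict() as added in this PR.
# Import paths are assumed from the nplinker package layout (not shown in the diff).
from nplinker.genomics import BGC, GCF
from nplinker.strain import Strain

bgc = BGC("BGC0000001", "Polyketide", "NRP")  # BGC id plus product predictions
bgc.strain = Strain("sample_strain")
bgc.description = "Sample description"
bgc.add_parent(GCF("1"))

row = bgc.to_dict()
print(row["BGC_name"])            # "BGC0000001"
print(row["GCF_id"])              # ["1"]
print(row["product_prediction"])  # ["Polyketide", "NRP"]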
27 changes: 27 additions & 0 deletions src/nplinker/metabolomics/spectrum.py
@@ -1,6 +1,7 @@
from __future__ import annotations
from functools import cached_property
from typing import TYPE_CHECKING
from typing import Any
import numpy as np
from nplinker.strain import Strain
from nplinker.strain import StrainCollection
@@ -97,3 +98,29 @@ def has_strain(self, strain: Strain) -> bool:
True when the given strain exist in the spectrum.
"""
return strain in self.strains

def to_dict(self) -> dict[str, Any]:
"""Convert the Spectrum object to a dictionary for exporting results.

This method compiles relevant information from the Spectrum object into a dictionary format.
Each key-value pair in the dictionary represents a specific attribute of the Spectrum object.

Returns:
A dictionary containing the following key-value pairs:
- "spectrum_id" (str): The unique identifier of the spectrum.
- "num_strains_with_spectrum" (int): The number of strains associated with the spectrum.
- "precursor_mz" (float): The precursor m/z value, rounded to four decimal places.
- "rt" (float): The retention time, rounded to three decimal places.
- "molecular_family" (str | None ): The identifier of the molecular family.
- "gnps_id" (str | None ): The GNPS identifier.
- "gnps_annotations" (dict[str, str]): A dictionary of GNPS annotations.
"""
return {
"spectrum_id": self.id,
"num_strains_with_spectrum": len(self.strains),
"precursor_mz": round(self.precursor_mz, 4),
"rt": round(self.rt, 3),
"molecular_family": self.family.id if self.family else None,
"gnps_id": self.gnps_id,
"gnps_annotations": self.gnps_annotations,
}
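To illustrate the shape of the dictionary returned by Spectrum.to_dict() and that it is plain JSON-compatible data (as the commit "make dicts json compatible" indicates), a sketch with made-up values for a hypothetical spectrum:

import json

# Hypothetical output of Spectrum.to_dict(); values are illustrative, not from a real dataset.
spectrum_row = {
    "spectrum_id": "spectrum1",
    "num_strains_with_spectrum": 2,
    "precursor_mz": 1234.5678,       # rounded to 4 decimal places
    "rt": 12.345,                    # rounded to 3 decimal places
    "molecular_family": None,        # None when the spectrum has no molecular family
    "gnps_id": None,
    "gnps_annotations": {"Compound_Name": "example_compound"},
}

# The dict contains only JSON-serialisable types, so it can be dumped directly.
print(json.dumps(spectrum_row, indent=2))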
53 changes: 53 additions & 0 deletions src/nplinker/nplinker.py
@@ -1,4 +1,5 @@
from __future__ import annotations
import csv
import logging
import pickle
from collections.abc import Sequence
@@ -355,3 +356,55 @@ def save_data(
data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links)
with open(file, "wb") as f:
pickle.dump(data, f)

def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None:
"""Exports a list of BGC or Spectrum objects to a specified file in tab-separated format.

Args:
objects (Sequence[BGC] | Sequence[Spectrum]): A sequence of BGC or Spectrum objects to be exported.
filename (str): The name of the file where the data will be saved.
"""
if not objects:
raise ValueError("No objects provided to export")

# Ensure all elements in the list are of the same type
obj_type = type(objects[0])
if not all(isinstance(obj, obj_type) for obj in objects):
raise TypeError("All objects in the list must be of the same type")

headers = objects[0].to_dict().keys()
with open(self._output_dir / filename, "w", newline="") as outfile:
writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t")
writer.writeheader()
for obj in objects:
row = obj.to_dict()
for header in headers:
value = row[header]
# Convert list, tuple, set to comma-separated string
if isinstance(value, (list, tuple, set)):
row[header] = ", ".join(map(str, value))
# Convert dict to comma-separated string
elif isinstance(value, dict):
row[header] = ", ".join([f"{k}:{v}" for k, v in value.items()])
# Convert anything else to string
else:
row[header] = str(value) if value else ""
# Replace tabs with 4 spaces
row[header] = row[header].replace("\t", "    ")
writer.writerow(row)

def to_tsv(self, lg: LinkGraph | None = None) -> None:
"""Exports the results to the output directory in tab-separated format.

This method exports genomics and metabolomics data to their respective
TSV files in the specified output directory. If a LinkGraph object is
provided, it also exports the links data to a TSV file.

Args:
lg (LinkGraph | None): An optional LinkGraph object. If provided,
the links data will be exported to 'links.tsv'.
"""
self.objects_to_tsv(self.bgcs, "genomics_data.tsv")
self.objects_to_tsv(self.spectra, "metabolomics_data.tsv")
if lg is not None:
lg.to_tsv(self._output_dir / "links.tsv")
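As a reference for how individual fields end up in the TSV, here is a standalone sketch of the value-serialisation rules applied inside objects_to_tsv; the _flatten helper and the sample row are hypothetical, introduced only for illustration.

import csv
import io

def _flatten(value) -> str:
    # Mirrors the per-field rules in objects_to_tsv: collections become
    # comma-separated strings, dicts become "key:value" pairs, empty/None
    # values become empty strings, and literal tabs are replaced with spaces
    # so they cannot break the column layout.
    if isinstance(value, (list, tuple, set)):
        text = ", ".join(map(str, value))
    elif isinstance(value, dict):
        text = ", ".join(f"{k}:{v}" for k, v in value.items())
    else:
        text = str(value) if value else ""
    return text.replace("\t", "    ")

# Hypothetical row, shaped like BGC.to_dict() output.
rows = [{"BGC_name": "BGC0000001", "product_prediction": ["Polyketide", "NRP"], "antismash_region": None}]
buffer = io.StringIO()
writer = csv.DictWriter(buffer, fieldnames=rows[0].keys(), delimiter="\t")
writer.writeheader()
for row in rows:
    writer.writerow({key: _flatten(value) for key, value in row.items()})
print(buffer.getvalue())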
132 changes: 103 additions & 29 deletions src/nplinker/scoring/link_graph.py
@@ -1,6 +1,9 @@
from __future__ import annotations
import csv
from collections.abc import Sequence
from functools import wraps
from os import PathLike
from typing import Any
from typing import Union
from networkx import Graph
from tabulate import tabulate
@@ -76,17 +79,17 @@ def __init__(self) -> None:

Display the empty LinkGraph object:
>>> lg
| | Object 1 | Object 2 | Metcalf Score | Rosetta Score |
|----|------------|------------|-----------------|-----------------|
| index | genomic_object_id | genomic_object_type | metabolomic_object_id | metabolomic_object_type | metcalf_score | rosetta_score |
|---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------|

Add a link between a GCF and a Spectrum object:
>>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5}))

Display all links in LinkGraph object:
>>> lg
| | Object 1 | Object 2 | Metcalf Score | Rosetta Score |
|----|--------------|------------------------|-----------------|-----------------|
| 1 | GCF(id=gcf1) | Spectrum(id=spectrum1) | 1 | - |
| index | genomic_object_id | genomic_object_type | metabolomic_object_id | metabolomic_object_type | metcalf_score | rosetta_score |
|---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------|
| 1 | 1 | GCF | 1 | Spectrum | 1.00 | |

Get all links for a given object:
>>> lg[gcf]
@@ -103,6 +106,18 @@
Get the link data between two objects:
>>> lg.get_link_data(gcf, spectrum)
{"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})}

Filter the links for `gcf1` and `gcf2`:
>>> new_lg = lg.filter([gcf1, gcf2])

Filter the links for `spectrum1` and `spectrum2`:
>>> new_lg = lg.filter([spectrum1, spectrum2])

Filter the links between two lists of objects:
>>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2])

Export the links to a file:
>>> lg.to_tsv("links.tsv")
"""
self._g: Graph = Graph()

@@ -267,6 +282,53 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -

return lg

@staticmethod
def link_to_dict(link: LINK) -> dict[str, Any]:
"""Convert a link to a dictionary representation.

Args:
link: A tuple containing the link information (u, v, data).

Returns:
A dictionary containing the link information with the following keys:
- genomic_object_id (str): The ID of the genomic object.
- genomic_object_type (str): The type of the genomic object.
- metabolomic_object_id (str): The ID of the metabolomic object.
- metabolomic_object_type (str): The type of the metabolomic object.
- metcalf_score (float | str): The Metcalf score, rounded to 2 decimal places.
- rosetta_score (float | str): The Rosetta score, rounded to 2 decimal places.
"""
u, v, data = link
genomic_types = (GCF,)
genomic_object = u if isinstance(u, genomic_types) else v
metabolomic_object = v if isinstance(u, genomic_types) else u
metcalf_score = data.get("metcalf")
rosetta_score = data.get("rosetta")
return {
"genomic_object_id": genomic_object.id,
"genomic_object_type": genomic_object.__class__.__name__,
"metabolomic_object_id": metabolomic_object.id,
"metabolomic_object_type": metabolomic_object.__class__.__name__,
"metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "",
"rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "",
}

def to_tsv(self, file: str | PathLike) -> None:
"""Exports the links in the LinkGraph to a file in tab-separated format.

Args:
file: the file to write the links to.

Examples:
>>> lg.to_tsv("links.tsv")
"""
table_data = self._links_to_dicts()
headers = table_data[0].keys()
with open(file, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=headers, delimiter="\t")
writer.writeheader()
writer.writerows(table_data)

@validate_u
def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None:
"""Filter the links for a given object and add them to the new LinkGraph object."""
@@ -285,35 +347,47 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None:
if link_data is not None:
lg.add_link(u, v, **link_data)

def _get_table_repr(self) -> str:
def _get_table_repr(self, display_limit: int | None = 60) -> str:
"""Generate a table representation of the LinkGraph.

The table is truncated to 60 links.
"""
headers = ["", "Object 1", "Object 2", "Metcalf Score", "Rosetta Score"]
table_data = []
display_limit = 60
Args:
display_limit: The maximum number of links to display in the table. Defaults to 60.

for index, (u, v, data) in enumerate(self.links, start=1):
metcalf_score = data.get("metcalf")
rosetta_score = data.get("rosetta")
Returns:
str: A string representation of the table in GitHub-flavored markdown format. If the
number of links exceeds the display limit, the table is truncated and an additional
line indicating the total number of links is appended.
"""
table = tabulate(
self._links_to_dicts(display_limit),
headers="keys",
tablefmt="github",
stralign="right",
)

if display_limit is not None and len(self.links) > display_limit:
truncated_info = f"...\n[ {len(self.links)} links ]"
table += f"\n{truncated_info}"

row = [
index,
str(u if isinstance(u, GCF) else v),
str(v if isinstance(u, GCF) else u),
f"{metcalf_score.value:.2f}" if metcalf_score else "-",
f"{rosetta_score.value:.2f}" if rosetta_score else "-",
]
table_data.append(row)
return table

if index == display_limit:
break
def _links_to_dicts(self, display_limit: int | None = None) -> list[dict[str, Any]]:
"""Generate the table data for the LinkGraph.

table = tabulate(table_data, headers=headers, tablefmt="github", stralign="right")
This method iterates over the links in the LinkGraph and constructs a table
containing information about genomic and metabolomic objects, as well as their
associated scores. Each row in the table represents a link between a genomic
object and a metabolomic object.

if len(self.links) > display_limit:
truncated_info = f"...\n[ {len(self.links)} links ]"
return f"{table}\n{truncated_info}"
Args:
display_limit (int | None): The maximum number of rows to include in the
table. If None, all rows are included.

return table
Returns:
A list of dictionaries containing the table data.
"""
links = self.links[:display_limit] if display_limit else self.links
link_dicts = []
for idx, link in enumerate(links):
link_dicts.append({"index": idx + 1, **self.link_to_dict(link)})
return link_dicts
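Because links.tsv is plain tab-separated text written with csv.DictWriter, it can be consumed directly with the standard library or pandas. A minimal sketch, assuming the file was produced by lg.to_tsv("links.tsv") as in the docstring example above; the column names come from LinkGraph.link_to_dict().

import csv

# Read back the exported links table.
with open("links.tsv", newline="") as f:
    for row in csv.DictReader(f, delimiter="\t"):
        print(row["genomic_object_id"], row["metabolomic_object_id"], row["metcalf_score"])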
34 changes: 34 additions & 0 deletions tests/unit/genomics/test_bgc.py
@@ -24,3 +24,37 @@ def test_add_and_detach_parent():
assert bgc.parents == {gcf}
bgc.detach_parent(gcf)
assert bgc.parents == set()


def test_to_dict():
bgc = BGC("BGC0000001", "Polyketide", "NRP")
bgc.strain = Strain("sample_strain")
bgc.description = "Sample description"

dict_repr = bgc.to_dict()
assert dict_repr["GCF_id"] == list()
assert dict_repr["GCF_bigscape_class"] == list()
assert dict_repr["BGC_name"] == "BGC0000001"
assert dict_repr["product_prediction"] == ["Polyketide", "NRP"]
assert dict_repr["mibig_bgc_class"] is None
assert dict_repr["description"] == "Sample description"
assert dict_repr["strain_id"] == "sample_strain"
assert dict_repr["antismash_id"] is None
assert dict_repr["antismash_region"] is None

bgc.add_parent(GCF("1"))
bgc.mibig_bgc_class = [
"NRP",
]
bgc.antismash_id = "ABC_0001"
bgc.antismash_region = 1
dict_repr = bgc.to_dict()
assert dict_repr["GCF_id"] == [
"1",
]
assert dict_repr["GCF_bigscape_class"] == list()
assert dict_repr["mibig_bgc_class"] == [
"NRP",
]
assert dict_repr["antismash_id"] == "ABC_0001"
assert dict_repr["antismash_region"] == 1