From 6b8d3d38163595a3e3ebf98e85567a004f4affd3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 15 Oct 2024 16:19:38 +0200 Subject: [PATCH 01/24] add print links method to LinkGraph, improve LinkGraph string representation --- src/nplinker/scoring/link_graph.py | 102 ++++++++++++++++++++------ tests/unit/scoring/test_link_graph.py | 17 +++++ 2 files changed, 97 insertions(+), 22 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 50151997..90336635 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Sequence from functools import wraps +from os import PathLike from typing import Union from networkx import Graph from tabulate import tabulate @@ -76,17 +77,17 @@ def __init__(self) -> None: Display the empty LinkGraph object: >>> lg - | | Object 1 | Object 2 | Metcalf Score | Rosetta Score | - |----|------------|------------|-----------------|-----------------| + | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | + |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| Add a link between a GCF and a Spectrum object: >>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5})) Display all links in LinkGraph object: >>> lg - | | Object 1 | Object 2 | Metcalf Score | Rosetta Score | - |----|--------------|------------------------|-----------------|-----------------| - | 1 | GCF(id=gcf1) | Spectrum(id=spectrum1) | 1 | - | + | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | + |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| + | 1 | GCF | 1 | Spectrum | 1 | 1.00 | - | Get all links for a given object: >>> lg[gcf] @@ -285,35 +286,92 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: if link_data is not None: lg.add_link(u, v, **link_data) - def _get_table_repr(self) -> str: - """Generate a table representation of the LinkGraph. + def get_table_data(self, display_limit: int | None = None) -> list[dict]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. - The table is truncated to 60 links. + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + list: A list of dictionaries, where each dictionary contains + the following keys: + - Index (int) + - Genomic Object Type (str) + - Genomic Object ID (str or int) + - Metabolomic Object Type (str) + - Metabolomic Object ID (str or int) + - Metcalf Score (str, formatted to 2 decimal places, or "-") + - Rosetta Score (str, formatted to 2 decimal places, or "-") """ - headers = ["", "Object 1", "Object 2", "Metcalf Score", "Rosetta Score"] + genomic_object_classes = (GCF,) + table_data = [] - display_limit = 60 for index, (u, v, data) in enumerate(self.links, start=1): + genomic_object = u if isinstance(u, genomic_object_classes) else v + metabolomic_object = v if isinstance(u, genomic_object_classes) else u metcalf_score = data.get("metcalf") rosetta_score = data.get("rosetta") - row = [ - index, - str(u if isinstance(u, GCF) else v), - str(v if isinstance(u, GCF) else u), - f"{metcalf_score.value:.2f}" if metcalf_score else "-", - f"{rosetta_score.value:.2f}" if rosetta_score else "-", - ] - table_data.append(row) - - if index == display_limit: + table_data.append( + { + "Index": index, + "Genomic Object Type": genomic_object.__class__.__name__, + "Genomic Object ID": genomic_object.id, + "Metabolomic Object Type": metabolomic_object.__class__.__name__, + "Metabolomic Object ID": metabolomic_object.id, + "Metcalf Score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", + "Rosetta Score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", + } + ) + + if display_limit is not None and index == display_limit: break - table = tabulate(table_data, headers=headers, tablefmt="github", stralign="right") + return table_data + + def _get_table_repr(self, display_limit: int | None = 60) -> str: + """Generate a table representation of the LinkGraph. + + Args: + display_limit: The maximum number of links to display in the table. Defaults to 60. + + Returns: + str: A string representation of the table in GitHub-flavored markdown format. If the number of links + exceeds the display limit, the table is truncated and an additional line indicating the total number + of links is appended. + """ + table = tabulate( + self.get_table_data(display_limit), + headers="keys", + tablefmt="github", + stralign="right", + ) if len(self.links) > display_limit: truncated_info = f"...\n[ {len(self.links)} links ]" - return f"{table}\n{truncated_info}" + table += f"\n{truncated_info}" return table + + def print_links(self, file: str | PathLike) -> None: + """Print the links in the LinkGraph to a file. + + Args: + file: the file to write the links to. + + Examples: + >>> lg.print_links("links.tsv") + """ + table_data = self.get_table_data() + headers = table_data[0].keys() + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for row in table_data: + f.write("\t".join(str(row[h]) for h in headers) + "\n") diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 9f7c9d7d..4745c856 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -112,3 +112,20 @@ def test_filter(gcfs, spectra, score): # test filtering with GCFs and Spectra lg_filtered = lg.filter(u_nodes, v_nodes) assert len(lg_filtered) == 4 + + +def test_get_table_data(lg, gcfs, spectra, score): + table_data = lg.get_table_data() + assert type(table_data) is list + assert type(table_data[0]) is dict + assert table_data == [ + { + "Index": 1, + "Genomic Object Type": gcfs[0].__class__.__name__, + "Genomic Object ID": gcfs[0].id, + "Metabolomic Object Type": spectra[0].__class__.__name__, + "Metabolomic Object ID": spectra[0].id, + "Metcalf Score": f"{score.value:.2f}", + "Rosetta Score": "-", + }, + ] From cdd26c3330c6867c1be09b1a5ed90e3a16088fa3 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 17:17:38 +0200 Subject: [PATCH 02/24] feat: add a method to print tabular results files --- src/nplinker/genomics/bgc.py | 21 ++++++++++++ src/nplinker/metabolomics/spectrum.py | 16 +++++++++ src/nplinker/nplinker.py | 49 +++++++++++++++++++++++++++ src/nplinker/scoring/link_graph.py | 14 ++++---- 4 files changed, 93 insertions(+), 7 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 08978587..57161d07 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -192,3 +192,24 @@ def aa_predictions(self) -> list: for p in predict_aa(self.antismash_file): self._aa_predictions[p[0]] = p[1] return [self._aa_predictions] + + def to_dict(self) -> dict: + """Convert the BGC object to a dictionary that can be used to export the results. + + Returns: + A dictionary containing relavant information about the BGC object. + """ + gcf_ids = [gcf.id for gcf in self.parents if gcf.id is not None] + gcf_bsc = [gcf.bigscape_class for gcf in self.parents if gcf.bigscape_class is not None] + + return { + "GCF_id": ", ".join(gcf_ids) if gcf_ids else None, + "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else None, + "BGC_name": self.id, + "strain_id": self.strain.id, + "description": self.description, + "antismash_id": self.antismash_id, + "antismash_region": self.antismash_region, + "antismash_cluster_type": ", ".join(self.product_prediction), + "mibig_bgc_class": self.mibig_bgc_class, + } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 61d8d421..a2891a2b 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -97,3 +97,19 @@ def has_strain(self, strain: Strain) -> bool: True when the given strain exist in the spectrum. """ return strain in self.strains + + def to_dict(self) -> dict: + """Convert the Spectrum object to a dictionary that can be used to export the results. + + Returns: + A dictionary containing relavant information about the Spectrum object. + """ + return { + "spectrum_id": self.id, + "num_strains_with_spectrum": len(self.strains), + "precursor_mz": self.precursor_mz, + "rt": self.rt, + "molecular_family": self.family.id if self.family else None, + "gnps_id": self.gnps_id, + "gnps_annotations": self.gnps_annotations, + } diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index a7146dcc..f15ee1dd 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -355,3 +355,52 @@ def save_data( data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links) with open(file, "wb") as f: pickle.dump(data, f) + + def print_bgcs(self, file: str | PathLike) -> None: + """Prints the BGC data to a specified file in tab-separated format. + + Args: + file: The path to the file where the BGC data will be printed. + """ + headers = self.bgcs[0].to_dict().keys() + + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for bgc in self.bgcs: + row_data = bgc.to_dict() + f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + + def print_gcfs(self, file: str | PathLike) -> None: + """Prints the GCF data to a specified file in tab-separated format. + + Args: + file: The path to the file where the GCF data will be printed. + """ + headers = self.gcfs[0].to_dict().keys() + + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for gcf in self.gcfs: + row_data = gcf.to_dict() + f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + + def print_spectra(self, file: str | PathLike) -> None: + """Prints the Spectrum data to a specified file in tab-separated format. + + Args: + file: The path to the file where the Spectrum data will be printed. + """ + headers = self.spectra[0].to_dict().keys() + + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for spectrum in self.spectra: + row_data = spectrum.to_dict() + f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + + def print_results(self, lg: LinkGraph | None = None) -> None: + """Prints the results to the output directory in tab-separated format.""" + self.print_bgcs(self._output_dir / "genomics_data.tsv") + self.print_spectra(self._output_dir / "metabolomics_data.tsv") + if lg is not None: + lg.print_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 90336635..fd1db438 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -321,13 +321,13 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict]: table_data.append( { - "Index": index, - "Genomic Object Type": genomic_object.__class__.__name__, - "Genomic Object ID": genomic_object.id, - "Metabolomic Object Type": metabolomic_object.__class__.__name__, - "Metabolomic Object ID": metabolomic_object.id, - "Metcalf Score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", - "Rosetta Score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", + "index": index, + "genomic_object_type": genomic_object.__class__.__name__, + "genomic_object_id": genomic_object.id, + "metabolomic_object_type": metabolomic_object.__class__.__name__, + "metabolomic_object_id": metabolomic_object.id, + "metcalf_score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", + "rosetta_score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", } ) From ec8b8ae0a12885d6ddbc28ebb5b3c90b156e1140 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 17:26:58 +0200 Subject: [PATCH 03/24] improve method names and docstrings, remove unused method to export gcf file --- src/nplinker/nplinker.py | 47 +++++++++++++++--------------- src/nplinker/scoring/link_graph.py | 4 +-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index f15ee1dd..9e87ed92 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,11 +356,13 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def print_bgcs(self, file: str | PathLike) -> None: - """Prints the BGC data to a specified file in tab-separated format. + def export_genomics_data(self, file: str | PathLike) -> None: + """Exports the genomics data to a specified file in tab-separated format. + + Each row in the file corresponds to a BGC object. Args: - file: The path to the file where the BGC data will be printed. + file: The path to the file where the genomics data will be printed. """ headers = self.bgcs[0].to_dict().keys() @@ -370,25 +372,13 @@ def print_bgcs(self, file: str | PathLike) -> None: row_data = bgc.to_dict() f.write("\t".join(str(row_data[h]) for h in headers) + "\n") - def print_gcfs(self, file: str | PathLike) -> None: - """Prints the GCF data to a specified file in tab-separated format. - - Args: - file: The path to the file where the GCF data will be printed. - """ - headers = self.gcfs[0].to_dict().keys() - - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for gcf in self.gcfs: - row_data = gcf.to_dict() - f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + def export_metabolomics_data(self, file: str | PathLike) -> None: + """Exports the metabolomics data to a specified file in tab-separated format. - def print_spectra(self, file: str | PathLike) -> None: - """Prints the Spectrum data to a specified file in tab-separated format. + Each row in the file corresponds to a Spectrum object. Args: - file: The path to the file where the Spectrum data will be printed. + file: The path to the file where the metabolomics data will be printed. """ headers = self.spectra[0].to_dict().keys() @@ -398,9 +388,18 @@ def print_spectra(self, file: str | PathLike) -> None: row_data = spectrum.to_dict() f.write("\t".join(str(row_data[h]) for h in headers) + "\n") - def print_results(self, lg: LinkGraph | None = None) -> None: - """Prints the results to the output directory in tab-separated format.""" - self.print_bgcs(self._output_dir / "genomics_data.tsv") - self.print_spectra(self._output_dir / "metabolomics_data.tsv") + def export_results(self, lg: LinkGraph | None = None) -> None: + """Exports the results to the output directory in tab-separated format. + + This method exports genomics and metabolomics data to their respective + TSV files in the specified output directory. If a LinkGraph object is + provided, it also exports the links data to a TSV file. + + Args: + lg (LinkGraph | None): An optional LinkGraph object. If provided, + the links data will be exported to 'links.tsv'. + """ + self.export_genomics_data(self._output_dir / "genomics_data.tsv") + self.export_metabolomics_data(self._output_dir / "metabolomics_data.tsv") if lg is not None: - lg.print_links(self._output_dir / "links.tsv") + lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index fd1db438..86a9ca6e 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -360,8 +360,8 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: return table - def print_links(self, file: str | PathLike) -> None: - """Print the links in the LinkGraph to a file. + def export_links(self, file: str | PathLike) -> None: + """Exports the links in the LinkGraph to a file. Args: file: the file to write the links to. From 2207df1eb1a5cc7a5df8d801a103dfe31a162f68 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:07:22 +0200 Subject: [PATCH 04/24] improve doctring and typing --- src/nplinker/genomics/bgc.py | 26 +++++++++++++++++------- src/nplinker/metabolomics/spectrum.py | 29 ++++++++++++++++++++------- src/nplinker/scoring/link_graph.py | 2 +- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 57161d07..880a3710 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -193,23 +193,35 @@ def aa_predictions(self) -> list: self._aa_predictions[p[0]] = p[1] return [self._aa_predictions] - def to_dict(self) -> dict: + def to_dict(self) -> dict[str, any]: """Convert the BGC object to a dictionary that can be used to export the results. + This method gathers relevant information from the BGC object and formats it into a dictionary + where each key-value pair represents a specific attribute of the BGC. + Returns: - A dictionary containing relavant information about the BGC object. + dict[str, str]: A dictionary containing relevant information about the BGC object, including: + - GCF_id: A comma-separated string of GCF IDs or "-" if none. + - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none. + - BGC_name: The name of the BGC. + - strain_id: The ID of the strain. + - description: A description of the BGC. + - antismash_id: The antiSMASH ID. + - antismash_region: The antiSMASH region. + - antismash_cluster_type: A comma-separated string of product predictions. + - mibig_bgc_class: The MiBIG BGC class or "-" if none. """ - gcf_ids = [gcf.id for gcf in self.parents if gcf.id is not None] - gcf_bsc = [gcf.bigscape_class for gcf in self.parents if gcf.bigscape_class is not None] + gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None} + gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None} return { - "GCF_id": ", ".join(gcf_ids) if gcf_ids else None, - "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else None, + "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-", + "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else "-", "BGC_name": self.id, "strain_id": self.strain.id, "description": self.description, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, "antismash_cluster_type": ", ".join(self.product_prediction), - "mibig_bgc_class": self.mibig_bgc_class, + "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "-", } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index a2891a2b..20f64c9f 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -98,18 +98,33 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains - def to_dict(self) -> dict: + def to_dict(self) -> dict[str, any]: """Convert the Spectrum object to a dictionary that can be used to export the results. + This method gathers relevant information from the Spectrum object and formats it into a dictionary + where each key-value pair represents a specific attribute of the Spectrum. + Returns: - A dictionary containing relavant information about the Spectrum object. + dict[str, str]: A dictionary containing relevant information about the Spectrum object, including: + - "spectrum_id": The unique identifier of the spectrum. + - "num_strains_with_spectrum": The number of strains associated with the spectrum. + - "precursor_mz": The precursor m/z value formatted to four decimal places. + - "rt": The retention time formatted to three decimal places. + - "molecular_family": The identifier of the molecular family, or "-" if not available. + - "gnps_id": The GNPS identifier, or "-" if not available. + - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available. """ + + def format_gnps_annotations(annotations: dict) -> str: + """Format GNPS annotations dictionary into a string.""" + return "; ".join(f"{k}: {v}" for k, v in annotations.items()) + return { "spectrum_id": self.id, "num_strains_with_spectrum": len(self.strains), - "precursor_mz": self.precursor_mz, - "rt": self.rt, - "molecular_family": self.family.id if self.family else None, - "gnps_id": self.gnps_id, - "gnps_annotations": self.gnps_annotations, + "precursor_mz": round(self.precursor_mz, 4), + "rt": round(self.rt, 3), + "molecular_family": self.family.id if self.family else "-", + "gnps_id": self.gnps_id if self.gnps_id else "-", + "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", } diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 86a9ca6e..278f3941 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -286,7 +286,7 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: if link_data is not None: lg.add_link(u, v, **link_data) - def get_table_data(self, display_limit: int | None = None) -> list[dict]: + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: """Generate the table data for the LinkGraph. This method iterates over the links in the LinkGraph and constructs a table From c6e166a04647876cefd276036f1f9f799e7ecbbb Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:15:04 +0200 Subject: [PATCH 05/24] fix a failing test --- tests/unit/scoring/test_link_graph.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 4745c856..4c7e68b3 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -118,14 +118,11 @@ def test_get_table_data(lg, gcfs, spectra, score): table_data = lg.get_table_data() assert type(table_data) is list assert type(table_data[0]) is dict - assert table_data == [ - { - "Index": 1, - "Genomic Object Type": gcfs[0].__class__.__name__, - "Genomic Object ID": gcfs[0].id, - "Metabolomic Object Type": spectra[0].__class__.__name__, - "Metabolomic Object ID": spectra[0].id, - "Metcalf Score": f"{score.value:.2f}", - "Rosetta Score": "-", - }, - ] + assert len(table_data) == 1 + assert table_data[0]["index"] == 1 + assert table_data[0]["genomic_object_type"] == gcfs[0].__class__.__name__ + assert table_data[0]["genomic_object_id"] == gcfs[0].id + assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__ + assert table_data[0]["metabolomic_object_id"] == spectra[0].id + assert table_data[0]["metcalf_score"] == f"{score.value:.2f}" + assert table_data[0]["rosetta_score"] == "-" From 32ca3ddd534c23cceede4c6318b82d5bd42c1ba2 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:21:43 +0200 Subject: [PATCH 06/24] refactor a little bit the spectrum method to covert to dict --- src/nplinker/metabolomics/spectrum.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 20f64c9f..3dc6b3ed 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -98,6 +98,10 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains + def _formatted_gnps_annotations(self) -> str: + """Format GNPS annotations dictionary into a string.""" + return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items()) + def to_dict(self) -> dict[str, any]: """Convert the Spectrum object to a dictionary that can be used to export the results. @@ -114,11 +118,6 @@ def to_dict(self) -> dict[str, any]: - "gnps_id": The GNPS identifier, or "-" if not available. - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available. """ - - def format_gnps_annotations(annotations: dict) -> str: - """Format GNPS annotations dictionary into a string.""" - return "; ".join(f"{k}: {v}" for k, v in annotations.items()) - return { "spectrum_id": self.id, "num_strains_with_spectrum": len(self.strains), @@ -126,5 +125,7 @@ def format_gnps_annotations(annotations: dict) -> str: "rt": round(self.rt, 3), "molecular_family": self.family.id if self.family else "-", "gnps_id": self.gnps_id if self.gnps_id else "-", - "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", + "gnps_annotations": self._formatted_gnps_annotations() + if self.gnps_annotations + else "-", } From 8e7945d3318a41de213a113ac2fb7c259f1002f5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:41:57 +0200 Subject: [PATCH 07/24] change the output format for gnps_annotations in metabolomics results file, improve docstrings --- src/nplinker/genomics/bgc.py | 26 +++++++++++------------ src/nplinker/metabolomics/spectrum.py | 30 +++++++++++---------------- src/nplinker/scoring/link_graph.py | 17 +++++++-------- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 880a3710..2624cfae 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -194,22 +194,22 @@ def aa_predictions(self) -> list: return [self._aa_predictions] def to_dict(self) -> dict[str, any]: - """Convert the BGC object to a dictionary that can be used to export the results. + """Convert the BGC object to a dictionary for exporting results. - This method gathers relevant information from the BGC object and formats it into a dictionary - where each key-value pair represents a specific attribute of the BGC. + This method compiles relevant information from the BGC object and formats it into a dictionary. + Each key-value pair in the dictionary represents a specific attribute of the BGC. Returns: - dict[str, str]: A dictionary containing relevant information about the BGC object, including: - - GCF_id: A comma-separated string of GCF IDs or "-" if none. - - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none. - - BGC_name: The name of the BGC. - - strain_id: The ID of the strain. - - description: A description of the BGC. - - antismash_id: The antiSMASH ID. - - antismash_region: The antiSMASH region. - - antismash_cluster_type: A comma-separated string of product predictions. - - mibig_bgc_class: The MiBIG BGC class or "-" if none. + A dictionary containing the following key-value pairs: + - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available. + - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available. + - BGC_name (str): The name of the BGC. + - strain_id (str): The ID of the strain. + - description (str): A description of the BGC. + - antismash_id (str): The antiSMASH ID. + - antismash_region (str): The antiSMASH region. + - antismash_cluster_type (str): A comma-separated string of product predictions. + - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. """ gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None} gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None} diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 3dc6b3ed..5c929a13 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -98,25 +98,21 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains - def _formatted_gnps_annotations(self) -> str: - """Format GNPS annotations dictionary into a string.""" - return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items()) - def to_dict(self) -> dict[str, any]: - """Convert the Spectrum object to a dictionary that can be used to export the results. + """Convert the Spectrum object to a dictionary for exporting results. - This method gathers relevant information from the Spectrum object and formats it into a dictionary - where each key-value pair represents a specific attribute of the Spectrum. + This method compiles relevant information from the Spectrum object into a dictionary format. + Each key-value pair in the dictionary represents a specific attribute of the Spectrum Object. Returns: - dict[str, str]: A dictionary containing relevant information about the Spectrum object, including: - - "spectrum_id": The unique identifier of the spectrum. - - "num_strains_with_spectrum": The number of strains associated with the spectrum. - - "precursor_mz": The precursor m/z value formatted to four decimal places. - - "rt": The retention time formatted to three decimal places. - - "molecular_family": The identifier of the molecular family, or "-" if not available. - - "gnps_id": The GNPS identifier, or "-" if not available. - - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available. + A dictionary containing containing the following key-value pairs: + - "spectrum_id" (str): The unique identifier of the spectrum. + - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum. + - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places. + - "rt" (float): The retention time, rounded to three decimal places. + - "molecular_family" (str): The identifier of the molecular family, or "-" if not available. + - "gnps_id" (str): The GNPS identifier, or "-" if not available. + - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available. """ return { "spectrum_id": self.id, @@ -125,7 +121,5 @@ def to_dict(self) -> dict[str, any]: "rt": round(self.rt, 3), "molecular_family": self.family.id if self.family else "-", "gnps_id": self.gnps_id if self.gnps_id else "-", - "gnps_annotations": self._formatted_gnps_annotations() - if self.gnps_annotations - else "-", + "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", } diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 278f3941..4f7753b9 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -299,15 +299,14 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any table. If None, all rows are included. Returns: - list: A list of dictionaries, where each dictionary contains - the following keys: - - Index (int) - - Genomic Object Type (str) - - Genomic Object ID (str or int) - - Metabolomic Object Type (str) - - Metabolomic Object ID (str or int) - - Metcalf Score (str, formatted to 2 decimal places, or "-") - - Rosetta Score (str, formatted to 2 decimal places, or "-") + A list of dictionaries, where each dictionary contains + - index (int): The index of the link. + - genomic_object_type (str): The type of the genomic object. + - genomic_object_id (str or int): The ID of the genomic object. + - metabolomic_object_type (str): The type of the metabolomic object. + - metabolomic_object_id (str or int): The ID of the metabolomic object. + - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-". + - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-". """ genomic_object_classes = (GCF,) From 25928100a34bc0e0d49b706895e43fab50b0cee7 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 17 Oct 2024 14:47:36 +0200 Subject: [PATCH 08/24] fix: convert int to str before using join --- src/nplinker/genomics/bgc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 2624cfae..d9787a38 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -211,8 +211,8 @@ def to_dict(self) -> dict[str, any]: - antismash_cluster_type (str): A comma-separated string of product predictions. - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. """ - gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None} - gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None} + gcf_ids = {str(gcf.id) for gcf in self.parents if gcf.id is not None} + gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None} return { "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-", From 7f53de8456cd999c456bdd28fad07b2aca541c8a Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 17 Oct 2024 15:40:07 +0200 Subject: [PATCH 09/24] change representation of empty values in output files for improved integration to excel --- src/nplinker/genomics/bgc.py | 6 +++--- src/nplinker/metabolomics/spectrum.py | 6 +++--- src/nplinker/scoring/link_graph.py | 24 ++++++++++++++++++------ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index d9787a38..902ba5f2 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -215,13 +215,13 @@ def to_dict(self) -> dict[str, any]: gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None} return { - "GCF_id": ", ".join(gcf_ids) if gcf_ids else "-", - "GCF_bigscape_class": ", ".join(gcf_bsc) if gcf_bsc else "-", + "GCF_id": ", ".join(gcf_ids), + "GCF_bigscape_class": ", ".join(gcf_bsc), "BGC_name": self.id, "strain_id": self.strain.id, "description": self.description, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, "antismash_cluster_type": ", ".join(self.product_prediction), - "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "-", + "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "", } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 5c929a13..2b89dddc 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -119,7 +119,7 @@ def to_dict(self) -> dict[str, any]: "num_strains_with_spectrum": len(self.strains), "precursor_mz": round(self.precursor_mz, 4), "rt": round(self.rt, 3), - "molecular_family": self.family.id if self.family else "-", - "gnps_id": self.gnps_id if self.gnps_id else "-", - "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", + "molecular_family": self.family.id if self.family else "", + "gnps_id": self.gnps_id if self.gnps_id else "", + "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "", } diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 4f7753b9..bd715723 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -104,6 +104,18 @@ def __init__(self) -> None: Get the link data between two objects: >>> lg.get_link_data(gcf, spectrum) {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + + Filter the links for `gcf1` and `gcf2`: + >>> new_lg = lg.filter([gcf1, gcf2]) + + Filter the links for `spectrum1` and `spectrum2`: + >>> new_lg = lg.filter([spectrum1, spectrum2]) + + Filter the links between two lists of objects: + >>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2]) + + Export the links to a file: + >>> lg.export_links("links.tsv") """ self._g: Graph = Graph() @@ -305,8 +317,8 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any - genomic_object_id (str or int): The ID of the genomic object. - metabolomic_object_type (str): The type of the metabolomic object. - metabolomic_object_id (str or int): The ID of the metabolomic object. - - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-". - - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-". + - metcalf_score (float): The Metcalf score, rounded to 2 decimal places. + - rosetta_score (float): The Rosetta score, rounded to 2 decimal places. """ genomic_object_classes = (GCF,) @@ -321,12 +333,12 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any table_data.append( { "index": index, - "genomic_object_type": genomic_object.__class__.__name__, "genomic_object_id": genomic_object.id, - "metabolomic_object_type": metabolomic_object.__class__.__name__, + "genomic_object_type": genomic_object.__class__.__name__, "metabolomic_object_id": metabolomic_object.id, - "metcalf_score": f"{metcalf_score.value:.2f}" if metcalf_score else "-", - "rosetta_score": f"{rosetta_score.value:.2f}" if rosetta_score else "-", + "metabolomic_object_type": metabolomic_object.__class__.__name__, + "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "", + "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } ) From ad049c843384c68dfa24dbee5ab99d00f6726c27 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Thu, 17 Oct 2024 17:00:59 +0200 Subject: [PATCH 10/24] refactoring the export methods --- src/nplinker/genomics/bgc.py | 9 ++-- src/nplinker/metabolomics/spectrum.py | 6 +-- src/nplinker/nplinker.py | 55 +++++++++++------------- src/nplinker/scoring/link_graph.py | 62 +++++++++++++++------------ tests/unit/scoring/test_link_graph.py | 4 +- 5 files changed, 68 insertions(+), 68 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 902ba5f2..486b9861 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -211,17 +211,14 @@ def to_dict(self) -> dict[str, any]: - antismash_cluster_type (str): A comma-separated string of product predictions. - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. """ - gcf_ids = {str(gcf.id) for gcf in self.parents if gcf.id is not None} - gcf_bsc = {str(bsc) for bsc in self.bigscape_classes if bsc is not None} - return { - "GCF_id": ", ".join(gcf_ids), - "GCF_bigscape_class": ", ".join(gcf_bsc), + "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, + "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, "BGC_name": self.id, "strain_id": self.strain.id, "description": self.description, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, - "antismash_cluster_type": ", ".join(self.product_prediction), + "antismash_cluster_type": self.product_prediction, "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "", } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 2b89dddc..aa008579 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -119,7 +119,7 @@ def to_dict(self) -> dict[str, any]: "num_strains_with_spectrum": len(self.strains), "precursor_mz": round(self.precursor_mz, 4), "rt": round(self.rt, 3), - "molecular_family": self.family.id if self.family else "", - "gnps_id": self.gnps_id if self.gnps_id else "", - "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "", + "molecular_family": self.family.id if self.family else None, + "gnps_id": self.gnps_id, + "gnps_annotations": self.gnps_annotations, } diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 9e87ed92..52599957 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,37 +356,34 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def export_genomics_data(self, file: str | PathLike) -> None: - """Exports the genomics data to a specified file in tab-separated format. - - Each row in the file corresponds to a BGC object. + def export_objects(self, objects: BGC | Spectrum, filename: str) -> None: + """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: - file: The path to the file where the genomics data will be printed. + objects (BGC | Spectrum): A list of BGC or Spectrum objects to be exported. + filename (str): The name of the file where the data will be saved. """ - headers = self.bgcs[0].to_dict().keys() - - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for bgc in self.bgcs: - row_data = bgc.to_dict() - f.write("\t".join(str(row_data[h]) for h in headers) + "\n") - - def export_metabolomics_data(self, file: str | PathLike) -> None: - """Exports the metabolomics data to a specified file in tab-separated format. - - Each row in the file corresponds to a Spectrum object. - - Args: - file: The path to the file where the metabolomics data will be printed. - """ - headers = self.spectra[0].to_dict().keys() - - with open(file, "w") as f: + headers = objects[0].to_dict().keys() + with open(self._output_dir / filename, "w") as f: f.write("\t".join(headers) + "\n") - for spectrum in self.spectra: - row_data = spectrum.to_dict() - f.write("\t".join(str(row_data[h]) for h in headers) + "\n") + for obj in objects: + row_data = obj.to_dict() + formatted_row = [] + for header in headers: + item = row_data.get(header, "") + # Convert list, tuple, set to comma-separated string + if isinstance(item, (list, tuple, set)): + formatted_row.append(", ".join(map(str, item))) + # Convert dict to comma-separated string + elif isinstance(item, dict): + formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) + # Convert non-empty value to string + elif item: + formatted_row.append(str(item)) + # Convert empty value to empty string + else: + formatted_row.append("") + f.write("\t".join(formatted_row) + "\n") def export_results(self, lg: LinkGraph | None = None) -> None: """Exports the results to the output directory in tab-separated format. @@ -399,7 +396,7 @@ def export_results(self, lg: LinkGraph | None = None) -> None: lg (LinkGraph | None): An optional LinkGraph object. If provided, the links data will be exported to 'links.tsv'. """ - self.export_genomics_data(self._output_dir / "genomics_data.tsv") - self.export_metabolomics_data(self._output_dir / "metabolomics_data.tsv") + self.export_objects(self.bgcs, "genomics_data.tsv") + self.export_objects(self.spectra, "metabolomics_data.tsv") if lg is not None: lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index bd715723..0d6f4074 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -311,41 +311,47 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any table. If None, all rows are included. Returns: - A list of dictionaries, where each dictionary contains + A list of dictionaries containing the table data. + """ + table_data = [] + for index, link in enumerate(self.links, start=1): + table_data.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return table_data + + def link_to_dict(self, link: LINK, index: int) -> dict[str, any]: + """Convert a link to a dictionary representation. + + Args: + link: A tuple containing the link information (u, v, data). + index: The index of the link. + + Returns: + A dictionary containing the link information with the following keys: - index (int): The index of the link. - - genomic_object_type (str): The type of the genomic object. - genomic_object_id (str or int): The ID of the genomic object. - - metabolomic_object_type (str): The type of the metabolomic object. + - genomic_object_type (str): The type of the genomic object. - metabolomic_object_id (str or int): The ID of the metabolomic object. + - metabolomic_object_type (str): The type of the metabolomic object. - metcalf_score (float): The Metcalf score, rounded to 2 decimal places. - rosetta_score (float): The Rosetta score, rounded to 2 decimal places. """ + u, v, data = link genomic_object_classes = (GCF,) - - table_data = [] - - for index, (u, v, data) in enumerate(self.links, start=1): - genomic_object = u if isinstance(u, genomic_object_classes) else v - metabolomic_object = v if isinstance(u, genomic_object_classes) else u - metcalf_score = data.get("metcalf") - rosetta_score = data.get("rosetta") - - table_data.append( - { - "index": index, - "genomic_object_id": genomic_object.id, - "genomic_object_type": genomic_object.__class__.__name__, - "metabolomic_object_id": metabolomic_object.id, - "metabolomic_object_type": metabolomic_object.__class__.__name__, - "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "", - "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", - } - ) - - if display_limit is not None and index == display_limit: - break - - return table_data + genomic_object = u if isinstance(u, genomic_object_classes) else v + metabolomic_object = v if isinstance(u, genomic_object_classes) else u + metcalf_score = data.get("metcalf") + rosetta_score = data.get("rosetta") + return { + "index": index, + "genomic_object_id": genomic_object.id, + "genomic_object_type": genomic_object.__class__.__name__, + "metabolomic_object_id": metabolomic_object.id, + "metabolomic_object_type": metabolomic_object.__class__.__name__, + "metcalf_score": round(metcalf_score.value, 2) if metcalf_score else "", + "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", + } def _get_table_repr(self, display_limit: int | None = 60) -> str: """Generate a table representation of the LinkGraph. diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 4c7e68b3..5a4e7197 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -124,5 +124,5 @@ def test_get_table_data(lg, gcfs, spectra, score): assert table_data[0]["genomic_object_id"] == gcfs[0].id assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__ assert table_data[0]["metabolomic_object_id"] == spectra[0].id - assert table_data[0]["metcalf_score"] == f"{score.value:.2f}" - assert table_data[0]["rosetta_score"] == "-" + assert table_data[0]["metcalf_score"] == round(score.value, 2) + assert table_data[0]["rosetta_score"] == "" From b220fb024af7be8479fe9facc362c3de83c9520f Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 18 Oct 2024 17:07:21 +0200 Subject: [PATCH 11/24] small refactor: specify staticmethod --- src/nplinker/scoring/link_graph.py | 47 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 0d6f4074..091474e5 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -298,29 +298,8 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: if link_data is not None: lg.add_link(u, v, **link_data) - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: - """Generate the table data for the LinkGraph. - - This method iterates over the links in the LinkGraph and constructs a table - containing information about genomic and metabolomic objects, as well as their - associated scores. Each row in the table represents a link between a genomic - object and a metabolomic object. - - Args: - display_limit (int | None): The maximum number of rows to include in the - table. If None, all rows are included. - - Returns: - A list of dictionaries containing the table data. - """ - table_data = [] - for index, link in enumerate(self.links, start=1): - table_data.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break - return table_data - - def link_to_dict(self, link: LINK, index: int) -> dict[str, any]: + @staticmethod + def link_to_dict(link: LINK, index: int) -> dict[str, any]: """Convert a link to a dictionary representation. Args: @@ -353,6 +332,28 @@ def link_to_dict(self, link: LINK, index: int) -> dict[str, any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. + + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + A list of dictionaries containing the table data. + """ + table_data = [] + for index, link in enumerate(self.links, start=1): + table_data.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return table_data + def _get_table_repr(self, display_limit: int | None = 60) -> str: """Generate a table representation of the LinkGraph. From f98fa98097c6785925382a95a4bb43922bec0e71 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 18 Oct 2024 17:10:01 +0200 Subject: [PATCH 12/24] add more tests --- src/nplinker/genomics/bgc.py | 6 ++--- tests/unit/genomics/test_bgc.py | 28 +++++++++++++++++++++ tests/unit/metabolomics/test_spectrum.py | 32 ++++++++++++++++++++++++ tests/unit/scoring/test_link_graph.py | 28 +++++++++++++++------ 4 files changed, 84 insertions(+), 10 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 486b9861..7d606fe0 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -215,10 +215,10 @@ def to_dict(self) -> dict[str, any]: "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, "BGC_name": self.id, - "strain_id": self.strain.id, + "product_prediction": self.product_prediction, + "mibig_bgc_class": self.mibig_bgc_class, "description": self.description, + "strain_id": self.strain.id, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, - "antismash_cluster_type": self.product_prediction, - "mibig_bgc_class": self.mibig_bgc_class if self.mibig_bgc_class else "", } diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index 1cf3f401..9706e961 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -24,3 +24,31 @@ def test_add_and_detach_parent(): assert bgc.parents == {gcf} bgc.detach_parent(gcf) assert bgc.parents == set() + + +def test_to_dict(): + bgc = BGC("BGC0000001", "Polyketide", "NRP") + bgc.strain = Strain("sample_strain") + bgc.description = "Sample description" + + dict_repr = bgc.to_dict() + assert dict_repr["GCF_id"] == set() + assert dict_repr["GCF_bigscape_class"] == set() + assert dict_repr["BGC_name"] == "BGC0000001" + assert dict_repr["product_prediction"] == ("Polyketide", "NRP") + assert dict_repr["mibig_bgc_class"] is None + assert dict_repr["description"] == "Sample description" + assert dict_repr["strain_id"] == "sample_strain" + assert dict_repr["antismash_id"] is None + assert dict_repr["antismash_region"] is None + + bgc.add_parent(GCF("1")) + bgc.mibig_bgc_class = ("NRP",) + bgc.antismash_id = "ABC_0001" + bgc.antismash_region = 1 + dict_repr = bgc.to_dict() + assert dict_repr["GCF_id"] == {"1"} + assert dict_repr["GCF_bigscape_class"] == set() + assert dict_repr["mibig_bgc_class"] == ("NRP",) + assert dict_repr["antismash_id"] == "ABC_0001" + assert dict_repr["antismash_region"] == 1 diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index e984eaba..d77ea0d4 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -68,3 +68,35 @@ def test_has_strain(): spec.strains.add(strain1) assert spec.has_strain(strain1) assert not spec.has_strain(strain2) + + +def test_to_dict(): + """Test the to_dict method.""" + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) + spec.strains.add(Strain("strain1")) + spec.strains.add(Strain("strain2")) + + dict_repr = spec.to_dict() + assert dict_repr["spectrum_id"] == "spec1" + assert dict_repr["num_strains_with_spectrum"] == 2 + assert dict_repr["precursor_mz"] == 150.0 + assert dict_repr["rt"] == 0.0 + assert dict_repr["molecular_family"] is None + assert dict_repr["gnps_id"] is None + assert dict_repr["gnps_annotations"] == dict() + + # Test with gnps information + spec.gnps_id = "GNPS0001" + spec.gnps_annotations = {"annotation1": "value1"} + + # Test with molecular family + class MockMolecularFamily: + def __init__(self, id): + self.id = id + + spec.family = MockMolecularFamily("family1") + + dict_repr = spec.to_dict() + assert dict_repr["molecular_family"] == "family1" + assert dict_repr["gnps_id"] == "GNPS0001" + assert dict_repr["gnps_annotations"] == {"annotation1": "value1"} diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index 5a4e7197..f1542338 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -114,15 +114,29 @@ def test_filter(gcfs, spectra, score): assert len(lg_filtered) == 4 +def test_link_to_dict(lg, gcfs, spectra, score): + link = lg.links[0] + index = 1 + dict_repr = lg.link_to_dict(link, index) + assert type(dict_repr) is dict + assert dict_repr["index"] == 1 + assert dict_repr["genomic_object_type"] == gcfs[0].__class__.__name__ + assert dict_repr["genomic_object_id"] == gcfs[0].id + assert dict_repr["metabolomic_object_type"] == spectra[0].__class__.__name__ + assert dict_repr["metabolomic_object_id"] == spectra[0].id + assert dict_repr["metcalf_score"] == round(score.value, 2) + assert dict_repr["rosetta_score"] == "" + + def test_get_table_data(lg, gcfs, spectra, score): + # add a second link + lg.add_link(gcfs[1], spectra[1], metcalf=score) + table_data = lg.get_table_data() assert type(table_data) is list assert type(table_data[0]) is dict + assert len(table_data) == 2 + + display_limit = 1 + table_data = lg.get_table_data(display_limit) assert len(table_data) == 1 - assert table_data[0]["index"] == 1 - assert table_data[0]["genomic_object_type"] == gcfs[0].__class__.__name__ - assert table_data[0]["genomic_object_id"] == gcfs[0].id - assert table_data[0]["metabolomic_object_type"] == spectra[0].__class__.__name__ - assert table_data[0]["metabolomic_object_id"] == spectra[0].id - assert table_data[0]["metcalf_score"] == round(score.value, 2) - assert table_data[0]["rosetta_score"] == "" From a8a83290b2a0980ab85cd1655af4f585aa7a2140 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Fri, 18 Oct 2024 17:28:46 +0200 Subject: [PATCH 13/24] correct typing in doctrings --- src/nplinker/genomics/bgc.py | 20 ++++++++++---------- src/nplinker/metabolomics/spectrum.py | 6 +++--- src/nplinker/scoring/link_graph.py | 8 ++++---- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 7d606fe0..2ea0c3b9 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -201,24 +201,24 @@ def to_dict(self) -> dict[str, any]: Returns: A dictionary containing the following key-value pairs: - - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available. - - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available. - - BGC_name (str): The name of the BGC. + - GCF_id (set): A set of GCF IDs. + - GCF_bigscape_class (set): A set of BiG-SCAPE classes. - strain_id (str): The ID of the strain. - - description (str): A description of the BGC. - - antismash_id (str): The antiSMASH ID. - - antismash_region (str): The antiSMASH region. - - antismash_cluster_type (str): A comma-separated string of product predictions. - - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. + - description (str | None): A description of the BGC. + - BGC_name (str): The name of the BGC. + - product_prediction (tuple): (predicted) natural products or product classes of the BGC. + - mibig_bgc_class (tuple[str] | None): MIBiG biosynthetic classes to which the BGC belongs. + - antismash_id (str | None): The antiSMASH ID. + - antismash_region (int | None): The antiSMASH region. """ return { "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, + "strain_id": self.strain.id, + "description": self.description, "BGC_name": self.id, "product_prediction": self.product_prediction, "mibig_bgc_class": self.mibig_bgc_class, - "description": self.description, - "strain_id": self.strain.id, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, } diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index aa008579..4842f9b0 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -110,9 +110,9 @@ def to_dict(self) -> dict[str, any]: - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum. - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places. - "rt" (float): The retention time, rounded to three decimal places. - - "molecular_family" (str): The identifier of the molecular family, or "-" if not available. - - "gnps_id" (str): The GNPS identifier, or "-" if not available. - - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available. + - "molecular_family" (str | None ): The identifier of the molecular family. + - "gnps_id" (str | None ): The GNPS identifier. + - "gnps_annotations" (dict): A dictionary of GNPS annotations. """ return { "spectrum_id": self.id, diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 091474e5..4fc5c23b 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -309,12 +309,12 @@ def link_to_dict(link: LINK, index: int) -> dict[str, any]: Returns: A dictionary containing the link information with the following keys: - index (int): The index of the link. - - genomic_object_id (str or int): The ID of the genomic object. + - genomic_object_id (str): The ID of the genomic object. - genomic_object_type (str): The type of the genomic object. - - metabolomic_object_id (str or int): The ID of the metabolomic object. + - metabolomic_object_id (str): The ID of the metabolomic object. - metabolomic_object_type (str): The type of the metabolomic object. - - metcalf_score (float): The Metcalf score, rounded to 2 decimal places. - - rosetta_score (float): The Rosetta score, rounded to 2 decimal places. + - metcalf_score (float | str): The Metcalf score, rounded to 2 decimal places. + - rosetta_score (float | str): The Rosetta score, rounded to 2 decimal places. """ u, v, data = link genomic_object_classes = (GCF,) From c6c33e6d1dbbcdf60bfaeaee3e5b83305ea36382 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 22 Oct 2024 18:25:26 +0200 Subject: [PATCH 14/24] typing: changed typings to pass mypy static typing checks --- .github/workflows/format-typing-check.yml | 2 +- pyproject.toml | 1 + src/nplinker/genomics/bgc.py | 7 ++++--- src/nplinker/metabolomics/spectrum.py | 3 ++- src/nplinker/nplinker.py | 4 ++-- src/nplinker/scoring/link_graph.py | 7 ++++--- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml index a5def2b9..10ea0990 100644 --- a/.github/workflows/format-typing-check.yml +++ b/.github/workflows/format-typing-check.yml @@ -37,7 +37,7 @@ jobs: - name: Install ruff and mypy run: | pip install ruff mypy typing_extensions \ - types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx pandas-stubs + types-Deprecated types-beautifulsoup4 types-jsonschema types-networkx types-tabulate pandas-stubs - name: Get all changed python files id: changed-python-files uses: tj-actions/changed-files@v44 diff --git a/pyproject.toml b/pyproject.toml index 4ab04c75..c627f6ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ dev = [ "types-beautifulsoup4", "types-jsonschema", "types-networkx", + "types-tabulate", "pandas-stubs", # docs "black", diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 2ea0c3b9..e59d29ae 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -1,6 +1,7 @@ from __future__ import annotations import logging from typing import TYPE_CHECKING +from typing import Any from deprecated import deprecated from nplinker.strain import Strain from .aa_pred import predict_aa @@ -193,7 +194,7 @@ def aa_predictions(self) -> list: self._aa_predictions[p[0]] = p[1] return [self._aa_predictions] - def to_dict(self) -> dict[str, any]: + def to_dict(self) -> dict[str, Any]: """Convert the BGC object to a dictionary for exporting results. This method compiles relevant information from the BGC object and formats it into a dictionary. @@ -203,7 +204,7 @@ def to_dict(self) -> dict[str, any]: A dictionary containing the following key-value pairs: - GCF_id (set): A set of GCF IDs. - GCF_bigscape_class (set): A set of BiG-SCAPE classes. - - strain_id (str): The ID of the strain. + - strain_id (str | None): The ID of the strain. - description (str | None): A description of the BGC. - BGC_name (str): The name of the BGC. - product_prediction (tuple): (predicted) natural products or product classes of the BGC. @@ -214,7 +215,7 @@ def to_dict(self) -> dict[str, any]: return { "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, - "strain_id": self.strain.id, + "strain_id": self.strain.id if self.strain is not None else None, "description": self.description, "BGC_name": self.id, "product_prediction": self.product_prediction, diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 4842f9b0..e0e10e6d 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import cached_property from typing import TYPE_CHECKING +from typing import Any import numpy as np from nplinker.strain import Strain from nplinker.strain import StrainCollection @@ -98,7 +99,7 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains - def to_dict(self) -> dict[str, any]: + def to_dict(self) -> dict[str, Any]: """Convert the Spectrum object to a dictionary for exporting results. This method compiles relevant information from the Spectrum object into a dictionary format. diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 52599957..99e139bf 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,11 +356,11 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def export_objects(self, objects: BGC | Spectrum, filename: str) -> None: + def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: - objects (BGC | Spectrum): A list of BGC or Spectrum objects to be exported. + objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. filename (str): The name of the file where the data will be saved. """ headers = objects[0].to_dict().keys() diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 4fc5c23b..8da29912 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -2,6 +2,7 @@ from collections.abc import Sequence from functools import wraps from os import PathLike +from typing import Any from typing import Union from networkx import Graph from tabulate import tabulate @@ -299,7 +300,7 @@ def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: lg.add_link(u, v, **link_data) @staticmethod - def link_to_dict(link: LINK, index: int) -> dict[str, any]: + def link_to_dict(link: LINK, index: int) -> dict[str, Any]: """Convert a link to a dictionary representation. Args: @@ -332,7 +333,7 @@ def link_to_dict(link: LINK, index: int) -> dict[str, any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any]]: + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: """Generate the table data for the LinkGraph. This method iterates over the links in the LinkGraph and constructs a table @@ -372,7 +373,7 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: stralign="right", ) - if len(self.links) > display_limit: + if display_limit is not None and len(self.links) > display_limit: truncated_info = f"...\n[ {len(self.links)} links ]" table += f"\n{truncated_info}" From a2603381a6161574751cce26f3cebade34ce530b Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Tue, 22 Oct 2024 18:46:48 +0200 Subject: [PATCH 15/24] refactor: change the order of methods/functions --- src/nplinker/genomics/bgc.py | 64 +++++------ src/nplinker/nplinker.py | 138 ++++++++++++------------ src/nplinker/scoring/link_graph.py | 168 ++++++++++++++--------------- 3 files changed, 185 insertions(+), 185 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index e59d29ae..6dfd6c66 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -117,18 +117,6 @@ def __reduce__(self) -> tuple: """Reduce function for pickling.""" return (self.__class__, (self.id, *self.product_prediction), self.__dict__) - def add_parent(self, gcf: GCF) -> None: - """Add a parent GCF to the BGC. - - Args: - gcf: gene cluster family - """ - gcf.add_bgc(self) - - def detach_parent(self, gcf: GCF) -> None: - """Remove a parent GCF.""" - gcf.detach_bgc(self) - @property def strain(self) -> Strain | None: """Get the strain of the BGC.""" @@ -162,6 +150,18 @@ def bigscape_classes(self) -> set[str | None]: """ return {p.bigscape_class for p in self.parents} + def add_parent(self, gcf: GCF) -> None: + """Add a parent GCF to the BGC. + + Args: + gcf: gene cluster family + """ + gcf.add_bgc(self) + + def detach_parent(self, gcf: GCF) -> None: + """Remove a parent GCF.""" + gcf.detach_bgc(self) + def is_mibig(self) -> bool: """Check if the BGC is a MIBiG reference BGC or not. @@ -174,26 +174,6 @@ def is_mibig(self) -> bool: """ return self.id.startswith("BGC") - # CG: why not providing whole product but only amino acid as product monomer? - # this property is not used in NPLinker core business. - @property - @deprecated(version="2.0.0", reason="This method will be removed soon") - def aa_predictions(self) -> list: - """Amino acids as predicted monomers of product. - - Returns: - list of dicts with key as amino acid and value as prediction - probability. - """ - # Load aa predictions and cache them - self._aa_predictions = None - if self._aa_predictions is None: - self._aa_predictions = {} - if self.antismash_file is not None: - for p in predict_aa(self.antismash_file): - self._aa_predictions[p[0]] = p[1] - return [self._aa_predictions] - def to_dict(self) -> dict[str, Any]: """Convert the BGC object to a dictionary for exporting results. @@ -223,3 +203,23 @@ def to_dict(self) -> dict[str, Any]: "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, } + + # CG: why not providing whole product but only amino acid as product monomer? + # this property is not used in NPLinker core business. + @property + @deprecated(version="2.0.0", reason="This method will be removed soon") + def aa_predictions(self) -> list: + """Amino acids as predicted monomers of product. + + Returns: + list of dicts with key as amino acid and value as prediction + probability. + """ + # Load aa predictions and cache them + self._aa_predictions = None + if self._aa_predictions is None: + self._aa_predictions = {} + if self.antismash_file is not None: + for p in predict_aa(self.antismash_file): + self._aa_predictions[p[0]] = p[1] + return [self._aa_predictions] diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 99e139bf..79dffcbe 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -168,34 +168,50 @@ def scoring_methods(self) -> list[str]: """Get names of all valid scoring methods.""" return list(self._valid_scoring_methods.keys()) - def load_data(self): - """Load all data from files into memory. - - This method is a convenience function that calls the - [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files - (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], - and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data - from the files into memory. + def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: + """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. - The loaded data is stored in various data containers for easy access, e.g. - [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, - [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. + Args: + objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. + filename (str): The name of the file where the data will be saved. """ - arranger = DatasetArranger(self.config) - arranger.arrange() - loader = DatasetLoader(self.config) - loader.load() + headers = objects[0].to_dict().keys() + with open(self._output_dir / filename, "w") as f: + f.write("\t".join(headers) + "\n") + for obj in objects: + row_data = obj.to_dict() + formatted_row = [] + for header in headers: + item = row_data.get(header, "") + # Convert list, tuple, set to comma-separated string + if isinstance(item, (list, tuple, set)): + formatted_row.append(", ".join(map(str, item))) + # Convert dict to comma-separated string + elif isinstance(item, dict): + formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) + # Convert non-empty value to string + elif item: + formatted_row.append(str(item)) + # Convert empty value to empty string + else: + formatted_row.append("") + f.write("\t".join(formatted_row) + "\n") - self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} - self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} - self._spec_dict = {spec.id: spec for spec in loader.spectra} - self._mf_dict = {mf.id: mf for mf in loader.mfs} + def export_results(self, lg: LinkGraph | None = None) -> None: + """Exports the results to the output directory in tab-separated format. - self._mibig_bgcs = loader.mibig_bgcs - self._strains = loader.strains - self._product_types = loader.product_types - self._chem_classes = loader.chem_classes - self._class_matches = loader.class_matches + This method exports genomics and metabolomics data to their respective + TSV files in the specified output directory. If a LinkGraph object is + provided, it also exports the links data to a TSV file. + + Args: + lg (LinkGraph | None): An optional LinkGraph object. If provided, + the links data will be exported to 'links.tsv'. + """ + self.export_objects(self.bgcs, "genomics_data.tsv") + self.export_objects(self.spectra, "metabolomics_data.tsv") + if lg is not None: + lg.export_links(self._output_dir / "links.tsv") @overload def get_links( @@ -281,6 +297,35 @@ def get_links( return scoring.get_links(*objects, **scoring_params) + def load_data(self): + """Load all data from files into memory. + + This method is a convenience function that calls the + [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files + (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], + and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data + from the files into memory. + + The loaded data is stored in various data containers for easy access, e.g. + [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, + [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. + """ + arranger = DatasetArranger(self.config) + arranger.arrange() + loader = DatasetLoader(self.config) + loader.load() + + self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} + self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} + self._spec_dict = {spec.id: spec for spec in loader.spectra} + self._mf_dict = {mf.id: mf for mf in loader.mfs} + + self._mibig_bgcs = loader.mibig_bgcs + self._strains = loader.strains + self._product_types = loader.product_types + self._chem_classes = loader.chem_classes + self._class_matches = loader.class_matches + def lookup_bgc(self, id: str) -> BGC | None: """Get the BGC object with the given ID. @@ -355,48 +400,3 @@ def save_data( data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links) with open(file, "wb") as f: pickle.dump(data, f) - - def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: - """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. - - Args: - objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. - filename (str): The name of the file where the data will be saved. - """ - headers = objects[0].to_dict().keys() - with open(self._output_dir / filename, "w") as f: - f.write("\t".join(headers) + "\n") - for obj in objects: - row_data = obj.to_dict() - formatted_row = [] - for header in headers: - item = row_data.get(header, "") - # Convert list, tuple, set to comma-separated string - if isinstance(item, (list, tuple, set)): - formatted_row.append(", ".join(map(str, item))) - # Convert dict to comma-separated string - elif isinstance(item, dict): - formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) - # Convert non-empty value to string - elif item: - formatted_row.append(str(item)) - # Convert empty value to empty string - else: - formatted_row.append("") - f.write("\t".join(formatted_row) + "\n") - - def export_results(self, lg: LinkGraph | None = None) -> None: - """Exports the results to the output directory in tab-separated format. - - This method exports genomics and metabolomics data to their respective - TSV files in the specified output directory. If a LinkGraph object is - provided, it also exports the links data to a TSV file. - - Args: - lg (LinkGraph | None): An optional LinkGraph object. If provided, - the links data will be exported to 'links.tsv'. - """ - self.export_objects(self.bgcs, "genomics_data.tsv") - self.export_objects(self.spectra, "metabolomics_data.tsv") - if lg is not None: - lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 8da29912..e01dbc59 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -198,44 +198,21 @@ def add_link( self._g.add_edge(u, v, **data) - @validate_uv - def has_link(self, u: Entity, v: Entity) -> bool: - """Check if there is a link between two objects. - - Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - True if there is a link between the two objects, False otherwise - - Examples: - >>> lg.has_link(gcf, spectrum) - True - """ - return self._g.has_edge(u, v) - - @validate_uv - def get_link_data( - self, - u: Entity, - v: Entity, - ) -> LINK_DATA | None: - """Get the data for a link between two objects. + def export_links(self, file: str | PathLike) -> None: + """Exports the links in the LinkGraph to a file. Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - A dictionary of scoring methods and their data for the link between the two objects, or - None if there is no link between the two objects. + file: the file to write the links to. Examples: - >>> lg.get_link_data(gcf, spectrum) - {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + >>> lg.print_links("links.tsv") """ - return self._g.get_edge_data(u, v) # type: ignore + table_data = self.get_table_data() + headers = table_data[0].keys() + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for row in table_data: + f.write("\t".join(str(row[h]) for h in headers) + "\n") def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -> LinkGraph: """Return a new LinkGraph object with the filtered links between the given objects. @@ -281,23 +258,66 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg - @validate_u - def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: - """Filter the links for a given object and add them to the new LinkGraph object.""" - try: - links = self[u] - except KeyError: - pass - else: - for node2, value in links.items(): - lg.add_link(u, node2, **value) + @validate_uv + def get_link_data( + self, + u: Entity, + v: Entity, + ) -> LINK_DATA | None: + """Get the data for a link between two objects. + + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + A dictionary of scoring methods and their data for the link between the two objects, or + None if there is no link between the two objects. + + Examples: + >>> lg.get_link_data(gcf, spectrum) + {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + """ + return self._g.get_edge_data(u, v) # type: ignore + + def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. + + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + A list of dictionaries containing the table data. + """ + table_data = [] + for index, link in enumerate(self.links, start=1): + table_data.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return table_data @validate_uv - def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: - """Filter the links between two objects and add them to the new LinkGraph object.""" - link_data = self.get_link_data(u, v) - if link_data is not None: - lg.add_link(u, v, **link_data) + def has_link(self, u: Entity, v: Entity) -> bool: + """Check if there is a link between two objects. + + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + True if there is a link between the two objects, False otherwise + + Examples: + >>> lg.has_link(gcf, spectrum) + True + """ + return self._g.has_edge(u, v) @staticmethod def link_to_dict(link: LINK, index: int) -> dict[str, Any]: @@ -333,27 +353,23 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: - """Generate the table data for the LinkGraph. - - This method iterates over the links in the LinkGraph and constructs a table - containing information about genomic and metabolomic objects, as well as their - associated scores. Each row in the table represents a link between a genomic - object and a metabolomic object. - - Args: - display_limit (int | None): The maximum number of rows to include in the - table. If None, all rows are included. + @validate_u + def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: + """Filter the links for a given object and add them to the new LinkGraph object.""" + try: + links = self[u] + except KeyError: + pass + else: + for node2, value in links.items(): + lg.add_link(u, node2, **value) - Returns: - A list of dictionaries containing the table data. - """ - table_data = [] - for index, link in enumerate(self.links, start=1): - table_data.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break - return table_data + @validate_uv + def _filter_two_nodes(self, u: Entity, v: Entity, lg: LinkGraph) -> None: + """Filter the links between two objects and add them to the new LinkGraph object.""" + link_data = self.get_link_data(u, v) + if link_data is not None: + lg.add_link(u, v, **link_data) def _get_table_repr(self, display_limit: int | None = 60) -> str: """Generate a table representation of the LinkGraph. @@ -378,19 +394,3 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: table += f"\n{truncated_info}" return table - - def export_links(self, file: str | PathLike) -> None: - """Exports the links in the LinkGraph to a file. - - Args: - file: the file to write the links to. - - Examples: - >>> lg.print_links("links.tsv") - """ - table_data = self.get_table_data() - headers = table_data[0].keys() - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for row in table_data: - f.write("\t".join(str(row[h]) for h in headers) + "\n") From 328968358e4c12ceb38b1d8fdbbd3b699857144d Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 19:22:41 +0100 Subject: [PATCH 16/24] restore the order of already existing functions and methods --- src/nplinker/genomics/bgc.py | 24 ++--- src/nplinker/nplinker.py | 138 ++++++++++++++--------------- src/nplinker/scoring/link_graph.py | 78 ++++++++-------- 3 files changed, 120 insertions(+), 120 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 6dfd6c66..9b544160 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -117,6 +117,18 @@ def __reduce__(self) -> tuple: """Reduce function for pickling.""" return (self.__class__, (self.id, *self.product_prediction), self.__dict__) + def add_parent(self, gcf: GCF) -> None: + """Add a parent GCF to the BGC. + + Args: + gcf: gene cluster family + """ + gcf.add_bgc(self) + + def detach_parent(self, gcf: GCF) -> None: + """Remove a parent GCF.""" + gcf.detach_bgc(self) + @property def strain(self) -> Strain | None: """Get the strain of the BGC.""" @@ -150,18 +162,6 @@ def bigscape_classes(self) -> set[str | None]: """ return {p.bigscape_class for p in self.parents} - def add_parent(self, gcf: GCF) -> None: - """Add a parent GCF to the BGC. - - Args: - gcf: gene cluster family - """ - gcf.add_bgc(self) - - def detach_parent(self, gcf: GCF) -> None: - """Remove a parent GCF.""" - gcf.detach_bgc(self) - def is_mibig(self) -> bool: """Check if the BGC is a MIBiG reference BGC or not. diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 79dffcbe..99e139bf 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -168,50 +168,34 @@ def scoring_methods(self) -> list[str]: """Get names of all valid scoring methods.""" return list(self._valid_scoring_methods.keys()) - def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: - """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. + def load_data(self): + """Load all data from files into memory. - Args: - objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. - filename (str): The name of the file where the data will be saved. - """ - headers = objects[0].to_dict().keys() - with open(self._output_dir / filename, "w") as f: - f.write("\t".join(headers) + "\n") - for obj in objects: - row_data = obj.to_dict() - formatted_row = [] - for header in headers: - item = row_data.get(header, "") - # Convert list, tuple, set to comma-separated string - if isinstance(item, (list, tuple, set)): - formatted_row.append(", ".join(map(str, item))) - # Convert dict to comma-separated string - elif isinstance(item, dict): - formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) - # Convert non-empty value to string - elif item: - formatted_row.append(str(item)) - # Convert empty value to empty string - else: - formatted_row.append("") - f.write("\t".join(formatted_row) + "\n") + This method is a convenience function that calls the + [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files + (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], + and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data + from the files into memory. - def export_results(self, lg: LinkGraph | None = None) -> None: - """Exports the results to the output directory in tab-separated format. + The loaded data is stored in various data containers for easy access, e.g. + [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, + [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. + """ + arranger = DatasetArranger(self.config) + arranger.arrange() + loader = DatasetLoader(self.config) + loader.load() - This method exports genomics and metabolomics data to their respective - TSV files in the specified output directory. If a LinkGraph object is - provided, it also exports the links data to a TSV file. + self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} + self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} + self._spec_dict = {spec.id: spec for spec in loader.spectra} + self._mf_dict = {mf.id: mf for mf in loader.mfs} - Args: - lg (LinkGraph | None): An optional LinkGraph object. If provided, - the links data will be exported to 'links.tsv'. - """ - self.export_objects(self.bgcs, "genomics_data.tsv") - self.export_objects(self.spectra, "metabolomics_data.tsv") - if lg is not None: - lg.export_links(self._output_dir / "links.tsv") + self._mibig_bgcs = loader.mibig_bgcs + self._strains = loader.strains + self._product_types = loader.product_types + self._chem_classes = loader.chem_classes + self._class_matches = loader.class_matches @overload def get_links( @@ -297,35 +281,6 @@ def get_links( return scoring.get_links(*objects, **scoring_params) - def load_data(self): - """Load all data from files into memory. - - This method is a convenience function that calls the - [`DatasetArranger`][nplinker.arranger.DatasetArranger] class to arrange data files - (download, generate and/or validate data) in the [correct directory structure][working-directory-structure], - and then calls the [`DatasetLoader`][nplinker.loader.DatasetLoader] class to load all data - from the files into memory. - - The loaded data is stored in various data containers for easy access, e.g. - [`self.bgcs`][nplinker.NPLinker.bgcs] for all BGC objects, - [`self.strains`][nplinker.NPLinker.strains] for all Strain objects, etc. - """ - arranger = DatasetArranger(self.config) - arranger.arrange() - loader = DatasetLoader(self.config) - loader.load() - - self._bgc_dict = {bgc.id: bgc for bgc in loader.bgcs} - self._gcf_dict = {gcf.id: gcf for gcf in loader.gcfs} - self._spec_dict = {spec.id: spec for spec in loader.spectra} - self._mf_dict = {mf.id: mf for mf in loader.mfs} - - self._mibig_bgcs = loader.mibig_bgcs - self._strains = loader.strains - self._product_types = loader.product_types - self._chem_classes = loader.chem_classes - self._class_matches = loader.class_matches - def lookup_bgc(self, id: str) -> BGC | None: """Get the BGC object with the given ID. @@ -400,3 +355,48 @@ def save_data( data = (self.bgcs, self.gcfs, self.spectra, self.mfs, self.strains, links) with open(file, "wb") as f: pickle.dump(data, f) + + def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: + """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. + + Args: + objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. + filename (str): The name of the file where the data will be saved. + """ + headers = objects[0].to_dict().keys() + with open(self._output_dir / filename, "w") as f: + f.write("\t".join(headers) + "\n") + for obj in objects: + row_data = obj.to_dict() + formatted_row = [] + for header in headers: + item = row_data.get(header, "") + # Convert list, tuple, set to comma-separated string + if isinstance(item, (list, tuple, set)): + formatted_row.append(", ".join(map(str, item))) + # Convert dict to comma-separated string + elif isinstance(item, dict): + formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) + # Convert non-empty value to string + elif item: + formatted_row.append(str(item)) + # Convert empty value to empty string + else: + formatted_row.append("") + f.write("\t".join(formatted_row) + "\n") + + def export_results(self, lg: LinkGraph | None = None) -> None: + """Exports the results to the output directory in tab-separated format. + + This method exports genomics and metabolomics data to their respective + TSV files in the specified output directory. If a LinkGraph object is + provided, it also exports the links data to a TSV file. + + Args: + lg (LinkGraph | None): An optional LinkGraph object. If provided, + the links data will be exported to 'links.tsv'. + """ + self.export_objects(self.bgcs, "genomics_data.tsv") + self.export_objects(self.spectra, "metabolomics_data.tsv") + if lg is not None: + lg.export_links(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index e01dbc59..f7690013 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -198,6 +198,45 @@ def add_link( self._g.add_edge(u, v, **data) + @validate_uv + def has_link(self, u: Entity, v: Entity) -> bool: + """Check if there is a link between two objects. + + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + True if there is a link between the two objects, False otherwise + + Examples: + >>> lg.has_link(gcf, spectrum) + True + """ + return self._g.has_edge(u, v) + + @validate_uv + def get_link_data( + self, + u: Entity, + v: Entity, + ) -> LINK_DATA | None: + """Get the data for a link between two objects. + + Args: + u: the first object, either a GCF, Spectrum, or MolecularFamily + v: the second object, either a GCF, Spectrum, or MolecularFamily + + Returns: + A dictionary of scoring methods and their data for the link between the two objects, or + None if there is no link between the two objects. + + Examples: + >>> lg.get_link_data(gcf, spectrum) + {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} + """ + return self._g.get_edge_data(u, v) # type: ignore + def export_links(self, file: str | PathLike) -> None: """Exports the links in the LinkGraph to a file. @@ -258,28 +297,6 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg - @validate_uv - def get_link_data( - self, - u: Entity, - v: Entity, - ) -> LINK_DATA | None: - """Get the data for a link between two objects. - - Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - A dictionary of scoring methods and their data for the link between the two objects, or - None if there is no link between the two objects. - - Examples: - >>> lg.get_link_data(gcf, spectrum) - {"metcalf": Score("metcalf", 1.0, {"cutoff": 0.5})} - """ - return self._g.get_edge_data(u, v) # type: ignore - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: """Generate the table data for the LinkGraph. @@ -302,23 +319,6 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any break return table_data - @validate_uv - def has_link(self, u: Entity, v: Entity) -> bool: - """Check if there is a link between two objects. - - Args: - u: the first object, either a GCF, Spectrum, or MolecularFamily - v: the second object, either a GCF, Spectrum, or MolecularFamily - - Returns: - True if there is a link between the two objects, False otherwise - - Examples: - >>> lg.has_link(gcf, spectrum) - True - """ - return self._g.has_edge(u, v) - @staticmethod def link_to_dict(link: LINK, index: int) -> dict[str, Any]: """Convert a link to a dictionary representation. From d2272e2ffe7ad32738fc04affd4612536364efc5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 20:10:01 +0100 Subject: [PATCH 17/24] make dicts json compatible --- src/nplinker/genomics/bgc.py | 18 +++++++++--------- src/nplinker/metabolomics/spectrum.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 9b544160..c61d7942 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -177,28 +177,28 @@ def is_mibig(self) -> bool: def to_dict(self) -> dict[str, Any]: """Convert the BGC object to a dictionary for exporting results. - This method compiles relevant information from the BGC object and formats it into a dictionary. + Compiles relevant information from the BGC object and formats it into a dictionary. Each key-value pair in the dictionary represents a specific attribute of the BGC. Returns: A dictionary containing the following key-value pairs: - - GCF_id (set): A set of GCF IDs. - - GCF_bigscape_class (set): A set of BiG-SCAPE classes. + - GCF_id (list[str]): A list of GCF IDs. + - GCF_bigscape_class (list[str | None]): A list of BiG-SCAPE classes. - strain_id (str | None): The ID of the strain. - description (str | None): A description of the BGC. - BGC_name (str): The name of the BGC. - - product_prediction (tuple): (predicted) natural products or product classes of the BGC. - - mibig_bgc_class (tuple[str] | None): MIBiG biosynthetic classes to which the BGC belongs. + - product_prediction (list[str]): (predicted) products or product classes of the BGC. + - mibig_bgc_class (list[str] | None): MIBiG biosynthetic classes. - antismash_id (str | None): The antiSMASH ID. - - antismash_region (int | None): The antiSMASH region. + - antismash_region (int | None): The antiSMASH region number. """ return { - "GCF_id": {gcf.id for gcf in self.parents if gcf.id is not None}, - "GCF_bigscape_class": {bsc for bsc in self.bigscape_classes if bsc is not None}, + "GCF_id": [gcf.id for gcf in self.parents if gcf.id is not None], + "GCF_bigscape_class": [bsc for bsc in self.bigscape_classes if bsc is not None], "strain_id": self.strain.id if self.strain is not None else None, "description": self.description, "BGC_name": self.id, - "product_prediction": self.product_prediction, + "product_prediction": list(self.product_prediction), "mibig_bgc_class": self.mibig_bgc_class, "antismash_id": self.antismash_id, "antismash_region": self.antismash_region, diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index e0e10e6d..6fccf47b 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -113,7 +113,7 @@ def to_dict(self) -> dict[str, Any]: - "rt" (float): The retention time, rounded to three decimal places. - "molecular_family" (str | None ): The identifier of the molecular family. - "gnps_id" (str | None ): The GNPS identifier. - - "gnps_annotations" (dict): A dictionary of GNPS annotations. + - "gnps_annotations" (dict[str, str]): A dictionary of GNPS annotations. """ return { "spectrum_id": self.id, From cb49209bbba18e47c1d05b5a65b2a325de29404c Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 20:16:57 +0100 Subject: [PATCH 18/24] rename functions and variables --- src/nplinker/nplinker.py | 12 ++--- src/nplinker/scoring/link_graph.py | 84 +++++++++++++++--------------- 2 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 99e139bf..bc03fde7 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -356,8 +356,8 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> None: - """Exports the data for a list of BGC or Spectrum objects to a specified file in tab-separated format. + def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[BGC], filename: str) -> None: + """Exports a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. @@ -385,7 +385,7 @@ def export_objects(self, objects: Sequence[BGC | Spectrum], filename: str) -> No formatted_row.append("") f.write("\t".join(formatted_row) + "\n") - def export_results(self, lg: LinkGraph | None = None) -> None: + def to_tsv(self, lg: LinkGraph | None = None) -> None: """Exports the results to the output directory in tab-separated format. This method exports genomics and metabolomics data to their respective @@ -396,7 +396,7 @@ def export_results(self, lg: LinkGraph | None = None) -> None: lg (LinkGraph | None): An optional LinkGraph object. If provided, the links data will be exported to 'links.tsv'. """ - self.export_objects(self.bgcs, "genomics_data.tsv") - self.export_objects(self.spectra, "metabolomics_data.tsv") + self.objects_to_tsv(self.bgcs, "genomics_data.tsv") + self.objects_to_tsv(self.spectra, "metabolomics_data.tsv") if lg is not None: - lg.export_links(self._output_dir / "links.tsv") + lg.to_tsv(self._output_dir / "links.tsv") diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index f7690013..32ed290a 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -237,22 +237,6 @@ def get_link_data( """ return self._g.get_edge_data(u, v) # type: ignore - def export_links(self, file: str | PathLike) -> None: - """Exports the links in the LinkGraph to a file. - - Args: - file: the file to write the links to. - - Examples: - >>> lg.print_links("links.tsv") - """ - table_data = self.get_table_data() - headers = table_data[0].keys() - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for row in table_data: - f.write("\t".join(str(row[h]) for h in headers) + "\n") - def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) -> LinkGraph: """Return a new LinkGraph object with the filtered links between the given objects. @@ -297,28 +281,6 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg - def get_table_data(self, display_limit: int | None = None) -> list[dict[str, Any]]: - """Generate the table data for the LinkGraph. - - This method iterates over the links in the LinkGraph and constructs a table - containing information about genomic and metabolomic objects, as well as their - associated scores. Each row in the table represents a link between a genomic - object and a metabolomic object. - - Args: - display_limit (int | None): The maximum number of rows to include in the - table. If None, all rows are included. - - Returns: - A list of dictionaries containing the table data. - """ - table_data = [] - for index, link in enumerate(self.links, start=1): - table_data.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break - return table_data - @staticmethod def link_to_dict(link: LINK, index: int) -> dict[str, Any]: """Convert a link to a dictionary representation. @@ -338,9 +300,9 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: - rosetta_score (float | str): The Rosetta score, rounded to 2 decimal places. """ u, v, data = link - genomic_object_classes = (GCF,) - genomic_object = u if isinstance(u, genomic_object_classes) else v - metabolomic_object = v if isinstance(u, genomic_object_classes) else u + genomic_types = (GCF,) + genomic_object = u if isinstance(u, genomic_types) else v + metabolomic_object = v if isinstance(u, genomic_types) else u metcalf_score = data.get("metcalf") rosetta_score = data.get("rosetta") return { @@ -353,6 +315,22 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: "rosetta_score": round(rosetta_score.value, 2) if rosetta_score else "", } + def to_tsv(self, file: str | PathLike) -> None: + """Exports the links in the LinkGraph to a file in tab-separated format. + + Args: + file: the file to write the links to. + + Examples: + >>> lg.print_links("links.tsv") + """ + table_data = self._links_to_dicts() + headers = table_data[0].keys() + with open(file, "w") as f: + f.write("\t".join(headers) + "\n") + for row in table_data: + f.write("\t".join(str(row[h]) for h in headers) + "\n") + @validate_u def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: """Filter the links for a given object and add them to the new LinkGraph object.""" @@ -383,7 +361,7 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: of links is appended. """ table = tabulate( - self.get_table_data(display_limit), + self._links_to_dicts(display_limit), headers="keys", tablefmt="github", stralign="right", @@ -394,3 +372,25 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: table += f"\n{truncated_info}" return table + + def _links_to_dicts(self, display_limit: int | None = None) -> list[dict[str, Any]]: + """Generate the table data for the LinkGraph. + + This method iterates over the links in the LinkGraph and constructs a table + containing information about genomic and metabolomic objects, as well as their + associated scores. Each row in the table represents a link between a genomic + object and a metabolomic object. + + Args: + display_limit (int | None): The maximum number of rows to include in the + table. If None, all rows are included. + + Returns: + A list of dictionaries containing the table data. + """ + link_dicts = [] + for index, link in enumerate(self.links, start=1): + link_dicts.append(self.link_to_dict(link, index)) + if display_limit is not None and index == display_limit: + break + return link_dicts From 6a4da5f0761a8b4388d9b4ae5e62a8af9b7c79d2 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 20:31:36 +0100 Subject: [PATCH 19/24] refactor: changed the place when the index is added to the link dict --- src/nplinker/scoring/link_graph.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 32ed290a..45ba5b30 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -282,16 +282,14 @@ def filter(self, u_nodes: Sequence[Entity], v_nodes: Sequence[Entity] = [], /) - return lg @staticmethod - def link_to_dict(link: LINK, index: int) -> dict[str, Any]: + def link_to_dict(link: LINK) -> dict[str, Any]: """Convert a link to a dictionary representation. Args: link: A tuple containing the link information (u, v, data). - index: The index of the link. Returns: A dictionary containing the link information with the following keys: - - index (int): The index of the link. - genomic_object_id (str): The ID of the genomic object. - genomic_object_type (str): The type of the genomic object. - metabolomic_object_id (str): The ID of the metabolomic object. @@ -306,7 +304,6 @@ def link_to_dict(link: LINK, index: int) -> dict[str, Any]: metcalf_score = data.get("metcalf") rosetta_score = data.get("rosetta") return { - "index": index, "genomic_object_id": genomic_object.id, "genomic_object_type": genomic_object.__class__.__name__, "metabolomic_object_id": metabolomic_object.id, @@ -388,9 +385,8 @@ def _links_to_dicts(self, display_limit: int | None = None) -> list[dict[str, An Returns: A list of dictionaries containing the table data. """ + links = self.links[:display_limit] if display_limit else self.links link_dicts = [] - for index, link in enumerate(self.links, start=1): - link_dicts.append(self.link_to_dict(link, index)) - if display_limit is not None and index == display_limit: - break + for idx, link in enumerate(links): + link_dicts.append({"index": idx + 1, **self.link_to_dict(link)}) return link_dicts From edcc7db0d7b97be459d14b77f2768191db54a9cc Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:08:04 +0100 Subject: [PATCH 20/24] use csv package to write the tabular output files --- src/nplinker/nplinker.py | 37 ++++++++++++++++-------------- src/nplinker/scoring/link_graph.py | 13 ++++++----- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index bc03fde7..16713f40 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -1,4 +1,5 @@ from __future__ import annotations +import csv import logging import pickle from collections.abc import Sequence @@ -356,34 +357,36 @@ def save_data( with open(file, "wb") as f: pickle.dump(data, f) - def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[BGC], filename: str) -> None: + def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: str) -> None: """Exports a list of BGC or Spectrum objects to a specified file in tab-separated format. Args: - objects (list[BGC | Spectrum]): A list of BGC or Spectrum objects to be exported. + objects (list): A list of BGC or a list of Spectrum objects to be exported. filename (str): The name of the file where the data will be saved. """ + if not objects: + raise ValueError("No objects provided to export") + headers = objects[0].to_dict().keys() - with open(self._output_dir / filename, "w") as f: - f.write("\t".join(headers) + "\n") + with open(self._output_dir / filename, "w", newline="") as outfile: + writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t") + writer.writeheader() for obj in objects: - row_data = obj.to_dict() - formatted_row = [] + row = obj.to_dict() for header in headers: - item = row_data.get(header, "") + value = row[header] # Convert list, tuple, set to comma-separated string - if isinstance(item, (list, tuple, set)): - formatted_row.append(", ".join(map(str, item))) + if isinstance(value, (list, tuple, set)): + row[header] = ", ".join(map(str, value)) # Convert dict to comma-separated string - elif isinstance(item, dict): - formatted_row.append(", ".join([f"{k}:{v}" for k, v in item.items()])) - # Convert non-empty value to string - elif item: - formatted_row.append(str(item)) - # Convert empty value to empty string + elif isinstance(value, dict): + row[header] = ", ".join([f"{k}:{v}" for k, v in value.items()]) + # Convert anything else to string else: - formatted_row.append("") - f.write("\t".join(formatted_row) + "\n") + row[header] = str(value) if value else "" + # Replace tabs with 4 spaces + row[header] = row[header].replace("\t", " ") + writer.writerow(row) def to_tsv(self, lg: LinkGraph | None = None) -> None: """Exports the results to the output directory in tab-separated format. diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 45ba5b30..d1f3cf4b 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -1,4 +1,5 @@ from __future__ import annotations +import csv from collections.abc import Sequence from functools import wraps from os import PathLike @@ -313,20 +314,20 @@ def link_to_dict(link: LINK) -> dict[str, Any]: } def to_tsv(self, file: str | PathLike) -> None: - """Exports the links in the LinkGraph to a file in tab-separated format. + """Exports the links in the LinkGraph to a file in tab-separated format. Args: file: the file to write the links to. Examples: - >>> lg.print_links("links.tsv") + >>> lg.to_tsv("links.tsv") """ table_data = self._links_to_dicts() headers = table_data[0].keys() - with open(file, "w") as f: - f.write("\t".join(headers) + "\n") - for row in table_data: - f.write("\t".join(str(row[h]) for h in headers) + "\n") + with open(file, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=headers, delimiter="\t") + writer.writeheader() + writer.writerows(table_data) @validate_u def _filter_one_node(self, u: Entity, lg: LinkGraph) -> None: From 05f9f76ef26847b54554fd582ca27caa7c424245 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:12:54 +0100 Subject: [PATCH 21/24] make sure all elements of the input list have the same type of data. --- src/nplinker/nplinker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 16713f40..55450a24 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -367,6 +367,11 @@ def objects_to_tsv(self, objects: Sequence[BGC] | Sequence[Spectrum], filename: if not objects: raise ValueError("No objects provided to export") + # Ensure all elements in the list are of the same type + obj_type = type(objects[0]) + if not all(isinstance(obj, obj_type) for obj in objects): + raise TypeError("All objects in the list must be of the same type") + headers = objects[0].to_dict().keys() with open(self._output_dir / filename, "w", newline="") as outfile: writer = csv.DictWriter(outfile, fieldnames=headers, delimiter="\t") From bff7731c7ae1593091897d6590b4789bdaec9067 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:20:01 +0100 Subject: [PATCH 22/24] shorten to long doc string lines, correct some doc strings --- src/nplinker/scoring/link_graph.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index d1f3cf4b..5ee61aa7 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -79,17 +79,17 @@ def __init__(self) -> None: Display the empty LinkGraph object: >>> lg - | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | - |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| + | index | genomic_object_id | genomic_object_type | metabolomic_object_id | metabolomic_object_type | metcalf_score | rosetta_score | + |---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------| Add a link between a GCF and a Spectrum object: >>> lg.add_link(gcf, spectrum, metcalf=Score("metcalf", 1.0, {"cutoff": 0.5})) Display all links in LinkGraph object: >>> lg - | | Genomic Object Type | Genomic Object ID | Metabolomic Object Type | Metabolomic Object ID | Metcalf Score | Rosetta Score | - |----|-----------------------|---------------------|---------------------------|-------------------------|-----------------|-----------------| - | 1 | GCF | 1 | Spectrum | 1 | 1.00 | - | + | index | genomic_object_id | genomic_object_type | metabolomic_object_id | metabolomic_object_type | metcalf_score | rosetta_score | + |---------|---------------------|-----------------------|-------------------------|---------------------------|-----------------|-----------------| + | 1 | 1 | GCF | 1 | Spectrum | 1.00 | | Get all links for a given object: >>> lg[gcf] @@ -117,7 +117,7 @@ def __init__(self) -> None: >>> new_lg = lg.filter([gcf1, gcf2], [spectrum1, spectrum2]) Export the links to a file: - >>> lg.export_links("links.tsv") + >>> lg.to_tsv("links.tsv") """ self._g: Graph = Graph() @@ -354,9 +354,9 @@ def _get_table_repr(self, display_limit: int | None = 60) -> str: display_limit: The maximum number of links to display in the table. Defaults to 60. Returns: - str: A string representation of the table in GitHub-flavored markdown format. If the number of links - exceeds the display limit, the table is truncated and an additional line indicating the total number - of links is appended. + str: A string representation of the table in GitHub-flavored markdown format. If the + number of links exceeds the display limit, the table is truncated and an additional + line indicating the total number of links is appended. """ table = tabulate( self._links_to_dicts(display_limit), From d4bf9fb2d277424faa52181db9750a85d000c322 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:32:05 +0100 Subject: [PATCH 23/24] tests: adapted the test to the changes --- tests/unit/data/justafile.ipynb | 131 ++++++++++++++++++++++++++ tests/unit/genomics/test_bgc.py | 20 ++-- tests/unit/scoring/test_link_graph.py | 12 +-- 3 files changed, 150 insertions(+), 13 deletions(-) create mode 100644 tests/unit/data/justafile.ipynb diff --git a/tests/unit/data/justafile.ipynb b/tests/unit/data/justafile.ipynb new file mode 100644 index 00000000..43a5453b --- /dev/null +++ b/tests/unit/data/justafile.ipynb @@ -0,0 +1,131 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from nplinker.genomics.antismash import AntismashBGCLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "loader = AntismashBGCLoader(\"antismash\")\n", + "mapping = loader.get_genome_bgcs_mapping()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "isinstance(mapping, dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(mapping[\"GCF_000514515.1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'NZ_AZWB01000006.region001'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mapping[\"GCF_000514515.1\"][-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "npl_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/unit/genomics/test_bgc.py b/tests/unit/genomics/test_bgc.py index 9706e961..71f173ba 100644 --- a/tests/unit/genomics/test_bgc.py +++ b/tests/unit/genomics/test_bgc.py @@ -32,10 +32,10 @@ def test_to_dict(): bgc.description = "Sample description" dict_repr = bgc.to_dict() - assert dict_repr["GCF_id"] == set() - assert dict_repr["GCF_bigscape_class"] == set() + assert dict_repr["GCF_id"] == list() + assert dict_repr["GCF_bigscape_class"] == list() assert dict_repr["BGC_name"] == "BGC0000001" - assert dict_repr["product_prediction"] == ("Polyketide", "NRP") + assert dict_repr["product_prediction"] == ["Polyketide", "NRP"] assert dict_repr["mibig_bgc_class"] is None assert dict_repr["description"] == "Sample description" assert dict_repr["strain_id"] == "sample_strain" @@ -43,12 +43,18 @@ def test_to_dict(): assert dict_repr["antismash_region"] is None bgc.add_parent(GCF("1")) - bgc.mibig_bgc_class = ("NRP",) + bgc.mibig_bgc_class = [ + "NRP", + ] bgc.antismash_id = "ABC_0001" bgc.antismash_region = 1 dict_repr = bgc.to_dict() - assert dict_repr["GCF_id"] == {"1"} - assert dict_repr["GCF_bigscape_class"] == set() - assert dict_repr["mibig_bgc_class"] == ("NRP",) + assert dict_repr["GCF_id"] == [ + "1", + ] + assert dict_repr["GCF_bigscape_class"] == list() + assert dict_repr["mibig_bgc_class"] == [ + "NRP", + ] assert dict_repr["antismash_id"] == "ABC_0001" assert dict_repr["antismash_region"] == 1 diff --git a/tests/unit/scoring/test_link_graph.py b/tests/unit/scoring/test_link_graph.py index f1542338..32e73f7f 100644 --- a/tests/unit/scoring/test_link_graph.py +++ b/tests/unit/scoring/test_link_graph.py @@ -116,10 +116,8 @@ def test_filter(gcfs, spectra, score): def test_link_to_dict(lg, gcfs, spectra, score): link = lg.links[0] - index = 1 - dict_repr = lg.link_to_dict(link, index) + dict_repr = lg.link_to_dict(link) assert type(dict_repr) is dict - assert dict_repr["index"] == 1 assert dict_repr["genomic_object_type"] == gcfs[0].__class__.__name__ assert dict_repr["genomic_object_id"] == gcfs[0].id assert dict_repr["metabolomic_object_type"] == spectra[0].__class__.__name__ @@ -128,15 +126,17 @@ def test_link_to_dict(lg, gcfs, spectra, score): assert dict_repr["rosetta_score"] == "" -def test_get_table_data(lg, gcfs, spectra, score): +def test__links_to_dicts(lg, gcfs, spectra, score): # add a second link lg.add_link(gcfs[1], spectra[1], metcalf=score) - table_data = lg.get_table_data() + table_data = lg._links_to_dicts() assert type(table_data) is list assert type(table_data[0]) is dict assert len(table_data) == 2 + assert table_data[0]["index"] == 1 + assert table_data[1]["index"] == 2 display_limit = 1 - table_data = lg.get_table_data(display_limit) + table_data = lg._links_to_dicts(display_limit) assert len(table_data) == 1 From 2c05efbbc0b7c511beec865241e7cfe8024c5cab Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Mon, 4 Nov 2024 21:38:40 +0100 Subject: [PATCH 24/24] remove a file that was committed by accident --- tests/unit/data/justafile.ipynb | 131 -------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 tests/unit/data/justafile.ipynb diff --git a/tests/unit/data/justafile.ipynb b/tests/unit/data/justafile.ipynb deleted file mode 100644 index 43a5453b..00000000 --- a/tests/unit/data/justafile.ipynb +++ /dev/null @@ -1,131 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from nplinker.genomics.antismash import AntismashBGCLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "loader = AntismashBGCLoader(\"antismash\")\n", - "mapping = loader.get_genome_bgcs_mapping()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "isinstance(mapping, dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(mapping)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "20" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(mapping[\"GCF_000514515.1\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'NZ_AZWB01000006.region001'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mapping[\"GCF_000514515.1\"][-1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "npl_dev", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}