From 8e7945d3318a41de213a113ac2fb7c259f1002f5 Mon Sep 17 00:00:00 2001 From: Annette Lien Date: Wed, 16 Oct 2024 18:41:57 +0200 Subject: [PATCH] change the output format for gnps_annotations in metabolomics results file, improve docstrings --- src/nplinker/genomics/bgc.py | 26 +++++++++++------------ src/nplinker/metabolomics/spectrum.py | 30 +++++++++++---------------- src/nplinker/scoring/link_graph.py | 17 +++++++-------- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index 880a3710..2624cfae 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -194,22 +194,22 @@ def aa_predictions(self) -> list: return [self._aa_predictions] def to_dict(self) -> dict[str, any]: - """Convert the BGC object to a dictionary that can be used to export the results. + """Convert the BGC object to a dictionary for exporting results. - This method gathers relevant information from the BGC object and formats it into a dictionary - where each key-value pair represents a specific attribute of the BGC. + This method compiles relevant information from the BGC object and formats it into a dictionary. + Each key-value pair in the dictionary represents a specific attribute of the BGC. Returns: - dict[str, str]: A dictionary containing relevant information about the BGC object, including: - - GCF_id: A comma-separated string of GCF IDs or "-" if none. - - GCF_bigscape_class: A comma-separated string of BiG-SCAPE classes or "-" if none. - - BGC_name: The name of the BGC. - - strain_id: The ID of the strain. - - description: A description of the BGC. - - antismash_id: The antiSMASH ID. - - antismash_region: The antiSMASH region. - - antismash_cluster_type: A comma-separated string of product predictions. - - mibig_bgc_class: The MiBIG BGC class or "-" if none. 
+ A dictionary containing the following key-value pairs: + - GCF_id (str): A comma-separated string of GCF IDs or "-" if none are available. + - GCF_bigscape_class (str): A comma-separated string of BiG-SCAPE classes or "-" if none are available. + - BGC_name (str): The name of the BGC. + - strain_id (str): The ID of the strain. + - description (str): A description of the BGC. + - antismash_id (str): The antiSMASH ID. + - antismash_region (str): The antiSMASH region. + - antismash_cluster_type (str): A comma-separated string of product predictions. + - mibig_bgc_class (str): The MiBIG BGC class or "-" if none is available. """ gcf_ids = {gcf.id for gcf in self.parents if gcf.id is not None} gcf_bsc = {bsc for bsc in self.bigscape_classes if bsc is not None} diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 3dc6b3ed..5c929a13 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -98,25 +98,21 @@ def has_strain(self, strain: Strain) -> bool: """ return strain in self.strains - def _formatted_gnps_annotations(self) -> str: - """Format GNPS annotations dictionary into a string.""" - return "; ".join(f"{k}: {v}" for k, v in self.gnps_annotations.items()) - def to_dict(self) -> dict[str, any]: - """Convert the Spectrum object to a dictionary that can be used to export the results. + """Convert the Spectrum object to a dictionary for exporting results. - This method gathers relevant information from the Spectrum object and formats it into a dictionary - where each key-value pair represents a specific attribute of the Spectrum. + This method compiles relevant information from the Spectrum object into a dictionary format. + Each key-value pair in the dictionary represents a specific attribute of the Spectrum Object. Returns: - dict[str, str]: A dictionary containing relevant information about the Spectrum object, including: - - "spectrum_id": The unique identifier of the spectrum. 
- - "num_strains_with_spectrum": The number of strains associated with the spectrum. - - "precursor_mz": The precursor m/z value formatted to four decimal places. - - "rt": The retention time formatted to three decimal places. - - "molecular_family": The identifier of the molecular family, or "-" if not available. - - "gnps_id": The GNPS identifier, or "-" if not available. - - "gnps_annotations": A formatted string of GNPS annotations, or "-" if not available. + A dictionary containing the following key-value pairs: + - "spectrum_id" (str): The unique identifier of the spectrum. + - "num_strains_with_spectrum" (int): The number of strains associated with the spectrum. + - "precursor_mz" (float): The precursor m/z value, rounded to four decimal places. + - "rt" (float): The retention time, rounded to three decimal places. + - "molecular_family" (str): The identifier of the molecular family, or "-" if not available. + - "gnps_id" (str): The GNPS identifier, or "-" if not available. + - "gnps_annotations" (dict | str): A dictionary of GNPS annotations, or "-" if not available. """ return { "spectrum_id": self.id, @@ -125,7 +121,5 @@ def to_dict(self) -> dict[str, any]: "rt": round(self.rt, 3), "molecular_family": self.family.id if self.family else "-", "gnps_id": self.gnps_id if self.gnps_id else "-", - "gnps_annotations": self._formatted_gnps_annotations() - if self.gnps_annotations - else "-", + "gnps_annotations": self.gnps_annotations if self.gnps_annotations else "-", } diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 278f3941..4f7753b9 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -299,15 +299,14 @@ def get_table_data(self, display_limit: int | None = None) -> list[dict[str, any table. If None, all rows are included. 
Returns: - list: A list of dictionaries, where each dictionary contains - the following keys: - - Index (int) - - Genomic Object Type (str) - - Genomic Object ID (str or int) - - Metabolomic Object Type (str) - - Metabolomic Object ID (str or int) - - Metcalf Score (str, formatted to 2 decimal places, or "-") - - Rosetta Score (str, formatted to 2 decimal places, or "-") + A list of dictionaries, where each dictionary contains the following keys: + - index (int): The index of the link. + - genomic_object_type (str): The type of the genomic object. + - genomic_object_id (str or int): The ID of the genomic object. + - metabolomic_object_type (str): The type of the metabolomic object. + - metabolomic_object_id (str or int): The ID of the metabolomic object. + - metcalf_score (str): The Metcalf score, formatted to 2 decimal places, or "-". + - rosetta_score (str): The Rosetta score, formatted to 2 decimal places, or "-". """ genomic_object_classes = (GCF,)