From 817dabdb06fc01815475cd46df030d2468235895 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 4 Aug 2023 12:01:24 +0200 Subject: [PATCH] Create functions to extract/get/generate strain mappings (#166) * Fix pandas api change on testing * add logger to `generate_genome_bgc_mappings_file` function * add `GNPS_FILE_MAPPINGS_FILENAME` to globals * add `generate_strain_mappings` method to `DatasetLoader` * fix variable bug * rename function `generate_genome_bgc_mappings_file` to `generate_mappings_genome_id_bgc_id` * add parameter `output_file` to func `generate_mappings_genome_id_bgc_id` * fix the bug of empty genome id * Create strain_mappings_generator.py * Create test_strain_mappings_generator.py * fix logic bug on getting best id --- src/nplinker/genomics/__init__.py | 4 +- src/nplinker/genomics/genomics.py | 18 +- src/nplinker/globals.py | 1 + src/nplinker/loader.py | 27 +- src/nplinker/nplinker.py | 3 +- src/nplinker/pairedomics/__init__.py | 23 +- .../pairedomics/podp_antismash_downloader.py | 23 +- .../pairedomics/strain_mappings_generator.py | 331 ++++++++++++++++++ tests/genomics/test_genomics.py | 17 +- .../test_strain_mappings_generator.py | 288 +++++++++++++++ tests/scoring/test_data_links.py | 2 +- tests/scoring/test_link_finder.py | 2 +- tests/scoring/test_metcalf_scoring.py | 2 +- tests/test_nplinker_local.py | 2 +- 14 files changed, 714 insertions(+), 29 deletions(-) create mode 100644 src/nplinker/pairedomics/strain_mappings_generator.py create mode 100644 tests/pairedomics/test_strain_mappings_generator.py diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py index 849b87e3..fdd11bf5 100644 --- a/src/nplinker/genomics/__init__.py +++ b/src/nplinker/genomics/__init__.py @@ -3,7 +3,7 @@ from .bgc import BGC from .gcf import GCF from .genomics import filter_mibig_only_gcf -from .genomics import generate_genome_bgc_mappings_file +from .genomics import generate_mappings_genome_id_bgc_id from .genomics import get_bgcs_from_gcfs from .genomics import get_strains_from_bgcs from .genomics import load_gcfs @@ -18,7 +18,7 @@ "BGC", "GCF", "filter_mibig_only_gcf", - "generate_genome_bgc_mappings_file", + "generate_mappings_genome_id_bgc_id", "get_bgcs_from_gcfs", "get_strains_from_bgcs", "load_gcfs", diff --git a/src/nplinker/genomics/genomics.py b/src/nplinker/genomics/genomics.py index ea0e61a6..a77a6417 100644 --- a/src/nplinker/genomics/genomics.py +++ b/src/nplinker/genomics/genomics.py @@ -16,12 +16,12 @@ logger = LogConfig.getLogger(__name__) -def generate_genome_bgc_mappings_file(bgc_dir: str | PathLike) -> None: +def generate_mappings_genome_id_bgc_id( + bgc_dir: str | PathLike, + output_file: str | PathLike | None = None) -> None: """Generate a file that maps genome id to BGC id. - The output file is named in variable `GENOME_BGC_MAPPINGS_FILENAME` and - is placed in the same directory as `bgc_dir`. The file will be overwritten - if it already exists. + Note that the `output_file` will be overwritten if it already exists. Args: bgc_dir(str | PathLike): The directory has one-layer of subfolders and @@ -29,6 +29,11 @@ def generate_genome_bgc_mappings_file(bgc_dir: str | PathLike) -> None: It assumes that - the subfolder name is the genome id (e.g. refseq), - the BGC file name is the BGC id. + output_file(str | PathLike | None): The path to the output file. Note + that the file will be overwritten if it already exists. + Defaults to None, in which case the output file will be placed in + the directory `bgc_dir` with a file name defined in global variable + `GENOME_BGC_MAPPINGS_FILENAME`. """ bgc_dir = Path(bgc_dir) genome_bgc_mappings = {} @@ -53,8 +58,11 @@ def generate_genome_bgc_mappings_file(bgc_dir: str | PathLike) -> None: "version": "1.0" } - with open(bgc_dir / GENOME_BGC_MAPPINGS_FILENAME, "w") as f: + if output_file is None: + output_file = bgc_dir / GENOME_BGC_MAPPINGS_FILENAME + with open(output_file, "w") as f: json.dump(json_data, f) + logger.info("Generated genome-BGC mappings file: %s", output_file) def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC], diff --git a/src/nplinker/globals.py b/src/nplinker/globals.py index 2ee801f6..0df4e229 100644 --- a/src/nplinker/globals.py +++ b/src/nplinker/globals.py @@ -7,3 +7,4 @@ STRAIN_MAPPINGS_FILENAME = 'strain_mappings.json' GENOME_BGC_MAPPINGS_FILENAME = "genome_bgc_mappings.json" GENOME_STATUS_FILENAME = "genome_status.json" +GNPS_FILE_MAPPINGS_FILENAME = "file_mappings.tsv" diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 6af65545..6fd067af 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -1,23 +1,29 @@ import glob import os from pathlib import Path -import sys from nplinker.annotations import load_annotations from nplinker.class_info.chem_classes import ChemClassPredictions from nplinker.class_info.class_matches import ClassMatches from nplinker.class_info.runcanopus import run_canopus +from nplinker.genomics import generate_mappings_genome_id_bgc_id from nplinker.genomics import load_gcfs from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.mibig import download_and_extract_mibig_metadata from nplinker.genomics.mibig import MibigBGCLoader +from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME +from nplinker.globals import GENOME_STATUS_FILENAME +from nplinker.globals import GNPS_FILE_MAPPINGS_FILENAME from nplinker.globals import PFAM_PATH from nplinker.globals import STRAIN_MAPPINGS_FILENAME from nplinker.logconfig import LogConfig from nplinker.metabolomics.metabolomics import load_dataset from nplinker.pairedomics.downloader import PODPDownloader from nplinker.pairedomics.runbigscape import run_bigscape +from nplinker.pairedomics.strain_mappings_generator import \ + podp_generate_strain_mappings from nplinker.strain_collection import StrainCollection + try: from importlib.resources import files except ImportError: @@ -97,7 +103,7 @@ def __init__(self, config_data): self.USE_MIBIG_DEFAULT) self._mibig_version = self._config_dataset.get( 'mibig_version', self.MIBIG_VERSION_DEFAULT) - self._root = self._config_dataset['root'] + self._root = Path(self._config_dataset['root']) self._platform_id = self._config_dataset['platform_id'] self._remote_loading = len(self._platform_id) > 0 @@ -139,6 +145,21 @@ def validate(self): self._init_paths() self._validate_paths() + def generate_strain_mappings(self): + + generate_mappings_genome_id_bgc_id(self._root / "antismash") + + podp_project_json_file = self._root.parent.parent / (self._platform_id + ".json") + genome_status_json_file = self._root.parent.parent / "downloads" / self._platform_id / GENOME_STATUS_FILENAME + genome_bgc_mappings_file = self._root / "antismash" / GENOME_BGC_MAPPINGS_FILENAME + gnps_file_mapping_tsv_file = self._root / GNPS_FILE_MAPPINGS_FILENAME + + podp_generate_strain_mappings(podp_project_json_file, + genome_status_json_file, + genome_bgc_mappings_file, + gnps_file_mapping_tsv_file, + self.strain_mappings_file) + def load(self): # load strain mappings first if not self._load_strain_mappings(): @@ -175,7 +196,7 @@ def load(self): def _start_downloads(self): downloader = PODPDownloader(self._platform_id) - self._root = downloader.project_results_dir + self._root = Path(downloader.project_results_dir) logger.debug('remote loading mode, configuring root=%s', self._root) # CG: to download both MET and GEN data # CG: Continue to understand how strain_mappings.json is generated diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 747dc989..7c73804d 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -6,8 +6,8 @@ from .config import Config from .genomics import BGC from .genomics import GCF -from .loader import NPLINKER_APP_DATA_DIR from .loader import DatasetLoader +from .loader import NPLINKER_APP_DATA_DIR from .logconfig import LogConfig from .metabolomics.molecular_family import MolecularFamily from .metabolomics.spectrum import Spectrum @@ -258,6 +258,7 @@ def load_data(self, new_bigscape_cutoff=None): logger.debug('load_data(new_bigscape_cutoff=%s)', new_bigscape_cutoff) if new_bigscape_cutoff is None: self._loader.validate() + self._loader.generate_strain_mappings() if not self._loader.load(): return False else: diff --git a/src/nplinker/pairedomics/__init__.py b/src/nplinker/pairedomics/__init__.py index 5838a4ed..ee001a6f 100644 --- a/src/nplinker/pairedomics/__init__.py +++ b/src/nplinker/pairedomics/__init__.py @@ -2,8 +2,29 @@ from .podp_antismash_downloader import GenomeStatus from .podp_antismash_downloader import get_best_available_genome_id from .podp_antismash_downloader import podp_download_and_extract_antismash_data +from .strain_mappings_generator import extract_mappings_ms_filename_spectrum_id +from .strain_mappings_generator import \ + extract_mappings_original_genome_id_resolved_genome_id +from .strain_mappings_generator import \ + extract_mappings_resolved_genome_id_bgc_id +from .strain_mappings_generator import extract_mappings_strain_id_ms_filename +from .strain_mappings_generator import \ + extract_mappings_strain_id_original_genome_id +from .strain_mappings_generator import get_mappings_strain_id_bgc_id +from .strain_mappings_generator import get_mappings_strain_id_spectrum_id +from .strain_mappings_generator import podp_generate_strain_mappings logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["GenomeStatus", "get_best_available_genome_id", "podp_download_and_extract_antismash_data"] +__all__ = [ + "GenomeStatus", "get_best_available_genome_id", + "podp_download_and_extract_antismash_data", + "podp_generate_strain_mappings", + "extract_mappings_strain_id_original_genome_id", + "extract_mappings_original_genome_id_resolved_genome_id", + "extract_mappings_resolved_genome_id_bgc_id", + "get_mappings_strain_id_bgc_id", "extract_mappings_strain_id_ms_filename", + "extract_mappings_ms_filename_spectrum_id", + "get_mappings_strain_id_spectrum_id" +] diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py index 17a21a7b..41e4e78a 100644 --- a/src/nplinker/pairedomics/podp_antismash_downloader.py +++ b/src/nplinker/pairedomics/podp_antismash_downloader.py @@ -217,17 +217,20 @@ def get_best_available_genome_id( Returns: str | None: ID for the genome, if present, otherwise None. """ - if 'RefSeq_accession' in genome_id_data: - return genome_id_data['RefSeq_accession'] - if 'GenBank_accession' in genome_id_data: - return genome_id_data['GenBank_accession'] - if 'JGI_Genome_ID' in genome_id_data: - return genome_id_data['JGI_Genome_ID'] - - logger.warning( - f'No known genome ID field in genome data: {genome_id_data}') - return None + best_id = genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + best_id = genome_id_data['GenBank_accession'] + elif 'JGI_Genome_ID' in genome_id_data: + best_id = genome_id_data['JGI_Genome_ID'] + else: + best_id = None + + if best_id is None or len(best_id) == 0: + logger.warning( + f'Failed to get valid genome ID in genome data: {genome_id_data}') + return None + return best_id def _ncbi_genbank_search(genbank_id: str, diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py new file mode 100644 index 00000000..ff26dd64 --- /dev/null +++ b/src/nplinker/pairedomics/strain_mappings_generator.py @@ -0,0 +1,331 @@ +import json +import logging +from os import PathLike +from pathlib import Path +from nplinker.metabolomics.gnps.gnps_file_mapping_loader import \ + GNPSFileMappingLoader +from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain +from .podp_antismash_downloader import GenomeStatus +from .podp_antismash_downloader import get_best_available_genome_id + + +logger = logging.getLogger(__name__) + +__all__ = [ + "podp_generate_strain_mappings", + "extract_mappings_strain_id_original_genome_id", + "extract_mappings_original_genome_id_resolved_genome_id", + "extract_mappings_resolved_genome_id_bgc_id", + "get_mappings_strain_id_bgc_id", "extract_mappings_strain_id_ms_filename", + "extract_mappings_ms_filename_spectrum_id", + "get_mappings_strain_id_spectrum_id" +] + + +def podp_generate_strain_mappings( + podp_project_json_file: str | PathLike, + genome_status_json_file: str | PathLike, + genome_bgc_mappings_file: str | PathLike, + gnps_file_mapping_tsv_file: str | PathLike, + output_json_file: str | PathLike) -> StrainCollection: + """Generate strain mappings JSON file for PODP pipeline. + + To get the strain mappings, we need to combine the following mappings: + - strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id + - strain_id <-> MS_filename <-> spectrum_id + + These mappings are extracted from the following files: + - "strain_id <-> original_genome_id" is extracted from `podp_project_json_file`. + - "original_genome_id <-> resolved_genome_id" is extracted from `genome_status_json_file`. + - "resolved_genome_id <-> bgc_id" is extracted from `genome_bgc_mappings_file`. + - "strain_id <-> MS_filename" is extracted from `podp_project_json_file`. + - "MS_filename <-> spectrum_id" is extracted from `gnps_file_mapping_tsv_file`. + + Args: + podp_project_json_file(str | PathLike): The path to the PODP project + JSON file. + genome_status_json_file(str | PathLike): The path to the genome status + JSON file. + genome_bgc_mappings_file(str | PathLike): The path to the genome BGC + mappings JSON file. + gnps_file_mapping_tsv_file(str | PathLike): The path to the GNPS file + mapping TSV file. + output_json_file(str | PathLike): The path to the output JSON file. + + Returns: + StrainCollection: The strain mappings stored in a StrainCollection object. + + See Also: + `extract_mappings_strain_id_original_genome_id`: Extract mappings + "strain_id <-> original_genome_id". + `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings + "original_genome_id <-> resolved_genome_id". + `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings + "resolved_genome_id <-> bgc_id". + `get_mappings_strain_id_bgc_id`: Get mappings "strain_id <-> bgc_id". + `extract_mappings_strain_id_ms_filename`: Extract mappings + "strain_id <-> MS_filename". + `extract_mappings_ms_filename_spectrum_id`: Extract mappings + "MS_filename <-> spectrum_id". + `get_mappings_strain_id_spectrum_id`: Get mappings "strain_id <-> spectrum_id". + """ + + # Get mappings strain_id <-> original_geonme_id <-> resolved_genome_id <-> bgc_id + mappings_strain_id_bgc_id = get_mappings_strain_id_bgc_id( + extract_mappings_strain_id_original_genome_id(podp_project_json_file), + extract_mappings_original_genome_id_resolved_genome_id( + genome_status_json_file), + extract_mappings_resolved_genome_id_bgc_id(genome_bgc_mappings_file)) + + # Get mappings strain_id <-> MS_filename <-> spectrum_id + mappings_strain_id_spectrum_id = get_mappings_strain_id_spectrum_id( + extract_mappings_strain_id_ms_filename(podp_project_json_file), + extract_mappings_ms_filename_spectrum_id(gnps_file_mapping_tsv_file)) + + # Get mappings strain_id <-> bgc_id / spectrum_id + mappings = mappings_strain_id_bgc_id.copy() + for strain_id, spectrum_ids in mappings_strain_id_spectrum_id.items(): + if strain_id in mappings: + mappings[strain_id].update(spectrum_ids) + else: + mappings[strain_id] = spectrum_ids.copy() + + # Create StrainCollection + sc = StrainCollection() + for strain_id, bgc_ids in mappings.items(): + if strain_id not in sc: + strain = Strain(strain_id) + for bgc_id in bgc_ids: + strain.add_alias(bgc_id) + sc.add(strain) + else: + strain = sc.lookup(strain_id) + for bgc_id in bgc_ids: + strain.add_alias(bgc_id) + + # Write strain mappings JSON file + sc.to_json(output_json_file) + logger.info('Generated strain mappings JSON file: %s', output_json_file) + + return sc + + +#------------------------------------------------------------------------------ +# Functions to extract mappings for genomics side: +# strain_id <-> original_geonme_id <-> resolved_genome_id <-> bgc_id +#------------------------------------------------------------------------------ +def extract_mappings_strain_id_original_genome_id( + podp_project_json_file: str | PathLike) -> dict[str, set[str]]: + """Extract mappings "strain id <-> original genome id". + + Args: + podp_project_json_file(str | PathLike): The path to the PODP project + JSON file. + + Returns: + dict[str, set[str]]: Key is strain id and value is a set of original genome ids. + + Notes: + The `podp_project_json_file` is the project JSON file downloaded from + PODP platform. For example, for project MSV000079284, its json file is + https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4. + """ + mappings_dict = {} + with open(podp_project_json_file, 'r') as f: + json_data = json.load(f) + + for record in json_data['genomes']: + strain_id = record['genome_label'] + genome_id = get_best_available_genome_id(record['genome_ID']) + if genome_id is None: + logger.warning( + 'Failed to extract genome ID from genome with label %s', + strain_id) + continue + if strain_id in mappings_dict: + mappings_dict[strain_id].add(genome_id) + else: + mappings_dict[strain_id] = {genome_id} + return mappings_dict + + +def extract_mappings_original_genome_id_resolved_genome_id( + genome_status_json_file: str | PathLike) -> dict[str, str]: + """Extract mappings "original_genome_id <-> resolved_genome_id". + + Args: + genome_status_json_file(str | PathLike): The path to the genome status + JSON file. + + Returns: + dict[str, str]: Key is original genome id and value is resolved genome id. + + Notes: + The `genome_status_json_file` is usually generated by the + `podp_download_and_extract_antismash_data` function with + a default file name defined in `nplinker.globals.GENOME_STATUS_FILENAME`. + """ + gs_mappings_dict = GenomeStatus.read_json(genome_status_json_file) + return { + gs.original_id: gs.resolved_refseq_id + for gs in gs_mappings_dict.values() + } + + +def extract_mappings_resolved_genome_id_bgc_id( + genome_bgc_mappings_file: str | PathLike) -> dict[str, set[str]]: + """Extract mappings "resolved_genome_id <-> bgc_id". + + Args: + genome_bgc_mappings_file(str | PathLike): The path to the genome BGC + mappings JSON file. + + Returns: + dict[str, set[str]]: Key is resolved genome id and value is a set of BGC ids. + + Notes: + The `genome_bgc_mappings_file` is usually generated by the + `generate_mappings_genome_id_bgc_id` function with a default file name + defined in `nplinker.globals.GENOME_BGC_MAPPINGS_FILENAME`. + """ + with open(genome_bgc_mappings_file, 'r') as f: + json_data = json.load(f) + return { + mapping["genome_ID"]: set(mapping["BGC_ID"]) + for mapping in json_data['mappings'] + } + + +def get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id: dict[str, set[str]], + mappings_original_genome_id_resolved_genome_id: dict[str, str], + mappings_resolved_genome_id_bgc_id: dict[str, set[str]] +) -> dict[str, set[str]]: + """Get mappings "strain_id <-> bgc_id". + + Args: + mappings_strain_id_original_genome_id(dict[str, set[str]]): Mappings + "strain_id <-> original_genome_id". + mappings_original_genome_id_resolved_genome_id(dict[str, str]): Mappings + "original_genome_id <-> resolved_genome_id". + mappings_resolved_genome_id_bgc_id(dict[str, set[str]]): Mappings + "resolved_genome_id <-> bgc_id". + + Returns: + dict[str, set[str]]: Key is strain id and value is a set of BGC ids. + + See Also: + `extract_mappings_strain_id_original_genome_id`: Extract mappings + "strain_id <-> original_genome_id". + `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings + "original_genome_id <-> resolved_genome_id". + `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings + "resolved_genome_id <-> bgc_id". + """ + mappings_dict = {} + for strain_id, original_genome_ids in mappings_strain_id_original_genome_id.items( + ): + bgc_ids = set() + for original_genome_id in original_genome_ids: + resolved_genome_id = mappings_original_genome_id_resolved_genome_id[ + original_genome_id] + if (bgc_id := + mappings_resolved_genome_id_bgc_id.get(resolved_genome_id) + ) is not None: + bgc_ids.update(bgc_id) + if bgc_ids: + mappings_dict[strain_id] = bgc_ids + return mappings_dict + + +#------------------------------------------------------------------------------ +# Functions to extract mappings for metabolomics side: +# strain_id <-> MS_filename <-> spectrum_id +#------------------------------------------------------------------------------ +def extract_mappings_strain_id_ms_filename( + podp_project_json_file: str | PathLike) -> dict[str, set[str]]: + """Extract mappings "strain_id <-> MS_filename". + + Args: + podp_project_json_file(str | PathLike): The path to the PODP project + JSON file. + + Returns: + dict[str, set[str]]: Key is strain id and value is a set of MS filenames. + + Notes: + The `podp_project_json_file` is the project JSON file downloaded from + PODP platform. For example, for project MSV000079284, its json file is + https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4. + """ + mappings_dict = {} + with open(podp_project_json_file, 'r') as f: + json_data = json.load(f) + + # Extract mappings strain id <-> metabolomics filename + for record in json_data['genome_metabolome_links']: + strain_id = record['genome_label'] + # get the actual filename of the mzXML URL + filename = Path(record['metabolomics_file']).name + if strain_id in mappings_dict: + mappings_dict[strain_id].add(filename) + else: + mappings_dict[strain_id] = {filename} + return mappings_dict + + +def extract_mappings_ms_filename_spectrum_id( + tsv_file: str | PathLike) -> dict[str, set[str]]: + """Extract mappings "MS_filename <-> spectrum_id". + + Args: + tsv_file(str | PathLike): The path to the GNPS file mapping TSV file. + + Returns: + dict[str, set[str]]: Key is MS filename and value is a set of spectrum ids. + + Notes: + The `tsv_file` is generated by GNPS molecular networking. It's downloaded + from GNPS website to a file with a default name defined in + `GNPS_FILE_MAPPINGS_FILENAME`. + + See Also: + `GNPSFileMappingLoader`: A class to load GNPS file mapping TSV file. + """ + loader = GNPSFileMappingLoader(tsv_file) + return loader.mapping_reversed() + + +def get_mappings_strain_id_spectrum_id( + mappings_strain_id_ms_filename: dict[str, set[str]], + mappings_ms_filename_spectrum_id: dict[str, + set[str]]) -> dict[str, set[str]]: + """Get mappings "strain_id <-> spectrum_id". + + Args: + mappings_strain_id_ms_filename(dict[str, set[str]]): Mappings + "strain_id <-> MS_filename". + mappings_ms_filename_spectrum_id(dict[str, set[str]]): Mappings + "MS_filename <-> spectrum_id". + + Returns: + dict[str, set[str]]: Key is strain id and value is a set of spectrum ids. + + + See Also: + `extract_mappings_strain_id_ms_filename`: Extract mappings + "strain_id <-> MS_filename". + `extract_mappings_ms_filename_spectrum_id`: Extract mappings + "MS_filename <-> spectrum_id". + """ + mappings_dict = {} + for strain_id, ms_filenames in mappings_strain_id_ms_filename.items(): + spectrum_ids = set() + for ms_filename in ms_filenames: + if (sid := mappings_ms_filename_spectrum_id.get(ms_filename) + ) is not None: + spectrum_ids.update(sid) + if spectrum_ids: + mappings_dict[strain_id] = spectrum_ids + return mappings_dict diff --git a/tests/genomics/test_genomics.py b/tests/genomics/test_genomics.py index d89945bf..1b15cff1 100644 --- a/tests/genomics/test_genomics.py +++ b/tests/genomics/test_genomics.py @@ -4,7 +4,7 @@ from nplinker.genomics import BGC from nplinker.genomics import filter_mibig_only_gcf from nplinker.genomics import GCF -from nplinker.genomics import generate_genome_bgc_mappings_file +from nplinker.genomics import generate_mappings_genome_id_bgc_id from nplinker.genomics import get_bgcs_from_gcfs from nplinker.genomics import get_strains_from_bgcs from nplinker.genomics import map_bgc_to_gcf @@ -15,14 +15,25 @@ from .. import DATA_DIR -def test_generate_genome_bgc_mappings_file(): +def test_generate_mappings_genome_id_bgc_id(tmp_path): bgc_dir = DATA_DIR / "antismash" - generate_genome_bgc_mappings_file(bgc_dir) + # using default output file path + generate_mappings_genome_id_bgc_id(bgc_dir) + # using custom output file path + generate_mappings_genome_id_bgc_id(bgc_dir, + tmp_path / GENOME_BGC_MAPPINGS_FILENAME) + # read both files with open(bgc_dir / GENOME_BGC_MAPPINGS_FILENAME) as f: mappings = json.load(f) + with open(tmp_path / GENOME_BGC_MAPPINGS_FILENAME) as f: + mappings_with_outfile = json.load(f) + # check if both files are the same + assert mappings == mappings_with_outfile + + # then check the content assert mappings["count"] == 2 assert mappings["mappings"][0]["genome_ID"] == "GCF_000514515.1" diff --git a/tests/pairedomics/test_strain_mappings_generator.py b/tests/pairedomics/test_strain_mappings_generator.py new file mode 100644 index 00000000..c1f87a26 --- /dev/null +++ b/tests/pairedomics/test_strain_mappings_generator.py @@ -0,0 +1,288 @@ +import json +from nplinker.pairedomics import extract_mappings_ms_filename_spectrum_id +from nplinker.pairedomics import \ + extract_mappings_original_genome_id_resolved_genome_id +from nplinker.pairedomics import extract_mappings_resolved_genome_id_bgc_id +from nplinker.pairedomics import extract_mappings_strain_id_ms_filename +from nplinker.pairedomics import extract_mappings_strain_id_original_genome_id +from nplinker.pairedomics import get_mappings_strain_id_bgc_id +from nplinker.pairedomics import get_mappings_strain_id_spectrum_id +from nplinker.pairedomics import podp_generate_strain_mappings +from nplinker.strain_collection import StrainCollection +from nplinker.strains import Strain + + +def test_podp_generate_strain_mappings(monkeypatch, tmp_path): + # mock functions called by the tested function + mappings_strain_bgc = { + "strain1": {"bgc1", "bgc2"}, + "strain2": {"bgc3"}, + } + mappings_strain_spectrum = { + "strain1": {"spec1", "spec2"}, + "strain2": {"spec3"} + } + + monkeypatch.setattr( + "nplinker.pairedomics.strain_mappings_generator.extract_mappings_strain_id_original_genome_id", + lambda *args: {}) # any return value is fine + monkeypatch.setattr( + 'nplinker.pairedomics.strain_mappings_generator.extract_mappings_original_genome_id_resolved_genome_id', + lambda *args: {}) + monkeypatch.setattr( + 'nplinker.pairedomics.strain_mappings_generator.extract_mappings_resolved_genome_id_bgc_id', + lambda *args: {}) + monkeypatch.setattr( + "nplinker.pairedomics.strain_mappings_generator.get_mappings_strain_id_bgc_id", + lambda *args: mappings_strain_bgc) + + monkeypatch.setattr( + 'nplinker.pairedomics.strain_mappings_generator.extract_mappings_strain_id_ms_filename', + lambda *args: {}) + monkeypatch.setattr( + 'nplinker.pairedomics.strain_mappings_generator.extract_mappings_ms_filename_spectrum_id', + lambda *args: {}) + monkeypatch.setattr( + "nplinker.pairedomics.strain_mappings_generator.get_mappings_strain_id_spectrum_id", + lambda *args: mappings_strain_spectrum) + + # Create the expected + expected_dict = { + "strain1": {"bgc1", "bgc2", "spec1", "spec2"}, + "strain2": {"bgc3", "spec3"} + } + expected_sc = StrainCollection() + for strain_id, ids in expected_dict.items(): + strain = Strain(strain_id) + for iid in ids: + strain.add_alias(iid) + expected_sc.add(strain) + + # Call function to generate strain mappings + output_file = tmp_path / "output.json" + result = podp_generate_strain_mappings("dummy_podp_project_file", + "dummy_genome_status_file", + "dummy_genome_bgc_mappings_file", + "dummy_gnps_file_mapping_file", + output_file) + # check returned value + assert isinstance(result, StrainCollection) + assert result == expected_sc + # check output file + sc = StrainCollection.read_json(output_file) + assert sc == expected_sc + + +def test_extract_mappings_strain_id_original_genome_id(tmp_path): + test_data = { + "genomes": [ + { + "genome_label": "strain1", + "genome_ID": { + "RefSeq_accession": "id1" + } + }, + { + "genome_label": "strain1", + "genome_ID": { + "RefSeq_accession": "id2" + } + }, + { + "genome_label": "strain2", + "genome_ID": { + "RefSeq_accession": "id3" + } + }, + ] + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + + expected_result = { + "strain1": {"id1", "id2"}, + "strain2": {"id3"}, + } + assert extract_mappings_strain_id_original_genome_id( + test_file) == expected_result + + +def test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): + test_data = { + "genome_status": [ + { + "original_id": "id1", + "resolved_refseq_id": "refseq1", + "resolve_attempted": True, + "bgc_path": "" + }, + { + "original_id": "id2", + "resolved_refseq_id": "refseq2", + "resolve_attempted": True, + "bgc_path": "" + }, + { + "original_id": "id3", + "resolved_refseq_id": "refseq3", + "resolve_attempted": True, + "bgc_path": "" + }, + ] + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + + expected_result = {"id1": "refseq1", "id2": "refseq2", "id3": "refseq3"} + + assert extract_mappings_original_genome_id_resolved_genome_id( + test_file) == expected_result + + +def test_extract_mappings_resolved_genome_id_bgc_id(tmp_path): + test_data = { + "mappings": [ + { + "genome_ID": "id1", + "BGC_ID": ["bgc1", "bgc2"] + }, + { + "genome_ID": "id2", + "BGC_ID": ["bgc3"] + }, + { + "genome_ID": "id3", + "BGC_ID": [] + }, + ] + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + expected_result = { + "id1": {"bgc1", "bgc2"}, + "id2": {"bgc3"}, + "id3": set(), + } + assert extract_mappings_resolved_genome_id_bgc_id( + test_file) == expected_result + + +def test_get_mappings_strain_id_bgc_id(): + # Test case 1: Test with empty mappings + mappings_strain_id_original_genome_id = {} + mappings_original_genome_id_resolved_genome_id = {} + mappings_resolved_genome_id_bgc_id = {} + expected_result = {} + assert get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id, + mappings_original_genome_id_resolved_genome_id, + mappings_resolved_genome_id_bgc_id) == expected_result + + # Test case 2: Test with one strain and one genome + mappings_strain_id_original_genome_id = {"strain1": {"genome1"}} + mappings_original_genome_id_resolved_genome_id = { + "genome1": "resolved_genome1" + } + mappings_resolved_genome_id_bgc_id = {"resolved_genome1": {"bgc1"}} + expected_result = {"strain1": {"bgc1"}} + assert get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id, + mappings_original_genome_id_resolved_genome_id, + mappings_resolved_genome_id_bgc_id) == expected_result + + # Test case 3: Test with multiple strains and genomes + mappings_strain_id_original_genome_id = { + "strain1": {"genome1", "genome2"}, + "strain2": {"genome3"}, + "strain3": {"genome4"} + } + mappings_original_genome_id_resolved_genome_id = { + "genome1": "resolved_genome1", + "genome2": "resolved_genome1", + "genome3": "resolved_genome2", + "genome4": "" + } + mappings_resolved_genome_id_bgc_id = { + "resolved_genome1": { + "bgc1", + }, + "resolved_genome2": {"bgc2", "bgc3"}, + } + expected_result = {"strain1": {"bgc1"}, "strain2": {"bgc2", "bgc3"}} + assert get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id, + mappings_original_genome_id_resolved_genome_id, + mappings_resolved_genome_id_bgc_id) == expected_result + + +def test_extract_mappings_strain_id_ms_filename(tmp_path): + test_data = { + "genome_metabolome_links": [ + { + "genome_label": "strain1", + "metabolomics_file": "http://example.com/file1.mzXML" + }, + { + "genome_label": "strain1", + "metabolomics_file": "http://example.com/file2.mzXML" + }, + { + "genome_label": "strain2", + "metabolomics_file": "http://example.com/file3.mzXML" + }, + { + "genome_label": "strain3", + "metabolomics_file": "http://example.com/file4.mzXML" + }, + ] + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + expected_result = { + "strain1": {"file1.mzXML", "file2.mzXML"}, + "strain2": {"file3.mzXML"}, + "strain3": {"file4.mzXML"}, + } + + assert extract_mappings_strain_id_ms_filename(test_file) == expected_result + + +def test_extract_mappings_ms_filename_spectrum_id(tmp_path): + test_data = "cluster index\tAllFiles\nspec1\tfile1.mzXML:123###\nspec2\tfile2.mzXML:123###\nspec3\tfile2.mzXML:123###file3.mzXML:123###\n" + test_file = tmp_path / "test_data.tsv" + with open(test_file, "w") as f: + f.write(test_data) + expected_result = { + "file1.mzXML": {"spec1"}, + "file2.mzXML": {"spec2", "spec3"}, + "file3.mzXML": {"spec3"} + } + + assert extract_mappings_ms_filename_spectrum_id( + test_file) == expected_result + + +def test_get_mappings_strain_id_spectrum_id(): + mappings_strain_id_ms_filename = { + 'strain1': {'file1.mzXML', 'file2.mzXML'}, + 'strain2': {'file3.mzXML'}, + 'strain3': {'file4.mzXML'} + } + mappings_ms_filename_spectrum_id = { + 'file1.mzXML': {'spec1'}, + 'file2.mzXML': {'spec2', 'spec3'}, + 'file3.mzXML': {'spec3'} + } + + expected_mappings_dict = { + 'strain1': {'spec1', 'spec2', 'spec3'}, + 'strain2': {'spec3'}, + } + actual_mappings_dict = get_mappings_strain_id_spectrum_id( + mappings_strain_id_ms_filename, mappings_ms_filename_spectrum_id) + + assert actual_mappings_dict == expected_mappings_dict diff --git a/tests/scoring/test_data_links.py b/tests/scoring/test_data_links.py index 22a9c21a..f3b0e96c 100644 --- a/tests/scoring/test_data_links.py +++ b/tests/scoring/test_data_links.py @@ -1,5 +1,5 @@ import pandas as pd -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_frame_equal import pytest from nplinker.metabolomics.singleton_family import SingletonFamily diff --git a/tests/scoring/test_link_finder.py b/tests/scoring/test_link_finder.py index 30d84980..56cd415e 100644 --- a/tests/scoring/test_link_finder.py +++ b/tests/scoring/test_link_finder.py @@ -1,6 +1,6 @@ import numpy as np import pandas as pd -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_frame_equal import pytest from pytest import fixture from nplinker.scoring.linking import LinkFinder diff --git a/tests/scoring/test_metcalf_scoring.py b/tests/scoring/test_metcalf_scoring.py index 471c073c..be458578 100644 --- a/tests/scoring/test_metcalf_scoring.py +++ b/tests/scoring/test_metcalf_scoring.py @@ -1,6 +1,6 @@ import numpy as np from numpy.testing import assert_array_equal -from pandas.util.testing import assert_frame_equal +from pandas.testing import assert_frame_equal import pytest from nplinker.scoring import LinkCollection from nplinker.scoring import MetcalfScoring diff --git a/tests/test_nplinker_local.py b/tests/test_nplinker_local.py index 10e27611..818e2b9a 100644 --- a/tests/test_nplinker_local.py +++ b/tests/test_nplinker_local.py @@ -29,7 +29,7 @@ def npl() -> NPLinker: npl = NPLinker(str(DATA_DIR / 'nplinker_demo1.toml')) npl.load_data() hash_proj_file = get_file_hash( - os.path.join(Path(npl._loader._root).parent.parent, + os.path.join(npl._loader._root.parent.parent, npl._loader._platform_id + '.json')) if hash_proj_file != '22e4f20d6f8aa425b2040479d0b6c00e7d3deb03f8fc4a277b3b91eb07c9ad72': pytest.exit(