Skip to content

Commit

Permalink
Create functions to extract/get/generate strain mappings (#166)
Browse files Browse the repository at this point in the history
* Fix pandas api change on testing

* add logger to `generate_genome_bgc_mappings_file` function

* add `GNPS_FILE_MAPPINGS_FILENAME` to globals

* add `generate_strain_mappings` method to `DatasetLoader`

* fix variable bug

* rename function `generate_genome_bgc_mappings_file` to `generate_mappings_genome_id_bgc_id`

* add parameter `output_file` to func `generate_mappings_genome_id_bgc_id`

* fix the bug of empty genome id

* Create strain_mappings_generator.py

* Create test_strain_mappings_generator.py

* fix logic bug on getting best id
  • Loading branch information
CunliangGeng authored Aug 4, 2023
1 parent 2fcac2d commit 817dabd
Show file tree
Hide file tree
Showing 14 changed files with 714 additions and 29 deletions.
4 changes: 2 additions & 2 deletions src/nplinker/genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .bgc import BGC
from .gcf import GCF
from .genomics import filter_mibig_only_gcf
from .genomics import generate_genome_bgc_mappings_file
from .genomics import generate_mappings_genome_id_bgc_id
from .genomics import get_bgcs_from_gcfs
from .genomics import get_strains_from_bgcs
from .genomics import load_gcfs
Expand All @@ -18,7 +18,7 @@
"BGC",
"GCF",
"filter_mibig_only_gcf",
"generate_genome_bgc_mappings_file",
"generate_mappings_genome_id_bgc_id",
"get_bgcs_from_gcfs",
"get_strains_from_bgcs",
"load_gcfs",
Expand Down
18 changes: 13 additions & 5 deletions src/nplinker/genomics/genomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,24 @@
logger = LogConfig.getLogger(__name__)


def generate_genome_bgc_mappings_file(bgc_dir: str | PathLike) -> None:
def generate_mappings_genome_id_bgc_id(
bgc_dir: str | PathLike,
output_file: str | PathLike | None = None) -> None:
"""Generate a file that maps genome id to BGC id.
The output file is named in variable `GENOME_BGC_MAPPINGS_FILENAME` and
is placed in the same directory as `bgc_dir`. The file will be overwritten
if it already exists.
Note that the `output_file` will be overwritten if it already exists.
Args:
bgc_dir(str | PathLike): The directory has one-layer of subfolders and
each subfolder contains BGC files in `.gbk` format.
It assumes that
- the subfolder name is the genome id (e.g. refseq),
- the BGC file name is the BGC id.
output_file(str | PathLike | None): The path to the output file. Note
that the file will be overwritten if it already exists.
Defaults to None, in which case the output file will be placed in
the directory `bgc_dir` with a file name defined in global variable
`GENOME_BGC_MAPPINGS_FILENAME`.
"""
bgc_dir = Path(bgc_dir)
genome_bgc_mappings = {}
Expand All @@ -53,8 +58,11 @@ def generate_genome_bgc_mappings_file(bgc_dir: str | PathLike) -> None:
"version": "1.0"
}

with open(bgc_dir / GENOME_BGC_MAPPINGS_FILENAME, "w") as f:
if output_file is None:
output_file = bgc_dir / GENOME_BGC_MAPPINGS_FILENAME
with open(output_file, "w") as f:
json.dump(json_data, f)
logger.info("Generated genome-BGC mappings file: %s", output_file)


def map_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC],
Expand Down
1 change: 1 addition & 0 deletions src/nplinker/globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
STRAIN_MAPPINGS_FILENAME = 'strain_mappings.json'
GENOME_BGC_MAPPINGS_FILENAME = "genome_bgc_mappings.json"
GENOME_STATUS_FILENAME = "genome_status.json"
GNPS_FILE_MAPPINGS_FILENAME = "file_mappings.tsv"
27 changes: 24 additions & 3 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,29 @@
import glob
import os
from pathlib import Path
import sys
from nplinker.annotations import load_annotations
from nplinker.class_info.chem_classes import ChemClassPredictions
from nplinker.class_info.class_matches import ClassMatches
from nplinker.class_info.runcanopus import run_canopus
from nplinker.genomics import generate_mappings_genome_id_bgc_id
from nplinker.genomics import load_gcfs
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.genomics.mibig import download_and_extract_mibig_metadata
from nplinker.genomics.mibig import MibigBGCLoader
from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
from nplinker.globals import GENOME_STATUS_FILENAME
from nplinker.globals import GNPS_FILE_MAPPINGS_FILENAME
from nplinker.globals import PFAM_PATH
from nplinker.globals import STRAIN_MAPPINGS_FILENAME
from nplinker.logconfig import LogConfig
from nplinker.metabolomics.metabolomics import load_dataset
from nplinker.pairedomics.downloader import PODPDownloader
from nplinker.pairedomics.runbigscape import run_bigscape
from nplinker.pairedomics.strain_mappings_generator import \
podp_generate_strain_mappings
from nplinker.strain_collection import StrainCollection


try:
from importlib.resources import files
except ImportError:
Expand Down Expand Up @@ -97,7 +103,7 @@ def __init__(self, config_data):
self.USE_MIBIG_DEFAULT)
self._mibig_version = self._config_dataset.get(
'mibig_version', self.MIBIG_VERSION_DEFAULT)
self._root = self._config_dataset['root']
self._root = Path(self._config_dataset['root'])
self._platform_id = self._config_dataset['platform_id']
self._remote_loading = len(self._platform_id) > 0

Expand Down Expand Up @@ -139,6 +145,21 @@ def validate(self):
self._init_paths()
self._validate_paths()

def generate_strain_mappings(self):
    """Generate the genome-BGC mappings file, then the strain mappings.

    The genome id to BGC id mappings are generated first for the
    ``antismash`` folder under the dataset root (no explicit output file
    is passed). Afterwards the PODP project JSON file, the genome status
    JSON file, the genome-BGC mappings file and the GNPS file mappings TSV
    file are handed to ``podp_generate_strain_mappings``, with
    ``self.strain_mappings_file`` as the final argument (presumably the
    output path -- confirm against that function's signature).
    """
    antismash_dir = self._root / "antismash"
    generate_mappings_genome_id_bgc_id(antismash_dir)

    # The project JSON and the downloads folder are looked up two
    # directory levels above the dataset root.
    base_dir = self._root.parent.parent
    input_files = (
        base_dir / (self._platform_id + ".json"),
        base_dir / "downloads" / self._platform_id / GENOME_STATUS_FILENAME,
        antismash_dir / GENOME_BGC_MAPPINGS_FILENAME,
        self._root / GNPS_FILE_MAPPINGS_FILENAME,
    )
    podp_generate_strain_mappings(*input_files, self.strain_mappings_file)

def load(self):
# load strain mappings first
if not self._load_strain_mappings():
Expand Down Expand Up @@ -175,7 +196,7 @@ def load(self):

def _start_downloads(self):
downloader = PODPDownloader(self._platform_id)
self._root = downloader.project_results_dir
self._root = Path(downloader.project_results_dir)
logger.debug('remote loading mode, configuring root=%s', self._root)
# CG: to download both MET and GEN data
# CG: Continue to understand how strain_mappings.json is generated
Expand Down
3 changes: 2 additions & 1 deletion src/nplinker/nplinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from .config import Config
from .genomics import BGC
from .genomics import GCF
from .loader import NPLINKER_APP_DATA_DIR
from .loader import DatasetLoader
from .loader import NPLINKER_APP_DATA_DIR
from .logconfig import LogConfig
from .metabolomics.molecular_family import MolecularFamily
from .metabolomics.spectrum import Spectrum
Expand Down Expand Up @@ -258,6 +258,7 @@ def load_data(self, new_bigscape_cutoff=None):
logger.debug('load_data(new_bigscape_cutoff=%s)', new_bigscape_cutoff)
if new_bigscape_cutoff is None:
self._loader.validate()
self._loader.generate_strain_mappings()
if not self._loader.load():
return False
else:
Expand Down
23 changes: 22 additions & 1 deletion src/nplinker/pairedomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,29 @@
from .podp_antismash_downloader import GenomeStatus
from .podp_antismash_downloader import get_best_available_genome_id
from .podp_antismash_downloader import podp_download_and_extract_antismash_data
from .strain_mappings_generator import extract_mappings_ms_filename_spectrum_id
from .strain_mappings_generator import \
extract_mappings_original_genome_id_resolved_genome_id
from .strain_mappings_generator import \
extract_mappings_resolved_genome_id_bgc_id
from .strain_mappings_generator import extract_mappings_strain_id_ms_filename
from .strain_mappings_generator import \
extract_mappings_strain_id_original_genome_id
from .strain_mappings_generator import get_mappings_strain_id_bgc_id
from .strain_mappings_generator import get_mappings_strain_id_spectrum_id
from .strain_mappings_generator import podp_generate_strain_mappings


logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = ["GenomeStatus", "get_best_available_genome_id", "podp_download_and_extract_antismash_data"]
__all__ = [
"GenomeStatus", "get_best_available_genome_id",
"podp_download_and_extract_antismash_data",
"podp_generate_strain_mappings",
"extract_mappings_strain_id_original_genome_id",
"extract_mappings_original_genome_id_resolved_genome_id",
"extract_mappings_resolved_genome_id_bgc_id",
"get_mappings_strain_id_bgc_id", "extract_mappings_strain_id_ms_filename",
"extract_mappings_ms_filename_spectrum_id",
"get_mappings_strain_id_spectrum_id"
]
23 changes: 13 additions & 10 deletions src/nplinker/pairedomics/podp_antismash_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,17 +217,20 @@ def get_best_available_genome_id(
Returns:
str | None: ID for the genome, if present, otherwise None.
"""

if 'RefSeq_accession' in genome_id_data:
return genome_id_data['RefSeq_accession']
if 'GenBank_accession' in genome_id_data:
return genome_id_data['GenBank_accession']
if 'JGI_Genome_ID' in genome_id_data:
return genome_id_data['JGI_Genome_ID']

logger.warning(
f'No known genome ID field in genome data: {genome_id_data}')
return None
best_id = genome_id_data['RefSeq_accession']
elif 'GenBank_accession' in genome_id_data:
best_id = genome_id_data['GenBank_accession']
elif 'JGI_Genome_ID' in genome_id_data:
best_id = genome_id_data['JGI_Genome_ID']
else:
best_id = None

if best_id is None or len(best_id) == 0:
logger.warning(
f'Failed to get valid genome ID in genome data: {genome_id_data}')
return None
return best_id


def _ncbi_genbank_search(genbank_id: str,
Expand Down
Loading

0 comments on commit 817dabd

Please sign in to comment.