Skip to content

Commit

Permalink
Fix issue #5 and add ConfigManager
Browse files Browse the repository at this point in the history
  • Loading branch information
andrasmatyassy committed Sep 20, 2024
1 parent c5d396b commit c025299
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 17 deletions.
30 changes: 25 additions & 5 deletions CodonTransformer/CodonData.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import json
import os
import random
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
Expand All @@ -24,6 +25,7 @@
START_CODONS,
STOP_CODONS,
STOP_SYMBOL,
ConfigManager,
find_pattern_in_fasta,
get_taxonomy_id,
sort_amino2codon_skeleton,
Expand Down Expand Up @@ -161,7 +163,7 @@ def preprocess_protein_sequence(protein: str) -> str:
str: The preprocessed protein sequence.
Raises:
ValueError: If the protein sequence is invalid.
ValueError: If the protein sequence is invalid or if the configuration is invalid.
"""
if not protein:
raise ValueError("Protein sequence is empty.")
Expand All @@ -171,10 +173,28 @@ def preprocess_protein_sequence(protein: str) -> str:
protein.upper().strip().replace("\n", "").replace(" ", "").replace("\t", "")
)

# Replace ambiguous amino acids with standard 20 amino acids
protein = "".join(
AMBIGUOUS_AMINOACID_MAP.get(aminoacid, aminoacid) for aminoacid in protein
)
# Handle ambiguous amino acids based on the specified behavior
config = ConfigManager()
ambiguous_aminoacid_map_override = config.get('ambiguous_aminoacid_map_override')
ambiguous_aminoacid_behavior = config.get('ambiguous_aminoacid_behavior')
ambiguous_aminoacid_map = AMBIGUOUS_AMINOACID_MAP.copy()

for aminoacid, standard_aminoacids in ambiguous_aminoacid_map_override.items():
ambiguous_aminoacid_map[aminoacid] = standard_aminoacids

if ambiguous_aminoacid_behavior == 'raise_error':
if any(aminoacid in ambiguous_aminoacid_map for aminoacid in protein):
raise ValueError("Ambiguous amino acids found in protein sequence.")
elif ambiguous_aminoacid_behavior == 'standardize_deterministic':
protein = "".join(
ambiguous_aminoacid_map.get(aminoacid, [aminoacid])[0] for aminoacid in protein
)
elif ambiguous_aminoacid_behavior == 'standardize_random':
protein = "".join(
random.choice(ambiguous_aminoacid_map.get(aminoacid, [aminoacid])) for aminoacid in protein
)
else:
raise ValueError(f"Invalid ambiguous_aminoacid_behavior: {ambiguous_aminoacid_behavior}.")

# Check for sequence validity
if any(
Expand Down
5 changes: 0 additions & 5 deletions CodonTransformer/CodonPrediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

from CodonTransformer.CodonData import get_merged_seq
from CodonTransformer.CodonUtils import (
AMINO_ACIDS,
INDEX2TOKEN,
NUM_ORGANISMS,
ORGANISM2ID,
Expand Down Expand Up @@ -128,10 +127,6 @@ def predict_dna_sequence(
if not protein:
raise ValueError("Protein sequence cannot be empty.")

# Test that the input protein sequence contains only valid amino acids
if not all(aminoacid in AMINO_ACIDS for aminoacid in protein):
raise ValueError("Invalid amino acid found in protein sequence.")

# Load tokenizer
if not isinstance(tokenizer, PreTrainedTokenizerFast):
tokenizer = load_tokenizer(tokenizer)
Expand Down
110 changes: 103 additions & 7 deletions CodonTransformer/CodonUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@
]

# Dictionary mapping ambiguous amino acids to standard amino acids
AMBIGUOUS_AMINOACID_MAP: Dict[str, str] = {
"B": "N", # Aspartic acid (D) or Asparagine (N)
"Z": "Q", # Glutamic acid (E) or Glutamine (Q)
"X": "A", # Any amino acid (typically replaced with Alanine)
"J": "L", # Leucine (L) or Isoleucine (I)
"U": "C", # Selenocysteine (typically replaced with Cysteine)
"O": "K", # Pyrrolysine (typically replaced with Lysine)
AMBIGUOUS_AMINOACID_MAP: Dict[str, list[str]] = {
"B": ["N", "D"], # Asparagine (N) or Aspartic acid (D)
"Z": ["Q", "E"], # Glutamine (Q) or Glutamic acid (E)
"X": ["A"], # Any amino acid (typically replaced with Alanine)
"J": ["L", "I"], # Leucine (L) or Isoleucine (I)
"U": ["C"], # Selenocysteine (typically replaced with Cysteine)
"O": ["K"], # Pyrrolysine (typically replaced with Lysine)
}

# List of all possible start and stop codons
Expand Down Expand Up @@ -544,6 +544,102 @@ def __init__(self, data_path: str, train: bool = True, **kwargs):
self.train = train


class ConfigManager:
    """
    Singleton holding the package-wide configuration settings.

    Every call to ``ConfigManager()`` returns the same object, so a value
    written through one reference is visible through all others.

    Supported keys:
        ambiguous_aminoacid_behavior (str): One of 'raise_error' (the
            default), 'standardize_deterministic' or 'standardize_random'.
        ambiguous_aminoacid_map_override (dict): Maps an ambiguous amino
            acid (a key of AMBIGUOUS_AMINOACID_MAP) to a non-empty list of
            replacement amino acid strings. Defaults to an empty dict.

    Attributes:
        _instance (Optional[ConfigManager]): The singleton instance.
        _config (Dict[str, Any]): The configuration dictionary.
    """

    _instance = None

    # Accepted values for the 'ambiguous_aminoacid_behavior' key.
    _AMBIGUOUS_AMINOACID_BEHAVIORS = (
        'raise_error',
        'standardize_deterministic',
        'standardize_random',
    )

    def __new__(cls):
        """
        Return the singleton instance, creating it on first use.

        Returns:
            ConfigManager: The singleton instance of the ConfigManager.
        """
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            # Defaults are installed exactly once, when the singleton is built.
            cls._instance._config = {
                'ambiguous_aminoacid_behavior': 'raise_error',
                'ambiguous_aminoacid_map_override': {},
            }
        return cls._instance

    def get(self, key: str) -> Any:
        """
        Get the value of a configuration key.

        Args:
            key (str): The key to retrieve the value for.

        Returns:
            Any: The stored value, or None if the key is not present.
        """
        return self._config.get(key)

    def set(self, key: str, value: Any) -> None:
        """
        Validate and store the value of a configuration key.

        Args:
            key (str): The key to set the value for.
            value (Any): The value to set for the key.

        Raises:
            ValueError: If the key is unknown or the value is invalid.
            TypeError: If the value is of the wrong type.
        """
        self.validate_inputs(key, value)
        self._store(key, value)

    def update(self, config_dict: dict) -> None:
        """
        Update several configuration keys at once.

        All pairs are validated before any of them is stored, so a single
        invalid entry leaves the configuration untouched.

        Args:
            config_dict (dict): Key-value pairs to write to the configuration.

        Raises:
            ValueError: If a key is unknown or a value is invalid.
            TypeError: If a value is of the wrong type.
        """
        for key, value in config_dict.items():
            self.validate_inputs(key, value)
        # Everything passed validation above; store without validating again.
        for key, value in config_dict.items():
            self._store(key, value)

    def validate_inputs(self, key: str, value: Any) -> None:
        """
        Validate a configuration key and its value.

        Args:
            key (str): The key to validate.
            value (Any): The value to validate.

        Raises:
            ValueError: If the key is unknown or the value is invalid.
            TypeError: If the value is of the wrong type.
        """
        if key == 'ambiguous_aminoacid_behavior':
            if value not in self._AMBIGUOUS_AMINOACID_BEHAVIORS:
                raise ValueError(f"Invalid value for ambiguous_aminoacid_behavior: {value}.")
        elif key == 'ambiguous_aminoacid_map_override':
            if not isinstance(value, dict):
                raise TypeError(f"Invalid type for ambiguous_aminoacid_map_override: {value}.")
            for ambiguous_aminoacid, aminoacids in value.items():
                if not isinstance(aminoacids, list):
                    raise TypeError(f"Invalid type for aminoacids: {aminoacids}.")
                if not aminoacids:
                    raise ValueError(f"Override for aminoacid '{ambiguous_aminoacid}' cannot be empty list.")
                # Replacements are later joined into the protein string, so a
                # non-string element must be rejected here rather than
                # surfacing as an unrelated error during preprocessing.
                if not all(isinstance(aminoacid, str) for aminoacid in aminoacids):
                    raise TypeError(
                        f"Override for aminoacid '{ambiguous_aminoacid}' must contain only strings: {aminoacids}."
                    )
                if ambiguous_aminoacid not in AMBIGUOUS_AMINOACID_MAP:
                    raise ValueError(f"Invalid amino acid in ambiguous_aminoacid_map_override: {ambiguous_aminoacid}")
        else:
            raise ValueError(f"Invalid configuration key: {key}")

    def _store(self, key: str, value: Any) -> None:
        """Store an already-validated value, copying mutable overrides."""
        if key == 'ambiguous_aminoacid_map_override':
            # Keep a private copy so later mutation of the caller's dict or
            # lists cannot slip unvalidated data into the configuration.
            value = {
                ambiguous_aminoacid: list(aminoacids)
                for ambiguous_aminoacid, aminoacids in value.items()
            }
        self._config[key] = value


def load_python_object_from_disk(file_path: str) -> Any:
"""
Load a Pickle object from disk and return it as a Python object.
Expand Down

0 comments on commit c025299

Please sign in to comment.