Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue #5 and add ConfigManager #9

Merged
merged 39 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
c025299
Fix issue #5 and add ConfigManager
andrasmatyassy Sep 20, 2024
16bd563
add testcases for fix issue #5
andrasmatyassy Sep 23, 2024
a829c37
Bump version to 1.6.0
Adibvafa Sep 20, 2024
c17c5a0
Add support for variable randomness in DNA prediction.
Adibvafa Sep 20, 2024
acbd4a0
Add random use case to docstring of predict_dna_sequence.
Adibvafa Sep 20, 2024
a0962d2
Add support for non-deterministic test with temperature.
Adibvafa Sep 20, 2024
dbb4d42
Bump version to 1.6.1
Adibvafa Sep 20, 2024
dae0890
Bump version to 1.6.1
Adibvafa Sep 20, 2024
590280e
Bump version to 1.6.2
Adibvafa Sep 20, 2024
d0f6fc5
Add more tests to check predict_dna_sequence.
Adibvafa Sep 20, 2024
97a0079
Add support for top_p in non-deterministic generation.
Adibvafa Sep 20, 2024
6c5ffdf
Improve style.
Adibvafa Sep 20, 2024
b80f419
Add extensive testing for predict_dna_sequence.
Adibvafa Sep 20, 2024
353a14b
Add a list of possible stop symbols.
Adibvafa Sep 20, 2024
0e43429
Add docstrings for sample_non_deterministic and STOP_SYMBOLS.
Adibvafa Sep 20, 2024
6c0c091
Remove checking for protein sequence validity and bring it to preproc…
Adibvafa Sep 20, 2024
12e3af1
Remove test_predict_dna_sequence_ambiguous_amino_acids test.
Adibvafa Sep 20, 2024
0723663
Update .pre-commit-config.yaml
Adibvafa Sep 20, 2024
e8c25bf
Improve style.
Adibvafa Sep 20, 2024
0ecc4cb
Update .pre-commit-config.yaml
Adibvafa Sep 20, 2024
1ffe33a
Fix issue with top_p sampling.
Adibvafa Sep 21, 2024
b5201a9
Fix issue with top_p sampling.
Adibvafa Sep 21, 2024
d66cf72
Bump version to 1.6.3
Adibvafa Sep 21, 2024
28589ba
Update issue templates
Adibvafa Sep 21, 2024
8247187
Update issue templates
Adibvafa Sep 21, 2024
eb05419
Create CODE_OF_CONDUCT.md
Adibvafa Sep 21, 2024
126b6c2
Add support for multiple sequence generation.
Adibvafa Sep 21, 2024
c35d3c7
Test multiple sequence generation.
Adibvafa Sep 21, 2024
17343c7
Update README.
Adibvafa Sep 21, 2024
73cef8a
Bump version to 1.6.4
Adibvafa Sep 21, 2024
349cc68
Update README.md
Adibvafa Sep 21, 2024
78a818b
Update README.md
Adibvafa Sep 21, 2024
2d71550
Merge remote-tracking branch 'upstream/main'
andrasmatyassy Sep 24, 2024
b7a6695
change ConfigManager to abc and add ProteinConfig for issue #5
andrasmatyassy Sep 24, 2024
e27fe4d
Merge remote-tracking branch 'upstream/main'
andrasmatyassy Sep 25, 2024
0fcba6a
change default behavior of ProteinConfig
andrasmatyassy Sep 25, 2024
f4fe97a
fix init behavior of ConfigManager
andrasmatyassy Sep 26, 2024
d709119
fix testcase for ConfigManager
andrasmatyassy Sep 26, 2024
ac37266
Merge remote-tracking branch 'upstream/main'
andrasmatyassy Sep 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions CodonTransformer/CodonData.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import json
import os
import random
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
Expand All @@ -25,6 +26,7 @@
STOP_CODONS,
STOP_SYMBOL,
STOP_SYMBOLS,
ProteinConfig,
find_pattern_in_fasta,
get_taxonomy_id,
sort_amino2codon_skeleton,
Expand Down Expand Up @@ -162,7 +164,7 @@ def preprocess_protein_sequence(protein: str) -> str:
str: The preprocessed protein sequence.

Raises:
ValueError: If the protein sequence is invalid.
ValueError: If the protein sequence is invalid or if the configuration is invalid.
"""
if not protein:
raise ValueError("Protein sequence is empty.")
Expand All @@ -172,10 +174,28 @@ def preprocess_protein_sequence(protein: str) -> str:
protein.upper().strip().replace("\n", "").replace(" ", "").replace("\t", "")
)

# Replace ambiguous amino acids with standard 20 amino acids
protein = "".join(
AMBIGUOUS_AMINOACID_MAP.get(aminoacid, aminoacid) for aminoacid in protein
)
# Handle ambiguous amino acids based on the specified behavior
config = ProteinConfig()
ambiguous_aminoacid_map_override = config.get('ambiguous_aminoacid_map_override')
ambiguous_aminoacid_behavior = config.get('ambiguous_aminoacid_behavior')
ambiguous_aminoacid_map = AMBIGUOUS_AMINOACID_MAP.copy()

for aminoacid, standard_aminoacids in ambiguous_aminoacid_map_override.items():
ambiguous_aminoacid_map[aminoacid] = standard_aminoacids

if ambiguous_aminoacid_behavior == 'raise_error':
if any(aminoacid in ambiguous_aminoacid_map for aminoacid in protein):
raise ValueError("Ambiguous amino acids found in protein sequence.")
elif ambiguous_aminoacid_behavior == 'standardize_deterministic':
protein = "".join(
ambiguous_aminoacid_map.get(aminoacid, [aminoacid])[0] for aminoacid in protein
)
elif ambiguous_aminoacid_behavior == 'standardize_random':
protein = "".join(
random.choice(ambiguous_aminoacid_map.get(aminoacid, [aminoacid])) for aminoacid in protein
)
else:
raise ValueError(f"Invalid ambiguous_aminoacid_behavior: {ambiguous_aminoacid_behavior}.")

# Check for sequence validity
if any(aminoacid not in AMINO_ACIDS + STOP_SYMBOLS for aminoacid in protein):
Expand Down
141 changes: 134 additions & 7 deletions CodonTransformer/CodonUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
import pickle
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, Iterator, List, Optional, Tuple

Expand Down Expand Up @@ -41,13 +42,13 @@
STOP_SYMBOLS = ["_", "*"] # Stop codon symbols

# Dictionary mapping ambiguous amino acids to standard amino acids
AMBIGUOUS_AMINOACID_MAP: Dict[str, str] = {
"B": "N", # Aspartic acid (D) or Asparagine (N)
"Z": "Q", # Glutamic acid (E) or Glutamine (Q)
"X": "A", # Any amino acid (typically replaced with Alanine)
"J": "L", # Leucine (L) or Isoleucine (I)
"U": "C", # Selenocysteine (typically replaced with Cysteine)
"O": "K", # Pyrrolysine (typically replaced with Lysine)
# Each ambiguous symbol maps to a list of candidate standard amino acids.
# Order matters: consumers that need a single deterministic replacement take
# the first entry of the list.
# The annotation uses typing.List (not the builtin generic) to stay consistent
# with the typing.Dict used alongside it; module-level annotations are
# evaluated at import time.
AMBIGUOUS_AMINOACID_MAP: Dict[str, List[str]] = {
    "B": ["N", "D"],  # Asparagine (N) or Aspartic acid (D)
    "Z": ["Q", "E"],  # Glutamine (Q) or Glutamic acid (E)
    "X": ["A"],  # Any amino acid (typically replaced with Alanine)
    "J": ["L", "I"],  # Leucine (L) or Isoleucine (I)
    "U": ["C"],  # Selenocysteine (typically replaced with Cysteine)
    "O": ["K"],  # Pyrrolysine (typically replaced with Lysine)
}

# List of all possible start and stop codons
Expand Down Expand Up @@ -545,6 +546,132 @@ def __init__(self, data_path: str, train: bool = True, **kwargs):
self.train = train


class ConfigManager(ABC):
    """
    Abstract base class for managing configuration settings.

    Settings live in the ``_config`` dictionary. Subclasses supply the
    defaults (``reset_config``) and the validation rules
    (``validate_inputs``). Instances can be used as context managers: the
    configuration is reset to its defaults when the ``with`` block exits.
    """

    def __init__(self):
        # Python calls __init__ on every construction, even when a subclass's
        # __new__ hands back an already existing (singleton) instance.
        # Only create the dictionary when it is missing, so that constructing
        # the object again does not wipe previously stored settings.
        if not hasattr(self, "_config"):
            self._config: Dict[str, Any] = {}

    def __enter__(self):
        """Return the manager itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Report any exception raised in the ``with`` block and reset the config.

        Nothing truthy is returned, so the exception (if any) still
        propagates to the caller after the configuration has been reset.
        """
        if exc_type is not None:
            print(f"Exception occurred: {exc_type}, {exc_value}, {traceback}")
        self.reset_config()

    @abstractmethod
    def reset_config(self) -> None:
        """Reset the configuration to default values."""
        pass

    def get(self, key: str) -> Any:
        """
        Get the value of a configuration key.

        Args:
            key (str): The key to retrieve the value for.

        Returns:
            Any: The value of the configuration key, or None if the key
                is not present.
        """
        return self._config.get(key)

    def set(self, key: str, value: Any) -> None:
        """
        Set the value of a configuration key.

        The pair is validated first; nothing is stored if validation raises.

        Args:
            key (str): The key to set the value for.
            value (Any): The value to set for the key.
        """
        self.validate_inputs(key, value)
        self._config[key] = value

    def update(self, config_dict: dict) -> None:
        """
        Update the configuration with a dictionary of key-value pairs.

        Every pair is validated before any of them is stored, so a single
        invalid pair leaves the configuration unchanged.

        Args:
            config_dict (dict): A dictionary of key-value pairs to update
                the configuration.
        """
        for key, value in config_dict.items():
            self.validate_inputs(key, value)
        self._config.update(config_dict)

    @abstractmethod
    def validate_inputs(self, key: str, value: Any) -> None:
        """Validate the inputs for the configuration."""
        pass

class ProteinConfig(ConfigManager):
    """
    A class to manage configuration settings for protein sequences.

    This class ensures that the configuration is a singleton.
    It provides methods to get, set, and update configuration values.

    Recognised keys:
        ambiguous_aminoacid_behavior: one of 'raise_error',
            'standardize_deterministic' or 'standardize_random'.
        ambiguous_aminoacid_map_override: dict mapping a key of
            AMBIGUOUS_AMINOACID_MAP to a non-empty list of replacements.

    Attributes:
        _instance (Optional[ProteinConfig]): The singleton instance.
        _config (Dict[str, Any]): The configuration dictionary.
    """

    _instance = None

    def __new__(cls):
        """
        Create the singleton on first use and return it on every call.

        Returns:
            ProteinConfig: The singleton instance of the ProteinConfig.
        """
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.reset_config()
        return cls._instance

    def __init__(self) -> None:
        """
        Leave the shared configuration untouched.

        Python runs __init__ after every ``ProteinConfig()`` call, including
        calls that return the existing singleton. The defaults are installed
        once in __new__, so this initializer deliberately assigns nothing;
        otherwise each construction would discard the stored settings.
        """

    def validate_inputs(self, key: str, value: Any) -> None:
        """
        Validate the inputs for the configuration.

        Args:
            key (str): The key to validate.
            value (Any): The value to validate.

        Raises:
            ValueError: If the key is unknown or the value is invalid.
            TypeError: If the value is of the wrong type.
        """
        if key == 'ambiguous_aminoacid_behavior':
            if value not in (
                'raise_error',
                'standardize_deterministic',
                'standardize_random',
            ):
                raise ValueError(f"Invalid value for ambiguous_aminoacid_behavior: {value}.")
        elif key == 'ambiguous_aminoacid_map_override':
            if not isinstance(value, dict):
                raise TypeError(f"Invalid type for ambiguous_aminoacid_map_override: {value}.")
            for ambiguous_aminoacid, aminoacids in value.items():
                if not isinstance(aminoacids, list):
                    raise TypeError(f"Invalid type for aminoacids: {aminoacids}.")
                if not aminoacids:
                    raise ValueError(f"Override for aminoacid '{ambiguous_aminoacid}' cannot be empty list.")
                # Only symbols that are already known to be ambiguous may be overridden.
                if ambiguous_aminoacid not in AMBIGUOUS_AMINOACID_MAP:
                    raise ValueError(f"Invalid amino acid in ambiguous_aminoacid_map_override: {ambiguous_aminoacid}")
        else:
            raise ValueError(f"Invalid configuration key: {key}")

    def reset_config(self) -> None:
        """
        Reset the configuration to the default values.
        """
        self._config = {
            'ambiguous_aminoacid_behavior': 'raise_error',
            'ambiguous_aminoacid_map_override': {},
        }
andrasmatyassy marked this conversation as resolved.
Show resolved Hide resolved


def load_python_object_from_disk(file_path: str) -> Any:
"""
Load a Pickle object from disk and return it as a Python object.
Expand Down
24 changes: 23 additions & 1 deletion tests/test_CodonData.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,32 @@
get_amino_acid_sequence,
is_correct_seq,
read_fasta_file,
preprocess_protein_sequence,
)

from CodonTransformer.CodonUtils import ProteinConfig

class TestCodonData(unittest.TestCase):
def test_preprocess_protein_sequence(self):
    """Check each ambiguous-amino-acid behavior of preprocess_protein_sequence."""
    with ProteinConfig() as config:
        protein = "Z_"

        # Before any behavior is set, an ambiguous amino acid is expected
        # to be rejected.
        with self.assertRaises(ValueError):
            preprocess_protein_sequence(protein)

        # Deterministic standardization must give the same answer every time.
        config.set("ambiguous_aminoacid_behavior", "standardize_deterministic")
        for _ in range(10):
            preprocessed_protein = preprocess_protein_sequence(protein)
            self.assertEqual(preprocessed_protein, "Q_")

        # Random standardization should eventually produce both candidates.
        config.set("ambiguous_aminoacid_behavior", "standardize_random")
        random_results = set()
        # With two equally likely results, all 30 draws agree with
        # probability 2 * (1/2)**30 = 1 / 2**29, i.e. about 1 in 5.4 * 10^8.
        for _ in range(30):
            preprocessed_protein = preprocess_protein_sequence(protein)
            random_results.add(preprocessed_protein)
        self.assertGreater(len(random_results), 1)

def test_read_fasta_file(self):
fasta_content = ">sequence1\n" "ATGATGATGATGATG\n" ">sequence2\n" "TGATGATGATGA"

Expand Down
46 changes: 46 additions & 0 deletions tests/test_CodonUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import unittest

from CodonTransformer.CodonUtils import (
ProteinConfig,
find_pattern_in_fasta,
get_organism2id_dict,
get_taxonomy_id,
Expand All @@ -15,6 +16,51 @@


class TestCodonUtils(unittest.TestCase):
def test_config_manager(self):
    """Exercise set/get/update, key validation and reset-on-exit of ProteinConfig."""
    with ProteinConfig() as config:
        config.set(
            "ambiguous_aminoacid_behavior",
            "standardize_deterministic"
        )
        self.assertEqual(
            config.get("ambiguous_aminoacid_behavior"),
            "standardize_deterministic"
        )
        # The override key must be an ambiguous amino acid symbol accepted by
        # validate_inputs; "B" (Asparagine or Aspartic acid) is one of them.
        config.set(
            "ambiguous_aminoacid_map_override",
            {"B": ["D", "N"]}
        )
        self.assertEqual(
            config.get("ambiguous_aminoacid_map_override"),
            {"B": ["D", "N"]}
        )
        config.update({
            "ambiguous_aminoacid_behavior": "raise_error",
            "ambiguous_aminoacid_map_override": {"X": ["A", "G"]},
        })
        self.assertEqual(
            config.get("ambiguous_aminoacid_behavior"),
            "raise_error"
        )
        self.assertEqual(
            config.get("ambiguous_aminoacid_map_override"),
            {"X": ["A", "G"]}
        )
        # Unknown keys must be rejected.
        with self.assertRaises(ValueError):
            config.set("invalid_key", "invalid_value")

    # Leaving the previous block resets the configuration, so a fresh
    # context must see the defaults again.
    with ProteinConfig() as config:
        self.assertEqual(
            config.get("ambiguous_aminoacid_behavior"),
            "raise_error"
        )
        self.assertEqual(
            config.get("ambiguous_aminoacid_map_override"),
            {}
        )

def test_load_python_object_from_disk(self):
test_obj = {"key1": "value1", "key2": 2}
with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as temp_file:
Expand Down
Loading