Skip to content

Commit

Permalink
Fix issue #1 and add formatting hooks
Browse files Browse the repository at this point in the history
  • Loading branch information
gui11aume committed Sep 19, 2024
1 parent 08edd7f commit 372748d
Show file tree
Hide file tree
Showing 20 changed files with 433 additions and 220 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,4 @@ lightning_logs/

# PyTorch model weights
*.pth
*.pt
*.pt
34 changes: 34 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks

# Don't run pre-commit on files under third-party/
exclude: "^\
(third-party/.*)\
"

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
hooks:
- id: check-added-large-files # prevents giant files from being committed.
- id: check-case-conflict # checks for files that would conflict in case-insensitive filesystems.
- id: check-merge-conflict # checks for files that contain merge conflict strings.
- id: check-yaml # checks yaml files for parseable syntax.
- id: detect-private-key # detects the presence of private keys.
- id: end-of-file-fixer # ensures that a file is either empty, or ends with one newline.
- id: fix-byte-order-marker # removes utf-8 byte order marker.
- id: mixed-line-ending # replaces or checks mixed line ending.
- id: requirements-txt-fixer # sorts entries in requirements.txt.
- id: trailing-whitespace # trims trailing whitespace.

- repo: https://github.com/sirosen/check-jsonschema
rev: 0.23.2
hooks:
- id: check-github-actions
- id: check-github-workflows

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.13
hooks:
- id: ruff
- id: ruff-format
81 changes: 47 additions & 34 deletions CodonTransformer/CodonData.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,30 @@
preparing the data for training and inference of the CodonTransformer model.
"""

import os
import json
import os
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
import python_codon_tables as pct
from Bio import SeqIO
from Bio.Seq import Seq
from sklearn.utils import shuffle as sk_shuffle
from tqdm import tqdm

from CodonTransformer.CodonUtils import (
AMBIGUOUS_AMINOACID_MAP,
AMINO2CODON_TYPE,
AMINO_ACIDS,
ORGANISM2ID,
START_CODONS,
STOP_CODONS,
STOP_SYMBOL,
AMINO2CODON_TYPE,
AMBIGUOUS_AMINOACID_MAP,
ORGANISM2ID,
find_pattern_in_fasta,
sort_amino2codon_skeleton,
get_taxonomy_id,
sort_amino2codon_skeleton,
)

from Bio import SeqIO
from Bio.Seq import Seq

import python_codon_tables as pct

from typing import List, Dict, Tuple, Union, Optional
from tqdm import tqdm


def prepare_training_data(
dataset: Union[str, pd.DataFrame], output_file: str, shuffle: bool = True
Expand All @@ -50,7 +48,8 @@ def prepare_training_data(
Args:
dataset (Union[str, pd.DataFrame]): Input dataset in CSV or DataFrame format.
output_file (str): Path to save the output JSON dataset.
shuffle (bool, optional): Whether to shuffle the dataset before saving. Defaults to True.
shuffle (bool, optional): Whether to shuffle the dataset before saving.
Defaults to True.
Returns:
None
Expand Down Expand Up @@ -78,15 +77,16 @@ def prepare_training_data(

def dataframe_to_json(df: pd.DataFrame, output_file: str, shuffle: bool = True) -> None:
"""
Convert a pandas DataFrame to a JSON file format suitable for training CodonTransformer.
Convert pandas DataFrame to JSON file format suitable for training CodonTransformer.
This function takes a preprocessed DataFrame and writes it to a JSON file
where each line is a JSON object representing a single record.
Args:
df (pd.DataFrame): The input DataFrame with 'codons' and 'organism' columns.
output_file (str): Path to the output JSON file.
shuffle (bool, optional): Whether to shuffle the dataset before saving. Defaults to True.
shuffle (bool, optional): Whether to shuffle the dataset before saving.
Defaults to True.
Returns:
None
Expand Down Expand Up @@ -123,8 +123,9 @@ def process_organism(organism: Union[str, int], organism_to_id: Dict[str, int])
It validates the input against a provided mapping of organism names to IDs.
Args:
organism (Union[str, int]): The input organism, either as a name (str) or ID (int).
organism_to_id (Dict[str, int]): A dictionary mapping organism names to their corresponding IDs.
organism (Union[str, int]): Input organism, either as a name (str) or ID (int).
organism_to_id (Dict[str, int]): Dictionary mapping organism names to their
corresponding IDs.
Returns:
int: The validated organism ID.
Expand All @@ -150,7 +151,8 @@ def process_organism(organism: Union[str, int], organism_to_id: Dict[str, int])

def preprocess_protein_sequence(protein: str) -> str:
"""
Preprocess a protein sequence by cleaning, standardizing, and handling ambiguous amino acids.
Preprocess a protein sequence by cleaning, standardizing, and handling
ambiguous amino acids.
Args:
protein (str): The input protein sequence.
Expand Down Expand Up @@ -221,7 +223,8 @@ def replace_ambiguous_codons(dna: str) -> str:

def preprocess_dna_sequence(dna: str) -> str:
"""
Cleans and preprocesses a DNA sequence by standardizing it and replacing ambiguous codons.
Cleans and preprocesses a DNA sequence by standardizing it and replacing
ambiguous codons.
Args:
dna (str): The DNA sequence to preprocess.
Expand All @@ -247,8 +250,9 @@ def preprocess_dna_sequence(dna: str) -> str:

def get_merged_seq(protein: str, dna: str = "", separator: str = "_") -> str:
"""
Return the merged sequence of protein amino acids and DNA codons in the form of tokens
separated by space, where each token is composed of an amino acid + separator + codon.
Return the merged sequence of protein amino acids and DNA codons in the form
of tokens separated by space, where each token is composed of an amino acid +
separator + codon.
Args:
protein (str): Protein sequence.
Expand All @@ -274,8 +278,9 @@ def get_merged_seq(protein: str, dna: str = "", separator: str = "_") -> str:
# Check that the protein length equals the number of codons in the DNA sequence
if len(dna) > 0 and len(protein) != len(dna) / 3:
raise ValueError(
'Length of protein (including stop symbol such as "_") and \
the number of codons in DNA sequence (including stop codon) must be equal.'
'Length of protein (including stop symbol such as "_") and '
"the number of codons in DNA sequence (including stop codon) "
"must be equal."
)

# Merge protein and DNA sequences into tokens
Expand Down Expand Up @@ -331,8 +336,8 @@ def get_amino_acid_sequence(
return_correct_seq (bool): Whether to also return the correctness flag.
Returns:
Union[str, Tuple[str, bool]]: Protein sequence and correctness flag if return_correct_seq is True,
otherwise just the protein sequence.
Union[str, Tuple[str, bool]]: Protein sequence and correctness flag if
return_correct_seq is True, otherwise just the protein sequence.
"""
dna_seq = Seq(dna).strip()

Expand Down Expand Up @@ -365,12 +370,15 @@ def read_fasta_file(
Args:
input_file (str): Path to the input FASTA file.
save_to_file (Optional[str]): Path to save the output DataFrame. If None, data is only returned.
organism (str): Name of the organism. If empty, it will be extracted from the FASTA description.
save_to_file (Optional[str]): Path to save the output DataFrame. If None,
data is only returned.
organism (str): Name of the organism. If empty, it will be extracted from
the FASTA description.
buffer_size (int): Number of records to process before writing to file.
Returns:
pd.DataFrame: DataFrame containing the DNA sequences if return_dataframe is True, else None.
pd.DataFrame: DataFrame containing the DNA sequences if return_dataframe
is True, else None.
Raises:
FileNotFoundError: If the input file does not exist.
Expand Down Expand Up @@ -498,7 +506,8 @@ def download_codon_frequencies_from_kazusa(

def build_amino2codon_skeleton(organism: str) -> AMINO2CODON_TYPE:
"""
Return the empty skeleton of the amino2codon dictionary, needed for get_codon_frequencies.
Return the empty skeleton of the amino2codon dictionary, needed for
get_codon_frequencies.
Args:
organism (str): Name of the organism.
Expand All @@ -514,7 +523,8 @@ def build_amino2codon_skeleton(organism: str) -> AMINO2CODON_TYPE:
return_correct_seq=False,
)

# Initialize the amino2codon skeleton with all possible codons and set their frequencies to 0
# Initialize the amino2codon skeleton with all possible codons and set their
# frequencies to 0
for i, (codon, amino) in enumerate(zip(possible_codons, possible_aminoacids)):
if amino not in amino2codon:
amino2codon[amino] = ([], [])
Expand Down Expand Up @@ -543,7 +553,8 @@ def get_codon_frequencies(
organism (Optional[str]): Name of the organism.
Returns:
AMINO2CODON_TYPE: Dictionary mapping each amino acid to a tuple of codons and frequencies.
AMINO2CODON_TYPE: Dictionary mapping each amino acid to a tuple of codons
and frequencies.
"""
if organism:
codon_table = get_codon_table(organism)
Expand Down Expand Up @@ -583,7 +594,8 @@ def get_organism_to_codon_frequencies(
organisms (List[str]): List of organisms.
Returns:
Dict[str, AMINO2CODON_TYPE]: Dictionary mapping each organism to its codon frequency distribution.
Dict[str, AMINO2CODON_TYPE]: Dictionary mapping each organism to its codon
frequency distribution.
"""
organism2frequencies = {}

Expand Down Expand Up @@ -617,7 +629,8 @@ def get_codon_table(organism: str) -> int:
"Arabidopsis thaliana",
"Caenorhabditis elegans",
"Chlamydomonas reinhardtii",
"Saccharomyces cerevisiae" "Danio rerio",
"Saccharomyces cerevisiae",
"Danio rerio",
"Drosophila melanogaster",
"Homo sapiens",
"Mus musculus",
Expand Down
31 changes: 16 additions & 15 deletions CodonTransformer/CodonEvaluation.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
"""
File: CodonEvaluation.py
---------------------------
Includes functions to calculate various evaluation metrics along with helper functions.
Includes functions to calculate various evaluation metrics along with helper
functions.
"""

import pandas as pd
from typing import Dict, List, Tuple

import pandas as pd
from CAI import CAI, relative_adaptiveness

from typing import List, Dict, Tuple
from tqdm import tqdm



def get_CSI_weights(sequences: List[str]) -> Dict[str, float]:
"""
Calculate the Codon Similarity Index (CSI) weights for a list of DNA sequences.
Expand Down Expand Up @@ -47,7 +46,7 @@ def get_organism_to_CSI_weights(
Calculate the Codon Similarity Index (CSI) weights for a list of organisms.
Args:
dataset (pd.DataFrame): The dataset containing organism and DNA sequence information.
dataset (pd.DataFrame): Dataset containing organism and DNA sequence info.
organisms (List[str]): List of organism names.
Returns:
Expand Down Expand Up @@ -91,7 +90,8 @@ def get_cfd(
Args:
dna (str): The DNA sequence.
codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon frequency distribution per amino acid.
codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
frequency distribution per amino acid.
threshold (float): Frequency threshold for counting rare codons.
Returns:
Expand Down Expand Up @@ -127,7 +127,8 @@ def get_min_max_percentage(
Args:
dna (str): The DNA sequence.
codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon frequency distribution per amino acid.
codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
frequency distribution per amino acid.
window_size (int): Size of the window to calculate %MinMax.
Returns:
Expand All @@ -147,14 +148,12 @@ def get_min_max_percentage(

# Iterate through the DNA sequence using the specified window size
for i in range(len(codons) - window_size + 1):
codon_window = codons[
i : i + window_size
] # List of the codons in the current window
codon_window = codons[i : i + window_size] # Codons in the current window

Actual = 0.0 # Average of the actual codon frequencies
Max = 0.0 # Average of the max codon frequencies
Min = 0.0 # Average of the min codon frequencies
Avg = 0.0 # Average of the averages of all the frequencies associated with each amino acid
Avg = 0.0 # Average of the averages of all frequencies for each amino acid

# Sum the frequencies for codons in the current window
for codon in codon_window:
Expand Down Expand Up @@ -210,7 +209,7 @@ def sum_up_to(x):
return x + sum_up_to(x - 1)

def f(x):
"""Function that returns 4 if x is greater than or equal to 4, else returns x."""
"""Returns 4 if x is greater than or equal to 4, else returns x."""
if x >= 4:
return 4
elif x < 4:
Expand Down Expand Up @@ -242,8 +241,10 @@ def get_sequence_similarity(
Args:
original (str): The original sequence.
predicted (str): The predicted sequence.
truncate (bool): If True, truncate the original sequence to match the length of the predicted sequence.
window_length (int): Length of the window for comparison (1 for amino acids, 3 for codons).
truncate (bool): If True, truncate the original sequence to match the length
of the predicted sequence.
window_length (int): Length of the window for comparison (1 for amino acids,
3 for codons).
Returns:
float: The sequence similarity as a percentage.
Expand Down
Loading

0 comments on commit 372748d

Please sign in to comment.