Fix issue #1 and add formatting hooks

Adibvafa · Sep 19, 2024 · 372748d · 372748d
1 parent 08edd7f
commit 372748d
Show file tree

Hide file tree

Showing 20 changed files with 433 additions and 220 deletions.
diff --git a/.gitignore b/.gitignore
@@ -174,4 +174,4 @@ lightning_logs/
 
 # PyTorch model weights
 *.pth
-*.pt
+*.pt
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,34 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+
+# Don't run pre-commit on files under third-party/
+exclude: "^\
+  (third-party/.*)\
+  "
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.1.0
+    hooks:
+      - id: check-added-large-files # prevents giant files from being committed.
+      - id: check-case-conflict # checks for files that would conflict in case-insensitive filesystems.
+      - id: check-merge-conflict # checks for files that contain merge conflict strings.
+      - id: check-yaml # checks yaml files for parseable syntax.
+      - id: detect-private-key # detects the presence of private keys.
+      - id: end-of-file-fixer # ensures that a file is either empty, or ends with one newline.
+      - id: fix-byte-order-marker # removes utf-8 byte order marker.
+      - id: mixed-line-ending # replaces or checks mixed line ending.
+      - id: requirements-txt-fixer # sorts entries in requirements.txt.
+      - id: trailing-whitespace # trims trailing whitespace.
+
+  - repo: https://github.com/sirosen/check-jsonschema
+    rev: 0.23.2
+    hooks:
+      - id: check-github-actions
+      - id: check-github-workflows
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.13
+    hooks:
+      - id: ruff
+      - id: ruff-format
diff --git a/CodonTransformer/CodonData.py b/CodonTransformer/CodonData.py
@@ -5,32 +5,30 @@
 preparing the data for training and inference of the CodonTransformer model.
 """
 
-import os
 import json
+import os
+from typing import Dict, List, Optional, Tuple, Union
+
 import pandas as pd
+import python_codon_tables as pct
+from Bio import SeqIO
+from Bio.Seq import Seq
 from sklearn.utils import shuffle as sk_shuffle
+from tqdm import tqdm
 
 from CodonTransformer.CodonUtils import (
+    AMBIGUOUS_AMINOACID_MAP,
+    AMINO2CODON_TYPE,
     AMINO_ACIDS,
+    ORGANISM2ID,
     START_CODONS,
     STOP_CODONS,
     STOP_SYMBOL,
-    AMINO2CODON_TYPE,
-    AMBIGUOUS_AMINOACID_MAP,
-    ORGANISM2ID,
     find_pattern_in_fasta,
-    sort_amino2codon_skeleton,
     get_taxonomy_id,
+    sort_amino2codon_skeleton,
 )
 
-from Bio import SeqIO
-from Bio.Seq import Seq
-
-import python_codon_tables as pct
-
-from typing import List, Dict, Tuple, Union, Optional
-from tqdm import tqdm
-
 
 def prepare_training_data(
     dataset: Union[str, pd.DataFrame], output_file: str, shuffle: bool = True
@@ -50,7 +48,8 @@ def prepare_training_data(
     Args:
         dataset (Union[str, pd.DataFrame]): Input dataset in CSV or DataFrame format.
         output_file (str): Path to save the output JSON dataset.
-        shuffle (bool, optional): Whether to shuffle the dataset before saving. Defaults to True.
+        shuffle (bool, optional): Whether to shuffle the dataset before saving.
+            Defaults to True.
 
     Returns:
         None
@@ -78,15 +77,16 @@ def prepare_training_data(
 
 def dataframe_to_json(df: pd.DataFrame, output_file: str, shuffle: bool = True) -> None:
     """
-    Convert a pandas DataFrame to a JSON file format suitable for training CodonTransformer.
+    Convert pandas DataFrame to JSON file format suitable for training CodonTransformer.
 
     This function takes a preprocessed DataFrame and writes it to a JSON file
     where each line is a JSON object representing a single record.
 
     Args:
         df (pd.DataFrame): The input DataFrame with 'codons' and 'organism' columns.
         output_file (str): Path to the output JSON file.
-        shuffle (bool, optional): Whether to shuffle the dataset before saving. Defaults to True.
+        shuffle (bool, optional): Whether to shuffle the dataset before saving.
+            Defaults to True.
 
     Returns:
         None
@@ -123,8 +123,9 @@ def process_organism(organism: Union[str, int], organism_to_id: Dict[str, int])
     It validates the input against a provided mapping of organism names to IDs.
 
     Args:
-        organism (Union[str, int]): The input organism, either as a name (str) or ID (int).
-        organism_to_id (Dict[str, int]): A dictionary mapping organism names to their corresponding IDs.
+        organism (Union[str, int]): Input organism, either as a name (str) or ID (int).
+        organism_to_id (Dict[str, int]): Dictionary mapping organism names to their
+            corresponding IDs.
 
     Returns:
         int: The validated organism ID.
@@ -150,7 +151,8 @@ def process_organism(organism: Union[str, int], organism_to_id: Dict[str, int])
 
 def preprocess_protein_sequence(protein: str) -> str:
     """
-    Preprocess a protein sequence by cleaning, standardizing, and handling ambiguous amino acids.
+    Preprocess a protein sequence by cleaning, standardizing, and handling
+    ambiguous amino acids.
 
     Args:
         protein (str): The input protein sequence.
@@ -221,7 +223,8 @@ def replace_ambiguous_codons(dna: str) -> str:
 
 def preprocess_dna_sequence(dna: str) -> str:
     """
-    Cleans and preprocesses a DNA sequence by standardizing it and replacing ambiguous codons.
+    Cleans and preprocesses a DNA sequence by standardizing it and replacing
+    ambiguous codons.
 
     Args:
         dna (str): The DNA sequence to preprocess.
@@ -247,8 +250,9 @@ def preprocess_dna_sequence(dna: str) -> str:
 
 def get_merged_seq(protein: str, dna: str = "", separator: str = "_") -> str:
     """
-    Return the merged sequence of protein amino acids and DNA codons in the form of tokens
-    separated by space, where each token is composed of an amino acid + separator + codon.
+    Return the merged sequence of protein amino acids and DNA codons in the form
+    of tokens separated by space, where each token is composed of an amino acid +
+    separator + codon.
 
     Args:
         protein (str): Protein sequence.
@@ -274,8 +278,9 @@ def get_merged_seq(protein: str, dna: str = "", separator: str = "_") -> str:
     # Check if the length of protein and dna sequences are equal
     if len(dna) > 0 and len(protein) != len(dna) / 3:
         raise ValueError(
-            'Length of protein (including stop symbol such as "_") and \
-                         the number of codons in DNA sequence (including stop codon) must be equal.'
+            'Length of protein (including stop symbol such as "_") and '
+            "the number of codons in DNA sequence (including stop codon) "
+            "must be equal."
         )
 
     # Merge protein and DNA sequences into tokens
@@ -331,8 +336,8 @@ def get_amino_acid_sequence(
         return_correct_seq (bool): Whether to return if the sequence is correct.
 
     Returns:
-        Union[str, Tuple[str, bool]]: Protein sequence and correctness flag if return_correct_seq is True,
-                                      otherwise just the protein sequence.
+        Union[str, Tuple[str, bool]]: Protein sequence and correctness flag if
+        return_correct_seq is True, otherwise just the protein sequence.
     """
     dna_seq = Seq(dna).strip()
 
@@ -365,12 +370,15 @@ def read_fasta_file(
 
     Args:
         input_file (str): Path to the input FASTA file.
-        save_to_file (Optional[str]): Path to save the output DataFrame. If None, data is only returned.
-        organism (str): Name of the organism. If empty, it will be extracted from the FASTA description.
+        save_to_file (Optional[str]): Path to save the output DataFrame. If None,
+            data is only returned.
+        organism (str): Name of the organism. If empty, it will be extracted from
+            the FASTA description.
         buffer_size (int): Number of records to process before writing to file.
 
     Returns:
-        pd.DataFrame: DataFrame containing the DNA sequences if return_dataframe is True, else None.
+        pd.DataFrame: DataFrame containing the DNA sequences if return_dataframe
+        is True, else None.
 
     Raises:
         FileNotFoundError: If the input file does not exist.
@@ -498,7 +506,8 @@ def download_codon_frequencies_from_kazusa(
 
 def build_amino2codon_skeleton(organism: str) -> AMINO2CODON_TYPE:
     """
-    Return the empty skeleton of the amino2codon dictionary, needed for get_codon_frequencies.
+    Return the empty skeleton of the amino2codon dictionary, needed for
+    get_codon_frequencies.
 
     Args:
         organism (str): Name of the organism.
@@ -514,7 +523,8 @@ def build_amino2codon_skeleton(organism: str) -> AMINO2CODON_TYPE:
         return_correct_seq=False,
     )
 
-    # Initialize the amino2codon skeleton with all possible codons and set their frequencies to 0
+    # Initialize the amino2codon skeleton with all possible codons and set their
+    # frequencies to 0
     for i, (codon, amino) in enumerate(zip(possible_codons, possible_aminoacids)):
         if amino not in amino2codon:
             amino2codon[amino] = ([], [])
@@ -543,7 +553,8 @@ def get_codon_frequencies(
         organism (Optional[str]): Name of the organism.
 
     Returns:
-        AMINO2CODON_TYPE: Dictionary mapping each amino acid to a tuple of codons and frequencies.
+        AMINO2CODON_TYPE: Dictionary mapping each amino acid to a tuple of codons
+        and frequencies.
     """
     if organism:
         codon_table = get_codon_table(organism)
@@ -583,7 +594,8 @@ def get_organism_to_codon_frequencies(
         organisms (List[str]): List of organisms.
 
     Returns:
-        Dict[str, AMINO2CODON_TYPE]: Dictionary mapping each organism to its codon frequency distribution.
+        Dict[str, AMINO2CODON_TYPE]: Dictionary mapping each organism to its codon
+        frequency distribution.
     """
     organism2frequencies = {}
 
@@ -617,7 +629,8 @@ def get_codon_table(organism: str) -> int:
         "Arabidopsis thaliana",
         "Caenorhabditis elegans",
         "Chlamydomonas reinhardtii",
-        "Saccharomyces cerevisiae" "Danio rerio",
+        "Saccharomyces cerevisiae",
+        "Danio rerio",
         "Drosophila melanogaster",
         "Homo sapiens",
         "Mus musculus",

diff --git a/CodonTransformer/CodonEvaluation.py b/CodonTransformer/CodonEvaluation.py
@@ -1,18 +1,17 @@
 """
 File: CodonEvaluation.py
 ---------------------------
-Includes functions to calculate various evaluation metrics along with helper functions.
+Includes functions to calculate various evaluation metrics along with helper
+functions.
 """
 
-import pandas as pd
+from typing import Dict, List, Tuple
 
+import pandas as pd
 from CAI import CAI, relative_adaptiveness
-
-from typing import List, Dict, Tuple
 from tqdm import tqdm
 
 
-
 def get_CSI_weights(sequences: List[str]) -> Dict[str, float]:
     """
     Calculate the Codon Similarity Index (CSI) weights for a list of DNA sequences.
@@ -47,7 +46,7 @@ def get_organism_to_CSI_weights(
     Calculate the Codon Similarity Index (CSI) weights for a list of organisms.
 
     Args:
-        dataset (pd.DataFrame): The dataset containing organism and DNA sequence information.
+        dataset (pd.DataFrame): Dataset containing organism and DNA sequence info.
         organisms (List[str]): List of organism names.
 
     Returns:
@@ -91,7 +90,8 @@ def get_cfd(
 
     Args:
         dna (str): The DNA sequence.
-        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon frequency distribution per amino acid.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+            frequency distribution per amino acid.
         threshold (float): Frequency threshold for counting rare codons.
 
     Returns:
@@ -127,7 +127,8 @@ def get_min_max_percentage(
 
     Args:
         dna (str): The DNA sequence.
-        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon frequency distribution per amino acid.
+        codon_frequencies (Dict[str, Tuple[List[str], List[float]]]): Codon
+            frequency distribution per amino acid.
         window_size (int): Size of the window to calculate %MinMax.
 
     Returns:
@@ -147,14 +148,12 @@ def get_min_max_percentage(
 
     # Iterate through the DNA sequence using the specified window size
     for i in range(len(codons) - window_size + 1):
-        codon_window = codons[
-            i : i + window_size
-        ]  # List of the codons in the current window
+        codon_window = codons[i : i + window_size]  # Codons in the current window
 
         Actual = 0.0  # Average of the actual codon frequencies
         Max = 0.0  # Average of the max codon frequencies
         Min = 0.0  # Average of the min codon frequencies
-        Avg = 0.0  # Average of the averages of all the frequencies associated with each amino acid
+        Avg = 0.0  # Average of the averages of all frequencies for each amino acid
 
         # Sum the frequencies for codons in the current window
         for codon in codon_window:
@@ -210,7 +209,7 @@ def sum_up_to(x):
             return x + sum_up_to(x - 1)
 
     def f(x):
-        """Function that returns 4 if x is greater than or equal to 4, else returns x."""
+        """Returns 4 if x is greater than or equal to 4, else returns x."""
         if x >= 4:
             return 4
         elif x < 4:
@@ -242,8 +241,10 @@ def get_sequence_similarity(
     Args:
         original (str): The original sequence.
         predicted (str): The predicted sequence.
-        truncate (bool): If True, truncate the original sequence to match the length of the predicted sequence.
-        window_length (int): Length of the window for comparison (1 for amino acids, 3 for codons).
+        truncate (bool): If True, truncate the original sequence to match the length
+            of the predicted sequence.
+        window_length (int): Length of the window for comparison (1 for amino acids,
+            3 for codons).
 
     Returns:
         float: The sequence similarity as a percentage.
-Original file line number
+Diff line change
@@ Expand Up / @@ -174,4 +174,4 @@ lightning_logs/ @@
     # PyTorch model weights
     *.pth
-    *.pt
+    *.pt