diff --git a/.github/workflows/setup-and-test.yml b/.github/workflows/setup-and-test.yml
new file mode 100644
index 0000000..6ddb9c6
--- /dev/null
+++ b/.github/workflows/setup-and-test.yml
@@ -0,0 +1,25 @@
+on: "push"
+name: setup-and-test
+
+jobs:
+  uv-example:
+    name: python
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+
+      - name: "Set up Python"
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: ".python-version"
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      - name: Run tests
+        # For example, using `pytest`
+        run: uv run pytest tests
diff --git a/pyproject.toml b/pyproject.toml
index d7358e9..ddcad77 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "biotite>=1.0.1",
     "polyleven>=0.8",
     "scikit-learn>=1.5.2",
+    "pytest>=8.3.3",
 ]
 
 [build-system]
diff --git a/src/utilin/constants.py b/src/utilin/constants.py
index ab27118..7b6ddf4 100644
--- a/src/utilin/constants.py
+++ b/src/utilin/constants.py
@@ -1,2 +1,49 @@
 AA_ALPHABET = list("ACDEFGHIKLMNPQRSTVWY")
 AA_ALPHABET_GREMLIN = list("ARNDCQEGHILKMFPSTWYV-")
+THREE_TO_SINGLE_LETTER_CODES = {
+    "ALA": "A",
+    "ARG": "R",
+    "ASN": "N",
+    "ASP": "D",
+    "CYS": "C",
+    "GLN": "Q",
+    "GLU": "E",
+    "GLY": "G",
+    "HIS": "H",
+    "ILE": "I",
+    "LEU": "L",
+    "LYS": "K",
+    "MET": "M",
+    "PHE": "F",
+    "PRO": "P",
+    "SER": "S",
+    "THR": "T",
+    "TRP": "W",
+    "TYR": "Y",
+    "VAL": "V",
+}
+SINGLE_TO_THREE_LETTER_CODES = {
+    "A": "ALA",
+    "R": "ARG",
+    "N": "ASN",
+    "D": "ASP",
+    "C": "CYS",
+    "Q": "GLN",
+    "E": "GLU",
+    "G": "GLY",
+    "H": "HIS",
+    "I": "ILE",
+    "L": "LEU",
+    "K": "LYS",
+    "M": "MET",
+    "F": "PHE",
+    "P": "PRO",
+    "S": "SER",
+    "T": "THR",
+    "W": "TRP",
+    "Y": "TYR",
+    "V": "VAL",
+}
+UNIPROT_ACCESSION_PATTERN = (
+    r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+)
diff --git a/src/utilin/plotting/prediction.py b/src/utilin/plotting/prediction.py
index 1afc7ba..b388501 100644
--- a/src/utilin/plotting/prediction.py
+++ b/src/utilin/plotting/prediction.py
@@ -15,15 +15,22 @@ def plot_prediction_scatter(
     y_label: str = "Prediction",
     title: Optional[str] = None,
     axis_label_fontsize: int = 16,
+    title_fontsize: int = 18,
+    margin_ratio: float = 0.15,
 ) -> Tuple[Figure, Axes]:
+    x_min, x_max = min(y_true), max(y_true)
+    y_min, y_max = min(y_pred), max(y_pred)
+    x_margin = margin_ratio * (x_max - x_min)
+    y_margin = margin_ratio * (y_max - y_min)
     fig, ax = plt.subplots()
     ax.scatter(y_true, y_pred, c="k", s=50, alpha=0.75)
     ax.set_xlabel(x_label, fontsize=axis_label_fontsize)
     ax.set_ylabel(y_label, fontsize=axis_label_fontsize)
-    ax.set_title(title)
+    ax.set_title(title, fontsize=title_fontsize)
+    ax.set_xlim(x_min, x_max + x_margin)
+    ax.set_ylim(y_min, y_max + y_margin)
     spearman, _ = spearmanr(y_true, y_pred)
     r2 = r2_score(y_true, y_pred)
-    print(spearman, r2)
     metrics_str = f"Spearman: {spearman:.2f}\nR2: {r2:.2f}"
     box_format = dict(boxstyle="round", facecolor="white", alpha=0.5)
     ax.text(
@@ -31,7 +38,7 @@ def plot_prediction_scatter(
         0.95,
         metrics_str,
         transform=ax.transAxes,
-        fontsize=14,
+        fontsize=12,
         verticalalignment="top",
         bbox=box_format,
     )
diff --git a/tests/utilin/encode/test_one_hot.py b/tests/utilin/encode/test_one_hot.py
new file mode 100644
index 0000000..be69df6
--- /dev/null
+++ b/tests/utilin/encode/test_one_hot.py
@@ -0,0 +1,58 @@
+import numpy as np
+import pytest
+import pandas as pd
+
+from utilin.encode.one_hot import encode_sequences_one_hot
+
+
+def test_encode_sequences_one_hot_default():
+    sequences = ["ACD", "GHK"]
+    expected_output = np.array([
+        [
+            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        ],
+        [
+            [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        ]
+    ])
+    output = encode_sequences_one_hot(sequences)
+    assert np.array_equal(output, expected_output)
+
+
+def test_encode_sequences_one_hot_gremlin():
+    sequences = ["A-", "CY"]
+    expected_output = np.array([
+        [
+            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
+        ],
+        [
+            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
+        ]
+    ])
+    output = encode_sequences_one_hot(sequences, aa_alphabet="gremlin")
+    assert np.array_equal(output, expected_output)
+
+
+def test_encode_empty_sequences():
+    sequences = []
+    expected_output = np.array([])
+    output = encode_sequences_one_hot(sequences)
+    assert np.array_equal(output, expected_output)
+
+
+def test_encode_pandas_sequences_one_hot():
+    sequences = pd.Series(["AC"])
+    expected_output = np.array([
+        [
+            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        ]
+    ])
+    output = encode_sequences_one_hot(sequences)
+    assert np.array_equal(output, expected_output)
diff --git a/uv.lock b/uv.lock
index 3ebb084..8a36eaf 100644
--- a/uv.lock
+++ b/uv.lock
@@ -446,6 +446,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
 ]
 
+[[package]]
+name = "iniconfig"
+version = "2.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 },
+]
+
 [[package]]
 name = "ipykernel"
 version = "6.29.5"
@@ -1244,6 +1253,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3c/a6/bc1012356d8ece4d66dd75c4b9fc6c1f6650ddd5991e421177d9f8f671be/platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb", size = 18439 },
 ]
 
+[[package]]
+name = "pluggy"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
+]
+
 [[package]]
 name = "polyleven"
 version = "0.8"
@@ -1333,6 +1351,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e5/0c/0e3c05b1c87bb6a1c76d281b0f35e78d2d80ac91b5f8f524cebf77f51049/pyparsing-3.1.4-py3-none-any.whl", hash = "sha256:a6a7ee4235a3f944aa1fa2249307708f893fe5717dc603503c6c7969c070fb7c", size = 104100 },
 ]
 
+[[package]]
+name = "pytest"
+version = "8.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8b/6c/62bbd536103af674e227c41a8f3dcd022d591f6eed5facb5a0f31ee33bbc/pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181", size = 1442487 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6b/77/7440a06a8ead44c7757a64362dd22df5760f9b12dc5f11b6188cd2fc27a0/pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2", size = 342341 },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"
@@ -1796,6 +1829,7 @@ dependencies = [
     { name = "matplotlib" },
     { name = "pandas" },
     { name = "polyleven" },
+    { name = "pytest" },
     { name = "ruff" },
     { name = "scikit-learn" },
     { name = "seaborn" },
@@ -1809,8 +1843,9 @@ requires-dist = [
     { name = "matplotlib", specifier = ">=3.9.2" },
     { name = "pandas", specifier = ">=2.2.3" },
     { name = "polyleven", specifier = ">=0.8" },
+    { name = "pytest" },
     { name = "ruff", specifier = ">=0.6.9" },
-    { name = "scikit-learn" },
+    { name = "scikit-learn", specifier = ">=1.5.2" },
     { name = "seaborn", specifier = ">=0.13.2" },
 ]
 