Merge pull request #88 from Clinical-Genomics-Lund/87-update-cgmlst-m…

…odel-and-parser-regarding-chewbbaca-errors Updated parsing of chewbbaca cgMLST error codes
SMD-Bioinformatics-Lund · Sep 4, 2024 · 7cff73f · 7cff73f
2 parents 3f68f64 + 9e37be4
commit 7cff73f
Show file tree

Hide file tree

Showing 7 changed files with 164 additions and 31 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 ### Fixed
 
+- Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3.
+
 ### Changed
 
 ## [0.10.0]

diff --git a/prp/cli.py b/prp/cli.py
@@ -16,7 +16,7 @@
 from .models.metadata import SoupType, SoupVersion
 from .models.phenotype import ElementType
 from .models.qc import QcMethodIndex, QcSoftware
-from .models.sample import MethodIndex, PipelineResult, ReferenceGenome
+from .models.sample import MethodIndex, PipelineResult, ReferenceGenome, IgvAnnotationTrack
 from .parse import (
     load_variants,
     parse_alignment_results,
@@ -584,7 +584,7 @@ def add_igv_annotation_track(track_name, annotation_file, bonsai_input_file, out
         track_info = result_obj.genome_annotation
 
     # add new tracks
-    track_info.append({"name": track_name, "file": annotation_file})
+    track_info.append(IgvAnnotationTrack(name=track_name, file=annotation_file))
 
     # update data model
     upd_result = result_obj.model_copy(update={"genome_annotation": track_info})

diff --git a/prp/models/typing.py b/prp/models/typing.py
@@ -44,6 +44,8 @@ class ChewbbacaErrors(str, Enum):
     ALM = "ALM"
     ASM = "ASM"
     LNF = "LNF"
+    EXC = "EXC"
+    PAMA = "PAMA"
 
 
 class MlstErrors(str, Enum):

diff --git a/prp/parse/typing.py b/prp/parse/typing.py
@@ -13,6 +13,7 @@
     TypingResultCgMlst,
     TypingResultGeneAllele,
     TypingResultMlst,
+    ChewbbacaErrors,
 )
 from ..models.typing import TypingSoftware as Software
 from .phenotype.serotypefinder import parse_serotype_gene
@@ -57,6 +58,37 @@ def parse_mlst_results(mlst_fpath: str) -> TypingResultMlst:
     )
 
 
+def replace_cgmlst_errors(
+    allele: str, include_novel_alleles: bool = True, correct_alleles: bool = False
+) -> int | str | None:
+    """Replace errors and novel allele calls with null values."""
+    errors = [err.value for err in ChewbbacaErrors]
+    if any(
+        [
+            correct_alleles and allele in errors,
+            correct_alleles and allele.startswith("INF") and not include_novel_alleles,
+        ]
+    ):
+        return None
+
+    if include_novel_alleles:
+        if allele.startswith("INF"):
+            allele = allele.split("-")[1]
+        else:
+            allele = allele.replace("*", "")
+
+    # try convert to an int
+    try:
+        allele = int(allele)
+    except ValueError:
+        allele = str(allele)
+        LOG.warning(
+            "Possible cgMLST parser error, allele could not be cast as an integer: %s",
+            allele,
+        )
+    return allele
+
+
 def parse_cgmlst_results(
     chewbacca_res_path: str,
     include_novel_alleles: bool = True,
@@ -74,33 +106,12 @@ def parse_cgmlst_results(
     NIPHEM,
     ALM, alleles larger than locus length
     ASM, alleles smaller than locus length
+    EXC, Total number of CDSs classified as EXC
+    LOTSC, Total number of CDSs classified as LOTSC
+    PAMA, Total number of PAMA classifications
     """
-    errors = ("LNF", "PLOT3", "PLOT5", "NIPH", "NIPHEM", "ALM", "ASM")
-
-    def replace_errors(allele):
-        """Replace errors and novel alleles with null values."""
-        if any(
-            [
-                correct_alleles and allele in errors,
-                correct_alleles
-                and allele.startswith("INF")
-                and not include_novel_alleles,
-            ]
-        ):
-            return None
-
-        if allele.startswith("INF") and include_novel_alleles:
-            try:
-                allele = int(allele.split("-")[1])
-            except ValueError:
-                allele = str(allele.split("-")[1])
-        # try convert to an int
-        try:
-            allele = int(allele)
-        except ValueError:
-            allele = str(allele)
-        return allele
 
+    errors = [err.value for err in ChewbbacaErrors]
     LOG.info(
         "Parsing cgmslt results, %s including novel alleles",
         "not" if not include_novel_alleles else "",
@@ -111,10 +122,21 @@ def replace_errors(allele):
         _, *allele_names = (colname.rstrip(".fasta") for colname in next(creader))
         # parse alleles
         _, *alleles = next(creader)
-    corrected_alleles = (replace_errors(a) for a in alleles)
+
+    # setup counters for counting novel and missing alleles before correction
+    n_novel = 0
+    n_missing = 0
+    corrected_alleles = []
+    for allele in alleles:
+        if allele.startswith("INF") or allele.startswith("*"):
+            n_novel += 1
+        if allele in errors:
+            n_missing += 1
+        corrected_alleles.append(replace_cgmlst_errors(allele))
+
     results = TypingResultCgMlst(
-        n_novel=sum(1 for a in alleles if a.startswith("INF")),
-        n_missing=sum(1 for a in alleles if a in errors),
+        n_novel=n_novel,
+        n_missing=n_missing,
         alleles=dict(zip(allele_names, corrected_alleles)),
     )
     return MethodIndex(

diff --git a/tests/fixtures/ecoli/cdm_input.json b/tests/fixtures/ecoli/cdm_input.json
@@ -42,7 +42,7 @@
       "software": "chewbbaca",
       "version": null,
       "result": {
-         "n_missing": 4228
+         "n_missing": 4232
       }
    }
 ]
diff --git a/tests/parse/test_metadata.py b/tests/parse/test_metadata.py
@@ -0,0 +1,20 @@
+"""Test functions for parsing metadata."""
+
+from datetime import datetime
+from prp.parse.metadata import parse_sequence_date_from_run_id
+
+
+def test_parse_sequence_date_from_run_id():
+    """Test parsing of sequencing run id."""
+
+    # test that a run id from illumina works
+    date = parse_sequence_date_from_run_id("220214_NB501699_0302_AHJLM7AFX3")
+    assert date == datetime(2022, 2, 14)
+
+    # test that an unknown id does not work
+    date = parse_sequence_date_from_run_id("my-unknown-run-id")
+    assert date == None
+
+    # test that an unknown id does not work
+    date = parse_sequence_date_from_run_id("my_unknown_run_id")
+    assert date == None
diff --git a/tests/parse/test_typing.py b/tests/parse/test_typing.py
@@ -0,0 +1,87 @@
+"""Test typing method parsing."""
+
+import pytest
+import logging
+from prp.parse.typing import replace_cgmlst_errors
+from prp.models.typing import ChewbbacaErrors
+
+# build test cases for handeling chewbacca allele caller errors and annotations
+# reference, https://chewbbaca.readthedocs.io/en/latest/user/modules/AlleleCall.html
+cgmlst_test_base = [("1", 1), ("99", 99)]  # normal alllale calls
+cgmlst_test_include_novel = [("INF-1", 1), ("INF-99", 99), ("*1", 1), ("*99", 99)]  # inferred alleles
+cgmlst_test_not_include_novel = [
+    ("INF-1", "INF-1"),
+    ("INF-99", "INF-99"),
+]  # inferred alleles
+cgmlst_test_replace_errors = [
+    (err.value, None) for err in ChewbbacaErrors
+]  # errors to strip
+cgmlst_test_not_replace_errors = [
+    (err.value, err.value) for err in ChewbbacaErrors
+]  # errors to strip
+
+
+@pytest.mark.parametrize(
+    "called_allele,expected",
+    [
+        *cgmlst_test_base,
+        *cgmlst_test_not_include_novel,
+        *cgmlst_test_not_replace_errors,
+    ],
+)
+def test_replace_cgmlst_errors_not_include_novel(called_allele, expected):
+    """Test function that process Chewbbaca allele calling."""
+
+    assert (
+        replace_cgmlst_errors(
+            called_allele, include_novel_alleles=False, correct_alleles=False
+        )
+        == expected
+    )
+
+
+@pytest.mark.parametrize(
+    "called_allele,expected",
+    [*cgmlst_test_base, *cgmlst_test_include_novel, *cgmlst_test_not_replace_errors],
+)
+def test_replace_cgmlst_errors_include_novel(called_allele, expected):
+    """Test function that process Chewbbaca allele calling."""
+
+    assert (
+        replace_cgmlst_errors(
+            called_allele, include_novel_alleles=True, correct_alleles=False
+        )
+        == expected
+    )
+
+
+@pytest.mark.parametrize(
+    "called_allele,expected",
+    [*cgmlst_test_base, *cgmlst_test_include_novel, *cgmlst_test_replace_errors],
+)
+def test_replace_cgmlst_errors_include_novel_and_correct_allels(
+    called_allele, expected
+):
+    """Test function that process Chewbbaca allele calling."""
+
+    assert (
+        replace_cgmlst_errors(
+            called_allele, include_novel_alleles=True, correct_alleles=True
+        )
+        == expected
+    )
+
+
+def test_replace_cgmlst_errors_warnings(caplog):
+    """Test that replace_cgmlst_errors warns if allele could not be cast as an integer."""
+    caplog.at_level(logging.WARNING)
+    # run test that should not trigger a warning
+    replace_cgmlst_errors("1", include_novel_alleles=True, correct_alleles=True)
+    # check that warning was not triggered
+    for record in caplog.records:
+        assert record.levelname != "WARNING"
+
+    # run test that a warning was triggered if input is unknown string
+    allele = "A_STRANGE_STRING"
+    replace_cgmlst_errors(allele, include_novel_alleles=True, correct_alleles=True)
+    assert allele in caplog.text