Skip to content

Commit

Permalink
Merge pull request #88 from Clinical-Genomics-Lund/87-update-cgmlst-m…
Browse files Browse the repository at this point in the history
…odel-and-parser-regarding-chewbbaca-errors

Updated parsing of chewbbaca cgMLST error codes
  • Loading branch information
mhkc authored Sep 4, 2024
2 parents 3f68f64 + 9e37be4 commit 7cff73f
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 31 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

### Fixed

- Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3.

### Changed

## [0.10.0]
Expand Down
4 changes: 2 additions & 2 deletions prp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .models.metadata import SoupType, SoupVersion
from .models.phenotype import ElementType
from .models.qc import QcMethodIndex, QcSoftware
from .models.sample import MethodIndex, PipelineResult, ReferenceGenome
from .models.sample import MethodIndex, PipelineResult, ReferenceGenome, IgvAnnotationTrack
from .parse import (
load_variants,
parse_alignment_results,
Expand Down Expand Up @@ -584,7 +584,7 @@ def add_igv_annotation_track(track_name, annotation_file, bonsai_input_file, out
track_info = result_obj.genome_annotation

# add new tracks
track_info.append({"name": track_name, "file": annotation_file})
track_info.append(IgvAnnotationTrack(name=track_name, file=annotation_file))

# update data model
upd_result = result_obj.model_copy(update={"genome_annotation": track_info})
Expand Down
2 changes: 2 additions & 0 deletions prp/models/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ class ChewbbacaErrors(str, Enum):
ALM = "ALM"
ASM = "ASM"
LNF = "LNF"
EXC = "EXC"
PAMA = "PAMA"


class MlstErrors(str, Enum):
Expand Down
78 changes: 50 additions & 28 deletions prp/parse/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
TypingResultCgMlst,
TypingResultGeneAllele,
TypingResultMlst,
ChewbbacaErrors,
)
from ..models.typing import TypingSoftware as Software
from .phenotype.serotypefinder import parse_serotype_gene
Expand Down Expand Up @@ -57,6 +58,37 @@ def parse_mlst_results(mlst_fpath: str) -> TypingResultMlst:
)


def replace_cgmlst_errors(
allele: str, include_novel_alleles: bool = True, correct_alleles: bool = False
) -> int | str | None:
"""Replace errors and novel allele calls with null values."""
errors = [err.value for err in ChewbbacaErrors]
if any(
[
correct_alleles and allele in errors,
correct_alleles and allele.startswith("INF") and not include_novel_alleles,
]
):
return None

if include_novel_alleles:
if allele.startswith("INF"):
allele = allele.split("-")[1]
else:
allele = allele.replace("*", "")

# try convert to an int
try:
allele = int(allele)
except ValueError:
allele = str(allele)
LOG.warning(
"Possible cgMLST parser error, allele could not be cast as an integer: %s",
allele,
)
return allele


def parse_cgmlst_results(
chewbacca_res_path: str,
include_novel_alleles: bool = True,
Expand All @@ -74,33 +106,12 @@ def parse_cgmlst_results(
NIPHEM,
ALM, alleles larger than locus length
ASM, alleles smaller than locus length
EXC, Total number of CDSs classified as EXC
LOTSC, Total number of CDSs classified as LOTSC
PAMA, Total number of PAMA classifications
"""
errors = ("LNF", "PLOT3", "PLOT5", "NIPH", "NIPHEM", "ALM", "ASM")

def replace_errors(allele):
"""Replace errors and novel alleles with null values."""
if any(
[
correct_alleles and allele in errors,
correct_alleles
and allele.startswith("INF")
and not include_novel_alleles,
]
):
return None

if allele.startswith("INF") and include_novel_alleles:
try:
allele = int(allele.split("-")[1])
except ValueError:
allele = str(allele.split("-")[1])
# try convert to an int
try:
allele = int(allele)
except ValueError:
allele = str(allele)
return allele

errors = [err.value for err in ChewbbacaErrors]
LOG.info(
"Parsing cgmslt results, %s including novel alleles",
"not" if not include_novel_alleles else "",
Expand All @@ -111,10 +122,21 @@ def replace_errors(allele):
_, *allele_names = (colname.rstrip(".fasta") for colname in next(creader))
# parse alleles
_, *alleles = next(creader)
corrected_alleles = (replace_errors(a) for a in alleles)

# setup counters for counting novel and missing alleles before correction
n_novel = 0
n_missing = 0
corrected_alleles = []
for allele in alleles:
if allele.startswith("INF") or allele.startswith("*"):
n_novel += 1
if allele in errors:
n_missing += 1
corrected_alleles.append(replace_cgmlst_errors(allele))

results = TypingResultCgMlst(
n_novel=sum(1 for a in alleles if a.startswith("INF")),
n_missing=sum(1 for a in alleles if a in errors),
n_novel=n_novel,
n_missing=n_missing,
alleles=dict(zip(allele_names, corrected_alleles)),
)
return MethodIndex(
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/ecoli/cdm_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"software": "chewbbaca",
"version": null,
"result": {
"n_missing": 4228
"n_missing": 4232
}
}
]
20 changes: 20 additions & 0 deletions tests/parse/test_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Test functions for parsing metadata."""

from datetime import datetime
from prp.parse.metadata import parse_sequence_date_from_run_id


def test_parse_sequence_date_from_run_id():
"""Test parsing of sequencing run id."""

# test that a run id from illumina works
date = parse_sequence_date_from_run_id("220214_NB501699_0302_AHJLM7AFX3")
assert date == datetime(2022, 2, 14)

# test that an unknown id does not work
date = parse_sequence_date_from_run_id("my-unknown-run-id")
assert date == None

# test that an unknown id does not work
date = parse_sequence_date_from_run_id("my_unknown_run_id")
assert date == None
87 changes: 87 additions & 0 deletions tests/parse/test_typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Test typing method parsing."""

import pytest
import logging
from prp.parse.typing import replace_cgmlst_errors
from prp.models.typing import ChewbbacaErrors

# build test cases for handeling chewbacca allele caller errors and annotations
# reference, https://chewbbaca.readthedocs.io/en/latest/user/modules/AlleleCall.html
cgmlst_test_base = [("1", 1), ("99", 99)] # normal alllale calls
cgmlst_test_include_novel = [("INF-1", 1), ("INF-99", 99), ("*1", 1), ("*99", 99)] # inferred alleles
cgmlst_test_not_include_novel = [
("INF-1", "INF-1"),
("INF-99", "INF-99"),
] # inferred alleles
cgmlst_test_replace_errors = [
(err.value, None) for err in ChewbbacaErrors
] # errors to strip
cgmlst_test_not_replace_errors = [
(err.value, err.value) for err in ChewbbacaErrors
] # errors to strip


@pytest.mark.parametrize(
"called_allele,expected",
[
*cgmlst_test_base,
*cgmlst_test_not_include_novel,
*cgmlst_test_not_replace_errors,
],
)
def test_replace_cgmlst_errors_not_include_novel(called_allele, expected):
"""Test function that process Chewbbaca allele calling."""

assert (
replace_cgmlst_errors(
called_allele, include_novel_alleles=False, correct_alleles=False
)
== expected
)


@pytest.mark.parametrize(
"called_allele,expected",
[*cgmlst_test_base, *cgmlst_test_include_novel, *cgmlst_test_not_replace_errors],
)
def test_replace_cgmlst_errors_include_novel(called_allele, expected):
"""Test function that process Chewbbaca allele calling."""

assert (
replace_cgmlst_errors(
called_allele, include_novel_alleles=True, correct_alleles=False
)
== expected
)


@pytest.mark.parametrize(
"called_allele,expected",
[*cgmlst_test_base, *cgmlst_test_include_novel, *cgmlst_test_replace_errors],
)
def test_replace_cgmlst_errors_include_novel_and_correct_allels(
called_allele, expected
):
"""Test function that process Chewbbaca allele calling."""

assert (
replace_cgmlst_errors(
called_allele, include_novel_alleles=True, correct_alleles=True
)
== expected
)


def test_replace_cgmlst_errors_warnings(caplog):
"""Test that replace_cgmlst_errors warns if allele could not be cast as an integer."""
caplog.at_level(logging.WARNING)
# run test that should not trigger a warning
replace_cgmlst_errors("1", include_novel_alleles=True, correct_alleles=True)
# check that warning was not triggered
for record in caplog.records:
assert record.levelname != "WARNING"

# run test that a warning was triggered if input is unknown string
allele = "A_STRANGE_STRING"
replace_cgmlst_errors(allele, include_novel_alleles=True, correct_alleles=True)
assert allele in caplog.text

0 comments on commit 7cff73f

Please sign in to comment.