Merge pull request #92 from Clinical-Genomics-Lund/91-add-emmtyper

Add emmtyper, update shigapass and fix quast
SMD-Bioinformatics-Lund · Oct 10, 2024 · ba8c9b3 · ba8c9b3
2 parents eef32b6 + b25c37a
commit ba8c9b3
Show file tree

Hide file tree

Showing 15 changed files with 126 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,17 +2,24 @@
 
 ### Added
 
+ - Added emmtyper and parser
+ - Added pytests for emmtyper
+
 ### Fixed
 
 ### Changed
 
+ - Changed Shigapass models to be consistent with other typing models
+ - Changed Shigapass parsers to be consistent with other typing parsers
+ - Changed ref genome related variables to be optional in quast
+
 ## [0.10.1]
 
 ### Added
 
 ### Fixed
 
-- Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3.
+ - Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3.
 
 ### Changed
 

diff --git a/prp/cli.py b/prp/cli.py
@@ -23,6 +23,7 @@
     parse_amrfinder_amr_pred,
     parse_amrfinder_vir_pred,
     parse_cgmlst_results,
+    parse_emmtyper_pred,
     parse_kraken_result,
     parse_mlst_results,
     parse_mykrobe_amr_pred,
@@ -116,6 +117,7 @@ def cli(silent, debug):
 )
 @click.option("-p", "--quality", type=click.Path(), help="postalignqc qc results")
 @click.option("-k", "--mykrobe", type=click.Path(), help="mykrobe results")
+@click.option("-e", "--emmtyper", type=click.Path(), help="Emmtyper m-type prediction results")
 @click.option("-g", "--shigapass", type=click.Path(), help="shigapass results")
 @click.option("-t", "--tbprofiler", type=click.Path(), help="tbprofiler results")
 @click.option("--bam", type=click.Path(), help="Read mapping to reference genome")
@@ -153,6 +155,7 @@ def create_bonsai_input(
     serotypefinder,
     quality,
     mykrobe,
+    emmtyper,
     shigapass,
     tbprofiler,
     bam,
@@ -246,6 +249,13 @@ def create_bonsai_input(
         if res is not None:
             results["typing_result"].extend(res)
 
+    if emmtyper:
+        LOG.info("Parse emmtyper results")
+        # Emmtyping
+        res: MethodIndex | None = parse_emmtyper_pred(emmtyper)
+        if res is not None:
+            results["typing_result"].extend(res)
+
     if shigapass:
         LOG.info("Parse shigapass results")
         # Shigatyping

diff --git a/prp/models/phenotype.py b/prp/models/phenotype.py
@@ -130,7 +130,7 @@ class GeneBase(BaseModel):
         default=None, description="Reference sequence name"
     )
     element_type: ElementType = Field(
-        description="The predominant function fo the gene."
+        description="The predominant function of the gene."
     )
     element_subtype: Union[
         ElementStressSubtype,

diff --git a/prp/models/qc.py b/prp/models/qc.py
@@ -20,13 +20,13 @@ class QuastQcResult(BaseModel):
     """Assembly QC metrics."""
 
     total_length: int
-    reference_length: int
+    reference_length: int | None = None
     largest_contig: int
     n_contigs: int
     n50: int
     assembly_gc: float
-    reference_gc: float
-    duplication_ratio: float
+    reference_gc: float | None = None
+    duplication_ratio: float | None = None
 
 
 class PostAlignQcResult(BaseModel):

diff --git a/prp/models/sample.py b/prp/models/sample.py
@@ -17,6 +17,7 @@
 from .typing import (
     ResultLineageBase,
     ShigaTypingMethodIndex,
+    EmmTypingMethodIndex,
     TbProfilerLineage,
     TypingMethod,
     TypingResultCgMlst,
@@ -80,7 +81,7 @@ class PipelineResult(SampleBase):
 
     schema_version: Literal[1] = 1
     # optional typing
-    typing_result: list[Union[ShigaTypingMethodIndex, MethodIndex]] = Field(
+    typing_result: list[Union[ShigaTypingMethodIndex, EmmTypingMethodIndex, MethodIndex]] = Field(
         ..., alias="typingResult"
     )
     # optional phenotype prediction

diff --git a/prp/models/typing.py b/prp/models/typing.py
@@ -19,6 +19,7 @@ class TypingSoftware(str, Enum):
     VIRULENCEFINDER = "virulencefinder"
     SEROTYPEFINDER = "serotypefinder"
     SHIGAPASS = "shigapass"
+    EMMTYPER = "emmtyper"
 
 
 class TypingMethod(str, Enum):
@@ -31,6 +32,7 @@ class TypingMethod(str, Enum):
     OTYPE = "O_type"
     HTYPE = "H_type"
     SHIGATYPE = "shigatype"
+    EMMTYPE = "emmtype"
 
 
 class ChewbbacaErrors(str, Enum):
@@ -97,6 +99,23 @@ class ShigaTypingMethodIndex(RWModel):
     result: TypingResultShiga
 
 
+class TypingResultEmm(RWModel):
+    """Container for emmtype gene information"""
+
+    cluster_count: int
+    emmtype: str
+    emm_like_alleles: list[str]
+    emm_cluster: str
+
+
+class EmmTypingMethodIndex(RWModel):
+    """Method Index Emm."""
+
+    type: Literal[TypingMethod.EMMTYPE]
+    software: Literal[TypingSoftware.EMMTYPER]
+    result: TypingResultEmm
+
+
 class ResultLineageBase(RWModel):
     """Lineage results"""
 

diff --git a/prp/parse/__init__.py b/prp/parse/__init__.py
@@ -3,6 +3,7 @@
 from .phenotype import (
     parse_amrfinder_amr_pred,
     parse_amrfinder_vir_pred,
+    parse_emmtyper_pred,
     parse_mykrobe_amr_pred,
     parse_resfinder_amr_pred,
     parse_shigapass_pred,

diff --git a/prp/parse/phenotype/__init__.py b/prp/parse/phenotype/__init__.py
@@ -1,6 +1,7 @@
 """Module for parsing resistance prediction results."""
 
 from .amrfinder import parse_amrfinder_amr_pred, parse_amrfinder_vir_pred
+from .emmtyper import parse_emmtyper_pred
 from .mykrobe import parse_mykrobe_amr_pred
 from .resfinder import parse_resfinder_amr_pred
 from .shigapass import parse_shigapass_pred

diff --git a/prp/parse/phenotype/emmtyper.py b/prp/parse/phenotype/emmtyper.py
@@ -0,0 +1,41 @@
+"""Functions for parsing emmtyper result."""
+
+import logging
+import pandas as pd
+
+from typing import Any
+
+from ...models.typing import EmmTypingMethodIndex, TypingMethod, TypingResultEmm
+from ...models.typing import TypingSoftware as Software
+
+LOG = logging.getLogger(__name__)
+
+def parse_emmtyper_pred(path: str) -> EmmTypingMethodIndex:
+    """Parse emmtyper's output re emm-typing"""
+    LOG.info("Parsing emmtyper results")
+    pred_result = []
+    df = pd.read_csv(path, sep='\t', header=None)
+    df.columns = ["sample_name", "cluster_count", "emmtype", "emm_like_alleles", "emm_cluster"]
+    df_loa = df.to_dict(orient="records")
+    for emmtype_array in df_loa:
+        emmtype_results = _parse_emmtyper_results(emmtype_array)
+        pred_result.append(
+            EmmTypingMethodIndex(
+                type=TypingMethod.EMMTYPE,
+                result=emmtype_results,
+                software=Software.EMMTYPER,
+            )
+        )
+    return pred_result
+
+
+def _parse_emmtyper_results(info: dict[str, Any]) -> TypingResultEmm:
+    """Parse emm gene prediction results."""
+    emm_like_alleles = info["emm_like_alleles"].split(";")
+    return TypingResultEmm(
+        # info
+        cluster_count=info["cluster_count"],
+        emmtype=info["emmtype"],
+        emm_like_alleles=emm_like_alleles,
+        emm_cluster=info["emm_cluster"],
+    )
diff --git a/prp/parse/qc.py b/prp/parse/qc.py
@@ -255,13 +255,13 @@ def parse_quast_results(tsv_fpath: str) -> QcMethodIndex:
         raw = [dict(zip(header, row)) for row in creader]
         qc_res = QuastQcResult(
             total_length=int(raw[0]["Total length"]),
-            reference_length=raw[0]["Reference length"],
+            reference_length=raw[0].get("Reference length", None),
             largest_contig=raw[0]["Largest contig"],
             n_contigs=raw[0]["# contigs"],
             n50=raw[0]["N50"],
             assembly_gc=raw[0]["GC (%)"],
-            reference_gc=raw[0]["Reference GC (%)"],
-            duplication_ratio=raw[0]["Duplication ratio"],
+            reference_gc=raw[0].get("Reference GC (%)", None),
+            duplication_ratio=raw[0].get("Duplication ratio", None),
         )
     return QcMethodIndex(software=QcSoftware.QUAST, result=qc_res)
 

diff --git a/prp/parse/typing.py b/prp/parse/typing.py
@@ -201,6 +201,7 @@ def parse_mykrobe_lineage_results(pred_res: dict) -> MethodIndex | None:
 
 def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None:
     """Parse virulencefinder's output re stx typing"""
+    LOG.info("Parsing virulencefinder stx results")
     with open(path, "rb") as inpt:
         pred_obj = json.load(inpt)
         # if has valid results
@@ -230,7 +231,8 @@ def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None:
 
 
 def parse_serotypefinder_oh_typing(path: str) -> MethodIndex | None:
-    """Parse serotypefinder's output re OH typing"""
+    """Parse 's output re OH typing"""
+    LOG.info("Parsing serotypefinder oh type results")
     with open(path, "rb") as inpt:
         pred_obj = json.load(inpt)
         # if has valid results

diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py
@@ -4,3 +4,4 @@
 from .mtuberculosis import *
 from .saureus import *
 from .shigella import *
+from .streptococcus import *
diff --git a/tests/fixtures/streptococcus/__init__.py b/tests/fixtures/streptococcus/__init__.py
@@ -0,0 +1,10 @@
+"""Fixtures for Streptococcus."""
+import pytest
+
+from ..fixtures import data_path
+
+
+@pytest.fixture()
+def streptococcus_emmtyper_path(data_path):
+    """Get path for Emmtyper results for streptococcus."""
+    return str(data_path.joinpath("streptococcus", "emmtyper.tsv"))
diff --git a/tests/fixtures/streptococcus/emmtyper.tsv b/tests/fixtures/streptococcus/emmtyper.tsv
@@ -0,0 +1 @@
+test1_240920_nb000000_0000_test	2	EMM169.3	EMM164.2~*	E4
diff --git a/tests/parse/test_emmtyper.py b/tests/parse/test_emmtyper.py
@@ -0,0 +1,22 @@
+"""Test functions for parsing Emmtyper results."""
+
+import pytest
+
+from prp.parse.phenotype.emmtyper import parse_emmtyper_pred
+
+
+def test_parse_emmtyper_results(streptococcus_emmtyper_path):
+    """Test parsing of emmtyper result files."""
+
+    # test parsing the output of an streptococcus.
+    result = parse_emmtyper_pred(streptococcus_emmtyper_path)
+    expected_streptococcus =  {
+        "cluster_count": 2,
+        "emmtype": "EMM169.3",
+        "emm_like_alleles": [
+          "EMM164.2~*"
+        ],
+        "emm_cluster": "E4"
+    }
+    # check if data matches
+    assert expected_streptococcus == result[0].result.model_dump()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		test1_240920_nb000000_0000_test 2 EMM169.3 EMM164.2~* E4