diff --git a/CHANGELOG.md b/CHANGELOG.md index a5740cd..b398c8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,17 +2,24 @@ ### Added + - Added emmtyper and parser + - Added pytests for emmtyper + ### Fixed ### Changed + - Changed Shigapass models to be consistent with other typing models + - Changed Shigapass parsers to be consistent with other typing parsers + - Changed ref genome related variables to be optional in quast + ## [0.10.1] ### Added ### Fixed -- Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3. + - Updated parsing of ChewBBACA allele calling annotations and novel alleles. This adds support for annotations introduced in v3. ### Changed diff --git a/prp/cli.py b/prp/cli.py index c9abf62..83af60e 100644 --- a/prp/cli.py +++ b/prp/cli.py @@ -23,6 +23,7 @@ parse_amrfinder_amr_pred, parse_amrfinder_vir_pred, parse_cgmlst_results, + parse_emmtyper_pred, parse_kraken_result, parse_mlst_results, parse_mykrobe_amr_pred, @@ -116,6 +117,7 @@ def cli(silent, debug): ) @click.option("-p", "--quality", type=click.Path(), help="postalignqc qc results") @click.option("-k", "--mykrobe", type=click.Path(), help="mykrobe results") +@click.option("-e", "--emmtyper", type=click.Path(), help="Emmtyper m-type prediction results") @click.option("-g", "--shigapass", type=click.Path(), help="shigapass results") @click.option("-t", "--tbprofiler", type=click.Path(), help="tbprofiler results") @click.option("--bam", type=click.Path(), help="Read mapping to reference genome") @@ -153,6 +155,7 @@ def create_bonsai_input( serotypefinder, quality, mykrobe, + emmtyper, shigapass, tbprofiler, bam, @@ -246,6 +249,13 @@ def create_bonsai_input( if res is not None: results["typing_result"].extend(res) + if emmtyper: + LOG.info("Parse emmtyper results") + # Emmtyping + res: MethodIndex | None = parse_emmtyper_pred(emmtyper) + if res is not None: + results["typing_result"].extend(res) + if shigapass: LOG.info("Parse shigapass results") # Shigatyping diff --git a/prp/models/phenotype.py b/prp/models/phenotype.py index 2496d87..40b37e6 100644 --- a/prp/models/phenotype.py +++ b/prp/models/phenotype.py @@ -130,7 +130,7 @@ class GeneBase(BaseModel): default=None, description="Reference sequence name" ) element_type: ElementType = Field( - description="The predominant function fo the gene." + description="The predominant function of the gene." ) element_subtype: Union[ ElementStressSubtype, diff --git a/prp/models/qc.py b/prp/models/qc.py index 4095f24..cfa16bd 100644 --- a/prp/models/qc.py +++ b/prp/models/qc.py @@ -20,13 +20,13 @@ class QuastQcResult(BaseModel): """Assembly QC metrics.""" total_length: int - reference_length: int + reference_length: int | None = None largest_contig: int n_contigs: int n50: int assembly_gc: float - reference_gc: float - duplication_ratio: float + reference_gc: float | None = None + duplication_ratio: float | None = None class PostAlignQcResult(BaseModel): diff --git a/prp/models/sample.py b/prp/models/sample.py index ecdd8e2..ec66694 100644 --- a/prp/models/sample.py +++ b/prp/models/sample.py @@ -17,6 +17,7 @@ from .typing import ( ResultLineageBase, ShigaTypingMethodIndex, + EmmTypingMethodIndex, TbProfilerLineage, TypingMethod, TypingResultCgMlst, @@ -80,7 +81,7 @@ class PipelineResult(SampleBase): schema_version: Literal[1] = 1 # optional typing - typing_result: list[Union[ShigaTypingMethodIndex, MethodIndex]] = Field( + typing_result: list[Union[ShigaTypingMethodIndex, EmmTypingMethodIndex, MethodIndex]] = Field( ..., alias="typingResult" ) # optional phenotype prediction diff --git a/prp/models/typing.py b/prp/models/typing.py index 20795ca..4d04e00 100644 --- a/prp/models/typing.py +++ b/prp/models/typing.py @@ -19,6 +19,7 @@ class TypingSoftware(str, Enum): VIRULENCEFINDER = "virulencefinder" SEROTYPEFINDER = "serotypefinder" SHIGAPASS = "shigapass" + EMMTYPER = "emmtyper" class TypingMethod(str, Enum): @@ -31,6 +32,7 @@ class TypingMethod(str, Enum): OTYPE = "O_type" HTYPE = "H_type" SHIGATYPE = "shigatype" + EMMTYPE = "emmtype" class ChewbbacaErrors(str, Enum): @@ -97,6 +99,23 @@ class ShigaTypingMethodIndex(RWModel): result: TypingResultShiga +class TypingResultEmm(RWModel): + """Container for emmtype gene information""" + + cluster_count: int + emmtype: str + emm_like_alleles: list[str] + emm_cluster: str + + +class EmmTypingMethodIndex(RWModel): + """Method Index Emm.""" + + type: Literal[TypingMethod.EMMTYPE] + software: Literal[TypingSoftware.EMMTYPER] + result: TypingResultEmm + + class ResultLineageBase(RWModel): """Lineage results""" diff --git a/prp/parse/__init__.py b/prp/parse/__init__.py index 5879422..b7eb5ae 100644 --- a/prp/parse/__init__.py +++ b/prp/parse/__init__.py @@ -3,6 +3,7 @@ from .phenotype import ( parse_amrfinder_amr_pred, parse_amrfinder_vir_pred, + parse_emmtyper_pred, parse_mykrobe_amr_pred, parse_resfinder_amr_pred, parse_shigapass_pred, diff --git a/prp/parse/phenotype/__init__.py b/prp/parse/phenotype/__init__.py index 88f9950..f98a2b4 100644 --- a/prp/parse/phenotype/__init__.py +++ b/prp/parse/phenotype/__init__.py @@ -1,6 +1,7 @@ """Module for parsing resistance prediction results.""" from .amrfinder import parse_amrfinder_amr_pred, parse_amrfinder_vir_pred +from .emmtyper import parse_emmtyper_pred from .mykrobe import parse_mykrobe_amr_pred from .resfinder import parse_resfinder_amr_pred from .shigapass import parse_shigapass_pred diff --git a/prp/parse/phenotype/emmtyper.py b/prp/parse/phenotype/emmtyper.py new file mode 100644 index 0000000..0e4049a --- /dev/null +++ b/prp/parse/phenotype/emmtyper.py @@ -0,0 +1,41 @@ +"""Functions for parsing emmtyper result.""" + +import logging +import pandas as pd + +from typing import Any + +from ...models.typing import EmmTypingMethodIndex, TypingMethod, TypingResultEmm +from ...models.typing import TypingSoftware as Software + +LOG = logging.getLogger(__name__) + +def parse_emmtyper_pred(path: str) -> EmmTypingMethodIndex: + """Parse emmtyper's output re emm-typing""" + LOG.info("Parsing emmtyper results") + pred_result = [] + df = pd.read_csv(path, sep='\t', header=None) + df.columns = ["sample_name", "cluster_count", "emmtype", "emm_like_alleles", "emm_cluster"] + df_loa = df.to_dict(orient="records") + for emmtype_array in df_loa: + emmtype_results = _parse_emmtyper_results(emmtype_array) + pred_result.append( + EmmTypingMethodIndex( + type=TypingMethod.EMMTYPE, + result=emmtype_results, + software=Software.EMMTYPER, + ) + ) + return pred_result + + +def _parse_emmtyper_results(info: dict[str, Any]) -> TypingResultEmm: + """Parse emm gene prediction results.""" + emm_like_alleles = info["emm_like_alleles"].split(";") + return TypingResultEmm( + # info + cluster_count=info["cluster_count"], + emmtype=info["emmtype"], + emm_like_alleles=emm_like_alleles, + emm_cluster=info["emm_cluster"], + ) diff --git a/prp/parse/qc.py b/prp/parse/qc.py index 3ee87e5..aa93e21 100644 --- a/prp/parse/qc.py +++ b/prp/parse/qc.py @@ -255,13 +255,13 @@ def parse_quast_results(tsv_fpath: str) -> QcMethodIndex: raw = [dict(zip(header, row)) for row in creader] qc_res = QuastQcResult( total_length=int(raw[0]["Total length"]), - reference_length=raw[0]["Reference length"], + reference_length=raw[0].get("Reference length", None), largest_contig=raw[0]["Largest contig"], n_contigs=raw[0]["# contigs"], n50=raw[0]["N50"], assembly_gc=raw[0]["GC (%)"], - reference_gc=raw[0]["Reference GC (%)"], - duplication_ratio=raw[0]["Duplication ratio"], + reference_gc=raw[0].get("Reference GC (%)", None), + duplication_ratio=raw[0].get("Duplication ratio", None), ) return QcMethodIndex(software=QcSoftware.QUAST, result=qc_res) diff --git a/prp/parse/typing.py b/prp/parse/typing.py index 0fe1d09..6b4cec8 100644 --- a/prp/parse/typing.py +++ b/prp/parse/typing.py @@ -201,6 +201,7 @@ def parse_mykrobe_lineage_results(pred_res: dict) -> MethodIndex | None: def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None: """Parse virulencefinder's output re stx typing""" + LOG.info("Parsing virulencefinder stx results") with open(path, "rb") as inpt: pred_obj = json.load(inpt) # if has valid results @@ -230,7 +231,8 @@ def parse_virulencefinder_stx_typing(path: str) -> MethodIndex | None: def parse_serotypefinder_oh_typing(path: str) -> MethodIndex | None: - """Parse serotypefinder's output re OH typing""" + """Parse 's output re OH typing""" + LOG.info("Parsing serotypefinder oh type results") with open(path, "rb") as inpt: pred_obj = json.load(inpt) # if has valid results diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py index bc525d3..214fa3e 100644 --- a/tests/fixtures/__init__.py +++ b/tests/fixtures/__init__.py @@ -4,3 +4,4 @@ from .mtuberculosis import * from .saureus import * from .shigella import * +from .streptococcus import * diff --git a/tests/fixtures/streptococcus/__init__.py b/tests/fixtures/streptococcus/__init__.py new file mode 100644 index 0000000..6d02b5c --- /dev/null +++ b/tests/fixtures/streptococcus/__init__.py @@ -0,0 +1,10 @@ +"""Fixtures for Streptococcus.""" +import pytest + +from ..fixtures import data_path + + +@pytest.fixture() +def streptococcus_emmtyper_path(data_path): + """Get path for Emmtyper results for streptococcus.""" + return str(data_path.joinpath("streptococcus", "emmtyper.tsv")) diff --git a/tests/fixtures/streptococcus/emmtyper.tsv b/tests/fixtures/streptococcus/emmtyper.tsv new file mode 100644 index 0000000..06ba33b --- /dev/null +++ b/tests/fixtures/streptococcus/emmtyper.tsv @@ -0,0 +1 @@ +test1_240920_nb000000_0000_test 2 EMM169.3 EMM164.2~* E4 diff --git a/tests/parse/test_emmtyper.py b/tests/parse/test_emmtyper.py new file mode 100644 index 0000000..8653085 --- /dev/null +++ b/tests/parse/test_emmtyper.py @@ -0,0 +1,22 @@ +"""Test functions for parsing Emmtyper results.""" + +import pytest + +from prp.parse.phenotype.emmtyper import parse_emmtyper_pred + + +def test_parse_emmtyper_results(streptococcus_emmtyper_path): + """Test parsing of emmtyper result files.""" + + # test parsing the output of an streptococcus. + result = parse_emmtyper_pred(streptococcus_emmtyper_path) + expected_streptococcus = { + "cluster_count": 2, + "emmtype": "EMM169.3", + "emm_like_alleles": [ + "EMM164.2~*" + ], + "emm_cluster": "E4" + } + # check if data matches + assert expected_streptococcus == result[0].result.model_dump() \ No newline at end of file