Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update prp to handle mykrobe csv format #13

Merged
merged 29 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ff08639
Update parse_mykrobe_amr_pred to handle mykrobe csv format
ryanjameskennedy Dec 27, 2023
5563589
Fix mykrobe variant parser
ryanjameskennedy Dec 28, 2023
89c94d5
Fix mykrobe phenotype model
ryanjameskennedy Dec 28, 2023
3183b1b
Fix read_csv in cli
ryanjameskennedy Dec 28, 2023
d4369c5
Update CHANGELOG.md
ryanjameskennedy Dec 28, 2023
8c44c99
Fix PhenotypeInfo for tbprofiler
ryanjameskennedy Dec 29, 2023
b545968
Set ref_aa & alt_aa to optional
ryanjameskennedy Dec 29, 2023
3d0bcec
Set coverage in LineageInformation to optional
ryanjameskennedy Dec 29, 2023
da7ae4c
Fix mykrobe lineage csv parser
ryanjameskennedy Dec 29, 2023
0b37a95
Add Mtuberculosis test files
ryanjameskennedy Dec 29, 2023
268ea86
Update test function to include create_bonsai_input for mtuberculosis
ryanjameskennedy Dec 29, 2023
6fc6898
Create and add _default_amr_phenotype to tbprofiler & mykrobe
ryanjameskennedy Dec 29, 2023
c19f095
Fix test_virulencefinder.py file spelling
ryanjameskennedy Dec 29, 2023
3d39291
Add pytest to pylint.yml GA
ryanjameskennedy Dec 29, 2023
c4f94d5
Simple pylint.yml GA fix
ryanjameskennedy Dec 29, 2023
076fbef
Pylint fixes
ryanjameskennedy Dec 29, 2023
510165a
More pylint fixes
ryanjameskennedy Dec 29, 2023
012048f
Add docstrings to test functions
ryanjameskennedy Jan 2, 2024
4b77020
Fix data_path error
ryanjameskennedy Jan 2, 2024
56a9da3
Fix data_fpath error
ryanjameskennedy Jan 2, 2024
5eec581
Update CHANGELOG.md
ryanjameskennedy Jan 2, 2024
b90fb4d
Fix parsers
ryanjameskennedy Jan 3, 2024
023c385
Remove genes from VariantBase
ryanjameskennedy Jan 3, 2024
aee230a
Remove _parse_mykrobe_amr_genes
ryanjameskennedy Jan 3, 2024
6def2ee
Review fixes regarding mykrobe variant parser
ryanjameskennedy Jan 3, 2024
84f1712
Restored unreleased version
mhkc Jan 4, 2024
7e07701
minor refactoring
mhkc Jan 4, 2024
2337a3d
Fix conflicts
ryanjameskennedy Jan 4, 2024
4850c86
Update models to handle freq, kmer counts & conf
ryanjameskennedy Jan 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install pylint
pip install -e .
pip install pytest
pip install -e .[dev]
- name: Analysing the code with pylint
run: |
pylint --fail-under 9 $(git ls-files '*.py')
6 changes: 5 additions & 1 deletion CHANGELOG.md
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this PR is also intended to be prepared for release, we should also add the following section:

## [Unreleased]

### Added

### Fixed

### Changed

Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
## [Unreleased]
## [0.3.0]

### Added

- Pytest for Mycobacterium tuberculosis

### Fixed

### Changed

- Mykrobe output parser handles csv format instead of json

## [0.2.0]

### Added
Expand Down
14 changes: 9 additions & 5 deletions prp/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import logging
from typing import List
import pandas as pd

import click
from pydantic import TypeAdapter, ValidationError
Expand Down Expand Up @@ -188,10 +189,13 @@ def create_bonsai_input(
# mykrobe
if mykrobe:
LOG.info("Parse mykrobe results")
pred_res = json.load(mykrobe)
pred_res = pd.read_csv(mykrobe, quotechar='"')
pred_res.columns.values[3] = "variants"
pred_res.columns.values[4] = "genes"
pred_res = pred_res.to_dict(orient="records")

# verify that sample id is in prediction result
if not sample_id in pred_res:
if not sample_id in pred_res[0]["sample"]:
LOG.warning(
"Sample id %s is not in Mykrobe result, possible sample mixup",
sample_id,
Expand All @@ -202,17 +206,17 @@ def create_bonsai_input(
results["run_metadata"]["databases"].append(
SoupVersion(
name="mykrobe-predictor",
version=pred_res[sample_id]["version"]["mykrobe-predictor"],
version=pred_res[0]["mykrobe_version"],
type=SoupType.DB,
)
)
# parse mykrobe result
amr_res = parse_mykrobe_amr_pred(pred_res[sample_id], ElementType.AMR)
amr_res = parse_mykrobe_amr_pred(pred_res, ElementType.AMR)
if amr_res is not None:
results["element_type_result"].append(amr_res)

lin_res: MethodIndex = parse_mykrobe_lineage_results(
pred_res[sample_id], TypingMethod.LINEAGE
pred_res, TypingMethod.LINEAGE
)
results["typing_result"].append(lin_res)

Expand Down
9 changes: 5 additions & 4 deletions prp/models/phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from enum import Enum
from typing import Dict, List, Optional, Union

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, Field

from .base import RWModel

Expand Down Expand Up @@ -90,6 +90,7 @@ class GeneBase(BaseModel):
coverage: Optional[float] = None
ref_start_pos: Optional[int] = None
ref_end_pos: Optional[int] = None
drugs: Optional[List[Union[Dict,str]]] = None
ref_gene_length: Optional[int] = Field(
default=None,
alias="target_length",
Expand Down Expand Up @@ -147,8 +148,8 @@ class VariantBase(DatabaseReference):
position: int
ref_nt: str
alt_nt: str
ref_aa: str
alt_aa: str
ref_aa: Optional[str] = None
alt_aa: Optional[str] = None
# prediction info
depth: Optional[float] = None
contig_id: Optional[str] = None
Expand All @@ -175,7 +176,7 @@ class VariantBase(DatabaseReference):
nucleotide_change: Optional[str] = None
protein_change: Optional[str] = None
annotation: Optional[List[Dict]] = None
drugs: Optional[List[Dict]] = None
drugs: Optional[List[Union[Dict,str]]] = None


class ResistanceVariant(VariantBase):
Expand Down
2 changes: 1 addition & 1 deletion prp/models/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class LineageInformation(RWModel):
rd: str | None = None
fraction: float | None = None
variant: str | None = None
coverage: Dict[str, Any] = None
coverage: Dict[str, Any] | None = None


class ResultMlstBase(RWModel):
Expand Down
108 changes: 58 additions & 50 deletions prp/parse/phenotype/mykrobe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ...models.phenotype import PredictionSoftware as Software
from ...models.phenotype import ResistanceGene, ResistanceVariant, VariantType
from ...models.sample import MethodIndex
from .utils import is_prediction_result_empty
from .utils import is_prediction_result_empty, _default_amr_phenotype

LOG = logging.getLogger(__name__)

Expand All @@ -21,10 +21,10 @@ def _get_mykrobe_amr_sr_profie(mykrobe_result):
return {}

for element_type in mykrobe_result:
if mykrobe_result[element_type]["predict"].upper() == "R":
resistant.add(element_type)
if element_type["susceptibility"].upper() == "R":
resistant.add(element_type["drug"])
else:
susceptible.add(element_type)
susceptible.add(element_type["drug"])
return {"susceptible": list(susceptible), "resistant": list(resistant)}


Expand All @@ -33,22 +33,28 @@ def _parse_mykrobe_amr_genes(mykrobe_result) -> Tuple[ResistanceGene, ...]:
results = []
for element_type in mykrobe_result:
# skip non-resistance yielding
if not mykrobe_result[element_type]["predict"].upper() == "R":
if not element_type["susceptibility"].upper() == "R":
continue

hits = mykrobe_result[element_type]["called_by"]
for hit_name, hit in hits.items():
gene = ResistanceGene(
gene_symbol=hit_name.split("_")[0],
accession=None,
depth=hit["info"]["coverage"]["alternate"]["median_depth"],
identity=None,
coverage=hit["info"]["coverage"]["alternate"]["percent_coverage"],
phenotypes=[element_type.lower()],
element_type=ElementType.AMR,
element_subtype=ElementAmrSubtype.AMR,
)
results.append(gene)
try:
depth = float(element_type["genes"].split(':')[-1])
coverage = float(element_type["genes"].split(':')[-2])
except AttributeError:
depth = None
coverage = None

gene = ResistanceGene(
gene_symbol=element_type["variants"].split("_")[0],
accession=None,
depth=depth,
identity=None,
coverage=coverage,
drugs=[element_type["drug"].lower()],
phenotypes=[_default_amr_phenotype()],
element_type=ElementType.AMR,
element_subtype=ElementAmrSubtype.AMR,
)
results.append(gene)
return results


Expand Down Expand Up @@ -90,36 +96,39 @@ def _parse_mykrobe_amr_variants(mykrobe_result) -> Tuple[ResistanceVariant, ...]

for element_type in mykrobe_result:
# skip non-resistance yielding
if not mykrobe_result[element_type]["predict"].upper() == "R":
if not element_type["susceptibility"].upper() == "R":
continue

if element_type["variants"] is not None:
continue

hits = mykrobe_result[element_type]["called_by"]
for hit in hits:
if hits[hit]["variant"] is not None:
continue

var_info = hit.split("-")[1]
_, ref_nt, alt_nt, position = get_mutation_type(var_info)
var_nom = hit.split("-")[0].split("_")[1]
var_type, *_ = get_mutation_type(var_nom)
variant = ResistanceVariant(
variant_type=var_type,
genes=[hit.split("_")[0]],
phenotypes=[element_type],
position=position,
ref_nt=ref_nt,
alt_nt=alt_nt,
depth=hits[hit]["info"]["coverage"]["alternate"]["median_depth"],
ref_database=None,
ref_id=None,
type=None,
change=var_nom,
nucleotide_change=None,
protein_change=None,
annotation=None,
drugs=None,
)
results.append(variant)
try:
ryanjameskennedy marked this conversation as resolved.
Show resolved Hide resolved
depth = float(element_type["genes"].split(':')[-1])
ryanjameskennedy marked this conversation as resolved.
Show resolved Hide resolved
except AttributeError:
depth = None

var_info = element_type["variants"].split("-")[1]
_, ref_nt, alt_nt, position = get_mutation_type(var_info)
mhkc marked this conversation as resolved.
Show resolved Hide resolved
var_nom = element_type["variants"].split("-")[0].split("_")[1]
var_type, *_ = get_mutation_type(var_nom)
variant = ResistanceVariant(
mhkc marked this conversation as resolved.
Show resolved Hide resolved
variant_type=var_type,
genes=[element_type["variants"].split("_")[0]],
ryanjameskennedy marked this conversation as resolved.
Show resolved Hide resolved
phenotypes=[_default_amr_phenotype()],
position=position,
ref_nt=ref_nt,
alt_nt=alt_nt,
depth=depth,
ref_database=None,
ref_id=None,
type=None,
change=var_nom,
nucleotide_change=None,
protein_change=None,
annotation=None,
drugs=[element_type["drug"].lower()],
)
results.append(variant)
return results


Expand All @@ -128,11 +137,10 @@ def parse_mykrobe_amr_pred(
) -> ElementTypeResult | None:
"""Parse mykrobe resistance prediction results."""
LOG.info("Parsing mykrobe prediction")
pred = prediction["susceptibility"]
resistance = ElementTypeResult(
phenotypes=_get_mykrobe_amr_sr_profie(pred),
genes=_parse_mykrobe_amr_genes(pred),
mutations=_parse_mykrobe_amr_variants(pred),
phenotypes=_get_mykrobe_amr_sr_profie(prediction),
genes=_parse_mykrobe_amr_genes(prediction),
ryanjameskennedy marked this conversation as resolved.
Show resolved Hide resolved
mutations=_parse_mykrobe_amr_variants(prediction),
)

# verify prediction result
Expand Down
5 changes: 3 additions & 2 deletions prp/parse/phenotype/tbprofiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ...models.phenotype import PredictionSoftware as Software
from ...models.phenotype import ResistanceVariant
from ...models.sample import MethodIndex
from .utils import _default_variant
from .utils import _default_variant, _default_amr_phenotype

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,10 +50,11 @@ def _parse_tbprofiler_amr_variants(tbprofiler_result) -> Tuple[ResistanceVariant

for hit in tbprofiler_result["dr_variants"]:
var_type = "substitution"

variant = ResistanceVariant(
variant_type=var_type,
genes=[hit["gene"]],
phenotypes=hit["gene_associated_drugs"],
phenotypes=[_default_amr_phenotype()],
position=int(hit["genome_pos"]),
ref_nt=hit["ref"],
alt_nt=hit["alt"],
Expand Down
8 changes: 8 additions & 0 deletions prp/parse/phenotype/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Shared utility functions."""
from ...models.phenotype import ElementTypeResult, ResistanceGene
from ...models.phenotype import ElementType, PhenotypeInfo


def _default_resistance() -> ElementTypeResult:
Expand Down Expand Up @@ -49,6 +50,13 @@ def _default_variant() -> ElementTypeResult:
mutations = [mutation]
return ElementTypeResult(phenotypes=[], genes=[], mutations=mutations)

def _default_amr_phenotype() -> PhenotypeInfo:
    """Build the default AMR phenotype entry.

    :return: A PhenotypeInfo whose type, group and name are all ElementType.AMR.
    :rtype: PhenotypeInfo
    """
    # The same enum member fills all three fields, so bind it once.
    amr = ElementType.AMR
    return PhenotypeInfo(type=amr, group=amr, name=amr)


def is_prediction_result_empty(result: ElementTypeResult) -> bool:
"""Check if prediction result is emtpy.
Expand Down
5 changes: 3 additions & 2 deletions prp/parse/phenotype/virulencefinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
def parse_vir_gene(
info: Dict[str, Any], subtype: ElementVirulenceSubtype = ElementVirulenceSubtype.VIR
) -> VirulenceGene:
"""Parse virulence gene prediction results."""
start_pos, end_pos = map(int, info["position_in_ref"].split(".."))
# Some genes don't have accession numbers
accnr = None if info["accession"] == "NA" else info["accession"]
Expand All @@ -35,7 +36,7 @@ def parse_vir_gene(


def _parse_virulencefinder_vir_results(pred: str) -> ElementTypeResult:
"""Parse virulence prediction results from ARIBA."""
"""Parse virulence prediction results from virulencefinder."""
# parse virulence finder results
species = list(k for k in pred["virulencefinder"]["results"])
vir_genes = []
Expand Down Expand Up @@ -66,7 +67,7 @@ def parse_virulencefinder_vir_pred(path: str) -> ElementTypeResult | None:
:rtype: ElementTypeResult | None
"""
LOG.info("Parsing virulencefinder virulence prediction")
with open(path) as inpt:
with open(path, 'rb') as inpt:
pred = json.load(inpt)
if "virulencefinder" in pred:
results: ElementTypeResult = _parse_virulencefinder_vir_results(pred)
Expand Down
Loading
Loading