Merge pull request #84 from Clinical-Genomics-Lund/update-sample-datamodel

Update how metadata is stored in the bonsai input
ryanjameskennedy authored Aug 28, 2024
2 parents 95ae98e + 3186b2c commit be622d8
Showing 13 changed files with 249 additions and 111 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -5,11 +5,19 @@
- Added flag to set verbosity level.
- Validate TbProfiler schema version.
- Added CLI command for adding IGV annotation tracks
- Added `indel_variants` to json result and ability to filter vcf into various categories

### Fixed

- Fixed genome_annotation appending bug
- Fixed variant sorting as `reference_sequence` can be None

### Changed

- Updated the sample metadata in the output format: `RunMetadata` was split into two fields, one for sequencing information and one for pipeline information. The LIMS ID and sample name are now included in the `SampleBase` object (see the sketch after this list).
- Split SV and SNV from tbprofiler output
- Update annotate_delly to extract SVs from vcf
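
A rough before/after sketch of the layout change described in the entry above, assembled from the `results` dict built in `prp/cli.py` later in this diff; field values are invented placeholders, not taken from a real sample.

```python
# Illustrative only: structure follows the diff in prp/cli.py, values are made up.

# Before: sequencing and pipeline details were nested under "run_metadata".
old_layout = {
    "run_metadata": {
        "run": {},        # RunInformation (sample_name, lims_id, sequencing_run, ...)
        "databases": [],  # SoupVersion entries
    },
    "qc": [],
    "typing_result": [],
    "element_type_result": [],
}

# After: metadata is split into "sequencing" and "pipeline", the software/database
# versions live in pipeline.softwares, and sample_name / lims_id move to the top
# level alongside the other SampleBase fields.
new_layout = {
    "sample_name": "sample-001",  # hypothetical value
    "lims_id": "24LIMS0001",      # hypothetical value
    "sequencing": {},             # SequencingInfo
    "pipeline": {},               # PipelineInfo (includes the softwares list)
    "qc": [],
    "typing_result": [],
    "element_type_result": [],
}
```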

## [0.9.3]

### Added
19 changes: 17 additions & 2 deletions README.md
@@ -15,14 +15,14 @@
prp --help
```

### Use the method help argument for information regarding the input for each of prp's methods (`create-bonsai-input`, `create-cdm-input`, `create-qc-result`, `print-schema`, `validate`)
### Use the method help argument for information regarding the input for each of prp's methods (`add-igv-annotation-track`, `annotate-delly`, `create-bonsai-input`, `create-cdm-input`, `create-qc-result`, `print-schema`, `rerun-bonsai-input`, `validate`)
```
prp <method> --help
```

### Create bonsai input from pipeline data
```
prp create-bonsai-input -i SAMPLE_ID -u RUN_METADATA_FILE -q QUAST_FILENAME -d PROCESS_METADATA_FILE -k KRAKEN_FILE -a AMRFINDER_FILE -m MLST_FILE -c CGMLST_FILE -v VIRULENCEFINDER_FILE -r RESFINDER_FILE -p POSTALIGNQC_FILE -k MYKROBE_FILE -t TBPROFILER_FILE [--correct_alleles] -o OUTPUT_FILE [-h]
prp create-bonsai-input -i SAMPLE_ID -u RUN_METADATA_FILE -q QUAST_FILENAME -d PROCESS_METADATA_FILE -k KRAKEN_FILE -a AMRFINDER_FILE -m MLST_FILE -c CGMLST_FILE -v VIRULENCEFINDER_FILE -r RESFINDER_FILE -p POSTALIGNQC_FILE -k MYKROBE_FILE -t TBPROFILER_FILE --vcf VCF_FILE [--snv-vcf SNV_VCF_FILE] [--sv-vcf SV_VCF_FILE] [--symlink-dir SYMLINK_DIR] [--correct_alleles] -o OUTPUT_FILE [-h]
```

### Create CDM input from pipeline data
@@ -34,3 +34,18 @@ prp create-cdm-input -q QUAST_FILENAME -c CGMLST_FILE -p POSTALIGNQC_FILE [--cor
```
prp create-qc-result -i SAMPLE_ID -b BAM_FILE [-e BED_FILE] [-a BAITS_FILE] -r REFERENCE_FILE [-c CPUS] -o OUTPUT_FILE [-h]
```

### Rerun bonsai input creation for all samples
```
prp rerun-bonsai-input -i INPUT_DIR -j JASEN_DIR -s SYMLINK_DIR -o OUTPUT_DIR -o OUTPUT_FILE [-h]
```

### Add IGV annotation track to result
```
prp add-igv-annotation-track -n TRACK_NAME -a ANNOTATION_FILE -b BONSAI_INPUT_FILE -o OUTPUT_FILE [-h]
```

### Validate output format of result json file
```
prp validate -o OUTPUT_FILE [-h]
```
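
The `validate` command's implementation is not part of this diff; below is a hedged sketch of the equivalent check done directly in Python with the repo's `PipelineResult` model, mirroring how `create-bonsai-input` and `add-igv-annotation-track` load and validate results in `prp/cli.py` further down. The import path follows the file locations in this diff and the file name is a placeholder.

```python
import json

from pydantic import ValidationError

from prp.models.sample import PipelineResult  # output schema shown later in this diff


def check_result_file(path: str) -> bool:
    """Return True if the result JSON conforms to the PipelineResult schema."""
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
    try:
        PipelineResult(**data)
    except ValidationError as err:
        print(err)
        return False
    return True


if __name__ == "__main__":
    print(check_result_file("sample-001.bonsai.json"))  # hypothetical file name
```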
69 changes: 41 additions & 28 deletions prp/cli.py
@@ -131,6 +131,7 @@ def cli(silent, debug):
multiple=True,
help="Genome annotations bed format",
)
@click.option("--vcf", type=click.Path(), help="VCF filepath")
@click.option("--snv-vcf", type=click.Path(), help="VCF with SNV variants")
@click.option("--sv-vcf", type=click.Path(), help="VCF with SV variants")
@click.option("--symlink-dir", type=click.Path(), help="Dir for symlink")
@@ -158,6 +159,7 @@ def create_bonsai_input(
reference_genome_fasta,
reference_genome_gff,
genome_annotation,
vcf,
snv_vcf,
sv_vcf,
symlink_dir,
@@ -166,14 +168,15 @@
): # pylint: disable=too-many-arguments
"""Combine pipeline results into a standardized json output file."""
LOG.info("Start generating pipeline result json")
# Get basic sample object
sample_info, seq_info, pipeline_info = parse_run_info(run_metadata, process_metadata)
results = {
"run_metadata": {
"run": parse_run_info(run_metadata),
"databases": get_database_info(process_metadata),
},
"sequencing": seq_info,
"pipeline": pipeline_info,
"qc": [],
"typing_result": [],
"element_type_result": [],
**sample_info # add sample_name & lims_id
}
# qc
if quast:
@@ -274,8 +277,8 @@ def create_bonsai_input(
)
raise click.Abort()

# add mykrobe db version
results["run_metadata"]["databases"].append(
# add mykrobe db version to the list of softwares
results["pipeline"].softwares.append(
SoupVersion(
name="mykrobe-predictor",
version=pred_res[0]["mykrobe_version"],
@@ -316,18 +319,21 @@ def create_bonsai_input(
type=SoupType.DB,
)
]
results["run_metadata"]["databases"].extend(db_info)
results["pipeline"].softwares.extend(db_info)
lin_res: MethodIndex = parse_tbprofiler_lineage_results(pred_res)
results["typing_result"].append(lin_res)
amr_res: MethodIndex = parse_tbprofiler_amr_pred(pred_res)
results["element_type_result"].append(amr_res)

# parse SNV and SV variants.
if snv_vcf:
results["snv_variants"] = load_variants(snv_vcf)
results["snv_variants"] = load_variants(snv_vcf)["snv_variants"]

if sv_vcf:
results["sv_variants"] = load_variants(sv_vcf)
results["sv_variants"] = load_variants(sv_vcf)["sv_variants"]

if vcf:
results.update(load_variants(vcf))

# entries for reference genome and read mapping
if all([bam, reference_genome_fasta, reference_genome_gff]):
@@ -348,11 +354,12 @@ def create_bonsai_input(
{"name": f"annotation_{i}", "file": Path(annot).name}
for i, annot in enumerate(genome_annotation, start=1)
]
for vcf in [sv_vcf, snv_vcf]:
if vcf:
vcf = _get_path(symlink_dir, "vcf", vcf)
name = "SNV" if vcf == sv_vcf else "SV"
annotations.append({"name": name, "file": vcf})
vcf_dict = {"SV": sv_vcf, "SNV": snv_vcf, "VCF": vcf}
for name in vcf_dict:
vcf_filepath = vcf_dict[name]
if vcf_filepath:
vcf_filepath = _get_path(symlink_dir, "vcf", vcf_filepath)
annotations.append({"name": name, "file": vcf_filepath})
# store annotation results
results["genome_annotation"] = annotations if annotations else None

@@ -361,7 +368,7 @@ def create_bonsai_input(
sample_id=sample_id, schema_version=OUTPUT_SCHEMA_VERSION, **results
)
except ValidationError as err:
click.secho("Input failed Validation", fg="red")
click.secho("Generated result failed validation", fg="red")
click.secho(err)
raise click.Abort
LOG.info("Storing results to: %s", output)
@@ -541,42 +548,48 @@ def annotate_delly(vcf, bed, output):
writer = Writer(output.absolute(), vcf_obj)
annotate_delly_variants(writer, vcf_obj, annotation, annot_chrom=annot_chrom)

click.secho(f"Wrote annotated delly variants to {output}", fg="green")
click.secho(f"Wrote annotated delly variants to {output.name}", fg="green")


@cli.command()
@click.option("-n", "--name", type=str, help="Track name.")
@click.option("-n", "--track-name", type=str, help="Track name.")
@click.option(
"-a", "--annotation-file", type=click.Path(exists=True), help="Path to file."
)
@click.option(
"-r",
"--result",
"-b",
"--bonsai-input-file",
required=True,
type=click.Path(writable=True),
help="PRP result.",
help="PRP result file (used as bonsai input).",
)
@click.option(
"-o",
"--output",
required=True,
type=click.File("w"),
help="output filepath",
)
@click.argument("output", type=click.File("w"))
def add_igv_annotation_track(name, annotation_file, result, output):
"""Add IGV annotation track to result."""
with open(result, "r", encoding="utf-8") as jfile:
def add_igv_annotation_track(track_name, annotation_file, bonsai_input_file, output):
"""Add IGV annotation track to result (bonsai input file)."""
with open(bonsai_input_file, "r", encoding="utf-8") as jfile:
result_obj = PipelineResult(**json.load(jfile))

# Get genome annotation
if result_obj.genome_annotation is None or isinstance(
if not isinstance(
result_obj.genome_annotation, list
):
track_info = []
else:
track_info = result.genome_annotation
track_info = result_obj.genome_annotation

# add new tracks
track_info.append({"name": name, "file": annotation_file})
track_info.append({"name": track_name, "file": annotation_file})

# update data model
upd_result = result_obj.model_copy(update={"genome_annotation": track_info})

# overwrite result
output.write(upd_result.model_dump_json(indent=3))

click.secho(f"Wrote updated result to {output}", fg="green")
click.secho(f"Wrote updated result to {output.name}", fg="green")
43 changes: 17 additions & 26 deletions prp/models/metadata.py
@@ -1,6 +1,7 @@
"""Metadata models."""
from datetime import datetime
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field

@@ -22,35 +23,25 @@ class SoupVersion(BaseModel):
type: SoupType


class RunInformation(RWModel):
"""Store information on a run how the run was conducted."""
class SequencingInfo(RWModel):
"""Information on the sample was sequenced."""

run_id: str
platform: str
instrument: Optional[str]
method: dict[str, str] = {}
date: datetime | None


class PipelineInfo(RWModel):
"""Information on the sample was analysed."""

pipeline: str
version: str
commit: str
analysis_profile: str = Field(
...,
alias="analysisProfile",
description="The analysis profile used when starting the pipeline",
)
configuration_files: list[str] = Field(
..., alias="configurationFiles", description="Nextflow configuration used"
)
analysis_profile: str
configuration_files: list[str]
workflow_name: str
sample_name: str
lims_id: str
sequencing_run: str
sequencing_platform: str
sequencing_type: str
command: str
date: datetime


SoupVersions = list[SoupVersion]


class RunMetadata(BaseModel):
"""Run metadata"""

run: RunInformation
databases: SoupVersions
softwares: list[SoupVersion]
date: datetime
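
A hedged example of filling in the two new metadata models. The field names follow the new lines above (the interleaved old/new lines make the exact final field set slightly ambiguous), and every value is an invented placeholder.

```python
from datetime import datetime

from prp.models.metadata import PipelineInfo, SequencingInfo, SoupType, SoupVersion

# All values are placeholders; the required fields may differ slightly from this sketch.
sequencing = SequencingInfo(
    run_id="240828_NB501234_0001_AHXYZ",
    platform="illumina",
    instrument="NextSeq 500",
    method={"library_prep": "Nextera"},
    date=datetime(2024, 8, 28),
)

pipeline = PipelineInfo(
    pipeline="jasen",
    version="1.0.0",
    commit="abc1234",
    analysis_profile="mycobacterium_tuberculosis",
    configuration_files=["nextflow.config"],
    workflow_name="peaceful_pasteur",
    command="nextflow run main.nf -profile mycobacterium_tuberculosis",
    softwares=[SoupVersion(name="tbdb", version="2023-08-01", type=SoupType.DB)],
    date=datetime(2024, 8, 28),
)
```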
5 changes: 3 additions & 2 deletions prp/models/phenotype.py
@@ -31,6 +31,7 @@ class VariantType(str, Enum):
SNV = "SNV"
MNV = "MNV"
SV = "SV"
INDEL = "INDEL"
STR = "STR"


@@ -215,7 +216,7 @@ class VariantBase(RWModel):
phenotypes: list[PhenotypeInfo] = []

# variant location
reference_sequence: str = Field(
reference_sequence: str | None = Field(
...,
description="Reference sequence such as chromosome, gene or contig id.",
alias="gene_symbol",
@@ -251,7 +252,7 @@ class MykrobeVariant(VariantBase):
class TbProfilerVariant(VariantBase):
"""Container for TbProfiler variant information"""

variant_effect: str
variant_effect: str | None = None
hgvs_nt_change: Optional[str] = Field(..., description="DNA change in HGVS format")
hgvs_aa_change: Optional[str] = Field(
..., description="Protein change in HGVS format"
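
The changelog notes a fix to variant sorting now that `reference_sequence` may be None. The sorting code is not part of this excerpt, so the snippet below is only a sketch of the kind of None-safe sort key such a fix implies, using plain dicts instead of the `VariantBase` model.

```python
# Sketch only: the actual sorting fix lives elsewhere in prp and is not shown in this diff.
variants = [
    {"reference_sequence": "rpoB", "start": 761155},
    {"reference_sequence": None, "start": 4247429},  # reference_sequence may now be None
    {"reference_sequence": "katG", "start": 2155168},
]

# Sort with None grouped last instead of raising TypeError on a str/None comparison.
variants.sort(
    key=lambda v: (v["reference_sequence"] is None, v["reference_sequence"] or "", v["start"])
)

for variant in variants:
    print(variant["reference_sequence"], variant["start"])
```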
14 changes: 12 additions & 2 deletions prp/models/sample.py
@@ -5,7 +5,7 @@
from pydantic import Field

from .base import RWModel
from .metadata import RunMetadata
from .metadata import SequencingInfo, PipelineInfo
from .phenotype import (
AMRMethodIndex,
StressMethodIndex,
@@ -44,8 +44,17 @@ class SampleBase(RWModel):
"""Base datamodel for sample data structure"""

sample_id: str = Field(..., alias="sampleId", min_length=3, max_length=100)
run_metadata: RunMetadata = Field(..., alias="runMetadata")
sample_name: str
lims_id: str

# metadata
sequencing: SequencingInfo
pipeline: PipelineInfo

# quality
qc: list[QcMethodIndex] = Field(...)

# species identification
species_prediction: list[SppMethodIndex] = Field(..., alias="speciesPrediction")


@@ -81,6 +90,7 @@ class PipelineResult(SampleBase):
# optional variant info
snv_variants: Optional[list[VariantBase]] = None
sv_variants: Optional[list[VariantBase]] = None
indel_variants: Optional[list[VariantBase]] = None
# optional alignment info
reference_genome: Optional[ReferenceGenome] = None
read_mapping: Optional[str] = None