Merge pull request #84 from Clinical-Genomics-Lund/update-sample-datamodel

Update how metadata is stored in the bonsai input
ryanjameskennedy authored Aug 28, 2024
2 parents 95ae98e + 3186b2c commit be622d8
Showing 13 changed files with 249 additions and 111 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -5,11 +5,19 @@
- Added flag to set verbosity level.
- Validate TbProfiler schema version.
- Added CLI command for adding IGV annotation tracks
- Added `indel_variants` to json result and ability to filter vcf into various categories

### Fixed

- Fixed genome_annotation appending bug
- Fixed variant sorting as `reference_sequence` can be None

### Changed

- Updated the sample metadata in the output format: `RunMetadata` was split into two fields, one for sequencing information and one for pipeline information. The LIMS ID and sample name are now included in the `SampleBase` object (see the sketch after this list).
- Split SV and SNV from tbprofiler output
- Update annotate_delly to extract SVs from vcf
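
A rough before/after sketch of the layout change described in the entry above, assembled from the `results` dict built in `prp/cli.py` later in this diff; field values are invented placeholders, not taken from a real sample.

```python
# Illustrative only: structure follows the diff in prp/cli.py, values are made up.

# Before: sequencing and pipeline details were nested under "run_metadata".
old_layout = {
    "run_metadata": {
        "run": {},        # RunInformation (sample_name, lims_id, sequencing_run, ...)
        "databases": [],  # SoupVersion entries
    },
    "qc": [],
    "typing_result": [],
    "element_type_result": [],
}

# After: metadata is split into "sequencing" and "pipeline", the software/database
# versions live in pipeline.softwares, and sample_name / lims_id move to the top
# level alongside the other SampleBase fields.
new_layout = {
    "sample_name": "sample-001",  # hypothetical value
    "lims_id": "24LIMS0001",      # hypothetical value
    "sequencing": {},             # SequencingInfo
    "pipeline": {},               # PipelineInfo (includes the softwares list)
    "qc": [],
    "typing_result": [],
    "element_type_result": [],
}
```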

## [0.9.3]

### Added
19 changes: 17 additions & 2 deletions README.md
@@ -15,14 +15,14 @@
prp --help
```

### Use the method help argument for information regarding the input for each of prp's methods (`create-bonsai-input`, `create-cdm-input`, `create-qc-result`, `print-schema`, `validate`)
### Use the method help argument for information regarding the input for each of prp's methods (`add-igv-annotation-track`, `annotate-delly`, `create-bonsai-input`, `create-cdm-input`, `create-qc-result`, `print-schema`, `rerun-bonsai-input`, `validate`)
```
prp <method> --help
```

### Create bonsai input from pipeline data
```
prp create-bonsai-input -i SAMPLE_ID -u RUN_METADATA_FILE -q QUAST_FILENAME -d PROCESS_METADATA_FILE -k KRAKEN_FILE -a AMRFINDER_FILE -m MLST_FILE -c CGMLST_FILE -v VIRULENCEFINDER_FILE -r RESFINDER_FILE -p POSTALIGNQC_FILE -k MYKROBE_FILE -t TBPROFILER_FILE [--correct_alleles] -o OUTPUT_FILE [-h]
prp create-bonsai-input -i SAMPLE_ID -u RUN_METADATA_FILE -q QUAST_FILENAME -d PROCESS_METADATA_FILE -k KRAKEN_FILE -a AMRFINDER_FILE -m MLST_FILE -c CGMLST_FILE -v VIRULENCEFINDER_FILE -r RESFINDER_FILE -p POSTALIGNQC_FILE -k MYKROBE_FILE -t TBPROFILER_FILE --vcf VCF_FILE [--snv-vcf SNV_VCF_FILE] [--sv-vcf SV_VCF_FILE] [--symlink-dir SYMLINK_DIR] [--correct_alleles] -o OUTPUT_FILE [-h]
```

### Create CDM input from pipeline data
@@ -34,3 +34,18 @@ prp create-cdm-input -q QUAST_FILENAME -c CGMLST_FILE -p POSTALIGNQC_FILE [--cor
```
prp create-qc-result -i SAMPLE_ID -b BAM_FILE [-e BED_FILE] [-a BAITS_FILE] -r REFERENCE_FILE [-c CPUS] -o OUTPUT_FILE [-h]
```

### Rerun bonsai input creation for all samples
```
prp rerun-bonsai-input -i INPUT_DIR -j JASEN_DIR -s SYMLINK_DIR -o OUTPUT_DIR -o OUTPUT_FILE [-h]
```

### Add IGV annotation track to result
```
prp add-igv-annotation-track -n TRACK_NAME -a ANNOTATION_FILE -b BONSAI_INPUT_FILE -o OUTPUT_FILE [-h]
```

### Validate output format of result json file
```
prp validate -o OUTPUT_FILE [-h]
```
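
The `validate` command's implementation is not part of this diff; below is a hedged sketch of the equivalent check done directly in Python with the repo's `PipelineResult` model, mirroring how `create-bonsai-input` and `add-igv-annotation-track` load and validate results in `prp/cli.py` further down. The import path follows the file locations in this diff and the file name is a placeholder.

```python
import json

from pydantic import ValidationError

from prp.models.sample import PipelineResult  # output schema shown later in this diff


def check_result_file(path: str) -> bool:
    """Return True if the result JSON conforms to the PipelineResult schema."""
    with open(path, "r", encoding="utf-8") as handle:
        data = json.load(handle)
    try:
        PipelineResult(**data)
    except ValidationError as err:
        print(err)
        return False
    return True


if __name__ == "__main__":
    print(check_result_file("sample-001.bonsai.json"))  # hypothetical file name
```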
69 changes: 41 additions & 28 deletions prp/cli.py
@@ -131,6 +131,7 @@ def cli(silent, debug):
multiple=True,
help="Genome annotations bed format",
)
@click.option("--vcf", type=click.Path(), help="VCF filepath")
@click.option("--snv-vcf", type=click.Path(), help="VCF with SNV variants")
@click.option("--sv-vcf", type=click.Path(), help="VCF with SV variants")
@click.option("--symlink-dir", type=click.Path(), help="Dir for symlink")
@@ -158,6 +159,7 @@ def create_bonsai_input(
reference_genome_fasta,
reference_genome_gff,
genome_annotation,
vcf,
snv_vcf,
sv_vcf,
symlink_dir,
@@ -166,14 +168,15 @@
): # pylint: disable=too-many-arguments
"""Combine pipeline results into a standardized json output file."""
LOG.info("Start generating pipeline result json")
# Get basic sample object
sample_info, seq_info, pipeline_info = parse_run_info(run_metadata, process_metadata)
results = {
"run_metadata": {
"run": parse_run_info(run_metadata),
"databases": get_database_info(process_metadata),
},
"sequencing": seq_info,
"pipeline": pipeline_info,
"qc": [],
"typing_result": [],
"element_type_result": [],
**sample_info # add sample_name & lims_id
}
# qc
if quast:
@@ -274,8 +277,8 @@ def create_bonsai_input(
)
raise click.Abort()

# add mykrobe db version
results["run_metadata"]["databases"].append(
# add mykrobe db version to the list of softwares
results["pipeline"].softwares.append(
SoupVersion(
name="mykrobe-predictor",
version=pred_res[0]["mykrobe_version"],
@@ -316,18 +319,21 @@ def create_bonsai_input(
type=SoupType.DB,
)
]
results["run_metadata"]["databases"].extend(db_info)
results["pipeline"].softwares.extend(db_info)
lin_res: MethodIndex = parse_tbprofiler_lineage_results(pred_res)
results["typing_result"].append(lin_res)
amr_res: MethodIndex = parse_tbprofiler_amr_pred(pred_res)
results["element_type_result"].append(amr_res)

# parse SNV and SV variants.
if snv_vcf:
results["snv_variants"] = load_variants(snv_vcf)
results["snv_variants"] = load_variants(snv_vcf)["snv_variants"]

if sv_vcf:
results["sv_variants"] = load_variants(sv_vcf)
results["sv_variants"] = load_variants(sv_vcf)["sv_variants"]

if vcf:
results.update(load_variants(vcf))

# entries for reference genome and read mapping
if all([bam, reference_genome_fasta, reference_genome_gff]):
@@ -348,11 +354,12 @@ def create_bonsai_input(
{"name": f"annotation_{i}", "file": Path(annot).name}
for i, annot in enumerate(genome_annotation, start=1)
]
for vcf in [sv_vcf, snv_vcf]:
if vcf:
vcf = _get_path(symlink_dir, "vcf", vcf)
name = "SNV" if vcf == sv_vcf else "SV"
annotations.append({"name": name, "file": vcf})
vcf_dict = {"SV": sv_vcf, "SNV": snv_vcf, "VCF": vcf}
for name in vcf_dict:
vcf_filepath = vcf_dict[name]
if vcf_filepath:
vcf_filepath = _get_path(symlink_dir, "vcf", vcf_filepath)
annotations.append({"name": name, "file": vcf_filepath})
# store annotation results
results["genome_annotation"] = annotations if annotations else None

@@ -361,7 +368,7 @@ def create_bonsai_input(
sample_id=sample_id, schema_version=OUTPUT_SCHEMA_VERSION, **results
)
except ValidationError as err:
click.secho("Input failed Validation", fg="red")
click.secho("Generated result failed validation", fg="red")
click.secho(err)
raise click.Abort
LOG.info("Storing results to: %s", output)
@@ -541,42 +548,48 @@ def annotate_delly(vcf, bed, output):
writer = Writer(output.absolute(), vcf_obj)
annotate_delly_variants(writer, vcf_obj, annotation, annot_chrom=annot_chrom)

click.secho(f"Wrote annotated delly variants to {output}", fg="green")
click.secho(f"Wrote annotated delly variants to {output.name}", fg="green")


@cli.command()
@click.option("-n", "--name", type=str, help="Track name.")
@click.option("-n", "--track-name", type=str, help="Track name.")
@click.option(
"-a", "--annotation-file", type=click.Path(exists=True), help="Path to file."
)
@click.option(
"-r",
"--result",
"-b",
"--bonsai-input-file",
required=True,
type=click.Path(writable=True),
help="PRP result.",
help="PRP result file (used as bonsai input).",
)
@click.option(
"-o",
"--output",
required=True,
type=click.File("w"),
help="output filepath",
)
@click.argument("output", type=click.File("w"))
def add_igv_annotation_track(name, annotation_file, result, output):
"""Add IGV annotation track to result."""
with open(result, "r", encoding="utf-8") as jfile:
def add_igv_annotation_track(track_name, annotation_file, bonsai_input_file, output):
"""Add IGV annotation track to result (bonsai input file)."""
with open(bonsai_input_file, "r", encoding="utf-8") as jfile:
result_obj = PipelineResult(**json.load(jfile))

# Get genome annotation
if result_obj.genome_annotation is None or isinstance(
if not isinstance(
result_obj.genome_annotation, list
):
track_info = []
else:
track_info = result.genome_annotation
track_info = result_obj.genome_annotation

# add new tracks
track_info.append({"name": name, "file": annotation_file})
track_info.append({"name": track_name, "file": annotation_file})

# update data model
upd_result = result_obj.model_copy(update={"genome_annotation": track_info})

# overwrite result
output.write(upd_result.model_dump_json(indent=3))

click.secho(f"Wrote updated result to {output}", fg="green")
click.secho(f"Wrote updated result to {output.name}", fg="green")
43 changes: 17 additions & 26 deletions prp/models/metadata.py
@@ -1,6 +1,7 @@
"""Metadata models."""
from datetime import datetime
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field

@@ -22,35 +23,25 @@ class SoupVersion(BaseModel):
type: SoupType


class RunInformation(RWModel):
"""Store information on a run how the run was conducted."""
class SequencingInfo(RWModel):
"""Information on the sample was sequenced."""

run_id: str
platform: str
instrument: Optional[str]
method: dict[str, str] = {}
date: datetime | None


class PipelineInfo(RWModel):
"""Information on the sample was analysed."""

pipeline: str
version: str
commit: str
analysis_profile: str = Field(
...,
alias="analysisProfile",
description="The analysis profile used when starting the pipeline",
)
configuration_files: list[str] = Field(
..., alias="configurationFiles", description="Nextflow configuration used"
)
analysis_profile: str
configuration_files: list[str]
workflow_name: str
sample_name: str
lims_id: str
sequencing_run: str
sequencing_platform: str
sequencing_type: str
command: str
date: datetime


SoupVersions = list[SoupVersion]


class RunMetadata(BaseModel):
"""Run metadata"""

run: RunInformation
databases: SoupVersions
softwares: list[SoupVersion]
date: datetime
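
A hedged example of filling in the two new metadata models. The field names follow the new lines above (the interleaved old/new lines make the exact final field set slightly ambiguous), and every value is an invented placeholder.

```python
from datetime import datetime

from prp.models.metadata import PipelineInfo, SequencingInfo, SoupType, SoupVersion

# All values are placeholders; the required fields may differ slightly from this sketch.
sequencing = SequencingInfo(
    run_id="240828_NB501234_0001_AHXYZ",
    platform="illumina",
    instrument="NextSeq 500",
    method={"library_prep": "Nextera"},
    date=datetime(2024, 8, 28),
)

pipeline = PipelineInfo(
    pipeline="jasen",
    version="1.0.0",
    commit="abc1234",
    analysis_profile="mycobacterium_tuberculosis",
    configuration_files=["nextflow.config"],
    workflow_name="peaceful_pasteur",
    command="nextflow run main.nf -profile mycobacterium_tuberculosis",
    softwares=[SoupVersion(name="tbdb", version="2023-08-01", type=SoupType.DB)],
    date=datetime(2024, 8, 28),
)
```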
5 changes: 3 additions & 2 deletions prp/models/phenotype.py
@@ -31,6 +31,7 @@ class VariantType(str, Enum):
SNV = "SNV"
MNV = "MNV"
SV = "SV"
INDEL = "INDEL"
STR = "STR"


@@ -215,7 +216,7 @@ class VariantBase(RWModel):
phenotypes: list[PhenotypeInfo] = []

# variant location
reference_sequence: str = Field(
reference_sequence: str | None = Field(
...,
description="Reference sequence such as chromosome, gene or contig id.",
alias="gene_symbol",
@@ -251,7 +252,7 @@ class MykrobeVariant(VariantBase):
class TbProfilerVariant(VariantBase):
"""Container for TbProfiler variant information"""

variant_effect: str
variant_effect: str | None = None
hgvs_nt_change: Optional[str] = Field(..., description="DNA change in HGVS format")
hgvs_aa_change: Optional[str] = Field(
..., description="Protein change in HGVS format"
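
The changelog notes a fix to variant sorting now that `reference_sequence` may be None. The sorting code is not part of this excerpt, so the snippet below is only a sketch of the kind of None-safe sort key such a fix implies, using plain dicts instead of the `VariantBase` model.

```python
# Sketch only: the actual sorting fix lives elsewhere in prp and is not shown in this diff.
variants = [
    {"reference_sequence": "rpoB", "start": 761155},
    {"reference_sequence": None, "start": 4247429},  # reference_sequence may now be None
    {"reference_sequence": "katG", "start": 2155168},
]

# Sort with None grouped last instead of raising TypeError on a str/None comparison.
variants.sort(
    key=lambda v: (v["reference_sequence"] is None, v["reference_sequence"] or "", v["start"])
)

for variant in variants:
    print(variant["reference_sequence"], variant["start"])
```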
14 changes: 12 additions & 2 deletions prp/models/sample.py
@@ -5,7 +5,7 @@
from pydantic import Field

from .base import RWModel
from .metadata import RunMetadata
from .metadata import SequencingInfo, PipelineInfo
from .phenotype import (
AMRMethodIndex,
StressMethodIndex,
@@ -44,8 +44,17 @@ class SampleBase(RWModel):
"""Base datamodel for sample data structure"""

sample_id: str = Field(..., alias="sampleId", min_length=3, max_length=100)
run_metadata: RunMetadata = Field(..., alias="runMetadata")
sample_name: str
lims_id: str

# metadata
sequencing: SequencingInfo
pipeline: PipelineInfo

# quality
qc: list[QcMethodIndex] = Field(...)

# species identification
species_prediction: list[SppMethodIndex] = Field(..., alias="speciesPrediction")


@@ -81,6 +90,7 @@ class PipelineResult(SampleBase):
# optional variant info
snv_variants: Optional[list[VariantBase]] = None
sv_variants: Optional[list[VariantBase]] = None
indel_variants: Optional[list[VariantBase]] = None
# optional alignment info
reference_genome: Optional[ReferenceGenome] = None
read_mapping: Optional[str] = None