diff --git a/docs/index.rst b/docs/index.rst index 19f7cc3cd..dc1e8877a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -69,6 +69,7 @@ Project Info step/ngs_data_qc step/ngs_mapping step/somatic_gene_fusion_calling + step/somatic_neoepitope_prediction step/somatic_purity_ploidy_estimate step/somatic_targeted_seq_cnv_calling step/somatic_variant_annotation diff --git a/snappy_pipeline/apps/snappy_snake.py b/snappy_pipeline/apps/snappy_snake.py index f5f8bf3c2..74bd46e9d 100644 --- a/snappy_pipeline/apps/snappy_snake.py +++ b/snappy_pipeline/apps/snappy_snake.py @@ -35,6 +35,7 @@ somatic_gene_fusion_calling, somatic_hla_loh_calling, somatic_msi_calling, + somatic_neoepitope_prediction, somatic_purity_ploidy_estimate, somatic_targeted_seq_cnv_calling, somatic_variant_annotation, @@ -86,6 +87,7 @@ "somatic_gene_fusion_calling": somatic_gene_fusion_calling, "somatic_hla_loh_calling": somatic_hla_loh_calling, "somatic_msi_calling": somatic_msi_calling, + "somatic_neoepitope_prediction": somatic_neoepitope_prediction, "somatic_purity_ploidy_estimate": somatic_purity_ploidy_estimate, "somatic_targeted_seq_cnv_calling": somatic_targeted_seq_cnv_calling, "somatic_variant_annotation": somatic_variant_annotation, diff --git a/snappy_pipeline/models/annotation.py b/snappy_pipeline/models/annotation.py index 468f0bf96..22d7d1a9c 100644 --- a/snappy_pipeline/models/annotation.py +++ b/snappy_pipeline/models/annotation.py @@ -35,4 +35,7 @@ class Vep(SnappyModel): ] num_threads: int = 8 buffer_size: int = 1000 + plugins: list[str] = [] + """To use this option in VEP, you should download the plugin repository from the link https://github.com/Ensembl/VEP_plugins""" + plugins_dir: str = "" output_options: list[str] = ["everything"] diff --git a/snappy_pipeline/workflow_model.py b/snappy_pipeline/workflow_model.py index aeda7682e..1b1a7fefd 100644 --- a/snappy_pipeline/workflow_model.py +++ b/snappy_pipeline/workflow_model.py @@ -36,6 +36,9 @@ from snappy_pipeline.workflows.somatic_variant_filtration.model import SomaticVariantFiltration from snappy_pipeline.workflows.somatic_variant_signatures.model import SomaticVariantSignatures from snappy_pipeline.workflows.somatic_wgs_cnv_calling.model import SomaticWgsCnvCalling +from snappy_pipeline.workflows.somatic_neoepitope_prediction.model import ( + SomaticNeoepitopePrediction, +) from snappy_pipeline.workflows.somatic_wgs_sv_calling.model import SomaticWgsSvCalling from snappy_pipeline.workflows.sv_calling_targeted.model import SvCallingTargeted from snappy_pipeline.workflows.sv_calling_wgs.model import SvCallingWgs @@ -113,6 +116,7 @@ class StepConfig(TypedDict, total=False): somatic_gene_fusion_calling: SomaticGeneFusionCalling somatic_hla_loh_calling: SomaticHlaLohCalling somatic_msi_calling: SomaticMsiCalling + somatic_neoepitope_prediction: SomaticNeoepitopePrediction somatic_purity_ploidy_estimate: SomaticPurityPloidyEstimate somatic_targeted_seq_cnv_calling: SomaticTargetedSeqCnvCalling somatic_variant_annotation: SomaticVariantAnnotation diff --git a/snappy_pipeline/workflows/hla_typing/Snakefile b/snappy_pipeline/workflows/hla_typing/Snakefile index 0f627c46d..3dc829c5a 100644 --- a/snappy_pipeline/workflows/hla_typing/Snakefile +++ b/snappy_pipeline/workflows/hla_typing/Snakefile @@ -67,51 +67,53 @@ rule hla_typing_link_out_run: # OptiType -------------------------------------------------------------------- - -rule hla_typing_optitype_run: - input: - **wf.get_input_files("optitype", "run"), - output: - **wf.get_output_files("optitype", "run"), - threads: wf.get_resource("optitype", "run", "threads") - resources: - time=wf.get_resource("optitype", "run", "time"), - memory=wf.get_resource("optitype", "run", "memory"), - partition=wf.get_resource("optitype", "run", "partition"), - tmpdir=wf.get_resource("optitype", "run", "tmpdir"), - log: - **wf.get_log_file("optitype", "run"), - params: - args=wf.substep_dispatch("optitype", "get_args", "run"), - wrapper: - wf.wrapper_path("optitype") +if "optitype" in wf.w_config.step_config["hla_typing"].tools: + + rule hla_typing_optitype_run: + input: + **wf.get_input_files("optitype", "run"), + output: + **wf.get_output_files("optitype", "run"), + threads: wf.get_resource("optitype", "run", "threads") + resources: + time=wf.get_resource("optitype", "run", "time"), + memory=wf.get_resource("optitype", "run", "memory"), + partition=wf.get_resource("optitype", "run", "partition"), + tmpdir=wf.get_resource("optitype", "run", "tmpdir"), + log: + **wf.get_log_file("optitype", "run"), + params: + args=wf.substep_dispatch("optitype", "get_args", "run"), + wrapper: + wf.wrapper_path("optitype") # arcasHLA -------------------------------------------------------------------- # NB: reference is updated in the installed package -rule hla_typing_arcashla_prepare_reference: - output: - touch("work/arcashla.prepare_reference/out/.done"), - log: - "work/arcashla.prepare_reference/log/arcashla.prepare_reference.log", - wrapper: - wf.wrapper_path("arcashla/prepare_reference") - - -rule hla_typing_arcashla_run: - input: - unpack(wf.get_input_files("arcashla", "run")), - output: - **wf.get_output_files("arcashla", "run"), - threads: wf.get_resource("arcashla", "run", "threads") - resources: - time=wf.get_resource("arcashla", "run", "time"), - memory=wf.get_resource("arcashla", "run", "memory"), - partition=wf.get_resource("arcashla", "run", "partition"), - tmpdir=wf.get_resource("arcashla", "run", "tmpdir"), - log: - wf.get_log_file("arcashla", "run"), - wrapper: - wf.wrapper_path("arcashla/run") +if "arcashla" in wf.w_config.step_config["hla_typing"].tools: + + rule hla_typing_arcashla_prepare_reference: + output: + touch("work/arcashla.prepare_reference/out/.done"), + log: + "work/arcashla.prepare_reference/log/arcashla.prepare_reference.log", + wrapper: + wf.wrapper_path("arcashla/prepare_reference") + + rule hla_typing_arcashla_run: + input: + unpack(wf.get_input_files("arcashla", "run")), + output: + **wf.get_output_files("arcashla", "run"), + threads: wf.get_resource("arcashla", "run", "threads") + resources: + time=wf.get_resource("arcashla", "run", "time"), + memory=wf.get_resource("arcashla", "run", "memory"), + partition=wf.get_resource("arcashla", "run", "partition"), + tmpdir=wf.get_resource("arcashla", "run", "tmpdir"), + log: + wf.get_log_file("arcashla", "run"), + wrapper: + wf.wrapper_path("arcashla/run") diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile new file mode 100644 index 000000000..7b18f7238 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/Snakefile @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +"""CUBI Pipeline somatic neoepitope prediction step Snakefile""" + +import os + +from snappy_pipeline import expand_ref +from snappy_pipeline.workflows.somatic_neoepitope_prediction import ( + SomaticNeoepitopePredictionWorkflow, +) + +__author__ = "Pham Gia Cuong" + + +# Configuration =============================================================== + + +configfile: "config.yaml" + + +# Expand "$ref" JSON pointers in configuration (also works for YAML) +config, lookup_paths, config_paths = expand_ref("config.yaml", config) + +# WorkflowImpl Object Setup =================================================== +wf = SomaticNeoepitopePredictionWorkflow(workflow, config, lookup_paths, config_paths, os.getcwd()) + + +localrules: + # Linking files from work/ to output/ should be done locally + somatic_neoepitope_preparation_link_out_run, + + +rule all: + input: + wf.get_result_files(), + + +# Generic linking out --------------------------------------------------------- + + +rule somatic_neoepitope_preparation_link_out_run: + input: + wf.get_input_files("link_out", "run"), + output: + wf.get_output_files("link_out", "run"), + run: + shell(wf.get_shell_cmd("link_out", "run", wildcards)) + + +rule somatic_neoepitope_prediction_pvactools_install: + output: + **wf.get_output_files("pvacseq", "install"), + params: + container="docker://griffithlab/pvactools", + resources: + time=wf.get_resource("pvacseq", "install", "time"), + memory=wf.get_resource("pvacseq", "install", "memory"), + partition=wf.get_resource("pvacseq", "install", "partition"), + log: + **wf.get_log_file("pvacseq", "install"), + wrapper: + wf.wrapper_path("singularity") + + +rule somatic_neoepitope_preparation: + input: + unpack(wf.get_input_files("pvacseq", "prepare")), + output: + **wf.get_output_files("pvacseq", "prepare"), + log: + **wf.get_log_file("pvacseq", "prepare"), + threads: wf.get_resource("pvacseq", "prepare", "threads") + resources: + time=wf.get_resource("pvacseq", "prepare", "time"), + memory=wf.get_resource("pvacseq", "prepare", "memory"), + partition=wf.get_resource("pvacseq", "prepare", "partition"), + tmpdir=wf.get_resource("pvacseq", "prepare", "tmpdir"), + params: + **{"args": wf.get_params("pvacseq", "prepare")}, + wrapper: + wf.wrapper_path("pvactools/combining") + + +rule somatic_neoepitope_prediction: + input: + unpack(wf.get_input_files("pvacseq", "predict")), + output: + **wf.get_output_files("pvacseq", "predict"), + log: + **wf.get_log_file("pvacseq", "predict"), + threads: wf.get_resource("pvacseq", "predict", "threads") + resources: + time=wf.get_resource("pvacseq", "predict", "time"), + memory=wf.get_resource("pvacseq", "predict", "memory"), + partition=wf.get_resource("pvacseq", "predict", "partition"), + tmpdir=wf.get_resource("pvacseq", "predict", "tmpdir"), + params: + **{"args": wf.get_params("pvacseq", "predict")}, + wrapper: + wf.wrapper_path("pvactools/pvacseq") diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py new file mode 100644 index 000000000..9721a32f1 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/__init__.py @@ -0,0 +1,468 @@ +# -*- coding: utf-8 -*- +"""Implementation of the ``somatic_neoepitope_prediction`` step + +The somatic_neoepitope_prediction step allows for the prediction of neoepitopes from somatic +(small) variant calling results and a transcript database such as ENSEMBL. Further, the step +allows for the binding prediction to a given set of HLA alleles. + +.. note:: + + Status: not implemented yet + +========== +Step Input +========== + +.. note:: TODO + +=========== +Step Output +=========== + +.. note:: TODO + +===================== +Default Configuration +===================== + +The default configuration is as follows. + +.. include:: DEFAULT_CONFIG_somatic_neoepitope_prediction.rst + +""" + +from collections import OrderedDict +import sys + +from biomedsheets.shortcuts import CancerCaseSheet, CancerCaseSheetOptions, is_not_background +from snakemake.io import expand + +from snappy_pipeline.utils import dictify, listify +from snappy_pipeline.workflows.abstract import BaseStep, BaseStepPart, LinkOutStepPart +from snappy_pipeline.workflows.ngs_mapping import NgsMappingWorkflow, ResourceUsage +from snappy_pipeline.workflows.somatic_variant_annotation import SomaticVariantAnnotationWorkflow +from snappy_pipeline.workflows.somatic_variant_calling import ( + SOMATIC_VARIANT_CALLERS_MATCHED, + SomaticVariantCallingWorkflow, +) +from snappy_pipeline.workflows.hla_typing import HlaTypingWorkflow +from .model import SomaticNeoepitopePrediction as SomaticNeoepitopePredictionConfigModel + + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +#: Extensions of files to create as main payload +PREPARE_EXT_VALUES = (".vcf.gz", ".vcf.gz.tbi", ".vcf.gz.md5", ".vcf.gz.tbi.md5") +PREDICT_EXT_VALUES = ( + ".all_epitopes.tsv", + ".filtered.tsv", + ".all_epitopes.tsv.md5", + ".filtered.tsv.md5", +) +#: Default configuration for the somatic_gene_fusion_calling step +DEFAULT_CONFIG = SomaticNeoepitopePredictionConfigModel.default_config_yaml_string() + + +class PvacSeqStepPart(BaseStepPart): + """ + Preparation VCF file for pvactool + """ + + #: Step name + name = "pvacseq" + + #: Actions + actions = ("install", "prepare", "predict") + + #: Resources + resource_usage = { + "install": ResourceUsage( + threads=2, + time="03:00:00", + memory="6G", + ), + "prepare": ResourceUsage( + threads=1, + time="01:00:00", + memory="6G", + ), + "predict": ResourceUsage( + threads=4, + time="4:00:00", + memory="30G", + ), + } + + def __init__(self, parent): + super().__init__(parent) + # Build shortcut from cancer bio sample name to matched cancer sample + self.tumor_ngs_library_to_sample_pair = OrderedDict() + for sheet in self.parent.shortcut_sheets: + self.tumor_ngs_library_to_sample_pair.update( + sheet.all_sample_pairs_by_tumor_dna_ngs_library + ) + # Build mapping from donor name to donor. + self.donors = OrderedDict() + for sheet in self.parent.shortcut_sheets: + for donor in sheet.donors: + self.donors[donor.name] = donor + + def get_input_files(self, action): + self._validate_action(action) + # if action == "install": + # return {"container": "work/containers/out/pvactools.simg"} + if action == "prepare": + return self._get_input_files_prepare + if action == "predict": + return self._get_input_files_predict + + @dictify + def _get_input_files_predict(self, wildcards): + yield "container", "work/containers/out/pvactools.simg" + prepare_tpl = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.vcf.gz" + ) + yield "combine_vcf", prepare_tpl + hla_typing = self.parent.sub_workflows["hla_typing"] + hla_tpl = "output/optitype.{ngs_library}/out/optitype.{ngs_library}.txt" + for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: + yield ( + "hla_normal_dna", + hla_typing( + (hla_tpl).format( + ngs_library=sample_pair.normal_sample.dna_ngs_library.name, + ) + ), + ) + yield ( + "hla_tumor_dna", + hla_typing( + (hla_tpl).format( + ngs_library=sample_pair.tumor_sample.dna_ngs_library.name, + ) + ), + ) + yield ( + "hla_tumor_rna", + hla_typing( + (hla_tpl).format( + ngs_library=sample_pair.tumor_sample.rna_ngs_library.name, + ) + ), + ) + + @dictify + def _get_input_files_prepare(self, wildcards): + """Return path to somatic variant annotated file""" + # It only works with vep now. + # Validate action + # self._validate_action(action) + # yield "container","work/containers/out/pvactools.simg" + tpl = ( + "output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" + ) + if self.config["preparation"]["full_vep_annotation"]: + vep_key_ext = {"vcf": ".full.vcf.gz", "vcf_tbi": ".full.vcf.gz.tbi"} + else: + vep_key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} + variant_annotation = self.parent.sub_workflows["somatic_variant_annotation"] + if self.config["preparation"]["format"] == "snappy_custom": + for key, ext in vep_key_ext.items(): + yield key, variant_annotation(tpl + ext) + + ngs_mapping = self.parent.sub_workflows["ngs_mapping"] + star_key_ext = {"expression": ".GeneCounts.tab", "bam": ".bam"} + for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: + rna_tpl = "output/{mapper}.{rna_library}/out/{mapper}.{rna_library}" + for key, ext in star_key_ext.items(): + yield ( + key, + ngs_mapping( + (rna_tpl + ext).format( + mapper=self.w_config.step_config["ngs_mapping"].tools.rna[0], + rna_library=sample_pair.tumor_sample.rna_ngs_library.name, + ) + ), + ) + + @dictify + def get_output_files(self, action): + """Return output files""" + # Validate action + self._validate_action(action) + if action == "install": + yield "container", "work/containers/out/pvactools.simg" + if action == "prepare": + path_prepare = self._get_output_files_prepare() + yield from path_prepare.items() + if action == "predict": + path_predict = self._get_output_files_predict() + yield from path_predict.items() + + @dictify + def _get_output_files_prepare(self): + if self.config["preparation"]["mode"] == "gene": + prefix = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + elif self.config["preparation"]["mode"] == "transcript": + prefix = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" + ) + key_ext = {"vcf": ".vcf.gz", "vcf_tbi": ".vcf.gz.tbi"} + for key, ext in key_ext.items(): + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + @dictify + def _get_output_files_predict(self): + key_ext = {"all_epitopes": ".all_epitopes.tsv", "filtered_epitopes": ".filtered.tsv"} + prefix = "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/out/MHC_Class_I/{tumor_library}" + for key, ext in key_ext.items(): + yield key, prefix + ext + yield key + "_md5", prefix + ext + ".md5" + + @dictify + def _get_log_file(self, action): + """Return mapping of log files.""" + # Validate action + self._validate_action(action) + if action == "install": + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, "work/containers/log/pvactool_install" + ext + + if action == "prepare": + if self.config["preparation"]["mode"] == "gene": + prefix = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + elif self.config["preparation"]["mode"] == "transcript": + prefix = ( + "work/preprare/{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.TX.{tumor_library}" + ) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + + if action == "predict": + prefix = ( + "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/" + "log/{tumor_library}" + ) + key_ext = ( + ("log", ".log"), + ("conda_info", ".conda_info.txt"), + ("conda_list", ".conda_list.txt"), + ) + for key, ext in key_ext: + yield key, prefix + ext + + def get_params(self, action): + self._validate_action(action) + if action == "prepare": + return getattr(self, "_get_params_run_prepare") + if action == "predict": + return getattr(self, "_get_params_run_predict") + + def _get_params_run_prepare(self, wildcards): + d = { + "max_depth": self.config["preparation"]["max_depth"], + "format": self.config["preparation"]["format"], + "expression_column": self.config["preparation"]["expression_column"], + "id_column": self.config["preparation"]["id_column"], + "ignore_ensembl_id_version": self.config["preparation"]["ignore_ensembl_id_version"], + "mode": self.config["preparation"]["mode"], + } + return d + + def _get_params_run_predict(self, wildcards): + d = { + "epitope_length": ",".join( + map(str, self.config["prediction"]["CLASS_I_EPITOPE_LENGTH"]) + ), + "algorithms": " ".join(self.config["prediction"]["algorithms"]), + "BINDING_THRESHOLD": self.config["prediction"]["BINDING_THRESHOLD"], + "percentile_threshold": self.config["prediction"]["percentile_threshold"], + "allele_specific_binding_thresholds": self.config["prediction"][ + "allele_specific_binding_thresholds" + ], + "aggregate_inclusion_binding_threshold": self.config["prediction"][ + "aggregate_inclusion_binding_threshold" + ], + "netmhc_stab": self.config["prediction"]["netmhc_stab"], + "NET_CHOP_THRESHOLD": self.config["prediction"]["NET_CHOP_THRESHOLD"], + "PROBLEMATIC_AMINO_ACIDS": self.config["prediction"]["PROBLEMATIC_AMINO_ACIDS"], + # 'run_reference_proteome_similarity': self.config["prediction"]["run_reference_proteome_similarity"], + "FAST_SIZE": self.config["prediction"]["FAST_SIZE"], + "exclude_NAs": self.config["prediction"]["exclude_NAs"], + "NORMAL_COV": self.config["prediction"]["NORMAL_COV"], + "TDNA_COV": self.config["prediction"]["TDNA_COV"], + "TRNA_COV": self.config["prediction"]["TRNA_COV"], + "NORMAL_VAF": self.config["prediction"]["NORMAL_VAF"], + "maximum_transcript_support_level": self.config["prediction"][ + "maximum_transcript_support_level" + ], + "pass_only": self.config["prediction"]["pass_only"], + "tumor_purity": self.config["prediction"]["tumor_purity"], + } + for sheet in filter(is_not_background, self.parent.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if sample_pair.tumor_sample.dna_ngs_library.name == wildcards.tumor_library: + d["normal_sample"] = sample_pair.normal_sample.dna_ngs_library.name + return d + + +class SomaticNeoepitopePredictionWorkflow(BaseStep): + """Perform neoepitope prediction workflow""" + + name = "somatic_neoepitope_prediction" + sheet_shortcut_class = CancerCaseSheet + sheet_shortcut_kwargs = { + "options": CancerCaseSheetOptions(allow_missing_normal=True, allow_missing_tumor=True) + } + + @classmethod + def default_config_yaml(cls): + """Return default config YAML, to be overwritten by project-specific one.""" + return DEFAULT_CONFIG + + def __init__(self, workflow, config, config_lookup_paths, config_paths, workdir): + super().__init__( + workflow, + config, + config_lookup_paths, + config_paths, + workdir, + config_model_class=SomaticNeoepitopePredictionConfigModel, + previous_steps=( + SomaticVariantCallingWorkflow, + SomaticVariantAnnotationWorkflow, + NgsMappingWorkflow, + HlaTypingWorkflow, + ), + ) + # Register sub step classes so the sub steps are available + config = self.config + self.register_sub_step_classes((PvacSeqStepPart, LinkOutStepPart)) + self.register_sub_workflow( + "somatic_variant_annotation", + config["path_somatic_variant_annotation"], + ) + self.register_sub_workflow( + "ngs_mapping", + config["path_rna_ngs_mapping"], + ) + self.register_sub_workflow( + "hla_typing", + config["path_hla_typing"], + ) + # Copy over "tools" setting from somatic_variant_calling/ngs_mapping if not set here + if not config.tools_ngs_mapping: + config.tools_ngs_mapping = self.w_config["step_config"]["ngs_mapping"].tools.dna + if not config.tools_somatic_variant_calling: + config.tools_somatic_variant_calling = self.w_config["step_config"][ + "somatic_variant_calling" + ].tools + if not config.tools_somatic_variant_annotation: + config.tools_somatic_variant_annotation = ["vep"] + if not config.tools_rna_mapping: + config.tools_rna_mapping = self.w_config["step_config"]["ngs_mapping"].tools.rna + if not config.preparation.path_features: + config.preparation.path_features = self.w_config["static_data_config"].features.path + + @listify + def get_result_files(self): + cfg_mode = self.config["preparation"]["mode"] + callers = set(self.config["tools_somatic_variant_calling"]) + anno_callers = set(self.config["tools_somatic_variant_annotation"]) + predict_tpl = "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library.name}.epitopes/" + yield from self._yield_result_files_matched( + predict_tpl + "out/MHC_Class_I/{tumor_library.name}" + "{ext}", + mapper=self.config["tools_ngs_mapping"], + var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + anno_caller=anno_callers, + mode="GX" if cfg_mode == "gene" else "TX", + ext=PREDICT_EXT_VALUES, + ) + + yield from self._yield_result_files_matched( + predict_tpl + "log/{tumor_library.name}" + "{ext}", + mode="GX" if cfg_mode == "gene" else "TX", + mapper=self.config["tools_ngs_mapping"], + var_caller=callers & set(SOMATIC_VARIANT_CALLERS_MATCHED), + anno_caller=anno_callers, + ext=( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ), + ) + + def _yield_result_files_matched(self, tpl, **kwargs): + """Build output paths from path template and extension list. + + This function returns the results from the matched somatic variant callers such as + Mutect. + """ + for sheet in filter(is_not_background, self.shortcut_sheets): + for sample_pair in sheet.all_sample_pairs: + if ( + not sample_pair.tumor_sample.dna_ngs_library + or not sample_pair.normal_sample.dna_ngs_library + or not sample_pair.tumor_sample.rna_ngs_library + ): + msg = ( + "INFO: sample pair for cancer bio sample {} has is missing primary" + "normal or primary cancer NGS library" + ) + print(msg.format(sample_pair.tumor_sample.name), file=sys.stderr) + continue + yield from expand( + tpl, + tumor_library=[sample_pair.tumor_sample.dna_ngs_library], + **kwargs, + ) + + def check_config(self): + """Check that path_somatic_variant_annotation, preparation/mode and preparation/format are present in the configuration""" + self.ensure_w_config( + ( + "step_config", + "somatic_neoepitope_prediction", + "path_somatic_variant_annotation", + ), + "Path to variant (directory of vcf files) not configured but required for somatic neoepitope prediction", + ) + + self.ensure_w_config( + ("step_config", "somatic_neoepitope_prediction", "preparation", "mode"), + "The mode is required for adding gene expression data to somatic variant annotated file", + ) + + self.ensure_w_config( + ("step_config", "somatic_neoepitope_prediction", "preparation", "format"), + "Format is required to determine which tool generated the TPM values. ", + ) diff --git a/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py new file mode 100644 index 000000000..a17452d03 --- /dev/null +++ b/snappy_pipeline/workflows/somatic_neoepitope_prediction/model.py @@ -0,0 +1,122 @@ +import enum +from typing import Annotated + +from pydantic import Field + +from snappy_pipeline.models import EnumField, SnappyModel, SnappyStepModel + + +class Format(enum.StrEnum): + stringtie = "stringtie" + snappy_custom = "snappy_custom" + cufflinks = "cufflinks" + kallisto = "kallisto" + custom = "custom" + + +class Preparation(SnappyModel): + format: Annotated[Format, EnumField(Format)] = Format.snappy_custom + "The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom)" + "Use `custom` to process file formats not explicitly supported." + "The `custom` option requires the use of the --id-column and --expression-column arguments." + path_features: str = "" + """Gencode file path, required for star and snappy format""" + mode: Annotated[str, Field(examples=["gene", "transcript"])] = "gene" + """Determine whether the expression file contains gene or transcript TPM values.""" + full_vep_annotation: bool = True + """The somatic_variant_annotation has been done in fully annotation mode""" + id_column: str = "" + """Gene/Transcript id column name. Need when using the `custom` format.""" + expression_column: str = "" + """Expression column name. Need when using the `custom` format.""" + ignore_ensembl_id_version: bool = True + """Ignore the ensemble id version""" + max_depth: int = 4000 + "max_depth option of bcftools mpileup" + + +class Algorithms(enum.StrEnum): + BigMHC_EL = "BigMHC_EL" + BigMHC_IM = "BigMHC_IM" + DeepImmuno = "DeepImmuno" + MHCflurry = "MHCflurry" + MHCflurryEL = "MHCflurryEL" + MHCnuggetsI = "MHCnuggetsI" + MHCnuggetsII = "MHCnuggetsII" + NNalign = "NNalign" + NetMHC = "NetMHC" + NetMHCIIpan = "NetMHCIIpan" + NetMHCIIpanEL = "NetMHCIIpanEL" + NetMHCcons = "NetMHCcons" + NetMHCpan = "NetMHCpan" + NetMHCpanEL = "NetMHCpanEL" + PickPocket = "PickPocket" + SMM = "SMM" + SMMPMBEC = "SMMPMBEC" + SMMalign = "SMMalign" + all_class_i = "all_class_i" + + +class Prediction(SnappyModel): + CLASS_I_EPITOPE_LENGTH: list[int] = [8, 9, 10, 11] + "Length of MHC Class I subpeptides (neoepitopes) to predict." + "Multiple epitope lengths can be specified using a comma-separated list." + "Typical epitope lengths vary between 8-15." + algorithms: Annotated[list[Algorithms], EnumField(Algorithms, [])] + "The epitope prediction algorithms to use." + "For running with all prediction assign to ['all_class_i']" + # Following parameter please follow the documentation of pvactools to know its function + BINDING_THRESHOLD: int = 500 + percentile_threshold: float | None = Field(default=None) + allele_specific_binding_thresholds: bool = False + aggregate_inclusion_binding_threshold: int = 5000 + netmhc_stab: bool = False + NET_CHOP_THRESHOLD: float = 0.5 + PROBLEMATIC_AMINO_ACIDS: list[str] | None = Field(default=None) + # top-score-metric + # net-chop-method + # net-chop-threshold + + # E.g amino_acid:position - G:2 would check for a Glycine at + # the second position of the epitope + # run_reference_proteome_similarity: bool = False + # blastp_path + # blastp-db + # peptide-fasta + FAST_SIZE: int = 200 + exclude_NAs: bool = False + NORMAL_COV: int = 5 + TDNA_COV: int = 10 + TRNA_COV: int = 10 + NORMAL_VAF: float = 0.02 + maximum_transcript_support_level: int = Field(gt=0, lt=6, default=1) + # allele-specific-anchors + # anchor_contribution_threshold: float=0.8 + pass_only: bool = False + tumor_purity: float | None = Field(default=None) + + +class SomaticNeoepitopePrediction(SnappyStepModel): + path_somatic_variant_annotation: Annotated[ + str, Field(examples=["../somatic_variant_annotation"]) + ] + path_container: Annotated[ + str, Field(examples=["../somatic_neoepitope_prediction/work/containers/out/pvactools.simg"]) + ] + """ + Running somatic neoepitope prediction with pvactools + is required,with the container + """ + path_hla_typing: Annotated[str, Field(examples=["../hla_typing"])] + path_rna_ngs_mapping: Annotated[str, Field(examples=["../ngs_mapping"])] + tools_somatic_variant_annotation: list[str] = ["vep"] + tools_ngs_mapping: list[str] = [] + tools_somatic_variant_calling: list[str] = [] + tools_rna_mapping: list[str] = [] + """Deafult to those configured for ngs_mapping""" + tools_ngs_mapping: list[str] = [] + """Deafult to those configured for ngs_mapping""" + tools_somatic_variant_calling: list[str] = [] + """Deafult to those configured for somatic_variant_calling""" + preparation: Preparation | None = None + prediction: Prediction | None = None diff --git a/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py new file mode 100644 index 000000000..8b3658c14 --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/combining/comb_rna.py @@ -0,0 +1,495 @@ +import argparse +import logging +import re +import sys + +import pandas as pd +import pyranges as pr +from vcfpy import OrderedDict, Reader, Writer + + +########################## Define tool parameters ########################## +def define_parser(): + parser = argparse.ArgumentParser( + "comb_rna.py", + description="A tool that will add the data from several expression tools' output files" + + "and allelic depths from mRNA sequencing data for snvs" + + "to the VCF INFO column. Supported tools are StringTie, Kallisto, Star, snappy_custom" + + "and Cufflinks. There also is a ``custom`` option to annotate with data " + + "from any tab-delimited file.", + ) + + parser.add_argument("--input-vcf", help="A VEP-annotated VCF file") + parser.add_argument("--expression-file", help="A expression file") + parser.add_argument("--genecode", help="A genecode file for calculate TPM from star gene count") + parser.add_argument( + "--rna-vcf", + help="A VCF file of RNA-seq data", + ) + parser.add_argument( + "--format", + choices=["kallisto", "stringtie", "cufflinks", "star", "snappy_custom", "custom"], + help="The file format of the expression file to process. " + + "Use `custom` to process file formats not explicitly supported. " + + "The `custom` option requires the use of the --id-column and --expression-column arguments.", + ) + parser.add_argument( + "--mode", + choices=["gene", "transcript"], + help="The type of expression data in the expression_file", + ) + parser.add_argument( + "-i", + "--id-column", + help="The column header in the expression_file for the column containing gene/transcript ids. Required when using the `custom` format.", + ) + parser.add_argument( + "-e", + "--expression-column", + help="The column header in the expression_file for the column containing expression data. Required when using the `custom` and `star` format.", + ) + parser.add_argument( + "-s", + "--sample-name", + help="If the input_vcf contains multiple samples, the name of the sample to annotate.", + ) + parser.add_argument( + "-o", + "--output-vcf", + help="Path to write the output VCF file. If not provided, the output VCF file will be " + + "written next to the input VCF file with a .tx.vcf or .gx.vcf file ending.", + ) + parser.add_argument( + "--ignore-ensembl-id-version", + help='Assumes that the final period and number denotes the Ensembl ID version and ignores it (i.e. for "ENST00001234.3" - ignores the ".3").', + action="store_true", + ) + + return parser + + +########################## Arguments check ########################## +def args_check(args): + if args.format == "custom": + if args.id_column is None: + raise Exception( + "--id-column is not set. This is required when using the `custom` format." + ) + if args.expression_column is None: + raise Exception( + "--expression-column is not set. This is required when using the `custom` format." + ) + if args.format == "star": + if args.genecode is None: + raise Exception("--genecode is not set. This is required when using the `star` format") + + +########################## Get expected column names ########################## +def get_expected_column_names(format, mode, id_column, exp_column): + match (format, mode): + case ("star", _): + try: + exp = int(exp_column) + if 1 < exp and exp <= 4: + return [0, exp - 1] + else: + raise Exception("Expression column is invalid") + except ValueError: + raise ValueError("Expression column in star should be a number") + case ("snappy_custom", _): + return [0, 1] + case ("kallisto", "gene"): + return ["gene", "abundance"] + case ("kallisto", "transcript"): + return ["target_id", "tpm"] + case ("cufflinks", _): + return ["tracking_id", "FPKM"] + case ("stringtie", "gene"): + return ["Gene ID", "TPM"] + case ("stringtie", "transcript"): + return ["transcript_id", "TPM"] + case ("custom", _): + return [id_column, exp_column] + case _: + raise Exception("We don't support {format} with mode as {mode} yet.") + + +########################## Genecode for star parser ########################## +def count_to_TPM(genecode_path, exp_df, no_version=False): + gc = pr.read_gtf(genecode_path, as_df=True) + if no_version: + gc.loc[:, "gene_id"] = gc.loc[:, "gene_id"].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) + gc = gc[gc["gene_id"].isin(exp_df.iloc[:, 0])] + gc = gc[(gc.Feature == "gene")] + exon = gc[["Chromosome", "Start", "End", "gene_id", "gene_name"]] + # Convert columns to proper types. + exon.loc[:, "Start"] = exon.loc[:, "Start"].astype(int) + exon.loc[:, "End"] = exon.loc[:, "End"].astype(int) + # Sort in place. + exon = exon.sort_values(by=["Chromosome", "Start", "End"]) + # Group the rows by the Ensembl gene identifier. + groups = exon.groupby("gene_id") + lengths = groups.apply(count_bp) + # Create a new DataFrame with gene lengths and EnsemblID. + gene_id = lengths.index + if len(gene_id) != len(exp_df): + raise Exception("Duplicated gene id in expression file") + ldf = pd.DataFrame({"length": lengths.values, "gene_id": gene_id}).sort_values(by="gene_id") + # Calculate TPM + tpm = calculate_TPM(exp_df, ldf) + tpm_dict = {k: v for k, v in zip(exp_df.iloc[:, 0], tpm)} + return tpm_dict + + +def calculate_TPM(exp_df, ldf): + "TPM = (reads_transcript / transcript_length_kb) / scaled_total_mapped_reads" + transcript_length_kb = ldf["length"] / 1000 + RPK = exp_df.iloc[:, 1] / transcript_length_kb + scale_factor_permil = sum(RPK) / (10**6) + return RPK / scale_factor_permil + + +def count_bp(df): + """Given a DataFrame with the exon coordinates from Gencode for a single + gene, return the total number of coding bases in that gene. + Example: + >>> import numpy as np + >>> n = 3 + >>> r = lambda x: np.random.sample(x) * 10 + >>> d = pd.DataFrame([np.sort([a,b]) for a,b in zip(r(n), r(n))], columns=['start','end']).astype(int) + >>> d + start end + 0 6 9 + 1 3 4 + 2 4 9 + >>> count_bp(d) + 7 + Here is a visual representation of the 3 exons and the way they are added: + 123456789 Length + 0 ---- 4 + 1 -- 2 + 2 ------ 6 + ======= 7 + """ + start = df.Start.min() + end = df.End.max() + bp = [False] * (end - start + 1) + for i in range(df.shape[0]): + s = df.iloc[i]["Start"] - start + e = df.iloc[i]["End"] - start + 1 + bp[s:e] = [True] * (e - s) + return sum(bp) + + +########################## Expression file parser ########################## +def colnames_check(names, exp_names): + return set(names).issubset(set(exp_names)) + + +def expression_file_parser( + path, format, mode, id_column, exp_column, no_version=False, genecode="" +): + col_names = get_expected_column_names(format, mode, id_column, exp_column) + if isinstance(col_names[0], int): + exp_df = pd.read_csv(path, skiprows=4, sep="\t", header=None) + exp_df = exp_df.iloc[:, col_names] + exp_df = exp_df.sort_values(by=0) + if no_version: + exp_df.iloc[:, 0] = exp_df.iloc[:, 0].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) + tpm_dict = count_to_TPM(genecode, exp_df, no_version) + else: + if format == "stringtie" and mode == "transcript": + exp_df = pr.read_gtf(path, as_df=True) + exp_df = exp_df[exp_df["feature"] == "transcript"] + else: + exp_df = pd.read_csv(path, sep="\t", header=0) + + if colnames_check(col_names, exp_df.columns): + exp_df = exp_df.loc[:, col_names] + else: + raise Exception( + "Gene id column or expression column is not presense in expression file" + ) + + if no_version: + exp_df.iloc[:, 0] = exp_df.iloc[:, 0].apply(lambda x: re.sub(r"\.[0-9]+$", "", x)) + tpm_dict = {k: v for k, v in zip(exp_df.iloc[:, 0], exp_df.iloc[:, 1])} + return tpm_dict + + +########################## Dna Vcf parser ########################## +def dna_vcf_parser(path, sample_name): + vcf_reader = Reader.from_path(path) + is_multi_sample = len(vcf_reader.header.samples.names) > 1 + if is_multi_sample and sample_name is None: + vcf_reader.close() + raise Exception( + "ERROR: VCF {} contains more than one sample. Please use the -s option to specify which sample to annotate.".format( + path + ) + ) + elif is_multi_sample and sample_name not in vcf_reader.header.samples.names: + vcf_reader.close() + raise Exception( + "ERROR: VCF {} does not contain a sample column for sample {}.".format( + path, sample_name + ) + ) + if "CSQ" not in vcf_reader.header.info_ids(): + vcf_reader.close() + raise Exception( + "ERROR: VCF {} is not VEP-annotated. Please annotate the VCF with VEP before running this tool.".format( + path + ) + ) + return vcf_reader, is_multi_sample + + +########################## Rna Vcf parser ########################## +def rna_vcf_parser(path): + rna_vcf_reader = Reader.from_path(path) + return rna_vcf_reader + + +########################## Add expression values functions ########################## +def to_array(dictionary): + array = [] + for key, value in dictionary.items(): + array.append("{}|{}".format(key, value)) + return sorted(array) + + +def add_Expression_to_vcf( + entry, + is_multi_sample, + sample_name, + exp_dict, + genes, + tag, + no_version, + missing_expressions_count, + entry_count, +): + expressions = {} + for gene in genes: + entry_count += 1 + if no_version: + gene = re.sub(r"\.[0-9]+$", "", gene) + if gene in exp_dict: + expressions[gene] = exp_dict[gene] + else: + missing_expressions_count += 1 + if is_multi_sample: + entry.FORMAT += [tag] + entry.call_for_sample[sample_name].data[tag] = to_array(expressions) + else: + entry.add_format(tag, to_array(expressions)) + return (entry, missing_expressions_count, entry_count) + + +def add_AD_to_vcf(entry, is_multi_sample, sample_name, rna_vcf): + RAD_temp = [] + ROT = [] + try: + rna_records = rna_vcf.fetch(entry.CHROM, entry.affected_start, entry.affected_end) + rna_record = next(rna_records) + except (StopIteration, ValueError): + # if there is no snp. RAD and RoT will be set to 0 + if is_multi_sample: + entry.FORMAT += ["RAD"] + entry.call_for_sample[sample_name].data["RAD"] = 0 + entry.FORMAT += ["ROT"] + entry.call_for_sample[sample_name].data["ROT"] = 0 + else: + entry.add_format("RAD", 0) + entry.add_format("ROT", 0) + else: + rna_alt = [alt.value for alt in rna_record.ALT] + rna_AD = [c.data.get("AD") for c in rna_record.calls][0] + ROT_temp = sum(rna_AD) - rna_AD[0] + if rna_alt: + rna_AD = [c.data.get("AD") for c in rna_record.calls][0] + for dna_alt in entry.ALT: + if dna_alt.value in rna_alt: + index = rna_record.ALT.index(dna_alt) + RAD_temp.append(rna_AD[0]) # AD of REF + RAD_temp.append(rna_AD[index + 1]) # AD of ALT + ROT_temp = ROT_temp - rna_AD[index + 1] + ROT.append(ROT_temp) + if is_multi_sample: + entry.FORMAT += ["RAD"] + entry.call_for_sample[sample_name].data["RAD"] = RAD_temp + entry.FORMAT += ["ROT"] + entry.call_for_sample[sample_name].data["ROT"] = ROT_temp + else: + entry.add_format("RAD", RAD_temp) + entry.add_format("ROT", ROT_temp) + return entry + + +########################## Add fields to header ########################## +def create_vcf_writer(path, vcf_reader, mode, rna_vcf, output_vcf): + (head, _, tail) = path.rpartition(".vcf") + new_header = vcf_reader.header.copy() + if mode == "gene": + new_header.add_format_line( + OrderedDict( + [ + ("ID", "GX"), + ("Number", "."), + ("Type", "String"), + ("Description", "Gene Expressions"), + ] + ) + ) + output_file = ("").join([head, ".gx.vcf", tail]) + elif mode == "transcript": + new_header.add_format_line( + OrderedDict( + [ + ("ID", "TX"), + ("Number", "."), + ("Type", "String"), + ("Description", "Transcript Expressions"), + ] + ) + ) + output_file = ("").join([head, ".tx.vcf", tail]) + if rna_vcf: + new_header.add_format_line( + OrderedDict( + [ + ("ID", "RAD"), + ("Number", "A"), + ("Type", "Integer"), + ( + "Description", + "Allelic depths from mRNA sequencing data of the same sample", + ), + ] + ) + ) + new_header.add_format_line( + OrderedDict( + [ + ("ID", "ROT"), + ("Number", "1"), + ("Type", "Integer"), + ( + "Description", + "Sum of allelic depths of other alternative allele from mRNA sequencing data of the same sample", + ), + ] + ) + ) + (head, _, tail) = output_file.rpartition(".vcf") + output_file = ("").join([head, ".pileup.vcf", tail]) + if output_vcf: + output_file = output_vcf + return Writer.from_path(output_file, new_header) + + +########################## Generate new vcf file ########################## +def adding_extra_information(args): + dna_vcf, is_multi_sample = dna_vcf_parser(args.input_vcf, args.sample_name) + if args.rna_vcf: + rna_vcf = rna_vcf_parser(args.rna_vcf) + # CONTIGS = [x.id for x in dna_vcf.header.get_lines("contig")] + format_pattern = re.compile("Format: (.*)") + csq_format = ( + format_pattern.search(dna_vcf.header.get_info_field_info("CSQ").description) + .group(1) + .split("|") + ) + if args.ignore_ensembl_id_version: + no_version = True + + vcf_writer = create_vcf_writer( + args.input_vcf, dna_vcf, args.mode, args.rna_vcf, args.output_vcf + ) + exp_dict = expression_file_parser( + args.expression_file, + args.format, + args.mode, + args.id_column, + args.expression_column, + no_version, + args.genecode, + ) + + missing_expressions_count = 0 + entry_count = 0 + + for entry in dna_vcf: + # Add expression data + transcript_ids = set() + genes = set() + if "CSQ" not in entry.INFO: + logging.warning( + "Variant is missing VEP annotation. INFO column doesn't contain CSQ field for variant {}".format( + entry + ) + ) + vcf_writer.write_record(entry) + continue + for transcript in entry.INFO["CSQ"]: + for key, value in zip(csq_format, transcript.split("|")): + if key == "Feature" and value != "" and not value.startswith("ENSR"): + transcript_ids.add(value) + if key == "Gene" and value != "": + genes.add(value) + + if args.mode == "gene": + genes = list(genes) + if len(genes) > 0: + (entry, missing_expressions_count, entry_count) = add_Expression_to_vcf( + entry, + is_multi_sample, + args.sample_name, + exp_dict, + genes, + "GX", + no_version, + missing_expressions_count, + entry_count, + ) + elif args.mode == "transcript": + transcript_ids = list(transcript_ids) + if len(transcript_ids) > 0: + (entry, missing_expressions_count, entry_count) = add_Expression_to_vcf( + entry, + is_multi_sample, + args.sample_name, + exp_dict, + transcript_ids, + "TX", + no_version, + missing_expressions_count, + entry_count, + ) + # Add RAD and ROT + if rna_vcf is not None: + if entry.is_snv(): + entry = add_AD_to_vcf(entry, is_multi_sample, args.sample_name, rna_vcf) + vcf_writer.write_record(entry) + + dna_vcf.close() + vcf_writer.close() + + if missing_expressions_count > 0: + logging.warning( + "{} of {} {}s did not have an expression entry for their {} id.".format( + missing_expressions_count, entry_count, args.mode, args.mode + ) + ) + + +########################## MAIN ########################## +def main(args_input=sys.argv[1:]): + parser = define_parser() + args = parser.parse_args(args_input) + adding_extra_information(args) + + +if __name__ == "__main__": + main() diff --git a/snappy_wrappers/wrappers/pvactools/combining/environment.yaml b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml new file mode 100644 index 000000000..47a97a8e3 --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/combining/environment.yaml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda +dependencies: + - vcfpy>=0.12.3 + - numpy>=1.25.2, <1.26 + - pyranges>=0.0.129, <0.1 + - pandas>=2.1.1, <2.2 + - python>=3.10.8, <3.11 + - bcftools>=1.17, <1.18 diff --git a/snappy_wrappers/wrappers/pvactools/combining/wrapper.py b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py new file mode 100644 index 000000000..f5a48fdea --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/combining/wrapper.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +""" +Wrapper for preparation the vcf file for somatic neoepitope prediction +""" + +import os + +from snakemake.shell import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +exp_format = config["preparation"]["format"] +preparation_vcf = os.path.join( + os.path.dirname(__file__), + "comb_rna.py", +) +ensemble_id = ( + "--ignore-ensembl-id-version" if snakemake.params.args["ignore_ensembl_id_version"] else "" +) +max_depth = snakemake.params.args["max_depth"] +format = snakemake.params.args["format"] +mode = snakemake.params.args["mode"] +expression_column = ( + f"--expression-file {snakemake.params.args['expression_column']}" if format == "custom" else "" +) + +id_column = f"--expression-file {snakemake.params.args['id_column']}" if format == "custom" else "" + +bam_file = snakemake.input.bam +gencount_file = snakemake.input.expression + +rna_vcf = ( + f"--rna-vcf $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz " + if format == "snappy_custom" + else "" +) +expression_file = ( + f"--expression-file {gencount_file} " + if format == "snappy_custom" + else config["preparation"]["expression_file"] +) + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi +# ----------------------------------------------------------------------------- +# Create auto-cleaned temporary directory +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +if [[ {format}=="snappy_custom" ]]; then + # Getting region of SNVs only + bcftools filter -i 'TYPE="snp"' {snakemake.input.vcf} | bcftools query -f '%CHROM\t%POS0\t%END\n' > $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed + + # Getting snvs from RNA sequencing data + bcftools mpileup -Ov \ + --annotate FORMAT/AD,FORMAT/DP \ + -R $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.bed \ + -f {snakemake.config[static_data_config][reference][path]} \ + --per-sample-mF \ + --max-depth {max_depth} \ + --redo-BAQ -Oz \ + -o $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz \ + {bam_file} + tabix -f $TMPDIR/{snakemake.wildcards.tumor_library}.tmp.vcf.gz +fi + +#Need to add option for RNA vcf as well +python3 {preparation_vcf} \ + --genecode {snakemake.config[static_data_config][features][path]} \ + --input-vcf {snakemake.input.vcf} --format {format} \ + --mode {mode} \ + {expression_column} \ + {id_column} \ + -s {snakemake.wildcards.tumor_library} \ + -o /dev/stdout \ + {expression_file}\ + {rna_vcf}\ + "{ensemble_id}" \ +| bgzip -c > {snakemake.output.vcf} +tabix -f {snakemake.output.vcf} + +pushd $(dirname {snakemake.output.vcf}) +md5sum $(basename {snakemake.output.vcf}) >$(basename {snakemake.output.vcf}).md5 +md5sum $(basename {snakemake.output.vcf_tbi}) >$(basename {snakemake.output.vcf_tbi}).md5 +""" +) diff --git a/snappy_wrappers/wrappers/pvactools/environment.yaml b/snappy_wrappers/wrappers/pvactools/environment.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml b/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml new file mode 100644 index 000000000..7ef7fd65f --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/pvacseq/environment.yaml @@ -0,0 +1,15 @@ +channels: + - conda-forge + - bioconda + - defaults + - default +dependencies: + - python==3.6.15 + # - pip==21.3.1 + # - pip: + # - pvactools==3.1.3 + # - mhcflurry==2.0.1 + # - mhcnuggets==2.3.3 + # - git+https://github.com/griffithlab/bigmhc.git#egg=bigmhc + # - git+https://github.com/griffithlab/deepimmuno.git#egg=deepimmuno + # - tensorflow==1.5.0 \ No newline at end of file diff --git a/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py new file mode 100644 index 000000000..ec4245fd1 --- /dev/null +++ b/snappy_wrappers/wrappers/pvactools/pvacseq/wrapper.py @@ -0,0 +1,159 @@ +import os + +from snakemake import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +algorithms = snakemake.params.args["algorithms"] +epitope_length = snakemake.params.args["epitope_length"] +normal_smaple = snakemake.params.args["normal_sample"] +BINDING_THRESHOLD = f"-b {snakemake.params.args["BINDING_THRESHOLD"]} " +percentile_threshold = ( + f"--percentile-threshold {snakemake.params.args["percentile_threshold"]} " + if snakemake.params.args["percentile_threshold"] is not None + else "" +) + + +allele_specific_binding_thresholds = ( + "--allele-specific-binding-thresholds " + if snakemake.params.args["allele_specific_binding_thresholds"] + else "" +) + +aggregate_inclusion_binding_threshold = ( + f"--aggregate-inclusion-binding-threshold {snakemake.params.args["aggregate_inclusion_binding_threshold"]} ", +) + +netmhc_stab = "--netmhc-stab " if snakemake.params.args["netmhc_stab"] else "" + +NET_CHOP_THRESHOLD = (f"--net-chop-threshold {snakemake.params.args["NET_CHOP_THRESHOLD"]} ",) + +PROBLEMATIC_AMINO_ACIDS = ( + f"--problematic-amino-acids {snakemake.params.args["PROBLEMATIC_AMINO_ACIDS"]} ", +) + +# if {snakemake.params.args["run_reference_proteome_similarity"]}: +# run_reference_proteome_similarity= f"--run-reference-proteome-similarity", + +FAST_SIZE = (f"-s {snakemake.params.args["FAST_SIZE"]} ",) + +exclude_NAs = "--exclude-NAs " if snakemake.params.args["exclude_NAs"] else "" + +NORMAL_COV = (f"--normal-cov {snakemake.params.args["NORMAL_COV"]} ",) +TDNA_COV = (f"--tdna-cov {snakemake.params.args["TDNA_COV"] }",) +TRNA_COV = (f"--trna-cov {snakemake.params.args["TRNA_COV"]} ",) +NORMAL_VAF = (f"--normal-vaf {snakemake.params.args["NORMAL_VAF"]} ",) + +maximum_transcript_support_level = ( + f"--maximum-transcript-support-level {snakemake.params.args["maximum_transcript_support_level"]} ", +) + +pass_only = "--pass-only " if snakemake.params.args["pass_only"] else "" + +tumor_purity = ( + f"--tumor-purity {snakemake.params.args["tumor_purity"]} " + if snakemake.params.args["tumor_purity"] is not None + else "" +) + +op_dir = "/".join(snakemake.output.all_epitopes.split("/")[:-2]) + +files_to_bind = { + "combine_vcf": snakemake.input.combine_vcf, + "op_dir": op_dir, +} + +# TODO: Put the following in a function (decide where...) +# Replace with full absolute paths +files_to_bind = {k: os.path.realpath(v) for k, v in files_to_bind.items()} +# Directories that mut be bound +dirs_to_bind = {k: os.path.dirname(v) for k, v in files_to_bind.items()} +# List of unique directories to bind: on cluster: -> from container: /bindings/d) +bound_dirs = {e[1]: e[0] for e in enumerate(list(set(dirs_to_bind.values())))} +# Binding command +bindings = " ".join(["-B {}:/bindings/d{}".format(k, v) for k, v in bound_dirs.items()]) +bound_files = { + k: "/bindings/d{}/{}".format(bound_dirs[dirs_to_bind[k]], os.path.basename(v)) + for k, v in files_to_bind.items() +} + +shell.executable("/bin/bash") + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +# Also pipe stderr to log file +if [[ -n "{snakemake.log.log}" ]]; then + if [[ "$(set +e; tty; set -e)" != "" ]]; then + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + exec 2> >(tee -a "{snakemake.log.log}" >&2) + else + rm -f "{snakemake.log.log}" && mkdir -p $(dirname {snakemake.log.log}) + echo "No tty, logging disabled" >"{snakemake.log.log}" + fi +fi +# Write out information about conda installation. +conda list >{snakemake.log.conda_list} +conda info >{snakemake.log.conda_info} +md5sum {snakemake.log.conda_list} >{snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} >{snakemake.log.conda_info_md5} + +# Setup auto-cleaned tmpdir +export TMPDIR=$(mktemp -d) +trap "rm -rf $TMPDIR" EXIT + +# Compute md5 checksum +md5() {{ + fn=$1 + d=$(dirname $fn) + f=$(basename $fn) + pushd $d 1> /dev/null 2>&1 + checksum=$(md5sum $f) + popd 1> /dev/null 2>&1 + echo "$checksum" +}} + +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + +hla_types=$(cat {snakemake.input.hla_normal_dna} {snakemake.input.hla_tumor_dna} {snakemake.input.hla_tumor_rna} | sed 's/^/HLA-/' | sort | uniq | tr '\n' ',' | sed 's/,$//') +cmd="pvacseq run --normal-sample-name {normal_smaple} \ + -e1 {epitope_length} \ + --iedb-install-directory /opt/iedb \ + -t {snakemake.threads} \ + {bound_files[combine_vcf]} \ + {snakemake.wildcards.tumor_library} \ + $hla_types \ + {algorithms} \ + {bound_files[op_dir]} \ + {BINDING_THRESHOLD}\ + {percentile_threshold}\ + {allele_specific_binding_thresholds}\ + {aggregate_inclusion_binding_threshold}\ + {NET_CHOP_THRESHOLD}\ + {PROBLEMATIC_AMINO_ACIDS}\ + {FAST_SIZE}\ + {NORMAL_COV}\ + {TDNA_COV}\ + {TRNA_COV}\ + {NORMAL_VAF}\ + {exclude_NAs}\ + {maximum_transcript_support_level}" +echo 'TMPDIR=/bindings/d2' > $TMPDIR/{snakemake.wildcards.tumor_library}.sh +echo $cmd >> $TMPDIR/{snakemake.wildcards.tumor_library}.sh +apptainer exec --home $PWD -B $TMPDIR:/bindings/d2 {bindings} {config[path_container]} bash /bindings/d2/{snakemake.wildcards.tumor_library}.sh + +md5 {snakemake.output.all_epitopes} > {snakemake.output.all_epitopes_md5} +md5 {snakemake.output.filtered_epitopes} > {snakemake.output.filtered_epitopes_md5} +""" +) diff --git a/snappy_wrappers/wrappers/stringtie/environment.yaml b/snappy_wrappers/wrappers/stringtie/environment.yaml new file mode 100644 index 000000000..b79f83c81 --- /dev/null +++ b/snappy_wrappers/wrappers/stringtie/environment.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - stringtie==2.2.1 \ No newline at end of file diff --git a/snappy_wrappers/wrappers/stringtie/wrapper.py b/snappy_wrappers/wrappers/stringtie/wrapper.py new file mode 100644 index 000000000..0ded26e3d --- /dev/null +++ b/snappy_wrappers/wrappers/stringtie/wrapper.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +""" +Wrapper for quantify gene and transcript expression +""" + +from snakemake.shell import shell + +__author__ = "Pham Gia Cuong" +__email__ = "pham.gia-cuong@bih-charite.de" + +step = snakemake.config["pipeline_step"]["name"] +config = snakemake.config["step_config"][step] + +shell( + r""" +# ----------------------------------------------------------------------------- +# Redirect stderr to log file by default and enable printing executed commands +exec 2> >(tee -a "{snakemake.log.log}") +set -x +# ----------------------------------------------------------------------------- + +# Write out information about conda installation +conda list > {snakemake.log.conda_list} +conda info > {snakemake.log.conda_info} + + +stringtie \ + -G {snakemake.config[static_data_config][features][path]} \ + -p 4 \ + -v \ + -o {snakemake.output.expression} \ + {snakemake.input.rna_bam} + +pushd $(dirname {snakemake.output.expression}) +md5sum $(basename {snakemake.output.expression}) > $(basename {snakemake.output.expression_md5}) +""" +) + + +# Compute MD5 sums of logs +shell( + r""" +md5sum {snakemake.log.log} > {snakemake.log.log_md5} +md5sum {snakemake.log.conda_list} > {snakemake.log.conda_list_md5} +md5sum {snakemake.log.conda_info} > {snakemake.log.conda_info_md5} +""" +) diff --git a/snappy_wrappers/wrappers/vep/run/wrapper.py b/snappy_wrappers/wrappers/vep/run/wrapper.py index 84da509d6..46c4b96a6 100644 --- a/snappy_wrappers/wrappers/vep/run/wrapper.py +++ b/snappy_wrappers/wrappers/vep/run/wrapper.py @@ -11,6 +11,15 @@ vep_config = snakemake.config["step_config"][current_step]["vep"] pick_order = ",".join(vep_config["pick_order"]) script_output_options = " ".join(["--" + x for x in vep_config["output_options"]]) +if vep_config["plugins"]: + plugins = " ".join(["--plugin " + x for x in vep_config["plugins"]]) + if not vep_config["plugins_dir"]: + raise Exception("Please provide plugins directory if you want to use plugins") + else: + plugins_dir = "--dir_plugins " + vep_config["plugins_dir"] +else: + plugins = "" + plugins_dir = "" full = snakemake.output.full if "full" in snakemake.output.keys() else "" @@ -43,6 +52,8 @@ echo --dir_cache {vep_config[cache_dir]} fi) \ {script_output_options} \ + {plugins} \ + {plugins_dir} \ --{vep_config[tx_flag]} \ --fasta {snakemake.config[static_data_config][reference][path]} \ --input_file {snakemake.input.vcf} --format vcf \ @@ -63,6 +74,8 @@ echo --dir_cache {vep_config[cache_dir]} fi) \ {script_output_options} \ + {plugins} \ + {plugins_dir} \ --pick --pick_order {pick_order} \ --{vep_config[tx_flag]} \ --fasta {snakemake.config[static_data_config][reference][path]} \ diff --git a/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py new file mode 100644 index 000000000..9eb3cc435 --- /dev/null +++ b/tests/snappy_pipeline/workflows/test_workflows_somatic_neoepitope_prediction.py @@ -0,0 +1,322 @@ +# -*- coding: utf-8 -*- +"""Tests for the somatic_variant_calling workflow module code""" + +import textwrap + +import pytest +import ruamel.yaml as ruamel_yaml +from snakemake.io import Wildcards + +from snappy_pipeline.workflows.somatic_neoepitope_prediction import ( + SomaticNeoepitopePredictionWorkflow, +) + +from .common import get_expected_log_files_dict, get_expected_output_vcf_files_dict +from .conftest import patch_module_fs + + +# Test tumor mutational burden calculation with vcf file from somatic variant calling step +@pytest.fixture(scope="module") # otherwise: performance issues +def minimal_config(): + """Return YAML parsing result for configuration""" + yaml = ruamel_yaml.YAML() + return yaml.load( + textwrap.dedent( + r""" + static_data_config: + reference: + path: /path/to/ref.fa + cosmic: + path: /path/to/cosmic.vcf.gz + dbsnp: + path: /path/to/dbsnp.vcf.gz + features: + path: /path/to/genecode.gtf + + step_config: + ngs_mapping: + tools: + dna: ['bwa'] + rna: ['star'] + star: + path_index: /path/to/star/index + bwa: + path_index: /path/to/bwa/index.fa + + somatic_variant_calling: + tools: + - mutect2 + - scalpel + mutect2: {} + scalpel: + path_target_regions: /path/to/target/regions.bed + + hla_typing: + path_link_in: '' # OPTIONAL Override data set configuration search paths for FASTQ files + tools: [optitype] # REQUIRED - available: 'optitype' and 'arcashla' + optitype: + max_reads: 5000 + num_mapping_threads: 4 + + somatic_variant_annotation: + path_somatic_variant_calling: ../somatic_variant_calling + tools: ["vep"] + vep: + cache_dir: /path/to/dir/cache + species: homo_sapiens_merged + assembly: GRCh38 + cache_version: '102' + tx_flag: gencode_basic + num_threads: 8 + buffer_size: 500 + output_options: + - everything + somatic_neoepitope_prediction: + path_somatic_variant_annotation: ../somatic_variant_annotation # REQUIRED + path_container: ../somatic_neoepitope_prediction/work/containers/out/pvactools.simg + path_rna_ngs_mapping: ../ngs_mapping + path_hla_typing: ../hla_typing + tools_somatic_variant_annotation: [vep] + preparation: + format: snappy_custom # REQUIRED - The file format of the expression file to process. (stringtie, kallisto, cufflinks, snappy_custom, custom) + mode: gene # REQUIRED - Determine whether the expression file contains gene or transcript TPM values. + prediction: + algorithms: ['MHCflurry','MHCnuggetsI'] + + data_sets: + first_batch: + file: sheet.tsv + search_patterns: + - {'left': '*/*/*_R1.fastq.gz', 'right': '*/*/*_R2.fastq.gz'} + search_paths: ['/path'] + type: matched_cancer + naming_scheme: only_secondary_id + """ + ).lstrip() + ) + + +@pytest.fixture +def somatic_neoepitope_prediction_workflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + work_dir, + config_paths, + cancer_sheet_fake_fs, + aligner_indices_fake_fs, + mocker, +): + # Patch out file-system related things in abstract (the crawling link in step is defined there) + patch_module_fs("snappy_pipeline.workflows.abstract", cancer_sheet_fake_fs, mocker) + patch_module_fs("snappy_pipeline.workflows.ngs_mapping.model", aligner_indices_fake_fs, mocker) + # Update the "globals" attribute of the mock workflow (snakemake.workflow.Workflow) so we + # can obtain paths from the function as if we really had a NGSMappingPipelineStep there + + dummy_workflow.globals = { + "ngs_mapping": lambda x: "NGS_MAPPING/" + x, + "somatic_variant_annotation": lambda x: "SOMATIC_VARIANT_ANNOTATION/" + x, + "hla_typing": lambda x: "HLA_TYPING/" + x, + } + # Construct the workflow object + return SomaticNeoepitopePredictionWorkflow( + dummy_workflow, + minimal_config, + config_lookup_paths, + config_paths, + work_dir, + ) + + +def test_somatic_neoepitope_preparation_step_part_get_input_files( + somatic_neoepitope_prediction_workflow, +): + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "var_caller": "mutect2", + "anno_caller": "vep", + "tumor_library": "P001-T1-DNA1-WGS1", + } + ) + # Define expected + vcf_base_out = ( + "SOMATIC_VARIANT_ANNOTATION/output/{mapper}.{var_caller}.{anno_caller}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" + ) + # vcf_base_out = ( + # "SOMATIC_VARIANT_ANNOTATION/output/bwa.mutect2.vep.P001-T1-DNA1-WES1/out/" + # "{mapper}.{var_caller}.{anno_caller}.{tumor_library}" + # ) + ngs_mapping_base_out = "NGS_MAPPING/output/star.P001-T1-RNA1-mRNA_seq1/out/" + expected = { + "vcf": vcf_base_out + ".full.vcf.gz", + "vcf_tbi": vcf_base_out + ".full.vcf.gz.tbi", + "expression": ngs_mapping_base_out + "star.P001-T1-RNA1-mRNA_seq1.GeneCounts.tab", + "bam": ngs_mapping_base_out + "star.P001-T1-RNA1-mRNA_seq1.bam", + } + + # Get actual + actual = somatic_neoepitope_prediction_workflow.get_input_files("pvacseq", "prepare")(wildcards) + assert actual == expected + + +def test_somatic_neoepitope_prediction_step_part_get_input_files( + somatic_neoepitope_prediction_workflow, +): + # Define expected + wildcards = Wildcards( + fromdict={ + "mapper": "bwa", + "var_caller": "mutect2", + "anno_caller": "vep", + "tumor_library": "P001-T1-DNA1-WGS1", + } + ) + vcf_base_out = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}" + ) + hla_typing_base_out = "HLA_TYPING/output/" + expected = { + "container": "work/containers/out/pvactools.simg", + "combine_vcf": vcf_base_out + ".vcf.gz", + "hla_normal_dna": hla_typing_base_out + + "optitype.P001-N1-DNA1-WGS1/out/optitype.P001-N1-DNA1-WGS1.txt", + "hla_tumor_dna": hla_typing_base_out + + "optitype.P001-T1-DNA1-WGS1/out/optitype.P001-T1-DNA1-WGS1.txt", + "hla_tumor_rna": hla_typing_base_out + + "optitype.P001-T1-RNA1-mRNA_seq1/out/optitype.P001-T1-RNA1-mRNA_seq1.txt", + } + + # Get actual + actual = somatic_neoepitope_prediction_workflow.get_input_files("pvacseq", "predict")(wildcards) + assert actual == expected + + +def test_somatic_neoepitope_preparation_step_part_get_output_files( + somatic_neoepitope_prediction_workflow, +): + base_out = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/out/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + expected = get_expected_output_vcf_files_dict(base_out) + actual = somatic_neoepitope_prediction_workflow.get_output_files("pvacseq", "prepare") + assert actual == expected + + +def test_somatic_neoepitope_preparation_step_part_get_log_files( + somatic_neoepitope_prediction_workflow, +): + base_out = ( + "work/prepare/{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}/log/" + "{mapper}.{var_caller}.{anno_caller}.GX.{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = somatic_neoepitope_prediction_workflow.get_log_file("pvacseq", "prepare") + assert actual == expected + + +def test_somatic_neoepitope_preparation_step_part_get_resource( + somatic_neoepitope_prediction_workflow, +): + # Define expected + expected_dict = {"threads": 1, "time": "01:00:00", "memory": "6G"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_neoepitope_prediction_workflow.get_resource( + "pvacseq", "prepare", resource + )() + assert actual == expected, msg_error + + +def test_somatic_neoepitope_prediction_step_part_get_output_files( + somatic_neoepitope_prediction_workflow, +): + base_out = "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/out/MHC_Class_I/{tumor_library}" + expected = { + "all_epitopes": base_out + ".all_epitopes.tsv", + "all_epitopes_md5": base_out + ".all_epitopes.tsv.md5", + "filtered_epitopes": base_out + ".filtered.tsv", + "filtered_epitopes_md5": base_out + ".filtered.tsv.md5", + } + actual = somatic_neoepitope_prediction_workflow.get_output_files("pvacseq", "predict") + assert actual == expected + + +def test_somatic_neoepitope_prediction_step_part_get_log_files( + somatic_neoepitope_prediction_workflow, +): + base_out = ( + "work/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.{tumor_library}.epitopes/" + "log/{tumor_library}" + ) + expected = get_expected_log_files_dict(base_out=base_out) + actual = somatic_neoepitope_prediction_workflow.get_log_file("pvacseq", "predict") + assert actual == expected + + +def test_somatic_neoepitope_prediction_step_part_get_resource( + somatic_neoepitope_prediction_workflow, +): + # Define expected + expected_dict = {"threads": 4, "time": "4:00:00", "memory": "30G"} + # Evaluate + for resource, expected in expected_dict.items(): + msg_error = f"Assertion error for resource '{resource}'." + actual = somatic_neoepitope_prediction_workflow.get_resource( + "pvacseq", "predict", resource + )() + assert actual == expected, msg_error + + +def test_somatic_neoepitope_prediction_workflow(somatic_neoepitope_prediction_workflow): + """Test simple functionality of the workflow""" + # Check created sub steps + expected = ["link_out", "pvacseq"] + actual = list(sorted(somatic_neoepitope_prediction_workflow.sub_steps.keys())) + assert actual == expected + + # Check result file construction + tpl = ( + "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.P00{i}-T{t}-DNA1-WGS1.epitopes/" + "out/MHC_Class_I/P00{i}-T{t}-DNA1-WGS1{ext}" + ) + log_tpl = ( + "output/predict/{mapper}.{var_caller}.{anno_caller}.{mode}.P00{i}-T{t}-DNA1-WGS1.epitopes/" + "log/P00{i}-T{t}-DNA1-WGS1{ext}" + ) + expected = [ + log_tpl.format( + mapper="bwa", var_caller=var_caller, anno_caller="vep", i=i, t=t, ext=ext, mode="GX" + ) + for (i, t) in ((1, 1), (2, 2)) + for var_caller in ("mutect2", "scalpel") + for ext in ( + ".log", + ".log.md5", + ".conda_info.txt", + ".conda_info.txt.md5", + ".conda_list.txt", + ".conda_list.txt.md5", + ) + ] + + expected += [ + tpl.format( + mapper="bwa", var_caller=var_caller, anno_caller="vep", i=i, t=t, ext=ext, mode="GX" + ) + for (i, t) in ((1, 1), (2, 2)) + for var_caller in ("mutect2", "scalpel") + for ext in ( + ".all_epitopes.tsv", + ".filtered.tsv", + ".all_epitopes.tsv.md5", + ".filtered.tsv.md5", + ) + ] + expected = list(sorted(expected)) + actual = list(sorted(somatic_neoepitope_prediction_workflow.get_result_files())) + assert expected == actual