From 6e6b0c34e98654a77dc8f6c4d15057eeef65ddbe Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 18:42:04 +0100
Subject: [PATCH 001/191] update gitignore file to current folder layout

---
 .gitignore | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index ee6ed52e..8fe284d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,23 +1,20 @@
 *
-!scripts
-!scripts/*
-!scripts/common
-!scripts/common/*
-scripts/.snakemake*
-!Snakefile
-!config.yaml
-!samples.tsv
-!resources
-!resources/*
-!envs
-!envs/*
-!environment.yaml
+!.test
+!.test/*
+!.test/config/*
+!.test/data/*
+!config
+!config/*
+!config/HLA_Data/*
+!workflow
+!workflow/*
+!workflow/report/*
+!workflow/envs/*
+!workflow/schemas/*
+!workflow/scripts/*
+!workflow/rules/*
+!workflow/rules/annotation/*
+!workflow/resources/*
+!.gitignore
 !LICENSE
 !README.md
-!rules
-!rules/*
-!.gitignore
-!.editorconfig
-!.gitattributes
-!.test
-!.test/data

From e73b8d8f8c8532c909a362b87165ec37858d3b89 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 18:42:59 +0100
Subject: [PATCH 002/191] switch schemas and tsvs for samples and units to same
 column names as dna-seq-varlociraptor workflow

---
 .test/config/samples.tsv             |  6 ++---
 .test/config/units.tsv               |  2 +-
 config/samples.tsv                   |  9 +++----
 config/units.tsv                     |  3 ++-
 workflow/schemas/samples.schema.yaml | 36 +++++++++++++++++++++-------
 workflow/schemas/units.schema.yaml   | 20 +++++++++-------
 6 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
index 044002d7..2aca82dc 100644
--- a/.test/config/samples.tsv
+++ b/.test/config/samples.tsv
@@ -1,3 +1,3 @@
-sample	type	matched_normal	purity	platform
-A_normal	normal			ILLUMINA
-A_tumor	tumor	A_normal	1	ILLUMINA
+sample_name	group	alias	purity	platform
+A_normal	A	normal	1	ILLUMINA
+A_tumor	A	tumor	.99	ILLUMINA
diff --git a/.test/config/units.tsv b/.test/config/units.tsv
index cdb8af8e..4984edd8 100644
--- a/.test/config/units.tsv
+++ b/.test/config/units.tsv
@@ -1,3 +1,3 @@
-sample	sequencing_type	unit	fq1	fq2	sra	adapters
+sample_name	sequencing_type	unit_name	fq1	fq2	sra	adapters
 A_normal	DNA	lane1	data/reads/A_normal.1.fastq.gz	data/reads/A_normal.2.fastq.gz		
 A_tumor	DNA	lane1	data/reads/A_tumor.1.fastq.gz	data/reads/A_tumor.2.fastq.gz		
diff --git a/config/samples.tsv b/config/samples.tsv
index 6e291f15..249e0521 100644
--- a/config/samples.tsv
+++ b/config/samples.tsv
@@ -1,4 +1,5 @@
-sample	type	matched_normal	purity	platform
-A_normal	normal			ILLUMINA
-A_tumor	tumor	A_normal	1	ILLUMINA
-B_tumor	tumor	A_normal	1	ILLUMINA
+sample_name	group	alias	purity	platform
+A_normal	A	normal	1	ILLUMINA
+A_tumor	A	tumor	.99	ILLUMINA
+B_normal	B	normal	1	ILLUMINA
+B_tumor	B	tumor	.98	ILLUMINA
diff --git a/config/units.tsv b/config/units.tsv
index f8758dfb..c417a45c 100644
--- a/config/units.tsv
+++ b/config/units.tsv
@@ -1,5 +1,6 @@
-sample	sequencing_type	unit	fq1	fq2	sra	adapters
+sample_name	sequencing_type	unit_name	fq1	fq2	sra	adapters
 A_normal	DNA	lane1	A_normal_1.fastq.gz	A_normal_2.fastq.gz		
 A_tumor	DNA	lane2	A_tumor_1.fastq.gz	A_tumor_2.fastq.gz		
+B_normal	DNA	lane1	B_normal_1.fastq.gz	B_normal_2.fastq.gz		
 B_tumor	DNA	lane1	B_tumor_1.fastq.gz	B_tumor_2.fastq.gz		
 B_tumor	RNA	lane1	B_tumor_RNA_1.fastq.gz	B_tumor_RNA_2.fastq.gz		
diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml
index e626f60f..e795cdee 100644
--- a/workflow/schemas/samples.schema.yaml
+++ b/workflow/schemas/samples.schema.yaml
@@ -2,12 +2,18 @@ $schema: "http://json-schema.org/draft-04/schema#"
 
 description: an entry in the sample sheet
 properties:
-  sample:
+  sample_name:
     type: string
-    description: sample name/identifier
-  type:
+    description: sample name/identifier (alphanumeric string, that may additionally contain '_' and '-')
+    pattern: "^[a-zA-Z_0-9-]+$"
+  alias:
     type: string
-    description: healthy or tumor sample
+    description: sample name within the VCF/BCF files generated for a group (e.g. tumor, normal, etc.) (alphanumeric string, that may additionally contain '_' and '-')
+    pattern: "^[a-zA-Z_0-9-]+$"
+  group:
+    type: string
+    description: group of samples called jointly (alphanumeric string, that may additionally contain '_' and '-')
+    pattern: "^[a-zA-Z_0-9-]+$"
   matched_normal:
     type: string
     description: the corresponding healthy control to this tumor sample
@@ -18,11 +24,20 @@ properties:
     description: Purity to use for tumor/normal groups.
   platform:
     type: string
-    enum: ["CAPILLARY", "LS454", "ILLUMINA", "SOLID", "HELICOS", "IONTORRENT", "ONT", "PACBIO"]
-
+    enum:
+      - "CAPILLARY"
+      - "LS454"
+      - "ILLUMINA"
+      - "SOLID"
+      - "HELICOS"
+      - "IONTORRENT"
+      - "ONT"
+      - "PACBIO"
+    description: used sequencing platform
   
 
 required:
-  - sample
-  - type
+  - sample_name
+  - alias
+  - group
   - platform
diff --git a/workflow/schemas/units.schema.yaml b/workflow/schemas/units.schema.yaml
index 6fb9a6cf..4fec2074 100644
--- a/workflow/schemas/units.schema.yaml
+++ b/workflow/schemas/units.schema.yaml
@@ -2,18 +2,22 @@ $schema: "http://json-schema.org/draft-04/schema#"
 description: row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data
 type: object
 properties:
-  sample:
+  sample_name:
     type: string
-    description: sample name/id the unit has been sequenced from
+    pattern: "^[a-zA-Z_0-9-]+$"
+    description: sample name/id the unit has been sequenced from (alphanumeric string, that may additionally contain '_' and '-')
   sequencing_type:
     type: string
     enum: ["DNA", "RNA"]
-  unit:
+    description: type of sequenced material ('DNA' or 'RNA')
+  unit_name:
     type: string
-    description: unit or lane name
+    pattern: "^[a-zA-Z_0-9-]+$"
+    description: unit id (alphanumeric string, that may additionally contain '_' and '-')
   fq1:
     type: string
-    description: path to FASTQ file
+    pattern: "^[^ \t]+$"
+    description: path to FASTQ file (may not contain whitespace)
   fq2:
     type: string
     description: path to second FASTQ file (leave empty in case of single-end)
@@ -22,9 +26,9 @@ properties:
     description: SRA id for automatic download of unit
   adapters:
     type: string
-    description: adapter trimming settings to use (for cutadapt)
+    description: cutadapt adapter trimming settings to use (see https://cutadapt.readthedocs.io)
 
 required:
-  - sample
-  - unit
+  - sample_name
+  - unit_name
   - sequencing_type

From 23f6738eeeb1dab7fdfca15bb10c601252909d77 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 18:46:05 +0100
Subject: [PATCH 003/191] adapt parsing and wildcards to new samples.tsv and
 units.tsv column names, make wildcard names more descriptive

---
 workflow/rules/MHC_binding.smk       |  38 +++++-----
 workflow/rules/annotation.smk        |   8 +-
 workflow/rules/calling.smk           |  18 ++---
 workflow/rules/candidate_calling.smk |  10 +--
 workflow/rules/common.smk            | 105 +++++++++++++++++----------
 workflow/rules/filtering.smk         |  36 ++++-----
 workflow/rules/microphaser.smk       |   8 +-
 workflow/rules/phylogeny.smk         |   2 +-
 workflow/rules/tmb.smk               |   6 +-
 workflow/rules/varlociraptor.smk     |  30 ++++----
 10 files changed, 145 insertions(+), 116 deletions(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 2d95e343..e43f248d 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -1,12 +1,12 @@
 # rule mhcflurry:
 #     input:
-#         peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{group}.fa",
+#         peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{peptide_type}.fa",
 #         alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv",
 #         wt_alleles=get_germline_optitype
 #     output:
-#         "results/mhcflurry/{sample}/{chr}/output.{group}.csv"
+#         "results/mhcflurry/{sample}/{chr}/output.{peptide_type}.csv"
 #     log:
-#         "logs/mhcflurry/{sample}-{chr}-{group}.log"
+#         "logs/mhcflurry/{sample}-{chr}-{peptide_type}.log"
 #     run:
 #         if "wt" in input.peptides:
 #             alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0])
@@ -18,12 +18,12 @@
 
 rule netMHCpan:
     input:
-        peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{group}.fa",
+        peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{group}.xls",
+        "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls",
     log:
-        "logs/netMHCpan/{sample}-{chr}-{group}.log",
+        "logs/netMHCpan/{sample}-{chr}-{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
@@ -35,12 +35,12 @@ rule netMHCpan:
 
 rule netMHCIIpan:
     input:
-        peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{group}.fa",
+        peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{group}.xls",
+        "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls",
     log:
-        "logs/netMHCIIpan/{sample}-{chr}-{group}.log",
+        "logs/netMHCIIpan/{sample}-{chr}-{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
@@ -53,13 +53,13 @@ rule netMHCIIpan:
 rule parse_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{group}}.xls",
+            "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{peptide_type}}.xls",
             chr=contigs,
         ),
     output:
-        "results/{mhc}/{sample}/{sample}.mhc.{group}.tsv",
+        "results/{mhc}/{sample}/{sample}.mhc.{peptide_type}.tsv",
     log:
-        "logs/parse-mhc/{mhc}-{sample}-{group}.log",
+        "logs/parse-mhc/{mhc}-{sample}-{peptide_type}.log",
     wildcard_constraints:
         group="wt|mt",
     script:
@@ -68,13 +68,13 @@ rule parse_mhc_out:
 
 # rule parse_mhcflurry:
 #     input:
-#         expand("results/mhcflurry/{{sample}}/{chr}/output.{{group}}.csv", chr=contigs)
+#         expand("results/mhcflurry/{{sample}}/{chr}/output.{{peptide_type}}.csv", chr=contigs)
 #     output:
-#         "results/mhcflurry/{sample}/{sample}.mhc.{group}.csv"
+#         "results/mhcflurry/{sample}/{sample}.mhc.{peptide_type}.csv"
 #     wildcard_constraints:
 #         group="wt|mt"
 #     log:
-#         "logs/parse-mhc/mhcflurry-{sample}-{group}.log"
+#         "logs/parse-mhc/mhcflurry-{sample}-{peptide_type}.log"
 #     conda:
 #         "../envs/xsv.yaml"
 #     shell:
@@ -84,8 +84,8 @@ rule parse_mhc_out:
 rule mhc_csv_table:
     input:
         info="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv",
-        mt="results/{mhc}/{sample}/{sample}.mhc.mt.tsv",
-        wt="results/{mhc}/{sample}/{sample}.mhc.wt.tsv",
+        neo="results/{mhc}/{sample}/{sample}.mhc.neo.tsv",
+        normal="results/{mhc}/{sample}/{sample}.mhc.normal.tsv",
     output:
         report(
             "results/neoantigens/{mhc}/{sample}.DNA.tsv",
@@ -101,8 +101,8 @@ rule mhc_csv_table:
 # rule mhcflurry_table:
 #     input:
 #         info="results/microphaser/info/{sample}/filtered/mhcflurry/{sample}.tsv",
-#         mt="results/mhcflurry/{sample}/{sample}.mhc.mt.tsv",
-#         wt="results/mhcflurry/{sample}/{sample}.mhc.wt.tsv"
+#         neo="results/mhcflurry/{sample}/{sample}.mhc.neo.tsv",
+#         normal="results/mhcflurry/{sample}/{sample}.mhc.normal.tsv"
 #     output:
 #         report("results/neoantigens/mhcflurry/{sample}.WES.tsv", caption="../report/WES_results.rst", category="Results WES (MHCFlurry)")
 #     script:
diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 64088375..8a504e98 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -1,12 +1,12 @@
 rule annotate_variants:
     input:
-        calls="results/calls/{group}.{scatteritem}.bcf",
+        calls="results/calls/{cancer_sample}.{scatteritem}.bcf",
         cache="resources/vep/cache",
         plugins="resources/vep/plugins",
     output:
-        calls="results/calls/{group}.{scatteritem}.annotated.bcf",
+        calls="results/calls/{cancer_sample}.{scatteritem}.annotated.bcf",
         stats=report(
-            "results/calls/{group}.{scatteritem}.stats.html",
+            "results/calls/{cancer_sample}.{scatteritem}.stats.html",
             caption="../report/stats.rst",
             category="QC",
         ),
@@ -16,7 +16,7 @@ rule annotate_variants:
         plugins=config["annotations"]["vep"]["plugins"],
         extra="{} --vcf_info_field ANN".format(config["annotations"]["vep"]["params"]),
     log:
-        "logs/vep/{group}.{scatteritem}.annotate.log",
+        "logs/vep/{cancer_sample}.{scatteritem}.annotate.log",
     wrapper:
         "0.59.2/bio/vep/annotate"
 
diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
index 98ef992f..70a228a4 100644
--- a/workflow/rules/calling.smk
+++ b/workflow/rules/calling.smk
@@ -2,16 +2,16 @@ rule strelka_somatic:
     input:
         normal=get_normal_bam,
         normal_index=get_normal_bai,
-        tumor="results/recal/{sample}.sorted.bam",
-        tumor_index="results/recal/{sample}.sorted.bam.bai",
+        tumor="results/recal/{cancer_sample}.sorted.bam",
+        tumor_index="results/recal/{cancer_sample}.sorted.bam.bai",
         fasta="resources/genome.fasta",
         fasta_index="resources/genome.fasta.fai",
         callregions="resources/genome.callregions.bed.gz",
     output:
-        "results/strelka/somatic/{sample}/results/variants/somatic.snvs.vcf.gz",
-        "results/strelka/somatic/{sample}/results/variants/somatic.indels.vcf.gz",
+        "results/strelka/somatic/{cancer_sample}/results/variants/somatic.snvs.vcf.gz",
+        "results/strelka/somatic/{cancer_sample}/results/variants/somatic.indels.vcf.gz",
     log:
-        "logs/calling/strelka_somatic/{sample}.log",
+        "logs/calling/strelka_somatic/{cancer_sample}.log",
     params:
         config_extra="--callRegions {} {}".format(
             "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"]
@@ -24,15 +24,15 @@ rule strelka_somatic:
 
 rule strelka_germline:
     input:
-        bam="results/recal/{normal}.sorted.bam",
-        normal_index="results/recal/{normal}.sorted.bam.bai",
+        bam="results/recal/{normal_sample}.sorted.bam",
+        normal_index="results/recal/{normal_sample}.sorted.bam.bai",
         fasta="resources/genome.fasta",
         fasta_index="resources/genome.fasta.fai",
         callregions="resources/genome.callregions.bed.gz",
     output:
-        "results/strelka/germline/{normal}/results/variants/variants.vcf.gz",
+        "results/strelka/germline/{normal_sample}/results/variants/variants.vcf.gz",
     log:
-        "logs/calling/strelka_germline/{normal}.log",
+        "logs/calling/strelka_germline/{normal_sample}.log",
     params:
         config_extra="--callRegions {} {}".format(
             "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"]
diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk
index 74386763..2cdfbb2c 100644
--- a/workflow/rules/candidate_calling.smk
+++ b/workflow/rules/candidate_calling.smk
@@ -4,9 +4,9 @@ rule freebayes:
         # you can have a list of samples here
         samples=get_paired_bams,
     output:
-        "results/candidate-calls/{pair}.freebayes.bcf",
+        "results/candidate-calls/{cancer_sample}.freebayes.bcf",
     log:
-        "logs/{pair}.log",
+        "logs/{cancer_sample}.log",
     params:
         extra=config["params"].get("freebayes", ""),
         chunksize=100000,
@@ -17,11 +17,11 @@ rule freebayes:
 
 rule scatter_candidates:
     input:
-        "results/candidate-calls/{pair}.{caller}.bcf",
+        "results/candidate-calls/{cancer_sample}.{caller}.bcf",
     output:
-        scatter.calling("results/candidate-calls/{{pair}}.{{caller}}.{scatteritem}.bcf"),
+        scatter.calling("results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf"),
     log:
-        "logs/scatter-candidates/{pair}.{caller}.log",
+        "logs/scatter-candidates/{cancer_sample}.{caller}.log",
     conda:
         "../envs/rbt.yaml"
     shell:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 3bc6c60d..5b371175 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -10,13 +10,31 @@ ftp = FTP.RemoteProvider()
 
 validate(config, schema="../schemas/config.schema.yaml")
 
-##### sample sheets #####
-
-samples = pd.read_csv(config["samples"], sep="\t").set_index("sample", drop=False)
+##### samples sheet #####
+
+samples = (
+    pd.read_csv(
+        config["samples"],
+        sep="\t",
+        dtype={"sample_name": str, "group": str},
+        comment="#",
+    )
+    .set_index("sample_name", drop=False)
+    .sort_index()
+)
 validate(samples, schema="../schemas/samples.schema.yaml")
 
-units = pd.read_csv(config["units"], dtype=str, sep="\t").set_index(
-    ["sample", "sequencing_type", "unit"], drop=False
+##### units sheet #####
+
+units = (
+    pd.read_csv(
+        config["units"],
+        sep="\t",
+        dtype={"sample_name": str, "sequencing_type": str, "unit_name": str},
+        comment="#",
+    )
+    .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False)
+    .sort_index()
 )
 validate(units, schema="../schemas/units.schema.yaml")
 
@@ -27,10 +45,14 @@ contigs.extend(["X", "Y"])
 
 
 wildcard_constraints:
-    pair="|".join(samples[samples.type == "tumor"]["sample"]),
-    sample="|".join(samples["sample"]),
+    cancer_sample="|".join(samples[samples.alias != "normal"]["sample_name"]),
+    sample="|".join(samples["sample_name"]),
+    unit="|".join(units["unit_name"]),
+    alias="|".join(pd.unique(samples["alias"])),
+    group="|".join(pd.unique(samples["group"])),
     caller="|".join(["freebayes", "delly"]),
-    event="somatic|germline|complete",
+    peptide_type="|".join(["normal", "neo"]),
+    event="|".join(["somatic", "germline", "complete"]),
 
 
 ### Output generation ###
@@ -46,9 +68,9 @@ def is_activated(xpath):
 def get_final_output():
     if config["epitope_prediction"]["activate"]:
         final_output = expand(
-            "results/neoantigens/{mhc}/{S.sample}.{S.sequencing_type}.xlsx",
-            S=units.loc[samples[samples.type == "tumor"]["sample"]]
-            .drop_duplicates(["sample", "sequencing_type"])
+            "results/neoantigens/{mhc}/{S.sample_name}.{S.sequencing_type}.xlsx",
+            S=units.loc[samples[samples.alias == "tumor"]["sample_name"]]
+            .drop_duplicates(["sample_name", "sequencing_type"])
             .itertuples(),
             mhc=list(
                 filter(
@@ -68,12 +90,12 @@ def get_final_output():
                     "results/HLA-LA/hlaI_{sample}.tsv",
                     "results/HLA-LA/hlaII_{sample}.tsv",
                 ],
-                sample=samples["sample"],
+                sample=samples["sample_name"],
             )
         else:
             final_output = expand(
                 "results/optitype/{sample}/hla_alleles_{sample}.tsv",
-                sample=samples["sample"],
+                sample=samples["sample_name"],
             )
     return final_output
 
@@ -82,7 +104,7 @@ def get_fusion_output():
     if config["fusion"]["arriba"]["activate"]:
         fusion_output = expand(
             "results/fusion/arriba/{sample}.fusions.tsv",
-            sample=units[units["sequencing_type"] == "RNA"]["sample"],
+            sample=units[units["sequencing_type"] == "RNA"]["sample_name"],
         )
     else:
         fusion_output = []
@@ -93,7 +115,7 @@ def get_tmb_targets():
     if is_activated("tmb"):
         return expand(
             "results/plots/tmb/{group}.{mode}.svg",
-            group=samples[(samples.type == "tumor")]["sample"],
+            group=samples[(samples.alias == "tumor")]["sample_name"],
             mode=config["tmb"].get("mode", "curve"),
         )
     else:
@@ -125,13 +147,13 @@ def get_cutadapt_input(wildcards):
     if pd.isna(unit["fq2"]):
         # single end local sample
         return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format(
-            S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending
+            S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending
         )
     else:
         # paired end local sample
         return expand(
             "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format(
-                S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending
+                S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending
             ),
             read=["fq1", "fq2"],
         )
@@ -225,7 +247,7 @@ def get_optitype_reads_input(wildcards):
 
 def get_oncoprint_batch(wildcards):
     if wildcards.batch == "all":
-        groups = samples[samples["type"] == "tumor"]["sample"].unique()
+        groups = samples[samples["alias"] == "tumor"]["sample_name"].unique()
     else:
         groups = samples.loc[
             samples[config["oncoprint"]["stratify"]["by-column"]] == wildcards.batch,
@@ -241,15 +263,15 @@ def get_oncoprint_batch(wildcards):
 
 def get_annotated_bcf(wildcards):
     selection = ".annotated"
-    return "results/calls/{pair}.{scatteritem}{selection}.bcf".format(
-        pair=wildcards.pair, selection=selection, scatteritem=wildcards.scatteritem
+    return "results/calls/{cancer_sample}.{scatteritem}{selection}.bcf".format(
+        cancer_sample=wildcards.cancer_sample, selection=selection, scatteritem=wildcards.scatteritem
     )
 
 
 def get_scattered_calls(ext=".bcf"):
     def inner(wildcards):
         return expand(
-            "results/calls/{{pair}}.{caller}.{{scatteritem}}.sorted{ext}",
+            "results/calls/{{cancer_sample}}.{caller}.{{scatteritem}}.sorted{ext}",
             caller=caller,
             ext=ext,
         )
@@ -278,7 +300,7 @@ def get_pair_variants(wildcards, index):
     ]
     variants.append(
         "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format(
-            get_normal(wildcards), ext
+            get_normal(wildcards.sample), ext
         )
     )
     return variants
@@ -286,9 +308,9 @@ def get_pair_variants(wildcards, index):
 
 def get_pair_observations(wildcards):
     return expand(
-        "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf",
+        "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf",
         caller=wildcards.caller,
-        pair=wildcards.pair,
+        cancer_sample=wildcards.cancer_sample,
         scatteritem=wildcards.scatteritem,
         sample=get_paired_samples(wildcards),
     )
@@ -297,7 +319,7 @@ def get_pair_observations(wildcards):
 def get_merge_input(ext=".bcf"):
     def inner(wildcards):
         return expand(
-            "results/calls/{{pair}}.{vartype}.{{event}}.fdr-controlled{ext}",
+            "results/calls/{{cancer_sample}}.{vartype}.{{event}}.fdr-controlled{ext}",
             ext=ext,
             vartype=["SNV", "INS", "DEL", "MNV"],
             filter=config["calling"]["fdr-control"]["events"][wildcards.event],
@@ -308,8 +330,11 @@ def get_merge_input(ext=".bcf"):
 
 def get_pair_aliases(wildcards):
     return [
-        samples.loc[samples.loc[wildcards.pair, "matched_normal"], "type"],
-        samples.loc[wildcards.pair, "type"],
+        samples.loc[
+            get_normal(wildcards.cancer_sample),
+            "alias"
+        ],
+        samples.loc[wildcards.cancer_sample, "alias"],
     ]
 
 
@@ -350,8 +375,8 @@ def kallisto_params(wildcards, input):
 
 def get_paired_samples(wildcards):
     return [
-        samples.loc[(wildcards.pair), "matched_normal"],
-        samples.loc[wildcards.pair, "sample"],
+        get_normal(wildcards.cancer_sample),
+        samples.loc[wildcards.cancer_sample, "sample_name"],
     ]
 
 
@@ -367,8 +392,12 @@ def get_paired_bais(wildcards):
     )
 
 
-def get_normal(wildcards):
-    return samples.loc[(wildcards.sample), "matched_normal"]
+def get_normal(sample_name):
+    normal_sample = samples.loc[
+        (samples["group"] == samples.loc[sample_name, "group"]) & (samples["alias"] == "normal"),
+        "sample_name"
+    ].iat[0]
+    return normal_sample
 
 
 def get_reads(wildcards):
@@ -382,30 +411,30 @@ def get_seperate(sample, group):
 def get_proteome(wildcards):
     return expand(
         "results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin",
-        normal=get_normal(wildcards),
+        normal=get_normal(wildcards.sample),
         mhc=wildcards.mhc,
     )
 
 
 def get_alleles_MHCI(wildcards):
-    if wildcards.group == "wt":
+    if wildcards.peptide_type == "normal":  # peptide_type is constrained to "normal" or "neo"
         return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
-            S=get_normal(wildcards)
+            S=get_normal(wildcards.sample)
         )
     else:
         return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.sample)
 
 
 def get_alleles_MHCII(wildcards):
-    if wildcards.group == "wt":
-        return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards))
+    if wildcards.peptide_type == "normal":  # peptide_type is constrained to "normal" or "neo"
+        return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.sample))
     else:
         return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.sample)
 
 
 def get_normal_bam(wildcards):
-    return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards))
+    return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample))
 
 
 def get_normal_bai(wildcards):
-    return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards))
+    return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards.cancer_sample))
diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk
index f32c203d..169265f0 100644
--- a/workflow/rules/filtering.smk
+++ b/workflow/rules/filtering.smk
@@ -17,13 +17,13 @@ rule filter_odds:
     input:
         get_annotated_bcf,
     output:
-        "results/calls/{pair}.{event}.{scatteritem}.filtered_odds.bcf",
+        "results/calls/{cancer_sample}.{event}.{scatteritem}.filtered_odds.bcf",
     params:
         events=lambda wc: config["calling"]["fdr-control"]["events"][wc.event][
             "varlociraptor"
         ],
     log:
-        "logs/filter-calls/posterior_odds/{pair}.{scatteritem}.{event}.log",
+        "logs/filter-calls/posterior_odds/{cancer_sample}.{scatteritem}.{event}.log",
     conda:
         "../envs/varlociraptor.yaml"
     shell:
@@ -33,15 +33,15 @@ rule filter_odds:
 rule gather_calls:
     input:
         calls=gather.calling(
-            "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf"
+            "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf"
         ),
         idx=gather.calling(
-            "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi"
+            "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi"
         ),
     output:
-        "results/calls/{pair}.{event}.filtered_odds.bcf",
+        "results/calls/{cancer_sample}.{event}.filtered_odds.bcf",
     log:
-        "logs/gather-calls/{pair}.{event}.log",
+        "logs/gather-calls/{cancer_sample}.{event}.log",
     params:
         "-a -Ob",
     wrapper:
@@ -50,11 +50,11 @@ rule gather_calls:
 
 rule control_fdr:
     input:
-        "results/calls/{pair}.{event}.filtered_odds.bcf",
+        "results/calls/{cancer_sample}.{event}.filtered_odds.bcf",
     output:
-        "results/calls/{pair}.{vartype}.{event}.fdr-controlled.bcf",
+        "results/calls/{cancer_sample}.{vartype}.{event}.fdr-controlled.bcf",
     log:
-        "logs/control-fdr/{pair}.{vartype}.{event}.log",
+        "logs/control-fdr/{cancer_sample}.{vartype}.{event}.log",
     params:
         query=get_fdr_control_params,
     conda:
@@ -69,9 +69,9 @@ rule merge_calls:
         calls=get_merge_input(".bcf"),
         idx=get_merge_input(".bcf.csi"),
     output:
-        "results/merged-calls/{pair}.{event}.fdr-controlled.bcf",
+        "results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf",
     log:
-        "logs/merge-calls/{pair}.{event}.log",
+        "logs/merge-calls/{cancer_sample}.{event}.log",
     params:
         "-a -Ob",
     wrapper:
@@ -80,11 +80,11 @@ rule merge_calls:
 
 rule change_samplenames:
     input:
-        call="results/merged-calls/{pair}.{event}.fdr-controlled.bcf",
+        call="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf",
     output:
-        temp("results/merged-calls/{pair}.{event}.renaming.txt"),
+        temp("results/merged-calls/{cancer_sample}.{event}.renaming.txt"),
     log:
-        "logs/change-samplenames/{pair}.{event}.log",
+        "logs/change-samplenames/{cancer_sample}.{event}.log",
     params:
         prefix=lambda w, input: os.path.basename(input["call"]).split(".")[0],
     shell:
@@ -93,12 +93,12 @@ rule change_samplenames:
 
 rule reheader_varlociraptor:
     input:
-        vcf="results/merged-calls/{pair}.{event}.fdr-controlled.bcf",
-        samples="results/merged-calls/{pair}.{event}.renaming.txt",
+        vcf="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf",
+        samples="results/merged-calls/{cancer_sample}.{event}.renaming.txt",
     output:
-        "results/merged-calls/{pair}.{event}.reheader.bcf",
+        "results/merged-calls/{cancer_sample}.{event}.reheader.bcf",
     log:
-        "logs/reheader-calls/{pair}.{event}.log",
+        "logs/reheader-calls/{cancer_sample}.{event}.log",
     params:
         extra="",
         view_extra="-O b",
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index bd9e6510..582ac7c1 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -6,8 +6,8 @@ rule microphaser_somatic:
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.mt.fa",
-        wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.wt.fa",
+        mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.neo.fa",
+        wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.normal.fa",
         tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv",
     log:
         "logs/microphaser/somatic/{sample}-{contig}.log",
@@ -83,10 +83,10 @@ rule microphaser_filter:
         proteome=get_proteome,
     output:
         mt_fasta=(
-            "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.mt.fa"
+            "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.neo.fa"
         ),
         wt_fasta=(
-            "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.wt.fa"
+            "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.normal.fa"
         ),
         tsv="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.{contig}.tsv",
         removed="results/microphaser/info/{sample}/removed/{mhc}/{sample}.{contig}.removed.tsv",
diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk
index 872247ac..d7401bf8 100644
--- a/workflow/rules/phylogeny.smk
+++ b/workflow/rules/phylogeny.smk
@@ -1,7 +1,7 @@
 def get_somatic_calls(wildcards):
     return expand(
         "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf",
-        sample=samples[samples.type == "tumor"]["sample"],
+        sample=samples[samples.alias == "tumor"]["sample_name"],
     )
 
 
diff --git a/workflow/rules/tmb.smk b/workflow/rules/tmb.smk
index 2ef064a1..e2b2a43d 100644
--- a/workflow/rules/tmb.smk
+++ b/workflow/rules/tmb.smk
@@ -2,13 +2,13 @@ if config["tmb"]["activate"]:
 
     rule estimate_tmb:
         input:
-            "results/merged-calls/{pair}.somatic.fdr-controlled.bcf",
+            "results/merged-calls/{cancer_sample}.somatic.fdr-controlled.bcf",
         output:
-            "results/plots/tmb/{pair}.{plotmode}.vl.json",
+            "results/plots/tmb/{cancer_sample}.{plotmode}.vl.json",
         conda:
             "../envs/varlociraptor.yaml"
         log:
-            "logs/tmb/{pair}-{plotmode}.log",
+            "logs/tmb/{cancer_sample}-{plotmode}.log",
         params:
             **config["tmb"],
         shell:
diff --git a/workflow/rules/varlociraptor.smk b/workflow/rules/varlociraptor.smk
index 4cb6013a..f94052fd 100644
--- a/workflow/rules/varlociraptor.smk
+++ b/workflow/rules/varlociraptor.smk
@@ -3,14 +3,14 @@ rule render_scenario:
         config["calling"]["scenario"],
     output:
         report(
-            "results/scenarios/{pair}.yaml",
+            "results/scenarios/{cancer_sample}.yaml",
             caption="../report/scenario.rst",
             category="Variant calling scenarios",
         ),
     params:
         samples=samples,
     log:
-        "logs/scenarious/{pair}.log",
+        "logs/scenarious/{cancer_sample}.log",
     conda:
         "../envs/render_scenario.yaml"
     script:
@@ -21,15 +21,15 @@ rule varlociraptor_preprocess:
     input:
         ref="resources/genome.fasta",
         ref_idx="resources/genome.fasta.fai",
-        candidates="results/candidate-calls/{pair}.{caller}.{scatteritem}.bcf",
+        candidates="results/candidate-calls/{cancer_sample}.{caller}.{scatteritem}.bcf",
         bam="results/recal/{sample}.sorted.bam",
-        bai="results/recal/{sample}.sorted.bai",
+        bai="results/recal/{sample}.sorted.bam.bai",
     output:
-        "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf",
+        "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf",
     params:
         omit_isize="",
     log:
-        "logs/varlociraptor/preprocess/{pair}/{sample}.{caller}.{scatteritem}.log",
+        "logs/varlociraptor/preprocess/{cancer_sample}/{sample}.{caller}.{scatteritem}.log",
     conda:
         "../envs/varlociraptor.yaml"
     shell:
@@ -40,11 +40,11 @@ rule varlociraptor_preprocess:
 rule varlociraptor_call:
     input:
         obs=get_pair_observations,
-        scenario="results/scenarios/{pair}.yaml",
+        scenario="results/scenarios/{cancer_sample}.yaml",
     output:
-        temp("results/calls/{pair}.{caller}.{scatteritem}.bcf"),
+        temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf"),
     log:
-        "logs/varlociraptor/call/{pair}.{caller}.{scatteritem}.log",
+        "logs/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.log",
     params:
         obs=lambda w, input: [
             "{}={}".format(s, f) for s, f in zip(get_pair_aliases(w), input.obs)
@@ -52,7 +52,7 @@ rule varlociraptor_call:
     conda:
         "../envs/varlociraptor.yaml"
     benchmark:
-        "benchmarks/varlociraptor/call/{pair}.{caller}.{scatteritem}.tsv"
+        "benchmarks/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.tsv"
     shell:
         "varlociraptor "
         "call variants generic --obs {params.obs} "
@@ -61,11 +61,11 @@ rule varlociraptor_call:
 
 rule sort_calls:
     input:
-        "results/calls/{pair}.{caller}.{scatteritem}.bcf",
+        "results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf",
     output:
-        temp("results/calls/{pair}.{caller}.{scatteritem}.sorted.bcf"),
+        temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.sorted.bcf"),
     log:
-        "logs/bcf-sort/{pair}.{caller}.{scatteritem}.log",
+        "logs/bcf-sort/{cancer_sample}.{caller}.{scatteritem}.log",
     conda:
         "../envs/bcftools.yaml"
     resources:
@@ -80,9 +80,9 @@ rule bcftools_concat:
         calls=get_scattered_calls(),
         indexes=get_scattered_calls(ext=".bcf.csi"),
     output:
-        "results/calls/{pair}.{scatteritem}.bcf",
+        "results/calls/{cancer_sample}.{scatteritem}.bcf",
     log:
-        "logs/concat-calls/{pair}.{scatteritem}.log",
+        "logs/concat-calls/{cancer_sample}.{scatteritem}.log",
     params:
         "-a -Ob",  # TODO Check this
     wrapper:

From caafd0131e4bd46ca263a8c4fefbe332866621c9 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 19:57:43 +0100
Subject: [PATCH 004/191] exclude .github/workflows/* from .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8fe284d1..02a9591f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 *
+!.github/workflows/*
 !.test
 !.test/*
 !.test/config/*

From b0651b1bd56e9b5ed08c18aa24e34a460936cd50 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 19:58:28 +0100
Subject: [PATCH 005/191] update github actions workflow

---
 .github/workflows/main.yaml | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 503d4a5e..364e02a4 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -8,33 +8,42 @@ on:
 
 
 jobs:
+  Formatting:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Formatting
+        uses: github/super-linter@v4
+        env:
+          VALIDATE_ALL_CODEBASE: false
+          DEFAULT_BRANCH: master
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          VALIDATE_SNAKEMAKE_SNAKEFMT: true
+
+
   Linting:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v1
+    - uses: actions/checkout@v2
     - name: Lint workflow
-      uses: snakemake/snakemake-github-action@v1.17.0
+      uses: snakemake/snakemake-github-action@v1.22.0
       with:
         directory: .
         snakefile: workflow/Snakefile
         args: "--lint"
-        stagein: |
-          export TMPDIR=/tmp
 
 
   Testing:
     runs-on: ubuntu-latest
-    needs: Linting
+    needs:
+      - Linting
+      - Formatting
     steps:
-    - uses: actions/checkout@v1
-    - name: Checkout submodules
-      uses: textbook/git-checkout-submodule-action@2.0.0
+    - uses: actions/checkout@v2
 
     - name: Test workflow (local FASTQs)
-      uses: snakemake/snakemake-github-action@v1.17.0
+      uses: snakemake/snakemake-github-action@v1.22.0
       with:
         directory: .test
         snakefile: workflow/Snakefile
         args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba"
-        stagein: |
-          export TMPDIR=/tmp

From e65f344afd0d50e57644ba5476297b159be78301 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 20:05:09 +0100
Subject: [PATCH 006/191] snakefmt

---
 workflow/rules/calling.smk           |  6 ++++--
 workflow/rules/candidate_calling.smk |  4 +++-
 workflow/rules/common.smk            | 23 ++++++++++++++---------
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
index 70a228a4..a3889467 100644
--- a/workflow/rules/calling.smk
+++ b/workflow/rules/calling.smk
@@ -14,7 +14,8 @@ rule strelka_somatic:
         "logs/calling/strelka_somatic/{cancer_sample}.log",
     params:
         config_extra="--callRegions {} {}".format(
-            "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"]
+            "resources/genome.callregions.bed.gz",
+            config["params"]["strelka"]["config"],
         ),
         run_extra=config["params"]["strelka"]["run"],
     threads: 22
@@ -35,7 +36,8 @@ rule strelka_germline:
         "logs/calling/strelka_germline/{normal_sample}.log",
     params:
         config_extra="--callRegions {} {}".format(
-            "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"]
+            "resources/genome.callregions.bed.gz",
+            config["params"]["strelka"]["config"],
         ),
         run_extra="",
     threads: 22
diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk
index 2cdfbb2c..d6b6942d 100644
--- a/workflow/rules/candidate_calling.smk
+++ b/workflow/rules/candidate_calling.smk
@@ -19,7 +19,9 @@ rule scatter_candidates:
     input:
         "results/candidate-calls/{cancer_sample}.{caller}.bcf",
     output:
-        scatter.calling("results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf"),
+        scatter.calling(
+            "results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf"
+        ),
     log:
         "logs/scatter-candidates/{cancer_sample}.{caller}.log",
     conda:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 5b371175..54befabf 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -264,7 +264,9 @@ def get_oncoprint_batch(wildcards):
 def get_annotated_bcf(wildcards):
     selection = ".annotated"
     return "results/calls/{cancer_sample}.{scatteritem}{selection}.bcf".format(
-        cancer_sample=wildcards.cancer_sample, selection=selection, scatteritem=wildcards.scatteritem
+        cancer_sample=wildcards.cancer_sample,
+        selection=selection,
+        scatteritem=wildcards.scatteritem,
     )
 
 
@@ -330,10 +332,7 @@ def get_merge_input(ext=".bcf"):
 
 def get_pair_aliases(wildcards):
     return [
-        samples.loc[
-            get_normal(wildcards.cancer_sample),
-            "alias"
-        ],
+        samples.loc[get_normal(wildcards.cancer_sample), "alias"],
         samples.loc[wildcards.cancer_sample, "alias"],
     ]
 
@@ -394,8 +393,9 @@ def get_paired_bais(wildcards):
 
 def get_normal(sample_name):
     normal_sample = samples.loc[
-        (samples["group"] == samples.loc[sample_name, "group"]) & (samples["alias"] == "normal"),
-        "sample_name"
+        (samples["group"] == samples.loc[sample_name, "group"])
+        & (samples["alias"] == "normal"),
+        "sample_name",
     ].iat[0]
     return normal_sample
 
@@ -433,8 +433,13 @@ def get_alleles_MHCII(wildcards):
 
 
 def get_normal_bam(wildcards):
-    return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample))
+    return expand(
+        "results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample)
+    )
 
 
 def get_normal_bai(wildcards):
-    return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards.cancer_sample))
+    return expand(
+        "results/recal/{normal}.sorted.bam.bai",
+        normal=get_normal(wildcards.cancer_sample),
+    )

From c7b6897ef75cd72e53d1b27cc53f46cd5f2106d8 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 20:47:46 +0100
Subject: [PATCH 007/191] remove unused rule gzip_fastq that confuses adapter
 trimming mode

---
 workflow/rules/utils.smk | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk
index e7586ef0..65161772 100644
--- a/workflow/rules/utils.smk
+++ b/workflow/rules/utils.smk
@@ -34,17 +34,6 @@ rule tabix_known_variants:
         "0.59.2/bio/tabix"
 
 
-rule gzip_fastq:
-    input:
-        "{prefix}.fastq",
-    output:
-        "{prefix}.fastq.gz",
-    log:
-        "logs/gz-fastq/{prefix}.log",
-    shell:
-        "gzip < {input} > {output}"
-
-
 rule tsv_to_excel:
     input:
         tsv="results/{x}.tsv",

From 688c027c2f69882b6cb941a59ecd04dd0480d037 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 20:48:07 +0100
Subject: [PATCH 008/191] fix trimming rules syntax

---
 workflow/rules/common.smk | 30 +++++++++++++++++++++++-------
 workflow/rules/trim.smk   |  3 ---
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 54befabf..273704e3 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -53,6 +53,8 @@ wildcard_constraints:
     caller="|".join(["freebayes", "delly"]),
     peptide_type="|".join(["normal", "neo"]),
     event="|".join(["somatic", "germline", "complete"]),
+    read="|".join(["single", "R1", "R2"]),
+    seqtype="|".join(["DNA", "RNA"]),
 
 
 ### Output generation ###
@@ -132,28 +134,38 @@ caller = list(
 
 
 def get_cutadapt_input(wildcards):
-    unit = units.loc[wildcards.sample].loc[wildcards.unit].loc[wildcards.seqtype]
+    unit = units.loc[
+        (units["sample_name"] == wildcards.sample)
+        & (units["unit_name"] == wildcards.unit)
+        & (units["sequencing_type"] == wildcards.seqtype)
+    ]
 
-    if pd.isna(unit["fq1"]):
+    if pd.isna(unit["fq1"].iat[0]):
         # SRA sample (always paired-end for now)
         accession = unit["sra"]
         return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2])
 
-    if unit["fq1"].endswith("gz"):
+    if unit["fq1"].iat[0].endswith("gz"):
         ending = ".gz"
     else:
         ending = ""
 
-    if pd.isna(unit["fq2"]):
+    if pd.isna(unit["fq2"].iat[0]):
         # single end local sample
         return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format(
-            S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending
+            S=unit["sample_name"].iat[0],
+            U=unit["unit_name"].iat[0],
+            T=unit["sequencing_type"].iat[0],
+            E=ending,
         )
     else:
         # paired end local sample
         return expand(
             "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format(
-                S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending
+                S=unit["sample_name"].iat[0],
+                U=unit["unit_name"].iat[0],
+                T=unit["sequencing_type"].iat[0],
+                E=ending,
             ),
             read=["fq1", "fq2"],
         )
@@ -189,7 +201,11 @@ def get_fastqs(wc):
     if config["trimming"]["activate"]:
         return expand(
             "results/trimmed/{sample}/{seqtype}/{unit}_{read}.fastq.gz",
-            unit=units.loc[wc.seqtype].loc[wc.sample, "unit_name"],
+            unit=units.loc[
+                (units["sequencing_type"] == wc.seqtype)
+                & (units["sample_name"] == wc.sample),
+                "unit_name",
+            ],
             sample=wc.sample,
             read=wc.read,
             seqtype=wc.seqtype,
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
index f26ed6f3..cf09cba1 100644
--- a/workflow/rules/trim.smk
+++ b/workflow/rules/trim.smk
@@ -64,8 +64,5 @@ rule merge_fastqs:
         "results/merged/{seqtype}/{sample}_{read}.fastq.gz",
     log:
         "logs/merge-fastqs/{seqtype}_{sample}_{read}.log",
-    wildcard_constraints:
-        read="single|R1|R2",
-        seqtype="DNA|RNA",
     shell:
         "cat {input} > {output} 2> {log}"

From f228219c91a97b980634aa8ca6731efa9b94f4e6 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 20:49:12 +0100
Subject: [PATCH 009/191] turn EVERYTHING on in the GitHub Actions test
 workflow and run...

---
 .test/config/config.yaml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index d10fc559..85e69827 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -3,14 +3,14 @@ units: "config/units.tsv"
 
 # boolean if read trimming should be skipped
 trimming:
-  activate: false
+  activate: true
 
 remove_duplicates:
   activate: true
 
 calling:
   freebayes:
-    activate: false
+    activate: true
   # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling
   scenario: config/scenario.yaml
   filter:
@@ -40,14 +40,14 @@ calling:
 
 fusion:
   arriba:
-    activate: false
+    activate: true
     blacklist:
      "arriba_blacklist"
     params:
       "-T -P"
 
 tmb:
-  activate: false
+  activate: true
   coding_genome_size: 3e7
   # Name of the tumor sample in the scenario.yaml.
   tumor_sample: tumor
@@ -56,16 +56,16 @@ tmb:
 
 
 epitope_prediction:
-  activate: false
+  activate: true
 
 
 affinity:
   netMHCpan:
-    activate: false
+    activate: true
     params: "-BA -l 9 -s -xls"
     location: "../netMHCpan-4.0"
   netMHCIIpan:
-    activate: false
+    activate: true
     params: "-length 15 -s -xls"
     location: "../netMHCIIpan-4.0"
 
@@ -73,11 +73,11 @@ affinity:
 HLAtyping:
   # activate to use razers3 to pre-filter reads before using optitype
   optitype_prefiltering:
-    activate: false
+    activate: true
   optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
   # activate to predict MHC-I and MHC-II alleles with HLA-LA
   HLA_LA:
-    activate: false
+    activate: true
 
 
 ref:

From ed496c4a352991d499c7c52d99182c099238a0ef Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 13 Jan 2022 22:13:09 +0100
Subject: [PATCH 010/191] add missing conda env for rule get_callregions

---
 workflow/envs/htslib.yaml | 5 +++++
 workflow/rules/ref.smk    | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 workflow/envs/htslib.yaml

diff --git a/workflow/envs/htslib.yaml b/workflow/envs/htslib.yaml
new file mode 100644
index 00000000..b5d7959d
--- /dev/null
+++ b/workflow/envs/htslib.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - htslib =1.14
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 86b209c3..7e3a3d18 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -120,7 +120,7 @@ rule get_callregions:
     params:
         n_contigs=config["ref"]["n_chromosomes"],
     conda:
-        "../envs/index.yaml"
+        "../envs/htslib.yaml"
     shell:
         "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})"
         " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}"

From 530ef666c3e86c129b32f82939ed55801259e73e Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 14 Jan 2022 18:03:43 +0100
Subject: [PATCH 011/191] update cutadapt wrappers and try fixing adapter
 handling

---
 workflow/rules/common.smk | 12 +++++++++++-
 workflow/rules/trim.smk   | 14 ++++++--------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 273704e3..bc922529 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -30,7 +30,7 @@ units = (
     pd.read_csv(
         config["units"],
         sep="\t",
-        dtype={"sample_name": str, "sequencing_type": str, "unit_name": str},
+        dtype={"sample_name": str, "sequencing_type": str, "unit_name": str, "adapters": str},
         comment="#",
     )
     .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False)
@@ -181,6 +181,16 @@ def get_cutadapt_pipe_input(wildcards):
     assert len(files) > 0, "no files found at {}".format(pattern)
     return files
 
+def get_cutadapt_adapters(wildcards):
+    unit = units.loc[wildcards.sample].loc[wildcards.unit]
+    try:
+        adapters = unit["adapters"]
+        if isinstance(adapters, str):
+            return adapters
+        return ""
+    except KeyError:
+        return ""
+
 
 def is_paired_end(sample, seqtype):
     sample_units = units.loc[sample].loc[seqtype]
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
index cf09cba1..b0145449 100644
--- a/workflow/rules/trim.smk
+++ b/workflow/rules/trim.smk
@@ -32,13 +32,11 @@ rule cutadapt_pe:
     log:
         "logs/cutadapt/{sample}-{seqtype}-{unit}.log",
     params:
-        others=config["params"]["cutadapt"],
-        adapters=lambda w: str(
-            units.loc[w.sample].loc[w.seqtype].loc[w.unit, "adapters"]
-        ),
+        extra=config["params"]["cutadapt"],
+        adapters=get_cutadapt_adapters,
     threads: 8
     wrapper:
-        "0.59.2/bio/cutadapt/pe"
+        "0.85.1/bio/cutadapt/pe"
 
 
 rule cutadapt_se:
@@ -50,11 +48,11 @@ rule cutadapt_se:
     log:
         "logs/cutadapt/{sample}-{seqtype}-{unit}.log",
     params:
-        others=config["params"]["cutadapt"],
-        adapters_r1=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]),
+        extra=config["params"]["cutadapt"],
+        adapters_r1=get_cutadapt_adapters,
     threads: 8
     wrapper:
-        "0.59.2/bio/cutadapt/se"
+        "0.85.1/bio/cutadapt/se"
 
 
 rule merge_fastqs:

From 8c5d952d9fe63585a923210c3ae164902550a7b1 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 14 Jan 2022 18:08:27 +0100
Subject: [PATCH 012/191] snakefmt

---
 workflow/rules/common.smk | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index bc922529..14c7e6be 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -30,7 +30,12 @@ units = (
     pd.read_csv(
         config["units"],
         sep="\t",
-        dtype={"sample_name": str, "sequencing_type": str, "unit_name": str, "adapters": str},
+        dtype={
+            "sample_name": str,
+            "sequencing_type": str,
+            "unit_name": str,
+            "adapters": str,
+        },
         comment="#",
     )
     .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False)
@@ -181,6 +186,7 @@ def get_cutadapt_pipe_input(wildcards):
     assert len(files) > 0, "no files found at {}".format(pattern)
     return files
 
+
 def get_cutadapt_adapters(wildcards):
     unit = units.loc[wildcards.sample].loc[wildcards.unit]
     try:

From 48300a4448a5220e80668e05594e75e663a4666c Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 14 Jan 2022 18:31:02 +0100
Subject: [PATCH 013/191] fix cutadapt wrappers' version number

---
 workflow/rules/trim.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
index b0145449..b5573c61 100644
--- a/workflow/rules/trim.smk
+++ b/workflow/rules/trim.smk
@@ -36,7 +36,7 @@ rule cutadapt_pe:
         adapters=get_cutadapt_adapters,
     threads: 8
     wrapper:
-        "0.85.1/bio/cutadapt/pe"
+        "v0.85.1/bio/cutadapt/pe"
 
 
 rule cutadapt_se:
@@ -52,7 +52,7 @@ rule cutadapt_se:
         adapters_r1=get_cutadapt_adapters,
     threads: 8
     wrapper:
-        "0.85.1/bio/cutadapt/se"
+        "v0.85.1/bio/cutadapt/se"
 
 
 rule merge_fastqs:

From 7800664cc3a4d5e030d59ed0355ced86113eb5d0 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 14 Jan 2022 20:21:25 +0100
Subject: [PATCH 014/191] deactivate HLA_LA in test workflow, as it includes
 huge graph download that times out the workflow

---
 .test/config/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 85e69827..839a126c 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -77,7 +77,7 @@ HLAtyping:
   optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
   # activate to predict MHC-I and MHC-II alleles with HLA-LA
   HLA_LA:
-    activate: true
+    activate: false
 
 
 ref:

From 5d35a0979d81f885bf36c0190ccf04e3afa1cabc Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 14 Jan 2022 21:36:31 +0100
Subject: [PATCH 015/191] also turn off netMHCIIpan to avoid HLA-LA download

---
 .test/config/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 839a126c..4add6179 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -65,7 +65,7 @@ affinity:
     params: "-BA -l 9 -s -xls"
     location: "../netMHCpan-4.0"
   netMHCIIpan:
-    activate: true
+    activate: false
     params: "-length 15 -s -xls"
     location: "../netMHCIIpan-4.0"
 

From 680d976969d62a164169eca32dd1594eef34529e Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Tue, 18 Jan 2022 19:44:53 +0100
Subject: [PATCH 016/191] try dna-seq-varlociraptor workflow formulations of
 cutadapt_input function definitions

---
 workflow/rules/common.smk | 43 ++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 14c7e6be..838f6641 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -139,37 +139,33 @@ caller = list(
 
 
 def get_cutadapt_input(wildcards):
-    unit = units.loc[
-        (units["sample_name"] == wildcards.sample)
-        & (units["unit_name"] == wildcards.unit)
-        & (units["sequencing_type"] == wildcards.seqtype)
-    ]
+    unit = units.loc[wildcards.sample].loc[wildcards.seqtype].loc[wildcards.unit]
 
-    if pd.isna(unit["fq1"].iat[0]):
+    if pd.isna(unit["fq1"]):
         # SRA sample (always paired-end for now)
         accession = unit["sra"]
         return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2])
 
-    if unit["fq1"].iat[0].endswith("gz"):
+    if unit["fq1"].endswith("gz"):
         ending = ".gz"
     else:
         ending = ""
 
-    if pd.isna(unit["fq2"].iat[0]):
+    if pd.isna(unit["fq2"]):
         # single end local sample
         return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format(
-            S=unit["sample_name"].iat[0],
-            U=unit["unit_name"].iat[0],
-            T=unit["sequencing_type"].iat[0],
+            S=unit.sample_name,
+            U=unit.unit_name,
+            T=unit.sequencing_type,
             E=ending,
         )
     else:
         # paired end local sample
         return expand(
             "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format(
-                S=unit["sample_name"].iat[0],
-                U=unit["unit_name"].iat[0],
-                T=unit["sequencing_type"].iat[0],
+                S=unit.sample_name,
+                U=unit.unit_name,
+                T=unit.sequencing_type,
                 E=ending,
             ),
             read=["fq1", "fq2"],
@@ -182,8 +178,23 @@ def get_cutadapt_pipe_input(wildcards):
         .loc[wildcards.seqtype]
         .loc[wildcards.unit, wildcards.fq]
     )
-    files = list(sorted(glob.glob(pattern)))
-    assert len(files) > 0, "no files found at {}".format(pattern)
+    if "*" in pattern:
+        files = sorted(
+            glob.glob(
+                units.loc[wildcards.sample]
+                .loc[wildcards.seqtype]
+                .loc[wildcards.unit, wildcards.fq]
+            )
+        )
+        if not files:
+            raise ValueError(
+                "No raw fastq files found for unit pattern {} (sample {}, sequencing type {}). "
+                "Please check the your sample sheet.".format(
+                    wildcards.unit, wildcards.sample, wildcards.seqtype
+                )
+            )
+    else:
+        files = [pattern]
     return files
 
 

From cde64d8cd42bf5e51b3257437a96e1f9fc4eaac5 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Tue, 18 Jan 2022 19:45:16 +0100
Subject: [PATCH 017/191] update cutadapt rules to very latest wrapper release

---
 workflow/rules/trim.smk | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
index b5573c61..0ada209d 100644
--- a/workflow/rules/trim.smk
+++ b/workflow/rules/trim.smk
@@ -36,7 +36,7 @@ rule cutadapt_pe:
         adapters=get_cutadapt_adapters,
     threads: 8
     wrapper:
-        "v0.85.1/bio/cutadapt/pe"
+        "v0.86.0/bio/cutadapt/pe"
 
 
 rule cutadapt_se:
@@ -49,10 +49,10 @@ rule cutadapt_se:
         "logs/cutadapt/{sample}-{seqtype}-{unit}.log",
     params:
         extra=config["params"]["cutadapt"],
-        adapters_r1=get_cutadapt_adapters,
+        adapters=get_cutadapt_adapters,
     threads: 8
     wrapper:
-        "v0.85.1/bio/cutadapt/se"
+        "v0.86.0/bio/cutadapt/se"
 
 
 rule merge_fastqs:

From c390783e0ae2a7abe58ba99017423d47a28a6144 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Wed, 19 Jan 2022 17:51:01 +0100
Subject: [PATCH 018/191] fix render_scenario after changes to wildcards,
 samples.tsv and units.tsv

---
 workflow/scripts/render-scenario.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/render-scenario.py b/workflow/scripts/render-scenario.py
index 692b0ee5..8a1d06eb 100644
--- a/workflow/scripts/render-scenario.py
+++ b/workflow/scripts/render-scenario.py
@@ -3,6 +3,7 @@
 
 with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out:
     samples = snakemake.params.samples
+    group = samples.loc[samples["sample_name"] == snakemake.wildcards.cancer_sample, "group"]
     out.write(Template(template.read()).render(
-        samples=samples[(samples["sample"] == snakemake.wildcards.pair) | (samples["sample"] == samples.loc[snakemake.wildcards.pair, "matched_normal"])]
+        samples=samples[samples["group"] == group]
     ))

From ffd5547a0a336cfe9221948f68f7891893a156e4 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Wed, 19 Jan 2022 17:52:35 +0100
Subject: [PATCH 019/191] for now, deactivate stuff not needed for
 dna-seq-varlociraptor compatibility

---
 .test/config/config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 4add6179..e432f441 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -3,14 +3,14 @@ units: "config/units.tsv"
 
 # boolean if read trimming should be skipped
 trimming:
-  activate: true
+  activate: false
 
 remove_duplicates:
-  activate: true
+  activate: false
 
 calling:
   freebayes:
-    activate: true
+    activate: false
   # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling
   scenario: config/scenario.yaml
   filter:

From beba26e39ab413464094ecce5634ac6d8cfa3e28 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Wed, 19 Jan 2022 18:33:59 +0100
Subject: [PATCH 020/191] activate freebayes in .test to try to get tests to
 run further

---
 .test/config/config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index e432f441..1d5f7b1b 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -10,7 +10,7 @@ remove_duplicates:
 
 calling:
   freebayes:
-    activate: false
+    activate: true
   # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling
   scenario: config/scenario.yaml
   filter:

From 74e7c359f5d355dea66c67be6ea2555afd9ee006 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 14:44:22 +0100
Subject: [PATCH 021/191] make razers3 rule wildcards compatible with
 dna-seq-varlociraptor workflow to get entry point to work seamlessly

---
 workflow/rules/HLAtyping.smk | 8 ++++----
 workflow/rules/common.smk    | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 682808ef..c519a48f 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -39,9 +39,9 @@ rule parse_HLA_LA:
 
 rule razers3:
     input:
-        reads="results/merged/DNA/{sample}_{fq}.fastq.gz",
+        reads="results/merged/{sample}_{read}.fastq.gz"
     output:
-        bam="results/razers3/bam/{sample}_{fq}.bam",
+        bam="results/razers3/bam/{sample}_{read}.bam",
     threads: 8
     log:
         "logs/razers3/{sample}_{fq}.log",
@@ -54,9 +54,9 @@ rule razers3:
 
 rule bam2fq:
     input:
-        "results/razers3/bam/{sample}_{fq}.bam",
+        "results/razers3/bam/{sample}_{read}.bam",
     output:
-        "results/razers3/fastq/{sample}_{fq}.fished.fastq",
+        "results/razers3/fastq/{sample}_{read}.fished.fastq",
     params:
         "",
     log:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 838f6641..eb59c416 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -279,7 +279,7 @@ def get_optitype_reads_input(wildcards):
     if is_activated("HLAtyping/optitype_prefiltering"):
         if is_paired_end(wildcards.sample, "DNA"):
             return expand(
-                "results/razers3/fastq/{sample}_{fq}.fished.fastq",
+                "results/razers3/fastq/{sample}_{read}.fished.fastq",
                 sample=wildcards.sample,
                 fq=["R1", "R2"],
             )

From d04e2422a1752037bdcc003d9ec32f46e0c8b4a4 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 14:49:22 +0100
Subject: [PATCH 022/191] fix formatting and linting error in rule razers3

---
 workflow/rules/HLAtyping.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index c519a48f..46b16f1e 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -39,12 +39,12 @@ rule parse_HLA_LA:
 
 rule razers3:
     input:
-        reads="results/merged/{sample}_{read}.fastq.gz"
+        reads="results/merged/{sample}_{read}.fastq.gz",
     output:
         bam="results/razers3/bam/{sample}_{read}.bam",
     threads: 8
     log:
-        "logs/razers3/{sample}_{fq}.log",
+        "logs/razers3/{sample}_{read}.log",
     params:
         genome=config["HLAtyping"]["optitype_data"],
         extra=config["params"]["razers3"],

From 2cd85a1473fc814755c1af166a23f59e1ea15eeb Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 15:53:01 +0100
Subject: [PATCH 023/191] also fix rule bam2fq wildcards

---
 workflow/rules/HLAtyping.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 46b16f1e..26f84454 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -60,7 +60,7 @@ rule bam2fq:
     params:
         "",
     log:
-        "logs/razers3-bam2fq/{sample}-{fq}.log",
+        "logs/razers3-bam2fq/{sample}-{read}.log",
     threads: 1
     wrapper:
         "0.61.0/bio/samtools/bam2fq/interleaved"

From 020e44ba9905949a3f65940b62381cefcd108687 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 16:13:16 +0100
Subject: [PATCH 024/191] fix last occurrence of fq wildcard

---
 workflow/rules/common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index eb59c416..aedcb7f8 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -281,7 +281,7 @@ def get_optitype_reads_input(wildcards):
             return expand(
                 "results/razers3/fastq/{sample}_{read}.fished.fastq",
                 sample=wildcards.sample,
-                fq=["R1", "R2"],
+                read=["R1", "R2"],
             )
         return "results/razers3/fastq/{sample}_single.fastq"
     else:

From 2fe0c7ee61177985aa57213fa9e35cfad28ab00a Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 16:34:20 +0100
Subject: [PATCH 025/191] fix rule razers3 input path

---
 workflow/rules/HLAtyping.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 26f84454..e68cd89b 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -39,7 +39,7 @@ rule parse_HLA_LA:
 
 rule razers3:
     input:
-        reads="results/merged/{sample}_{read}.fastq.gz",
+        reads="results/merged/DNA/{sample}_{read}.fastq.gz",
     output:
         bam="results/razers3/bam/{sample}_{read}.bam",
     threads: 8

From 0771ccaee8c781f21f85238ac42b6479afdc2bfb Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 16:35:28 +0100
Subject: [PATCH 026/191] clearly distinguish cancer_samples and normal_samples
 via globally restricted wildcards

---
 workflow/rules/MHC_binding.smk | 36 ++++++++++-----------
 workflow/rules/common.smk      | 13 ++++----
 workflow/rules/microphaser.smk | 58 +++++++++++++++++-----------------
 3 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index e43f248d..50185ca4 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -18,12 +18,12 @@
 
 rule netMHCpan:
     input:
-        peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCpan/{cancer_sample}.{chr}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls",
+        "results/netMHCpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls",
     log:
-        "logs/netMHCpan/{sample}-{chr}-{peptide_type}.log",
+        "logs/netMHCpan/{cancer_sample}-{chr}-{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
@@ -35,12 +35,12 @@ rule netMHCpan:
 
 rule netMHCIIpan:
     input:
-        peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCIIpan/{cancer_sample}.{chr}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls",
+        "results/netMHCIIpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls",
     log:
-        "logs/netMHCIIpan/{sample}-{chr}-{peptide_type}.log",
+        "logs/netMHCIIpan/{cancer_sample}-{chr}-{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
@@ -53,13 +53,13 @@ rule netMHCIIpan:
 rule parse_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{peptide_type}}.xls",
+            "results/{{mhc}}/{{cancer_sample}}/{chr}/{{cancer_sample}}.{chr}.{{peptide_type}}.xls",
             chr=contigs,
         ),
     output:
-        "results/{mhc}/{sample}/{sample}.mhc.{peptide_type}.tsv",
+        "results/{mhc}/{cancer_sample}/{sample}.mhc.{peptide_type}.tsv",
     log:
-        "logs/parse-mhc/{mhc}-{sample}-{peptide_type}.log",
+        "logs/parse-mhc/{mhc}-{cancer_sample}-{peptide_type}.log",
     wildcard_constraints:
         group="wt|mt",
     script:
@@ -83,17 +83,17 @@ rule parse_mhc_out:
 
 rule mhc_csv_table:
     input:
-        info="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv",
-        neo="results/{mhc}/{sample}/{sample}.mhc.neo.tsv",
-        normal="results/{mhc}/{sample}/{sample}.mhc.normal.tsv",
+        info="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv",
+        neo="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.neo.tsv",
+        normal="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.normal.tsv",
     output:
         report(
-            "results/neoantigens/{mhc}/{sample}.DNA.tsv",
+            "results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv",
             caption="../report/WES_results.rst",
             category="Results WES (netMHC)",
         ),
     log:
-        "logs/create-mhc-table/{mhc}-{sample}.log",
+        "logs/create-mhc-table/{mhc}-{cancer_sample}.log",
     script:
         "../scripts/merge_data.py"
 
@@ -111,17 +111,17 @@ rule mhc_csv_table:
 
 rule add_RNA_info:
     input:
-        counts="results/kallisto/{sample}",
-        table="results/neoantigens/{mhc}/{sample}.DNA.tsv",
+        counts="results/kallisto/{cancer_sample}",
+        table="results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv",
     output:
         report(
-            "results/neoantigens/{mhc}/{sample}.RNA.tsv",
+            "results/neoantigens/{mhc}/{cancer_sample}.RNA.tsv",
             caption="../report/RNA_results.rst",
             category="Results RNA",
         ),
     params:
         abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
     log:
-        "logs/add-RNA/{mhc}-{sample}.log",
+        "logs/add-RNA/{mhc}-{cancer_sample}.log",
     script:
         "../scripts/add_rna_info.py"
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index aedcb7f8..d1b2c109 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -51,6 +51,7 @@ contigs.extend(["X", "Y"])
 
 wildcard_constraints:
     cancer_sample="|".join(samples[samples.alias != "normal"]["sample_name"]),
+    normal_sample="|".join(samples[samples.alias == "normal"]["sample_name"]),
     sample="|".join(samples["sample_name"]),
     unit="|".join(units["unit_name"]),
     alias="|".join(pd.unique(samples["alias"])),
@@ -453,8 +454,8 @@ def get_seperate(sample, group):
 
 def get_proteome(wildcards):
     return expand(
-        "results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin",
-        normal=get_normal(wildcards.sample),
+        "results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin",
+        normal_sample=get_normal(wildcards.cancer_sample),
         mhc=wildcards.mhc,
     )
 
@@ -462,17 +463,17 @@ def get_proteome(wildcards):
 def get_alleles_MHCI(wildcards):
     if wildcards.peptide_type == "wt":
         return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
-            S=get_normal(wildcards.sample)
+            S=get_normal(wildcards.cancer_sample)
         )
     else:
-        return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.sample)
+        return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.cancer_sample)
 
 
 def get_alleles_MHCII(wildcards):
     if wildcards.peptide_type == "wt":
-        return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.sample))
+        return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.cancer_sample))
     else:
-        return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.sample)
+        return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.cancer_sample)
 
 
 def get_normal_bam(wildcards):
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 582ac7c1..caef83cf 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -1,16 +1,16 @@
 rule microphaser_somatic:
     input:
-        vcf="results/strelka/merged/{sample}/all_variants.norm.annotated.bcf",
-        bam="results/recal/{sample}.sorted.bam",
-        bai="results/recal/{sample}.sorted.bam.bai",
+        vcf="results/strelka/merged/{cancer_sample}/all_variants.norm.annotated.bcf",
+        bam="results/recal/{cancer_sample}.sorted.bam",
+        bai="results/recal/{cancer_sample}.sorted.bam.bai",
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.neo.fa",
-        wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.normal.fa",
-        tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv",
+        mt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.neo.fa",
+        wt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.normal.fa",
+        tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv",
     log:
-        "logs/microphaser/somatic/{sample}-{contig}.log",
+        "logs/microphaser/somatic/{cancer_sample}-{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -22,20 +22,20 @@ rule microphaser_somatic:
 
 rule microphaser_germline:
     input:
-        vcf="results/strelka/germline/{normal}/results/variants/variants.reheader.norm.bcf",
-        bam="results/recal/{normal}.sorted.bam",
-        bai="results/recal/{normal}.sorted.bam.bai",
+        vcf="results/strelka/germline/{normal_sample}/results/variants/variants.reheader.norm.bcf",
+        bam="results/recal/{normal_sample}.sorted.bam",
+        bai="results/recal/{normal_sample}.sorted.bam.bai",
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
         wt_fasta=(
-            "results/microphaser/fasta/germline/{normal}/{normal}.germline.{contig}.fa"
+            "results/microphaser/fasta/germline/{normal_sample}/{normal_sample}.germline.{contig}.fa"
         ),
         wt_tsv=(
-            "results/microphaser/info/germline/{normal}/{normal}.germline.{contig}.tsv"
+            "results/microphaser/info/germline/{normal_sample}/{normal_sample}.germline.{contig}.tsv"
         ),
     log:
-        "logs/microphaser/germline/{normal}-{contig}.log",
+        "logs/microphaser/germline/{normal_sample}-{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -48,25 +48,25 @@ rule microphaser_germline:
 rule concat_proteome:
     input:
         expand(
-            "results/microphaser/fasta/germline/{{normal}}/{{normal}}.germline.{contig}.fa",
+            "results/microphaser/fasta/germline/{{normal_sample}}/{{normal_sample}}.germline.{contig}.fa",
             contig=contigs,
         ),
     output:
-        "results/microphaser/fasta/germline/{normal}/reference_proteome.fa",
+        "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa",
     log:
-        "logs/microphaser/concat-ref-proteome/{normal}.log",
+        "logs/microphaser/concat-ref-proteome/{normal_sample}.log",
     shell:
         "cat {input} > {output} 2> {log}"
 
 
 rule build_germline_proteome:
     input:
-        "results/microphaser/fasta/germline/{normal}/reference_proteome.fa",
+        "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa",
     output:
-        bin="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin",
-        fasta="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.peptides.fasta",
+        bin="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin",
+        fasta="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.peptides.fasta",
     log:
-        "logs/microphaser/build-ref-proteome-db/{normal}-{mhc}.log",
+        "logs/microphaser/build-ref-proteome-db/{normal_sample}-{mhc}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -79,19 +79,19 @@ rule build_germline_proteome:
 
 rule microphaser_filter:
     input:
-        tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv",
+        tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv",
         proteome=get_proteome,
     output:
         mt_fasta=(
-            "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.neo.fa"
+            "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.neo.fa"
         ),
         wt_fasta=(
-            "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.normal.fa"
+            "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.normal.fa"
         ),
-        tsv="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.{contig}.tsv",
-        removed="results/microphaser/info/{sample}/removed/{mhc}/{sample}.{contig}.removed.tsv",
+        tsv="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.tsv",
+        removed="results/microphaser/info/{cancer_sample}/removed/{mhc}/{cancer_sample}.{contig}.removed.tsv",
     log:
-        "logs/microphaser/filter/{sample}-{mhc}-{contig}.log",
+        "logs/microphaser/filter/{cancer_sample}-{mhc}-{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -105,13 +105,13 @@ rule microphaser_filter:
 rule concat_tsvs:
     input:
         expand(
-            "results/microphaser/info/{{sample}}/filtered/{{mhc}}/{{sample}}.{contig}.tsv",
+            "results/microphaser/info/{{cancer_sample}}/filtered/{{mhc}}/{{cancer_sample}}.{contig}.tsv",
             contig=contigs,
         ),
     output:
-        "results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv",
+        "results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv",
     log:
-        "logs/concat-tsv/{sample}-{mhc}.log",
+        "logs/concat-tsv/{cancer_sample}-{mhc}.log",
     conda:
         "../envs/xsv.yaml"
     shell:

From 16631009f2ff7fc5d3464ae6a0de6b53aebec5a4 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 16:38:28 +0100
Subject: [PATCH 027/191] fix missing sample -> cancer_sample refactoring

---
 workflow/rules/MHC_binding.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 50185ca4..ef80b8fb 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -57,7 +57,7 @@ rule parse_mhc_out:
             chr=contigs,
         ),
     output:
-        "results/{mhc}/{cancer_sample}/{sample}.mhc.{peptide_type}.tsv",
+        "results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.{peptide_type}.tsv",
     log:
         "logs/parse-mhc/{mhc}-{cancer_sample}-{peptide_type}.log",
     wildcard_constraints:

From c6606427a57232d2d1d34f89d065f06b41286b10 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Fri, 21 Jan 2022 16:39:29 +0100
Subject: [PATCH 028/191] snakefmt

---
 workflow/rules/common.smk | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index d1b2c109..33fd5918 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -466,12 +466,16 @@ def get_alleles_MHCI(wildcards):
             S=get_normal(wildcards.cancer_sample)
         )
     else:
-        return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.cancer_sample)
+        return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
+            S=wildcards.cancer_sample
+        )
 
 
 def get_alleles_MHCII(wildcards):
     if wildcards.peptide_type == "wt":
-        return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.cancer_sample))
+        return "results/HLA-LA/hlaI_{S}.tsv".format(
+            S=get_normal(wildcards.cancer_sample)
+        )
     else:
         return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.cancer_sample)
 

From 82bb46986ea5b0e4bf92849994881225234f4527 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Wed, 16 Mar 2022 14:07:55 +0100
Subject: [PATCH 029/191] remove (download) and fix (index) HLALA caching

---
 workflow/rules/ref.smk | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 7e3a3d18..4443311a 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -183,7 +183,6 @@ rule download_HLALA_graph:
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
     log:
         "logs/download-HLA-LA-graph.log",
-    cache: True
     shell:
         "cd resources/graphs && wget  http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
         "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz"
@@ -193,8 +192,11 @@ rule index_HLALA:
     input:
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
     output:
-        "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
-        "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathindex",
+        multiext(
+            "resources/graphs/PRG_MHC_GRCh38_withIMGT/",
+            "serializedGRAPH",
+            "serializedGRAPH_preGapPathindex"
+        )
     cache: True
     conda:
         "../envs/hla_la.yaml"

From 5524f1d73abc5d8878ebeb88f6cd6332918ffaed Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Thu, 31 Mar 2022 13:48:18 +0200
Subject: [PATCH 030/191] snakefmt

---
 workflow/rules/ref.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 4443311a..64cd6e06 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -195,8 +195,8 @@ rule index_HLALA:
         multiext(
             "resources/graphs/PRG_MHC_GRCh38_withIMGT/",
             "serializedGRAPH",
-            "serializedGRAPH_preGapPathindex"
-        )
+            "serializedGRAPH_preGapPathindex",
+        ),
     cache: True
     conda:
         "../envs/hla_la.yaml"

From 08dab98f999b7b321f5c9e64cf295cbc4f7ba359 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Wed, 6 Apr 2022 14:50:51 +0200
Subject: [PATCH 031/191] WIP: switch to a proper focus on groups of samples
 for one individual, not yet tested

---
 .test/config/config.yaml       |   3 +
 workflow/rules/MHC_binding.smk |  40 ++++++-------
 workflow/rules/calling.smk     |  71 ++++++++++-------------
 workflow/rules/common.smk      | 103 ++++++++++++++++++++-------------
 workflow/rules/microphaser.smk |  68 +++++++++++-----------
 5 files changed, 150 insertions(+), 135 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 1d5f7b1b..78bf043b 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -127,6 +127,9 @@ params:
           9
       netMHCIIpan:
           15
+    events:
+      tumor: "strelka_somatic"
+      normal: "strelka_germline"
   kallisto:
     "-b 100"
   star: >-
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index ef80b8fb..893d403f 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -1,12 +1,12 @@
 # rule mhcflurry:
 #     input:
-#         peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{peptide_type}.fa",
+#         peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{contig}.{peptide_type}.fa",
 #         alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv",
 #         wt_alleles=get_germline_optitype
 #     output:
-#         "results/mhcflurry/{sample}/{chr}/output.{peptide_type}.csv"
+#         "results/mhcflurry/{sample}/{contig}/output.{peptide_type}.csv"
 #     log:
-#         "logs/mhcflurry/{sample}-{chr}-{peptide_type}.log"
+#         "logs/mhcflurry/{sample}-{contig}-{peptide_type}.log"
 #     run:
 #         if "wt" in input.peptides:
 #             alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0])
@@ -18,12 +18,12 @@
 
 rule netMHCpan:
     input:
-        peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCpan/{cancer_sample}.{chr}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa"
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls",
+        "results/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCpan/{cancer_sample}-{chr}-{peptide_type}.log",
+        "logs/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
@@ -35,12 +35,12 @@ rule netMHCpan:
 
 rule netMHCIIpan:
     input:
-        peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCIIpan/{cancer_sample}.{chr}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa"
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls",
+        "results/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCIIpan/{cancer_sample}-{chr}-{peptide_type}.log",
+        "logs/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
@@ -53,22 +53,20 @@ rule netMHCIIpan:
 rule parse_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{cancer_sample}}/{chr}/{{cancer_sample}}.{chr}.{{peptide_type}}.xls",
-            chr=contigs,
+            "results/{{mhc}}/{{group}}/{{tumor_event}}.{contig}.{{peptide_type}}.xls",
+            contig=contigs,
         ),
     output:
-        "results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.{peptide_type}.tsv",
+        "results/{mhc}/{group}.{tumor_event}.mhc.{peptide_type}.tsv",
     log:
-        "logs/parse-mhc/{mhc}-{cancer_sample}-{peptide_type}.log",
-    wildcard_constraints:
-        group="wt|mt",
+        "logs/parse_mhc_out/{mhc}/{group}.{tumor_event}.{peptide_type}.log",
     script:
         "../scripts/group_mhc_output.py"
 
 
 # rule parse_mhcflurry:
 #     input:
-#         expand("results/mhcflurry/{{sample}}/{chr}/output.{{peptide_type}}.csv", chr=contigs)
+#         expand("results/mhcflurry/{{sample}}/{contig}/output.{{peptide_type}}.csv", contig=contigs)
 #     output:
 #         "results/mhcflurry/{sample}/{sample}.mhc.{peptide_type}.csv"
 #     wildcard_constraints:
@@ -83,17 +81,17 @@ rule parse_mhc_out:
 
 rule mhc_csv_table:
     input:
-        info="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv",
-        neo="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.neo.tsv",
-        normal="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.normal.tsv",
+        info="results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv",
+        neo="results/{mhc}/{group}.{tumor_event}.mhc.neo.tsv",
+        normal="results/{mhc}/{group}.{tumor_event}.mhc.normal.tsv",
     output:
         report(
-            "results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv",
+            "results/neoantigens/{group}.{tumor_event}.{mhc}.DNA.tsv",
             caption="../report/WES_results.rst",
             category="Results WES (netMHC)",
         ),
     log:
-        "logs/create-mhc-table/{mhc}-{cancer_sample}.log",
+        "logs/mhc_csv_table/{group}.{mhc}.{tumor_event}.log",
     script:
         "../scripts/merge_data.py"
 
diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
index a3889467..48e51d5f 100644
--- a/workflow/rules/calling.smk
+++ b/workflow/rules/calling.smk
@@ -1,17 +1,17 @@
-rule strelka_somatic:
+rule strelka_tumor:
     input:
-        normal=get_normal_bam,
-        normal_index=get_normal_bai,
-        tumor="results/recal/{cancer_sample}.sorted.bam",
-        tumor_index="results/recal/{cancer_sample}.sorted.bam.bai",
+        normal=get_normal_bam(),
+        normal_index=get_normal_bam(ext=".bam.bai"),
+        tumor=get_tumor_bam(),
+        tumor_index=get_tumor_bam(ext=".bam.bai"),
         fasta="resources/genome.fasta",
         fasta_index="resources/genome.fasta.fai",
         callregions="resources/genome.callregions.bed.gz",
     output:
-        "results/strelka/somatic/{cancer_sample}/results/variants/somatic.snvs.vcf.gz",
-        "results/strelka/somatic/{cancer_sample}/results/variants/somatic.indels.vcf.gz",
+        "results/strelka/{group}.strelka_somatic.snvs.vcf.gz",
+        "results/strelka/{group}.strelka_somatic.indels.vcf.gz",
     log:
-        "logs/calling/strelka_somatic/{cancer_sample}.log",
+        "logs/calling/strelka/{group}.strelka_somatic.log",
     params:
         config_extra="--callRegions {} {}".format(
             "resources/genome.callregions.bed.gz",
@@ -25,13 +25,13 @@ rule strelka_somatic:
 
 rule strelka_germline:
     input:
-        bam="results/recal/{normal_sample}.sorted.bam",
-        normal_index="results/recal/{normal_sample}.sorted.bam.bai",
+        bam=get_normal_bam(),
+        normal_index=get_normal_bam(ext=".bam.bai"),
         fasta="resources/genome.fasta",
         fasta_index="resources/genome.fasta.fai",
         callregions="resources/genome.callregions.bed.gz",
     output:
-        "results/strelka/germline/{normal_sample}/results/variants/variants.vcf.gz",
+        "results/strelka/{group}.strelka_germline.variants.vcf.gz",
     log:
         "logs/calling/strelka_germline/{normal_sample}.log",
     params:
@@ -61,17 +61,17 @@ rule vcf_to_bcf:
 rule concat_somatic:
     input:
         calls=expand(
-            "results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf",
+            "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf",
             type=["snvs", "indels"],
         ),
         indices=expand(
-            "results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf.csi",
+            "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf.csi",
             type=["snvs", "indels"],
         ),
     output:
-        "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf",
+        "results/strelka/{group}.strelka_somatic.bcf",
     log:
-        "bcftools/concat-somatic/{sample}.log",
+        "bcftools/concat_somatic/{group}.log",
     params:
         "-O b -a",
     wrapper:
@@ -80,11 +80,11 @@ rule concat_somatic:
 
 rule get_tumor_from_somatic:
     input:
-        "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf",
+        "results/strelka/{group}.strelka_somatic.bcf",
     output:
-        "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf",
+        "results/strelka/{group}.strelka_somatic.tumor.bcf",
     log:
-        "logs/bcftools/view-TUMOR/{sample}.log",
+        "logs/bcftools/get_tumor_from_somatic/{group}.strelka_somatic.tumor.log",
     params:
         "-O b -s TUMOR",
     wrapper:
@@ -93,12 +93,12 @@ rule get_tumor_from_somatic:
 
 rule reheader_germline:
     input:
-        vcf="{germline}/variants.output.bcf",
+        vcf="results/strelka/{group}.strelka_germline.variants.output.bcf",
         samples="resources/sampleheader.txt",
     output:
-        "{germline}/variants.reheader.bcf",
+        "results/strelka/{group}.strelka_germline.variants.reheader.bcf",
     log:
-        "logs/bcftools/reheader/{germline}.log",
+        "logs/bcftools/reheader_germline/{group}.log",
     params:
         extra="",
         view_extra="-O b",
@@ -108,33 +108,24 @@ rule reheader_germline:
 
 rule concat_variants:
     input:
-        calls=lambda w: get_pair_variants(w, index=False),
-        index=lambda w: get_pair_variants(w, index=True),
+        calls=[ 
+            "results/strelka/{group}.strelka_somatic.tumor.bcf",
+            "results/strelka/{group}.strelka_germline.variants.reheader.bcf",
+        ],
+        index=[ 
+            "results/strelka/{group}.strelka_somatic.tumor.bcf.csi",
+            "results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi",
+        ],
     output:
-        "results/strelka/merged/{sample}/all_variants.bcf",
+        "results/strelka/merged/{group}.strelka_somatic.strelka_germline.bcf",
     log:
-        "bcftools/concat-all/{sample}.log",
+        "bcftools/concat_variants/{group}.strelka_somatic.strelka_germline.log",
     params:
         extra="-O b -a",
     wrapper:
         "0.64.0/bio/bcftools/concat"
 
 
-rule preprocess_variants:
-    input:
-        variants="{variants}.bcf",
-    output:
-        "{variants}.prepy.bcf",
-    params:
-        extra="-L --somatic",
-        genome="resources/genome.fasta",
-    log:
-        "logs/prepy/{variants}.log",
-    threads: 2
-    wrapper:
-        "0.60.0/bio/hap.py/pre.py"
-
-
 rule norm_vcf:
     input:
         "{prefix}.bcf",
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 33fd5918..9e2f88b5 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -74,22 +74,27 @@ def is_activated(xpath):
 
 
 def get_final_output():
+    final_output = []
     if config["epitope_prediction"]["activate"]:
-        final_output = expand(
-            "results/neoantigens/{mhc}/{S.sample_name}.{S.sequencing_type}.xlsx",
-            S=units.loc[samples[samples.alias == "tumor"]["sample_name"]]
-            .drop_duplicates(["sample_name", "sequencing_type"])
-            .itertuples(),
-            mhc=list(
-                filter(
-                    None,
-                    [
-                        "netMHCpan" if is_activated("affinity/netMHCpan") else None,
-                        "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None,
-                    ],
-                )
-            ),
-        )
+        for group in pd.unique(samples["group"]):
+            samples = samples.loc[samples["group"] == group, "sample_name"]
+            sequencing_types = pd.unique(units.loc[units.sample_name in samples, "sequencing_type"])
+            final_output.extend(
+                expand(
+                    "results/neoantigens/{group}.{tumor_event}.{mhc}.{seqtype}.tsv",
+                    group=group,
+                    tumor_event=config["params"]["microphaser"]["events"]["tumor"],
+                    mhc=list(
+                        filter(
+                            None,
+                            [
+                                "netMHCpan" if is_activated("affinity/netMHCpan") else None,
+                                "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None,
+                            ],
+                        )
+                    ),
+                    seqtype=sequencing_types,
+            )
     else:
         if config["HLAtyping"]["HLA_LA"]["activate"]:
             final_output = expand(
@@ -340,13 +345,13 @@ def get_pair_variants(wildcards, index):
     else:
         ext = ""
     variants = [
-        "results/strelka/somatic/{}/results/variants/somatic.complete.tumor.bcf{}".format(
+        "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf{ext}".format(
             wildcards.sample, ext
         )
     ]
     variants.append(
         "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format(
-            get_normal(wildcards.sample), ext
+            get_normal_from_group(wildcards.group), ext
         )
     )
     return variants
@@ -376,7 +381,7 @@ def get_merge_input(ext=".bcf"):
 
 def get_pair_aliases(wildcards):
     return [
-        samples.loc[get_normal(wildcards.cancer_sample), "alias"],
+        samples.loc[get_normal_from_sample(wildcards.cancer_sample), "alias"],
         samples.loc[wildcards.cancer_sample, "alias"],
     ]
 
@@ -389,6 +394,16 @@ def get_tabix_params(wildcards):
     raise ValueError("Invalid format for tabix: {}".format(wildcards.format))
 
 
+def get_normal_bam(wildcards, ext=".bam"):
+    normal_sample = get_normal_from_group(wildcards.group)
+    return f"results/recal/{normal_sample}.sorted{ext}"
+
+
+def get_tumor_bam(wildcards, ext=".bam"):
+    tumor_sample = get_tumor_from_group(wildcards.group)
+    return f"results/recal/{tumor_sample}.sorted{ext}"
+
+
 ## RNA ##
 
 
@@ -418,7 +433,7 @@ def kallisto_params(wildcards, input):
 
 def get_paired_samples(wildcards):
     return [
-        get_normal(wildcards.cancer_sample),
+        get_normal_from_sample(wildcards.cancer_sample),
         samples.loc[wildcards.cancer_sample, "sample_name"],
     ]
 
@@ -435,7 +450,7 @@ def get_paired_bais(wildcards):
     )
 
 
-def get_normal(sample_name):
+def get_normal_from_sample(sample_name):
     normal_sample = samples.loc[
         (samples["group"] == samples.loc[sample_name, "group"])
         & (samples["alias"] == "normal"),
@@ -444,6 +459,24 @@ def get_normal(sample_name):
     return normal_sample
 
 
+def get_normal_from_group(group):
+    normal_sample = samples.loc[
+        (samples["group"] == group)
+        & (samples["alias"] == "normal"),
+        "sample_name",
+    ].iat[0]
+    return normal_sample
+
+
+def get_tumor_from_group(group):
+    tumor_sample = samples.loc[
+        (samples["group"] == group)
+        & (samples["alias"] == "tumor"),
+        "sample_name",
+    ].iat[0]
+    return tumor_sample
+
+
 def get_reads(wildcards):
     return get_seperate(wildcards.sample, wildcards.group)
 
@@ -454,40 +487,30 @@ def get_seperate(sample, group):
 
 def get_proteome(wildcards):
     return expand(
-        "results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin",
-        normal_sample=get_normal(wildcards.cancer_sample),
+        "results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
+        normal_event=config["params"]["microphaser"]["events"]["normal"],
         mhc=wildcards.mhc,
     )
 
 
 def get_alleles_MHCI(wildcards):
-    if wildcards.peptide_type == "wt":
+    if wildcards.peptide_type == "normal":
         return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
-            S=get_normal(wildcards.cancer_sample)
+            S=get_normal_from_group(wildcards.group)
         )
     else:
         return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
-            S=wildcards.cancer_sample
+            S=get_tumor_from_group(wildcards.group)
         )
 
 
 def get_alleles_MHCII(wildcards):
-    if wildcards.peptide_type == "wt":
+    if wildcards.peptide_type == "normal":
         return "results/HLA-LA/hlaI_{S}.tsv".format(
-            S=get_normal(wildcards.cancer_sample)
+            S=get_normal_from_group(wildcards.group)
         )
     else:
-        return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.cancer_sample)
-
-
-def get_normal_bam(wildcards):
-    return expand(
-        "results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample)
-    )
-
+        return "results/HLA-LA/hlaI_{S}.tsv".format(
+            S=get_tumor_from_group(wildcards.group)
+        )
 
-def get_normal_bai(wildcards):
-    return expand(
-        "results/recal/{normal}.sorted.bam.bai",
-        normal=get_normal(wildcards.cancer_sample),
-    )
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index caef83cf..695fac1b 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -1,16 +1,16 @@
-rule microphaser_somatic:
+rule microphaser_tumor:
     input:
-        vcf="results/strelka/merged/{cancer_sample}/all_variants.norm.annotated.bcf",
-        bam="results/recal/{cancer_sample}.sorted.bam",
-        bai="results/recal/{cancer_sample}.sorted.bam.bai",
+        vcf="results/strelka/merged/{group}.{tumor_event}.{normal_event}.norm.annotated.bcf",
+        bam=get_tumor_bam(),
+        bai=get_tumor_bam(ext=".bam.bai"),
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        mt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.neo.fa",
-        wt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.normal.fa",
-        tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv",
+        mt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.neo.fa",
+        wt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.normal.fa",
+        tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{normal_event}.{contig}.tsv",
     log:
-        "logs/microphaser/somatic/{cancer_sample}-{contig}.log",
+        "logs/microphaser_tumor/{group}/{tumor_event}-{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -20,22 +20,22 @@ rule microphaser_somatic:
         "< {input.track} > {output.mt_fasta} 2> {log}"
 
 
-rule microphaser_germline:
+rule microphaser_normal:
     input:
-        vcf="results/strelka/germline/{normal_sample}/results/variants/variants.reheader.norm.bcf",
-        bam="results/recal/{normal_sample}.sorted.bam",
-        bai="results/recal/{normal_sample}.sorted.bam.bai",
+        vcf="results/strelka/normal/{group}.{normal_event}.variants.reheader.norm.bcf",
+        bam=get_normal_bam(),
+        bai=get_normal_bam(ext=".bam.bai"),
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
         wt_fasta=(
-            "results/microphaser/fasta/germline/{normal_sample}/{normal_sample}.germline.{contig}.fa"
+            "results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa"
         ),
         wt_tsv=(
-            "results/microphaser/info/germline/{normal_sample}/{normal_sample}.germline.{contig}.tsv"
+            "results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv"
         ),
     log:
-        "logs/microphaser/germline/{normal_sample}-{contig}.log",
+        "logs/microphaser_germline/{group}/{normal_event}-{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -45,28 +45,28 @@ rule microphaser_germline:
         "< {input.track} > {output.wt_fasta} 2> {log}"
 
 
-rule concat_proteome:
+rule concat_normal_proteome:
     input:
         expand(
-            "results/microphaser/fasta/germline/{{normal_sample}}/{{normal_sample}}.germline.{contig}.fa",
+            "results/microphaser/fasta/{{group}}/normal.{{normal_event}}.{contig}.fa",
             contig=contigs,
         ),
     output:
-        "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa",
+        "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa",
     log:
-        "logs/microphaser/concat-ref-proteome/{normal_sample}.log",
+        "logs/microphaser/concat_normal_proteome/{group}.{normal_event}.log",
     shell:
         "cat {input} > {output} 2> {log}"
 
 
-rule build_germline_proteome:
+rule build_normal_proteome_db:
     input:
-        "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa",
+        "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa",
     output:
-        bin="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin",
-        fasta="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.peptides.fasta",
+        bin="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
+        fasta="results/microphaser/fasta/{group}.{normal_event}.{mhc}.normal_proteome.peptides.fasta",
     log:
-        "logs/microphaser/build-ref-proteome-db/{normal_sample}-{mhc}.log",
+        "logs/microphaser/build_normal_proteome_db/{group}.{normal_event}-{mhc}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -79,19 +79,19 @@ rule build_germline_proteome:
 
 rule microphaser_filter:
     input:
-        tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv",
-        proteome=get_proteome,
+        tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv",
+        proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
     output:
         mt_fasta=(
-            "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.neo.fa"
+            "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa"
         ),
         wt_fasta=(
-            "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.normal.fa"
+            "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.normal.fa"
         ),
-        tsv="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.tsv",
-        removed="results/microphaser/info/{cancer_sample}/removed/{mhc}/{cancer_sample}.{contig}.removed.tsv",
+        tsv="results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv",
+        removed="results/microphaser/info/removed/{group}/{mhc}.{tumor_event}.{contig}.removed.tsv",
     log:
-        "logs/microphaser/filter/{cancer_sample}-{mhc}-{contig}.log",
+        "logs/microphaser_filter/{group}/{mhc}.{tumor_event}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -105,13 +105,13 @@ rule microphaser_filter:
 rule concat_tsvs:
     input:
         expand(
-            "results/microphaser/info/{{cancer_sample}}/filtered/{{mhc}}/{{cancer_sample}}.{contig}.tsv",
+            "results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv",
             contig=contigs,
         ),
     output:
-        "results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv",
+        "results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv",
     log:
-        "logs/concat-tsv/{cancer_sample}-{mhc}.log",
+        "logs/concat_tsvs/{group}.{mhc}.{tumor_event}.log",
     conda:
         "../envs/xsv.yaml"
     shell:

From 8cc18bb6fba3015711135afc86afc4617a3d3664 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 15:52:16 +0200
Subject: [PATCH 032/191] add missing parenthesis

---
 workflow/rules/common.smk | 1 +
 1 file changed, 1 insertion(+)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 9e2f88b5..7472c96a 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -94,6 +94,7 @@ def get_final_output():
                         )
                     ),
                     seqtype=sequencing_types,
+                )
             )
     else:
         if config["HLAtyping"]["HLA_LA"]["activate"]:

From 14d4eb93f50d4820347e156395473918870be20a Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 14:33:59 +0000
Subject: [PATCH 033/191] remove unused function get_pair_variants() in
 common.smk

---
 workflow/rules/common.smk | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 7472c96a..6b20358f 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -340,24 +340,6 @@ def get_fdr_control_params(wildcards):
     return {"threshold": threshold, "events": events}
 
 
-def get_pair_variants(wildcards, index):
-    if index:
-        ext = ".csi"
-    else:
-        ext = ""
-    variants = [
-        "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf{ext}".format(
-            wildcards.sample, ext
-        )
-    ]
-    variants.append(
-        "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format(
-            get_normal_from_group(wildcards.group), ext
-        )
-    )
-    return variants
-
-
 def get_pair_observations(wildcards):
     return expand(
         "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf",

From d57cac359311f589eeebdb46e58d0ae7784df10c Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 14:35:06 +0000
Subject: [PATCH 034/191] switch input functions with extra args to def
 inner(wildcards): syntax

---
 workflow/rules/common.smk | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 6b20358f..8e5b8d08 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -271,12 +271,14 @@ def get_read_group(wildcards):
     )
 
 
-def get_recalibrate_quality_input(wildcards, bai=False):
+def get_recalibrate_quality_input(bai=False):
     ext = ".bai" if bai else ""
-    if is_activated("remove_duplicates"):
-        return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext)
-    else:
-        return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext)
+    def inner(wildcards):
+        if is_activated("remove_duplicates"):
+            return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext)
+        else:
+            return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext)
+    return inner
 
 
 ## HLA Typing ##
@@ -377,14 +379,18 @@ def get_tabix_params(wildcards):
     raise ValueError("Invalid format for tabix: {}".format(wildcards.format))
 
 
-def get_normal_bam(wildcards, ext=".bam"):
-    normal_sample = get_normal_from_group(wildcards.group)
-    return f"results/recal/{normal_sample}.sorted{ext}"
+def get_normal_bam(ext=".bam"):
+    def inner(wildcards):
+        normal_sample = get_normal_from_group(wildcards.group)
+        return f"results/recal/{normal_sample}.sorted{ext}"
+    return inner
 
 
-def get_tumor_bam(wildcards, ext=".bam"):
-    tumor_sample = get_tumor_from_group(wildcards.group)
-    return f"results/recal/{tumor_sample}.sorted{ext}"
+def get_tumor_bam(ext=".bam"):
+    def inner(wildcards):
+        tumor_sample = get_tumor_from_group(wildcards.group)
+        return f"results/recal/{tumor_sample}.sorted{ext}"
+    return inner
 
 
 ## RNA ##

From 569aaadbc9451202f487077a9b9bc481a76d95d3 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 14:36:53 +0000
Subject: [PATCH 035/191] fix overlooked wildcard rename substitution

---
 workflow/rules/calling.smk     | 2 +-
 workflow/rules/microphaser.smk | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
index 48e51d5f..4193d528 100644
--- a/workflow/rules/calling.smk
+++ b/workflow/rules/calling.smk
@@ -33,7 +33,7 @@ rule strelka_germline:
     output:
         "results/strelka/{group}.strelka_germline.variants.vcf.gz",
     log:
-        "logs/calling/strelka_germline/{normal_sample}.log",
+        "logs/calling/strelka_germline/{group}.log",
     params:
         config_extra="--callRegions {} {}".format(
             "resources/genome.callregions.bed.gz",
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 695fac1b..939b2526 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -10,7 +10,7 @@ rule microphaser_tumor:
         wt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.normal.fa",
         tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{normal_event}.{contig}.tsv",
     log:
-        "logs/microphaser_tumor/{group}/{tumor_event}-{contig}.log",
+        "logs/microphaser_tumor/{group}/{tumor_event}.{normal_event}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:

From 3c9e0e73e6b3855f1dfdeb0c5bc96d2f47ab9f6f Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 14:39:40 +0000
Subject: [PATCH 036/191] rule concat_tsvs: fix wildcard pass-through via
 double brackets

---
 workflow/rules/microphaser.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 939b2526..548e39e5 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -105,7 +105,7 @@ rule microphaser_filter:
 rule concat_tsvs:
     input:
         expand(
-            "results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv",
+            "results/microphaser/info/filtered/{{group}}/{{mhc}}.{{tumor_event}}.{contig}.tsv",
             contig=contigs,
         ),
     output:

From b785271752faf1ad6038a3b29c73e6aa4cf51a3d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 14:40:11 +0000
Subject: [PATCH 037/191] add missing commas in netMHCpan and netMHCIIpan inputs

---
 workflow/rules/MHC_binding.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 893d403f..eca8f056 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -18,7 +18,7 @@
 
 rule netMHCpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa"
+        peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
         "results/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls",
@@ -35,7 +35,7 @@ rule netMHCpan:
 
 rule netMHCIIpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa"
+        peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
         "results/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls",

From 67c328662faed68fbc8f240d096e577637775470 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 14:59:01 +0000
Subject: [PATCH 038/191] snakefmt

---
 workflow/rules/calling.smk     |  4 ++--
 workflow/rules/common.smk      | 23 +++++++++++++++--------
 workflow/rules/microphaser.smk |  8 ++------
 3 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
index 4193d528..3ac76cb4 100644
--- a/workflow/rules/calling.smk
+++ b/workflow/rules/calling.smk
@@ -108,11 +108,11 @@ rule reheader_germline:
 
 rule concat_variants:
     input:
-        calls=[ 
+        calls=[
             "results/strelka/{group}.strelka_somatic.tumor.bcf",
             "results/strelka/{group}.strelka_germline.variants.reheader.bcf",
         ],
-        index=[ 
+        index=[
             "results/strelka/{group}.strelka_somatic.tumor.bcf.csi",
             "results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi",
         ],
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 8e5b8d08..3eb2fd65 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -78,7 +78,9 @@ def get_final_output():
     if config["epitope_prediction"]["activate"]:
         for group in pd.unique(samples["group"]):
             samples = samples.loc[samples["group"] == group, "sample_name"]
-            sequencing_types = pd.unique(units.loc[units.sample_name in samples, "sequencing_type"])
+            sequencing_types = pd.unique(
+                units.loc[units.sample_name in samples, "sequencing_type"]
+            )
             final_output.extend(
                 expand(
                     "results/neoantigens/{group}.{tumor_event}.{mhc}.{seqtype}.tsv",
@@ -88,8 +90,12 @@ def get_final_output():
                         filter(
                             None,
                             [
-                                "netMHCpan" if is_activated("affinity/netMHCpan") else None,
-                                "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None,
+                                "netMHCpan"
+                                if is_activated("affinity/netMHCpan")
+                                else None,
+                                "netMHCIIpan"
+                                if is_activated("affinity/netMHCIIpan")
+                                else None,
                             ],
                         )
                     ),
@@ -273,11 +279,13 @@ def get_read_group(wildcards):
 
 def get_recalibrate_quality_input(bai=False):
     ext = ".bai" if bai else ""
+
     def inner(wildcards):
         if is_activated("remove_duplicates"):
             return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext)
         else:
             return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext)
+
     return inner
 
 
@@ -383,6 +391,7 @@ def get_normal_bam(ext=".bam"):
     def inner(wildcards):
         normal_sample = get_normal_from_group(wildcards.group)
         return f"results/recal/{normal_sample}.sorted{ext}"
+
     return inner
 
 
@@ -390,6 +399,7 @@ def get_tumor_bam(ext=".bam"):
     def inner(wildcards):
         tumor_sample = get_tumor_from_group(wildcards.group)
         return f"results/recal/{tumor_sample}.sorted{ext}"
+
     return inner
 
 
@@ -450,8 +460,7 @@ def get_normal_from_sample(sample_name):
 
 def get_normal_from_group(group):
     normal_sample = samples.loc[
-        (samples["group"] == group)
-        & (samples["alias"] == "normal"),
+        (samples["group"] == group) & (samples["alias"] == "normal"),
         "sample_name",
     ].iat[0]
     return normal_sample
@@ -459,8 +468,7 @@ def get_normal_from_group(group):
 
 def get_tumor_from_group(group):
     tumor_sample = samples.loc[
-        (samples["group"] == group)
-        & (samples["alias"] == "tumor"),
+        (samples["group"] == group) & (samples["alias"] == "tumor"),
         "sample_name",
     ].iat[0]
     return tumor_sample
@@ -502,4 +510,3 @@ def get_alleles_MHCII(wildcards):
         return "results/HLA-LA/hlaI_{S}.tsv".format(
             S=get_tumor_from_group(wildcards.group)
         )
-
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 548e39e5..0c07d588 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -28,12 +28,8 @@ rule microphaser_normal:
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        wt_fasta=(
-            "results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa"
-        ),
-        wt_tsv=(
-            "results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv"
-        ),
+        wt_fasta=("results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa"),
+        wt_tsv=("results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv"),
     log:
         "logs/microphaser_germline/{group}/{normal_event}-{contig}.log",
     conda:

From 5e7e7e9e6dac4e6d2d1e32b8f5c4a22b8d5ee251 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 15:56:01 +0000
Subject: [PATCH 039/191] fix samples handling for get_final_output()

---
 workflow/rules/common.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 3eb2fd65..70897797 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -77,9 +77,9 @@ def get_final_output():
     final_output = []
     if config["epitope_prediction"]["activate"]:
         for group in pd.unique(samples["group"]):
-            samples = samples.loc[samples["group"] == group, "sample_name"]
+            smps = samples.loc[samples["group"] == group, "sample_name"]
             sequencing_types = pd.unique(
-                units.loc[units.sample_name in samples, "sequencing_type"]
+                units.loc[units["sample_name"].isin(smps), "sequencing_type"]
             )
             final_output.extend(
                 expand(

From 59ef419872076a6919aac31e9dd749545eb60afd Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 15:56:46 +0000
Subject: [PATCH 040/191] include required params: microphaser: events: entry
 "tumor:"

---
 config/config.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/config/config.yaml b/config/config.yaml
index f1e6818a..f6e5d8c3 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -116,6 +116,8 @@ params:
           9
       netMHCIIpan:
           15
+    events:
+      tumor: "tumor_unfiltered"
   kallisto:
     "-b 100"
   star: >-

From 4cfa6c1d5ec8fb9fe2e805473b33d12b3865ca80 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 16:18:19 +0000
Subject: [PATCH 041/191] set and use microphaser normal event definition in
 config.yaml

---
 config/config.yaml             | 1 +
 workflow/rules/common.smk      | 4 +---
 workflow/rules/microphaser.smk | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index f6e5d8c3..b031d126 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -118,6 +118,7 @@ params:
           15
     events:
       tumor: "tumor_unfiltered"
+      normal: "normal_unfiltered"
   kallisto:
     "-b 100"
   star: >-
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 70897797..b956b741 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -484,12 +484,10 @@ def get_seperate(sample, group):
 
 def get_proteome(wildcards):
     return expand(
-        "results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
+        "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin",
         normal_event=config["params"]["microphaser"]["events"]["normal"],
-        mhc=wildcards.mhc,
     )
 
-
 def get_alleles_MHCI(wildcards):
     if wildcards.peptide_type == "normal":
         return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 0c07d588..1437d866 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -76,7 +76,7 @@ rule build_normal_proteome_db:
 rule microphaser_filter:
     input:
         tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv",
-        proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
+        proteome=get_proteome(),
     output:
         mt_fasta=(
             "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa"

From d6c1287fb780fa7b42e10d6c038103913854c68b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 20:57:31 +0000
Subject: [PATCH 042/191] fix input function syntax

---
 workflow/rules/common.smk  | 10 ++--------
 workflow/rules/mapping.smk |  8 ++++----
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index b956b741..600b9fce 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -282,9 +282,9 @@ def get_recalibrate_quality_input(bai=False):
 
     def inner(wildcards):
         if is_activated("remove_duplicates"):
-            return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext)
+            return f"results/dedup/{wildcards.sample}.sorted.bam{ext}"
         else:
-            return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext)
+            return f"results/mapped/{wildcards.sample}.sorted.bam{ext}"
 
     return inner
 
@@ -482,12 +482,6 @@ def get_seperate(sample, group):
     return units.loc[(sample, "DNA"), "fq{}".format(str(group))]
 
 
-def get_proteome(wildcards):
-    return expand(
-        "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin",
-        normal_event=config["params"]["microphaser"]["events"]["normal"],
-    )
-
 def get_alleles_MHCI(wildcards):
     if wildcards.peptide_type == "normal":
         return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
index 8cfcb51c..13be68ad 100644
--- a/workflow/rules/mapping.smk
+++ b/workflow/rules/mapping.smk
@@ -32,8 +32,8 @@ rule mark_duplicates:
 
 rule recalibrate_base_qualities:
     input:
-        bam=get_recalibrate_quality_input,
-        bai=lambda w: get_recalibrate_quality_input(w, bai=True),
+        bam=get_recalibrate_quality_input(),
+        bai=get_recalibrate_quality_input(bai=True),
         ref="resources/genome.fasta",
         ref_dict="resources/genome.dict",
         ref_fai="resources/genome.fasta.fai",
@@ -53,8 +53,8 @@ rule recalibrate_base_qualities:
 
 rule apply_bqsr:
     input:
-        bam=get_recalibrate_quality_input,
-        bai=lambda w: get_recalibrate_quality_input(w, bai=True),
+        bam=get_recalibrate_quality_input(),
+        bai=get_recalibrate_quality_input(bai=True),
         ref="resources/genome.fasta",
         ref_dict="resources/genome.dict",
         ref_fai="resources/genome.fasta.fai",

From 75cfee31de99731d6bc40aca59c36da5b985c181 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 9 May 2022 20:58:21 +0000
Subject: [PATCH 043/191] fix input of rule microphaser_filter

---
 workflow/rules/microphaser.smk | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 1437d866..b8d0dae9 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -76,7 +76,10 @@ rule build_normal_proteome_db:
 rule microphaser_filter:
     input:
         tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv",
-        proteome=get_proteome(),
+        proteome=expand(
+            "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin",
+            normal_event=config["params"]["microphaser"]["events"]["normal"],
+        ),
     output:
         mt_fasta=(
             "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa"

From 71790e3b041977916495588739c9768119525323 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 10 May 2022 16:02:14 +0000
Subject: [PATCH 044/191] reworked to consistently allow for multiple tumor
 aliases per group

---
 config/config.yaml             |  4 +-
 workflow/rules/HLAtyping.smk   | 26 ++++-----
 workflow/rules/MHC_binding.smk | 36 ++++++-------
 workflow/rules/RNA.smk         |  4 +-
 workflow/rules/calling.smk     | 44 +++++++--------
 workflow/rules/common.smk      | 99 +++++++++++++++++-----------------
 workflow/rules/microphaser.smk | 34 ++++++------
 7 files changed, 122 insertions(+), 125 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index b031d126..e5b9670a 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -117,8 +117,8 @@ params:
       netMHCIIpan:
           15
     events:
-      tumor: "tumor_unfiltered"
-      normal: "normal_unfiltered"
+      tumor: "strelka_somatic"
+      normal: "strelka_germline"
   kallisto:
     "-b 100"
   star: >-
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index e68cd89b..bacfa45a 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -1,13 +1,13 @@
 rule HLA_LA:
     input:
-        bam="results/recal/{sample}.sorted.bam",
-        bai="results/recal/{sample}.sorted.bam.bai",
+        bam=get_bam_from_group_and_alias(),
+        bai=get_bam_from_group_and_alias(ext=".bai"),
         index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
     output:
-        "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt",
+        "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt",
     threads: 7
     log:
-        "logs/HLA-LA/{sample}.log",
+        "logs/HLA-LA/{group}.{alias}.log",
     params:
         graph=lambda w, input: os.path.basename(os.path.dirname(input.index)),
         graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)),
@@ -19,20 +19,20 @@ rule HLA_LA:
 
 rule parse_HLA_LA:
     input:
-        "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt",
+        "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt",
     output:
         report(
-            "results/HLA-LA/hlaI_{sample}.tsv",
+            "results/HLA-LA/{group}.{alias}.hlaI.tsv",
             caption="../report/HLA_Types.rst",
             category="HLA-Typing(HLA-LA)",
         ),
         report(
-            "results/HLA-LA/hlaII_{sample}.tsv",
+            "results/HLA-LA/{group}.{alias}.hlaII.tsv",
             caption="../report/HLA_Types.rst",
             category="HLA-Typing(HLA-LA)",
         ),
     log:
-        "logs/parse-HLA-LA/{sample}.log",
+        "logs/parse-HLA-LA/{group}.{alias}.log",
     script:
         "../scripts/parse_HLA_types.py"
 
@@ -71,10 +71,10 @@ rule OptiType:
         reads=get_optitype_reads_input,
     output:
         multiext(
-            "results/optitype/{sample}/{sample}", "_coverage_plot.pdf", "_result.tsv"
+            "results/optitype/{group}/{group}.{alias}", ".coverage_plot.pdf", ".result.tsv"
         ),
     log:
-        "logs/optitype/{sample}.log",
+        "logs/optitype/{group}.{alias}.log",
     params:
         extra=config["params"]["optitype"],
         sequencing_type="dna",
@@ -84,15 +84,15 @@ rule OptiType:
 
 rule parse_Optitype:
     input:
-        "results/optitype/{sample}/{sample}_result.tsv",
+        "results/optitype/{group}/{group}.{alias}.result.tsv",
     output:
         report(
-            "results/optitype/{sample}/hla_alleles_{sample}.tsv",
+            "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
             caption="../report/HLA_Types.rst",
             category="HLA-Typing(Optitype)",
         ),
     log:
-        "logs/parse-optitype/{sample}.log",
+        "logs/parse-optitype/{group}.{alias}.log",
     shell:
         "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' "
         '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}'
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index eca8f056..d33b0f82 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -18,12 +18,12 @@
 
 rule netMHCpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls",
+        "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.log",
+        "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
@@ -35,12 +35,12 @@ rule netMHCpan:
 
 rule netMHCIIpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCIIpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls",
+        "results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.log",
+        "logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
@@ -53,13 +53,13 @@ rule netMHCIIpan:
 rule parse_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{group}}/{{tumor_event}}.{contig}.{{peptide_type}}.xls",
+            "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{contig}.{{peptide_type}}.xls",
             contig=contigs,
         ),
     output:
-        "results/{mhc}/{group}.{tumor_event}.mhc.{peptide_type}.tsv",
+        "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.{peptide_type}.tsv",
     log:
-        "logs/parse_mhc_out/{mhc}/{group}.{tumor_event}.{peptide_type}.log",
+        "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{peptide_type}.log",
     script:
         "../scripts/group_mhc_output.py"
 
@@ -81,17 +81,17 @@ rule parse_mhc_out:
 
 rule mhc_csv_table:
     input:
-        info="results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv",
-        neo="results/{mhc}/{group}.{tumor_event}.mhc.neo.tsv",
-        normal="results/{mhc}/{group}.{tumor_event}.mhc.normal.tsv",
+        info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv",
+        neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.neo.tsv",
+        normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.normal.tsv",
     output:
         report(
-            "results/neoantigens/{group}.{tumor_event}.{mhc}.DNA.tsv",
+            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv",
             caption="../report/WES_results.rst",
             category="Results WES (netMHC)",
         ),
     log:
-        "logs/mhc_csv_table/{group}.{mhc}.{tumor_event}.log",
+        "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{mhc}.log",
     script:
         "../scripts/merge_data.py"
 
@@ -109,17 +109,17 @@ rule mhc_csv_table:
 
 rule add_RNA_info:
     input:
-        counts="results/kallisto/{cancer_sample}",
-        table="results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv",
+        counts="results/kallisto/{group}.{tumor_alias}",
+        table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv",
     output:
         report(
-            "results/neoantigens/{mhc}/{cancer_sample}.RNA.tsv",
+            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.RNA.tsv",
             caption="../report/RNA_results.rst",
             category="Results RNA",
         ),
     params:
         abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
     log:
-        "logs/add-RNA/{mhc}-{cancer_sample}.log",
+        "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{mhc}.log",
     script:
         "../scripts/add_rna_info.py"
diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk
index 940d7c02..2a927c18 100644
--- a/workflow/rules/RNA.smk
+++ b/workflow/rules/RNA.smk
@@ -3,11 +3,11 @@ rule kallisto_quant:
         fastq=get_quant_reads_input,
         index="resources/kallisto/transcripts.idx",
     output:
-        directory("results/kallisto/{sample}"),
+        directory("results/kallisto/{group}.{tumor_alias}"),
     params:
         extra=kallisto_params,
     log:
-        "results/logs/kallisto/quant/{sample}.log",
+        "results/logs/kallisto/quant/{group}.{tumor_alias}.log",
     wrapper:
         "0.60.1/bio/kallisto/quant"
 
diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
index 3ac76cb4..9fca7233 100644
--- a/workflow/rules/calling.smk
+++ b/workflow/rules/calling.smk
@@ -1,17 +1,17 @@
 rule strelka_tumor:
     input:
         normal=get_normal_bam(),
-        normal_index=get_normal_bam(ext=".bam.bai"),
-        tumor=get_tumor_bam(),
-        tumor_index=get_tumor_bam(ext=".bam.bai"),
+        normal_index=get_normal_bam(ext=".bai"),
+        tumor=get_tumor_bam_from_group_and_alias(),
+        tumor_index=get_tumor_bam_from_group_and_alias(ext=".bai"),
         fasta="resources/genome.fasta",
         fasta_index="resources/genome.fasta.fai",
         callregions="resources/genome.callregions.bed.gz",
     output:
-        "results/strelka/{group}.strelka_somatic.snvs.vcf.gz",
-        "results/strelka/{group}.strelka_somatic.indels.vcf.gz",
+        "results/strelka/{group}.{tumor_alias}.strelka_somatic.snvs.vcf.gz",
+        "results/strelka/{group}.{tumor_alias}.strelka_somatic.indels.vcf.gz",
     log:
-        "logs/calling/strelka/{group}.strelka_somatic.log",
+        "logs/calling/strelka/{group}.{tumor_alias}.strelka_somatic.log",
     params:
         config_extra="--callRegions {} {}".format(
             "resources/genome.callregions.bed.gz",
@@ -26,12 +26,12 @@ rule strelka_tumor:
 rule strelka_germline:
     input:
         bam=get_normal_bam(),
-        normal_index=get_normal_bam(ext=".bam.bai"),
+        normal_index=get_normal_bam(ext=".bai"),
         fasta="resources/genome.fasta",
         fasta_index="resources/genome.fasta.fai",
         callregions="resources/genome.callregions.bed.gz",
     output:
-        "results/strelka/{group}.strelka_germline.variants.vcf.gz",
+        "results/strelka/{group}.normal.strelka_germline.variants.vcf.gz",
     log:
         "logs/calling/strelka_germline/{group}.log",
     params:
@@ -61,17 +61,17 @@ rule vcf_to_bcf:
 rule concat_somatic:
     input:
         calls=expand(
-            "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf",
+            "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf",
             type=["snvs", "indels"],
         ),
         indices=expand(
-            "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf.csi",
+            "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf.csi",
             type=["snvs", "indels"],
         ),
     output:
-        "results/strelka/{group}.strelka_somatic.bcf",
+        "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf",
     log:
-        "bcftools/concat_somatic/{group}.log",
+        "bcftools/concat_somatic/{group}.{tumor_alias}.log",
     params:
         "-O b -a",
     wrapper:
@@ -80,11 +80,11 @@ rule concat_somatic:
 
 rule get_tumor_from_somatic:
     input:
-        "results/strelka/{group}.strelka_somatic.bcf",
+        "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf",
     output:
-        "results/strelka/{group}.strelka_somatic.tumor.bcf",
+        "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf",
     log:
-        "logs/bcftools/get_tumor_from_somatic/{group}.strelka_somatic.tumor.log",
+        "logs/bcftools/get_tumor_from_somatic/{group}.{tumor_alias}.strelka_somatic.tumor.log",
     params:
         "-O b -s TUMOR",
     wrapper:
@@ -93,12 +93,12 @@ rule get_tumor_from_somatic:
 
 rule reheader_germline:
     input:
-        vcf="results/strelka/{group}.strelka_germline.variants.output.bcf",
+        vcf="results/strelka/{group}.normal.strelka_germline.variants.output.bcf",
         samples="resources/sampleheader.txt",
     output:
-        "results/strelka/{group}.strelka_germline.variants.reheader.bcf",
+        "results/strelka/{group}.normal.strelka_germline.variants.reheader.bcf",
     log:
-        "logs/bcftools/reheader_germline/{group}.log",
+        "logs/bcftools/reheader_germline/{group}.normal.log",
     params:
         extra="",
         view_extra="-O b",
@@ -109,17 +109,17 @@ rule reheader_germline:
 rule concat_variants:
     input:
         calls=[
-            "results/strelka/{group}.strelka_somatic.tumor.bcf",
+            "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf",
             "results/strelka/{group}.strelka_germline.variants.reheader.bcf",
         ],
         index=[
-            "results/strelka/{group}.strelka_somatic.tumor.bcf.csi",
+            "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf.csi",
             "results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi",
         ],
     output:
-        "results/strelka/merged/{group}.strelka_somatic.strelka_germline.bcf",
+        "results/strelka/merged/{group}.{tumor_alias}.strelka_somatic.strelka_germline.bcf",
     log:
-        "bcftools/concat_variants/{group}.strelka_somatic.strelka_germline.log",
+        "bcftools/concat_variants/{group}.{tumor_alias}.strelka_somatic.strelka_germline.log",
     params:
         extra="-O b -a",
     wrapper:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 600b9fce..0e159e7c 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -16,7 +16,11 @@ samples = (
     pd.read_csv(
         config["samples"],
         sep="\t",
-        dtype={"sample_name": str, "group": str},
+        dtype={
+            "sample_name": str,
+            "group": str,
+            "alias": str,
+        },
         comment="#",
     )
     .set_index("sample_name", drop=False)
@@ -55,6 +59,7 @@ wildcard_constraints:
     sample="|".join(samples["sample_name"]),
     unit="|".join(units["unit_name"]),
     alias="|".join(pd.unique(samples["alias"])),
+    tumor_alias="|".join(pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])),
     group="|".join(pd.unique(samples["group"])),
     caller="|".join(["freebayes", "delly"]),
     peptide_type="|".join(["normal", "neo"]),
@@ -81,10 +86,12 @@ def get_final_output():
             sequencing_types = pd.unique(
                 units.loc[units["sample_name"].isin(smps), "sequencing_type"]
             )
+            tumor_aliases = samples.loc[(samples["group"] == group) & (samples["alias"].str.match("tumor")), "alias"]
             final_output.extend(
                 expand(
-                    "results/neoantigens/{group}.{tumor_event}.{mhc}.{seqtype}.tsv",
+                    "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.{seqtype}.tsv",
                     group=group,
+                    tumor_alias=tumor_aliases,
                     tumor_event=config["params"]["microphaser"]["events"]["tumor"],
                     mhc=list(
                         filter(
@@ -261,8 +268,8 @@ def get_fastqs(wc):
     return units.loc[wc.sample].loc[wc.seqtype, fq].tolist()
 
 
-def get_map_reads_input(wildcards):
-    if is_paired_end(wildcards.sample, "DNA"):
+def get_map_reads_input(sample):
+    if is_paired_end(sample, "DNA"):
         return [
             "results/merged/DNA/{sample}_R1.fastq.gz",
             "results/merged/DNA/{sample}_R2.fastq.gz",
@@ -293,16 +300,17 @@ def get_recalibrate_quality_input(bai=False):
 
 
 def get_optitype_reads_input(wildcards):
+    sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias)
     if is_activated("HLAtyping/optitype_prefiltering"):
-        if is_paired_end(wildcards.sample, "DNA"):
+        if is_paired_end(sample, "DNA"):
             return expand(
                 "results/razers3/fastq/{sample}_{read}.fished.fastq",
-                sample=wildcards.sample,
+                sample=sample,
                 read=["R1", "R2"],
             )
         return "results/razers3/fastq/{sample}_single.fastq"
     else:
-        return get_map_reads_input(wildcards)
+        return get_map_reads_input(sample)
 
 
 def get_oncoprint_batch(wildcards):
@@ -387,27 +395,45 @@ def get_tabix_params(wildcards):
     raise ValueError("Invalid format for tabix: {}".format(wildcards.format))
 
 
+def get_sample_from_group_and_alias(group, alias):
+    sample = samples.loc[
+        (samples["group"] == group) & (samples["alias"] == alias),
+        "sample_name"
+    ]
+    return sample
+
+
+
 def get_normal_bam(ext=".bam"):
     def inner(wildcards):
-        normal_sample = get_normal_from_group(wildcards.group)
+        normal_sample = get_sample_from_group_and_alias(wildcards.group, "normal")
         return f"results/recal/{normal_sample}.sorted{ext}"
 
     return inner
 
 
-def get_tumor_bam(ext=".bam"):
+def get_tumor_bam_from_group_and_alias(ext=".bam"):
     def inner(wildcards):
-        tumor_sample = get_tumor_from_group(wildcards.group)
+        tumor_sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias)
         return f"results/recal/{tumor_sample}.sorted{ext}"
 
     return inner
 
 
+def get_bam_from_group_and_alias(ext=".bam"):
+    def inner(wildcards):
+        sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias)
+        return f"results/recal/{sample}.sorted{ext}"
+
+    return inner
+
+
 ## RNA ##
 
 
 def get_quant_reads_input(wildcards):
-    if is_paired_end(wildcards.sample, "RNA"):
+    sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias)
+    if is_paired_end(sample, "RNA"):
         return [
             "results/merged/RNA/{sample}_R1.fastq.gz",
             "results/merged/RNA/{sample}_R2.fastq.gz",
@@ -449,31 +475,6 @@ def get_paired_bais(wildcards):
     )
 
 
-def get_normal_from_sample(sample_name):
-    normal_sample = samples.loc[
-        (samples["group"] == samples.loc[sample_name, "group"])
-        & (samples["alias"] == "normal"),
-        "sample_name",
-    ].iat[0]
-    return normal_sample
-
-
-def get_normal_from_group(group):
-    normal_sample = samples.loc[
-        (samples["group"] == group) & (samples["alias"] == "normal"),
-        "sample_name",
-    ].iat[0]
-    return normal_sample
-
-
-def get_tumor_from_group(group):
-    tumor_sample = samples.loc[
-        (samples["group"] == group) & (samples["alias"] == "tumor"),
-        "sample_name",
-    ].iat[0]
-    return tumor_sample
-
-
 def get_reads(wildcards):
     return get_seperate(wildcards.sample, wildcards.group)
 
@@ -483,22 +484,18 @@ def get_seperate(sample, group):
 
 
 def get_alleles_MHCI(wildcards):
-    if wildcards.peptide_type == "normal":
-        return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
-            S=get_normal_from_group(wildcards.group)
-        )
-    else:
-        return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
-            S=get_tumor_from_group(wildcards.group)
-        )
+    alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
+    return expand(
+        "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
+        group=wildcards.group,
+        alias=alias,
+    )
 
 
 def get_alleles_MHCII(wildcards):
-    if wildcards.peptide_type == "normal":
-        return "results/HLA-LA/hlaI_{S}.tsv".format(
-            S=get_normal_from_group(wildcards.group)
-        )
-    else:
-        return "results/HLA-LA/hlaI_{S}.tsv".format(
-            S=get_tumor_from_group(wildcards.group)
+    alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
+    return expand(
+        "results/HLA-LA/{group}.{alias}.hlaI.tsv",
+        group=wildcards.group,
+        alias=alias
         )
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index b8d0dae9..fb685cf8 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -1,16 +1,16 @@
 rule microphaser_tumor:
     input:
-        vcf="results/strelka/merged/{group}.{tumor_event}.{normal_event}.norm.annotated.bcf",
-        bam=get_tumor_bam(),
-        bai=get_tumor_bam(ext=".bam.bai"),
+        vcf="results/strelka/merged/{group}.{tumor_alias}.{tumor_event}.{normal_event}.norm.annotated.bcf",
+        bam=get_tumor_bam_from_group_and_alias(),
+        bai=get_tumor_bam_from_group_and_alias(ext=".bai"),
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        mt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.neo.fa",
-        wt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.normal.fa",
-        tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{normal_event}.{contig}.tsv",
+        mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.neo.fa",
+        wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.normal.fa",
+        tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv",
     log:
-        "logs/microphaser_tumor/{group}/{tumor_event}.{normal_event}.{contig}.log",
+        "logs/microphaser_tumor/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -22,7 +22,7 @@ rule microphaser_tumor:
 
 rule microphaser_normal:
     input:
-        vcf="results/strelka/normal/{group}.{normal_event}.variants.reheader.norm.bcf",
+        vcf="results/strelka/{group}.normal.{normal_event}.variants.reheader.norm.bcf",
         bam=get_normal_bam(),
         bai=get_normal_bam(ext=".bam.bai"),
         track="resources/annotation/{contig}.gtf",
@@ -75,22 +75,22 @@ rule build_normal_proteome_db:
 
 rule microphaser_filter:
     input:
-        tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv",
+        tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{contig}.tsv",
         proteome=expand(
             "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin",
             normal_event=config["params"]["microphaser"]["events"]["normal"],
         ),
     output:
         mt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa"
+            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.neo.fa"
         ),
         wt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.normal.fa"
+            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.normal.fa"
         ),
-        tsv="results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv",
-        removed="results/microphaser/info/removed/{group}/{mhc}.{tumor_event}.{contig}.removed.tsv",
+        tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.tsv",
+        removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.removed.tsv",
     log:
-        "logs/microphaser_filter/{group}/{mhc}.{tumor_event}.{contig}.log",
+        "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -104,13 +104,13 @@ rule microphaser_filter:
 rule concat_tsvs:
     input:
         expand(
-            "results/microphaser/info/filtered/{{group}}/{{mhc}}.{{tumor_event}}.{contig}.tsv",
+            "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{mhc}}.{contig}.tsv",
             contig=contigs,
         ),
     output:
-        "results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv",
+        "results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv",
     log:
-        "logs/concat_tsvs/{group}.{mhc}.{tumor_event}.log",
+        "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{mhc}.log",
     conda:
         "../envs/xsv.yaml"
     shell:

From 6a07ec612915b6769ac9467ed35ed122355fc72d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 10 May 2022 16:04:37 +0000
Subject: [PATCH 045/191] snakefmt

---
 workflow/rules/HLAtyping.smk |  4 +++-
 workflow/rules/common.smk    | 23 +++++++++++++----------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index bacfa45a..d8513699 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -71,7 +71,9 @@ rule OptiType:
         reads=get_optitype_reads_input,
     output:
         multiext(
-            "results/optitype/{group}/{group}.{alias}", ".coverage_plot.pdf", ".result.tsv"
+            "results/optitype/{group}/{group}.{alias}",
+            ".coverage_plot.pdf",
+            ".result.tsv",
         ),
     log:
         "logs/optitype/{group}.{alias}.log",
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 0e159e7c..ffdde739 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -59,7 +59,9 @@ wildcard_constraints:
     sample="|".join(samples["sample_name"]),
     unit="|".join(units["unit_name"]),
     alias="|".join(pd.unique(samples["alias"])),
-    tumor_alias="|".join(pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])),
+    tumor_alias="|".join(
+        pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])
+    ),
     group="|".join(pd.unique(samples["group"])),
     caller="|".join(["freebayes", "delly"]),
     peptide_type="|".join(["normal", "neo"]),
@@ -86,7 +88,10 @@ def get_final_output():
             sequencing_types = pd.unique(
                 units.loc[units["sample_name"].isin(smps), "sequencing_type"]
             )
-            tumor_aliases = samples.loc[(samples["group"] == group) & (samples["alias"].str.match("tumor")), "alias"]
+            tumor_aliases = samples.loc[
+                (samples["group"] == group) & (samples["alias"].str.match("tumor")),
+                "alias",
+            ]
             final_output.extend(
                 expand(
                     "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.{seqtype}.tsv",
@@ -397,13 +402,11 @@ def get_tabix_params(wildcards):
 
 def get_sample_from_group_and_alias(group, alias):
     sample = samples.loc[
-        (samples["group"] == group) & (samples["alias"] == alias),
-        "sample_name"
+        (samples["group"] == group) & (samples["alias"] == alias), "sample_name"
     ]
     return sample
 
 
-
 def get_normal_bam(ext=".bam"):
     def inner(wildcards):
         normal_sample = get_sample_from_group_and_alias(wildcards.group, "normal")
@@ -414,7 +417,9 @@ def get_normal_bam(ext=".bam"):
 
 def get_tumor_bam_from_group_and_alias(ext=".bam"):
     def inner(wildcards):
-        tumor_sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias)
+        tumor_sample = get_sample_from_group_and_alias(
+            wildcards.group, wildcards.tumor_alias
+        )
         return f"results/recal/{tumor_sample}.sorted{ext}"
 
     return inner
@@ -495,7 +500,5 @@ def get_alleles_MHCI(wildcards):
 def get_alleles_MHCII(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
-        "results/HLA-LA/{group}.{alias}.hlaI.tsv",
-        group=wildcards.group,
-        alias=alias
-        )
+        "results/HLA-LA/{group}.{alias}.hlaI.tsv", group=wildcards.group, alias=alias
+    )

From f4279b25780826b74e1e32c4e03f6829b0379256 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 08:26:25 +0000
Subject: [PATCH 046/191] fix sample selection from group and alias

---
 workflow/rules/common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index ffdde739..ef85e5d4 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -403,7 +403,7 @@ def get_tabix_params(wildcards):
 def get_sample_from_group_and_alias(group, alias):
     sample = samples.loc[
         (samples["group"] == group) & (samples["alias"] == alias), "sample_name"
-    ]
+    ].squeeze()
     return sample
 
 

From 6843133a6873f03a93e5a11211649cdbef1fd1f0 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 08:26:57 +0000
Subject: [PATCH 047/191] remove unused varlociraptor calling from workflow

---
 config/config.yaml                   |  19 -----
 workflow/Snakefile                   |   3 -
 workflow/rules/annotation.smk        |  23 ------
 workflow/rules/candidate_calling.smk |  30 --------
 workflow/rules/common.smk            |  97 ------------------------
 workflow/rules/filtering.smk         | 106 ---------------------------
 workflow/rules/varlociraptor.smk     |  89 ----------------------
 7 files changed, 367 deletions(-)
 delete mode 100644 workflow/rules/candidate_calling.smk
 delete mode 100644 workflow/rules/filtering.smk
 delete mode 100644 workflow/rules/varlociraptor.smk

diff --git a/config/config.yaml b/config/config.yaml
index e5b9670a..a4fe01ee 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -8,25 +8,6 @@ trimming:
 remove_duplicates:
   activate: true
 
-calling:
-  freebayes:
-    activate: false
-  # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling
-  scenario: config/scenario.yaml
-  fdr-control:
-    threshold: 0.05
-    events: 
-      complete:
-        varlociraptor: 
-          - "somatic"
-          - "germline"
-      somatic:
-        varlociraptor:
-          - "somatic"
-      germline:
-        varlociraptor:
-          - "germline"
-
 fusion:
   arriba:
     activate: false
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 2b881b44..7a5270dc 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -31,10 +31,7 @@ include: "rules/trim.smk"
 include: "rules/ref.smk"
 include: "rules/mapping.smk"
 include: "rules/calling.smk"
-include: "rules/candidate_calling.smk"
-include: "rules/varlociraptor.smk"
 include: "rules/annotation.smk"
-include: "rules/filtering.smk"
 include: "rules/microphaser.smk"
 include: "rules/HLAtyping.smk"
 include: "rules/MHC_binding.smk"
diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 8a504e98..271ab427 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -1,26 +1,3 @@
-rule annotate_variants:
-    input:
-        calls="results/calls/{cancer_sample}.{scatteritem}.bcf",
-        cache="resources/vep/cache",
-        plugins="resources/vep/plugins",
-    output:
-        calls="results/calls/{cancer_sample}.{scatteritem}.annotated.bcf",
-        stats=report(
-            "results/calls/{cancer_sample}.{scatteritem}.stats.html",
-            caption="../report/stats.rst",
-            category="QC",
-        ),
-    params:
-        # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
-        # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
-        plugins=config["annotations"]["vep"]["plugins"],
-        extra="{} --vcf_info_field ANN".format(config["annotations"]["vep"]["params"]),
-    log:
-        "logs/vep/{cancer_sample}.{scatteritem}.annotate.log",
-    wrapper:
-        "0.59.2/bio/vep/annotate"
-
-
 rule annotate_strelka_variants:
     input:
         calls="results/strelka/{calls}.bcf",
diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk
deleted file mode 100644
index d6b6942d..00000000
--- a/workflow/rules/candidate_calling.smk
+++ /dev/null
@@ -1,30 +0,0 @@
-rule freebayes:
-    input:
-        ref="resources/genome.fasta",
-        # you can have a list of samples here
-        samples=get_paired_bams,
-    output:
-        "results/candidate-calls/{cancer_sample}.freebayes.bcf",
-    log:
-        "logs/{cancer_sample}.log",
-    params:
-        extra=config["params"].get("freebayes", ""),
-        chunksize=100000,
-    threads: 60
-    wrapper:
-        "0.65.0/bio/freebayes"
-
-
-rule scatter_candidates:
-    input:
-        "results/candidate-calls/{cancer_sample}.{caller}.bcf",
-    output:
-        scatter.calling(
-            "results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf"
-        ),
-    log:
-        "logs/scatter-candidates/{cancer_sample}.{caller}.log",
-    conda:
-        "../envs/rbt.yaml"
-    shell:
-        "rbt vcf-split {input} {output}"
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index ef85e5d4..9c942bf2 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -301,9 +301,6 @@ def get_recalibrate_quality_input(bai=False):
     return inner
 
 
-## HLA Typing ##
-
-
 def get_optitype_reads_input(wildcards):
     sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias)
     if is_activated("HLAtyping/optitype_prefiltering"):
@@ -331,67 +328,6 @@ def get_oncoprint_batch(wildcards):
     )
 
 
-## variant calls ##
-
-
-def get_annotated_bcf(wildcards):
-    selection = ".annotated"
-    return "results/calls/{cancer_sample}.{scatteritem}{selection}.bcf".format(
-        cancer_sample=wildcards.cancer_sample,
-        selection=selection,
-        scatteritem=wildcards.scatteritem,
-    )
-
-
-def get_scattered_calls(ext=".bcf"):
-    def inner(wildcards):
-        return expand(
-            "results/calls/{{cancer_sample}}.{caller}.{{scatteritem}}.sorted{ext}",
-            caller=caller,
-            ext=ext,
-        )
-
-    return inner
-
-
-def get_fdr_control_params(wildcards):
-    query = config["calling"]["fdr-control"]["events"][wildcards.event]
-    threshold = query.get(
-        "threshold", config["calling"]["fdr-control"].get("threshold", 0.05)
-    )
-    events = query["varlociraptor"]
-    return {"threshold": threshold, "events": events}
-
-
-def get_pair_observations(wildcards):
-    return expand(
-        "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf",
-        caller=wildcards.caller,
-        cancer_sample=wildcards.cancer_sample,
-        scatteritem=wildcards.scatteritem,
-        sample=get_paired_samples(wildcards),
-    )
-
-
-def get_merge_input(ext=".bcf"):
-    def inner(wildcards):
-        return expand(
-            "results/calls/{{cancer_sample}}.{vartype}.{{event}}.fdr-controlled{ext}",
-            ext=ext,
-            vartype=["SNV", "INS", "DEL", "MNV"],
-            filter=config["calling"]["fdr-control"]["events"][wildcards.event],
-        )
-
-    return inner
-
-
-def get_pair_aliases(wildcards):
-    return [
-        samples.loc[get_normal_from_sample(wildcards.cancer_sample), "alias"],
-        samples.loc[wildcards.cancer_sample, "alias"],
-    ]
-
-
 def get_tabix_params(wildcards):
     if wildcards.format == "vcf":
         return "-p vcf"
@@ -433,9 +369,6 @@ def get_bam_from_group_and_alias(ext=".bam"):
     return inner
 
 
-## RNA ##
-
-
 def get_quant_reads_input(wildcards):
     sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias)
     if is_paired_end(sample, "RNA"):
@@ -458,36 +391,6 @@ def kallisto_params(wildcards, input):
     return extra
 
 
-## helper functions ##
-
-
-def get_paired_samples(wildcards):
-    return [
-        get_normal_from_sample(wildcards.cancer_sample),
-        samples.loc[wildcards.cancer_sample, "sample_name"],
-    ]
-
-
-def get_paired_bams(wildcards):
-    return expand(
-        "results/recal/{sample}.sorted.bam", sample=get_paired_samples(wildcards)
-    )
-
-
-def get_paired_bais(wildcards):
-    return expand(
-        "results/recal/{sample}.sorted.bam.bai", sample=get_paired_samples(wildcards)
-    )
-
-
-def get_reads(wildcards):
-    return get_seperate(wildcards.sample, wildcards.group)
-
-
-def get_seperate(sample, group):
-    return units.loc[(sample, "DNA"), "fq{}".format(str(group))]
-
-
 def get_alleles_MHCI(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk
deleted file mode 100644
index 169265f0..00000000
--- a/workflow/rules/filtering.smk
+++ /dev/null
@@ -1,106 +0,0 @@
-rule filter_by_annotation:
-    input:
-        "{prefix}.bcf",
-    output:
-        "{prefix}.{filter}.filtered_ann.bcf",
-    log:
-        "logs/filter-calls/annotation/{prefix}.{filter}.log",
-    params:
-        filter=lambda w: config["calling"]["filter"][w.filter],
-    conda:
-        "../envs/vembrane.yaml"
-    shell:
-        'vembrane filter --output-fmt bcf --output {output} "{params.filter}" {input} &> {log}'
-
-
-rule filter_odds:
-    input:
-        get_annotated_bcf,
-    output:
-        "results/calls/{cancer_sample}.{event}.{scatteritem}.filtered_odds.bcf",
-    params:
-        events=lambda wc: config["calling"]["fdr-control"]["events"][wc.event][
-            "varlociraptor"
-        ],
-    log:
-        "logs/filter-calls/posterior_odds/{cancer_sample}.{scatteritem}.{event}.log",
-    conda:
-        "../envs/varlociraptor.yaml"
-    shell:
-        "varlociraptor filter-calls posterior-odds --events {params.events} --odds barely < {input} > {output} 2> {log}"
-
-
-rule gather_calls:
-    input:
-        calls=gather.calling(
-            "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf"
-        ),
-        idx=gather.calling(
-            "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi"
-        ),
-    output:
-        "results/calls/{cancer_sample}.{event}.filtered_odds.bcf",
-    log:
-        "logs/gather-calls/{cancer_sample}.{event}.log",
-    params:
-        "-a -Ob",
-    wrapper:
-        "0.67.0/bio/bcftools/concat"
-
-
-rule control_fdr:
-    input:
-        "results/calls/{cancer_sample}.{event}.filtered_odds.bcf",
-    output:
-        "results/calls/{cancer_sample}.{vartype}.{event}.fdr-controlled.bcf",
-    log:
-        "logs/control-fdr/{cancer_sample}.{vartype}.{event}.log",
-    params:
-        query=get_fdr_control_params,
-    conda:
-        "../envs/varlociraptor.yaml"
-    shell:
-        "varlociraptor filter-calls control-fdr {input} --var {wildcards.vartype} "
-        "--events {params.query[events]} --fdr {params.query[threshold]} > {output} 2> {log}"
-
-
-rule merge_calls:
-    input:
-        calls=get_merge_input(".bcf"),
-        idx=get_merge_input(".bcf.csi"),
-    output:
-        "results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf",
-    log:
-        "logs/merge-calls/{cancer_sample}.{event}.log",
-    params:
-        "-a -Ob",
-    wrapper:
-        "0.59.2/bio/bcftools/concat"
-
-
-rule change_samplenames:
-    input:
-        call="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf",
-    output:
-        temp("results/merged-calls/{cancer_sample}.{event}.renaming.txt"),
-    log:
-        "logs/change-samplenames/{cancer_sample}.{event}.log",
-    params:
-        prefix=lambda w, input: os.path.basename(input["call"]).split(".")[0],
-    shell:
-        "echo -e 'normal {params.prefix}_N\ntumor {params.prefix}_T' > {output}"
-
-
-rule reheader_varlociraptor:
-    input:
-        vcf="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf",
-        samples="results/merged-calls/{cancer_sample}.{event}.renaming.txt",
-    output:
-        "results/merged-calls/{cancer_sample}.{event}.reheader.bcf",
-    log:
-        "logs/reheader-calls/{cancer_sample}.{event}.log",
-    params:
-        extra="",
-        view_extra="-O b",
-    wrapper:
-        "0.60.0/bio/bcftools/reheader"
diff --git a/workflow/rules/varlociraptor.smk b/workflow/rules/varlociraptor.smk
deleted file mode 100644
index f94052fd..00000000
--- a/workflow/rules/varlociraptor.smk
+++ /dev/null
@@ -1,89 +0,0 @@
-rule render_scenario:
-    input:
-        config["calling"]["scenario"],
-    output:
-        report(
-            "results/scenarios/{cancer_sample}.yaml",
-            caption="../report/scenario.rst",
-            category="Variant calling scenarios",
-        ),
-    params:
-        samples=samples,
-    log:
-        "logs/scenarious/{cancer_sample}.log",
-    conda:
-        "../envs/render_scenario.yaml"
-    script:
-        "../scripts/render-scenario.py"
-
-
-rule varlociraptor_preprocess:
-    input:
-        ref="resources/genome.fasta",
-        ref_idx="resources/genome.fasta.fai",
-        candidates="results/candidate-calls/{cancer_sample}.{caller}.{scatteritem}.bcf",
-        bam="results/recal/{sample}.sorted.bam",
-        bai="results/recal/{sample}.sorted.bam.bai",
-    output:
-        "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf",
-    params:
-        omit_isize="",
-    log:
-        "logs/varlociraptor/preprocess/{cancer_sample}/{sample}.{caller}.{scatteritem}.log",
-    conda:
-        "../envs/varlociraptor.yaml"
-    shell:
-        "varlociraptor preprocess variants {params.omit_isize} --candidates {input.candidates} "
-        "{input.ref} --bam {input.bam} --output {output} 2> {log}"
-
-
-rule varlociraptor_call:
-    input:
-        obs=get_pair_observations,
-        scenario="results/scenarios/{cancer_sample}.yaml",
-    output:
-        temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf"),
-    log:
-        "logs/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.log",
-    params:
-        obs=lambda w, input: [
-            "{}={}".format(s, f) for s, f in zip(get_pair_aliases(w), input.obs)
-        ],
-    conda:
-        "../envs/varlociraptor.yaml"
-    benchmark:
-        "benchmarks/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.tsv"
-    shell:
-        "varlociraptor "
-        "call variants generic --obs {params.obs} "
-        "--scenario {input.scenario} > {output} 2> {log}"
-
-
-rule sort_calls:
-    input:
-        "results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf",
-    output:
-        temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.sorted.bcf"),
-    log:
-        "logs/bcf-sort/{cancer_sample}.{caller}.{scatteritem}.log",
-    conda:
-        "../envs/bcftools.yaml"
-    resources:
-        mem_mb=8000,
-    shell:
-        "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` "
-        "-Ob {input} > {output} 2> {log}"
-
-
-rule bcftools_concat:
-    input:
-        calls=get_scattered_calls(),
-        indexes=get_scattered_calls(ext=".bcf.csi"),
-    output:
-        "results/calls/{cancer_sample}.{scatteritem}.bcf",
-    log:
-        "logs/concat-calls/{cancer_sample}.{scatteritem}.log",
-    params:
-        "-a -Ob",  # TODO Check this
-    wrapper:
-        "0.59.2/bio/bcftools/concat"

From 98cc4b1bffa66d7c2b2569a37f30bbc4bf153882 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 08:31:06 +0000
Subject: [PATCH 048/191] remove varlociraptor requirements from
 config.schema.yaml

---
 workflow/schemas/config.schema.yaml | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 6fafb49f..657407a9 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -63,34 +63,6 @@ properties:
       - tumor_sample
       - somatic_events
 
-
-  calling:
-    type: object
-    properties:
-      freebayes:
-        type: object
-        properties:
-          activate:
-            type: boolean
-      scenario:
-        type: string
-      fdr-control:
-        type: object
-        properties:
-          threshold:
-            type: number
-            minimum: 0.0
-            maximum: 1.0
-          events:
-            $ref: "#/definitions/evententry"
-            description: "a map of <eventname: event> pairs"
-        required:
-          - threshold
-          - events
-    required:
-      - freebayes
-      - scenario
-      - fdr-control
   
   remove_duplicates:
     type: object

From 27dc8c82ac30a481c18b41f35ba38138029a42e0 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 08:39:38 +0000
Subject: [PATCH 049/191] remove calling requirement from config.schema.yaml

---
 workflow/schemas/config.schema.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 657407a9..9f882ac1 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -204,7 +204,6 @@ required:
   - units
   - ref
   - tmb
-  - calling
   - params
   - annotations
   - epitope_prediction

From f902faeb73acaaa7f69336872463b0e8f7920a8c Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 09:07:21 +0000
Subject: [PATCH 050/191] take {normal_event} wildcard through to final output
 requesting in common.smk

---
 workflow/rules/MHC_binding.smk | 34 +++++++++++++++++-----------------
 workflow/rules/common.smk      |  3 ++-
 workflow/rules/microphaser.smk | 23 ++++++++++-------------
 3 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index d33b0f82..e8f2d0aa 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -18,12 +18,12 @@
 
 rule netMHCpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCpan.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls",
+        "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log",
+        "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
@@ -35,12 +35,12 @@ rule netMHCpan:
 
 rule netMHCIIpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCIIpan.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCIIpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls",
+        "results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log",
+        "logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
@@ -53,13 +53,13 @@ rule netMHCIIpan:
 rule parse_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{contig}.{{peptide_type}}.xls",
+            "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{contig}.{{peptide_type}}.xls",
             contig=contigs,
         ),
     output:
-        "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.{peptide_type}.tsv",
+        "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.{peptide_type}.tsv",
     log:
-        "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{peptide_type}.log",
+        "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{peptide_type}.log",
     script:
         "../scripts/group_mhc_output.py"
 
@@ -81,17 +81,17 @@ rule parse_mhc_out:
 
 rule mhc_csv_table:
     input:
-        info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv",
-        neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.neo.tsv",
-        normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.normal.tsv",
+        info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv",
+        neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.neo.tsv",
+        normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.normal.tsv",
     output:
         report(
-            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv",
+            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv",
             caption="../report/WES_results.rst",
             category="Results WES (netMHC)",
         ),
     log:
-        "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{mhc}.log",
+        "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log",
     script:
         "../scripts/merge_data.py"
 
@@ -110,16 +110,16 @@ rule mhc_csv_table:
 rule add_RNA_info:
     input:
         counts="results/kallisto/{group}.{tumor_alias}",
-        table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv",
+        table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv",
     output:
         report(
-            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.RNA.tsv",
+            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.RNA.tsv",
             caption="../report/RNA_results.rst",
             category="Results RNA",
         ),
     params:
         abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
     log:
-        "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{mhc}.log",
+        "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log",
     script:
         "../scripts/add_rna_info.py"
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 9c942bf2..500bc770 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -94,10 +94,11 @@ def get_final_output():
             ]
             final_output.extend(
                 expand(
-                    "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.{seqtype}.tsv",
+                    "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{seqtype}.tsv",
                     group=group,
                     tumor_alias=tumor_aliases,
                     tumor_event=config["params"]["microphaser"]["events"]["tumor"],
+                    normal_event=config["params"]["microphaser"]["events"]["normal"],
                     mhc=list(
                         filter(
                             None,
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index fb685cf8..7bb2588c 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -75,22 +75,19 @@ rule build_normal_proteome_db:
 
 rule microphaser_filter:
     input:
-        tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{contig}.tsv",
-        proteome=expand(
-            "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin",
-            normal_event=config["params"]["microphaser"]["events"]["normal"],
-        ),
+        tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv",
+        proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
     output:
         mt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.neo.fa"
+            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.neo.fa"
         ),
         wt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.normal.fa"
+            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.normal.fa"
         ),
-        tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.tsv",
-        removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.removed.tsv",
+        tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.tsv",
+        removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.removed.tsv",
     log:
-        "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.log",
+        "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -104,13 +101,13 @@ rule microphaser_filter:
 rule concat_tsvs:
     input:
         expand(
-            "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{mhc}}.{contig}.tsv",
+            "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{{mhc}}.{contig}.tsv",
             contig=contigs,
         ),
     output:
-        "results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv",
+        "results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv",
     log:
-        "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{mhc}.log",
+        "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log",
     conda:
         "../envs/xsv.yaml"
     shell:

From a5e854b2c91bd8885e32c8c70664323f0004e48d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 09:17:14 +0000
Subject: [PATCH 051/191] attempt to fix key error

---
 workflow/rules/common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 500bc770..bb7fe8ff 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -236,7 +236,7 @@ def get_cutadapt_adapters(wildcards):
 
 
 def is_paired_end(sample, seqtype):
-    sample_units = units.loc[sample].loc[seqtype]
+    sample_units = units.loc[(units["sample_name"] == sample) & (units["sequencing_type"] == seqtype)]
     fq2_null = sample_units["fq2"].isnull()
     sra_null = sample_units["sra"].isnull()
     paired = ~fq2_null | ~sra_null

From 2263cc20a2c0099757382cee6c6d72f1b4983fdf Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 09:23:58 +0000
Subject: [PATCH 052/191] snakefmt

---
 workflow/rules/common.smk | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index bb7fe8ff..75d18c16 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -236,7 +236,9 @@ def get_cutadapt_adapters(wildcards):
 
 
 def is_paired_end(sample, seqtype):
-    sample_units = units.loc[(units["sample_name"] == sample) & (units["sequencing_type"] == seqtype)]
+    sample_units = units.loc[
+        (units["sample_name"] == sample) & (units["sequencing_type"] == seqtype)
+    ]
     fq2_null = sample_units["fq2"].isnull()
     sra_null = sample_units["sra"].isnull()
     paired = ~fq2_null | ~sra_null

From f30fab9f6457ad7e8a0d89918e890078c2aecf6f Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 09:42:54 +0000
Subject: [PATCH 053/191] fix final output requests for hla typing and string
 formatting in helper functions

---
 workflow/rules/common.smk | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 75d18c16..94b2a9b1 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -119,15 +119,15 @@ def get_final_output():
         if config["HLAtyping"]["HLA_LA"]["activate"]:
             final_output = expand(
                 [
-                    "results/optitype/{sample}/hla_alleles_{sample}.tsv",
-                    "results/HLA-LA/hlaI_{sample}.tsv",
-                    "results/HLA-LA/hlaII_{sample}.tsv",
+                    "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
+                    "results/HLA-LA/{group}.{alias}.hlaI.tsv",
+                    "results/HLA-LA/{group}.{alias}.hlaII.tsv",
                 ],
                 sample=samples["sample_name"],
             )
         else:
             final_output = expand(
-                "results/optitype/{sample}/hla_alleles_{sample}.tsv",
+                "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
                 sample=samples["sample_name"],
             )
     return final_output
@@ -279,10 +279,10 @@ def get_fastqs(wc):
 def get_map_reads_input(sample):
     if is_paired_end(sample, "DNA"):
         return [
-            "results/merged/DNA/{sample}_R1.fastq.gz",
-            "results/merged/DNA/{sample}_R2.fastq.gz",
+            f"results/merged/DNA/{sample}_R1.fastq.gz",
+            f"results/merged/DNA/{sample}_R2.fastq.gz",
         ]
-    return "results/merged/DNA/{sample}_single.fastq.gz"
+    return f"results/merged/DNA/{sample}_single.fastq.gz"
 
 
 def get_read_group(wildcards):
@@ -313,7 +313,7 @@ def get_optitype_reads_input(wildcards):
                 sample=sample,
                 read=["R1", "R2"],
             )
-        return "results/razers3/fastq/{sample}_single.fastq"
+        return f"results/razers3/fastq/{sample}_single.fastq"
     else:
         return get_map_reads_input(sample)
 
@@ -376,10 +376,10 @@ def get_quant_reads_input(wildcards):
     sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias)
     if is_paired_end(sample, "RNA"):
         return [
-            "results/merged/RNA/{sample}_R1.fastq.gz",
-            "results/merged/RNA/{sample}_R2.fastq.gz",
+            f"results/merged/RNA/{sample}_R1.fastq.gz",
+            f"results/merged/RNA/{sample}_R2.fastq.gz",
         ]
-    return "results/merged/RNA/{sample}_single.fastq.gz"
+    return f"results/merged/RNA/{sample}_single.fastq.gz"
 
 
 def kallisto_params(wildcards, input):

From c56b4635e6574cf0c89af57aae504b58eb34ba13 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 09:43:38 +0000
Subject: [PATCH 054/191] use `group.alias` instead of `sample` for HLA-LA
 `--sampleID`

---
 workflow/rules/HLAtyping.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index d8513699..f6151cd4 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -14,7 +14,7 @@ rule HLA_LA:
     conda:
         "../envs/hla_la.yaml"
     shell:
-        "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.sample} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"
+        "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}.{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"
 
 
 rule parse_HLA_LA:

From b1a3c629c1514a4ced1dd3ea356f8298ad8cd48e Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 11 May 2022 13:59:26 +0000
Subject: [PATCH 055/191] keep on trying: drop .loc in is_paired_end lookup

---
 workflow/rules/common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 94b2a9b1..3a46ca53 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -236,7 +236,7 @@ def get_cutadapt_adapters(wildcards):
 
 
 def is_paired_end(sample, seqtype):
-    sample_units = units.loc[
+    sample_units = units[
         (units["sample_name"] == sample) & (units["sequencing_type"] == seqtype)
     ]
     fq2_null = sample_units["fq2"].isnull()

From f1c492f7972675d34c54056cf479cc397361fa94 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 12 May 2022 07:45:31 +0000
Subject: [PATCH 056/191] fix HLA_LA sampleID

---
 workflow/rules/HLAtyping.smk | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index f6151cd4..70afe0f5 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -4,22 +4,22 @@ rule HLA_LA:
         bai=get_bam_from_group_and_alias(ext=".bai"),
         index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
     output:
-        "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt",
+        "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
     threads: 7
     log:
-        "logs/HLA-LA/{group}.{alias}.log",
+        "logs/HLA-LA/{group}_{alias}.log",
     params:
         graph=lambda w, input: os.path.basename(os.path.dirname(input.index)),
         graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)),
     conda:
         "../envs/hla_la.yaml"
     shell:
-        "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}.{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"
+        "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"
 
 
 rule parse_HLA_LA:
     input:
-        "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt",
+        "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
     output:
         report(
             "results/HLA-LA/{group}.{alias}.hlaI.tsv",

From 8a8286030215b5a37d594f9e2b55444eddb52e7b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 12 May 2022 07:50:18 +0000
Subject: [PATCH 057/191] revert get_map_reads_input to work on wildcards, as
 it is (also) an input function

---
 workflow/rules/common.smk | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 3a46ca53..dcee2414 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -236,9 +236,7 @@ def get_cutadapt_adapters(wildcards):
 
 
 def is_paired_end(sample, seqtype):
-    sample_units = units[
-        (units["sample_name"] == sample) & (units["sequencing_type"] == seqtype)
-    ]
+    sample_units = units.loc[sample].loc[seqtype]
     fq2_null = sample_units["fq2"].isnull()
     sra_null = sample_units["sra"].isnull()
     paired = ~fq2_null | ~sra_null
@@ -276,13 +274,13 @@ def get_fastqs(wc):
     return units.loc[wc.sample].loc[wc.seqtype, fq].tolist()
 
 
-def get_map_reads_input(sample):
-    if is_paired_end(sample, "DNA"):
+def get_map_reads_input(wildcards):
+    if is_paired_end(wildcards.sample, "DNA"):
         return [
-            f"results/merged/DNA/{sample}_R1.fastq.gz",
-            f"results/merged/DNA/{sample}_R2.fastq.gz",
+            f"results/merged/DNA/{wildcards.sample}_R1.fastq.gz",
+            f"results/merged/DNA/{wildcards.sample}_R2.fastq.gz",
         ]
-    return f"results/merged/DNA/{sample}_single.fastq.gz"
+    return f"results/merged/DNA/{wildcards.sample}_single.fastq.gz"
 
 
 def get_read_group(wildcards):
@@ -315,7 +313,8 @@ def get_optitype_reads_input(wildcards):
             )
         return f"results/razers3/fastq/{sample}_single.fastq"
     else:
-        return get_map_reads_input(sample)
+        wildcards["sample"] = sample
+        return get_map_reads_input(wildcards)
 
 
 def get_oncoprint_batch(wildcards):

From a456dc52c4ed8a17467f5c35f6b3f40699f9ce4d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 17 May 2022 14:27:21 +0000
Subject: [PATCH 058/191] first attempt to reduce workflow to essentials
 (excluding everything up to variant calling), currently not working

---
 config/config.yaml                         |  53 +----
 workflow/Snakefile                         |   9 -
 workflow/rules/MHC_binding.smk             |  34 +--
 workflow/rules/RNA.smk                     |  71 ------
 workflow/rules/annotation.smk              |  21 --
 workflow/rules/calling.smk                 | 140 ------------
 workflow/rules/common.smk                  | 237 ++-------------------
 workflow/rules/mapping.smk                 |  70 ------
 workflow/rules/microphaser.smk             | 107 +++++++---
 workflow/rules/oncoprint.smk               |  28 ---
 workflow/rules/phylogeny.smk               |  12 +-
 workflow/rules/ref.smk                     | 117 ----------
 workflow/rules/tmb.smk                     |  20 --
 workflow/rules/trim.smk                    |  66 ------
 workflow/rules/utils.smk                   |  18 +-
 workflow/rules/vega.smk                    |  15 --
 workflow/schemas/config.schema.yaml        | 103 ++-------
 workflow/scripts/build_oncoprint_matrix.py |  42 ----
 workflow/scripts/oncoprint.R               |  56 -----
 workflow/scripts/render-scenario.py        |   9 -
 20 files changed, 137 insertions(+), 1091 deletions(-)
 delete mode 100644 workflow/rules/RNA.smk
 delete mode 100644 workflow/rules/annotation.smk
 delete mode 100644 workflow/rules/calling.smk
 delete mode 100644 workflow/rules/mapping.smk
 delete mode 100644 workflow/rules/oncoprint.smk
 delete mode 100644 workflow/rules/tmb.smk
 delete mode 100644 workflow/rules/trim.smk
 delete mode 100644 workflow/rules/vega.smk
 delete mode 100644 workflow/scripts/build_oncoprint_matrix.py
 delete mode 100644 workflow/scripts/oncoprint.R
 delete mode 100644 workflow/scripts/render-scenario.py

diff --git a/config/config.yaml b/config/config.yaml
index a4fe01ee..3bf1b781 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,29 +1,6 @@
 samples: "config/samples.tsv"
 units: "config/units.tsv"
 
-# boolean if read trimming should be skipped
-trimming:
-  activate: false
-
-remove_duplicates:
-  activate: true
-
-fusion:
-  arriba:
-    activate: false
-    blacklist:
-     "arriba_blacklist"
-    params:
-      "-T -P"
-
-tmb:
-  activate: false
-  coding_genome_size: 3e7
-  # Name of the tumor sample in the scenario.yaml.
-  tumor_sample: tumor
-  somatic_events:
-    - somatic
-
 
 epitope_prediction:
   activate: true
@@ -43,11 +20,11 @@ affinity:
 HLAtyping:
   # activate to use razers3 to pre-filter reads before using optitype
   optitype_prefiltering:
-    activate: false
+    activate: True
   optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
   # activate to predict MHC-I and MHC-II alleles with HLA-LA
   HLA_LA:
-    activate: false
+    activate: true
 
 
 ref:
@@ -71,20 +48,6 @@ annotations:
       - LoFtool
 
 params:
-  cutadapt: ""
-  bwa:
-    "-M"
-  picard:
-    MarkDuplicates:
-      "VALIDATION_STRINGENCY=LENIENT"
-  gatk:
-    BaseRecalibrator: "--tmp-dir tmp"
-    applyBQSR: ""
-  strelka:
-    config:
-      "--exome"
-    run:
-      "--mode local"
   razers3:
     "-i 95 -m 1 -dr 0"
   optitype:
@@ -97,12 +60,6 @@ params:
           9
       netMHCIIpan:
           15
-    events:
-      tumor: "strelka_somatic"
-      normal: "strelka_germline"
-  kallisto:
-    "-b 100"
-  star: >-
-    --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip
-    --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0
-    --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3
+    variant_sets:
+      normal: "normal_only"
+      tumor: "tumor_only"
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 7a5270dc..f75eba84 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -27,21 +27,12 @@ container: "docker://continuumio/miniconda3"
 
 include: "rules/common.smk"
 include: "rules/utils.smk"
-include: "rules/trim.smk"
 include: "rules/ref.smk"
-include: "rules/mapping.smk"
-include: "rules/calling.smk"
-include: "rules/annotation.smk"
 include: "rules/microphaser.smk"
 include: "rules/HLAtyping.smk"
 include: "rules/MHC_binding.smk"
-include: "rules/RNA.smk"
-include: "rules/tmb.smk"
-include: "rules/vega.smk"
 
 
 rule all:
     input:
         get_final_output(),
-        get_fusion_output(),
-        get_tmb_targets(),
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index e8f2d0aa..2e80b180 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -18,12 +18,12 @@
 
 rule netMHCpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCpan.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls",
+        "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log",
+        "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
@@ -35,12 +35,12 @@ rule netMHCpan:
 
 rule netMHCIIpan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCIIpan.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCIIpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls",
+        "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls",
     log:
-        "logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log",
+        "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
@@ -53,13 +53,13 @@ rule netMHCIIpan:
 rule parse_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{contig}.{{peptide_type}}.xls",
+            "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.xls",
             contig=contigs,
         ),
     output:
-        "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.{peptide_type}.tsv",
+        "results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv",
     log:
-        "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{peptide_type}.log",
+        "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log",
     script:
         "../scripts/group_mhc_output.py"
 
@@ -81,17 +81,17 @@ rule parse_mhc_out:
 
 rule mhc_csv_table:
     input:
-        info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv",
-        neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.neo.tsv",
-        normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.normal.tsv",
+        info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv",
+        neo="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.neo.tsv",
+        normal="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.normal.tsv",
     output:
         report(
-            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv",
+            "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
             caption="../report/WES_results.rst",
             category="Results WES (netMHC)",
         ),
     log:
-        "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log",
+        "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
     script:
         "../scripts/merge_data.py"
 
@@ -110,16 +110,16 @@ rule mhc_csv_table:
 rule add_RNA_info:
     input:
         counts="results/kallisto/{group}.{tumor_alias}",
-        table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv",
+        table="results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
     output:
         report(
-            "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.RNA.tsv",
+            "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv",
             caption="../report/RNA_results.rst",
             category="Results RNA",
         ),
     params:
         abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
     log:
-        "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log",
+        "logs/add-RNA/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
     script:
         "../scripts/add_rna_info.py"
diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk
deleted file mode 100644
index 2a927c18..00000000
--- a/workflow/rules/RNA.smk
+++ /dev/null
@@ -1,71 +0,0 @@
-rule kallisto_quant:
-    input:
-        fastq=get_quant_reads_input,
-        index="resources/kallisto/transcripts.idx",
-    output:
-        directory("results/kallisto/{group}.{tumor_alias}"),
-    params:
-        extra=kallisto_params,
-    log:
-        "results/logs/kallisto/quant/{group}.{tumor_alias}.log",
-    wrapper:
-        "0.60.1/bio/kallisto/quant"
-
-
-rule STAR_align:
-    input:
-        "resources/STAR_index",
-        fq1=lambda wc: units.loc[(wc.sample, "RNA"), "fq1"],
-        fq2=lambda wc: units.loc[(wc.sample, "RNA"), "fq2"],
-        gtf="resources/genome.gtf",
-    output:
-        # see STAR manual for additional output files
-        "results/star/{sample}/Aligned.out.bam",
-        "results/star/{sample}/ReadsPerGene.out.tab",
-    log:
-        "logs/star/{sample}.log",
-    params:
-        # path to STAR reference genome index
-        index="STAR_index",
-        # optional parameters - designed to get chimeric alignments for fusion detection
-        extra=lambda wc, input: "--quantMode GeneCounts --sjdbGTFfile {} {}".format(
-            input.gtf, config["params"]["star"]
-        ),
-    threads: 8
-    wrapper:
-        "0.42.0/bio/star/align"
-
-
-rule arriba:
-    input:
-        bam="results/star/{sample}/Aligned.out.bam",
-        genome="resources/genome.fasta",
-        annotation="resources/genome.gtf",
-    output:
-        fusions="results/arriba/{sample}.fusions.tsv",
-        discarded="results/arriba/{sample}.fusions.discarded.tsv",
-    params:
-        blacklist=config["fusion"]["arriba"]["blacklist"],
-        extra=config["fusion"]["arriba"]["params"],
-    log:
-        "results/logs/arriba/{sample}.log",
-    threads: 1
-    wrapper:
-        "0.60.1/bio/arriba"
-
-
-## TODO: Update
-# rule fusioncatcher:
-#    input:
-#        fq1=lambda w: units.loc[(w.sample, "RNA"), "fq1"],
-#        fq2=lambda w: units.loc[(w.sample, "RNA"), "fq2"]
-#    output:
-#        directory("fusioncatcher/{sample}")
-#    params:
-#        extra="-T tmp -d ../../fusioncatcher_data"
-#    log:
-#        "logs/fusioncatcher/{sample}.log"
-#    threads:
-#        8
-#    shell:
-#        "fusioncatcher -i {input.fq1},{input.fq2} -o {output} {params.extra} -p {threads} > {log}"
diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
deleted file mode 100644
index 271ab427..00000000
--- a/workflow/rules/annotation.smk
+++ /dev/null
@@ -1,21 +0,0 @@
-rule annotate_strelka_variants:
-    input:
-        calls="results/strelka/{calls}.bcf",
-        cache="resources/vep/cache",
-        plugins="resources/vep/plugins",
-    output:
-        calls="results/strelka/{calls}.annotated.bcf",
-        stats=report(
-            "results/strelka/{calls}.annotated.stats.html",
-            caption="../report/stats.rst",
-            category="QC",
-        ),
-    params:
-        # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
-        # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
-        plugins=[],  #config["annotations"]["vep"]["plugins"],
-        extra="--vcf_info_field ANN --hgvs --symbol --canonical",
-    log:
-        "logs/vep/{calls}.strelka.annotate.log",
-    wrapper:
-        "0.59.2/bio/vep/annotate"
diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
deleted file mode 100644
index 9fca7233..00000000
--- a/workflow/rules/calling.smk
+++ /dev/null
@@ -1,140 +0,0 @@
-rule strelka_tumor:
-    input:
-        normal=get_normal_bam(),
-        normal_index=get_normal_bam(ext=".bai"),
-        tumor=get_tumor_bam_from_group_and_alias(),
-        tumor_index=get_tumor_bam_from_group_and_alias(ext=".bai"),
-        fasta="resources/genome.fasta",
-        fasta_index="resources/genome.fasta.fai",
-        callregions="resources/genome.callregions.bed.gz",
-    output:
-        "results/strelka/{group}.{tumor_alias}.strelka_somatic.snvs.vcf.gz",
-        "results/strelka/{group}.{tumor_alias}.strelka_somatic.indels.vcf.gz",
-    log:
-        "logs/calling/strelka/{group}.{tumor_alias}.strelka_somatic.log",
-    params:
-        config_extra="--callRegions {} {}".format(
-            "resources/genome.callregions.bed.gz",
-            config["params"]["strelka"]["config"],
-        ),
-        run_extra=config["params"]["strelka"]["run"],
-    threads: 22
-    wrapper:
-        "0.65.0/bio/strelka/somatic"
-
-
-rule strelka_germline:
-    input:
-        bam=get_normal_bam(),
-        normal_index=get_normal_bam(ext=".bai"),
-        fasta="resources/genome.fasta",
-        fasta_index="resources/genome.fasta.fai",
-        callregions="resources/genome.callregions.bed.gz",
-    output:
-        "results/strelka/{group}.normal.strelka_germline.variants.vcf.gz",
-    log:
-        "logs/calling/strelka_germline/{group}.log",
-    params:
-        config_extra="--callRegions {} {}".format(
-            "resources/genome.callregions.bed.gz",
-            config["params"]["strelka"]["config"],
-        ),
-        run_extra="",
-    threads: 22
-    wrapper:
-        "0.65.0/bio/strelka/germline"
-
-
-rule vcf_to_bcf:
-    input:
-        "{variants}.vcf.gz",
-    output:
-        "{variants}.output.bcf",
-    log:
-        "logs/bcftools/to-bcf/{variants}.log",
-    params:
-        "-O b -f PASS",
-    wrapper:
-        "0.60.0/bio/bcftools/view"
-
-
-rule concat_somatic:
-    input:
-        calls=expand(
-            "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf",
-            type=["snvs", "indels"],
-        ),
-        indices=expand(
-            "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf.csi",
-            type=["snvs", "indels"],
-        ),
-    output:
-        "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf",
-    log:
-        "bcftools/concat_somatic/{group}.{tumor_alias}.log",
-    params:
-        "-O b -a",
-    wrapper:
-        "0.60.0/bio/bcftools/concat"
-
-
-rule get_tumor_from_somatic:
-    input:
-        "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf",
-    output:
-        "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf",
-    log:
-        "logs/bcftools/get_tumor_from_somatic/{group}.{tumor_alias}.strelka_somatic.tumor.log",
-    params:
-        "-O b -s TUMOR",
-    wrapper:
-        "0.60.0/bio/bcftools/view"
-
-
-rule reheader_germline:
-    input:
-        vcf="results/strelka/{group}.normal.strelka_germline.variants.output.bcf",
-        samples="resources/sampleheader.txt",
-    output:
-        "results/strelka/{group}.normal.strelka_germline.variants.reheader.bcf",
-    log:
-        "logs/bcftools/reheader_germline/{group}.normal.log",
-    params:
-        extra="",
-        view_extra="-O b",
-    wrapper:
-        "0.60.0/bio/bcftools/reheader"
-
-
-rule concat_variants:
-    input:
-        calls=[
-            "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf",
-            "results/strelka/{group}.strelka_germline.variants.reheader.bcf",
-        ],
-        index=[
-            "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf.csi",
-            "results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi",
-        ],
-    output:
-        "results/strelka/merged/{group}.{tumor_alias}.strelka_somatic.strelka_germline.bcf",
-    log:
-        "bcftools/concat_variants/{group}.{tumor_alias}.strelka_somatic.strelka_germline.log",
-    params:
-        extra="-O b -a",
-    wrapper:
-        "0.64.0/bio/bcftools/concat"
-
-
-rule norm_vcf:
-    input:
-        "{prefix}.bcf",
-        genome="resources/genome.fasta",
-    output:
-        "{prefix}.norm.bcf",
-    log:
-        "logs/bcftools/norm/{prefix}.log",
-    params:
-        "-f {} -O b -m-".format("resources/genome.fasta"),  # optional parameters for bcftools norm (except -o)
-    wrapper:
-        "0.65.0/bio/bcftools/norm"
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index dcee2414..9e25bb50 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -94,11 +94,9 @@ def get_final_output():
             ]
             final_output.extend(
                 expand(
-                    "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{seqtype}.tsv",
+                    "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv",
                     group=group,
                     tumor_alias=tumor_aliases,
-                    tumor_event=config["params"]["microphaser"]["events"]["tumor"],
-                    normal_event=config["params"]["microphaser"]["events"]["normal"],
                     mhc=list(
                         filter(
                             None,
@@ -133,108 +131,12 @@ def get_final_output():
     return final_output
 
 
-def get_fusion_output():
-    if config["fusion"]["arriba"]["activate"]:
-        fusion_output = expand(
-            "results/fusion/arriba/{sample}.fusions.tsv",
-            sample=units[units["sequencing_type"] == "RNA"]["sample_name"],
-        )
-    else:
-        fusion_output = []
-    return fusion_output
-
-
-def get_tmb_targets():
-    if is_activated("tmb"):
-        return expand(
-            "results/plots/tmb/{group}.{mode}.svg",
-            group=samples[(samples.alias == "tumor")]["sample_name"],
-            mode=config["tmb"].get("mode", "curve"),
-        )
-    else:
-        return []
-
-
 caller = list(
     filter(None, ["freebayes" if is_activated("calling/freebayes") else None])
 )
 
 ### helper functions ###
 
-## alignment ##
-
-
-def get_cutadapt_input(wildcards):
-    unit = units.loc[wildcards.sample].loc[wildcards.seqtype].loc[wildcards.unit]
-
-    if pd.isna(unit["fq1"]):
-        # SRA sample (always paired-end for now)
-        accession = unit["sra"]
-        return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2])
-
-    if unit["fq1"].endswith("gz"):
-        ending = ".gz"
-    else:
-        ending = ""
-
-    if pd.isna(unit["fq2"]):
-        # single end local sample
-        return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format(
-            S=unit.sample_name,
-            U=unit.unit_name,
-            T=unit.sequencing_type,
-            E=ending,
-        )
-    else:
-        # paired end local sample
-        return expand(
-            "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format(
-                S=unit.sample_name,
-                U=unit.unit_name,
-                T=unit.sequencing_type,
-                E=ending,
-            ),
-            read=["fq1", "fq2"],
-        )
-
-
-def get_cutadapt_pipe_input(wildcards):
-    pattern = (
-        units.loc[wildcards.sample]
-        .loc[wildcards.seqtype]
-        .loc[wildcards.unit, wildcards.fq]
-    )
-    if "*" in pattern:
-        files = sorted(
-            glob.glob(
-                units.loc[wildcards.sample]
-                .loc[wildcards.seqtype]
-                .loc[wildcards.unit, wildcards.fq]
-            )
-        )
-        if not files:
-            raise ValueError(
-                "No raw fastq files found for unit pattern {} (sample {}, sequencing type {}). "
-                "Please check the your sample sheet.".format(
-                    wildcards.unit, wildcards.sample, wildcards.seqtype
-                )
-            )
-    else:
-        files = [pattern]
-    return files
-
-
-def get_cutadapt_adapters(wildcards):
-    unit = units.loc[wildcards.sample].loc[wildcards.unit]
-    try:
-        adapters = unit["adapters"]
-        if isinstance(adapters, str):
-            return adapters
-        return ""
-    except KeyError:
-        return ""
-
-
 def is_paired_end(sample, seqtype):
     sample_units = units.loc[sample].loc[seqtype]
     fq2_null = sample_units["fq2"].isnull()
@@ -250,56 +152,11 @@ def is_paired_end(sample, seqtype):
     return all_paired
 
 
-def get_fastqs(wc):
-    if config["trimming"]["activate"]:
-        return expand(
-            "results/trimmed/{sample}/{seqtype}/{unit}_{read}.fastq.gz",
-            unit=units.loc[
-                (units["sequencing_type"] == wc.seqtype)
-                & (units["sample_name"] == wc.sample),
-                "unit_name",
-            ],
-            sample=wc.sample,
-            read=wc.read,
-            seqtype=wc.seqtype,
-        )
-    unit = units.loc[wc.sample].loc[wc.seqtype]
-    if all(pd.isna(unit["fq1"])):
-        # SRA sample (always paired-end for now)
-        accession = unit["sra"]
-        return expand(
-            "sra/{accession}_{read}.fastq", accession=accession, read=wc.read[-1]
-        )
-    fq = "fq{}".format(wc.read[-1])
-    return units.loc[wc.sample].loc[wc.seqtype, fq].tolist()
-
-
-def get_map_reads_input(wildcards):
-    if is_paired_end(wildcards.sample, "DNA"):
-        return [
-            f"results/merged/DNA/{wildcards.sample}_R1.fastq.gz",
-            f"results/merged/DNA/{wildcards.sample}_R2.fastq.gz",
-        ]
-    return f"results/merged/DNA/{wildcards.sample}_single.fastq.gz"
-
-
-def get_read_group(wildcards):
-    """Denote sample name and platform in read group."""
-    return r"-R '@RG\tID:{sample}\tSM:{sample}\tPL:{platform}'".format(
-        sample=wildcards.sample, platform=samples.loc[wildcards.sample, "platform"]
-    )
-
-
-def get_recalibrate_quality_input(bai=False):
-    ext = ".bai" if bai else ""
-
-    def inner(wildcards):
-        if is_activated("remove_duplicates"):
-            return f"results/dedup/{wildcards.sample}.sorted.bam{ext}"
-        else:
-            return f"results/mapped/{wildcards.sample}.sorted.bam{ext}"
-
-    return inner
+def get_sample_from_group_and_alias(group, alias):
+    sample = samples.loc[
+        (samples["group"] == group) & (samples["alias"] == alias), "sample_name"
+    ].squeeze()
+    return sample
 
 
 def get_optitype_reads_input(wildcards):
@@ -317,82 +174,23 @@ def get_optitype_reads_input(wildcards):
         return get_map_reads_input(wildcards)
 
 
-def get_oncoprint_batch(wildcards):
-    if wildcards.batch == "all":
-        groups = samples[samples["alias"] == "tumor"]["sample_name"].unique()
-    else:
-        groups = samples.loc[
-            samples[config["oncoprint"]["stratify"]["by-column"]] == wildcards.batch,
-            "group",
-        ].unique()
-    return expand(
-        "results/merged-calls/{group}.{{event}}.fdr-controlled.bcf", group=groups
-    )
-
-
-def get_tabix_params(wildcards):
-    if wildcards.format == "vcf":
-        return "-p vcf"
-    if wildcards.format == "txt":
-        return "-s 1 -b 2 -e 2"
-    raise ValueError("Invalid format for tabix: {}".format(wildcards.format))
-
-
-def get_sample_from_group_and_alias(group, alias):
-    sample = samples.loc[
-        (samples["group"] == group) & (samples["alias"] == alias), "sample_name"
-    ].squeeze()
-    return sample
-
-
-def get_normal_bam(ext=".bam"):
-    def inner(wildcards):
-        normal_sample = get_sample_from_group_and_alias(wildcards.group, "normal")
-        return f"results/recal/{normal_sample}.sorted{ext}"
-
-    return inner
-
-
-def get_tumor_bam_from_group_and_alias(ext=".bam"):
-    def inner(wildcards):
-        tumor_sample = get_sample_from_group_and_alias(
-            wildcards.group, wildcards.tumor_alias
-        )
-        return f"results/recal/{tumor_sample}.sorted{ext}"
-
-    return inner
-
-
 def get_bam_from_group_and_alias(ext=".bam"):
     def inner(wildcards):
-        sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias)
+        alias = wildcards.get("alias",
+            wildcards.get("tumor_alias",
+                wildcards.get("normal_alias", "unknown")
+            )
+        )
+        if alias == "unknown":
+            raise CustomException(
+                "get_bam_from_group_and_alias() requires on of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'."
+            )
+        sample = get_sample_from_group_and_alias(wildcards.group, alias)
         return f"results/recal/{sample}.sorted{ext}"
 
     return inner
 
 
-def get_quant_reads_input(wildcards):
-    sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias)
-    if is_paired_end(sample, "RNA"):
-        return [
-            f"results/merged/RNA/{sample}_R1.fastq.gz",
-            f"results/merged/RNA/{sample}_R2.fastq.gz",
-        ]
-    return f"results/merged/RNA/{sample}_single.fastq.gz"
-
-
-def kallisto_params(wildcards, input):
-    extra = config["params"]["kallisto"]
-    if len(input.fastq) == 1:
-        extra += " --single"
-        extra += (
-            " --fragment-length {unit.fragment_len_mean} " "--sd {unit.fragment_len_sd}"
-        ).format(unit=units.loc[(wildcards.sample, wildcards.unit)])
-    else:
-        extra += " --fusion"
-    return extra
-
-
 def get_alleles_MHCI(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
@@ -405,5 +203,6 @@ def get_alleles_MHCI(wildcards):
 def get_alleles_MHCII(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
-        "results/HLA-LA/{group}.{alias}.hlaI.tsv", group=wildcards.group, alias=alias
+        #TODO: check that hlaII is correct here, and not hlaI which it previously was
+        "results/HLA-LA/{group}.{alias}.hlaII.tsv", group=wildcards.group, alias=alias
     )
diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
deleted file mode 100644
index 13be68ad..00000000
--- a/workflow/rules/mapping.smk
+++ /dev/null
@@ -1,70 +0,0 @@
-rule map_reads:
-    input:
-        reads=get_map_reads_input,
-        idx=rules.bwa_index.output,
-    output:
-        temp("results/mapped/{sample}.sorted.bam"),
-    log:
-        "logs/bwa_mem/{sample}.log",
-    params:
-        index=lambda w, input: os.path.splitext(input.idx[0])[0],
-        extra=get_read_group,
-        sort="samtools",
-        sort_order="coordinate",
-    threads: 8
-    wrapper:
-        "0.56.0/bio/bwa/mem"
-
-
-rule mark_duplicates:
-    input:
-        "results/mapped/{sample}.sorted.bam",
-    output:
-        bam=temp("results/dedup/{sample}.sorted.bam"),
-        metrics="results/qc/dedup/{sample}.metrics.txt",
-    log:
-        "logs/picard/dedup/{sample}.log",
-    params:
-        config["params"]["picard"]["MarkDuplicates"],
-    wrapper:
-        "0.39.0/bio/picard/markduplicates"
-
-
-rule recalibrate_base_qualities:
-    input:
-        bam=get_recalibrate_quality_input(),
-        bai=get_recalibrate_quality_input(bai=True),
-        ref="resources/genome.fasta",
-        ref_dict="resources/genome.dict",
-        ref_fai="resources/genome.fasta.fai",
-        known="resources/variation.noiupac.vcf.gz",
-        tbi="resources/variation.noiupac.vcf.gz.tbi",
-    output:
-        recal_table=temp("results/recal/{sample}.grp"),
-    params:
-        extra=config["params"]["gatk"]["BaseRecalibrator"],
-        java_opts="",
-    log:
-        "logs/gatk/baserecalibrator/{sample}.log",
-    threads: 8
-    wrapper:
-        "0.62.0/bio/gatk/baserecalibratorspark"
-
-
-rule apply_bqsr:
-    input:
-        bam=get_recalibrate_quality_input(),
-        bai=get_recalibrate_quality_input(bai=True),
-        ref="resources/genome.fasta",
-        ref_dict="resources/genome.dict",
-        ref_fai="resources/genome.fasta.fai",
-        recal_table="results/recal/{sample}.grp",
-    output:
-        bam=protected("results/recal/{sample}.sorted.bam"),
-    log:
-        "logs/gatk/gatk_applybqsr/{sample}.log",
-    params:
-        extra=config["params"]["gatk"]["applyBQSR"],  # optional
-        java_opts="",  # optional
-    wrapper:
-        "0.62.0/bio/gatk/applybqsr"
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 7bb2588c..044b825d 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -1,68 +1,108 @@
+rule norm_bcf:  # runs the bcftools norm wrapper on one call set ({set}) of a {group}
+    input:
+        "results/final-calls/{group}.{set}.bcf",
+        genome="resources/genome.fasta",  # handed to bcftools norm via -f in params below
+    output:
+        "results/final-calls/{group}.{set}.norm.bcf",
+    log:
+        "logs/bcftools/norm/{group}.{set}.log",
+    params:
+        lambda w, input: "-f {} -O b -m-".format(input.genome),  # optional parameters for bcftools norm (except -o)
+    wrapper:
+        "0.65.0/bio/bcftools/norm"
+
+
+rule merge_tumor_normal:  # concatenates the normalized normal and tumor call sets of one {group}
+    input:
+        calls=expand(
+            "results/final-calls/{{group}}.{sets}.norm.bcf",
+            sets=[
+                config["params"]["microphaser"]["variant_sets"]["normal"],
+                config["params"]["microphaser"]["variant_sets"]["tumor"],
+            ],
+        ),
+        index=expand(  # one .csi per file listed in `calls`
+            "results/final-calls/{{group}}.{sets}.norm.csi",
+            sets=[
+                config["params"]["microphaser"]["variant_sets"]["normal"],
+                config["params"]["microphaser"]["variant_sets"]["tumor"],
+            ],
+        ),
+    output:
+        "results/final-calls/{group}.merged_tumor_normal.norm.bcf",
+    log:
+        "logs/bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log",  # under logs/, like the other rules' logs
+    params:
+        extra="-O b -a",
+    wrapper:
+        "0.64.0/bio/bcftools/concat"
+
+
 rule microphaser_tumor:
     input:
-        vcf="results/strelka/merged/{group}.{tumor_alias}.{tumor_event}.{normal_event}.norm.annotated.bcf",
-        bam=get_tumor_bam_from_group_and_alias(),
-        bai=get_tumor_bam_from_group_and_alias(ext=".bai"),
+        bcf="results/final-calls/{group}.merged_tumor_normal.norm.annotated.bcf",
+        bam=get_bam_from_group_and_alias(),
+        bai=get_bam_from_group_and_alias(ext=".bai"),
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.neo.fa",
-        wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.normal.fa",
-        tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv",
+        mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.neo.fa",
+        wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.normal.fa",
+        tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv",
     log:
-        "logs/microphaser_tumor/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.log",
+        "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
         window_length=config["params"]["microphaser"]["window_len"],
     shell:
-        "microphaser somatic {input.bam} --variants {input.vcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
+        "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
         "< {input.track} > {output.mt_fasta} 2> {log}"
 
 
 rule microphaser_normal:
     input:
-        vcf="results/strelka/{group}.normal.{normal_event}.variants.reheader.norm.bcf",
-        bam=get_normal_bam(),
-        bai=get_normal_bam(ext=".bam.bai"),
+        bcf="results/final-calls/{group}.{normal_set}.variants.reheader.norm.bcf",
+        bam=get_bam_from_group_and_alias(),
+        bai=get_bam_from_group_and_alias(ext=".bai"),
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        wt_fasta=("results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa"),
-        wt_tsv=("results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv"),
+        wt_fasta=("results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa"),
+        wt_tsv=("results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv"),
     log:
-        "logs/microphaser_germline/{group}/{normal_event}-{contig}.log",
+        "logs/microphaser_germline/{group}/{normal_alias}.{normal_set}-{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
         window_length=config["params"]["microphaser"]["window_len"],
     shell:
-        "microphaser normal {input.bam} --variants {input.vcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
+        "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
         "< {input.track} > {output.wt_fasta} 2> {log}"
 
 
 rule concat_normal_proteome:
     input:
         expand(
-            "results/microphaser/fasta/{{group}}/normal.{{normal_event}}.{contig}.fa",
+            "results/microphaser/fasta/{{group}}/normal.{{normal_set}}.{contig}.fa",
             contig=contigs,
         ),
     output:
-        "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa",
+        "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa",
     log:
-        "logs/microphaser/concat_normal_proteome/{group}.{normal_event}.log",
+        "logs/microphaser/concat_normal_proteome/{group}.{normal_set}.log",
     shell:
         "cat {input} > {output} 2> {log}"
 
 
 rule build_normal_proteome_db:
     input:
-        "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa",
+        "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa",
     output:
-        bin="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
-        fasta="results/microphaser/fasta/{group}.{normal_event}.{mhc}.normal_proteome.peptides.fasta",
+        bin="results/microphaser/bin/{group}.{normal_set}.{mhc}.normal_proteome.bin",
+        fasta="results/microphaser/fasta/{group}.{normal_set}.{mhc}.normal_proteome.peptides.fasta",
     log:
-        "logs/microphaser/build_normal_proteome_db/{group}.{normal_event}-{mhc}.log",
+        "logs/microphaser/build_normal_proteome_db/{group}.{normal_set}-{mhc}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -75,19 +115,22 @@ rule build_normal_proteome_db:
 
 rule microphaser_filter:
     input:
-        tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv",
-        proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin",
+        tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv",
+        proteome=expand(
+            "results/microphaser/bin/{{group}}.{normal_set}.{{mhc}}.normal_proteome.bin",
+            normal_set=config["params"]["microphaser"]["variant_sets"]["normal"],
+        ),
     output:
         mt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.neo.fa"
+            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.neo.fa"
         ),
         wt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.normal.fa"
+            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.normal.fa"
         ),
-        tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.tsv",
-        removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.removed.tsv",
+        tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.tsv",
+        removed="results/microphaser/info/removed/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.removed.tsv",
     log:
-        "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.log",
+        "logs/microphaser_filter/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -101,13 +144,13 @@ rule microphaser_filter:
 rule concat_tsvs:
     input:
         expand(
-            "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{{mhc}}.{contig}.tsv",
+            "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.merged_tumor_normal.{{mhc}}.{contig}.tsv",
             contig=contigs,
         ),
     output:
-        "results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv",
+        "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv",
     log:
-        "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log",
+        "logs/concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
     conda:
         "../envs/xsv.yaml"
     shell:
diff --git a/workflow/rules/oncoprint.smk b/workflow/rules/oncoprint.smk
deleted file mode 100644
index aa670044..00000000
--- a/workflow/rules/oncoprint.smk
+++ /dev/null
@@ -1,28 +0,0 @@
-rule build_oncoprint_table:
-    input:
-        bcf=get_oncoprint_batch,
-    output:
-        "plots/oncoprint/{batch}.{event}.tsv",
-    log:
-        "logs/oncoprint/{batch}.{event}.table.log",
-    conda:
-        "../envs/oncoprinttable.yaml"
-    script:
-        "../scripts/build_oncoprint_matrix.py"
-
-
-rule plot_oncoprint:
-    input:
-        "plots/oncoprint/{batch}.{event}.tsv",
-    output:
-        report(
-            "plots/oncoprint/{batch}.{event}.pdf",
-            category="Oncoprint",
-            caption="../report/oncoprint.rst",
-        ),
-    log:
-        "logs/oncoprint/{batch}.{event}.plot.log",
-    conda:
-        "../envs/oncoprint.yaml"
-    script:
-        "../scripts/oncoprint.R"
diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk
index d7401bf8..63d2ad4a 100644
--- a/workflow/rules/phylogeny.smk
+++ b/workflow/rules/phylogeny.smk
@@ -1,6 +1,6 @@
 def get_somatic_calls(wildcards):
     return expand(
-        "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf",
+        "results/final-calls/somatic/{sample}/results/variants/somatic.complete.tumor.bcf",
         sample=samples[samples.alias == "tumor"]["sample_name"],
     )
 
@@ -9,20 +9,20 @@ rule merge_snvs:
     input:
         calls=get_somatic_calls,
     output:
-        "results/strelka/merged_calls.vcf",
+        "results/final-calls/merged_calls.vcf",
     log:
         "results/logs/bcftools/merge.log",
     params:
-        "--use-header strelka/sampleheader.txt --force-samples",
+        "--use-header final-calls/sampleheader.txt --force-samples",
     wrapper:
         "0.36.0/bio/bcftools/merge"
 
 
 rule query:
     input:
-        "results/strelka/merged_calls.vcf",
+        "results/final-calls/merged_calls.vcf",
     output:
-        "results/strelka/call_matrix.tsv",
+        "results/final-calls/call_matrix.tsv",
     log:
         "results/logs/bcftools/query.log",
     params:
@@ -35,7 +35,7 @@ rule query:
 
 rule nj_tree:
     input:
-        matrix="results/strelka/call_matrix.tsv",
+        matrix="results/final-calls/call_matrix.tsv",
     output:
         pdf="results/plots/phylogeny_njtree.pdf",
     log:
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 64cd6e06..513ea5dd 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -28,20 +28,6 @@ rule get_cdna:
         "0.45.1/bio/reference/ensembl-sequence"
 
 
-rule kallisto_index:
-    input:
-        "resources/genome.cdna.fasta",
-    output:
-        "resources/kallisto/transcripts.idx",
-    params:
-        extra="",
-    log:
-        "logs/kallisto/index.log",
-    cache: True
-    wrapper:
-        "0.60.1/bio/kallisto/index"
-
-
 rule get_annotation:
     output:
         "resources/genome.gtf",
@@ -58,23 +44,6 @@ rule get_annotation:
         "0.45.1/bio/reference/ensembl-annotation"
 
 
-rule STAR_index:
-    input:
-        fasta="resources/genome.fasta",
-        gtf="resources/genome.gtf",
-    output:
-        directory("resources/STAR_index"),
-    params:
-        sjdb_overhang="100",
-        extra="",
-    log:
-        "logs/star/index.log",
-    threads: 32
-    cache: True
-    wrapper:
-        "0.42.0/bio/star/index"
-
-
 rule split_annotation:
     input:
         "resources/genome.gtf",
@@ -110,66 +79,6 @@ rule genome_dict:
         "0.45.1/bio/picard/createsequencedictionary"
 
 
-rule get_callregions:
-    input:
-        "resources/genome.fasta.fai",
-    output:
-        "resources/genome.callregions.bed.gz",
-    log:
-        "logs/get-callregions.log",
-    params:
-        n_contigs=config["ref"]["n_chromosomes"],
-    conda:
-        "../envs/htslib.yaml"
-    shell:
-        "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})"
-        " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}"
-
-
-rule get_known_variants:
-    input:
-        # use fai to annotate contig lengths for GATK BQSR
-        fai="resources/genome.fasta.fai",
-    output:
-        vcf="resources/variation.vcf.gz",
-    log:
-        "logs/get-known-variants.log",
-    params:
-        species=config["ref"]["species"],
-        release=config["ref"]["release"],
-        build=config["ref"]["build"],
-        type="all",
-    cache: True
-    wrapper:
-        "0.59.2/bio/reference/ensembl-variation"
-
-
-rule remove_iupac_codes:
-    input:
-        "resources/variation.vcf.gz",
-    output:
-        "resources/variation.noiupac.vcf.gz",
-    log:
-        "logs/fix-iupac-alleles.log",
-    conda:
-        "../envs/rbt.yaml"
-    cache: True
-    shell:
-        "rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}"
-
-
-rule bwa_index:
-    input:
-        "resources/genome.fasta",
-    output:
-        multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
-    log:
-        "logs/bwa_index.log",
-    cache: True
-    wrapper:
-        "0.45.1/bio/bwa/index"
-
-
 rule download_HLALA_graph:
     output:
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"),
@@ -209,32 +118,6 @@ rule index_HLALA:
         "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
 
 
-rule get_vep_cache:
-    output:
-        directory("resources/vep/cache"),
-    params:
-        species=config["ref"]["species"],
-        build=config["ref"]["build"],
-        release=config["ref"]["release"],
-    log:
-        "logs/vep/cache.log",
-    cache: True
-    wrapper:
-        "0.59.2/bio/vep/cache"
-
-
-rule get_vep_plugins:
-    output:
-        directory("resources/vep/plugins"),
-    params:
-        release=config["ref"]["release"],
-    log:
-        "logs/vep/plugins.log",
-    cache: True
-    wrapper:
-        "0.59.2/bio/vep/plugins"
-
-
 rule make_sampleheader:
     output:
         "resources/sampleheader.txt",
diff --git a/workflow/rules/tmb.smk b/workflow/rules/tmb.smk
deleted file mode 100644
index e2b2a43d..00000000
--- a/workflow/rules/tmb.smk
+++ /dev/null
@@ -1,20 +0,0 @@
-if config["tmb"]["activate"]:
-
-    rule estimate_tmb:
-        input:
-            "results/merged-calls/{cancer_sample}.somatic.fdr-controlled.bcf",
-        output:
-            "results/plots/tmb/{cancer_sample}.{plotmode}.vl.json",
-        conda:
-            "../envs/varlociraptor.yaml"
-        log:
-            "logs/tmb/{cancer_sample}-{plotmode}.log",
-        params:
-            **config["tmb"],
-        shell:
-            "varlociraptor estimate tmb "
-            " --plot-mode {wildcards.plotmode} "
-            "--coding-genome-size {params.coding_genome_size} "
-            "--somatic-tumor-events {params.somatic_events} "
-            "--tumor-sample {params.tumor_sample} "
-            "< {input} > {output} 2> {log}"
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
deleted file mode 100644
index 0ada209d..00000000
--- a/workflow/rules/trim.smk
+++ /dev/null
@@ -1,66 +0,0 @@
-rule get_sra:
-    output:
-        "sra/{accession}_1.fastq",
-        "sra/{accession}_2.fastq",
-    log:
-        "logs/get-sra/{accession}.log",
-    wrapper:
-        "0.56.0/bio/sra-tools/fasterq-dump"
-
-
-rule cutadapt_pipe:
-    input:
-        get_cutadapt_pipe_input,
-    output:
-        pipe("pipe/cutadapt/{sample}/{seqtype}/{unit}.{fq}.{ext}"),
-    log:
-        "logs/pipe-fastqs/cutadapt/{sample}-{seqtype}-{unit}.{fq}.{ext}.log",
-    wildcard_constraints:
-        ext=r"fastq|fastq\.gz",
-    threads: 0  # this does not need CPU
-    shell:
-        "cat {input} > {output} 2> {log}"
-
-
-rule cutadapt_pe:
-    input:
-        get_cutadapt_input,
-    output:
-        fastq1="results/trimmed/{sample}/{seqtype}/{unit}_R1.fastq.gz",
-        fastq2="results/trimmed/{sample}/{seqtype}/{unit}_R2.fastq.gz",
-        qc="results/trimmed/{sample}/{seqtype}/{unit}.paired.qc.txt",
-    log:
-        "logs/cutadapt/{sample}-{seqtype}-{unit}.log",
-    params:
-        extra=config["params"]["cutadapt"],
-        adapters=get_cutadapt_adapters,
-    threads: 8
-    wrapper:
-        "v0.86.0/bio/cutadapt/pe"
-
-
-rule cutadapt_se:
-    input:
-        get_cutadapt_input,
-    output:
-        fastq="results/trimmed/{sample}/{seqtype}/{unit}.single.fastq.gz",
-        qc="results/trimmed/{sample}/{seqtype}/{unit}.single.qc.txt",
-    log:
-        "logs/cutadapt/{sample}-{seqtype}-{unit}.log",
-    params:
-        extra=config["params"]["cutadapt"],
-        adapters=get_cutadapt_adapters,
-    threads: 8
-    wrapper:
-        "v0.86.0/bio/cutadapt/se"
-
-
-rule merge_fastqs:
-    input:
-        get_fastqs,
-    output:
-        "results/merged/{seqtype}/{sample}_{read}.fastq.gz",
-    log:
-        "logs/merge-fastqs/{seqtype}_{sample}_{read}.log",
-    shell:
-        "cat {input} > {output} 2> {log}"
diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk
index 65161772..92f0ab16 100644
--- a/workflow/rules/utils.smk
+++ b/workflow/rules/utils.smk
@@ -2,7 +2,7 @@ rule bcf_index:
     input:
         "{prefix}.bcf",
     output:
-        "{prefix}.bcf.csi",
+        "{prefix}.csi",
     log:
         "logs/bcf-index/{prefix}.log",
     wrapper:
@@ -13,27 +13,13 @@ rule bam_index:
     input:
         "{prefix}.bam",
     output:
-        "{prefix}.bam.bai",
+        "{prefix}.bai",
     log:
         "logs/bam-index/{prefix}.log",
     wrapper:
         "0.59.2/bio/samtools/index"
 
 
-rule tabix_known_variants:
-    input:
-        "resources/{prefix}.{format}.gz",
-    output:
-        "resources/{prefix}.{format}.gz.tbi",
-    log:
-        "logs/tabix/{prefix}.{format}.log",
-    params:
-        get_tabix_params,
-    cache: True
-    wrapper:
-        "0.59.2/bio/tabix"
-
-
 rule tsv_to_excel:
     input:
         tsv="results/{x}.tsv",
diff --git a/workflow/rules/vega.smk b/workflow/rules/vega.smk
deleted file mode 100644
index 4199e8fc..00000000
--- a/workflow/rules/vega.smk
+++ /dev/null
@@ -1,15 +0,0 @@
-rule vg2svg:
-    input:
-        "{prefix}.vl.json",
-    output:
-        report(
-            "{prefix}.svg",
-            caption="../report/tmb.rst",
-            category="Tumor Mutational Burden",
-        ),
-    log:
-        "logs/vega/{prefix}.log",
-    conda:
-        "../envs/vega.yaml"
-    shell:
-        "vl2svg {input} {output} > {log} 2>&1"
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 9f882ac1..80f219ff 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -42,40 +42,6 @@ properties:
       - build
       - n_chromosomes
 
-
-  tmb:
-    type: object
-    properties:
-      activate:
-        type: boolean
-      coding_genome_size:
-        # TODO allow integer here!
-        type: string
-      tumor_sample:
-        type: string
-      somatic_events:
-        type: array
-        items: 
-          type: string
-    required:
-      - activate
-      - coding_genome_size
-      - tumor_sample
-      - somatic_events
-
-  
-  remove_duplicates:
-    type: object
-    properties:
-      activate:
-        type: boolean
-
-  trimming:
-    type: object
-    properties:
-      activate:
-        type: boolean
-
   epitope_prediction:
     type: object
     properties:
@@ -109,23 +75,6 @@ properties:
           activate:
             type: boolean
 
-  annotations:
-    type: object
-    properties:
-      vep:
-        properties:
-          params:
-            type: string
-          plugins:
-            type: array
-            items:
-              type: string
-        required:
-          - params
-          - plugins
-    required:
-      - vep
-
   fusion:
     type: object
     properties:
@@ -142,34 +91,6 @@ properties:
   params:
     type: object
     properties:
-      cutadapt:
-        type: string
-      bwa:
-        type: string
-      gatk:
-        type: object
-        properties:
-          BaseRecalibrator:
-            type: string
-          applyBQSR:
-            type: string
-        required:
-          - BaseRecalibrator
-          - applyBQSR
-      picard:
-        type: object
-        properties:
-          MarkDuplicates:
-            type: string
-        required:
-          - MarkDuplicates
-      strelka:
-        type: object
-        properties:
-          config:
-            type: string
-          run:
-            type: string
       razers3:
         type: string
       optitype:
@@ -186,15 +107,21 @@ properties:
                 type: integer
               netMHCIIpan:
                 type: integer
-      kallisto:
-        type: string
-      star:
-        type: string
+          variant_sets:
+            type: object
+            properties:
+              normal:
+                type: string
+              tumor:
+                type: string
+            required:
+              - normal
+              - tumor
+        required:
+          - window_len
+          - peptide_len
+          - variant_sets
     required:
-      - bwa
-      - gatk
-      - picard
-      - strelka
       - razers3
       - microphaser
       - optitype
@@ -203,8 +130,6 @@ required:
   - samples
   - units
   - ref
-  - tmb
   - params
-  - annotations
   - epitope_prediction
   - affinity
diff --git a/workflow/scripts/build_oncoprint_matrix.py b/workflow/scripts/build_oncoprint_matrix.py
deleted file mode 100644
index 2b65721b..00000000
--- a/workflow/scripts/build_oncoprint_matrix.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import pysam
-import pandas as pd
-import os
-import sys
-
-# redirect output to log file
-log = open(snakemake.log[0], "w")
-sys.stdout = log
-sys.stderr = log
-
-input_files = snakemake.input
-
-df = pd.DataFrame(columns=["Sample"])
-for sample_file in input_files:
-    variant_file = pysam.VariantFile(sample_file)
-    sample_name = os.path.basename(sample_file).split(".")[0]
-    gene_variant_dict = {"Sample": [sample_name]}
-    for rec in variant_file.fetch():
-        for sample in rec.samples:
-            allele_frequencies = rec.samples[sample]["AF"] #Can be multiple entries
-            for allele_frequency in allele_frequencies:
-                variant = rec.info["SVLEN"]
-                if variant[0]:
-                    variant_type = "INDEL"
-                else:
-                    variant_type = "SNV" 
-                transcripts = rec.info["ANN"]
-                for transcript in transcripts:
-                    gene = transcript.split("|")[3]
-                    impact = transcript.split("|")[2]
-                    if gene and impact != "MODIFIER":
-                        if gene not in gene_variant_dict:
-                            gene_variant_dict[gene] = set()
-                        gene_variant_dict[gene].add(variant_type)
-                break
-    for key, value in gene_variant_dict.items():
-        gene_variant_dict[key] = ','.join(value)
-    sample_df = pd.DataFrame(gene_variant_dict, index=[0])
-    df = pd.concat([df, sample_df], join="outer", ignore_index=False, sort=False)
-df.set_index("Sample", inplace=True)
-with open(snakemake.output[0], 'w') as output_f:
-    print(df.to_csv(sep="\t", index=True), file=output_f)
diff --git a/workflow/scripts/oncoprint.R b/workflow/scripts/oncoprint.R
deleted file mode 100644
index 5476b305..00000000
--- a/workflow/scripts/oncoprint.R
+++ /dev/null
@@ -1,56 +0,0 @@
-# log to file
-log <- file(snakemake@log[[1]], open="wt")
-sink(log)
-sink(log, type="message")
-
-library(ComplexHeatmap)
-library(ggplot2)
-
-table = read.table(snakemake@input[[1]], sep="\t", header=TRUE, row.names=1)
-mat = as.matrix(table)
-mat = t(mat)
-## remove "full" lines
-mat = mat[rowSums(mat == "") > 0,]
-## remove single lines
-mat = mat[rowSums(mat != "") > 0,]
-
-col = c(SNV = "blue", INDEL = "red")
-
-alter_fun = list(
-        SNV = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.9, 
-            gp = gpar(fill = col["SNV"], col = NA)),
-        INDEL = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.4, 
-            gp = gpar(fill = col["INDEL"], col = NA))
-    )
-
-
-heatmap_legend_param = list(title = "Alterations", at = c("SNV", "INDEL"), 
-        labels = c("SNV", "INDEL"))
-if (ncol(mat) > 1 ){
-    mat <- mat[order(apply(mat, 1, function(row) sum(row != "")), decreasing = T), ]
-}
-
-i = 0
-c = 0
-matlist = list()
-while (i + 2000 < nrow(mat)) {
-    m <- mat[(i + 1):(i + 2000), , drop=FALSE]
-    rows_matrix <- nrow(m)
-    height_plot <- (rows_matrix/5)
-    if (height_plot < 4) {
-        height_plot <- 4
-    }
-    pdf(file = sub("0.pdf", paste(c, ".pdf", sep=''), snakemake@output[[1]]), height=height_plot)
-    if (rows_matrix > 0) {
-        oncoprint <- oncoPrint(m,
-            alter_fun = alter_fun, col = col, 
-            remove_empty_columns = FALSE, remove_empty_rows = TRUE,
-            pct_side = "right", row_names_side = "left",
-            show_column_names=T,
-            column_title = "OncoPrint", heatmap_legend_param = heatmap_legend_param)
-        draw(oncoprint, newpage=F)
-    }
-    dev.off()
-    i = i + 2000
-    c = c + 1
-}
\ No newline at end of file
diff --git a/workflow/scripts/render-scenario.py b/workflow/scripts/render-scenario.py
deleted file mode 100644
index 8a1d06eb..00000000
--- a/workflow/scripts/render-scenario.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from jinja2 import Template
-import pandas as pd
-
-with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out:
-    samples = snakemake.params.samples
-    group = samples.loc[samples["sample_name"] == snakemake.wildcards.cancer_sample, "group"]
-    out.write(Template(template.read()).render(
-        samples=samples[samples["group"] == group]
-    ))

From fcedef9b006696eed92dfafe0e08c08a3dc5d22a Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:07:38 +0000
Subject: [PATCH 059/191] config.yaml: add info where to obtain netMHCpan and
 netMHCIIpan software

---
 config/config.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/config/config.yaml b/config/config.yaml
index 3bf1b781..9391901f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -10,10 +10,22 @@ affinity:
   netMHCpan:
     activate: true
     params: "-BA -l 9 -s -xls"
+    # Please download netMHCpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+    # To make the `netMHCpan` script work, you need to fix its first line,
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
     location: "../netMHCpan-4.0"
   netMHCIIpan:
     activate: false
     params: "-length 15 -s -xls"
+    # Please download netMHCIIpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+    # To make the `netMHCIIpan` script work, you need to fix its first line,
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
     location: "../netMHCIIpan-4.0"
 
 

From a429875d344e23c5ff20b6f4b076f9af81cf772d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:09:50 +0000
Subject: [PATCH 060/191] switch optitype read fishing from razers3 to yara, to
 avoid extraneous memory usage

---
 config/config.yaml           |  9 ++++++---
 workflow/rules/HLAtyping.smk | 36 ++++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 9391901f..587b707f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -30,10 +30,13 @@ affinity:
 
 
 HLAtyping:
-  # activate to use razers3 to pre-filter reads before using optitype
+  # activate to use yara to pre-filter reads before using optitype
   optitype_prefiltering:
     activate: True
   optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
+  # version of the IMGT-IPD repository to use for determining HLA allele 
+  # regions, the repo is at: https://github.com/ANHIG/IMGTHLA
+  imgt_hla_version: "v3.48.0-alpha"
   # activate to predict MHC-I and MHC-II alleles with HLA-LA
   HLA_LA:
     activate: true
@@ -60,8 +63,8 @@ annotations:
       - LoFtool
 
 params:
-  razers3:
-    "-i 95 -m 1 -dr 0"
+  yara:
+    "-e 3"
   optitype:
     ""
   microphaser:
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 70afe0f5..518a621c 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -37,33 +37,37 @@ rule parse_HLA_LA:
         "../scripts/parse_HLA_types.py"
 
 
-rule razers3:
+rule yara:
     input:
         reads="results/merged/DNA/{sample}_{read}.fastq.gz",
+        index="resources/yara/hla_alleles.index",
     output:
-        bam="results/razers3/bam/{sample}_{read}.bam",
-    threads: 8
+        bam=temp("results/yara/{sample}_{read}.bam"),
+    threads: 12
     log:
-        "logs/razers3/{sample}_{read}.log",
+        "logs/yara/{sample}_{read}.log",
+    conda:
+        "../envs/yara.yaml"
     params:
-        genome=config["HLAtyping"]["optitype_data"],
-        extra=config["params"]["razers3"],
-    wrapper:
-        "0.61.0/bio/razers3"
+        extra=config["params"]["yara"],
+    shell:
+        "( yara_mapper {params.extra} -t {threads} -f bam {input.index} {input.reads} > {output.bam} ) 2> {log}"
 
 
-rule bam2fq:
+rule filter_yara:
     input:
-        "results/razers3/bam/{sample}_{read}.bam",
+        "results/yara/{sample}_{read}.bam",
     output:
-        "results/razers3/fastq/{sample}_{read}.fished.fastq",
-    params:
-        "",
+        temp("results/yara/{sample}_{read}.filtered.bam"),
     log:
-        "logs/razers3-bam2fq/{sample}-{read}.log",
-    threads: 1
+        "logs/filter_yara/{sample}_{read}.filtered.log",
+    threads: 3
+    params:
+        extra="-h -F 4 -b1"
     wrapper:
-        "0.61.0/bio/samtools/bam2fq/interleaved"
+        "v1.5.0/bio/samtools/view"
+
+
 
 
 rule OptiType:

From c5d3b32f3044d8da0901b7c66d20d24497b881f1 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:11:34 +0000
Subject: [PATCH 061/191] add yara indexing

---
 workflow/rules/ref.smk | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 513ea5dd..b1922b7a 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -118,6 +118,21 @@ rule index_HLALA:
         "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
 
 
+
+
+rule yara_hla_index:
+    input:
+        config["HLAtyping"]["optitype_data"]
+    output:
+        "resources/yara/hla_alleles.index"
+    log:
+        "logs/yara_hla_index.log"
+    conda:
+        "../envs/yara.yaml"
+    shell:
+        "( yara_index {input} -o {output} ) 2> {log}"
+
+
 rule make_sampleheader:
     output:
         "resources/sampleheader.txt",

From 1be3bd226f992f5e3aa10193645604d0cf7cb382 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:13:31 +0000
Subject: [PATCH 062/191] change optitype read fishing to extraction of HLA
 gene regions from mapped bam, update optitype

---
 workflow/rules/HLAtyping.smk | 47 ++++++++++++++++------------
 workflow/rules/ref.smk       | 60 ++++++++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 26 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 518a621c..a259a57d 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -37,42 +37,49 @@ rule parse_HLA_LA:
         "../scripts/parse_HLA_types.py"
 
 
-rule yara:
+rule get_hla_aligning_reads:
     input:
-        reads="results/merged/DNA/{sample}_{read}.fastq.gz",
-        index="resources/yara/hla_alleles.index",
+        bam=get_bam_from_group_and_alias(),
+        bai=get_bam_from_group_and_alias(ext=".bai"),
+        regions="resources/hla_alleles/hla_allele_regions.expanded_1000.bed",
     output:
-        bam=temp("results/yara/{sample}_{read}.bam"),
-    threads: 12
+        bam="results/fished/{group}.{alias}.bam",
+        idx="results/fished/{group}.{alias}.bai",
     log:
-        "logs/yara/{sample}_{read}.log",
-    conda:
-        "../envs/yara.yaml"
+        "logs/get_hla_reads/{group}.{alias}.log",
     params:
-        extra=config["params"]["yara"],
-    shell:
-        "( yara_mapper {params.extra} -t {threads} -f bam {input.index} {input.reads} > {output.bam} ) 2> {log}"
+        extra=lambda wc, input: f"--regions-file {input.regions}"
+    wrapper:
+        "v1.7.0/bio/samtools/view"
+
+
+ruleorder: get_hla_aligning_reads > bam_index
 
 
-rule filter_yara:
+rule hla_reads_single_ends:
     input:
-        "results/yara/{sample}_{read}.bam",
+        "results/fished/{group}.{alias}.bam",
+        "results/fished/{group}.{alias}.bai",
     output:
-        temp("results/yara/{sample}_{read}.filtered.bam"),
+        bam="results/fished/{group}.{alias}.{read}.bam",
+        idx="results/fished/{group}.{alias}.{read}.bai",
     log:
-        "logs/filter_yara/{sample}_{read}.filtered.log",
-    threads: 3
+        "logs/split_hla_reads/{group}.{alias}.{read}.log",
     params:
-        extra="-h -F 4 -b1"
+        extra=lambda wc: "-f 0x80" if wc.read == "R2" else "-f 0x40"
     wrapper:
-        "v1.5.0/bio/samtools/view"
+        "v1.7.0/bio/samtools/view"
 
 
+ruleorder: hla_reads_single_ends > bam_index
 
 
 rule OptiType:
     input:
-        reads=get_optitype_reads_input,
+        reads=[
+            "results/fished/{group}.{alias}.R1.bam",
+            "results/fished/{group}.{alias}.R2.bam",
+        ],
     output:
         multiext(
             "results/optitype/{group}/{group}.{alias}",
@@ -85,7 +92,7 @@ rule OptiType:
         extra=config["params"]["optitype"],
         sequencing_type="dna",
     wrapper:
-        "0.63.0/bio/optitype"
+        "v1.7.0/bio/optitype"
 
 
 rule parse_Optitype:
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index b1922b7a..0a374f9d 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -118,19 +118,67 @@ rule index_HLALA:
         "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
 
 
+rule download_hla_allele_list:
+    input:
+        HTTP.remote(
+            expand(
+                "raw.githubusercontent.com/ANHIG/IMGTHLA/{version}/Allelelist.txt",
+                version=config["HLAtyping"]["imgt_hla_version"]
+            ),
+        ),
+    output:
+        "resources/hla_alleles/Allelelist.txt",
+    log:
+        "logs/hla_alleles/download_Allelelist.log",
+    shell:
+        "( mv {input} {output} ) 2> {log}"
+
+
+rule get_hla_allele_names:
+    input:
+        "resources/hla_alleles/Allelelist.txt",
+    output:
+        "resources/hla_alleles/hla_allele_names.txt",
+    log:
+        "logs/hla_alleles/hla_allele_names.log",
+    conda:
+        "../envs/grep_sed.yaml"
+    shell:
+        '( grep -v "^\\(#\\|Allele\\)" {input} | '
+        '  cut -d "," -f 2,2 | '
+        '  cut -d "*" -f 1,1 | '
+        "  uniq | "
+        "  sed -e 's/^\\([A-Z]\\)$/HLA-\\1/' | "
+        "  sed -e 's/^\\(D[A-Z]\\{{2,2\\}}[1-9]*\\)$/HLA-\\1/' "
+        "  >{output} ) 2> {log}"
+
+
+rule get_hla_regions_from_gtf:
+    input:
+        gtf="resources/genome.gtf",
+        allele_names="resources/hla_alleles/hla_allele_names.txt",
+    output:
+        "resources/hla_alleles/hla_allele_regions.bed",
+    log:
+        "logs/hla_alleles/hla_allele_regions.log",
+    conda:
+        "../envs/rust.yaml"
+    script:
+        "../scripts/hla_regions_from_gtf.rs"
 
 
-rule yara_hla_index:
+rule expand_hla_regions:
     input:
-        config["HLAtyping"]["optitype_data"]
+        bed="resources/hla_alleles/hla_allele_regions.bed",
+        genome="resources/genome.fasta.fai",
     output:
-        "resources/yara/hla_alleles.index"
+        "resources/hla_alleles/hla_allele_regions.expanded_1000.bed",
     log:
-        "logs/yara_hla_index.log"
+        "logs/hla_alleles/hla_allele_regions.expanded_1000.log",
     conda:
-        "../envs/yara.yaml"
+        "../envs/bedtools.yaml"
     shell:
-        "( yara_index {input} -o {output} ) 2> {log}"
+        "( sort {input.bed} | bedtools slop -b 1000 -g {input.genome} | bedtools merge > {output} ) 2> {log}"
 
 
 rule make_sampleheader:

From 1943dfbcbfa95e8c42c5d3d70631b3b83eceaed5 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:14:08 +0000
Subject: [PATCH 063/191] add yara.yaml (documentation only)

---
 workflow/envs/yara.yaml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 workflow/envs/yara.yaml

diff --git a/workflow/envs/yara.yaml b/workflow/envs/yara.yaml
new file mode 100644
index 00000000..e5489375
--- /dev/null
+++ b/workflow/envs/yara.yaml
@@ -0,0 +1,5 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - yara =1.0.2

From b2deec62e03cf51c695e20e66c7fb65e251f00b1 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:14:40 +0000
Subject: [PATCH 064/191] remove now unused yara.yaml

---
 workflow/envs/yara.yaml | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 workflow/envs/yara.yaml

diff --git a/workflow/envs/yara.yaml b/workflow/envs/yara.yaml
deleted file mode 100644
index e5489375..00000000
--- a/workflow/envs/yara.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
-  - bioconda
-  - conda-forge
-dependencies:
-  - yara =1.0.2

From b2d14f1aa0ef83a429059352521a6aac8010b8de Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:15:27 +0000
Subject: [PATCH 065/191] adjust config.schema.yaml to yara instead of razers3

---
 workflow/schemas/config.schema.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 80f219ff..54271eca 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -91,7 +91,7 @@ properties:
   params:
     type: object
     properties:
-      razers3:
+      yara:
         type: string
       optitype:
         type: string
@@ -122,7 +122,7 @@ properties:
           - peptide_len
           - variant_sets
     required:
-      - razers3
+      - yara
       - microphaser
       - optitype
 

From 98dd4b1f4b61b41842cdccd265402eb0dff26329 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:17:04 +0000
Subject: [PATCH 066/191] remove yara/razers3 from all config.yaml and schema

---
 .test/config/config.yaml            | 2 --
 config/config.yaml                  | 2 --
 workflow/schemas/config.schema.yaml | 3 ---
 3 files changed, 7 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 78bf043b..3042b885 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -115,8 +115,6 @@ params:
       "--exome"
     run:
       "--mode local"
-  razers3:
-    "-i 95 -m 1 -dr 0"
   optitype:
     ""
   microphaser:
diff --git a/config/config.yaml b/config/config.yaml
index 587b707f..1172b3a4 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -63,8 +63,6 @@ annotations:
       - LoFtool
 
 params:
-  yara:
-    "-e 3"
   optitype:
     ""
   microphaser:
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 54271eca..652e2d72 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -91,8 +91,6 @@ properties:
   params:
     type: object
     properties:
-      yara:
-        type: string
       optitype:
         type: string
       microphaser:
@@ -122,7 +120,6 @@ properties:
           - peptide_len
           - variant_sets
     required:
-      - yara
       - microphaser
       - optitype
 

From a4f71c69af66dbe6f4d2013970f914eff92a7539 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:19:24 +0000
Subject: [PATCH 067/191] clean up .test/config/config.yaml

---
 .test/config/config.yaml | 68 ++--------------------------------------
 1 file changed, 2 insertions(+), 66 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 3042b885..524cc23d 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -1,50 +1,6 @@
 samples: "config/samples.tsv"
 units: "config/units.tsv"
 
-# boolean if read trimming should be skipped
-trimming:
-  activate: false
-
-remove_duplicates:
-  activate: false
-
-calling:
-  freebayes:
-    activate: true
-  # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling
-  scenario: config/scenario.yaml
-  filter:
-    # Filter candidate variants (this filter helps to keep the number of evaluated candidates small).
-    # It should ideally generate a superset of all other filters defined below.
-    # Annotation of candidate variants tries to be as fast as possible, only using VEP
-    # default parameters.
-    candidates: ""
-    # Add any number of named filters here. They will be applied independenty,
-    # and can be referred in FDR control below to generate calls for different events.
-    # In particular, you can also filter by ID or dbsnp annotations here.
-    # See http://snpeff.sourceforge.net/SnpSift.html#filter
-    filtername: "ANN['IMPACT'] != 'MODIFIER'"
-  fdr-control:
-    threshold: 0.05
-    events: 
-      complete:
-        varlociraptor: 
-          - "somatic"
-          - "germline"
-      somatic:
-        varlociraptor:
-          - "somatic"
-      germline:
-        varlociraptor:
-          - "germline"
-
-fusion:
-  arriba:
-    activate: true
-    blacklist:
-     "arriba_blacklist"
-    params:
-      "-T -P"
 
 tmb:
   activate: true
@@ -101,20 +57,6 @@ annotations:
       - LoFtool
 
 params:
-  cutadapt: ""
-  bwa:
-    "-M"
-  picard:
-    MarkDuplicates:
-      "VALIDATION_STRINGENCY=LENIENT"
-  gatk:
-    BaseRecalibrator: "--tmp-dir tmp"
-    applyBQSR: ""
-  strelka:
-    config:
-      "--exome"
-    run:
-      "--mode local"
   optitype:
     ""
   microphaser:
@@ -126,11 +68,5 @@ params:
       netMHCIIpan:
           15
     events:
-      tumor: "strelka_somatic"
-      normal: "strelka_germline"
-  kallisto:
-    "-b 100"
-  star: >-
-    --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip
-    --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0
-    --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3
+      tumor: "tumor_only"
+      normal: "normal_only"

From 2500cb6d5cfea69bcf18370496b7fa9cc70f9ae7 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:20:52 +0000
Subject: [PATCH 068/191] get hla_la.yaml to work by fixing further
 dependencies

---
 workflow/envs/hla_la.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/envs/hla_la.yaml b/workflow/envs/hla_la.yaml
index 6c2487fb..4b94aaef 100644
--- a/workflow/envs/hla_la.yaml
+++ b/workflow/envs/hla_la.yaml
@@ -5,4 +5,6 @@ channels:
 dependencies:
   - hla-la ==1.0.5
   - samtools ==1.10
-  - boost-cpp ==1.73.0
+  - bamtools ==2.5.1
+  - boost-cpp ==1.74.0
+  - r-base =4

From bd6b24d74b41bca144d4de428d30c0ccfb62db8e Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:21:44 +0000
Subject: [PATCH 069/191] add environments for getting HLA gene regions

---
 workflow/envs/bedtools.yaml | 5 +++++
 workflow/envs/grep_sed.yaml | 6 ++++++
 workflow/envs/rust.yaml     | 8 ++++++++
 3 files changed, 19 insertions(+)
 create mode 100644 workflow/envs/bedtools.yaml
 create mode 100644 workflow/envs/grep_sed.yaml
 create mode 100644 workflow/envs/rust.yaml

diff --git a/workflow/envs/bedtools.yaml b/workflow/envs/bedtools.yaml
new file mode 100644
index 00000000..00c3a615
--- /dev/null
+++ b/workflow/envs/bedtools.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bedtools =2.30
\ No newline at end of file
diff --git a/workflow/envs/grep_sed.yaml b/workflow/envs/grep_sed.yaml
new file mode 100644
index 00000000..00e5b6aa
--- /dev/null
+++ b/workflow/envs/grep_sed.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - grep =3.4
+  - sed =4.8
diff --git a/workflow/envs/rust.yaml b/workflow/envs/rust.yaml
new file mode 100644
index 00000000..16124ac7
--- /dev/null
+++ b/workflow/envs/rust.yaml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+dependencies:
+  - rust-script >=0.17.0
+  - rust >=1.58
+  - cryptography >=36.0
+  - c-compiler =1.3
+  - pkg-config >=0.29
\ No newline at end of file

From c31f660c52eed276fc3ae64ca420b4d22aae8380 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:23:09 +0000
Subject: [PATCH 070/191] add imports to use remote resource for HLA allele
 list

---
 workflow/rules/common.smk | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 9e25bb50..dc653b72 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -2,8 +2,10 @@ import glob
 
 import pandas as pd
 from snakemake.remote import FTP
+from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
 from snakemake.utils import validate
 
+HTTP = HTTPRemoteProvider()
 ftp = FTP.RemoteProvider()
 
 ##### config file #####

From 08197e05eebc606b4e5b6540745eb043f9bc3b47 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:26:03 +0000
Subject: [PATCH 071/191] remove generation of HLA allele gene regions

---
 workflow/rules/common.smk |  2 --
 workflow/rules/ref.smk    | 63 ---------------------------------------
 2 files changed, 65 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index dc653b72..9e25bb50 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -2,10 +2,8 @@ import glob
 
 import pandas as pd
 from snakemake.remote import FTP
-from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider
 from snakemake.utils import validate
 
-HTTP = HTTPRemoteProvider()
 ftp = FTP.RemoteProvider()
 
 ##### config file #####
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 0a374f9d..513ea5dd 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -118,69 +118,6 @@ rule index_HLALA:
         "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
 
 
-rule download_hla_allele_list:
-    input:
-        HTTP.remote(
-            expand(
-                "raw.githubusercontent.com/ANHIG/IMGTHLA/{version}/Allelelist.txt",
-                version=config["HLAtyping"]["imgt_hla_version"]
-            ),
-        ),
-    output:
-        "resources/hla_alleles/Allelelist.txt",
-    log:
-        "logs/hla_alleles/download_Allelelist.log",
-    shell:
-        "( mv {input} {output} ) 2> {log}"
-
-
-rule get_hla_allele_names:
-    input:
-        "resources/hla_alleles/Allelelist.txt",
-    output:
-        "resources/hla_alleles/hla_allele_names.txt",
-    log:
-        "logs/hla_alleles/hla_allele_names.log",
-    conda:
-        "../envs/grep_sed.yaml"
-    shell:
-        '( grep -v "^\\(#\\|Allele\\)" {input} | '
-        '  cut -d "," -f 2,2 | '
-        '  cut -d "*" -f 1,1 | '
-        "  uniq | "
-        "  sed -e 's/^\\([A-Z]\\)$/HLA-\\1/' | "
-        "  sed -e 's/^\\(D[A-Z]\\{{2,2\\}}[1-9]*\\)$/HLA-\\1/' "
-        "  >{output} ) 2> {log}"
-
-
-rule get_hla_regions_from_gtf:
-    input:
-        gtf="resources/genome.gtf",
-        allele_names="resources/hla_alleles/hla_allele_names.txt",
-    output:
-        "resources/hla_alleles/hla_allele_regions.bed",
-    log:
-        "logs/hla_alleles/hla_allele_regions.log",
-    conda:
-        "../envs/rust.yaml"
-    script:
-        "../scripts/hla_regions_from_gtf.rs"
-
-
-rule expand_hla_regions:
-    input:
-        bed="resources/hla_alleles/hla_allele_regions.bed",
-        genome="resources/genome.fasta.fai",
-    output:
-        "resources/hla_alleles/hla_allele_regions.expanded_1000.bed",
-    log:
-        "logs/hla_alleles/hla_allele_regions.expanded_1000.log",
-    conda:
-        "../envs/bedtools.yaml"
-    shell:
-        "( sort {input.bed} | bedtools slop -b 1000 -g {input.genome} | bedtools merge > {output} ) 2> {log}"
-
-
 rule make_sampleheader:
     output:
         "resources/sampleheader.txt",

From d527371f50ae4b2c7abe69b56d87b8b444f3109a Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:28:12 +0000
Subject: [PATCH 072/191] remove environment defs for HLA allele region
 generation

---
 workflow/envs/bedtools.yaml | 5 -----
 workflow/envs/grep_sed.yaml | 6 ------
 workflow/envs/rust.yaml     | 8 --------
 3 files changed, 19 deletions(-)
 delete mode 100644 workflow/envs/bedtools.yaml
 delete mode 100644 workflow/envs/grep_sed.yaml
 delete mode 100644 workflow/envs/rust.yaml

diff --git a/workflow/envs/bedtools.yaml b/workflow/envs/bedtools.yaml
deleted file mode 100644
index 00c3a615..00000000
--- a/workflow/envs/bedtools.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
-  - conda-forge
-  - bioconda
-dependencies:
-  - bedtools =2.30
\ No newline at end of file
diff --git a/workflow/envs/grep_sed.yaml b/workflow/envs/grep_sed.yaml
deleted file mode 100644
index 00e5b6aa..00000000
--- a/workflow/envs/grep_sed.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-channels:
-  - conda-forge
-  - bioconda
-dependencies:
-  - grep =3.4
-  - sed =4.8
diff --git a/workflow/envs/rust.yaml b/workflow/envs/rust.yaml
deleted file mode 100644
index 16124ac7..00000000
--- a/workflow/envs/rust.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-channels:
-  - conda-forge
-dependencies:
-  - rust-script >=0.17.0
-  - rust >=1.58
-  - cryptography >=36.0
-  - c-compiler =1.3
-  - pkg-config >=0.29
\ No newline at end of file

From 0a12e794d46044c4d9fc5dd692e9c3e1decedff8 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:34:31 +0000
Subject: [PATCH 073/191] remove optitype, only use HLA-LA

---
 .test/config/config.yaml     | 12 ------------
 config/config.yaml           | 15 ---------------
 workflow/rules/HLAtyping.smk | 37 ------------------------------------
 workflow/rules/common.smk    | 30 +++++------------------------
 4 files changed, 5 insertions(+), 89 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 524cc23d..b09277f7 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -26,16 +26,6 @@ affinity:
     location: "../netMHCIIpan-4.0"
 
 
-HLAtyping:
-  # activate to use razers3 to pre-filter reads before using optitype
-  optitype_prefiltering:
-    activate: true
-  optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
-  # activate to predict MHC-I and MHC-II alleles with HLA-LA
-  HLA_LA:
-    activate: false
-
-
 ref:
   # Number of chromosomes to consider for calling.
   # The first n entries of the FASTA will be considered.
@@ -57,8 +47,6 @@ annotations:
       - LoFtool
 
 params:
-  optitype:
-    ""
   microphaser:
     window_len:
         33
diff --git a/config/config.yaml b/config/config.yaml
index 1172b3a4..257f312e 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -29,19 +29,6 @@ affinity:
     location: "../netMHCIIpan-4.0"
 
 
-HLAtyping:
-  # activate to use yara to pre-filter reads before using optitype
-  optitype_prefiltering:
-    activate: True
-  optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
-  # version of the IMGT-IPD repository to use for determining HLA allele 
-  # regions, the repo is at: https://github.com/ANHIG/IMGTHLA
-  imgt_hla_version: "v3.48.0-alpha"
-  # activate to predict MHC-I and MHC-II alleles with HLA-LA
-  HLA_LA:
-    activate: true
-
-
 ref:
   # Number of chromosomes to consider for calling.
   # The first n entries of the FASTA will be considered.
@@ -63,8 +50,6 @@ annotations:
       - LoFtool
 
 params:
-  optitype:
-    ""
   microphaser:
     window_len:
         33
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index a259a57d..0b4169a5 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -72,40 +72,3 @@ rule hla_reads_single_ends:
 
 
 ruleorder: hla_reads_single_ends > bam_index
-
-
-rule OptiType:
-    input:
-        reads=[
-            "results/fished/{group}.{alias}.R1.bam",
-            "results/fished/{group}.{alias}.R2.bam",
-        ],
-    output:
-        multiext(
-            "results/optitype/{group}/{group}.{alias}",
-            ".coverage_plot.pdf",
-            ".result.tsv",
-        ),
-    log:
-        "logs/optitype/{group}.{alias}.log",
-    params:
-        extra=config["params"]["optitype"],
-        sequencing_type="dna",
-    wrapper:
-        "v1.7.0/bio/optitype"
-
-
-rule parse_Optitype:
-    input:
-        "results/optitype/{group}/{group}.{alias}.result.tsv",
-    output:
-        report(
-            "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
-            caption="../report/HLA_Types.rst",
-            category="HLA-Typing(Optitype)",
-        ),
-    log:
-        "logs/parse-optitype/{group}.{alias}.log",
-    shell:
-        "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' "
-        '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}'
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 9e25bb50..2d9c4b35 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -114,20 +114,15 @@ def get_final_output():
                 )
             )
     else:
-        if config["HLAtyping"]["HLA_LA"]["activate"]:
             final_output = expand(
                 [
-                    "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
-                    "results/HLA-LA/{group}.{alias}.hlaI.tsv",
-                    "results/HLA-LA/{group}.{alias}.hlaII.tsv",
+                    "results/HLA-LA/{group}.{tumor_alias}.hlaI.tsv",
+                    "results/HLA-LA/{group}.{tumor_alias}.hlaII.tsv",
                 ],
-                sample=samples["sample_name"],
-            )
-        else:
-            final_output = expand(
-                "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
-                sample=samples["sample_name"],
+                group=group,
+                tumor_alias=tumor_aliases
             )
+
     return final_output
 
 
@@ -159,21 +154,6 @@ def get_sample_from_group_and_alias(group, alias):
     return sample
 
 
-def get_optitype_reads_input(wildcards):
-    sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias)
-    if is_activated("HLAtyping/optitype_prefiltering"):
-        if is_paired_end(sample, "DNA"):
-            return expand(
-                "results/razers3/fastq/{sample}_{read}.fished.fastq",
-                sample=sample,
-                read=["R1", "R2"],
-            )
-        return f"results/razers3/fastq/{sample}_single.fastq"
-    else:
-        wildcards["sample"] = sample
-        return get_map_reads_input(wildcards)
-
-
 def get_bam_from_group_and_alias(ext=".bam"):
     def inner(wildcards):
         alias = wildcards.get("alias",

From d55d8c6dc2e6834449ac88a2f88cc298f78fbf91 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:35:02 +0000
Subject: [PATCH 074/191] check in rust script used for getting hla regions
 from GTF file

---
 workflow/scripts/hla_regions_from_gtf.rs | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 workflow/scripts/hla_regions_from_gtf.rs

diff --git a/workflow/scripts/hla_regions_from_gtf.rs b/workflow/scripts/hla_regions_from_gtf.rs
new file mode 100644
index 00000000..2357216d
--- /dev/null
+++ b/workflow/scripts/hla_regions_from_gtf.rs
@@ -0,0 +1,49 @@
+//! This is a regular crate doc comment, but it also contains a partial
+//! Cargo manifest.  Note the use of a *fenced* code block, and the
+//! `cargo` "language".
+//!
+//! ```cargo
+//! cargo-features = ["edition2021"]
+//! [dependencies]
+//! bio = { version = "0.41.0" }
+//! ```
+use bio::io::{gff, bed};
+
+use std::fs::File;
+use std::path::PathBuf;
+use std::collections::HashSet;
+use std::io::{BufRead, BufReader};
+use std::error::Error;
+
+
+fn main() -> Result<(), Box<dyn Error>> {
+
+    snakemake.redirect_stderr(&snakemake.log[0])?;
+
+    let alleles_file = BufReader::new(File::open(PathBuf::from(&snakemake.input.allele_names))?);
+    let allele_names: HashSet<String> = alleles_file.lines().map(|line| line.unwrap()).collect();
+
+    let mut gtf_reader = gff::Reader::from_file(PathBuf::from(&snakemake.input.gtf), gff::GffType::GTF2)?;
+
+    let mut bed_writer = bed::Writer::to_file(PathBuf::from(&snakemake.output[0]))?;
+
+    for r in gtf_reader.records() {
+        let record = r?;
+        if record.feature_type() == "gene" {
+            let attr = record.attributes();
+            if let Some(name) = attr.get("gene_name") {
+                if allele_names.contains(name) {
+                    let mut bed_record = bed::Record::new();
+                    bed_record.set_chrom(record.seqname());
+                    bed_record.set_start(*record.start());
+                    bed_record.set_end(*record.end());
+                    bed_record.set_name(name);
+                    // write out bed record
+                    bed_writer.write(&bed_record)?;
+                }
+            } 
+        }
+    }
+
+    Ok(())
+}
\ No newline at end of file

From c635b631f755b3b57a2f61883ab1c5d2ae5a39ba Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:35:47 +0000
Subject: [PATCH 075/191] remove checked-in rust script, as it was only needed
 for optitype read fishing, optitype is now removed

---
 workflow/scripts/hla_regions_from_gtf.rs | 49 ------------------------
 1 file changed, 49 deletions(-)
 delete mode 100644 workflow/scripts/hla_regions_from_gtf.rs

diff --git a/workflow/scripts/hla_regions_from_gtf.rs b/workflow/scripts/hla_regions_from_gtf.rs
deleted file mode 100644
index 2357216d..00000000
--- a/workflow/scripts/hla_regions_from_gtf.rs
+++ /dev/null
@@ -1,49 +0,0 @@
-//! This is a regular crate doc comment, but it also contains a partial
-//! Cargo manifest.  Note the use of a *fenced* code block, and the
-//! `cargo` "language".
-//!
-//! ```cargo
-//! cargo-features = ["edition2021"]
-//! [dependencies]
-//! bio = { version = "0.41.0" }
-//! ```
-use bio::io::{gff, bed};
-
-use std::fs::File;
-use std::path::PathBuf;
-use std::collections::HashSet;
-use std::io::{BufRead, BufReader};
-use std::error::Error;
-
-
-fn main() -> Result<(), Box<dyn Error>> {
-
-    snakemake.redirect_stderr(&snakemake.log[0])?;
-
-    let alleles_file = BufReader::new(File::open(PathBuf::from(&snakemake.input.allele_names))?);
-    let allele_names: HashSet<String> = alleles_file.lines().map(|line| line.unwrap()).collect();
-
-    let mut gtf_reader = gff::Reader::from_file(PathBuf::from(&snakemake.input.gtf), gff::GffType::GTF2)?;
-
-    let mut bed_writer = bed::Writer::to_file(PathBuf::from(&snakemake.output[0]))?;
-
-    for r in gtf_reader.records() {
-        let record = r?;
-        if record.feature_type() == "gene" {
-            let attr = record.attributes();
-            if let Some(name) = attr.get("gene_name") {
-                if allele_names.contains(name) {
-                    let mut bed_record = bed::Record::new();
-                    bed_record.set_chrom(record.seqname());
-                    bed_record.set_start(*record.start());
-                    bed_record.set_end(*record.end());
-                    bed_record.set_name(name);
-                    // write out bed record
-                    bed_writer.write(&bed_record)?;
-                }
-            } 
-        }
-    }
-
-    Ok(())
-}
\ No newline at end of file

From 9642129445961429502693210aff984967ea64b9 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:37:33 +0000
Subject: [PATCH 076/191] clean up config.schema.yaml after all the removals

---
 workflow/schemas/config.schema.yaml | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 652e2d72..8b960444 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -66,33 +66,9 @@ properties:
           params:
             type: string
 
-  HLAtyping:
-    type: object
-    properties:
-      HLA_LA:
-        type: object
-        properties:
-          activate:
-            type: boolean
-
-  fusion:
-    type: object
-    properties:
-      arriba:
-        type: object
-        properties:
-          activate:
-            type: boolean
-          blacklist:
-            type: string
-          params:
-            type: string
-
   params:
     type: object
     properties:
-      optitype:
-        type: string
       microphaser:
         type: object
         properties:
@@ -121,7 +97,6 @@ properties:
           - variant_sets
     required:
       - microphaser
-      - optitype
 
 required:
   - samples

From 5935345c202c7161911ba1c576859b6188c37802 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:38:34 +0000
Subject: [PATCH 077/191] remove last rules for optitype read fishing

---
 workflow/rules/HLAtyping.smk | 37 ------------------------------------
 1 file changed, 37 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 0b4169a5..db8bcc58 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -35,40 +35,3 @@ rule parse_HLA_LA:
         "logs/parse-HLA-LA/{group}.{alias}.log",
     script:
         "../scripts/parse_HLA_types.py"
-
-
-rule get_hla_aligning_reads:
-    input:
-        bam=get_bam_from_group_and_alias(),
-        bai=get_bam_from_group_and_alias(ext=".bai"),
-        regions="resources/hla_alleles/hla_allele_regions.expanded_1000.bed",
-    output:
-        bam="results/fished/{group}.{alias}.bam",
-        idx="results/fished/{group}.{alias}.bai",
-    log:
-        "logs/get_hla_reads/{group}.{alias}.log",
-    params:
-        extra=lambda wc, input: f"--regions-file {input.regions}"
-    wrapper:
-        "v1.7.0/bio/samtools/view"
-
-
-ruleorder: get_hla_aligning_reads > bam_index
-
-
-rule hla_reads_single_ends:
-    input:
-        "results/fished/{group}.{alias}.bam",
-        "results/fished/{group}.{alias}.bai",
-    output:
-        bam="results/fished/{group}.{alias}.{read}.bam",
-        idx="results/fished/{group}.{alias}.{read}.bai",
-    log:
-        "logs/split_hla_reads/{group}.{alias}.{read}.log",
-    params:
-        extra=lambda wc: "-f 0x80" if wc.read == "R2" else "-f 0x40"
-    wrapper:
-        "v1.7.0/bio/samtools/view"
-
-
-ruleorder: hla_reads_single_ends > bam_index

From 6c06a2a3d81d11e62c93c30b0c891787bfca9c60 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:40:04 +0000
Subject: [PATCH 078/191] work around microphaser requirement for `gene_name`
 in every GTF record by excluding records that lack it (only about 150k in a
 total of something like 3.5m records)

---
 workflow/rules/ref.smk | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 513ea5dd..86d41dd7 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -44,9 +44,21 @@ rule get_annotation:
         "0.45.1/bio/reference/ensembl-annotation"
 
 
-rule split_annotation:
+#TODO: remove this rule, once microphaser is fixed to make gene_name optional
+rule remove_records_with_gene_name_missing:
     input:
         "resources/genome.gtf",
+    output:
+        "resources/genome.records_with_gene_name.gtf",
+    log:
+        "logs/remove_records_with_gene_name_missing.log",
+    shell:
+        '( grep "gene_name" {input} > {output} ) 2> {log}'
+
+
+rule split_annotation:
+    input:
+        "resources/genome.records_with_gene_name.gtf",
     output:
         "resources/annotation/{contig}.gtf",
     log:

From f12187bad6432c24774d0f6da88c9f2d86d749ca Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:40:27 +0000
Subject: [PATCH 079/191] fix bcf_index output file name

---
 workflow/rules/utils.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk
index 92f0ab16..c8b11474 100644
--- a/workflow/rules/utils.smk
+++ b/workflow/rules/utils.smk
@@ -2,7 +2,7 @@ rule bcf_index:
     input:
         "{prefix}.bcf",
     output:
-        "{prefix}.csi",
+        "{prefix}.bcf.csi",
     log:
         "logs/bcf-index/{prefix}.log",
     wrapper:

From 42eda3f64e61a2059ffad5aed24c59ed65877dda Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:40:59 +0000
Subject: [PATCH 080/191] remove tmb from config.yaml (previously removed, done
 in dna-seq-varlociraptor workflow)

---
 .test/config/config.yaml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index b09277f7..937d9028 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -2,15 +2,6 @@ samples: "config/samples.tsv"
 units: "config/units.tsv"
 
 
-tmb:
-  activate: true
-  coding_genome_size: 3e7
-  # Name of the tumor sample in the scenario.yaml.
-  tumor_sample: tumor
-  somatic_events:
-    - somatic
-
-
 epitope_prediction:
   activate: true
 

From 02d121ff6af926c72d742cb79b4ca93a4076fbcf Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:42:20 +0000
Subject: [PATCH 081/191] fix HLA-LA graph download: generalize to work with
 module inclusion, rm tar.gz file after extraction

---
 workflow/rules/ref.smk | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 86d41dd7..d9878f1f 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -102,11 +102,13 @@ rule download_HLALA_graph:
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/sampledReferenceGenomes"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"),
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
+    params:
+        graphs_dir=lambda w, output: output[0].replace("/PRG_MHC_GRCh38_withIMGT/PRG", ""),
     log:
         "logs/download-HLA-LA-graph.log",
     shell:
-        "cd resources/graphs && wget  http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
-        "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz"
+        "( cd {params.graphs_dir} && wget  http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
+        "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz && rm  PRG_MHC_GRCh38_withIMGT.tar.gz ) 2> {log}"
 
 
 rule index_HLALA:

From 00e42a1f7878e1035886c26b8f5492d5c7533db1 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:43:17 +0000
Subject: [PATCH 082/191] remove multiext() from rule index_HLALA, as snakemake
 does not accept directories as suffixes any more

---
 workflow/rules/ref.smk | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index d9878f1f..93daa96a 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -115,12 +115,8 @@ rule index_HLALA:
     input:
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
     output:
-        multiext(
-            "resources/graphs/PRG_MHC_GRCh38_withIMGT/",
-            "serializedGRAPH",
-            "serializedGRAPH_preGapPathindex",
-        ),
-    cache: True
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathIndex",
     conda:
         "../envs/hla_la.yaml"
     params:

From b5eda3a50cc3117335d7be3612398401200bcd85 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:45:18 +0000
Subject: [PATCH 083/191] add netMHCpan install instructions in both configs,
 update to version 4.1

---
 .test/config/config.yaml | 16 ++++++++++++++--
 config/config.yaml       |  4 ++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 937d9028..25ccfa45 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -10,11 +10,23 @@ affinity:
   netMHCpan:
     activate: true
     params: "-BA -l 9 -s -xls"
-    location: "../netMHCpan-4.0"
+    # Please download netMHCpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+    # To make the `netMHCpan` script work, you need to fix its first line
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
+    location: "../netMHCpan-4.1"
   netMHCIIpan:
     activate: false
     params: "-length 15 -s -xls"
-    location: "../netMHCIIpan-4.0"
+    # Please download netMHCIIpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+    # To make the `netMHCIIpan` script work, you need to fix its first line
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
+    location: "../netMHCIIpan-4.1"
 
 
 ref:
diff --git a/config/config.yaml b/config/config.yaml
index 257f312e..31512865 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -16,7 +16,7 @@ affinity:
     # in addition to the other edits described for a complete install. To use
     # the conda-provided tcsh installation, it needs to read (without quotes):
     # "#!/usr/bin/env tcsh"
-    location: "../netMHCpan-4.0"
+    location: "../netMHCpan-4.1"
   netMHCIIpan:
     activate: false
     params: "-length 15 -s -xls"
@@ -26,7 +26,7 @@ affinity:
     # in addition to the other edits described for a complete install. To use
     # the conda-provided tcsh installation, it needs to read (without quotes):
     # "#!/usr/bin/env tcsh"
-    location: "../netMHCIIpan-4.0"
+    location: "../netMHCIIpan-4.1"
 
 
 ref:

From aab65ad8c381b22279c5a41ae3270b72a31958b8 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:45:52 +0000
Subject: [PATCH 084/191] update parsing of called HLA types to
 netMHC(II)pan-4.1 alleles

---
 workflow/scripts/parse_HLA_types.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py
index ff1906f3..66c65b3b 100644
--- a/workflow/scripts/parse_HLA_types.py
+++ b/workflow/scripts/parse_HLA_types.py
@@ -1,8 +1,8 @@
 import pandas as pd
 
-hlaI = ["A","B","C"]
+hlaI = ["A","B","C", "E", "G"]
 
-hlaII = ["DRB1", "DPA1", "DPB1", "DQA1", "DQB1"]
+hlaII = ["DRB1", "DRB3", "DRB4", "DRB5", "DPA1", "DPB1", "DQA1", "DQB1"]
 
 hlas = pd.read_csv(snakemake.input[0], sep='\t')
 

From 8d1acc08ffc9533af1c381b399ebd53ba038022e Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:48:37 +0000
Subject: [PATCH 085/191] add comments on how to list alleles that
 netMHC(II)pan can each handle

---
 workflow/scripts/parse_HLA_types.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py
index 66c65b3b..1824326d 100644
--- a/workflow/scripts/parse_HLA_types.py
+++ b/workflow/scripts/parse_HLA_types.py
@@ -1,7 +1,9 @@
 import pandas as pd
 
+# to get alleles that netMHCpan can handle, use its -listMHC option
 hlaI = ["A","B","C", "E", "G"]
 
+# to get alleles that netMHCIIpan can handle, use its -list option
 hlaII = ["DRB1", "DRB3", "DRB4", "DRB5", "DPA1", "DPB1", "DQA1", "DQB1"]
 
 hlas = pd.read_csv(snakemake.input[0], sep='\t')

From 0062cfdcf8dd97fbea8a8ba33f9fd8fdae1874d8 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:50:28 +0000
Subject: [PATCH 086/191] netMHC(II)pan rules: use tcsh environments, clean up
 shell command via params

---
 workflow/envs/tcsh.yaml        |  4 ++++
 workflow/rules/MHC_binding.smk | 28 ++++++++++++++++++++--------
 2 files changed, 24 insertions(+), 8 deletions(-)
 create mode 100644 workflow/envs/tcsh.yaml

diff --git a/workflow/envs/tcsh.yaml b/workflow/envs/tcsh.yaml
new file mode 100644
index 00000000..9bc374fd
--- /dev/null
+++ b/workflow/envs/tcsh.yaml
@@ -0,0 +1,4 @@
+channels:
+  - conda-forge
+dependencies:
+  - tcsh =6.24
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 2e80b180..c8d1ed9a 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -24,13 +24,19 @@ rule netMHCpan:
         "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls",
     log:
         "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+    conda:
+        "../envs/tcsh.yaml"
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
-    run:
-        alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0])
-        cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
-        shell(cmd)
+        alleles=lambda wc, input: ",".join(pd.read_csv(input.alleles[0], sep="\t").iloc[0])
+    shell:
+        "if [ -s {input.peptides} ]; "
+        "then "
+        "  {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+        "else "
+        "  touch {output}; "
+        "fi"
 
 
 rule netMHCIIpan:
@@ -41,13 +47,19 @@ rule netMHCIIpan:
         "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls",
     log:
         "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+    conda:
+        "../envs/tcsh.yaml"
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
-    run:
-        alleles = ",".join(pd.read_csv(input.alleles, sep="\t")["Allele"].tolist())
-        cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
-        shell(cmd)
+        alleles=lambda wc, input: ",".join(pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist())
+    shell:
+        "if [ -s {input.peptides} ]; "
+        "then "
+        "  {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+        "else "
+        "  touch {output}; "
+        "fi"
 
 
 rule parse_mhc_out:

From 298afbd7220444e9a3dfe1042ca00ee2018c54aa Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:51:13 +0000
Subject: [PATCH 087/191] generalize rule HLA_LA workdir to make it work via
 module import

---
 workflow/rules/HLAtyping.smk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index db8bcc58..12b78705 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -11,10 +11,11 @@ rule HLA_LA:
     params:
         graph=lambda w, input: os.path.basename(os.path.dirname(input.index)),
         graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)),
+        workdir=lambda w, output: os.path.dirname(os.path.dirname(os.path.dirname(output[0]))),
     conda:
         "../envs/hla_la.yaml"
     shell:
-        "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"
+        "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1"
 
 
 rule parse_HLA_LA:

From dffb7573beef509639dc3ea6bafb0592aa17dbe3 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:52:26 +0000
Subject: [PATCH 088/191] get_final_output(): move smps extraction to higher
 level, also get tumor_aliases at this high level

---
 workflow/rules/common.smk | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 2d9c4b35..34b5c06c 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -82,16 +82,16 @@ def is_activated(xpath):
 
 def get_final_output():
     final_output = []
-    if config["epitope_prediction"]["activate"]:
         for group in pd.unique(samples["group"]):
             smps = samples.loc[samples["group"] == group, "sample_name"]
-            sequencing_types = pd.unique(
-                units.loc[units["sample_name"].isin(smps), "sequencing_type"]
-            )
             tumor_aliases = samples.loc[
                 (samples["group"] == group) & (samples["alias"].str.match("tumor")),
                 "alias",
             ]
+        if config["epitope_prediction"]["activate"]:
+            sequencing_types = pd.unique(
+                units.loc[units["sample_name"].isin(smps), "sequencing_type"]
+            )
             final_output.extend(
                 expand(
                     "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv",

From e038952047009b7faabb663172390c827cf8ac37 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:52:54 +0000
Subject: [PATCH 089/191] new wildcard_constraints for normal_alias, tumor_set
 and normal_set

---
 workflow/rules/common.smk | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 34b5c06c..b831f530 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -62,6 +62,9 @@ wildcard_constraints:
     tumor_alias="|".join(
         pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])
     ),
+    normal_alias="normal",
+    tumor_set=config["params"]["microphaser"]["variant_sets"]["tumor"],
+    normal_set=config["params"]["microphaser"]["variant_sets"]["normal"],
     group="|".join(pd.unique(samples["group"])),
     caller="|".join(["freebayes", "delly"]),
     peptide_type="|".join(["normal", "neo"]),

From ab79de0bd9c363ad61cbcbd56499d230eab781c9 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:53:45 +0000
Subject: [PATCH 090/191] switch netMHC(II)pan input functions to properly use
 HLA-LA output

---
 workflow/rules/common.smk | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index b831f530..97e23eb8 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -177,7 +177,7 @@ def get_bam_from_group_and_alias(ext=".bam"):
 def get_alleles_MHCI(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
-        "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv",
+        "results/HLA-LA/{group}.{alias}.hlaI.tsv",
         group=wildcards.group,
         alias=alias,
     )
@@ -187,5 +187,7 @@ def get_alleles_MHCII(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
         #TODO: check that hlaII is correct here, and not hlaI which it previously was
-        "results/HLA-LA/{group}.{alias}.hlaII.tsv", group=wildcards.group, alias=alias
+        "results/HLA-LA/{group}.{alias}.hlaII.tsv",
+        group=wildcards.group,
+        alias=alias
     )

From 060285acccc9c568220d0c16e44476650d3eacba Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:53:56 +0000
Subject: [PATCH 091/191] fix typo

---
 workflow/rules/common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 97e23eb8..80f183f7 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -166,7 +166,7 @@ def get_bam_from_group_and_alias(ext=".bam"):
         )
         if alias == "unknown":
             raise CustomException(
-                "get_bam_from_group_and_alias() requires on of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'."
+                "get_bam_from_group_and_alias() requires one of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'."
             )
         sample = get_sample_from_group_and_alias(wildcards.group, alias)
         return f"results/recal/{sample}.sorted{ext}"

From 0faa787547049b37e1568efa6094182256dd0728 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:54:33 +0000
Subject: [PATCH 092/191] fix indentation

---
 workflow/rules/common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 80f183f7..985eb64a 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -116,7 +116,7 @@ def get_final_output():
                     seqtype=sequencing_types,
                 )
             )
-    else:
+        else:
             final_output = expand(
                 [
                     "results/HLA-LA/{group}.{tumor_alias}.hlaI.tsv",

From 73de9b72b7be413a68c559c63a613b889853ab50 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:54:50 +0000
Subject: [PATCH 093/191] fix further indentation

---
 workflow/rules/common.smk | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 985eb64a..c5e93cac 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -85,12 +85,12 @@ def is_activated(xpath):
 
 def get_final_output():
     final_output = []
-        for group in pd.unique(samples["group"]):
-            smps = samples.loc[samples["group"] == group, "sample_name"]
-            tumor_aliases = samples.loc[
-                (samples["group"] == group) & (samples["alias"].str.match("tumor")),
-                "alias",
-            ]
+    for group in pd.unique(samples["group"]):
+        smps = samples.loc[samples["group"] == group, "sample_name"]
+        tumor_aliases = samples.loc[
+            (samples["group"] == group) & (samples["alias"].str.match("tumor")),
+            "alias",
+        ]
         if config["epitope_prediction"]["activate"]:
             sequencing_types = pd.unique(
                 units.loc[units["sample_name"].isin(smps), "sequencing_type"]

From b91744f749d9a8e9f818eef78369c07d6a021eb3 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 09:55:27 +0000
Subject: [PATCH 094/191] adjust bcf index file name to standard bcftools
 naming scheme

---
 workflow/rules/microphaser.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 044b825d..50e99926 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -22,7 +22,7 @@ rule merge_tumor_normal:
             ],
         ),
         index=expand(
-            "results/final-calls/{{group}}.{sets}.norm.csi",
+            "results/final-calls/{{group}}.{sets}.norm.bcf.csi",
             sets=[
                 config["params"]["microphaser"]["variant_sets"]["normal"],
                 config["params"]["microphaser"]["variant_sets"]["tumor"],

From 8015a151084ab3fb47c39e226e1baa1cbfc434ab Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 24 Jun 2022 10:03:54 +0000
Subject: [PATCH 095/191] snakefmt

---
 workflow/rules/HLAtyping.smk   |  4 +++-
 workflow/rules/MHC_binding.smk |  8 ++++++--
 workflow/rules/common.smk      | 14 +++++++-------
 workflow/rules/microphaser.smk |  8 ++++++--
 workflow/rules/ref.smk         |  6 ++++--
 5 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 12b78705..6cd39931 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -11,7 +11,9 @@ rule HLA_LA:
     params:
         graph=lambda w, input: os.path.basename(os.path.dirname(input.index)),
         graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)),
-        workdir=lambda w, output: os.path.dirname(os.path.dirname(os.path.dirname(output[0]))),
+        workdir=lambda w, output: os.path.dirname(
+            os.path.dirname(os.path.dirname(output[0]))
+        ),
     conda:
         "../envs/hla_la.yaml"
     shell:
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index c8d1ed9a..eb3724ea 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -29,7 +29,9 @@ rule netMHCpan:
     params:
         extra=config["affinity"]["netMHCpan"]["params"],
         netMHC=config["affinity"]["netMHCpan"]["location"],
-        alleles=lambda wc, input: ",".join(pd.read_csv(input.alleles[0], sep="\t").iloc[0])
+        alleles=lambda wc, input: ",".join(
+            pd.read_csv(input.alleles[0], sep="\t").iloc[0]
+        ),
     shell:
         "if [ -s {input.peptides} ]; "
         "then "
@@ -52,7 +54,9 @@ rule netMHCIIpan:
     params:
         extra=config["affinity"]["netMHCIIpan"]["params"],
         netMHC=config["affinity"]["netMHCIIpan"]["location"],
-        alleles=lambda wc, input: ",".join(pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist())
+        alleles=lambda wc, input: ",".join(
+            pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist()
+        ),
     shell:
         "if [ -s {input.peptides} ]; "
         "then "
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index c5e93cac..54a71e58 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -123,7 +123,7 @@ def get_final_output():
                     "results/HLA-LA/{group}.{tumor_alias}.hlaII.tsv",
                 ],
                 group=group,
-                tumor_alias=tumor_aliases
+                tumor_alias=tumor_aliases,
             )
 
     return final_output
@@ -135,6 +135,7 @@ caller = list(
 
 ### helper functions ###
 
+
 def is_paired_end(sample, seqtype):
     sample_units = units.loc[sample].loc[seqtype]
     fq2_null = sample_units["fq2"].isnull()
@@ -159,10 +160,9 @@ def get_sample_from_group_and_alias(group, alias):
 
 def get_bam_from_group_and_alias(ext=".bam"):
     def inner(wildcards):
-        alias = wildcards.get("alias",
-            wildcards.get("tumor_alias",
-                wildcards.get("normal_alias", "unknown")
-            )
+        alias = wildcards.get(
+            "alias",
+            wildcards.get("tumor_alias", wildcards.get("normal_alias", "unknown")),
         )
         if alias == "unknown":
             raise CustomException(
@@ -186,8 +186,8 @@ def get_alleles_MHCI(wildcards):
 def get_alleles_MHCII(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
-        #TODO: check that hlaII is correct here, and not hlaI which it previously was
+        # TODO: check that hlaII is correct here, and not hlaI which it previously was
         "results/HLA-LA/{group}.{alias}.hlaII.tsv",
         group=wildcards.group,
-        alias=alias
+        alias=alias,
     )
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 50e99926..a43c7b18 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -68,8 +68,12 @@ rule microphaser_normal:
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        wt_fasta=("results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa"),
-        wt_tsv=("results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv"),
+        wt_fasta=(
+            "results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa"
+        ),
+        wt_tsv=(
+            "results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv"
+        ),
     log:
         "logs/microphaser_germline/{group}/{normal_alias}.{normal_set}-{contig}.log",
     conda:
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 93daa96a..c6474b7f 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -44,7 +44,7 @@ rule get_annotation:
         "0.45.1/bio/reference/ensembl-annotation"
 
 
-#TODO: remove this rule, once microphaser is fixed to make gene_name optional
+# TODO: remove this rule, once microphaser is fixed to make gene_name optional
 rule remove_records_with_gene_name_missing:
     input:
         "resources/genome.gtf",
@@ -103,7 +103,9 @@ rule download_HLALA_graph:
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"),
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
     params:
-        graphs_dir=lambda w, output: output[0].replace("/PRG_MHC_GRCh38_withIMGT/PRG", ""),
+        graphs_dir=lambda w, output: output[0].replace(
+            "/PRG_MHC_GRCh38_withIMGT/PRG", ""
+        ),
     log:
         "logs/download-HLA-LA-graph.log",
     shell:

From 84e91122cae5de8e2857b4f01649666dc1ff3c94 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 12 Jul 2022 09:58:25 +0000
Subject: [PATCH 096/191] constrain `set` wildcard to tumor and normal variant
 set names

---
 workflow/rules/common.smk | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 54a71e58..097509c6 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -65,6 +65,12 @@ wildcard_constraints:
     normal_alias="normal",
     tumor_set=config["params"]["microphaser"]["variant_sets"]["tumor"],
     normal_set=config["params"]["microphaser"]["variant_sets"]["normal"],
+    set="|".join(
+        [
+            config["params"]["microphaser"]["variant_sets"]["tumor"],
+            config["params"]["microphaser"]["variant_sets"]["normal"],
+        ]
+    ),
     group="|".join(pd.unique(samples["group"])),
     caller="|".join(["freebayes", "delly"]),
     peptide_type="|".join(["normal", "neo"]),

From 5f15ecfb50185197f6e80d58ae3ade41938e281f Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 12 Jul 2022 10:00:07 +0000
Subject: [PATCH 097/191] ensure that HLA-LA reference fasta is indexed with
 hla_la.yaml dependency version of bwa

---
 workflow/rules/HLAtyping.smk |  1 +
 workflow/rules/ref.smk       | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 6cd39931..b2c389e9 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -3,6 +3,7 @@ rule HLA_LA:
         bam=get_bam_from_group_and_alias(),
         bai=get_bam_from_group_and_alias(ext=".bai"),
         index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
+        ext_idx="resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac",
     output:
         "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
     threads: 7
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index c6474b7f..cb54eeeb 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -94,13 +94,13 @@ rule genome_dict:
 rule download_HLALA_graph:
     output:
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"),
-        directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/knownReferences"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping_PRGonly"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/referenceGenomeSimulations"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/sampledReferenceGenomes"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"),
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa",
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
     params:
         graphs_dir=lambda w, output: output[0].replace(
@@ -130,6 +130,23 @@ rule index_HLALA:
         "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
 
 
+rule index_HLALA_extended_ref:
+    input:
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa",
+    output:
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.amb",
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.ann",
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.bwt",
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac",
+        "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.sa",
+    conda:
+        "../envs/hla_la.yaml"
+    log:
+        "logs/index_HLA-LA_extended_ref.log",
+    shell:
+        "bwa index {input} > {log} 2>&1"
+
+
 rule make_sampleheader:
     output:
         "resources/sampleheader.txt",

From 11cdb4af942ec37bc5f383cf3c8b47d28d7085be Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 12 Jul 2022 10:01:48 +0000
Subject: [PATCH 098/191] update to newest microphaser version

---
 workflow/envs/microphaser.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml
index 84d2ad83..3237608c 100644
--- a/workflow/envs/microphaser.yaml
+++ b/workflow/envs/microphaser.yaml
@@ -2,4 +2,4 @@ channels:
   - bioconda
   - conda-forge
 dependencies:
-  - microphaser =0.2
+  - microphaser =0.4

From 29131ce6822c57e6908e310cf0adea8654dc1079 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 12 Jul 2022 10:02:44 +0000
Subject: [PATCH 099/191] fix logs path in rule merge_tumor_normal

---
 workflow/rules/microphaser.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index a43c7b18..10daf61d 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -31,7 +31,7 @@ rule merge_tumor_normal:
     output:
         "results/final-calls/{group}.merged_tumor_normal.norm.bcf",
     log:
-        "bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log",
+        "logs/bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log",
     params:
         extra="-O b -a",
     wrapper:

From d9e993c6c8fb10dab40d6ba46a6a91cb9fe3f7f2 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 12 Jul 2022 10:04:06 +0000
Subject: [PATCH 100/191] add INFO flag SOMATIC to somatic tumor calls for
 microphaser (before merging with normal calls)

---
 workflow/envs/gawk.yaml        |  4 +++
 workflow/rules/microphaser.smk | 27 ++++++++++++++++--
 workflow/rules/ref.smk         | 51 ++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 workflow/envs/gawk.yaml

diff --git a/workflow/envs/gawk.yaml b/workflow/envs/gawk.yaml
new file mode 100644
index 00000000..fa09e590
--- /dev/null
+++ b/workflow/envs/gawk.yaml
@@ -0,0 +1,4 @@
+channels:
+  - conda-forge
+dependencies:
+  - gawk =5.1
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 10daf61d..2df2e782 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -12,20 +12,43 @@ rule norm_bcf:
         "0.65.0/bio/bcftools/norm"
 
 
+rule add_somatic_flag:
+    input:
+        bcf="results/final-calls/{group}.{set}.norm.bcf",
+        header_line="resources/somatic_flag_header_line.txt",
+        flag_bed="resources/genome.somatic_flag.bed.gz",
+        flag_bed_idx="resources/genome.somatic_flag.bed.gz.tbi",
+    output:
+        "results/final-calls/{group}.{set}.somatic_flag.norm.bcf",
+    log:
+        "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log"
+    conda:
+        "../envs/bcftools.yaml"
+    shell:
+        "( bcftools annotate "
+        "  --annotations {input.flag_bed} "
+        "  --mark-sites +SOMATIC "
+        "  --columns CHROM,FROM,TO "
+        "  --header-lines {input.header_line} "
+        "  -O b -o {output} "
+        "  {input.bcf} "
+        ") 2> {log}"
+
+
 rule merge_tumor_normal:
     input:
         calls=expand(
             "results/final-calls/{{group}}.{sets}.norm.bcf",
             sets=[
                 config["params"]["microphaser"]["variant_sets"]["normal"],
-                config["params"]["microphaser"]["variant_sets"]["tumor"],
+                config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag",
             ],
         ),
         index=expand(
             "results/final-calls/{{group}}.{sets}.norm.bcf.csi",
             sets=[
                 config["params"]["microphaser"]["variant_sets"]["normal"],
-                config["params"]["microphaser"]["variant_sets"]["tumor"],
+                config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag",
             ],
         ),
     output:
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index cb54eeeb..0d6324a0 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -79,6 +79,57 @@ rule genome_faidx:
         "0.45.1/bio/samtools/faidx"
 
 
+rule create_somatic_flag_header_line:
+    output:
+        "resources/somatic_flag_header_line.txt",
+    log:
+        "logs/create_somatic_flag_header_line.log"
+    shell:
+        """
+        ( echo '##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic tumor variant">' > {output} ) 2> {log}
+        """
+
+
+rule create_genome_somatic_flag_bed:
+    input:
+        "resources/genome.fasta.fai",
+    output:
+        "resources/genome.somatic_flag.bed",
+    log:
+        "logs/create_genome_somatic_flag_bed.log"
+    conda:
+        "../envs/gawk.yaml"
+    cache: True
+    shell:
+        """
+        ( awk 'BEGIN {{ OFS="\\t" }} {{ print $1,0,$2 }}' {input} > {output} ) 2> {log}
+        """
+
+
+rule bgzip_genome_somatic_flag_bed:
+    input:
+        "resources/genome.somatic_flag.bed",
+    output:
+        "resources/genome.somatic_flag.bed.gz",
+    log:
+        "logs/bgzip/genome.somatic_flag.log",
+    wrapper:
+        "v1.7.0/bio/bgzip"
+
+
+rule tabix_genome_somatic_flag_bed:
+    input:
+        "resources/genome.somatic_flag.bed.gz",
+    output:
+        "resources/genome.somatic_flag.bed.gz.tbi",
+    conda:
+        "../envs/htslib.yaml"
+    log:
+        "logs/tabix/genome.somatic_flag.log",
+    shell:
+        "( tabix -p bed {input} ) 2> {log}"
+
+
 rule genome_dict:
     input:
         "resources/genome.fasta",

From 218354c4a0db5b01ab3d10c49d9b9d8264ca00ce Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 14 Jul 2022 08:22:26 +0000
Subject: [PATCH 101/191] add proper logging to python scripts

---
 workflow/scripts/add_rna_info.py                 | 4 ++++
 workflow/scripts/count_neoantigen_occurrences.py | 4 ++++
 workflow/scripts/group_mhc_output.py             | 3 +++
 workflow/scripts/merge_data.py                   | 3 +++
 workflow/scripts/merge_mhcflurry.py              | 3 +++
 workflow/scripts/parse_HLA_types.py              | 4 ++++
 workflow/scripts/sample_comp_plot.py             | 4 ++++
 workflow/scripts/tsv_to_xlsx.py                  | 3 ++-
 8 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/add_rna_info.py b/workflow/scripts/add_rna_info.py
index bdcdcac2..b8690f86 100644
--- a/workflow/scripts/add_rna_info.py
+++ b/workflow/scripts/add_rna_info.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
 import pandas as pd
 
 ## load data table
diff --git a/workflow/scripts/count_neoantigen_occurrences.py b/workflow/scripts/count_neoantigen_occurrences.py
index 1b6050e4..a70caa36 100644
--- a/workflow/scripts/count_neoantigen_occurrences.py
+++ b/workflow/scripts/count_neoantigen_occurrences.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
 import pandas as pd
 import glob
 
diff --git a/workflow/scripts/group_mhc_output.py b/workflow/scripts/group_mhc_output.py
index 5ce0cf15..fda5b360 100644
--- a/workflow/scripts/group_mhc_output.py
+++ b/workflow/scripts/group_mhc_output.py
@@ -1,4 +1,7 @@
 import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
 import os
 import pandas as pd
 import numpy as np
diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py
index b90af1cb..0142ba9f 100644
--- a/workflow/scripts/merge_data.py
+++ b/workflow/scripts/merge_data.py
@@ -1,4 +1,7 @@
 import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
 import os
 import pandas as pd
 import numpy as np
diff --git a/workflow/scripts/merge_mhcflurry.py b/workflow/scripts/merge_mhcflurry.py
index fcc6aba4..c0098cb9 100644
--- a/workflow/scripts/merge_mhcflurry.py
+++ b/workflow/scripts/merge_mhcflurry.py
@@ -1,4 +1,7 @@
 import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
 import os
 import pandas as pd
 import numpy as np
diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py
index 1824326d..ae6cb69f 100644
--- a/workflow/scripts/parse_HLA_types.py
+++ b/workflow/scripts/parse_HLA_types.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
 import pandas as pd
 
 # to get alleles that netMHCpan can handle, use its -listMHC option
diff --git a/workflow/scripts/sample_comp_plot.py b/workflow/scripts/sample_comp_plot.py
index 3c7c287d..d8dbf6e0 100644
--- a/workflow/scripts/sample_comp_plot.py
+++ b/workflow/scripts/sample_comp_plot.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
 import os
 import glob
 import pandas as pd
diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py
index 28a24594..5d9bf10f 100644
--- a/workflow/scripts/tsv_to_xlsx.py
+++ b/workflow/scripts/tsv_to_xlsx.py
@@ -1,7 +1,8 @@
 import sys
-import pandas as pd
 
 sys.stderr = open(snakemake.log[0], "w")
 
+import pandas as pd
+
 data = pd.read_csv(snakemake.input.tsv, sep="\t")
 data.to_excel(snakemake.output.xlsx, index=False)
\ No newline at end of file

From 9d6b58f91cda1ee0881d2de22fcbe6367eb513dc Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 14 Jul 2022 08:23:33 +0000
Subject: [PATCH 102/191] add proper logging to R script

---
 workflow/scripts/phylogeny.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/workflow/scripts/phylogeny.R b/workflow/scripts/phylogeny.R
index 7f26becf..9e32c131 100644
--- a/workflow/scripts/phylogeny.R
+++ b/workflow/scripts/phylogeny.R
@@ -1,3 +1,7 @@
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
 library(phangorn)
 
 ## read the variant matrix

From fdbd9dd4431c86e1df1e4c686ddd531ddfcd69be Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 14 Jul 2022 08:24:52 +0000
Subject: [PATCH 103/191] add logging capture to rules netMHCpan and
 netMHCIIpan

---
 workflow/rules/MHC_binding.smk | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index eb3724ea..571a7623 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -33,12 +33,14 @@ rule netMHCpan:
             pd.read_csv(input.alleles[0], sep="\t").iloc[0]
         ),
     shell:
+        "( "
         "if [ -s {input.peptides} ]; "
         "then "
         "  {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
         "else "
         "  touch {output}; "
-        "fi"
+        "fi "
+        " ) 2> {log}"
 
 
 rule netMHCIIpan:
@@ -58,12 +60,14 @@ rule netMHCIIpan:
             pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist()
         ),
     shell:
+        "( "
         "if [ -s {input.peptides} ]; "
         "then "
         "  {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
         "else "
         "  touch {output}; "
-        "fi"
+        "fi "
+        " ) 2> {log}"
 
 
 rule parse_mhc_out:

From da392e29f37503be4c89803a5a3d0c85aa5f6c68 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 19 Jul 2022 12:44:55 +0000
Subject: [PATCH 104/191] harmonize microphaser logging paths

---
 workflow/rules/microphaser.smk | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 2df2e782..9c747582 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -98,7 +98,7 @@ rule microphaser_normal:
             "results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv"
         ),
     log:
-        "logs/microphaser_germline/{group}/{normal_alias}.{normal_set}-{contig}.log",
+        "logs/microphaser_normal/{group}/{normal_alias}.{normal_set}-{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -117,7 +117,7 @@ rule concat_normal_proteome:
     output:
         "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa",
     log:
-        "logs/microphaser/concat_normal_proteome/{group}.{normal_set}.log",
+        "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.log",
     shell:
         "cat {input} > {output} 2> {log}"
 
@@ -129,7 +129,7 @@ rule build_normal_proteome_db:
         bin="results/microphaser/bin/{group}.{normal_set}.{mhc}.normal_proteome.bin",
         fasta="results/microphaser/fasta/{group}.{normal_set}.{mhc}.normal_proteome.peptides.fasta",
     log:
-        "logs/microphaser/build_normal_proteome_db/{group}.{normal_set}-{mhc}.log",
+        "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}-{mhc}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
@@ -137,7 +137,7 @@ rule build_normal_proteome_db:
             wildcards.mhc
         ],
     shell:
-        "microphaser build_reference -r {input} -o {output.bin} -l {params.length} --peptides {output.fasta} > {log} 2>&1"
+        "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}"
 
 
 rule microphaser_filter:
@@ -177,7 +177,7 @@ rule concat_tsvs:
     output:
         "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv",
     log:
-        "logs/concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
+        "logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
     conda:
         "../envs/xsv.yaml"
     shell:

From 873fe38753a2924a9d52d242bf85ae74ae810e3e Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 19 Jul 2022 12:46:20 +0000
Subject: [PATCH 105/191] fix default microphaser window_len to 45, 3x the
 default netMHCIIpan peptide_len of 15

---
 config/config.yaml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 31512865..aaa78af8 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -51,13 +51,11 @@ annotations:
 
 params:
   microphaser:
-    window_len:
-        33
+    # window_len should be at least 3 times the longest peptide_len specified below
+    window_len: 45
     peptide_len:
-      netMHCpan:
-          9
-      netMHCIIpan:
-          15
+      netMHCpan: 9
+      netMHCIIpan: 15
     variant_sets:
       normal: "normal_only"
       tumor: "tumor_only"

From ac8b6849fd0b5aeb3b7f8beed3fbaa938691c5d7 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 25 Jul 2022 11:24:35 +0000
Subject: [PATCH 106/191] clean up config params specification for microphaser

---
 .test/config/config.yaml       | 51 +++++++++++++++-------------------
 config/config.yaml             | 48 +++++++++++++++-----------------
 workflow/rules/MHC_binding.smk | 22 +++++++--------
 workflow/rules/common.smk      |  4 +--
 workflow/rules/microphaser.smk | 12 +++-----
 5 files changed, 61 insertions(+), 76 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 25ccfa45..8efb0dde 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -6,28 +6,6 @@ epitope_prediction:
   activate: true
 
 
-affinity:
-  netMHCpan:
-    activate: true
-    params: "-BA -l 9 -s -xls"
-    # Please download netMHCpan manually from:
-    # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
-    # To make the `netMHCpan` script work, you need to fix its first line in
-    # in addition to the other edits described for a complete install. To use
-    # the conda-provided tcsh installation, it needs to read (without quotes):
-    # "#!/usr/bin/env tcsh"
-    location: "../netMHCpan-4.1"
-  netMHCIIpan:
-    activate: false
-    params: "-length 15 -s -xls"
-    # Please download netMHCIIpan manually from:
-    # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
-    # To make the `netMHCIIpan` script work, you need to fix its first line in
-    # in addition to the other edits described for a complete install. To use
-    # the conda-provided tcsh installation, it needs to read (without quotes):
-    # "#!/usr/bin/env tcsh"
-    location: "../netMHCIIpan-4.1"
-
 
 ref:
   # Number of chromosomes to consider for calling.
@@ -51,13 +29,28 @@ annotations:
 
 params:
   microphaser:
-    window_len:
-        33
-    peptide_len:
-      netMHCpan:
-          9
-      netMHCIIpan:
-          15
     events:
       tumor: "tumor_only"
       normal: "normal_only"
+  netMHCpan:
+    activate: true
+    peptide_len: 9
+    params: "-BA -s"
+    # Please download netMHCpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+    # To make the `netMHCpan` script work, you need to fix its first line
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
+    location: "../netMHCpan-4.1"
+  netMHCIIpan:
+    activate: false
+    peptide_len: 15
+    params: "-BA -s"
+    # Please download netMHCIIpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+    # To make the `netMHCIIpan` script work, you need to fix its first line
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
+    location: "../netMHCIIpan-4.1"
diff --git a/config/config.yaml b/config/config.yaml
index aaa78af8..a9978057 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -7,27 +7,6 @@ epitope_prediction:
 
 
 affinity:
-  netMHCpan:
-    activate: true
-    params: "-BA -l 9 -s -xls"
-    # Please download netMHCpan manually from:
-    # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
-    # To make the `netMHCpan` script work, you need to fix its first line in
-    # in addition to the other edits described for a complete install. To use
-    # the conda-provided tcsh installation, it needs to read (without quotes):
-    # "#!/usr/bin/env tcsh"
-    location: "../netMHCpan-4.1"
-  netMHCIIpan:
-    activate: false
-    params: "-length 15 -s -xls"
-    # Please download netMHCIIpan manually from:
-    # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
-    # To make the `netMHCIIpan` script work, you need to fix its first line in
-    # in addition to the other edits described for a complete install. To use
-    # the conda-provided tcsh installation, it needs to read (without quotes):
-    # "#!/usr/bin/env tcsh"
-    location: "../netMHCIIpan-4.1"
-
 
 ref:
   # Number of chromosomes to consider for calling.
@@ -52,10 +31,29 @@ annotations:
 params:
   microphaser:
     # window_len should be at least 3 times the longest peptide_len specified below
-    window_len: 45
-    peptide_len:
-      netMHCpan: 9
-      netMHCIIpan: 15
     variant_sets:
       normal: "normal_only"
       tumor: "tumor_only"
+  netMHCpan:
+    activate: true
+    peptide_len: 9
+    params: "-BA -s"
+    # Please download netMHCpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+    # To make the `netMHCpan` script work, you need to fix its first line
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
+    location: "../netMHCpan-4.1"
+  netMHCIIpan:
+    activate: false
+    peptide_len: 15
+    params: "-BA -s"
+    # Please download netMHCIIpan manually from:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+    # To make the `netMHCIIpan` script work, you need to fix its first line
+    # in addition to the other edits described for a complete install. To use
+    # the conda-provided tcsh installation, it needs to read (without quotes):
+    # "#!/usr/bin/env tcsh"
+    location: "../netMHCIIpan-4.1"
+
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 571a7623..fbbf1708 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -27,16 +27,15 @@ rule netMHCpan:
     conda:
         "../envs/tcsh.yaml"
     params:
-        extra=config["affinity"]["netMHCpan"]["params"],
-        netMHC=config["affinity"]["netMHCpan"]["location"],
-        alleles=lambda wc, input: ",".join(
-            pd.read_csv(input.alleles[0], sep="\t").iloc[0]
-        ),
+        extra=config["params"]["netMHCpan"]["params"],
+        netMHC=config["params"]["netMHCpan"]["location"],
+        length=config["params"]["netMHCpan"]["peptide_len"],
+        alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
     shell:
         "( "
         "if [ -s {input.peptides} ]; "
         "then "
-        "  {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+        "  {params.netMHC}/netMHCpan {params.extra} -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
         "else "
         "  touch {output}; "
         "fi "
@@ -54,16 +53,15 @@ rule netMHCIIpan:
     conda:
         "../envs/tcsh.yaml"
     params:
-        extra=config["affinity"]["netMHCIIpan"]["params"],
-        netMHC=config["affinity"]["netMHCIIpan"]["location"],
-        alleles=lambda wc, input: ",".join(
-            pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist()
-        ),
+        extra=config["params"]["netMHCIIpan"]["params"],
+        netMHC=config["params"]["netMHCIIpan"]["location"],
+        length=config["params"]["netMHCIIpan"]["peptide_len"],
+        alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
     shell:
         "( "
         "if [ -s {input.peptides} ]; "
         "then "
-        "  {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+        "  {params.netMHC}/netMHCIIpan {params.extra} -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
         "else "
         "  touch {output}; "
         "fi "
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 097509c6..1e976731 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -111,10 +111,10 @@ def get_final_output():
                             None,
                             [
                                 "netMHCpan"
-                                if is_activated("affinity/netMHCpan")
+                                if is_activated("params/netMHCpan")
                                 else None,
                                 "netMHCIIpan"
-                                if is_activated("affinity/netMHCIIpan")
+                                if is_activated("params/netMHCIIpan")
                                 else None,
                             ],
                         )
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 9c747582..86af23d0 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -77,7 +77,7 @@ rule microphaser_tumor:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=config["params"]["microphaser"]["window_len"],
+        window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3,
     shell:
         "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
         "< {input.track} > {output.mt_fasta} 2> {log}"
@@ -102,7 +102,7 @@ rule microphaser_normal:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=config["params"]["microphaser"]["window_len"],
+        window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3,
     shell:
         "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
         "< {input.track} > {output.wt_fasta} 2> {log}"
@@ -133,9 +133,7 @@ rule build_normal_proteome_db:
     conda:
         "../envs/microphaser.yaml"
     params:
-        length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][
-            wildcards.mhc
-        ],
+        length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"],
     shell:
         "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}"
 
@@ -161,9 +159,7 @@ rule microphaser_filter:
     conda:
         "../envs/microphaser.yaml"
     params:
-        length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][
-            wildcards.mhc
-        ],
+        length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"],
     shell:
         "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}"
 

From f0f3cd2338765fd6bc2a0cde59b1bb7fee1c5c6f Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 25 Jul 2022 11:27:00 +0000
Subject: [PATCH 107/191] debug HLA_LA output parsing, adapting to netMHCpan
 4.1, extensive comments with linkouts, named inputs/outputs in rule

---
 workflow/rules/HLAtyping.smk        |  6 +--
 workflow/scripts/merge_data.py      |  9 ++--
 workflow/scripts/parse_HLA_types.py | 73 +++++++++++++++++++++++------
 3 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index b2c389e9..5482e871 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -23,14 +23,14 @@ rule HLA_LA:
 
 rule parse_HLA_LA:
     input:
-        "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
+        hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
     output:
-        report(
+        hlaI=report(
             "results/HLA-LA/{group}.{alias}.hlaI.tsv",
             caption="../report/HLA_Types.rst",
             category="HLA-Typing(HLA-LA)",
         ),
-        report(
+        hlaII=report(
             "results/HLA-LA/{group}.{alias}.hlaII.tsv",
             caption="../report/HLA_Types.rst",
             category="HLA-Typing(HLA-LA)",
diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py
index 0142ba9f..ac5d21c1 100644
--- a/workflow/scripts/merge_data.py
+++ b/workflow/scripts/merge_data.py
@@ -2,10 +2,7 @@
 
 sys.stderr = open(snakemake.log[0], "w")
 
-import os
 import pandas as pd
-import numpy as np
-
 
 def select_columns(mhc):
     rank_cols = [c for c in mhc.columns if "Rank" in c]
@@ -82,9 +79,9 @@ def diffEpitope(e1,e2):
 
 
 def main():
-    info = pd.read_csv(snakemake.input[0], sep = '\t', dtype=str)
-    tumor = pd.read_csv(snakemake.input[1], sep = '\t')
-    normal = pd.read_csv(snakemake.input[2], sep = '\t')
+    info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str)
+    tumor = pd.read_csv(snakemake.input.neo, sep = '\t')
+    normal = pd.read_csv(snakemake.input.normal, sep = '\t')
     outfile = snakemake.output[0]
     merge(info, tumor, normal, outfile)
 
diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py
index ae6cb69f..8523f9ea 100644
--- a/workflow/scripts/parse_HLA_types.py
+++ b/workflow/scripts/parse_HLA_types.py
@@ -4,23 +4,66 @@
 
 import pandas as pd
 
-# to get alleles that netMHCpan can handle, use its -listMHC option
-hlaI = ["A","B","C", "E", "G"]
+# To know which alleles netMHCpan can handle, use its -listMHC option.
+HLAI = ["A","B","C", "E", "G"]
 
-# to get alleles that netMHCIIpan can handle, use its -list option
-hlaII = ["DRB1", "DRB3", "DRB4", "DRB5", "DPA1", "DPB1", "DQA1", "DQB1"]
+# To know which alleles netMHCIIpan can handle, use its -list option.
+# DRB alleles need to be formatted differently from DP and DQ alleles,
+# so we specify them separately.
+DRB = ["DRB1", "DRB3", "DRB4", "DRB5"]
+ALPHA_BETA = ["DPA1", "DPB1", "DQA1", "DQB1"]
 
-hlas = pd.read_csv(snakemake.input[0], sep='\t')
+hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep='\t')
 
-hlasI = hlas[hlas.Locus.isin(hlaI)]
-hlasI["Allele"]="HLA-" + hlasI.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','')
-hlasI = hlasI[["Allele"]].drop_duplicates()
-hlasI.to_csv(snakemake.output[0], sep='\t', index=False)
+# the Allele column can contain multiple ";"-separated entries for the
+# same locus
+hlas.loc[:, "Allele"] = hlas.Allele.str.split(pat=";")
+hlas["alternative"] = hlas.Allele.apply( lambda x: range( len(x) ) )
+hlas = hlas.explode(["Allele", "alternative"])
 
-hlasII = hlas[hlas.Locus.isin(hlaII)]
-hlasII["HLA"] = hlasII.Locus.str[0:2]
-hlasII["Allele"] = hlasII.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','')
+# reformat to netMHCpan allele list format:
+# * https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/allele.list
+# it needs to be in the format of the first column of the above list, as explained in
+# the "Instructions" tab under "MHC SELECTION" point "2)" at:
+# * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+hlaI_alleles = hlas.loc[hlas["Locus"].isin(HLAI), "Allele"].str.replace("([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True).drop_duplicates()
+hlaI_alleles.to_csv(snakemake.output.hlaI, sep='\t', index=False, header=False)
 
-hlasII = pd.DataFrame("HLA-" + hlasII.groupby(["HLA","Chromosome"])["Allele"].apply(lambda x: "-".join(x)).reset_index()["Allele"]).drop_duplicates()
-hlasII.loc[hlasII.Allele.str.contains("DRB"), "Allele"] = hlasII[hlasII.Allele.str.contains("DRB")]["Allele"].str.replace("HLA-DRB1","DRB1_")
-hlasII.to_csv(snakemake.output[1], sep='\t', index=False)
+# reformat to netMHCIIpan allele list format:
+# * https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/alleles_name.list
+# contrary to the format in that list, alleles actually need to be formatted like this,
+# with <gene>s found in the HLA-LA "Locus" column and syntax for the sub-numbering (only
+# the 1st and 2nd sub-number are used) according to the official nomenclature (see:
+# https://ars.els-cdn.com/content/image/1-s2.0-S0006497120405555-gr2.jpg ):
+# * DRB alleles: "<gene>_<allele_group><specific_HLA_protein>"
+# * DP and DQ alleles (alpha means A and beta means B in the gene name, for example DPA):
+#   "HLA-<alpha_gene><allele_group><specific_HLA_protein>-<beta_gene><allele_group><specific_HLA_protein>"
+# This format was determined by manually selecting combinations above the
+# "type a list of molecules names" field of the "Submission" tab at:
+# * https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+
+# TODO: check whether Jan's previous parsing of DRB alleles into this format is necessary:
+# * example: DRB1_1501-DRB30101-DRB40301N
+# * "DRB1_<allele_group><specific_HLA_protein>-DRB3<allele_group><specific_HLA_protein>-DRB4<allele_group><specific_HLA_protein>"
+drb_alleles = hlas.loc[hlas["Locus"].isin(DRB)]
+hlaII_alleles = drb_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True).drop_duplicates()
+
+# handle alleles where a combination of alpha and beta always exists
+alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA)]
+alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True)
+# we need a variable to group alpha and beta of the same gene combination together
+alpha_beta_alleles["gene_group"] = alpha_beta_alleles.Locus.str[0:2]
+# we need to handle cases where we had multiple allele entries in an
+# alpha or beta locus, adding in a duplicate of the corresponding locus
+alleles_to_duplicate = alpha_beta_alleles.loc[ alpha_beta_alleles["alternative"] > 0 & alpha_beta_alleles["Locus"].str.startswith("D[PQ]"), ["Locus", "Chromosome", "alternative"] ].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}})
+alleles_to_insert = alleles_to_duplicate.merge(alpha_beta_alleles.drop('alternative', axis='columns'), on=['Locus', 'Chromosome'], how='left')
+alpha_beta_alleles = pd.concat([alpha_beta_alleles, alleles_to_insert]).drop_duplicates()
+
+hlaII_alleles = hlaII_alleles.append(
+        alpha_beta_alleles\
+            .groupby(["gene_group", "Chromosome", "alternative"])["Allele"]\
+            .transform(lambda x: f"HLA-{'-'.join(x)}")\
+            .drop_duplicates()
+    )
+
+hlaII_alleles.to_csv(snakemake.output.hlaII, sep='\t', index=False, header=False)

From 08bbebd575b11be805ba5ee75d9c422357f3bd6b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 27 Jul 2022 15:12:32 +0000
Subject: [PATCH 108/191] clean up parse_HLA_types.py

---
 workflow/scripts/parse_HLA_types.py | 61 +++++++++++++++++++----------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py
index 8523f9ea..c93a943e 100644
--- a/workflow/scripts/parse_HLA_types.py
+++ b/workflow/scripts/parse_HLA_types.py
@@ -5,20 +5,20 @@
 import pandas as pd
 
 # To know which alleles netMHCpan can handle, use its -listMHC option.
-HLAI = ["A","B","C", "E", "G"]
+HLAI = {"A", "B", "C", "E", "G"}
 
 # To know which alleles netMHCIIpan can handle, use its -list option.
 # DRB alleles need to be formatted differently from DP and DQ alleles,
 # so we specify them separately.
-DRB = ["DRB1", "DRB3", "DRB4", "DRB5"]
-ALPHA_BETA = ["DPA1", "DPB1", "DQA1", "DQB1"]
+DRB = {"DRB1", "DRB3", "DRB4", "DRB5"}
+ALPHA_BETA = {"DPA1", "DPB1", "DQA1", "DQB1"}
 
-hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep='\t')
+hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep="\t")
 
 # the Allele column can contain multiple ";"-separated entries for the
 # same locus
-hlas.loc[:, "Allele"] = hlas.Allele.str.split(pat=";")
-hlas["alternative"] = hlas.Allele.apply( lambda x: range( len(x) ) )
+hlas.loc[:, "Allele"] = hlas["Allele"].str.split(pat=";")
+hlas["alternative"] = hlas["Allele"].apply(lambda x: range(len(x)))
 hlas = hlas.explode(["Allele", "alternative"])
 
 # reformat to netMHCpan allele list format:
@@ -26,8 +26,12 @@
 # it needs to be in the format of the first column of the above list, as explained in
 # the "Instructions" tab under "MHC SELECTION" point "2)" at:
 # * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
-hlaI_alleles = hlas.loc[hlas["Locus"].isin(HLAI), "Allele"].str.replace("([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True).drop_duplicates()
-hlaI_alleles.to_csv(snakemake.output.hlaI, sep='\t', index=False, header=False)
+hlaI_alleles = (
+    hlas.loc[hlas["Locus"].isin(HLAI), "Allele"]
+    .str.replace(r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True)
+    .drop_duplicates()
+)
+hlaI_alleles.to_csv(snakemake.output.hlaI, sep="\t", index=False, header=False)
 
 # reformat to netMHCIIpan allele list format:
 # * https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/alleles_name.list
@@ -46,24 +50,41 @@
 # * example: DRB1_1501-DRB30101-DRB40301N
 # * "DRB1_<allele_group><specific_HLA_protein>-DRB3<allele_group><specific_HLA_protein>-DRB4<allele_group><specific_HLA_protein>"
 drb_alleles = hlas.loc[hlas["Locus"].isin(DRB)]
-hlaII_alleles = drb_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True).drop_duplicates()
+hlaII_alleles = (
+    drb_alleles["Allele"]
+    .str.replace(r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True)
+    .drop_duplicates()
+)
 
 # handle alleles where a combination of alpha and beta always exists
 alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA)]
-alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True)
+alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace(
+    r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True
+)
 # we need a variable to group alpha and beta of the same gene combination together
-alpha_beta_alleles["gene_group"] = alpha_beta_alleles.Locus.str[0:2]
+alpha_beta_alleles["gene_group"] = alpha_beta_alleles["Locus"].str[0:2]
 # we need to handle cases where we had multiple allele entries in an
 # alpha or beta locus, adding in a duplicate of the corresponding locus
-alleles_to_duplicate = alpha_beta_alleles.loc[ alpha_beta_alleles["alternative"] > 0 & alpha_beta_alleles["Locus"].str.startswith("D[PQ]"), ["Locus", "Chromosome", "alternative"] ].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}})
-alleles_to_insert = alleles_to_duplicate.merge(alpha_beta_alleles.drop('alternative', axis='columns'), on=['Locus', 'Chromosome'], how='left')
-alpha_beta_alleles = pd.concat([alpha_beta_alleles, alleles_to_insert]).drop_duplicates()
+select_mult_all = alpha_beta_alleles["alternative"] > 0
+select_dpq_loci = alpha_beta_alleles["Locus"].str.match(r"D[PQ]")  # regex prefix match
+mult_all_per_loc_selection = select_mult_all & select_dpq_loci
+alleles_to_duplicate = alpha_beta_alleles.loc[
+    mult_all_per_loc_selection,
+    ["Locus", "Chromosome", "alternative"],
+].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}})
+alleles_to_insert = alleles_to_duplicate.merge(
+    alpha_beta_alleles.drop("alternative", axis="columns"),
+    on=["Locus", "Chromosome"],
+    how="left",
+)
+alpha_beta_alleles = pd.concat(
+    [alpha_beta_alleles, alleles_to_insert]
+).drop_duplicates()
 
 hlaII_alleles = hlaII_alleles.append(
-        alpha_beta_alleles\
-            .groupby(["gene_group", "Chromosome", "alternative"])["Allele"]\
-            .transform(lambda x: f"HLA-{'-'.join(x)}")\
-            .drop_duplicates()
-    )
+    alpha_beta_alleles.groupby(["gene_group", "Chromosome", "alternative"])["Allele"]
+    .transform(lambda x: f"HLA-{'-'.join(x)}")
+    .drop_duplicates()
+)
 
-hlaII_alleles.to_csv(snakemake.output.hlaII, sep='\t', index=False, header=False)
+hlaII_alleles.to_csv(snakemake.output.hlaII, sep="\t", index=False, header=False)

From 51b17e15d38b59c7b4bfeb38af4823b0a76ea644 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 27 Jul 2022 20:08:20 +0000
Subject: [PATCH 109/191] initial suggestions from Till

---
 workflow/scripts/merge_data.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py
index ac5d21c1..23f92e37 100644
--- a/workflow/scripts/merge_data.py
+++ b/workflow/scripts/merge_data.py
@@ -4,10 +4,10 @@
 
 import pandas as pd
 
-def select_columns(mhc):
+def select_columns(mhc: pd.DataFrame) -> pd.DataFrame:
     rank_cols = [c for c in mhc.columns if "Rank" in c]
     affinity_cols = [c for c in mhc.columns if "nM" in c]
-    mhc_cols = ["Pos"] + ["ID"] + ["Peptide"] + rank_cols + affinity_cols + ["NB"]
+    mhc_cols = ["Pos", "ID", "Peptide"] + rank_cols + affinity_cols + ["NB"]
     mhc = mhc[mhc_cols]
     mhc["Rank_min"] = mhc[rank_cols].min(axis=1)
     mhc["Aff_min"] = mhc[affinity_cols].min(axis=1)
@@ -17,7 +17,7 @@ def select_columns(mhc):
     mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","")
     return mhc
 
-def merge(info, tumor, normal, outfile):
+def merge(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame:
     tumor = select_columns(tumor)
     normal = select_columns(normal)
     id_length = len(tumor.ID[0])
@@ -60,7 +60,7 @@ def merge(info, tumor, normal, outfile):
     ### Remove Duplicate kmers
     data = data.drop_duplicates(["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"])
 
-    data.to_csv(outfile, index=False, sep = '\t')
+    return data
 
 
 ## highlight the difference between mutated neopeptide and wildtype
@@ -83,7 +83,8 @@ def main():
     tumor = pd.read_csv(snakemake.input.neo, sep = '\t')
     normal = pd.read_csv(snakemake.input.normal, sep = '\t')
     outfile = snakemake.output[0]
-    merge(info, tumor, normal, outfile)
+    data = merge(info, tumor, normal)
+    data.to_csv(outfile, index=False, sep = '\t')
 
 if __name__ == '__main__':
     sys.exit(main())

From c75855d10e5e58822c759298ba3907d0898192d2 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 29 Jul 2022 08:55:32 +0000
Subject: [PATCH 110/191] further cleanup of config

---
 .test/config/config.yaml  | 11 +----------
 config/config.yaml        | 12 +-----------
 workflow/rules/common.smk |  2 +-
 3 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 8efb0dde..475a31f0 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -2,11 +2,10 @@ samples: "config/samples.tsv"
 units: "config/units.tsv"
 
 
-epitope_prediction:
+neoantigen_prediction:
   activate: true
 
 
-
 ref:
   # Number of chromosomes to consider for calling.
   # The first n entries of the FASTA will be considered.
@@ -19,14 +18,6 @@ ref:
   build: GRCh38
 
 
-annotations:
-  vep:
-    params: "--everything"
-    plugins:
-      # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
-      # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt".
-      - LoFtool
-
 params:
   microphaser:
     events:
diff --git a/config/config.yaml b/config/config.yaml
index a9978057..af13f61b 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -2,12 +2,10 @@ samples: "config/samples.tsv"
 units: "config/units.tsv"
 
 
-epitope_prediction:
+neoantigen_prediction:
   activate: true
 
 
-affinity:
-
 ref:
   # Number of chromosomes to consider for calling.
   # The first n entries of the FASTA will be considered.
@@ -20,14 +18,6 @@ ref:
   build: GRCh38
 
 
-annotations:
-  vep:
-    params: "--everything"
-    plugins:
-      # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
-      # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt".
-      - LoFtool
-
 params:
   microphaser:
     # window_len should be at least 3 times the longest peptide_len specified below
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 1e976731..e6262532 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -97,7 +97,7 @@ def get_final_output():
             (samples["group"] == group) & (samples["alias"].str.match("tumor")),
             "alias",
         ]
-        if config["epitope_prediction"]["activate"]:
+        if config["neoantigen_prediction"]["activate"]:
             sequencing_types = pd.unique(
                 units.loc[units["sample_name"].isin(smps), "sequencing_type"]
             )

From 6c6b2bd74f3bec7f765426d508f578bee8bffbe1 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 29 Jul 2022 09:03:13 +0000
Subject: [PATCH 111/191] further config cleanup, including schema

---
 .test/config/config.yaml            |  4 +--
 config/config.yaml                  |  4 +--
 workflow/rules/MHC_binding.smk      |  8 ++---
 workflow/schemas/config.schema.yaml | 55 ++++++++++++++---------------
 4 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 475a31f0..c5ed610c 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -26,7 +26,7 @@ params:
   netMHCpan:
     activate: true
     peptide_len: 9
-    params: "-BA -s"
+    extra: ""
     # Please download netMHCpan manually from:
     # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
     # To make the `netMHCpan` script work, you need to fix its first line in
@@ -37,7 +37,7 @@ params:
   netMHCIIpan:
     activate: false
     peptide_len: 15
-    params: "-BA -s"
+    extra: ""
     # Please download netMHCIIpan manually from:
     # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
     # To make the `netMHCIIpan` script work, you need to fix its first line in
diff --git a/config/config.yaml b/config/config.yaml
index af13f61b..29da7260 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -27,7 +27,7 @@ params:
   netMHCpan:
     activate: true
     peptide_len: 9
-    params: "-BA -s"
+    extra: ""
     # Please download netMHCpan manually from:
     # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
     # To make the `netMHCpan` script work, you need to fix its first line in
@@ -38,7 +38,7 @@ params:
   netMHCIIpan:
     activate: false
     peptide_len: 15
-    params: "-BA -s"
+    extra: ""
     # Please download netMHCIIpan manually from:
     # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
     # To make the `netMHCIIpan` script work, you need to fix its first line in
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index fbbf1708..5c596d76 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -27,7 +27,7 @@ rule netMHCpan:
     conda:
         "../envs/tcsh.yaml"
     params:
-        extra=config["params"]["netMHCpan"]["params"],
+        extra=config["params"]["netMHCpan"]["extra"],
         netMHC=config["params"]["netMHCpan"]["location"],
         length=config["params"]["netMHCpan"]["peptide_len"],
         alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
@@ -35,7 +35,7 @@ rule netMHCpan:
         "( "
         "if [ -s {input.peptides} ]; "
         "then "
-        "  {params.netMHC}/netMHCpan {params.extra} -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+        "  {params.netMHC}/netMHCpan {params.extra} -BA -s -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
         "else "
         "  touch {output}; "
         "fi "
@@ -53,7 +53,7 @@ rule netMHCIIpan:
     conda:
         "../envs/tcsh.yaml"
     params:
-        extra=config["params"]["netMHCIIpan"]["params"],
+        extra=config["params"]["netMHCIIpan"]["extra"],
         netMHC=config["params"]["netMHCIIpan"]["location"],
         length=config["params"]["netMHCIIpan"]["peptide_len"],
         alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
@@ -61,7 +61,7 @@ rule netMHCIIpan:
         "( "
         "if [ -s {input.peptides} ]; "
         "then "
-        "  {params.netMHC}/netMHCIIpan {params.extra} -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+        "  {params.netMHC}/netMHCIIpan {params.extra} -BA -s -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
         "else "
         "  touch {output}; "
         "fi "
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 8b960444..dad1a71e 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -42,7 +42,7 @@ properties:
       - build
       - n_chromosomes
 
-  epitope_prediction:
+  neoantigen_prediction:
     type: object
     properties:
       activate:
@@ -51,36 +51,12 @@ properties:
   affinity:
     type: object
     properties:
-      netMHCpan:
-        type: object
-        properties:
-          activate:
-            type: boolean
-          params:
-            type: string
-      netMHCIIpan:
-        type: object
-        properties:
-          activate:
-            type: boolean
-          params:
-            type: string
-
   params:
     type: object
     properties:
       microphaser:
         type: object
         properties:
-          window_len:
-            type: integer
-          peptide_len:
-            type: object
-            properties:
-              netMHCpan:
-                type: integer
-              netMHCIIpan:
-                type: integer
           variant_sets:
             type: object
             properties:
@@ -91,12 +67,35 @@ properties:
             required:
               - normal
               - tumor
+      netMHCpan:
+        type: object
+        properties:
+          activate:
+            type: boolean
+          peptide_len:
+            type: integer
+          extra:
+            type: string
+          location:
+            type: string
+        required:
+          - activate
+      netMHCIIpan:
+        type: object
+        properties:
+          activate:
+            type: boolean
+          peptide_len:
+            type: integer
+          extra:
+            type: string
         required:
-          - window_len
-          - peptide_len
-          - variant_sets
+          - activate
     required:
       - microphaser
+      - netMHCpan
+      - netMHCIIpan
+
 
 required:
   - samples

From 64fc352fa28816d4346c90e06c73138a5fb12957 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 29 Jul 2022 09:03:46 +0000
Subject: [PATCH 112/191] remove traces of attempted mhcflurry implementation

---
 workflow/rules/MHC_binding.smk      | 44 -------------------
 workflow/scripts/merge_mhcflurry.py | 66 -----------------------------
 2 files changed, 110 deletions(-)
 delete mode 100644 workflow/scripts/merge_mhcflurry.py

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 5c596d76..50b98a6b 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -1,21 +1,3 @@
-# rule mhcflurry:
-#     input:
-#         peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{contig}.{peptide_type}.fa",
-#         alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv",
-#         wt_alleles=get_germline_optitype
-#     output:
-#         "results/mhcflurry/{sample}/{contig}/output.{peptide_type}.csv"
-#     log:
-#         "logs/mhcflurry/{sample}-{contig}-{peptide_type}.log"
-#     run:
-#         if "wt" in input.peptides:
-#             alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0])
-#         else:
-#             alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0])
-#         cmd = "if [ -s {input.peptides} ]; then mhctools --mhc-predictor mhcflurry --mhc-alleles {alleles} --input-fasta-file {input.peptides} --output-csv {output} > {log}; else touch {output}; fi"
-#         shell(cmd)
-
-
 rule netMHCpan:
     input:
         peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa",
@@ -82,21 +64,6 @@ rule parse_mhc_out:
         "../scripts/group_mhc_output.py"
 
 
-# rule parse_mhcflurry:
-#     input:
-#         expand("results/mhcflurry/{{sample}}/{contig}/output.{{peptide_type}}.csv", contig=contigs)
-#     output:
-#         "results/mhcflurry/{sample}/{sample}.mhc.{peptide_type}.csv"
-#     wildcard_constraints:
-#         group="wt|mt"
-#     log:
-#         "logs/parse-mhc/mhcflurry-{sample}-{peptide_type}.log"
-#     conda:
-#         "../envs/xsv.yaml"
-#     shell:
-#         "xsv cat rows -d ',' {input} | cut --complement -f2,7,8 > {output}"
-
-
 rule mhc_csv_table:
     input:
         info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv",
@@ -114,17 +81,6 @@ rule mhc_csv_table:
         "../scripts/merge_data.py"
 
 
-# rule mhcflurry_table:
-#     input:
-#         info="results/microphaser/info/{sample}/filtered/mhcflurry/{sample}.tsv",
-#         neo="results/mhcflurry/{sample}/{sample}.mhc.neo.tsv",
-#         normal="results/mhcflurry/{sample}/{sample}.mhc.normal.tsv"
-#     output:
-#         report("results/neoantigens/mhcflurry/{sample}.WES.tsv", caption="../report/WES_results.rst", category="Results WES (MHCFlurry)")
-#     script:
-#         "../scripts/merge_mhcflurry.py"
-
-
 rule add_RNA_info:
     input:
         counts="results/kallisto/{group}.{tumor_alias}",
diff --git a/workflow/scripts/merge_mhcflurry.py b/workflow/scripts/merge_mhcflurry.py
deleted file mode 100644
index c0098cb9..00000000
--- a/workflow/scripts/merge_mhcflurry.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import sys
-
-sys.stderr = open(snakemake.log[0], "w")
-
-import os
-import pandas as pd
-import numpy as np
-
-
-## highlight the difference between mutated neopeptide and wildtype
-def diffEpitope(e1,e2):
-    if str(e2) == 'nan':
-        return(e1)
-    e1 = str(e1)
-    e2 = str(e2)
-    diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]]
-    e_new = e1
-    e2_new = e2
-    for p in diff_pos:
-        e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:]
-        e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:]
-    return(e_new)
-
-info = pd.read_csv(snakemake.input[0])
-tumor = pd.read_csv(snakemake.input[1])
-normal = pd.read_csv(snakemake.input[2])
-outfile = snakemake.output[0]
-
-
-tumor = tumor[["source_sequence_name","peptide","allele","affinity","percentile_rank"]]
-tumor = tumor.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index()
-tumor.columns = tumor.columns.map("-".join)
-tumor = tumor.rename(columns={col: col.replace("-","") for col in tumor.columns if col.endswith("-")})
-
-normal = normal[["source_sequence_name","peptide","allele","affinity","percentile_rank"]]
-normal = normal.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index()
-normal.columns = normal.columns.map("-".join)
-normal = normal.rename(columns={col: col.replace("-","") for col in normal.columns if col.endswith("-")})
-
-merged = tumor.merge(normal, how="left", on=["source_sequence_name"])
-
-merged = merged.rename(columns={col: col.replace("_y","_normal") for col in merged.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged.columns})
-## add info
-info = info.rename(columns={"id":"ID","gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"})
-merged_dataframe = merged.merge(info, how="left", left_on="source_sequence_name", right_on="ID")
-
-merged_dataframe["peptide_tumor"]=merged_dataframe[["peptide_tumor","peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1)
-## Are all possible variants in the peptide ("Cis") or not ("Trans")
-merged_dataframe["Variant_Orientation"] = "Cis"
-trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar
-merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans"
-
-## check misssense/silent mutation status
-nonsilent = merged_dataframe.peptide_tumor != merged_dataframe.peptide_normal
-merged_dataframe = merged_dataframe[nonsilent]
-data = merged_dataframe.drop_duplicates(subset=["Gene_ID","offset","peptide_tumor","Somatic_AminoAcid_Change"])
-
-### Delete Stop-Codon including peptides
-data = data[data.peptide_tumor.str.count("x") == 0]
-data = data[data.peptide_tumor.str.count("X") == 0]
-data.to_csv(outfile, index=False, sep = '\t')
-
-
-
-
-

From 31b9bddd025b595b92ebae204b8fb7eb3d5f78d3 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Sat, 30 Jul 2022 19:09:30 +0000
Subject: [PATCH 113/191] update microphaser to bug-fixed `v0.5.0`

---
 workflow/envs/microphaser.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml
index 3237608c..00657086 100644
--- a/workflow/envs/microphaser.yaml
+++ b/workflow/envs/microphaser.yaml
@@ -2,4 +2,4 @@ channels:
   - bioconda
   - conda-forge
 dependencies:
-  - microphaser =0.4
+  - microphaser =0.5

From dfbf9a37def516dc4c8dd7a57d6b6b7e5cf91633 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Sat, 30 Jul 2022 19:13:36 +0000
Subject: [PATCH 114/191] use bioconda `hla-la` recipe

---
 workflow/envs/hla_la.yaml | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/workflow/envs/hla_la.yaml b/workflow/envs/hla_la.yaml
index 4b94aaef..6a24bc29 100644
--- a/workflow/envs/hla_la.yaml
+++ b/workflow/envs/hla_la.yaml
@@ -1,10 +1,5 @@
 channels:
   - conda-forge
   - bioconda
-  - jafors
 dependencies:
-  - hla-la ==1.0.5
-  - samtools ==1.10
-  - bamtools ==2.5.1
-  - boost-cpp ==1.74.0
-  - r-base =4
+  - hla-la ==1.0.3

From fc80d5c9c35d07a55c2fe529b5e1ebf5738c2c63 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 2 Aug 2022 19:11:18 +0000
Subject: [PATCH 115/191] rename group_mhc_output.py to clean_mhc_output.py and
 consistently use pandas for parsing

---
 workflow/rules/MHC_binding.smk       |  8 ++---
 workflow/scripts/clean_mhc_output.py | 54 ++++++++++++++++++++++++++++
 workflow/scripts/group_mhc_output.py | 31 ----------------
 3 files changed, 58 insertions(+), 35 deletions(-)
 create mode 100644 workflow/scripts/clean_mhc_output.py
 delete mode 100644 workflow/scripts/group_mhc_output.py

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 50b98a6b..5a0ff9bc 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -50,18 +50,18 @@ rule netMHCIIpan:
         " ) 2> {log}"
 
 
-rule parse_mhc_out:
+rule clean_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.xls",
+            "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv",
             contig=contigs,
         ),
     output:
-        "results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv",
+        joined_mhc_out="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv",
     log:
         "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log",
     script:
-        "../scripts/group_mhc_output.py"
+        "../scripts/clean_mhc_output.py"
 
 
 rule mhc_csv_table:
diff --git a/workflow/scripts/clean_mhc_output.py b/workflow/scripts/clean_mhc_output.py
new file mode 100644
index 00000000..fcf360a6
--- /dev/null
+++ b/workflow/scripts/clean_mhc_output.py
@@ -0,0 +1,54 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import os
+import pandas as pd
+# assumptions of this script about netMHCpan or netMHCIIpan:
+# * version 4.1
+# * output generated via `-xls` option
+# * generated with the `-BA` option to include binding affinity prediction
+
+# The mapping of index column names used here to original names in netMHCpan files is:
+INDEX_NAMES = {'Pos': 'position_in_protein_sequence', 'Peptide': 'peptide_sequence', 'ID': 'peptide_ID', 'Ave': 'average_el_score', 'NB': 'number_of_binders'}
+# The mapping of column names used here to original names in netMHCpan files is:
+COLUMN_NAMES = {'BA-score':  'binding_affinity_score', 'BA_Rank': 'binding_affinity_percent_rank', 'EL-score': 'elution_ligang_score', 'EL_Rank': 'elution_ligand_percent_rank' , 'core': 'binding_core', 'icore': 'interaction_core'}
+
+def parse_file(mhc_in: FileIO):
+    """
+    Parse an netMHCpan or netMHCIIpan output file from the `-xls -xlsfile <filename>`
+    directive into a tidy pandas data frame.
+    """
+    if os.path.getsize(mhc_in) == 0:
+        # Short-circuit empty files, but generate correct header.
+        return pd.DataFrame( columns = list(COLUMN_NAMES.values()) + ["allele"] + list(INDEX_NAMES.values()) )
+    
+    # It's a compound header over two rows and a compound row index in the initial
+    # three and final two columns of the table. For some reason, the final two
+    # columns are added to the index but not removed from the table, so we do this
+    # manually with `.iloc[]``.
+    data = pd.read_csv(mhc_in, sep="\t", header = [0, 1], index_col=[0, 1, 2, -2, -1]).iloc[:, :-2]
+
+    # With two lines of header parsed into a MultiIndex, pandas only uses the
+    # first column name in index_col as an entry. Obviously the following code
+    # assumes that these are the first three and last two columns of the data file.
+    data.index.names = list(INDEX_NAMES.values())
+
+    # Entries of the header MultiIndex need to be fixed, there doesn't seem to be
+    # any way to automatically do this during read_csv.
+    cols = pd.DataFrame(data.columns.to_list(), columns = ['allele', 'info'])
+
+    # fix up the columns and reassign
+    cols.loc[cols['allele'].str.endswith("_level_0"), 'allele'] = pd.NA
+    cols = cols.fillna(method="ffill")
+    data.columns = pd.MultiIndex.from_frame(cols)
+    
+    # Turn into longer table with one HLA Allele per row instead of MultiIndex
+    # header, rename columns to something readable and turn index into columns.
+    data = data.stack(level='allele').rename(columns = COLUMN_NAMES).reset_index()
+
+    return data
+
+all_data = pd.concat((parse_file(f) for f in snakemake.input), axis='index')
+
+all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False)
\ No newline at end of file
diff --git a/workflow/scripts/group_mhc_output.py b/workflow/scripts/group_mhc_output.py
deleted file mode 100644
index fda5b360..00000000
--- a/workflow/scripts/group_mhc_output.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import sys
-
-sys.stderr = open(snakemake.log[0], "w")
-
-import os
-import pandas as pd
-import numpy as np
-
-first = True
-out = open(snakemake.output[0], "w")
-for e in snakemake.input:
-    if os.path.getsize(e) > 0:
-        mhcout = open(e, 'r')
-        alleles = next(mhcout).split('\t')
-        header = next(mhcout).rstrip().split('\t')
-        if first:
-            allele = ''
-            for i in range(0, len(header)):
-                #print(header[i].rstrip())
-                if i < len(alleles):
-                    #print(alleles[i])
-                    if alleles[i] != '':
-                        allele = alleles[i].rstrip() + '_'
-                header[i] = allele + header[i].rstrip()
-            header[len(header) -1] = "NB"
-            first = False
-            #print(header)
-            out.write('\t'.join(header) + '\n')
-        for line in mhcout:
-            out.write(line)
-out.close()

From 4be7c35584b83bde0582ac4fd7fa6d58e3dfa5d9 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 2 Aug 2022 19:11:53 +0000
Subject: [PATCH 116/191] black clean_mhc_output.py

---
 workflow/scripts/clean_mhc_output.py | 44 +++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/workflow/scripts/clean_mhc_output.py b/workflow/scripts/clean_mhc_output.py
index fcf360a6..8ab28fbe 100644
--- a/workflow/scripts/clean_mhc_output.py
+++ b/workflow/scripts/clean_mhc_output.py
@@ -4,15 +4,30 @@
 
 import os
 import pandas as pd
+
 # assumptions of this script about netMHCpan or netMHCIIpan:
 # * version 4.1
 # * output generated via `-xls` option
 # * generated with the `-BA` option to include binding affinity prediction
 
 # The mapping of index column names used here to original names in netMHCpan files is:
-INDEX_NAMES = {'Pos': 'position_in_protein_sequence', 'Peptide': 'peptide_sequence', 'ID': 'peptide_ID', 'Ave': 'average_el_score', 'NB': 'number_of_binders'}
+INDEX_NAMES = {
+    "Pos": "position_in_protein_sequence",
+    "Peptide": "peptide_sequence",
+    "ID": "peptide_ID",
+    "Ave": "average_el_score",
+    "NB": "number_of_binders",
+}
 # The mapping of column names used here to original names in netMHCpan files is:
-COLUMN_NAMES = {'BA-score':  'binding_affinity_score', 'BA_Rank': 'binding_affinity_percent_rank', 'EL-score': 'elution_ligang_score', 'EL_Rank': 'elution_ligand_percent_rank' , 'core': 'binding_core', 'icore': 'interaction_core'}
+COLUMN_NAMES = {
+    "BA-score": "binding_affinity_score",
+    "BA_Rank": "binding_affinity_percent_rank",
+    "EL-score": "elution_ligang_score",
+    "EL_Rank": "elution_ligand_percent_rank",
+    "core": "binding_core",
+    "icore": "interaction_core",
+}
+
 
 def parse_file(mhc_in: FileIO):
     """
@@ -21,13 +36,19 @@ def parse_file(mhc_in: FileIO):
     """
     if os.path.getsize(mhc_in) == 0:
         # Short-circuit empty files, but generate correct header.
-        return pd.DataFrame( columns = list(COLUMN_NAMES.values()) + ["allele"] + list(INDEX_NAMES.values()) )
-    
+        return pd.DataFrame(
+            columns=list(COLUMN_NAMES.values())
+            + ["allele"]
+            + list(INDEX_NAMES.values())
+        )
+
     # It's a compound header over two rows and a compound row index in the initial
     # three and final two columns of the table. For some reason, the final two
     # columns are added to the index but not removed from the table, so we do this
     # manually with `.iloc[]``.
-    data = pd.read_csv(mhc_in, sep="\t", header = [0, 1], index_col=[0, 1, 2, -2, -1]).iloc[:, :-2]
+    data = pd.read_csv(
+        mhc_in, sep="\t", header=[0, 1], index_col=[0, 1, 2, -2, -1]
+    ).iloc[:, :-2]
 
     # With two lines of header parsed into a MultiIndex, pandas only uses the
     # first column name in index_col as an entry. Obviously the following code
@@ -36,19 +57,20 @@ def parse_file(mhc_in: FileIO):
 
     # Entries of the header MultiIndex need to be fixed, there doesn't seem to be
     # any way to automatically do this during read_csv.
-    cols = pd.DataFrame(data.columns.to_list(), columns = ['allele', 'info'])
+    cols = pd.DataFrame(data.columns.to_list(), columns=["allele", "info"])
 
     # fix up the columns and reassign
-    cols.loc[cols['allele'].str.endswith("_level_0"), 'allele'] = pd.NA
+    cols.loc[cols["allele"].str.endswith("_level_0"), "allele"] = pd.NA
     cols = cols.fillna(method="ffill")
     data.columns = pd.MultiIndex.from_frame(cols)
-    
+
     # Turn into longer table with one HLA Allele per row instead of MultiIndex
     # header, rename columns to something readable and turn index into columns.
-    data = data.stack(level='allele').rename(columns = COLUMN_NAMES).reset_index()
+    data = data.stack(level="allele").rename(columns=COLUMN_NAMES).reset_index()
 
     return data
 
-all_data = pd.concat((parse_file(f) for f in snakemake.input), axis='index')
 
-all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False)
\ No newline at end of file
+all_data = pd.concat((parse_file(f) for f in snakemake.input), axis="index")
+
+all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False)

From d9e1f0f9dc7f177b7f89a2516a8e5f9b1151c6d5 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 3 Aug 2022 08:33:01 +0000
Subject: [PATCH 117/191] rename clean_mhc_output to tidy_mhc_output

---
 workflow/rules/MHC_binding.smk                               | 2 +-
 workflow/scripts/{clean_mhc_output.py => tidy_mhc_output.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename workflow/scripts/{clean_mhc_output.py => tidy_mhc_output.py} (100%)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 5a0ff9bc..4e8de82d 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -50,7 +50,7 @@ rule netMHCIIpan:
         " ) 2> {log}"
 
 
-rule clean_mhc_out:
+rule tidy_mhc_out:
     input:
         expand(
             "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv",
diff --git a/workflow/scripts/clean_mhc_output.py b/workflow/scripts/tidy_mhc_output.py
similarity index 100%
rename from workflow/scripts/clean_mhc_output.py
rename to workflow/scripts/tidy_mhc_output.py

From a105c04af24ed36107ec817a513e30c325c432f1 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 3 Aug 2022 08:33:53 +0000
Subject: [PATCH 118/191] rename netMHCpan and netMHCIIpan -xls output to
 `.tsv` to reflect that these are actually tab separated plain text files

---
 workflow/rules/MHC_binding.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 4e8de82d..b1418d4f 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -3,7 +3,7 @@ rule netMHCpan:
         peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls",
+        "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
     log:
         "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     conda:
@@ -29,7 +29,7 @@ rule netMHCIIpan:
         peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCIIpan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls",
+        "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
     log:
         "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     conda:

From ea38fe891ca2ae0d6ad6b6d93507e2df697e5b13 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 3 Aug 2022 09:09:46 +0000
Subject: [PATCH 119/191] rename parse_HLA_types to
 parse_and_filter_hla_alleles_for_netmhc

---
 workflow/rules/HLAtyping.smk                                  | 4 ++--
 ...LA_types.py => parse_and_filter_hla_alleles_for_netmhc.py} | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename workflow/scripts/{parse_HLA_types.py => parse_and_filter_hla_alleles_for_netmhc.py} (100%)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 5482e871..ff10fe49 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -21,7 +21,7 @@ rule HLA_LA:
         "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1"
 
 
-rule parse_HLA_LA:
+rule parse_and_filter_hla_alleles_for_netmhc:
     input:
         hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
     output:
@@ -38,4 +38,4 @@ rule parse_HLA_LA:
     log:
         "logs/parse-HLA-LA/{group}.{alias}.log",
     script:
-        "../scripts/parse_HLA_types.py"
+        "../scripts/parse_and_filter_hla_alleles_for_netmhc.py"
diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py
similarity index 100%
rename from workflow/scripts/parse_HLA_types.py
rename to workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py

From 6312d2e952b939ce89e44322efb9f9f2165ce8f4 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 3 Aug 2022 13:14:25 +0000
Subject: [PATCH 120/191] use net_mhc_pan and net_mhc_two_pan for better
 readability and consistency wherever possible

---
 .test/config/config.yaml            |  4 ++--
 config/config.yaml                  |  4 ++--
 workflow/rules/HLAtyping.smk        | 28 ++++++++++++++++++++++++++++
 workflow/rules/MHC_binding.smk      | 28 ++++++++++++++--------------
 workflow/rules/common.smk           |  8 ++++----
 workflow/rules/microphaser.smk      |  4 ++--
 workflow/schemas/config.schema.yaml |  8 ++++----
 7 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index c5ed610c..bcd1a88c 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -23,7 +23,7 @@ params:
     events:
       tumor: "tumor_only"
       normal: "normal_only"
-  netMHCpan:
+  net_mhc_pan:
     activate: true
     peptide_len: 9
     extra: ""
@@ -34,7 +34,7 @@ params:
     # the conda-provided tcsh installation, it needs to read (without quotes):
     # "#!/usr/bin/env tcsh"
     location: "../netMHCpan-4.1"
-  netMHCIIpan:
+  net_mhc_two_pan:
     activate: false
     peptide_len: 15
     extra: ""
diff --git a/config/config.yaml b/config/config.yaml
index 29da7260..a9c006cd 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -24,7 +24,7 @@ params:
     variant_sets:
       normal: "normal_only"
       tumor: "tumor_only"
-  netMHCpan:
+  net_mhc_pan:
     activate: true
     peptide_len: 9
     extra: ""
@@ -35,7 +35,7 @@ params:
     # the conda-provided tcsh installation, it needs to read (without quotes):
     # "#!/usr/bin/env tcsh"
     location: "../netMHCpan-4.1"
-  netMHCIIpan:
+  net_mhc_two_pan:
     activate: false
     peptide_len: 15
     extra: ""
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index ff10fe49..97e3b0ec 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -21,9 +21,37 @@ rule HLA_LA:
         "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1"
 
 
+rule net_mhc_pan_alleles:
+    output:
+        mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt",
+    conda:
+        "../envs/tcsh.yaml"
+    log:
+        "logs/net_mhc_pan/available_alleles.net_mhc_pan.log",
+    params:
+        net_mhc=config["params"]["net_mhc_pan"]["location"],
+    shell:
+        "{params.net_mhc}/net_mhc_pan -listMHC > {output.mhc_one_alleles} 2> {log}"
+
+
+rule net_mhc_two_pan_alleles:
+    output:
+        mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt",
+    conda:
+        "../envs/tcsh.yaml"
+    log:
+        "logs/net_mhc_pan/available_alleles.net_mhc_two_pan.log",
+    params:
+        net_mhc=config["params"]["net_mhc_two_pan"]["location"],
+    shell:
+        "{params.net_mhc}/net_mhc_two_pan -list > {output.mhc_two_alleles} 2> {log}"
+
+
 rule parse_and_filter_hla_alleles_for_netmhc:
     input:
         hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
+        mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt",
+        mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt",
     output:
         hlaI=report(
             "results/HLA-LA/{group}.{alias}.hlaI.tsv",
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index b1418d4f..33154e72 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -1,17 +1,17 @@
-rule netMHCpan:
+rule net_mhc_pan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_pan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCI,
     output:
-        "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
+        "results/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
     log:
-        "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+        "logs/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     conda:
         "../envs/tcsh.yaml"
     params:
-        extra=config["params"]["netMHCpan"]["extra"],
-        netMHC=config["params"]["netMHCpan"]["location"],
-        length=config["params"]["netMHCpan"]["peptide_len"],
+        extra=config["params"]["net_mhc_pan"]["extra"],
+        netMHC=config["params"]["net_mhc_pan"]["location"],
+        length=config["params"]["net_mhc_pan"]["peptide_len"],
         alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
     shell:
         "( "
@@ -24,20 +24,20 @@ rule netMHCpan:
         " ) 2> {log}"
 
 
-rule netMHCIIpan:
+rule net_mhc_two_pan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCIIpan.{contig}.{peptide_type}.fa",
+        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_two_pan.{contig}.{peptide_type}.fa",
         alleles=get_alleles_MHCII,
     output:
-        "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
+        "results/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
     log:
-        "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+        "logs/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     conda:
         "../envs/tcsh.yaml"
     params:
-        extra=config["params"]["netMHCIIpan"]["extra"],
-        netMHC=config["params"]["netMHCIIpan"]["location"],
-        length=config["params"]["netMHCIIpan"]["peptide_len"],
+        extra=config["params"]["net_mhc_two_pan"]["extra"],
+        netMHC=config["params"]["net_mhc_two_pan"]["location"],
+        length=config["params"]["net_mhc_two_pan"]["peptide_len"],
         alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
     shell:
         "( "
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index e6262532..9c928417 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -110,11 +110,11 @@ def get_final_output():
                         filter(
                             None,
                             [
-                                "netMHCpan"
-                                if is_activated("params/netMHCpan")
+                                "net_mhc_pan"
+                                if is_activated("params/net_mhc_pan")
                                 else None,
-                                "netMHCIIpan"
-                                if is_activated("params/netMHCIIpan")
+                                "net_mhc_two_pan"
+                                if is_activated("params/net_mhc_two_pan")
                                 else None,
                             ],
                         )
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 86af23d0..a57340ad 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -77,7 +77,7 @@ rule microphaser_tumor:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3,
+        window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3,
     shell:
         "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
         "< {input.track} > {output.mt_fasta} 2> {log}"
@@ -102,7 +102,7 @@ rule microphaser_normal:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3,
+        window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3,
     shell:
         "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
         "< {input.track} > {output.wt_fasta} 2> {log}"
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index dad1a71e..1186fc44 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -67,7 +67,7 @@ properties:
             required:
               - normal
               - tumor
-      netMHCpan:
+      net_mhc_pan:
         type: object
         properties:
           activate:
@@ -80,7 +80,7 @@ properties:
             type: string
         required:
           - activate
-      netMHCIIpan:
+      net_mhc_two_pan:
         type: object
         properties:
           activate:
@@ -93,8 +93,8 @@ properties:
           - activate
     required:
       - microphaser
-      - netMHCpan
-      - netMHCIIpan
+      - net_mhc_pan
+      - net_mhc_two_pan
 
 
 required:

From fff90dc747aa3d70ff038268b43748e5f32cc79f Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 3 Aug 2022 13:15:05 +0000
Subject: [PATCH 121/191] automatically filter HLA-LA alleles down to those
 that netMHC can handle

---
 ...parse_and_filter_hla_alleles_for_netmhc.py | 107 +++++++++++++-----
 1 file changed, 81 insertions(+), 26 deletions(-)

diff --git a/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py
index c93a943e..79c6994c 100644
--- a/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py
+++ b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py
@@ -4,15 +4,57 @@
 
 import pandas as pd
 
-# To know which alleles netMHCpan can handle, use its -listMHC option.
-HLAI = {"A", "B", "C", "E", "G"}
+# # read in available alleles
 
-# To know which alleles netMHCIIpan can handle, use its -list option.
+
+def read_allele_list(filename: str):
+    with open(filename, "r") as alleles_in:
+        alleles = set()
+        for line in alleles_in:
+            if not (line.startswith("#") or line == "\n"):
+                alleles.add(line.strip())
+        return alleles
+
+
+HLA_SUFFIXES_REGEX = r"[NLSCAQ]?"
+
+# netMHCpan alleles and loci
+HLA_ONE_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_one_alleles)
+hla_one_net_mhc_alleles = pd.Series(list(HLA_ONE_NET_MHC_ALLELES))
+HLA_ONE_LOCI = set(
+    hla_one_net_mhc_alleles[hla_one_net_mhc_alleles.str.startswith("HLA-")]
+    .str.replace(r"HLA-([A-Z])[\d:]+" + HLA_SUFFIXES_REGEX, r"\1", regex=True)
+    .drop_duplicates()
+)
+
+
+# netMHCIIpan alleles and loci
+HLA_TWO_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_two_alleles)
+hla_two_net_mhc_alleles = pd.Series(list(HLA_TWO_NET_MHC_ALLELES))
 # DRB alleles need to be formatted differently from DP and DQ alleles,
-# so we specify them separately.
-DRB = {"DRB1", "DRB3", "DRB4", "DRB5"}
-ALPHA_BETA = {"DPA1", "DPB1", "DQA1", "DQB1"}
+# so we extract them separately.
+DRB_LOCI = set(
+    hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("DRB")]
+    .str.replace(r"(DRB\d)_\d+" + HLA_SUFFIXES_REGEX, r"\1", regex=True)
+    .drop_duplicates()
+)
+
+ALPHA_BETA_LOCI = set(
+    hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("HLA-")]
+    .str.replace(
+        r"HLA-(D[A-Z]A\d)\d+"
+        + HLA_SUFFIXES_REGEX
+        + r"-(D[A-Z]B\d)\d+"
+        + HLA_SUFFIXES_REGEX,
+        r"\1_\2",
+        regex=True,
+    )
+    .str.split("_")
+    .explode()
+    .drop_duplicates()
+)
 
+# read in alleles as determined by HLA-LA
 hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep="\t")
 
 # the Allele column can contain multiple ";"-separated entries for the
@@ -26,52 +68,63 @@
 # it needs to be in the format of the first column of the above list, as explained in
 # the "Instructions" tab under "MHC SELECTION" point "2)" at:
 # * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
-hlaI_alleles = (
-    hlas.loc[hlas["Locus"].isin(HLAI), "Allele"]
-    .str.replace(r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True)
+hla_one_alleles = (
+    hlas.loc[hlas["Locus"].isin(HLA_ONE_LOCI), "Allele"]
+    .str.replace(
+        r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+        r"HLA-\1\2:\3\5",
+        regex=True,
+    )
     .drop_duplicates()
 )
-hlaI_alleles.to_csv(snakemake.output.hlaI, sep="\t", index=False, header=False)
+hla_one_alleles[hla_one_alleles.isin(HLA_ONE_NET_MHC_ALLELES)].to_csv(
+    snakemake.output.hlaI, sep="\t", index=False, header=False
+)
 
 # reformat to netMHCIIpan allele list format:
-# * https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/alleles_name.list
+# * https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/alleles_name.list
 # contrary to the format in that list, alleles actually need to be formatted like this,
 # with <gene>s found in the HLA-LA "Locus" column and syntax for the sub-numbering (only
 # the 1st and 2nd sub-number are used) according to the official nomenclature (see:
-# https://ars.els-cdn.com/content/image/1-s2.0-S0006497120405555-gr2.jpg ):
+# http://www.hla.alleles.org/nomenclature/naming.html ):
 # * DRB alleles: "<gene>_<allele_group><specific_HLA_protein>"
 # * DP and DQ alleles (alpha means A and beta means B in the gene name, for example DPA):
 #   "HLA-<alpha_gene><allele_group><specific_HLA_protein>-<beta_gene><allele_group><specific_HLA_protein>"
 # This format was determined by manually selecting combinations above the
 # "type a list of molecules names" field of the "Submission" tab at:
-# * https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+# * https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
 
 # TODO: check whether Jan's previous parsing of DRB alleles into this format is necessary:
 # * example: DRB1_1501-DRB30101-DRB40301N
 # * "DRB1_<allele_group><specific_HLA_protein>-DRB3<allele_group><specific_HLA_protein>-DRB4<allele_group><specific_HLA_protein>"
-drb_alleles = hlas.loc[hlas["Locus"].isin(DRB)]
-hlaII_alleles = (
+drb_alleles = hlas.loc[hlas["Locus"].isin(DRB_LOCI)]
+hla_two_alleles = (
     drb_alleles["Allele"]
-    .str.replace(r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True)
+    .str.replace(
+        r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+        r"\1_\2\3\5",
+        regex=True,
+    )
     .drop_duplicates()
 )
 
 # handle alleles where a combination of alpha and beta always exists
-alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA)]
+alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA_LOCI)]
 alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace(
-    r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True
+    r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+    r"\1\2\3\5",
+    regex=True,
 )
 # we need a variable to group alpha and beta of the same gene combination together
-alpha_beta_alleles["gene_group"] = alpha_beta_alleles["Locus"].str[0:2]
+alpha_beta_alleles.loc[:, "gene_group"] = alpha_beta_alleles["Locus"].str.replace(
+    r"(D[A-Z])[AB]\d", r"\1", regex=True
+)
 # we need to handle cases where we had multiple allele entries in an
 # alpha or beta locus, adding in a duplicate of the corresponding locus
-select_mult_all = alpha_beta_alleles["alternative"] > 0
-select_dpq_loci = alpha_beta_alleles["Locus"].str.startswith("D[PQ]")
-mult_all_per_loc_selection = select_mult_all & select_dpq_loci
 alleles_to_duplicate = alpha_beta_alleles.loc[
-    mult_all_per_loc_selection,
+    alpha_beta_alleles["alternative"] > 0,
     ["Locus", "Chromosome", "alternative"],
-].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}})
+].replace(regex={"Locus": {r"(D[A-Z])A(\d)": r"\1B\2", r"(D[A-Z])B(\d)": r"\1A\2"}})
 alleles_to_insert = alleles_to_duplicate.merge(
     alpha_beta_alleles.drop("alternative", axis="columns"),
     on=["Locus", "Chromosome"],
@@ -81,10 +134,12 @@
     [alpha_beta_alleles, alleles_to_insert]
 ).drop_duplicates()
 
-hlaII_alleles = hlaII_alleles.append(
+hla_two_alleles = hla_two_alleles.append(
     alpha_beta_alleles.groupby(["gene_group", "Chromosome", "alternative"])["Allele"]
     .transform(lambda x: f"HLA-{'-'.join(x)}")
     .drop_duplicates()
 )
 
-hlaII_alleles.to_csv(snakemake.output.hlaII, sep="\t", index=False, header=False)
+hla_two_alleles[hla_two_alleles.isin(HLA_TWO_NET_MHC_ALLELES)].to_csv(
+    snakemake.output.hlaII, sep="\t", index=False, header=False
+)

From fd999dea825794885e0a6a452bdee5492c111801 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 3 Aug 2022 13:22:23 +0000
Subject: [PATCH 122/191] rename merge_data.py to merge_neoantigen_info.py

---
 workflow/rules/MHC_binding.smk                               | 4 ++--
 workflow/scripts/{merge_data.py => merge_neoantigen_info.py} | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename workflow/scripts/{merge_data.py => merge_neoantigen_info.py} (100%)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 33154e72..7934237b 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -64,7 +64,7 @@ rule tidy_mhc_out:
         "../scripts/clean_mhc_output.py"
 
 
-rule mhc_csv_table:
+rule merge_neoantigen_info:
     input:
         info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv",
         neo="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.neo.tsv",
@@ -78,7 +78,7 @@ rule mhc_csv_table:
     log:
         "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
     script:
-        "../scripts/merge_data.py"
+        "../scripts/merge_neoantigen_info.py"
 
 
 rule add_RNA_info:
diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_neoantigen_info.py
similarity index 100%
rename from workflow/scripts/merge_data.py
rename to workflow/scripts/merge_neoantigen_info.py

From 457870428ed278fb55e42c3ee192825f998cad94 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 3 Aug 2022 13:25:13 +0000
Subject: [PATCH 123/191] fix erroneous substitutions

---
 workflow/rules/HLAtyping.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 97e3b0ec..179a2c67 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -31,7 +31,7 @@ rule net_mhc_pan_alleles:
     params:
         net_mhc=config["params"]["net_mhc_pan"]["location"],
     shell:
-        "{params.net_mhc}/net_mhc_pan -listMHC > {output.mhc_one_alleles} 2> {log}"
+        "{params.net_mhc}/netMHCpan -listMHC > {output.mhc_one_alleles} 2> {log}"
 
 
 rule net_mhc_two_pan_alleles:
@@ -44,7 +44,7 @@ rule net_mhc_two_pan_alleles:
     params:
         net_mhc=config["params"]["net_mhc_two_pan"]["location"],
     shell:
-        "{params.net_mhc}/net_mhc_two_pan -list > {output.mhc_two_alleles} 2> {log}"
+        "{params.net_mhc}/netMHCIIpan -list > {output.mhc_two_alleles} 2> {log}"
 
 
 rule parse_and_filter_hla_alleles_for_netmhc:

From 8f40088013e597463f998136986d511cd45027b7 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 4 Aug 2022 08:24:11 +0000
Subject: [PATCH 124/191] fix changed script name

---
 workflow/rules/MHC_binding.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index 7934237b..f6163b26 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -61,7 +61,7 @@ rule tidy_mhc_out:
     log:
         "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log",
     script:
-        "../scripts/clean_mhc_output.py"
+        "../scripts/tidy_mhc_output.py"
 
 
 rule merge_neoantigen_info:

From 8008228963ec7866d2711585cf58e85bf72313be Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 4 Aug 2022 10:24:31 +0000
Subject: [PATCH 125/191] fix mhc_in type in tidy_mhc_output.py

---
 workflow/scripts/tidy_mhc_output.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py
index 8ab28fbe..535609ee 100644
--- a/workflow/scripts/tidy_mhc_output.py
+++ b/workflow/scripts/tidy_mhc_output.py
@@ -29,7 +29,7 @@
 }
 
 
-def parse_file(mhc_in: FileIO):
+def parse_file(mhc_in: str):
     """
     Parse an netMHCpan or netMHCIIpan output file from the `-xls -xlsfile <filename>`
     directive into a tidy pandas data frame.

From 6376dd5c97aa9cea65db64c083adc951cb718b0a Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 5 Aug 2022 15:48:06 +0000
Subject: [PATCH 126/191] properly clean up netMHC(II)pan headers

---
 workflow/scripts/tidy_mhc_output.py | 110 ++++++++++++++++++++--------
 1 file changed, 79 insertions(+), 31 deletions(-)

diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py
index 535609ee..2fee212a 100644
--- a/workflow/scripts/tidy_mhc_output.py
+++ b/workflow/scripts/tidy_mhc_output.py
@@ -5,27 +5,57 @@
 import os
 import pandas as pd
 
+from itertools import cycle
+
 # assumptions of this script about netMHCpan or netMHCIIpan:
 # * version 4.1
 # * output generated via `-xls` option
 # * generated with the `-BA` option to include binding affinity prediction
 
-# The mapping of index column names used here to original names in netMHCpan files is:
+# The mapping of index column names used here to original names in netMHCpan files
+# is (please excuse the pd.NA tuples, they make it easier further down the line):
 INDEX_NAMES = {
-    "Pos": "position_in_protein_sequence",
-    "Peptide": "peptide_sequence",
-    "ID": "peptide_ID",
-    "Ave": "average_el_score",
-    "NB": "number_of_binders",
+    (pd.NA, "Pos"): "position_in_protein_sequence",
+    (pd.NA, "Peptide"): "peptide_sequence",
+    (pd.NA, "ID"): "peptide_ID",
+    (pd.NA, "Ave"): "average_el_score",
+    (pd.NA, "NB"): "number_of_binders",
 }
-# The mapping of column names used here to original names in netMHCpan files is:
-COLUMN_NAMES = {
-    "BA-score": "binding_affinity_score",
-    "BA_Rank": "binding_affinity_percent_rank",
-    "EL-score": "elution_ligang_score",
-    "EL_Rank": "elution_ligand_percent_rank",
-    "core": "binding_core",
-    "icore": "interaction_core",
+
+if snakemake.wildcards.mhc == "net_mhc_pan":
+    # The mapping of column names used here to original names in netMHCpan files is:
+    COLUMN_NAMES = {
+        "BA-score": "binding_affinity_score",
+        "BA_Rank": "binding_affinity_percent_rank",
+        "EL-score": "elution_ligang_score",
+        "EL_Rank": "elution_ligand_percent_rank",
+        "core": "binding_core",
+    }
+elif snakemake.wildcards.mhc == "net_mhc_two_pan":
+    # The mapping of column names used here to original names in netMHCIIpan files is:
+    COLUMN_NAMES = {
+        "Score_BA": "binding_affinity_score",
+        "Rank_BA": "binding_affinity_percent_rank",
+        "Score": "elution_ligang_score",
+        "Rank": "elution_ligand_percent_rank",
+        "Core": "binding_core",
+    }
+else:
+    sys.exit(f"Wildcard `mhc` has unknown value: {snakemake.wildcards.mhc}")
+
+COLUMNS_TO_DROP = {
+    # I have not found any docs or indication in the manuscript, what the column
+    # `Target` is. It might be the column `Exp_Bind` from Figure 1B here (it's
+    # the only column that only appears in netMHCIIpan output and not netMHCpan
+    # output):
+    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7319546/figure/F1/
+    # If so, it is only for benchmarking purposes according to the docs. And it
+    # is always NA, even in their manuscript. So we simply remove it here.
+    "Target",
+    # There doesn't seem to be an `icore` equivalent in netMHCIIpan output.
+    "icore",
+    # There doesn't seem to be an `nM` equivalent in netMHCpan output.
+    "nM",
 }
 
 
@@ -42,27 +72,45 @@ def parse_file(mhc_in: str):
             + list(INDEX_NAMES.values())
         )
 
+    # We need to fix the utterly broken headers first
+
+    # parse first header line into pandas.Series and name it
+    first_header_line = pd.read_csv(mhc_in, nrows=1, header=None, sep="\t").iloc[0, :]
+    first_header_line.name = "allele"
+
+    # parse second header line into pandas.Series and name it
+    second_header_line = pd.read_csv(
+        mhc_in, skiprows=1, nrows=1, header=None, sep="\t"
+    ).iloc[0, :]
+    second_header_line.name = "column_name"
+
+    header = pd.concat([first_header_line, second_header_line], axis="columns")
+    header = header.fillna(method="ffill")
+    header.loc[
+        header.column_name.isin({"Pos", "Peptide", "ID", "Target", "Ave", "NB"}),
+        "allele",
+    ] = pd.NA
+
     # It's a compound header over two rows and a compound row index in the initial
     # three and final two columns of the table. For some reason, the final two
     # columns are added to the index but not removed from the table, so we do this
     # manually with `.iloc[]``.
-    data = pd.read_csv(
-        mhc_in, sep="\t", header=[0, 1], index_col=[0, 1, 2, -2, -1]
-    ).iloc[:, :-2]
-
-    # With two lines of header parsed into a MultiIndex, pandas only uses the
-    # first column name in index_col as an entry. Obviously the following code
-    # assumes that these are the first three and last two columns of the data file.
-    data.index.names = list(INDEX_NAMES.values())
-
-    # Entries of the header MultiIndex need to be fixed, there doesn't seem to be
-    # any way to automatically do this during read_csv.
-    cols = pd.DataFrame(data.columns.to_list(), columns=["allele", "info"])
-
-    # fix up the columns and reassign
-    cols.loc[cols["allele"].str.endswith("_level_0"), "allele"] = pd.NA
-    cols = cols.fillna(method="ffill")
-    data.columns = pd.MultiIndex.from_frame(cols)
+    data = pd.read_csv(mhc_in, sep="\t", skiprows=2, header=None)
+
+    data.columns = pd.MultiIndex.from_frame(header)
+
+    # remove columns only present in one of the two tools' output
+    columns_to_keep = [
+        col
+        for col in list(data.columns.get_level_values("column_name"))
+        if col not in COLUMNS_TO_DROP
+    ]
+    idx = pd.IndexSlice
+    data = data.loc[:, idx[:, columns_to_keep]]
+
+    # properly set row index columns (values to repeat in every row while doing the stack below)
+    data = data.set_index(list(INDEX_NAMES.keys()))
+    data.index.set_names(INDEX_NAMES, inplace=True)
 
     # Turn into longer table with one HLA Allele per row instead of MultiIndex
     # header, rename columns to something readable and turn index into columns.

From c4c9cd9959e69cba4aa3fe7c184bc1909a79f644 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Mon, 8 Aug 2022 09:35:14 +0000
Subject: [PATCH 127/191] minor comment text update

---
 workflow/scripts/tidy_mhc_output.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py
index 2fee212a..5188d6de 100644
--- a/workflow/scripts/tidy_mhc_output.py
+++ b/workflow/scripts/tidy_mhc_output.py
@@ -13,7 +13,8 @@
 # * generated with the `-BA` option to include binding affinity prediction
 
 # The mapping of index column names used here to original names in netMHCpan files
-# is (please excuse the pd.NA tuples, they make it easier further down the line):
+# is (please excuse the pd.NA tuples, they make header and index handling 
+# easier further down the line):
 INDEX_NAMES = {
     (pd.NA, "Pos"): "position_in_protein_sequence",
     (pd.NA, "Peptide"): "peptide_sequence",

From a98b222500dddbe6f038648707ca179c6bdcaff1 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 11 Aug 2022 09:23:40 +0000
Subject: [PATCH 128/191] rename to shorter col names, use existing definitions
 where possible

---
 workflow/scripts/tidy_mhc_output.py | 34 +++++++++++++++--------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py
index 5188d6de..3d29dfb8 100644
--- a/workflow/scripts/tidy_mhc_output.py
+++ b/workflow/scripts/tidy_mhc_output.py
@@ -16,30 +16,30 @@
 # is (please excuse the pd.NA tuples, they make header and index handling 
 # easier further down the line):
 INDEX_NAMES = {
-    (pd.NA, "Pos"): "position_in_protein_sequence",
-    (pd.NA, "Peptide"): "peptide_sequence",
-    (pd.NA, "ID"): "peptide_ID",
-    (pd.NA, "Ave"): "average_el_score",
-    (pd.NA, "NB"): "number_of_binders",
+    (pd.NA, "Pos"): "pos_in_id_seq",
+    (pd.NA, "Peptide"): "pep_seq",
+    (pd.NA, "ID"): "id",
+    (pd.NA, "Ave"): "ave_el_score",
+    (pd.NA, "NB"): "num_binders",
 }
 
 if snakemake.wildcards.mhc == "net_mhc_pan":
     # The mapping of column names used here to original names in netMHCpan files is:
     COLUMN_NAMES = {
-        "BA-score": "binding_affinity_score",
-        "BA_Rank": "binding_affinity_percent_rank",
-        "EL-score": "elution_ligang_score",
-        "EL_Rank": "elution_ligand_percent_rank",
-        "core": "binding_core",
+        "BA-score": "ba_score",
+        "BA_Rank": "ba_rank",
+        "EL-score": "el_score",
+        "EL_Rank": "el_rank",
+        "core": "bind_core",
     }
 elif snakemake.wildcards.mhc == "net_mhc_two_pan":
     # The mapping of column names used here to original names in netMHCIIpan files is:
     COLUMN_NAMES = {
-        "Score_BA": "binding_affinity_score",
-        "Rank_BA": "binding_affinity_percent_rank",
-        "Score": "elution_ligang_score",
-        "Rank": "elution_ligand_percent_rank",
-        "Core": "binding_core",
+        "Score_BA": "ba_score",
+        "Rank_BA": "ba_rank",
+        "Score": "el_score",
+        "Rank": "el_rank",
+        "Core": "bind_core",
     }
 else:
     sys.exit(f"Wildcard `mhc` has unknown value: {snakemake.wildcards.mhc}")
@@ -88,7 +88,9 @@ def parse_file(mhc_in: str):
     header = pd.concat([first_header_line, second_header_line], axis="columns")
     header = header.fillna(method="ffill")
     header.loc[
-        header.column_name.isin({"Pos", "Peptide", "ID", "Target", "Ave", "NB"}),
+        header.column_name.isin(
+            [ index_col for (_, index_col) in INDEX_NAMES.keys() ]
+        ),
         "allele",
     ] = pd.NA
 

From 6bdadca7636968ff1d1fd232dcda21b7736efb32 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 11 Aug 2022 09:24:20 +0000
Subject: [PATCH 129/191] intermediate rework step of merge_neoantigen_info.py
 with .apply() approach for getting set columns to work

---
 workflow/scripts/merge_neoantigen_info.py | 82 ++++++++++++++++++-----
 1 file changed, 67 insertions(+), 15 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 23f92e37..47526fdd 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -4,6 +4,14 @@
 
 import pandas as pd
 
+def get_minimum_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
+    df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele'])
+    rank_col = f"{rank_type}_rank"
+    score_col = f"{rank_type}_score"
+    prefix = f"top_{rank_col}_"
+    columns_to_keep = ['bind_core', rank_col, score_col]
+    return df.loc[ df.groupby(['pep_seq', 'id'])[rank_col].idxmin(), columns_to_keep ].reset_index(level='allele').sort_index().drop_duplicates().add_prefix(prefix)
+
 def select_columns(mhc: pd.DataFrame) -> pd.DataFrame:
     rank_cols = [c for c in mhc.columns if "Rank" in c]
     affinity_cols = [c for c in mhc.columns if "nM" in c]
@@ -17,16 +25,61 @@ def select_columns(mhc: pd.DataFrame) -> pd.DataFrame:
     mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","")
     return mhc
 
-def merge(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame:
-    tumor = select_columns(tumor)
-    normal = select_columns(normal)
-    id_length = len(tumor.ID[0])
-    print(info.columns)
-    info["ID"] = info["id"].astype(str).str[:id_length]
-    merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID'])
-    merged_mhc = merged_mhc.rename(columns={col: col.replace("_y","_normal") for col in merged_mhc.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns})
-    info = info.rename(columns={"gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"})
-    merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID')
+def get_filtered_per_sample(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
+    common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True)
+    sample_el = get_minimum_rank_per_peptide(sample, "el")
+    sample_ba = get_minimum_rank_per_peptide(sample, "ba")
+    sample_filtered = sample_el.join(sample_ba)
+    return sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True)
+
+def tidy_info(info: pd.DataFrame, alias: str) -> pd.DataFrame:
+    info = info.set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'credible_interval', 'depth', 'strand'])
+    int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites']
+    info[int_cols] = info[int_cols].astype('int32')
+    num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': alias}).set_index('alias', append=True)
+    num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': alias}).set_index('alias', append=True)
+    sites_tidy = info.copy()
+    sites_tidy['sites'] = sites_tidy['variant_sites'].apply(lambda x: set(str(x).split('|')))
+    sites_tidy['somatic_sites'] = sites_tidy['somatic_positions'].apply(lambda x: set(str(x).split('|')))
+    sites_tidy['germline_sites'] = sites_tidy[['sites', 'somatic_sites']].apply(lambda row: [s for s in row['sites'] if s not in row['somatic_sites']], axis=1)
+    sites_tidy = sites_tidy.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_sites', 'germline_sites'], ignore_index=False).replace({'somatic_sites': 'normal', 'germline_sites': 'tumor_alias'}).set_index('alias', append=True)['genomic_pos'].apply(lambda r: '|'.join(r))
+    sites_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': 'tumor_resection', 'germline_positions': 'normal'}).set_index('alias', append=True)
+
+    seq_tidy = info.melt(var_name='alias', value_name='sequence', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': alias}).set_index('alias', append=True)
+
+def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame:
+    # get and merge tumor and normal
+    tumor_filtered = get_filtered_per_sample(tumor, snakemake.wildcards.tumor_alias)
+    normal_filtered = get_filtered_per_sample(normal, "normal")
+    all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').sort_index()
+    info_tidy = tidy_info(info, snakemake.wildcards.tumor_alias)
+
+    # tidy up info
+    all_annotated = all_filtered.merge(info, on='id', how='left')
+#    tumor = select_columns(tumor)
+#    normal = select_columns(normal)
+#    id_length = len(tumor.ID[0])
+#    print(info.columns)
+#    info["ID"] = info["id"].astype(str).str[:id_length]
+#    merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID'])
+#    merged_mhc = merged_mhc
+#        .rename(
+#            columns={col: col.replace("_y","_normal") for col in merged_mhc.columns}
+#            )
+#        .rename(
+#            columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns}
+#            )
+#    info = info.rename(
+#        columns={
+#            "gene_id":"Gene_ID",
+#            "gene_name":"Gene_Symbol",
+#            "strand":"Strand",
+#            "positions":"Variant_Position",
+#            "chrom":"Chromosome",
+#            "somatic_aa_change":
+#            "Somatic_AminoAcid_Change"
+#        })
+#    merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID')
 
     merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1)
     ## Are all possible variants in the peptide ("Cis") or not ("Trans")
@@ -80,11 +133,10 @@ def diffEpitope(e1,e2):
 
 def main():
     info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str)
-    tumor = pd.read_csv(snakemake.input.neo, sep = '\t')
-    normal = pd.read_csv(snakemake.input.normal, sep = '\t')
-    outfile = snakemake.output[0]
-    data = merge(info, tumor, normal)
-    data.to_csv(outfile, index=False, sep = '\t')
+    tumor = pd.read_csv(snakemake.input.neo, sep = '\t', dtype={'pos_in_id_seq': str})
+    normal = pd.read_csv(snakemake.input.normal, sep = '\t', dtype={'pos_in_id_seq': str})
+    data = merge_data_frames(info, tumor, normal)
+    data.to_csv(snakemake.output[0], index=False, sep = '\t')
 
 if __name__ == '__main__':
     sys.exit(main())

From 525d406c7032666b2306cf08bf663216e7eb750b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 11 Aug 2022 17:47:04 +0000
Subject: [PATCH 130/191] intermediate version of merge_neoantigen_info.py
 cleanup

---
 workflow/scripts/merge_neoantigen_info.py | 84 ++++++++++++++---------
 1 file changed, 50 insertions(+), 34 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 47526fdd..3fe10399 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -4,7 +4,7 @@
 
 import pandas as pd
 
-def get_minimum_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
+def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
     df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele'])
     rank_col = f"{rank_type}_rank"
     score_col = f"{rank_type}_score"
@@ -25,37 +25,41 @@ def select_columns(mhc: pd.DataFrame) -> pd.DataFrame:
     mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","")
     return mhc
 
-def get_filtered_per_sample(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
+def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
     common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True)
-    sample_el = get_minimum_rank_per_peptide(sample, "el")
-    sample_ba = get_minimum_rank_per_peptide(sample, "ba")
+    sample_el = get_best_rank_per_peptide(sample, "el")
+    sample_ba = get_best_rank_per_peptide(sample, "ba")
     sample_filtered = sample_el.join(sample_ba)
     return sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True)
 
-def tidy_info(info: pd.DataFrame, alias: str) -> pd.DataFrame:
+def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
+    """
+    Get the -o info output of the microphaser filter command into tidy data format.
+    """
     info = info.set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'credible_interval', 'depth', 'strand'])
     int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites']
     info[int_cols] = info[int_cols].astype('int32')
-    num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': alias}).set_index('alias', append=True)
-    num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': alias}).set_index('alias', append=True)
-    sites_tidy = info.copy()
-    sites_tidy['sites'] = sites_tidy['variant_sites'].apply(lambda x: set(str(x).split('|')))
-    sites_tidy['somatic_sites'] = sites_tidy['somatic_positions'].apply(lambda x: set(str(x).split('|')))
-    sites_tidy['germline_sites'] = sites_tidy[['sites', 'somatic_sites']].apply(lambda row: [s for s in row['sites'] if s not in row['somatic_sites']], axis=1)
-    sites_tidy = sites_tidy.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_sites', 'germline_sites'], ignore_index=False).replace({'somatic_sites': 'normal', 'germline_sites': 'tumor_alias'}).set_index('alias', append=True)['genomic_pos'].apply(lambda r: '|'.join(r))
-    sites_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': 'tumor_resection', 'germline_positions': 'normal'}).set_index('alias', append=True)
-
-    seq_tidy = info.melt(var_name='alias', value_name='sequence', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': alias}).set_index('alias', append=True)
+    # TODO: Ensure that microphaser output contains only one entry per id.
+    # If there is more than one entry per index, ensure that they are identical
+    if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0:
+        sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.")
+    # Always take the first entry for each index.
+    info = info.groupby(info.index).head(1)    
+    # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal.
+    num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True)
+    num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': tumor_alias}).set_index('alias', append=True)
+    genomic_pos_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': tumor_alias, 'germline_positions': 'normal'}).set_index('alias', append=True)
+    aa_changes_tidy = info.melt(var_name='alias', value_name='aa_changes', value_vars=['somatic_aa_change', 'germline_aa_change'], ignore_index=False).replace({'somatic_aa_change': tumor_alias, 'germline_aa_change': 'normal'}).set_index('alias', append=True)
+    nt_seq_tidy = info.melt(var_name='alias', value_name='nt_seq', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': tumor_alias}).set_index('alias', append=True)
+    return num_var_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy])
 
 def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame:
     # get and merge tumor and normal
-    tumor_filtered = get_filtered_per_sample(tumor, snakemake.wildcards.tumor_alias)
-    normal_filtered = get_filtered_per_sample(normal, "normal")
-    all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').sort_index()
+    tumor_filtered = get_filtered_per_alias(tumor, snakemake.wildcards.tumor_alias)
+    normal_filtered = get_filtered_per_alias(normal, "normal")
+    all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias='tumor_resection').sort_index()
     info_tidy = tidy_info(info, snakemake.wildcards.tumor_alias)
-
-    # tidy up info
-    all_annotated = all_filtered.merge(info, on='id', how='left')
+    all_annotated = all_filtered.merge(info_tidy, on=['id', 'alias'], how='left')
 #    tumor = select_columns(tumor)
 #    normal = select_columns(normal)
 #    id_length = len(tumor.ID[0])
@@ -81,7 +85,7 @@ def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFr
 #        })
 #    merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID')
 
-    merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1)
+    merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diff_peptides(*x), axis=1)
     ## Are all possible variants in the peptide ("Cis") or not ("Trans")
     merged_dataframe["Variant_Orientation"] = "Cis"
     trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar
@@ -116,19 +120,31 @@ def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFr
     return data
 
 
-## highlight the difference between mutated neopeptide and wildtype
-def diffEpitope(e1,e2):
-    if str(e2) == 'nan' or str(e2) == '':
-        return(e1)
-    e1 = str(e1)
-    e2 = str(e2)
-    diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]]
-    e_new = e1
-    e2_new = e2
+def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame:
+    group = group.reset_index(level='alias')
+    normal_pep = group.loc[group['alias'] == 'normal', column].fillna('')
+    if normal_pep.empty:
+        normal_pep = ''
+    else:
+        normal_pep = normal_pep.squeeze()
+    tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze()
+    ( group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= diff_peptides(tumor_pep, normal_pep)
+    return group.set_index('alias', append=True)
+
+
+def diff_peptides(tumor_p: str, normal_p: str) -> (str, str):
+    """
+    Highlight the difference between mutated neopeptide and normal peptide
+    """
+    if normal_p == 'nan' or normal_p == '':
+        return (tumor_p, normal_p)
+    diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
+    tp_changed = tumor_p
+    np_changed = normal_p
     for p in diff_pos:
-        e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:]
-        e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:]
-    return(e_new)
+        tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:]
+        np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:]
+    return (tp_changed, np_changed)
 
 
 def main():

From f660e21dbda495017a0f6915a6fd872a5e24cd10 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 11 Aug 2022 19:26:40 +0000
Subject: [PATCH 131/191] reworked version of merge_neoantigen_info.py to test

---
 workflow/scripts/merge_neoantigen_info.py | 201 ++++++++++------------
 1 file changed, 94 insertions(+), 107 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 3fe10399..d2a9334d 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -1,8 +1,11 @@
 import sys
+from xml.sax.handler import all_properties
 
 sys.stderr = open(snakemake.log[0], "w")
 
 import pandas as pd
+from typing import Tuple
+
 
 def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
     df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele'])
@@ -12,18 +15,6 @@ def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
     columns_to_keep = ['bind_core', rank_col, score_col]
     return df.loc[ df.groupby(['pep_seq', 'id'])[rank_col].idxmin(), columns_to_keep ].reset_index(level='allele').sort_index().drop_duplicates().add_prefix(prefix)
 
-def select_columns(mhc: pd.DataFrame) -> pd.DataFrame:
-    rank_cols = [c for c in mhc.columns if "Rank" in c]
-    affinity_cols = [c for c in mhc.columns if "nM" in c]
-    mhc_cols = ["Pos", "ID", "Peptide"] + rank_cols + affinity_cols + ["NB"]
-    mhc = mhc[mhc_cols]
-    mhc["Rank_min"] = mhc[rank_cols].min(axis=1)
-    mhc["Aff_min"] = mhc[affinity_cols].min(axis=1)
-    mhc["Top_rank_HLA"] = mhc[rank_cols].idxmin(axis=1)
-    mhc["Top_affinity_HLA"] = mhc[affinity_cols].idxmin(axis=1)
-    mhc["Top_rank_HLA"] = mhc["Top_rank_HLA"].str.replace("_Rank","")
-    mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","")
-    return mhc
 
 def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
     common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True)
@@ -32,126 +23,122 @@ def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
     sample_filtered = sample_el.join(sample_ba)
     return sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True)
 
+
+def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
+    """
+    Highlight the difference between mutated neopeptide and normal peptide
+    """
+    if normal_p == 'nan' or normal_p == '':
+        return (tumor_p, normal_p)
+    diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
+    tp_changed = tumor_p
+    np_changed = normal_p
+    for p in diff_pos:
+        tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:]
+        np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:]
+    return (tp_changed, np_changed)
+
+
+def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame:
+    group = group.reset_index(level='alias')
+    normal_pep = group.loc[group['alias'] == 'normal', column].fillna('')
+    if normal_pep.empty:
+        normal_pep = ''
+    else:
+        normal_pep = normal_pep.squeeze()
+    tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze()
+    # Silent mutations should not be included in microphaser output.
+    if normal_pep == tumor_pep:
+        sys.exit(f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n"
+                    "Please fix this upstream or comment out this check to ignore this problem.\n"
+        )
+    # Remove groups where the tumor peptide contains a stop codon.
+    # TODO: Maybe this should be a hard fail complaining to fix this upstream?
+    if 'X' in tumor_pep:
+        return group.loc[[], :]
+    ( group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= highlight_peptides_diff(tumor_pep, normal_pep)
+    return group.set_index('alias', append=True)
+
+
 def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     """
     Get the -o info output of the microphaser filter command into tidy data format.
     """
-    info = info.set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'credible_interval', 'depth', 'strand'])
+    info = info.rename(columns={'credible_interval': 'freq_credible_interval'}).set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'freq_credible_interval', 'depth', 'strand'])
     int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites']
     info[int_cols] = info[int_cols].astype('int32')
     # TODO: Ensure that microphaser output contains only one entry per id.
     # If there is more than one entry per index, ensure that they are identical
     if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0:
-        sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.")
+        sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n")
     # Always take the first entry for each index.
     info = info.groupby(info.index).head(1)    
     # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal.
-    num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True)
+    num_var_in_pep_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var_in_pep', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True)
     num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': tumor_alias}).set_index('alias', append=True)
     genomic_pos_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': tumor_alias, 'germline_positions': 'normal'}).set_index('alias', append=True)
     aa_changes_tidy = info.melt(var_name='alias', value_name='aa_changes', value_vars=['somatic_aa_change', 'germline_aa_change'], ignore_index=False).replace({'somatic_aa_change': tumor_alias, 'germline_aa_change': 'normal'}).set_index('alias', append=True)
     nt_seq_tidy = info.melt(var_name='alias', value_name='nt_seq', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': tumor_alias}).set_index('alias', append=True)
-    return num_var_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy])
+    all_tidy = num_var_in_pep_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy])
+    return all_tidy.reset_index(level=[i for i in all_tidy.index.names if i not in ['id', 'alias']])
 
-def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame:
+
+def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     # get and merge tumor and normal
-    tumor_filtered = get_filtered_per_alias(tumor, snakemake.wildcards.tumor_alias)
+    tumor_filtered = get_filtered_per_alias(tumor, tumor_alias)
     normal_filtered = get_filtered_per_alias(normal, "normal")
-    all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias='tumor_resection').sort_index()
-    info_tidy = tidy_info(info, snakemake.wildcards.tumor_alias)
-    all_annotated = all_filtered.merge(info_tidy, on=['id', 'alias'], how='left')
-#    tumor = select_columns(tumor)
-#    normal = select_columns(normal)
-#    id_length = len(tumor.ID[0])
-#    print(info.columns)
-#    info["ID"] = info["id"].astype(str).str[:id_length]
-#    merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID'])
-#    merged_mhc = merged_mhc
-#        .rename(
-#            columns={col: col.replace("_y","_normal") for col in merged_mhc.columns}
-#            )
-#        .rename(
-#            columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns}
-#            )
-#    info = info.rename(
-#        columns={
-#            "gene_id":"Gene_ID",
-#            "gene_name":"Gene_Symbol",
-#            "strand":"Strand",
-#            "positions":"Variant_Position",
-#            "chrom":"Chromosome",
-#            "somatic_aa_change":
-#            "Somatic_AminoAcid_Change"
-#        })
-#    merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID')
-
-    merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diff_peptides(*x), axis=1)
-    ## Are all possible variants in the peptide ("Cis") or not ("Trans")
-    merged_dataframe["Variant_Orientation"] = "Cis"
-    trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar
-    merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans"
-
-    ## check misssense/silent mutation status
-    nonsilent = merged_dataframe.Peptide_tumor != merged_dataframe.Peptide_normal
-    merged_dataframe = merged_dataframe[nonsilent]
-    merged_dataframe = merged_dataframe.drop_duplicates(subset=["transcript","offset","Peptide_tumor","Somatic_AminoAcid_Change"])
-
-    data = merged_dataframe[["ID","transcript","Gene_ID","Gene_Symbol","Chromosome","offset","freq","depth",
-    "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","NB_tumor","Rank_min_tumor","Aff_min_tumor",
-    "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","NB_normal",
-    "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]]
-
-    data.columns = ["ID","Transcript_ID","Gene_ID","Gene_Symbol","Chromosome","Position","Frequency","Read_Depth",
-    "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","BindingHLAs_tumor","Rank_min_tumor","Affinity_min_tumor",
-    "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","BindingHLAs_normal",
-    "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]
-
-    # data = data[data.BindingHLAs_tumor > 0]
-    # data = data[(data.NB_normal.isna()) | (data.NB_normal == 0)]
-    #data = data[(data.BindingHLAs_normal == 0)]
-
-    ### Delete Stop-Codon including peptides
-    data = data[data.Peptide_tumor.str.count("x") == 0]
-    data = data[data.Peptide_tumor.str.count("X") == 0]
-    data.sort_values(["Chromosome", "somatic_positions"], inplace=True)
-    ### Remove Duplicate kmers
-    data = data.drop_duplicates(["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"])
-
-    return data
-
-
-def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame:
-    group = group.reset_index(level='alias')
-    normal_pep = group.loc[group['alias'] == 'normal', column].fillna('')
-    if normal_pep.empty:
-        normal_pep = ''
-    else:
-        normal_pep = normal_pep.squeeze()
-    tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze()
-    ( group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= diff_peptides(tumor_pep, normal_pep)
-    return group.set_index('alias', append=True)
-
-
-def diff_peptides(tumor_p: str, normal_p: str) -> (str, str):
-    """
-    Highlight the difference between mutated neopeptide and normal peptide
-    """
-    if normal_p == 'nan' or normal_p == '':
-        return (tumor_p, normal_p)
-    diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
-    tp_changed = tumor_p
-    np_changed = normal_p
-    for p in diff_pos:
-        tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:]
-        np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:]
-    return (tp_changed, np_changed)
+    all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level=['pep_seq', 'pos_in_id_seq']).groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias=tumor_alias).sort_index()
+    info_tidy = tidy_info(info, tumor_alias)
+    all_annotated = all_filtered.join(info_tidy, how='left').reset_index(level=['id', 'alias'])
+    
+    # Double-check for weird duplicates, as previously done in Jan's code.
+    if sum(all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])) > 0:
+        duplicates = all_annotated[all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])]
+        sys.exit("Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n"
+                    "This indicates an upstream issue, please fix this. The offending entries are:\n"
+                    f"{duplicates}\n"
+            )
+
+    column_order = [
+        'id',
+        'pep_seq',
+        'pos_in_id_seq',
+        'alias',
+        'num_binders',
+        'freq',
+        'depth',
+        'num_var_sites',
+        'num_var_in_pep',
+        'top_el_rank_allele',
+        'top_el_rank_bind_core',
+        'top_el_rank_el_rank',
+        'top_el_rank_el_score',
+        'ave_el_score',
+        'top_ba_rank_allele',
+        'top_ba_rank_bind_core',
+        'top_ba_rank_ba_rank',
+        'top_ba_rank_ba_score',
+        'aa_changes',
+        'genomic_pos',
+        'nt_seq',
+        'gene_name',
+        'gene_id',
+        'transcript',
+        'chrom',
+        'offset',
+        'frame',
+        'strand',
+        'freq_credible_interval',
+    ]
+
+    return all_annotated.reindex(columns=column_order).sort_values(['chrom', 'genomic_pos', 'id'])
 
 
 def main():
     info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str)
     tumor = pd.read_csv(snakemake.input.neo, sep = '\t', dtype={'pos_in_id_seq': str})
     normal = pd.read_csv(snakemake.input.normal, sep = '\t', dtype={'pos_in_id_seq': str})
-    data = merge_data_frames(info, tumor, normal)
+    data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias)
     data.to_csv(snakemake.output[0], index=False, sep = '\t')
 
 if __name__ == '__main__':

From ace6130a066205ed63579b5638d55c1a9d5c6f83 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 11 Aug 2022 19:42:06 +0000
Subject: [PATCH 132/191] try fix for sorting

---
 workflow/scripts/merge_neoantigen_info.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index d2a9334d..ea861ea8 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -131,7 +131,7 @@ def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFr
         'freq_credible_interval',
     ]
 
-    return all_annotated.reindex(columns=column_order).sort_values(['chrom', 'genomic_pos', 'id'])
+    return all_annotated.reindex(columns=column_order).sort_values(by = ['chrom', 'genomic_pos', 'id'])
 
 
 def main():

From 92dcdf1e6b37cd1f754d0603e8a55f04406b214e Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 11 Aug 2022 19:43:12 +0000
Subject: [PATCH 133/191] black formatting

---
 workflow/scripts/merge_neoantigen_info.py | 264 ++++++++++++++++------
 1 file changed, 190 insertions(+), 74 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index ea861ea8..32a50556 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -8,138 +8,254 @@
 
 
 def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
-    df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele'])
+    df = df.set_index(["id", "pos_in_id_seq", "pep_seq", "allele"])
     rank_col = f"{rank_type}_rank"
     score_col = f"{rank_type}_score"
     prefix = f"top_{rank_col}_"
-    columns_to_keep = ['bind_core', rank_col, score_col]
-    return df.loc[ df.groupby(['pep_seq', 'id'])[rank_col].idxmin(), columns_to_keep ].reset_index(level='allele').sort_index().drop_duplicates().add_prefix(prefix)
+    columns_to_keep = ["bind_core", rank_col, score_col]
+    return (
+        df.loc[df.groupby(["pep_seq", "id"])[rank_col].idxmin(), columns_to_keep]
+        .reset_index(level="allele")
+        .sort_index()
+        .drop_duplicates()
+        .add_prefix(prefix)
+    )
 
 
 def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
-    common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True)
+    common_info = (
+        sample.set_index(["id", "pos_in_id_seq", "pep_seq"])
+        .loc[:, ["ave_el_score", "num_binders"]]
+        .reset_index(level=["id", "pos_in_id_seq"])
+        .drop_duplicates()
+        .set_index(["id", "pos_in_id_seq"], append=True)
+    )
     sample_el = get_best_rank_per_peptide(sample, "el")
     sample_ba = get_best_rank_per_peptide(sample, "ba")
     sample_filtered = sample_el.join(sample_ba)
-    return sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True)
+    return (
+        sample_filtered.join(common_info, how="left")
+        .assign(alias=alias)
+        .set_index("alias", append=True)
+    )
 
 
 def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
     """
     Highlight the difference between mutated neopeptide and normal peptide
     """
-    if normal_p == 'nan' or normal_p == '':
+    if normal_p == "nan" or normal_p == "":
         return (tumor_p, normal_p)
     diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
     tp_changed = tumor_p
     np_changed = normal_p
     for p in diff_pos:
-        tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:]
-        np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:]
+        tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :]
+        np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :]
     return (tp_changed, np_changed)
 
 
-def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame:
-    group = group.reset_index(level='alias')
-    normal_pep = group.loc[group['alias'] == 'normal', column].fillna('')
+def diff_tumor_normal_peptides(
+    group: pd.DataFrame, column: str, tumor_alias: str
+) -> pd.DataFrame:
+    group = group.reset_index(level="alias")
+    normal_pep = group.loc[group["alias"] == "normal", column].fillna("")
     if normal_pep.empty:
-        normal_pep = ''
+        normal_pep = ""
     else:
         normal_pep = normal_pep.squeeze()
-    tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze()
+    tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze()
     # Silent mutations should not be included in microphaser output.
     if normal_pep == tumor_pep:
-        sys.exit(f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n"
-                    "Please fix this upstream or comment out this check to ignore this problem.\n"
+        sys.exit(
+            f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n"
+            "Please fix this upstream or comment out this check to ignore this problem.\n"
         )
     # Remove groups where the tumor peptide contains a stop codon.
     # TODO: Maybe this should be a hard fail complaining to fix this upstream?
-    if 'X' in tumor_pep:
+    if "X" in tumor_pep:
         return group.loc[[], :]
-    ( group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= highlight_peptides_diff(tumor_pep, normal_pep)
-    return group.set_index('alias', append=True)
+    (
+        group.loc[group["alias"] == tumor_alias, column],
+        group.loc[group["alias"] == "normal", column],
+    ) = highlight_peptides_diff(tumor_pep, normal_pep)
+    return group.set_index("alias", append=True)
 
 
 def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     """
     Get the -o info output of the microphaser filter command into tidy data format.
     """
-    info = info.rename(columns={'credible_interval': 'freq_credible_interval'}).set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'freq_credible_interval', 'depth', 'strand'])
-    int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites']
-    info[int_cols] = info[int_cols].astype('int32')
+    info = info.rename(
+        columns={"credible_interval": "freq_credible_interval"}
+    ).set_index(
+        [
+            "id",
+            "transcript",
+            "gene_id",
+            "gene_name",
+            "chrom",
+            "offset",
+            "frame",
+            "freq",
+            "freq_credible_interval",
+            "depth",
+            "strand",
+        ]
+    )
+    int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"]
+    info[int_cols] = info[int_cols].astype("int32")
     # TODO: Ensure that microphaser output contains only one entry per id.
     # If there is more than one entry per index, ensure that they are identical
     if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0:
-        sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n")
+        sys.exit(
+            f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n"
+        )
     # Always take the first entry for each index.
-    info = info.groupby(info.index).head(1)    
+    info = info.groupby(info.index).head(1)
     # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal.
-    num_var_in_pep_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var_in_pep', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True)
-    num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': tumor_alias}).set_index('alias', append=True)
-    genomic_pos_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': tumor_alias, 'germline_positions': 'normal'}).set_index('alias', append=True)
-    aa_changes_tidy = info.melt(var_name='alias', value_name='aa_changes', value_vars=['somatic_aa_change', 'germline_aa_change'], ignore_index=False).replace({'somatic_aa_change': tumor_alias, 'germline_aa_change': 'normal'}).set_index('alias', append=True)
-    nt_seq_tidy = info.melt(var_name='alias', value_name='nt_seq', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': tumor_alias}).set_index('alias', append=True)
-    all_tidy = num_var_in_pep_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy])
-    return all_tidy.reset_index(level=[i for i in all_tidy.index.names if i not in ['id', 'alias']])
+    num_var_in_pep_tidy = (
+        info.assign(ngermline=lambda x: x.nvar - x.nsomatic)
+        .melt(
+            var_name="alias",
+            value_name="num_var_in_pep",
+            value_vars=["ngermline", "nsomatic"],
+            ignore_index=False,
+        )
+        .replace({"ngermline": "normal", "nsomatic": tumor_alias})
+        .set_index("alias", append=True)
+    )
+    num_var_sites_tidy = (
+        info.assign(ngermvariant_sites=lambda x: x.nvariant_sites - x.nsomvariant_sites)
+        .melt(
+            var_name="alias",
+            value_name="num_var_sites",
+            value_vars=["ngermvariant_sites", "nsomvariant_sites"],
+            ignore_index=False,
+        )
+        .replace({"ngermvariant_sites": "normal", "nsomvariant_sites": tumor_alias})
+        .set_index("alias", append=True)
+    )
+    genomic_pos_tidy = (
+        info.melt(
+            var_name="alias",
+            value_name="genomic_pos",
+            value_vars=["somatic_positions", "germline_positions"],
+            ignore_index=False,
+        )
+        .replace({"somatic_positions": tumor_alias, "germline_positions": "normal"})
+        .set_index("alias", append=True)
+    )
+    aa_changes_tidy = (
+        info.melt(
+            var_name="alias",
+            value_name="aa_changes",
+            value_vars=["somatic_aa_change", "germline_aa_change"],
+            ignore_index=False,
+        )
+        .replace({"somatic_aa_change": tumor_alias, "germline_aa_change": "normal"})
+        .set_index("alias", append=True)
+    )
+    nt_seq_tidy = (
+        info.melt(
+            var_name="alias",
+            value_name="nt_seq",
+            value_vars=["normal_sequence", "mutant_sequence"],
+            ignore_index=False,
+        )
+        .replace({"normal_sequence": "normal", "mutant_sequence": tumor_alias})
+        .set_index("alias", append=True)
+    )
+    all_tidy = num_var_in_pep_tidy.join(
+        [num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy]
+    )
+    return all_tidy.reset_index(
+        level=[i for i in all_tidy.index.names if i not in ["id", "alias"]]
+    )
 
 
-def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
+def merge_data_frames(
+    info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str
+) -> pd.DataFrame:
     # get and merge tumor and normal
     tumor_filtered = get_filtered_per_alias(tumor, tumor_alias)
     normal_filtered = get_filtered_per_alias(normal, "normal")
-    all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level=['pep_seq', 'pos_in_id_seq']).groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias=tumor_alias).sort_index()
+    all_filtered = (
+        pd.concat([tumor_filtered, normal_filtered])
+        .reset_index(level=["pep_seq", "pos_in_id_seq"])
+        .groupby("id", group_keys=False)
+        .apply(diff_tumor_normal_peptides, column="pep_seq", tumor_alias=tumor_alias)
+        .sort_index()
+    )
     info_tidy = tidy_info(info, tumor_alias)
-    all_annotated = all_filtered.join(info_tidy, how='left').reset_index(level=['id', 'alias'])
-    
+    all_annotated = all_filtered.join(info_tidy, how="left").reset_index(
+        level=["id", "alias"]
+    )
+
     # Double-check for weird duplicates, as previously done in Jan's code.
-    if sum(all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])) > 0:
-        duplicates = all_annotated[all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])]
-        sys.exit("Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n"
-                    "This indicates an upstream issue, please fix this. The offending entries are:\n"
-                    f"{duplicates}\n"
+    if (
+        sum(
+            all_annotated.duplicated(
+                subset=["transcript", "offset", "pep_seq", "aa_changes"]
+            )
+        )
+        > 0
+    ):
+        duplicates = all_annotated[
+            all_annotated.duplicated(
+                subset=["transcript", "offset", "pep_seq", "aa_changes"]
             )
+        ]
+        sys.exit(
+            "Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n"
+            "This indicates an upstream issue, please fix this. The offending entries are:\n"
+            f"{duplicates}\n"
+        )
 
     column_order = [
-        'id',
-        'pep_seq',
-        'pos_in_id_seq',
-        'alias',
-        'num_binders',
-        'freq',
-        'depth',
-        'num_var_sites',
-        'num_var_in_pep',
-        'top_el_rank_allele',
-        'top_el_rank_bind_core',
-        'top_el_rank_el_rank',
-        'top_el_rank_el_score',
-        'ave_el_score',
-        'top_ba_rank_allele',
-        'top_ba_rank_bind_core',
-        'top_ba_rank_ba_rank',
-        'top_ba_rank_ba_score',
-        'aa_changes',
-        'genomic_pos',
-        'nt_seq',
-        'gene_name',
-        'gene_id',
-        'transcript',
-        'chrom',
-        'offset',
-        'frame',
-        'strand',
-        'freq_credible_interval',
+        "id",
+        "pep_seq",
+        "pos_in_id_seq",
+        "alias",
+        "num_binders",
+        "freq",
+        "depth",
+        "num_var_sites",
+        "num_var_in_pep",
+        "top_el_rank_allele",
+        "top_el_rank_bind_core",
+        "top_el_rank_el_rank",
+        "top_el_rank_el_score",
+        "ave_el_score",
+        "top_ba_rank_allele",
+        "top_ba_rank_bind_core",
+        "top_ba_rank_ba_rank",
+        "top_ba_rank_ba_score",
+        "aa_changes",
+        "genomic_pos",
+        "nt_seq",
+        "gene_name",
+        "gene_id",
+        "transcript",
+        "chrom",
+        "offset",
+        "frame",
+        "strand",
+        "freq_credible_interval",
     ]
 
-    return all_annotated.reindex(columns=column_order).sort_values(by = ['chrom', 'genomic_pos', 'id'])
+    return all_annotated.reindex(columns=column_order).sort_values(
+        by=["chrom", "genomic_pos", "id"]
+    )
 
 
 def main():
-    info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str)
-    tumor = pd.read_csv(snakemake.input.neo, sep = '\t', dtype={'pos_in_id_seq': str})
-    normal = pd.read_csv(snakemake.input.normal, sep = '\t', dtype={'pos_in_id_seq': str})
+    info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str)
+    tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str})
+    normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str})
     data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias)
-    data.to_csv(snakemake.output[0], index=False, sep = '\t')
+    data.to_csv(snakemake.output[0], index=False, sep="\t")
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())

From 52b46cb27edece3651b2d88e33ced7ceacdc57f4 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 16 Aug 2022 09:00:21 +0000
Subject: [PATCH 134/191] code review by @tedil

---
 workflow/scripts/merge_neoantigen_info.py | 35 +++++++++++------------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 32a50556..4599f932 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -1,5 +1,4 @@
 import sys
-from xml.sax.handler import all_properties
 
 sys.stderr = open(snakemake.log[0], "w")
 
@@ -46,6 +45,7 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
     """
     if normal_p == "nan" or normal_p == "":
         return (tumor_p, normal_p)
+    assert len(tumor_p) == len(normal_p), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths."
     diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
     tp_changed = tumor_p
     np_changed = normal_p
@@ -67,18 +67,19 @@ def diff_tumor_normal_peptides(
     tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze()
     # Silent mutations should not be included in microphaser output.
     if normal_pep == tumor_pep:
-        sys.exit(
+        raise ValueError(
             f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n"
             "Please fix this upstream or comment out this check to ignore this problem.\n"
         )
     # Remove groups where the tumor peptide contains a stop codon.
     # TODO: Maybe this should be a hard fail complaining to fix this upstream?
+    # TODO: Write out warning.
     if "X" in tumor_pep:
+        print(f"Warning: ", file=sys.stderr)
         return group.loc[[], :]
-    (
-        group.loc[group["alias"] == tumor_alias, column],
-        group.loc[group["alias"] == "normal", column],
-    ) = highlight_peptides_diff(tumor_pep, normal_pep)
+    t_diff, n_diff = highlight_peptides_diff(tumor_pep, normal_pep)
+    group.loc[group["alias"] == tumor_alias, column] = t_diff
+    group.loc[group["alias"] == "normal", column] = n_diff
     return group.set_index("alias", append=True)
 
 
@@ -108,12 +109,13 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     # TODO: Ensure that microphaser output contains only one entry per id.
     # If there is more than one entry per index, ensure that they are identical
     if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0:
-        sys.exit(
+        raise ValueError(
             f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n"
         )
     # Always take the first entry for each index.
     info = info.groupby(info.index).head(1)
     # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal.
+    # TODO: factor out tidying of column pairs into a function.
     num_var_in_pep_tidy = (
         info.assign(ngermline=lambda x: x.nvar - x.nsomatic)
         .melt(
@@ -206,7 +208,7 @@ def merge_data_frames(
                 subset=["transcript", "offset", "pep_seq", "aa_changes"]
             )
         ]
-        sys.exit(
+        raise ValueError(
             "Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n"
             "This indicates an upstream issue, please fix this. The offending entries are:\n"
             f"{duplicates}\n"
@@ -245,17 +247,12 @@ def merge_data_frames(
     ]
 
     return all_annotated.reindex(columns=column_order).sort_values(
-        by=["chrom", "genomic_pos", "id"]
+        by=["chrom", "offset", "id", "alias"]
     )
 
 
-def main():
-    info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str)
-    tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str})
-    normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str})
-    data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias)
-    data.to_csv(snakemake.output[0], index=False, sep="\t")
-
-
-if __name__ == "__main__":
-    sys.exit(main())
+info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str)
+tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str})
+normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str})
+data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias)
+data.to_csv(snakemake.output[0], index=False, sep="\t")

From f4960bb5655a7de25b23bf14d68e4bad204a8963 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 16 Aug 2022 11:26:39 +0000
Subject: [PATCH 135/191] truncate ids for netMHCpan join() on id to work

---
 workflow/scripts/merge_neoantigen_info.py | 54 +++++++++++++++--------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 4599f932..0a2906f2 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -3,7 +3,7 @@
 sys.stderr = open(snakemake.log[0], "w")
 
 import pandas as pd
-from typing import Tuple
+from typing import List, Tuple
 
 
 def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
@@ -176,6 +176,29 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     )
 
 
+def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str):
+    if (
+        sum(
+            df.duplicated(
+                subset=cols
+            )
+        )
+        > 0
+    ):
+        duplicates = all_annotated[
+            df.duplicated(
+                subset=cols
+            )
+        ]
+        raise ValueError(
+            f"Found multiple rows with identical [ \"{'\", \"'.join(cols)}\" ] entries.\n"
+            "This indicates an upstream issue, please fix this.\n"
+            f"{specific_error}"
+            "The offending entries are:\n"
+            f"{duplicates}\n"
+        )
+
+
 def merge_data_frames(
     info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str
 ) -> pd.DataFrame:
@@ -190,29 +213,22 @@ def merge_data_frames(
         .sort_index()
     )
     info_tidy = tidy_info(info, tumor_alias)
+
+    # netMHCpan 4.1 truncates the fasta entry IDs, so we have to cut down the IDs
+    # that microphaser originally provided to make the following .join() work
+    len_tumor_id = len(tumor_filtered["id"][0])
+    len_normal_id = len(normal_filtered["id"][0])
+    assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n"
+    info_tidy['id'] = info_tidy['id'].str[:len_tumor_id]
+    # Double-check for duplicates resulting from the id truncation
+    check_duplicates(info_tidy, ["id", "alias"])
+
     all_annotated = all_filtered.join(info_tidy, how="left").reset_index(
         level=["id", "alias"]
     )
 
     # Double-check for weird duplicates, as previously done in Jan's code.
-    if (
-        sum(
-            all_annotated.duplicated(
-                subset=["transcript", "offset", "pep_seq", "aa_changes"]
-            )
-        )
-        > 0
-    ):
-        duplicates = all_annotated[
-            all_annotated.duplicated(
-                subset=["transcript", "offset", "pep_seq", "aa_changes"]
-            )
-        ]
-        raise ValueError(
-            "Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n"
-            "This indicates an upstream issue, please fix this. The offending entries are:\n"
-            f"{duplicates}\n"
-        )
+    check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"])
 
     column_order = [
         "id",

From da2974d082490ab4d6244360cb04cca2ea78015b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 16 Aug 2022 11:48:57 +0000
Subject: [PATCH 136/191] further fixes, working without index wherever
 possible

---
 workflow/scripts/merge_neoantigen_info.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 0a2906f2..f87f23d0 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -35,7 +35,7 @@ def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
     return (
         sample_filtered.join(common_info, how="left")
         .assign(alias=alias)
-        .set_index("alias", append=True)
+        .reset_index()
     )
 
 
@@ -171,9 +171,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     all_tidy = num_var_in_pep_tidy.join(
         [num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy]
     )
-    return all_tidy.reset_index(
-        level=[i for i in all_tidy.index.names if i not in ["id", "alias"]]
-    )
+    return all_tidy.reset_index()
 
 
 def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str):
@@ -190,8 +188,9 @@ def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str):
                 subset=cols
             )
         ]
+        cols_str = '", "'.join(cols)
         raise ValueError(
-            f"Found multiple rows with identical [ \"{'\", \"'.join(cols)}\" ] entries.\n"
+            f'Found multiple rows with identical [ "{cols_str}" ] entries.\n'
             "This indicates an upstream issue, please fix this.\n"
             f"{specific_error}"
             "The offending entries are:\n"
@@ -207,10 +206,8 @@ def merge_data_frames(
     normal_filtered = get_filtered_per_alias(normal, "normal")
     all_filtered = (
         pd.concat([tumor_filtered, normal_filtered])
-        .reset_index(level=["pep_seq", "pos_in_id_seq"])
         .groupby("id", group_keys=False)
         .apply(diff_tumor_normal_peptides, column="pep_seq", tumor_alias=tumor_alias)
-        .sort_index()
     )
     info_tidy = tidy_info(info, tumor_alias)
 
@@ -219,13 +216,11 @@ def merge_data_frames(
     len_tumor_id = len(tumor_filtered["id"][0])
     len_normal_id = len(normal_filtered["id"][0])
     assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n"
-    info_tidy['id'] = info_tidy['id'].str[:len_tumor_id]
+    info_tidy["id"] = info_tidy["id"].str[:len_tumor_id]
     # Double-check for duplicates resulting from the id truncation
     check_duplicates(info_tidy, ["id", "alias"])
 
-    all_annotated = all_filtered.join(info_tidy, how="left").reset_index(
-        level=["id", "alias"]
-    )
+    all_annotated = all_filtered.join(info_tidy, how="left", on=["id", "alias"])
 
     # Double-check for weird duplicates, as previously done in Jan's code.
     check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"])

From 377bba433a0a1ddd6d981e5e1342ad19f6470d91 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 16 Aug 2022 12:16:40 +0000
Subject: [PATCH 137/191] further fixes to remove indices and for more specific
 error messages

---
 workflow/scripts/merge_neoantigen_info.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index f87f23d0..c98a1ed1 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -58,7 +58,7 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
 def diff_tumor_normal_peptides(
     group: pd.DataFrame, column: str, tumor_alias: str
 ) -> pd.DataFrame:
-    group = group.reset_index(level="alias")
+    group = group
     normal_pep = group.loc[group["alias"] == "normal", column].fillna("")
     if normal_pep.empty:
         normal_pep = ""
@@ -80,7 +80,7 @@ def diff_tumor_normal_peptides(
     t_diff, n_diff = highlight_peptides_diff(tumor_pep, normal_pep)
     group.loc[group["alias"] == tumor_alias, column] = t_diff
     group.loc[group["alias"] == "normal", column] = n_diff
-    return group.set_index("alias", append=True)
+    return group
 
 
 def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
@@ -174,7 +174,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     return all_tidy.reset_index()
 
 
-def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str):
+def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = ""):
     if (
         sum(
             df.duplicated(
@@ -218,9 +218,9 @@ def merge_data_frames(
     assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n"
     info_tidy["id"] = info_tidy["id"].str[:len_tumor_id]
     # Double-check for duplicates resulting from the id truncation
-    check_duplicates(info_tidy, ["id", "alias"])
+    check_duplicates(info_tidy, ["id", "alias"], specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n")
 
-    all_annotated = all_filtered.join(info_tidy, how="left", on=["id", "alias"])
+    all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"])
 
     # Double-check for weird duplicates, as previously done in Jan's code.
     check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"])

From b425ff77bbda11266645ca31e43863215b32bb7b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 12:38:13 +0000
Subject: [PATCH 138/191] sort merged neoantigen infos by el_rank of tumor
 sample

---
 workflow/scripts/merge_neoantigen_info.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index c98a1ed1..d50f2a05 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -257,9 +257,13 @@ def merge_data_frames(
         "freq_credible_interval",
     ]
 
-    return all_annotated.reindex(columns=column_order).sort_values(
-        by=["chrom", "offset", "id", "alias"]
-    )
+    def get_id_rank(group: pd.DataFrame, tumor_alias: str):
+        return group.loc[group["alias"] == tumor_alias, 'top_el_rank_el_rank'].squeeze()
+    
+    sort_rank = all_annotated.groupby('id').apply(get_id_rank, tumor_alias).rename('sort_rank')
+    all_sorted = all_annotated.merge(sort_rank, on=['id'], how='left').sort_values(['sort_rank', 'id', 'alias'], ascending=[True, True, False]).drop(columns='sort_rank')
+
+    return all_sorted.reindex(columns=column_order)
 
 
 info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str)

From fb98a80ffa88e233a614efdc1fd38034d48dea2d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 12:39:49 +0000
Subject: [PATCH 139/191] get rid of further index usages

---
 workflow/scripts/merge_neoantigen_info.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index d50f2a05..c2082da7 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -15,27 +15,24 @@ def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
     return (
         df.loc[df.groupby(["pep_seq", "id"])[rank_col].idxmin(), columns_to_keep]
         .reset_index(level="allele")
-        .sort_index()
-        .drop_duplicates()
         .add_prefix(prefix)
+        .reset_index()
+        .drop_duplicates()
     )
 
 
 def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
+    merge_cols = ["id", "pos_in_id_seq", "pep_seq"]
     common_info = (
-        sample.set_index(["id", "pos_in_id_seq", "pep_seq"])
-        .loc[:, ["ave_el_score", "num_binders"]]
-        .reset_index(level=["id", "pos_in_id_seq"])
+        sample.loc[:, merge_cols + ["ave_el_score", "num_binders"]]
         .drop_duplicates()
-        .set_index(["id", "pos_in_id_seq"], append=True)
     )
     sample_el = get_best_rank_per_peptide(sample, "el")
     sample_ba = get_best_rank_per_peptide(sample, "ba")
-    sample_filtered = sample_el.join(sample_ba)
+    sample_filtered = sample_el.merge(sample_ba, on=merge_cols)
     return (
-        sample_filtered.join(common_info, how="left")
+        sample_filtered.merge(common_info, how="left", on=merge_cols)
         .assign(alias=alias)
-        .reset_index()
     )
 
 

From 167ea484898baf0c3eb12df5227bfb8b3b3b6f37 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 12:40:33 +0000
Subject: [PATCH 140/191] remove peptides with stop codon earlier

---
 workflow/scripts/merge_neoantigen_info.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index c2082da7..c6136f6a 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -62,18 +62,17 @@ def diff_tumor_normal_peptides(
     else:
         normal_pep = normal_pep.squeeze()
     tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze()
+    # Remove groups where the tumor peptide contains a stop codon.
+    # TODO: Maybe this should be a hard fail complaining to fix this upstream?
+    if "X" in tumor_pep:
+        print(f"Warning: ", file=sys.stderr)
+        return group.loc[[], :]
     # Silent mutations should not be included in microphaser output.
     if normal_pep == tumor_pep:
         raise ValueError(
             f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n"
             "Please fix this upstream or comment out this check to ignore this problem.\n"
         )
-    # Remove groups where the tumor peptide contains a stop codon.
-    # TODO: Maybe this should be a hard fail complaining to fix this upstream?
-    # TODO: Write out warning.
-    if "X" in tumor_pep:
-        print(f"Warning: ", file=sys.stderr)
-        return group.loc[[], :]
     t_diff, n_diff = highlight_peptides_diff(tumor_pep, normal_pep)
     group.loc[group["alias"] == tumor_alias, column] = t_diff
     group.loc[group["alias"] == "normal", column] = n_diff

From 650a3500900972b1b2b44c93ec54c631bc0e1224 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 12:41:15 +0000
Subject: [PATCH 141/191] aggregate all transcripts for a peptide into a list
 column

---
 workflow/scripts/merge_neoantigen_info.py | 34 +++++++++++++----------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index c6136f6a..cea3c94a 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -85,21 +85,27 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     """
     info = info.rename(
         columns={"credible_interval": "freq_credible_interval"}
-    ).set_index(
-        [
-            "id",
-            "transcript",
-            "gene_id",
-            "gene_name",
-            "chrom",
-            "offset",
-            "frame",
-            "freq",
-            "freq_credible_interval",
-            "depth",
-            "strand",
-        ]
     )
+    # Aggregate multiple identical entries that differ only in 'id' and 'transcript'
+    # into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated
+    # list.
+    cols = [ c for c in info.columns if c not in ['id', 'transcript'] ]
+    aggregation_functions = {'id': lambda i: list(i), 'transcript': lambda t: '|'.join(set(t)) }
+    info = info.groupby(cols, dropna=False).agg(aggregation_functions).reset_index().explode('id').set_index(
+            [
+                "id",
+                "transcript",
+                "gene_id",
+                "gene_name",
+                "chrom",
+                "offset",
+                "frame",
+                "freq",
+                "freq_credible_interval",
+                "depth",
+                "strand",
+            ]
+        )
     int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"]
     info[int_cols] = info[int_cols].astype("int32")
     # TODO: Ensure that microphaser output contains only one entry per id.

From 4467eb7d61707c01b36069a67a747615e6fb556b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 12:42:30 +0000
Subject: [PATCH 142/191] remove duplicate entries that differ only in
 microphaser id

---
 workflow/scripts/merge_neoantigen_info.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index cea3c94a..07dfce50 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -185,9 +185,10 @@ def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = ""
         )
         > 0
     ):
-        duplicates = all_annotated[
+        duplicates = df[
             df.duplicated(
-                subset=cols
+                subset=cols,
+                keep=False,
             )
         ]
         cols_str = '", "'.join(cols)
@@ -225,7 +226,11 @@ def merge_data_frames(
     all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"])
 
     # Double-check for weird duplicates, as previously done in Jan's code.
-    check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"])
+    # Jan's code was only checking for ["transcript", "offset", "pep_seq", "aa_changes"],
+    # we check for everything except id.
+    cols_without_id = [ c for c in all_annotated.columns if c not in ['id'] ]
+    all_annotated = all_annotated.drop_duplicates(subset=cols_without_id)
+    check_duplicates(all_annotated, cols_without_id)
 
     column_order = [
         "id",

From 732717fe68722305c30c8e64087ed08346b6a9a3 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 12:46:56 +0000
Subject: [PATCH 143/191] update microphaser to 0.6 with filter TSV header fix

---
 workflow/envs/microphaser.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml
index 00657086..d781c04c 100644
--- a/workflow/envs/microphaser.yaml
+++ b/workflow/envs/microphaser.yaml
@@ -2,4 +2,4 @@ channels:
   - bioconda
   - conda-forge
 dependencies:
-  - microphaser =0.5
+  - microphaser =0.6

From d078e9cfd20a3812e00c7107b2a3bf1af700948e Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 18:13:04 +0000
Subject: [PATCH 144/191] detailed description of columns in neoantigen results
 table

---
 workflow/report/WES_results.rst     |  1 -
 workflow/report/neoantigens.DNA.rst | 60 +++++++++++++++++++++++++++++
 workflow/rules/MHC_binding.smk      |  4 +-
 3 files changed, 62 insertions(+), 3 deletions(-)
 delete mode 100644 workflow/report/WES_results.rst
 create mode 100644 workflow/report/neoantigens.DNA.rst

diff --git a/workflow/report/WES_results.rst b/workflow/report/WES_results.rst
deleted file mode 100644
index 4a699bdf..00000000
--- a/workflow/report/WES_results.rst
+++ /dev/null
@@ -1 +0,0 @@
-"Results - neoantigen candidate table"
diff --git a/workflow/report/neoantigens.DNA.rst b/workflow/report/neoantigens.DNA.rst
new file mode 100644
index 00000000..17ba537e
--- /dev/null
+++ b/workflow/report/neoantigens.DNA.rst
@@ -0,0 +1,60 @@
+Neoantigens and corresponding normal peptides as phased and determined by
+microphaser, with eluted ligand / binding affinity predictions by netMHC
+(netMHCpan and netMHCIIpan) to the HLA alleles determined by HLA-LA.
+
+===================
+Column descriptions
+===================
+
+* **id**: Peptide ID assigned by microphaser.
+* **pep_seq**: Sequence of the full peptide that was given to netMHC(II)pan. Amino acids that
+    are different between the normal and the tumor sample are highlighted in lower case.
+* **pos_in_id_seq**: Position in pep_seq of the peptide that was used for prediction.
+    It seems like netMHCpan positions start at 0 and netMHCIIpan positions at 1.
+* **alias**: Indicator of the type of sample (normal vs. some tumor sample).
+* **num_binders**: Total number of peptide-HLA allele pairs from pep_seq that are considered binders,
+    either weak or strong. Cutoffs are applied to el_rank and are:
+    * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder)
+    * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder)
+* **freq**: Allelic frequency of the peptide as predicted by microphaser. For a
+    credible allele frequency interval, see column freq_credible_interval.
+* **depth**: Read depth at the peptide position.
+* **num_var_sites**: Number of variant sites on the peptide's haplotype in that sample (alias).
+* **num_var_in_pep**: Number of variant sites within the peptide sequence.
+* **top_el_rank_allele**: HLA allele with the best eluted ligand prediction score percentile rank.
+* **top_el_rank_bind_core**: Binding core of the peptide for the HLA allele with the best
+    eluted ligand score percentile rank.
+* **top_el_rank_el_rank**: Percentile rank of the eluted ligand score. The rank of the predicted binding
+    score when compared to a set of random natural peptides. This measure is not
+    affected by inherent bias of certain molecules towards higher or lower mean
+    predicted affinities. It is the recommended value for determining likely
+    binders / neoantigens of interest. Cutoffs recommended by netMHC(II)pan authors
+    are:
+    * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder)
+    * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder)
+* **top_el_rank_el_score**: The raw eluted ligand prediction score.
+* **ave_el_score**: Average across the eluted ligand prediction scores of all alleles for this
+    peptide in the particular sample (alias).
+* **top_ba_rank_allele**: HLA allele with the best binding affinity prediction score percentile rank.
+* **top_ba_rank_bind_core**: Binding core of the peptide for the HLA allele with the best
+    binding affinity score percentile rank.
+* **top_ba_rank_ba_rank**: Percentile rank of the predicted binding affinity compared to a set of 100,000
+    random natural peptides. This measure is not affected by inherent bias of certain
+    molecules towards higher or lower mean predicted affinities.
+* **top_ba_rank_ba_score**: Predicted binding affinity in log-scale.
+* **aa_changes**: List of aa changes. For normal samples, only germline changes are listed. For
+    tumor samples, only somatic tumor changes are listed, even though the germline
+    changes also affect the tumor peptide.
+* **genomic_pos**: Genomic position of the nucleotide change.
+* **nt_seq**: Nucleotide sequence underlying the peptide. Nucleotide changes underlying the
+    amino acid changes are highlighted as lower case letters.    
+* **gene_name**: Common gene name / gene symbol of the peptide's gene of origin.
+* **gene_id**: Ensembl gene id of the peptide's gene of origin.
+* **transcript**: List of Ensembl transcript ids in which the peptide occurs.
+* **chrom**: Chromosome on which the peptide's gene of origin is located.    
+* **offset**: Chromosomal position of the peptide's gene of origin.    
+* **frame**: Open reading frame that the peptide originates from. 0 indicates the regular
+    reading frame, non-zero values indicate frame shifts.
+* **strand**: Strand of the gene / transcript.
+* **freq_credible_interval**: Credible interval for freq, the allelic frequency of the peptide as predicted
+    by microphaser.
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
index f6163b26..4ee0c185 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/MHC_binding.smk
@@ -72,8 +72,8 @@ rule merge_neoantigen_info:
     output:
         report(
             "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
-            caption="../report/WES_results.rst",
-            category="Results WES (netMHC)",
+            caption="../report/neoantigens.DNA.rst",
+            category="Neoantigens",
         ),
     log:
         "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",

From 4463176ead51ab0386bdf387f8c8051736750ab5 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 17 Aug 2022 18:14:12 +0000
Subject: [PATCH 145/191] update HLA allele report description

---
 workflow/report/HLA_Types.rst   | 1 -
 workflow/report/hla_alleles.rst | 1 +
 workflow/rules/HLAtyping.smk    | 8 ++++----
 3 files changed, 5 insertions(+), 5 deletions(-)
 delete mode 100644 workflow/report/HLA_Types.rst
 create mode 100644 workflow/report/hla_alleles.rst

diff --git a/workflow/report/HLA_Types.rst b/workflow/report/HLA_Types.rst
deleted file mode 100644
index 999fe494..00000000
--- a/workflow/report/HLA_Types.rst
+++ /dev/null
@@ -1 +0,0 @@
-Typing of HLA profile.
diff --git a/workflow/report/hla_alleles.rst b/workflow/report/hla_alleles.rst
new file mode 100644
index 00000000..a793a914
--- /dev/null
+++ b/workflow/report/hla_alleles.rst
@@ -0,0 +1 @@
+HLA allele profile as determined by HLA-LA.
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
index 179a2c67..2bbb1435 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/HLAtyping.smk
@@ -55,13 +55,13 @@ rule parse_and_filter_hla_alleles_for_netmhc:
     output:
         hlaI=report(
             "results/HLA-LA/{group}.{alias}.hlaI.tsv",
-            caption="../report/HLA_Types.rst",
-            category="HLA-Typing(HLA-LA)",
+            caption="../report/hla_alleles.rst",
+            category="HLA alleles",
         ),
         hlaII=report(
             "results/HLA-LA/{group}.{alias}.hlaII.tsv",
-            caption="../report/HLA_Types.rst",
-            category="HLA-Typing(HLA-LA)",
+            caption="../report/hla_alleles.rst",
+            category="HLA alleles",
         ),
     log:
         "logs/parse-HLA-LA/{group}.{alias}.log",

From 2e9a0b6664c415417036b63d92b54f2874f618bd Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 13:09:07 +0000
Subject: [PATCH 146/191] snakecase and lower case wherever possible

---
 workflow/Snakefile                                 |  4 ++--
 .../{neoantigens.DNA.rst => neoantigens.dna.rst}   |  0
 .../{RNA_results.rst => neoantigens.rna.rst}       |  0
 workflow/rules/common.smk                          |  8 ++++----
 workflow/rules/{HLAtyping.smk => hla_typing.smk}   | 14 +++++++-------
 .../rules/{MHC_binding.smk => mhc_binding.smk}     | 10 +++++-----
 workflow/rules/ref.smk                             | 12 ++++++------
 7 files changed, 24 insertions(+), 24 deletions(-)
 rename workflow/report/{neoantigens.DNA.rst => neoantigens.dna.rst} (100%)
 rename workflow/report/{RNA_results.rst => neoantigens.rna.rst} (100%)
 rename workflow/rules/{HLAtyping.smk => hla_typing.smk} (86%)
 rename workflow/rules/{MHC_binding.smk => mhc_binding.smk} (94%)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index f75eba84..f00c401a 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -29,8 +29,8 @@ include: "rules/common.smk"
 include: "rules/utils.smk"
 include: "rules/ref.smk"
 include: "rules/microphaser.smk"
-include: "rules/HLAtyping.smk"
-include: "rules/MHC_binding.smk"
+include: "rules/hla_typing.smk"
+include: "rules/mhc_binding.smk"
 
 
 rule all:
diff --git a/workflow/report/neoantigens.DNA.rst b/workflow/report/neoantigens.dna.rst
similarity index 100%
rename from workflow/report/neoantigens.DNA.rst
rename to workflow/report/neoantigens.dna.rst
diff --git a/workflow/report/RNA_results.rst b/workflow/report/neoantigens.rna.rst
similarity index 100%
rename from workflow/report/RNA_results.rst
rename to workflow/report/neoantigens.rna.rst
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 9c928417..f25cdb85 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -125,8 +125,8 @@ def get_final_output():
         else:
             final_output = expand(
                 [
-                    "results/HLA-LA/{group}.{tumor_alias}.hlaI.tsv",
-                    "results/HLA-LA/{group}.{tumor_alias}.hlaII.tsv",
+                    "results/hla_la/{group}.{tumor_alias}.hlaI.tsv",
+                    "results/hla_la/{group}.{tumor_alias}.hlaII.tsv",
                 ],
                 group=group,
                 tumor_alias=tumor_aliases,
@@ -183,7 +183,7 @@ def get_bam_from_group_and_alias(ext=".bam"):
 def get_alleles_MHCI(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
-        "results/HLA-LA/{group}.{alias}.hlaI.tsv",
+        "results/hla_la/{group}.{alias}.hlaI.tsv",
         group=wildcards.group,
         alias=alias,
     )
@@ -193,7 +193,7 @@ def get_alleles_MHCII(wildcards):
     alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
         # TODO: check that hlaII is correct here, and not hlaI which it previously was
-        "results/HLA-LA/{group}.{alias}.hlaII.tsv",
+        "results/hla_la/{group}.{alias}.hlaII.tsv",
         group=wildcards.group,
         alias=alias,
     )
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/hla_typing.smk
similarity index 86%
rename from workflow/rules/HLAtyping.smk
rename to workflow/rules/hla_typing.smk
index 2bbb1435..98034abf 100644
--- a/workflow/rules/HLAtyping.smk
+++ b/workflow/rules/hla_typing.smk
@@ -1,14 +1,14 @@
-rule HLA_LA:
+rule hla_la:
     input:
         bam=get_bam_from_group_and_alias(),
         bai=get_bam_from_group_and_alias(ext=".bai"),
         index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
         ext_idx="resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac",
     output:
-        "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
+        "results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt",
     threads: 7
     log:
-        "logs/HLA-LA/{group}_{alias}.log",
+        "logs/hla_la/{group}_{alias}.log",
     params:
         graph=lambda w, input: os.path.basename(os.path.dirname(input.index)),
         graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)),
@@ -49,21 +49,21 @@ rule net_mhc_two_pan_alleles:
 
 rule parse_and_filter_hla_alleles_for_netmhc:
     input:
-        hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt",
+        hla_la_bestguess="results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt",
         mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt",
         mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt",
     output:
         hlaI=report(
-            "results/HLA-LA/{group}.{alias}.hlaI.tsv",
+            "results/hla_la/{group}.{alias}.hlaI.tsv",
             caption="../report/hla_alleles.rst",
             category="HLA alleles",
         ),
         hlaII=report(
-            "results/HLA-LA/{group}.{alias}.hlaII.tsv",
+            "results/hla_la/{group}.{alias}.hlaII.tsv",
             caption="../report/hla_alleles.rst",
             category="HLA alleles",
         ),
     log:
-        "logs/parse-HLA-LA/{group}.{alias}.log",
+        "logs/parse_hla_la/{group}.{alias}.log",
     script:
         "../scripts/parse_and_filter_hla_alleles_for_netmhc.py"
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/mhc_binding.smk
similarity index 94%
rename from workflow/rules/MHC_binding.smk
rename to workflow/rules/mhc_binding.smk
index 4ee0c185..0a326942 100644
--- a/workflow/rules/MHC_binding.smk
+++ b/workflow/rules/mhc_binding.smk
@@ -72,7 +72,7 @@ rule merge_neoantigen_info:
     output:
         report(
             "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
-            caption="../report/neoantigens.DNA.rst",
+            caption="../report/neoantigens.dna.rst",
             category="Neoantigens",
         ),
     log:
@@ -81,19 +81,19 @@ rule merge_neoantigen_info:
         "../scripts/merge_neoantigen_info.py"
 
 
-rule add_RNA_info:
+rule add_rna_info:
     input:
         counts="results/kallisto/{group}.{tumor_alias}",
         table="results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
     output:
         report(
             "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv",
-            caption="../report/RNA_results.rst",
-            category="Results RNA",
+            caption="../report/neoantigens.rna.rst",
+            category="Neoantigens",
         ),
     params:
         abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
     log:
-        "logs/add-RNA/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
+        "logs/add_rna/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
     script:
         "../scripts/add_rna_info.py"
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 0d6324a0..d041c2e6 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -142,7 +142,7 @@ rule genome_dict:
         "0.45.1/bio/picard/createsequencedictionary"
 
 
-rule download_HLALA_graph:
+rule download_hla_la_graph:
     output:
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"),
         directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/knownReferences"),
@@ -158,13 +158,13 @@ rule download_HLALA_graph:
             "/PRG_MHC_GRCh38_withIMGT/PRG", ""
         ),
     log:
-        "logs/download-HLA-LA-graph.log",
+        "logs/download_hla_la_graph.log",
     shell:
         "( cd {params.graphs_dir} && wget  http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
         "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz && rm  PRG_MHC_GRCh38_withIMGT.tar.gz ) 2> {log}"
 
 
-rule index_HLALA:
+rule index_hla_la:
     input:
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
     output:
@@ -176,12 +176,12 @@ rule index_HLALA:
         path=lambda wc, input: os.path.dirname(os.path.dirname(input[0])),
         graph=lambda wc, input: os.path.basename(os.path.dirname(input[0])),
     log:
-        "logs/index-HLA-LA-graph.log",
+        "logs/index_hla_la_graph.log",
     shell:
         "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
 
 
-rule index_HLALA_extended_ref:
+rule index_hla_la_extended_ref:
     input:
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa",
     output:
@@ -193,7 +193,7 @@ rule index_HLALA_extended_ref:
     conda:
         "../envs/hla_la.yaml"
     log:
-        "logs/index_HLA-LA_extended_ref.log",
+        "logs/index_hla_la_extended_ref.log",
     shell:
         "bwa index {input} > {log} 2>&1"
 

From 946bbdf8b41faffb706447e8c937065b2466bc9d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 13:10:45 +0000
Subject: [PATCH 147/191] snakefmt

---
 workflow/rules/mhc_binding.smk |  8 ++++++--
 workflow/rules/microphaser.smk | 24 +++++++++++++++++-------
 workflow/rules/ref.smk         |  4 ++--
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk
index 0a326942..db077370 100644
--- a/workflow/rules/mhc_binding.smk
+++ b/workflow/rules/mhc_binding.smk
@@ -12,7 +12,9 @@ rule net_mhc_pan:
         extra=config["params"]["net_mhc_pan"]["extra"],
         netMHC=config["params"]["net_mhc_pan"]["location"],
         length=config["params"]["net_mhc_pan"]["peptide_len"],
-        alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
+        alleles=lambda wc, input: ",".join(
+            pd.read_csv(input.alleles[0], header=None)[0]
+        ),
     shell:
         "( "
         "if [ -s {input.peptides} ]; "
@@ -38,7 +40,9 @@ rule net_mhc_two_pan:
         extra=config["params"]["net_mhc_two_pan"]["extra"],
         netMHC=config["params"]["net_mhc_two_pan"]["location"],
         length=config["params"]["net_mhc_two_pan"]["peptide_len"],
-        alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ),
+        alleles=lambda wc, input: ",".join(
+            pd.read_csv(input.alleles[0], header=None)[0]
+        ),
     shell:
         "( "
         "if [ -s {input.peptides} ]; "
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index a57340ad..727d5b99 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -21,7 +21,7 @@ rule add_somatic_flag:
     output:
         "results/final-calls/{group}.{set}.somatic_flag.norm.bcf",
     log:
-        "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log"
+        "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log",
     conda:
         "../envs/bcftools.yaml"
     shell:
@@ -41,14 +41,16 @@ rule merge_tumor_normal:
             "results/final-calls/{{group}}.{sets}.norm.bcf",
             sets=[
                 config["params"]["microphaser"]["variant_sets"]["normal"],
-                config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag",
+                config["params"]["microphaser"]["variant_sets"]["tumor"]
+                + ".somatic_flag",
             ],
         ),
         index=expand(
             "results/final-calls/{{group}}.{sets}.norm.bcf.csi",
             sets=[
                 config["params"]["microphaser"]["variant_sets"]["normal"],
-                config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag",
+                config["params"]["microphaser"]["variant_sets"]["tumor"]
+                + ".somatic_flag",
             ],
         ),
     output:
@@ -77,7 +79,11 @@ rule microphaser_tumor:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3,
+        window_length=lambda w: max(
+            config["params"]["net_mhc_pan"]["peptide_len"],
+            config["params"]["net_mhc_two_pan"]["peptide_len"],
+        )
+        * 3,
     shell:
         "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
         "< {input.track} > {output.mt_fasta} 2> {log}"
@@ -102,7 +108,11 @@ rule microphaser_normal:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3,
+        window_length=lambda w: max(
+            config["params"]["net_mhc_pan"]["peptide_len"],
+            config["params"]["net_mhc_two_pan"]["peptide_len"],
+        )
+        * 3,
     shell:
         "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
         "< {input.track} > {output.wt_fasta} 2> {log}"
@@ -133,7 +143,7 @@ rule build_normal_proteome_db:
     conda:
         "../envs/microphaser.yaml"
     params:
-        length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"],
+        length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"],
     shell:
         "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}"
 
@@ -159,7 +169,7 @@ rule microphaser_filter:
     conda:
         "../envs/microphaser.yaml"
     params:
-        length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"],
+        length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"],
     shell:
         "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}"
 
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index d041c2e6..6e7db8a5 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -83,7 +83,7 @@ rule create_somatic_flag_header_line:
     output:
         "resources/somatic_flag_header_line.txt",
     log:
-        "logs/create_somatic_flag_header_line.log"
+        "logs/create_somatic_flag_header_line.log",
     shell:
         """
         ( echo '##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic tumor variant">' > {output} ) 2> {log}
@@ -96,7 +96,7 @@ rule create_genome_somatic_flag_bed:
     output:
         "resources/genome.somatic_flag.bed",
     log:
-        "logs/create_genome_somatic_flag_bed.log"
+        "logs/create_genome_somatic_flag_bed.log",
     conda:
         "../envs/gawk.yaml"
     cache: True

From 5adc1dfd968efd7d7f4eab3e7068a1518c460d19 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 13:12:42 +0000
Subject: [PATCH 148/191] black on workflow/scripts/

---
 workflow/scripts/add_rna_info.py              | 10 +--
 .../scripts/count_neoantigen_occurrences.py   | 23 +++---
 workflow/scripts/merge_neoantigen_info.py     | 70 +++++++++++--------
 workflow/scripts/sample_comp_plot.py          | 53 +++++++-------
 workflow/scripts/tidy_mhc_output.py           |  6 +-
 workflow/scripts/tsv_to_xlsx.py               |  2 +-
 6 files changed, 92 insertions(+), 72 deletions(-)

diff --git a/workflow/scripts/add_rna_info.py b/workflow/scripts/add_rna_info.py
index b8690f86..0eb2298e 100644
--- a/workflow/scripts/add_rna_info.py
+++ b/workflow/scripts/add_rna_info.py
@@ -5,13 +5,15 @@
 import pandas as pd
 
 ## load data table
-data = pd.read_csv(snakemake.input["table"], sep='\t')
+data = pd.read_csv(snakemake.input["table"], sep="\t")
 
 ## Merge transcript count
-transcript_count = pd.read_csv(snakemake.params["abundance"], sep='\t')
+transcript_count = pd.read_csv(snakemake.params["abundance"], sep="\t")
 transcript_count = transcript_count[["target_id", "tpm"]]
 transcript_count.columns = ["Transcript_ID", "TPM"]
-transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split('.', expand=True)[0]
+transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split(
+    ".", expand=True
+)[0]
 data = data.merge(transcript_count, on="Transcript_ID", how="left")
 
-data.to_csv(snakemake.output[0], sep='\t', index=False)
+data.to_csv(snakemake.output[0], sep="\t", index=False)
diff --git a/workflow/scripts/count_neoantigen_occurrences.py b/workflow/scripts/count_neoantigen_occurrences.py
index a70caa36..eb9c917a 100644
--- a/workflow/scripts/count_neoantigen_occurrences.py
+++ b/workflow/scripts/count_neoantigen_occurrences.py
@@ -9,16 +9,21 @@
 
 dfs, dfs_id = dict(), []
 
-for f in files: 
-	df = pd.read_csv(f, sep = '\t') 
-	dfs[f] = df
-	df = df[["ID_tumor"]]
-	dfs_id.append(df)
+for f in files:
+    df = pd.read_csv(f, sep="\t")
+    dfs[f] = df
+    df = df[["ID_tumor"]]
+    dfs_id.append(df)
 
 ids = pd.concat(dfs_id)
-ids=ids.groupby(ids.columns.tolist(), as_index = False).size().reset_index().rename(columns = {0:"occurrences"})
-ids.to_csv("out/tables/candidate_occurrences.tsv", sep = '\t', index = False)
+ids = (
+    ids.groupby(ids.columns.tolist(), as_index=False)
+    .size()
+    .reset_index()
+    .rename(columns={0: "occurrences"})
+)
+ids.to_csv("out/tables/candidate_occurrences.tsv", sep="\t", index=False)
 
 for k, v in dfs.items():
-	df = v.merge(ids, on = "ID_tumor")
-	df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep = ',', index = False)
+    df = v.merge(ids, on="ID_tumor")
+    df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep=",", index=False)
diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 07dfce50..850792d0 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -23,16 +23,14 @@ def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
 
 def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
     merge_cols = ["id", "pos_in_id_seq", "pep_seq"]
-    common_info = (
-        sample.loc[:, merge_cols + ["ave_el_score", "num_binders"]]
-        .drop_duplicates()
-    )
+    common_info = sample.loc[
+        :, merge_cols + ["ave_el_score", "num_binders"]
+    ].drop_duplicates()
     sample_el = get_best_rank_per_peptide(sample, "el")
     sample_ba = get_best_rank_per_peptide(sample, "ba")
     sample_filtered = sample_el.merge(sample_ba, on=merge_cols)
-    return (
-        sample_filtered.merge(common_info, how="left", on=merge_cols)
-        .assign(alias=alias)
+    return sample_filtered.merge(common_info, how="left", on=merge_cols).assign(
+        alias=alias
     )
 
 
@@ -42,7 +40,9 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
     """
     if normal_p == "nan" or normal_p == "":
         return (tumor_p, normal_p)
-    assert len(tumor_p) == len(normal_p), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths."
+    assert len(tumor_p) == len(
+        normal_p
+    ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths."
     diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
     tp_changed = tumor_p
     np_changed = normal_p
@@ -83,15 +83,21 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     """
     Get the -o info output of the microphaser filter command into tidy data format.
     """
-    info = info.rename(
-        columns={"credible_interval": "freq_credible_interval"}
-    )
+    info = info.rename(columns={"credible_interval": "freq_credible_interval"})
     # Aggregate multiple identical entries that differ only in 'id' and 'transcript'
     # into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated
     # list.
-    cols = [ c for c in info.columns if c not in ['id', 'transcript'] ]
-    aggregation_functions = {'id': lambda i: list(i), 'transcript': lambda t: '|'.join(set(t)) }
-    info = info.groupby(cols, dropna=False).agg(aggregation_functions).reset_index().explode('id').set_index(
+    cols = [c for c in info.columns if c not in ["id", "transcript"]]
+    aggregation_functions = {
+        "id": lambda i: list(i),
+        "transcript": lambda t: "|".join(set(t)),
+    }
+    info = (
+        info.groupby(cols, dropna=False)
+        .agg(aggregation_functions)
+        .reset_index()
+        .explode("id")
+        .set_index(
             [
                 "id",
                 "transcript",
@@ -106,6 +112,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
                 "strand",
             ]
         )
+    )
     int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"]
     info[int_cols] = info[int_cols].astype("int32")
     # TODO: Ensure that microphaser output contains only one entry per id.
@@ -177,14 +184,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
 
 
 def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = ""):
-    if (
-        sum(
-            df.duplicated(
-                subset=cols
-            )
-        )
-        > 0
-    ):
+    if sum(df.duplicated(subset=cols)) > 0:
         duplicates = df[
             df.duplicated(
                 subset=cols,
@@ -218,17 +218,23 @@ def merge_data_frames(
     # that microphaser originally provided to make the following .join() work
     len_tumor_id = len(tumor_filtered["id"][0])
     len_normal_id = len(normal_filtered["id"][0])
-    assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n"
+    assert (
+        len_tumor_id == len_normal_id
+    ), f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n"
     info_tidy["id"] = info_tidy["id"].str[:len_tumor_id]
     # Double-check for duplicates resulting from the id truncation
-    check_duplicates(info_tidy, ["id", "alias"], specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n")
+    check_duplicates(
+        info_tidy,
+        ["id", "alias"],
+        specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n",
+    )
 
     all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"])
 
     # Double-check for weird duplicates, as previously done in Jan's code.
     # Jan's code was only checking for ["transcript", "offset", "pep_seq", "aa_changes"],
     # we check for everything except id.
-    cols_without_id = [ c for c in all_annotated.columns if c not in ['id'] ]
+    cols_without_id = [c for c in all_annotated.columns if c not in ["id"]]
     all_annotated = all_annotated.drop_duplicates(subset=cols_without_id)
     check_duplicates(all_annotated, cols_without_id)
 
@@ -265,10 +271,16 @@ def merge_data_frames(
     ]
 
     def get_id_rank(group: pd.DataFrame, tumor_alias: str):
-        return group.loc[group["alias"] == tumor_alias, 'top_el_rank_el_rank'].squeeze()
-    
-    sort_rank = all_annotated.groupby('id').apply(get_id_rank, tumor_alias).rename('sort_rank')
-    all_sorted = all_annotated.merge(sort_rank, on=['id'], how='left').sort_values(['sort_rank', 'id', 'alias'], ascending=[True, True, False]).drop(columns='sort_rank')
+        return group.loc[group["alias"] == tumor_alias, "top_el_rank_el_rank"].squeeze()
+
+    sort_rank = (
+        all_annotated.groupby("id").apply(get_id_rank, tumor_alias).rename("sort_rank")
+    )
+    all_sorted = (
+        all_annotated.merge(sort_rank, on=["id"], how="left")
+        .sort_values(["sort_rank", "id", "alias"], ascending=[True, True, False])
+        .drop(columns="sort_rank")
+    )
 
     return all_sorted.reindex(columns=column_order)
 
diff --git a/workflow/scripts/sample_comp_plot.py b/workflow/scripts/sample_comp_plot.py
index d8dbf6e0..ffbd7aaa 100644
--- a/workflow/scripts/sample_comp_plot.py
+++ b/workflow/scripts/sample_comp_plot.py
@@ -10,8 +10,10 @@
 import matplotlib.pyplot as plt
 from pysam import VariantFile
 
-variant_df = pd.read_csv(snakemake.input[0], sep='\t').fillna(0.0)
-variant_df = variant_df[["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")]]
+variant_df = pd.read_csv(snakemake.input[0], sep="\t").fillna(0.0)
+variant_df = variant_df[
+    ["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")]
+]
 ## tidy data - for facet plot
 tidy_df = variant_df.melt(id_vars=["CHROM", "POS"], var_name="Sample", value_name="VAF")
 g = sns.FacetGrid(tidy_df, col="Sample")
@@ -19,25 +21,27 @@
 g.savefig(snakemake.output["facet"])
 plt.close()
 ## pairplot
-sns.pairplot(variant_df.drop(["CHROM", "POS"],axis=1), diag_kind="kde")
+sns.pairplot(variant_df.drop(["CHROM", "POS"], axis=1), diag_kind="kde")
 plt.savefig(snakemake.output["pairplot"])
 plt.close()
 
+
 def overlap_pct(x, y, **kws):
     n = 0
-    for i in range(0,len(x)):
+    for i in range(0, len(x)):
         if (x[i] > 0) & (y[i] > 0):
             n += 1
-    overlap=n/len([e for e in x if e > 0])
+    overlap = n / len([e for e in x if e > 0])
     ax = plt.gca()
-    ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4))
-    ax.annotate("Shared Variants: {}".format(n), xy=(.2, .6))
+    ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(0.2, 0.4))
+    ax.annotate("Shared Variants: {}".format(n), xy=(0.2, 0.6))
+
 
 def variants(x, **kws):
     positive = len([e for e in x if e > 0])
-    ax = plt.gca() 
-    ax.annotate("#Variants: {}".format(positive), xy=(.2, .5),
-xycoords=ax.transAxes)
+    ax = plt.gca()
+    ax.annotate("#Variants: {}".format(positive), xy=(0.2, 0.5), xycoords=ax.transAxes)
+
 
 g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1))
 
@@ -46,7 +50,7 @@ def variants(x, **kws):
 g.savefig(snakemake.output["grid"])
 plt.close()
 
-#def neg_overlap_pct(x, y, **kws):
+# def neg_overlap_pct(x, y, **kws):
 #    n = 0
 #    for i in range(0,len(x)):
 #        if (x[i] == 0) & (y[i] == 0):
@@ -56,25 +60,24 @@ def variants(x, **kws):
 #    ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4))
 #    ax.annotate("Shared missing variants: {}".format(n), xy=(.2, .6))
 
-#def neg_variants(x, **kws):
+# def neg_variants(x, **kws):
 #    zero = len([e for e in x if e == 0])
-#    ax = plt.gca() 
+#    ax = plt.gca()
 #    ax.annotate("#Missing Variants: {}".format(zero), xy=(.2, .5),
-#xycoords=ax.transAxes)
+# xycoords=ax.transAxes)
 
-#g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1))
+# g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1))
 
-#g.map_offdiag(neg_overlap_pct)
-#g.map_diag(neg_variants)
-#g.savefig("plots/Mssing_Variant_table.pdf")
-#plt.close()
+# g.map_offdiag(neg_overlap_pct)
+# g.map_diag(neg_variants)
+# g.savefig("plots/Mssing_Variant_table.pdf")
+# plt.close()
 
-for c in variant_df.drop(["CHROM", "POS"], axis=1).columns: 
-    sns.distplot(variant_df[[c]][variant_df[c] > 0]) 
-    plt.title(c) 
-    plt.savefig("plots/positive_" + c +".distplot.pdf")
+for c in variant_df.drop(["CHROM", "POS"], axis=1).columns:
+    sns.distplot(variant_df[[c]][variant_df[c] > 0])
+    plt.title(c)
+    plt.savefig("plots/positive_" + c + ".distplot.pdf")
     plt.close()
     sns.distplot(variant_df[[c]])
-    plt.savefig("plots/all_" + c +".distplot.pdf")
+    plt.savefig("plots/all_" + c + ".distplot.pdf")
     plt.close()
-
diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py
index 3d29dfb8..fa129c0a 100644
--- a/workflow/scripts/tidy_mhc_output.py
+++ b/workflow/scripts/tidy_mhc_output.py
@@ -13,7 +13,7 @@
 # * generated with the `-BA` option to include binding affinity prediction
 
 # The mapping of index column names used here to original names in netMHCpan files
-# is (please excuse the pd.NA tuples, they make header and index handling 
+# is (please excuse the pd.NA tuples, they make header and index handling
 # easier further down the line):
 INDEX_NAMES = {
     (pd.NA, "Pos"): "pos_in_id_seq",
@@ -88,9 +88,7 @@ def parse_file(mhc_in: str):
     header = pd.concat([first_header_line, second_header_line], axis="columns")
     header = header.fillna(method="ffill")
     header.loc[
-        header.column_name.isin(
-            [ index_col for (_, index_col) in INDEX_NAMES.keys() ]
-        ),
+        header.column_name.isin([index_col for (_, index_col) in INDEX_NAMES.keys()]),
         "allele",
     ] = pd.NA
 
diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py
index 5d9bf10f..f99e77ec 100644
--- a/workflow/scripts/tsv_to_xlsx.py
+++ b/workflow/scripts/tsv_to_xlsx.py
@@ -5,4 +5,4 @@
 import pandas as pd
 
 data = pd.read_csv(snakemake.input.tsv, sep="\t")
-data.to_excel(snakemake.output.xlsx, index=False)
\ No newline at end of file
+data.to_excel(snakemake.output.xlsx, index=False)

From b166c9d52591add05860e791535e7b3306759505 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 13:21:15 +0000
Subject: [PATCH 149/191] fix config.schema.yaml

---
 workflow/schemas/config.schema.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 1186fc44..60708436 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -102,5 +102,5 @@ required:
   - units
   - ref
   - params
-  - epitope_prediction
+  - neoantigen_prediction
   - affinity

From 3b3ed3ec6fb3c4672e0f2635c0a160b9bc1c08f7 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 13:32:34 +0000
Subject: [PATCH 150/191] further config.schema.yaml fixes

---
 workflow/schemas/config.schema.yaml | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 60708436..eedcb838 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -4,21 +4,6 @@ description: snakemake configuration file
 
 type: object
 
-definitions:
-  filterentry:
-    type: object
-    additionalProperties:
-      type: string
-  evententry:
-    type: object
-    properties:
-      varlociraptor:
-        type: array
-        items: 
-          type: string
-      filter:
-        type: string
-
 properties:
   samples:
     type: string
@@ -48,9 +33,6 @@ properties:
       activate:
         type: boolean
   
-  affinity:
-    type: object
-    properties:
   params:
     type: object
     properties:
@@ -103,4 +85,3 @@ required:
   - ref
   - params
   - neoantigen_prediction
-  - affinity

From 20eb61afe4c768d2f4ffda7fdc3527577a4aa195 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 13:44:19 +0000
Subject: [PATCH 151/191] silence misguided lint

---
 workflow/rules/ref.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 6e7db8a5..f6be3bb0 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -155,7 +155,7 @@ rule download_hla_la_graph:
         "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
     params:
         graphs_dir=lambda w, output: output[0].replace(
-            "/PRG_MHC_GRCh38_withIMGT/PRG", ""
+            "graphs/PRG_MHC_GRCh38_withIMGT/PRG", "graphs"
         ),
     log:
         "logs/download_hla_la_graph.log",

From 3ee90aba269460c0b6dd6e4ea8c11307eb22f6dd Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 14:30:08 +0000
Subject: [PATCH 152/191] fix different config entries

---
 config/config.yaml                  |  2 +-
 workflow/rules/common.smk           |  8 ++++----
 workflow/rules/microphaser.smk      | 10 +++++-----
 workflow/schemas/config.schema.yaml |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index a9c006cd..10c46191 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -21,7 +21,7 @@ ref:
 params:
   microphaser:
     # window_len should be at least 3 times the longest peptide_len specified below
-    variant_sets:
+    events:
       normal: "normal_only"
       tumor: "tumor_only"
   net_mhc_pan:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index f25cdb85..0fd753f0 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -63,12 +63,12 @@ wildcard_constraints:
         pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])
     ),
     normal_alias="normal",
-    tumor_set=config["params"]["microphaser"]["variant_sets"]["tumor"],
-    normal_set=config["params"]["microphaser"]["variant_sets"]["normal"],
+    tumor_set=config["params"]["microphaser"]["events"]["tumor"],
+    normal_set=config["params"]["microphaser"]["events"]["normal"],
     set="|".join(
         [
-            config["params"]["microphaser"]["variant_sets"]["tumor"],
-            config["params"]["microphaser"]["variant_sets"]["normal"],
+            config["params"]["microphaser"]["events"]["tumor"],
+            config["params"]["microphaser"]["events"]["normal"],
         ]
     ),
     group="|".join(pd.unique(samples["group"])),
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 727d5b99..6879e32f 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -40,16 +40,16 @@ rule merge_tumor_normal:
         calls=expand(
             "results/final-calls/{{group}}.{sets}.norm.bcf",
             sets=[
-                config["params"]["microphaser"]["variant_sets"]["normal"],
-                config["params"]["microphaser"]["variant_sets"]["tumor"]
+                config["params"]["microphaser"]["events"]["normal"],
+                config["params"]["microphaser"]["events"]["tumor"]
                 + ".somatic_flag",
             ],
         ),
         index=expand(
             "results/final-calls/{{group}}.{sets}.norm.bcf.csi",
             sets=[
-                config["params"]["microphaser"]["variant_sets"]["normal"],
-                config["params"]["microphaser"]["variant_sets"]["tumor"]
+                config["params"]["microphaser"]["events"]["normal"],
+                config["params"]["microphaser"]["events"]["tumor"]
                 + ".somatic_flag",
             ],
         ),
@@ -153,7 +153,7 @@ rule microphaser_filter:
         tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv",
         proteome=expand(
             "results/microphaser/bin/{{group}}.{normal_set}.{{mhc}}.normal_proteome.bin",
-            normal_set=config["params"]["microphaser"]["variant_sets"]["normal"],
+            normal_set=config["params"]["microphaser"]["events"]["normal"],
         ),
     output:
         mt_fasta=(
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index eedcb838..05e55b83 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -39,7 +39,7 @@ properties:
       microphaser:
         type: object
         properties:
-          variant_sets:
+          events:
             type: object
             properties:
               normal:

From 4323a572cb54ff605c4e46fb502c9004738fac1d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 23 Aug 2022 14:45:24 +0000
Subject: [PATCH 153/191] snakefmt

---
 workflow/rules/microphaser.smk | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 6879e32f..4e6a3407 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -41,16 +41,14 @@ rule merge_tumor_normal:
             "results/final-calls/{{group}}.{sets}.norm.bcf",
             sets=[
                 config["params"]["microphaser"]["events"]["normal"],
-                config["params"]["microphaser"]["events"]["tumor"]
-                + ".somatic_flag",
+                config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag",
             ],
         ),
         index=expand(
             "results/final-calls/{{group}}.{sets}.norm.bcf.csi",
             sets=[
                 config["params"]["microphaser"]["events"]["normal"],
-                config["params"]["microphaser"]["events"]["tumor"]
-                + ".somatic_flag",
+                config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag",
             ],
         ),
     output:

From 4dc72784d52a07636a033f8512332071d3011f94 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 26 Aug 2022 18:37:13 +0000
Subject: [PATCH 154/191] fix deduplication of microphaser filter info tsv

---
 workflow/scripts/merge_neoantigen_info.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
index 850792d0..669a7716 100644
--- a/workflow/scripts/merge_neoantigen_info.py
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -87,10 +87,13 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
     # Aggregate multiple identical entries that differ only in 'id' and 'transcript'
     # into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated
     # list.
-    cols = [c for c in info.columns if c not in ["id", "transcript"]]
+    cols = [c for c in info.columns if c not in ["id", "transcript", "depth", "freq", "freq_credible_interval"]]
     aggregation_functions = {
         "id": lambda i: list(i),
-        "transcript": lambda t: "|".join(set(t)),
+        "transcript": lambda t: "|".join(list(t)),
+        "depth": lambda d: "|".join(list(d)),
+        "freq": lambda f: "|".join(list(f)),
+        "freq_credible_interval": lambda c: "|".join(list(c)),
     }
     info = (
         info.groupby(cols, dropna=False)

From 6251ac4deff41dd5c879398186e7a460e3a2e41e Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 26 Aug 2022 18:38:14 +0000
Subject: [PATCH 155/191] NeoFox conda environment creation setup with
 post-deploy

---
 workflow/Snakefile                        |  14 +++
 workflow/envs/neo_fox_deps.post-deploy.sh | 144 ++++++++++++++++++++++
 workflow/envs/neo_fox_deps.yaml           |  30 +++++
 3 files changed, 188 insertions(+)
 create mode 100755 workflow/envs/neo_fox_deps.post-deploy.sh
 create mode 100644 workflow/envs/neo_fox_deps.yaml

diff --git a/workflow/Snakefile b/workflow/Snakefile
index f00c401a..dee9f9ff 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -11,6 +11,20 @@ configfile: "config/config.yaml"
 scattergather:
     calling=24,
 
+##### required envvars #####
+
+envvars:
+    # For NeoFox installation:
+    # The tarballs for both netMHCpan and netMHCIIpan
+    # need to be downloaded manually after registering
+    # online and deposited at a filesystem location
+    # that needs to be available as a shell environment
+    # variable when running snakemake.
+    # For downloading, see the "Downloads" tab at:
+    # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+    "NET_MHC_PAN_4_1_TARBALL",
+    # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.0
+    "NET_MHC_TWO_PAN_4_0_TARBALL",
 
 ##### setup report #####
 
diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh
new file mode 100755
index 00000000..939b3c3a
--- /dev/null
+++ b/workflow/envs/neo_fox_deps.post-deploy.sh
@@ -0,0 +1,144 @@
+set -euo pipefail
+
+# set all the necessary conda paths and
+# ensure they exist
+CONDA_BIN="${CONDA_PREFIX}/bin/"
+CONDA_MAN1="${CONDA_PREFIX}/share/man/man1/"
+mkdir -p $CONDA_MAN1
+CONDA_INFO="${CONDA_PREFIX}/share/info/"
+mkdir -p $CONDA_INFO
+CONDA_LIB="${CONDA_PREFIX}/lib/"
+mkdir -p ${CONDA_LIB}
+CONDA_ETC="${CONDA_PREFIX}/etc/"
+mkdir -p $CONDA_ETC
+
+# install MixMHCpred, following:
+# https://github.com/GfellerLab/MixMHCpred/blob/v2.1/README
+MIX_MHC_PRED_VERSION="2.1"
+MIX_MHC_PRED_LIB_PATH="$CONDA_PREFIX/lib/mix_mhc_pred/"
+wget https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz
+tar xzf v${MIX_MHC_PRED_VERSION}.tar.gz
+cd MixMHCpred-${MIX_MHC_PRED_VERSION}
+g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x
+# TODO: when updating to v2.2, change this line to:
+# MMP_PLACEHOLDER="/PATH_TO_MIXMHCPRED/lib"
+MMP_PLACEHOLDER="YOUR PATH TO MixMHCpred/lib FOLDER"
+grep "${MMP_PLACEHOLDER}" MixMHCpred
+sed -i "s%${MMP_PLACEHOLDER}%${MIX_MHC_PRED_LIB_PATH}%" MixMHCpred
+mv lib $MIX_MHC_PRED_LIB_PATH
+mv MixMHCpred ${CONDA_BIN}
+# TODO: when updating to v2.2, change this line to:
+# mv MixMHCpred_license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf
+mv license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf
+MixMHCpred -i test/test.fa -o test/out.txt -a A0101,A2501,B0801,B1801
+diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt)
+cd ..
+rm v${MIX_MHC_PRED_VERSION}.tar.gz
+rm -r MixMHCpred-${MIX_MHC_PRED_VERSION}
+
+# install MixMHC2pred, mostly following (we use the default GitHub-created
+# .tar.gz file instead of the hand-crafted .zip file for future-proofing):
+# https://github.com/GfellerLab/MixMHC2pred/blob/v1.2/README.md
+MIX_MHC_TWO_PRED_VERSION="1.2"
+MIX_MHC_TWO_PRED_LIB_PATH="${CONDA_LIB}/mix_mhc_two_pred/"
+wget https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
+tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
+cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION}
+mv -t ${CONDA_BIN} MixMHC2pred MixMHC2pred_unix
+mv rpep ${CONDA_ETC}
+ln -s ${CONDA_ETC}/rpep ${CONDA_BIN}/rpep
+mv LICENSE ${CONDA_INFO}/MixMHC2pred_unix_LICENSE
+MixMHC2pred_unix -i test/testData.txt -o test/out.txt -a DRB1_11_01 DRB3_02_02 DPA1_01_03__DPB1_04_01 DQA1_05_05__DQB1_03_01
+diff test/out.txt test/out_compare.txt
+cd ..
+rm v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
+rm -r MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION}
+
+# install PRIME, mostly following (minor corrections):
+# https://github.com/GfellerLab/PRIME/blob/v1.0/README
+PRIME_VERSION="1.0"
+PRIME_LIB_PATH="${CONDA_LIB}/prime/"
+wget https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz
+tar xzf v${PRIME_VERSION}.tar.gz
+cd PRIME-${PRIME_VERSION}
+PRIME_PLACEHOLDER="/app/PRIME/lib"
+grep "${PRIME_PLACEHOLDER}" PRIME
+sed -i "s%${PRIME_PLACEHOLDER}%${PRIME_LIB_PATH}%" PRIME
+mv lib $PRIME_LIB_PATH
+mv PRIME ${CONDA_BIN}
+mv PRIME_license.pdf ${CONDA_INFO}
+PRIME -i test/test.txt -o test/out.txt -a A0201,A0101
+diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt)
+cd ..
+rm v${PRIME_VERSION}.tar.gz
+rm -r PRIME-${PRIME_VERSION}
+
+# This is the non-portable version of the 1st line
+# in both netMHCpan and netMHCIIpan, assuming a
+# root install of tcsh. 
+TCSH_ROOT="#! /bin/tcsh -f"
+# For the scripts to work with any tcsh in the
+# $PATH, we need to change this to:
+TCSH_PATH="#!/usr/bin/env tcsh"
+
+# install netMHCpan version 4.1
+# requires tcsh to have been installed via conda dependencies
+NET_MHC_PAN_4_1_LIB="${CONDA_LIB}/netMHCpan_4_1/"
+mkdir -p ${NET_MHC_PAN_4_1_LIB}
+NET_MHC_PAN_4_1_ETC="${CONDA_ETC}/netMHCpan_4_1/"
+mkdir -p ${NET_MHC_PAN_4_1_ETC}
+tar xzf ${NET_MHC_PAN_4_1_TARBALL}
+cd netMHCpan-4.1
+wget https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz
+tar xzf data.tar.gz
+rm data.tar.gz
+grep "${TCSH_ROOT}" netMHCpan
+sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCpan
+grep -P "setenv\s+NMHOME" netMHCpan
+sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_PAN_4_1_LIB}%" netMHCpan
+mv -t ${CONDA_BIN} netMHCpan
+mv -t ${CONDA_MAN1} netMHCpan.1
+mv -t ${CONDA_INFO} netMHCpan-4.1.readme
+mv -t ${NET_MHC_PAN_4_1_LIB} Linux_x86_64
+mv -t ${NET_MHC_PAN_4_1_ETC} data
+ln -s ${NET_MHC_PAN_4_1_ETC}/data ${NET_MHC_PAN_4_1_LIB}/data
+cd test
+netMHCpan -p test.pep -BA -xls -a HLA-A01:01,HLA-A02:01 -xlsfile my_NetMHCpan_out.xls
+diff NetMHCpan_out.xls my_NetMHCpan_out.xls
+cd ../..
+rm -r netMHCpan-4.1
+
+# install netMHCIIpan version 4.0 from ${NET_MHC_TWO_PAN_4_0_TARBALL} (envvar declared in the Snakefile);
+# requires tcsh to have been installed via conda dependencies
+NET_MHC_TWO_PAN_4_0_LIB="${CONDA_LIB}/netMHCIIpan_4_0/"
+mkdir -p ${NET_MHC_TWO_PAN_4_0_LIB}
+NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/"
+mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC}
+tar xzf "${NET_MHC_TWO_PAN_4_0_TARBALL}"
+cd netMHCIIpan-4.0
+wget https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz
+tar xzf data.tar.gz
+rm data.tar.gz
+grep "${TCSH_ROOT}" netMHCIIpan
+sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCIIpan
+grep -P "setenv\s+NMHOME" netMHCIIpan
+sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_TWO_PAN_4_0_LIB}%" netMHCIIpan
+mv -t ${CONDA_BIN} netMHCIIpan
+mv -t ${CONDA_MAN1} netMHCIIpan.1
+mv -t ${CONDA_INFO} netMHCIIpan-4.0.readme
+mv -t ${NET_MHC_TWO_PAN_4_0_LIB} Linux_x86_64 NetMHCIIpan-4.0.pl
+mv -t ${NET_MHC_TWO_PAN_4_0_ETC} data
+ln -s ${NET_MHC_TWO_PAN_4_0_ETC}/data ${NET_MHC_TWO_PAN_4_0_LIB}/data
+cd test
+netMHCIIpan -f example.fsa -a DRB1_0101 > example.fsa.myout
+diff example.fsa.out example.fsa.myout
+netMHCIIpan -f example.pep -inptype 1 -a DRB1_0101 > example.pep.myout
+diff example.pep.out example.pep.myout
+netMHCIIpan -f example.fsa -a H-2-IAb -s -u > example.fsa.sorted.myout
+diff example.fsa.sorted.out example.fsa.sorted.myout
+netMHCIIpan -f example.fsa -hlaseq DRB10101.fsa > example.fsa_hlaseq.myout
+diff example.fsa_hlaseq.out example.fsa_hlaseq.myout
+netMHCIIpan -f example.fsa -hlaseqA alpha.dat -hlaseq beta.dat > example.fsa_hlaseq_A+B.myout
+diff example.fsa_hlaseq_A+B.out example.fsa_hlaseq_A+B.myout
+cd ../..
+rm -r netMHCIIpan-4.0
diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml
new file mode 100644
index 00000000..c4cb4a9b
--- /dev/null
+++ b/workflow/envs/neo_fox_deps.yaml
@@ -0,0 +1,30 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  # https://neofox.readthedocs.io/en/latest/02_installation.html#step-by-step-guide-without-docker
+  - r-base =3.6
+  - python >=3.7, <=3.8
+  # implicit unmentioned dependency of MixMHCpred and PRIME
+  - perl
+  # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp
+  - blast =2.10
+  # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64
+  - cxx-compiler
+  # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts
+  - tcsh =6.24
+  # just make sure this is available for the post-deploy.sh script
+  - sed =4.8
+  # R packages mentioned at the end of this section:
+  # https://neofox.readthedocs.io/en/latest/02_installation.html#configuration-of-the-reference-folder
+  - r-lattice
+  - r-ggplot2
+  - r-caret
+  - r-peptides
+  - r-doparallel
+  - r-gbm
+  - bioconductor-biostrings
+  # https://neofox.readthedocs.io/en/latest/02_installation.html#install-neofox
+  - pip
+  - pip:
+    - neofox==0.6.4

From 0d1058cc4dd9b180de3ca6a492ceb8e15dc25ddf Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 26 Aug 2022 18:41:55 +0000
Subject: [PATCH 156/191] initial NeoFox config and run rules, rules for exact
 input missing

---
 .test/config/config.yaml                |  7 +++
 config/config.yaml                      |  7 +++
 workflow/Snakefile                      |  1 +
 workflow/rules/annotate_neoantigens.smk | 74 +++++++++++++++++++++++++
 workflow/rules/common.smk               |  1 +
 5 files changed, 90 insertions(+)
 create mode 100644 workflow/rules/annotate_neoantigens.smk

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index bcd1a88c..80c0e438 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -45,3 +45,10 @@ params:
     # the conda-provided tcsh installation, it needs to read (without quotes):
     # "#!/usr/bin/env tcsh"
     location: "../netMHCIIpan-4.1"
+  neo_fox:
+    activate: false
+    # This should be at least as long as the desired net_mhc_two_pan peptide length
+    peptide_len: 15
+    # update the version number to get a newer release of the reference set of HLA Alleles
+    hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt"
+    extra: ""
diff --git a/config/config.yaml b/config/config.yaml
index 10c46191..083fc651 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -46,4 +46,11 @@ params:
     # the conda-provided tcsh installation, it needs to read (without quotes):
     # "#!/usr/bin/env tcsh"
     location: "../netMHCIIpan-4.1"
+  neo_fox:
+    activate: false
+    # This should be at least as long as the desired net_mhc_two_pan peptide length
+    peptide_len: 15
+    # update the version number to get a newer release of the reference set of HLA Alleles
+    hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt"
+    extra: ""
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index dee9f9ff..ee333ba2 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -45,6 +45,7 @@ include: "rules/ref.smk"
 include: "rules/microphaser.smk"
 include: "rules/hla_typing.smk"
 include: "rules/mhc_binding.smk"
+include: "rules/annotate_neoantigens.smk"
 
 
 rule all:
diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
new file mode 100644
index 00000000..b610c948
--- /dev/null
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -0,0 +1,74 @@
+rule prepare_neo_fox_config_and_resources:
+    output:
+        config="resources/neo_fox/neo_fox_config.txt",
+        references=directory("resources/neo_fox/references/"),
+    conda:
+        "../envs/neo_fox_deps.yaml"
+    params:
+        hla_alleles=config["params"]["neofox"]["hla_alleles"],
+    shell:
+        """
+        # environment variables necessary for neofox-configure
+
+        ## pre-installed via conda
+        export NEOFOX_MAKEBLASTDB=makeblastdb
+        echo 'NEOFOX_MAKEBLASTDB=makeblastdb' > {output.config}
+        export NEOFOX_RSCRIPT=Rscript
+        echo 'NEOFOX_RSCRIPT=Rscript' >> {output.config}
+
+        ## pre-installed into conda environment via post-deploy script
+        export NEOFOX_NETMHCPAN=netMHCpan
+        echo 'NEOFOX_NETMHCPAN=netMHCpan' >> {output.config}
+        export NEOFOX_NETMHC2PAN=netMHCIIpan
+        echo 'NEOFOX_NETMHC2PAN=netMHCIIpan' >> {output.config}
+
+        ## specification of hla_allele link via config.yaml
+        export NEOFOX_HLA_DATABASE={params.hla_alleles}
+
+        neofox-configure --reference-folder {output.references}
+        echo 'NEOFOX_REFERENCE_FOLDER={output.references}' >> {output.config}
+
+        # further environment variables needed for the config file
+
+        ## pre-installed via conda
+        echo 'NEOFOX_BLASTP=blastp' >> {output.config}
+        
+        ## pre-installed into conda environment via post-deploy script
+        echo 'NEOFOX_MIXMHCPRED=MixMHCpred' >> {output.config}
+        echo 'NEOFOX_MIXMHC2PRED=MixMHC2pred_unix' >> {output.config}
+        echo 'NEOFOX_PRIME=PRIME' >> {output.config}
+        """
+
+
+rule neo_fox:
+    input:
+        config="resources/neo_fox/neo_fox_config.txt",
+        references=directory("resources/neo_fox/references/"),
+        candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.neo_fox.tsv",
+        patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
+    output:
+        tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv"
+        json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json"
+        meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json"
+    threads: 8
+    conda:
+        "../envs/neo_fox_deps.yaml"
+    params:
+        folder=lambda wc, output: path.dirname(output.annotated),
+        prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0],
+        organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported"
+    shell:
+        "(neofox "
+        "  --num_cpus {threads} "
+        "  --config {input.config} "
+        "  --candidate-file {input.candidates} "
+        "  --patient-data {input.patient_annotation} "
+        "  --with-table "
+        "  --with-json "
+        "  --organism {params.organism} "
+        "  --output-folder {params.folder} "
+        "  --output-prefix {params.prefix} ; "
+        " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; "
+        " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; "
+        " mv {params.folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; "
+        ") 2> {log} "
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 0fd753f0..68725904 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,6 +1,7 @@
 import glob
 
 import pandas as pd
+from os import path
 from snakemake.remote import FTP
 from snakemake.utils import validate
 

From 3d8bbe126cfc6306dcf104cc12ccebffdac02537 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 26 Aug 2022 18:48:11 +0000
Subject: [PATCH 157/191] fix typos

---
 workflow/rules/annotate_neoantigens.smk | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index b610c948..5c4a4029 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -5,7 +5,7 @@ rule prepare_neo_fox_config_and_resources:
     conda:
         "../envs/neo_fox_deps.yaml"
     params:
-        hla_alleles=config["params"]["neofox"]["hla_alleles"],
+        hla_alleles=config["params"]["neo_fox"]["hla_alleles"],
     shell:
         """
         # environment variables necessary for neofox-configure
@@ -47,16 +47,16 @@ rule neo_fox:
         candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.neo_fox.tsv",
         patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
     output:
-        tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv"
-        json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json"
-        meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json"
+        tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
+        json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json",
+        meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json",
     threads: 8
     conda:
         "../envs/neo_fox_deps.yaml"
     params:
         folder=lambda wc, output: path.dirname(output.annotated),
         prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0],
-        organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported"
+        organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported",
     shell:
         "(neofox "
         "  --num_cpus {threads} "

From 2f8abd4f663711dc2ce881602249e78d927361cc Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 30 Aug 2022 12:09:55 +0000
Subject: [PATCH 158/191] update microphaser to 0.7.0 for filter output adapted
 for NeoFox

---
 workflow/envs/microphaser.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml
index d781c04c..90698479 100644
--- a/workflow/envs/microphaser.yaml
+++ b/workflow/envs/microphaser.yaml
@@ -2,4 +2,4 @@ channels:
   - bioconda
   - conda-forge
 dependencies:
-  - microphaser =0.6
+  - microphaser =0.7

From 0affc660dd9f9c07b2c83cfd94ca8965ee7fb4a0 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 30 Aug 2022 12:10:51 +0000
Subject: [PATCH 159/191] consistently use peptide_length wildcard and contigs/
 folder throughout

---
 workflow/rules/mhc_binding.smk | 20 ++++++----
 workflow/rules/microphaser.smk | 68 ++++++++++++++--------------------
 2 files changed, 41 insertions(+), 47 deletions(-)

diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk
index db077370..0a6079ca 100644
--- a/workflow/rules/mhc_binding.smk
+++ b/workflow/rules/mhc_binding.smk
@@ -1,11 +1,14 @@
 rule net_mhc_pan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_pan.{contig}.{peptide_type}.fa",
+        peptides=expand(
+            "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa",
+            peptide_length=config["params"]["net_mhc_pan"]["peptide_len"],
+        ),
         alleles=get_alleles_MHCI,
     output:
-        "results/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
+        "results/net_mhc_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
     log:
-        "logs/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+        "logs/net_mhc_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     conda:
         "../envs/tcsh.yaml"
     params:
@@ -28,12 +31,15 @@ rule net_mhc_pan:
 
 rule net_mhc_two_pan:
     input:
-        peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_two_pan.{contig}.{peptide_type}.fa",
+        peptides=expand(
+            "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa",
+            peptide_length=config["params"]["net_mhc_two_pan"]["peptide_len"],
+        ),
         alleles=get_alleles_MHCII,
     output:
-        "results/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
+        "results/net_mhc_two_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
     log:
-        "logs/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+        "logs/net_mhc_two_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
     conda:
         "../envs/tcsh.yaml"
     params:
@@ -57,7 +63,7 @@ rule net_mhc_two_pan:
 rule tidy_mhc_out:
     input:
         expand(
-            "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv",
+            "results/{{mhc}}/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv",
             contig=contigs,
         ),
     output:
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 4e6a3407..a0bfc6c6 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -69,19 +69,15 @@ rule microphaser_tumor:
         track="resources/annotation/{contig}.gtf",
         ref="resources/genome.fasta",
     output:
-        mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.neo.fa",
-        wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.normal.fa",
-        tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv",
+        mt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa",
+        wt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa",
+        tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv",
     log:
-        "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.{contig}.log",
+        "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda w: max(
-            config["params"]["net_mhc_pan"]["peptide_len"],
-            config["params"]["net_mhc_two_pan"]["peptide_len"],
-        )
-        * 3,
+        window_length=lambda wc: int(wc.peptide_length) * 3
     shell:
         "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
         "< {input.track} > {output.mt_fasta} 2> {log}"
@@ -96,21 +92,17 @@ rule microphaser_normal:
         ref="resources/genome.fasta",
     output:
         wt_fasta=(
-            "results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa"
+            "results/microphaser/fasta/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.fa"
         ),
         wt_tsv=(
-            "results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv"
+            "results/microphaser/info/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.tsv"
         ),
     log:
-        "logs/microphaser_normal/{group}/{normal_alias}.{normal_set}-{contig}.log",
+        "logs/microphaser_normal/contigs/{group}/{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda w: max(
-            config["params"]["net_mhc_pan"]["peptide_len"],
-            config["params"]["net_mhc_two_pan"]["peptide_len"],
-        )
-        * 3,
+        window_length=lambda wc: int(wc.peptide_length) * 3
     shell:
         "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
         "< {input.track} > {output.wt_fasta} 2> {log}"
@@ -119,69 +111,65 @@ rule microphaser_normal:
 rule concat_normal_proteome:
     input:
         expand(
-            "results/microphaser/fasta/{{group}}/normal.{{normal_set}}.{contig}.fa",
+            "results/microphaser/fasta/contigs/{{group}}.normal.{{normal_set}}.pep_len_{{peptide_length}}.{contig}.fa",
             contig=contigs,
         ),
     output:
-        "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa",
+        "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa",
     log:
-        "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.log",
+        "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.pep_len_{peptide_length}.log",
     shell:
         "cat {input} > {output} 2> {log}"
 
 
 rule build_normal_proteome_db:
     input:
-        "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa",
+        "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa",
     output:
-        bin="results/microphaser/bin/{group}.{normal_set}.{mhc}.normal_proteome.bin",
-        fasta="results/microphaser/fasta/{group}.{normal_set}.{mhc}.normal_proteome.peptides.fasta",
+        bin="results/microphaser/bin/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.bin",
+        fasta="results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.peptides.fasta",
     log:
-        "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}-{mhc}.log",
+        "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}.pep_len_{peptide_length}.log",
     conda:
         "../envs/microphaser.yaml"
-    params:
-        length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"],
     shell:
-        "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}"
+        "( microphaser build_reference -r {input} -o {output.bin} -l {wildcards.peptide_length} > {output.fasta} ) 2> {log}"
 
 
 rule microphaser_filter:
     input:
-        tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv",
+        tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv",
         proteome=expand(
-            "results/microphaser/bin/{{group}}.{normal_set}.{{mhc}}.normal_proteome.bin",
+            "results/microphaser/bin/{{group}}.{normal_set}.normal_proteome.pep_len_{{peptide_length}}.bin",
             normal_set=config["params"]["microphaser"]["events"]["normal"],
         ),
     output:
         mt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.neo.fa"
+            "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa"
         ),
         wt_fasta=(
-            "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.normal.fa"
+            "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa"
         ),
-        tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.tsv",
-        removed="results/microphaser/info/removed/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.removed.tsv",
+        tsv="results/microphaser/info/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv",
+        removed="results/microphaser/info/removed/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.removed.tsv",
     log:
-        "logs/microphaser_filter/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.log",
+        "logs/microphaser_filter/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log",
     conda:
         "../envs/microphaser.yaml"
-    params:
-        length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"],
     shell:
-        "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}"
+        "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {wildcards.peptide_length} > {output.mt_fasta} 2>{log}"
 
 
 rule concat_tsvs:
     input:
         expand(
-            "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.merged_tumor_normal.{{mhc}}.{contig}.tsv",
+            "results/microphaser/info/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{{peptide_length}}.{contig}.tsv",
             contig=contigs,
         ),
     output:
-        "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv",
+        "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
     log:
-        "logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
+        "logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log",
     conda:
         "../envs/xsv.yaml"
     shell:

From 22ce178a7e0de1091ded61851e325b4723d0fdd5 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 30 Aug 2022 12:11:19 +0000
Subject: [PATCH 160/191] initial infrastructure for parsing microphaser output
 into NeoFox input

---
 workflow/envs/polars.yaml                       |  5 +++++
 workflow/rules/annotate_neoantigens.smk         | 17 ++++++++++++++++-
 .../adjust_microphaser_output_for_neo_fox.py    |  7 +++++++
 3 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 workflow/envs/polars.yaml
 create mode 100644 workflow/scripts/adjust_microphaser_output_for_neo_fox.py

diff --git a/workflow/envs/polars.yaml b/workflow/envs/polars.yaml
new file mode 100644
index 00000000..c5a678f4
--- /dev/null
+++ b/workflow/envs/polars.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - polars =0.14
\ No newline at end of file
diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index 5c4a4029..5e071228 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -40,11 +40,26 @@ rule prepare_neo_fox_config_and_resources:
         """
 
 
+rule adjust_microphaser_output_for_neo_fox:
+    input:
+        candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+    output:
+        candidates="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+    threads: 1
+    conda:
+        "../envs/polars.yaml"
+    script:
+        "../scripts/adjust_microphaser_output_for_neo_fox.py"
+
+
 rule neo_fox:
     input:
         config="resources/neo_fox/neo_fox_config.txt",
         references=directory("resources/neo_fox/references/"),
-        candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.neo_fox.tsv",
+        candidates=expand(
+            "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+            peptide_length=config["params"]["neo_fox"]["peptide_len"],
+        ),
         patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
     output:
         tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
new file mode 100644
index 00000000..cdccddcb
--- /dev/null
+++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
@@ -0,0 +1,7 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import polars as pl
+
+candidates = pl.read_tsv(snakemake.input.candidates, sep="\t")
\ No newline at end of file

From a9077b7226af8ea4452fbbdea750426fb6893bee Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:36:14 +0000
Subject: [PATCH 161/191] fix NET_MHC_TWO_PAN_4_0_TARBALL environment variable
 spelling

---
 workflow/envs/neo_fox_deps.post-deploy.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh
index 939b3c3a..595fa6a0 100755
--- a/workflow/envs/neo_fox_deps.post-deploy.sh
+++ b/workflow/envs/neo_fox_deps.post-deploy.sh
@@ -114,7 +114,7 @@ NET_MHC_TWO_PAN_4_0_LIB="${CONDA_LIB}/netMHCIIpan_4_0/"
 mkdir -p ${NET_MHC_TWO_PAN_4_0_LIB}
 NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/"
 mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC}
-tar xzf ${NET_MHC_PAN_TWO_4_0_TARBALL}
+tar xzf ${NET_MHC_TWO_PAN_4_0_TARBALL}
 cd netMHCIIpan-4.0
 wget https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz 
 tar xzf data.tar.gz

From 5e2c180181eafb4ace73c597f1e07716338dd96d Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:37:33 +0000
Subject: [PATCH 162/191] further unpin NeoFox blast dep from `2.10` to `2`, to
 avoid unsolvable situations in some settings

---
 workflow/envs/neo_fox_deps.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml
index c4cb4a9b..780af1d6 100644
--- a/workflow/envs/neo_fox_deps.yaml
+++ b/workflow/envs/neo_fox_deps.yaml
@@ -8,7 +8,7 @@ dependencies:
   # implicit unmentioned dependency of MixMHCpred and PRIME
   - perl
   # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp
-  - blast =2.10
+  - blast =2
   # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64
   - cxx-compiler
   # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts

From 85b4ded9271f9bf690cff95f6b0a042fc0758522 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:39:16 +0000
Subject: [PATCH 163/191] use shebang in post-deploy script, to ensure using
 bash (see https://github.com/snakemake/snakemake/pull/1841 for details of
 respective snakemake change)

---
 workflow/envs/neo_fox_deps.post-deploy.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh
index 595fa6a0..77cd586c 100755
--- a/workflow/envs/neo_fox_deps.post-deploy.sh
+++ b/workflow/envs/neo_fox_deps.post-deploy.sh
@@ -1,3 +1,4 @@
+#!/usr/bin/env bash
 set -euo pipefail
 
 # set all the necessary conda paths and

From 3483eb9ef14ccebae6a6212ddb9df427f51e9cdd Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:40:01 +0000
Subject: [PATCH 164/191] silence wget calls (`-q`) in
 neo_fox_deps.post-deploy.sh

---
 workflow/envs/neo_fox_deps.post-deploy.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh
index 77cd586c..c17a3400 100755
--- a/workflow/envs/neo_fox_deps.post-deploy.sh
+++ b/workflow/envs/neo_fox_deps.post-deploy.sh
@@ -17,7 +17,7 @@ mkdir -p $CONDA_ETC
 # https://github.com/GfellerLab/MixMHCpred/blob/v2.1/README
 MIX_MHC_PRED_VERSION="2.1"
 MIX_MHC_PRED_LIB_PATH="$CONDA_PREFIX/lib/mix_mhc_pred/"
-wget https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz
+wget -q https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz
 tar xzf v${MIX_MHC_PRED_VERSION}.tar.gz
 cd MixMHCpred-${MIX_MHC_PRED_VERSION}
 g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x
@@ -42,7 +42,7 @@ rm -r MixMHCpred-${MIX_MHC_PRED_VERSION}
 # https://github.com/GfellerLab/MixMHC2pred/blob/v1.2/README.md
 MIX_MHC_TWO_PRED_VERSION="1.2"
 MIX_MHC_TWO_PRED_LIB_PATH="${CONDA_LIB}/mix_mhc_two_pred/"
-wget https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
+wget -q https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
 tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
 cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION}
 mv -t ${CONDA_BIN} MixMHC2pred MixMHC2pred_unix
@@ -59,7 +59,7 @@ rm -r MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION}
 # https://github.com/GfellerLab/PRIME/blob/v1.0/README
 PRIME_VERSION="1.0"
 PRIME_LIB_PATH="${CONDA_LIB}/prime/"
-wget https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz
+wget -q https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz
 tar xzf v${PRIME_VERSION}.tar.gz
 cd PRIME-${PRIME_VERSION}
 PRIME_PLACEHOLDER="/app/PRIME/lib"
@@ -90,7 +90,7 @@ NET_MHC_PAN_4_1_ETC="${CONDA_ETC}/netMHCpan_4_1/"
 mkdir -p ${NET_MHC_PAN_4_1_ETC}
 tar xzf ${NET_MHC_PAN_4_1_TARBALL}
 cd netMHCpan-4.1
-wget https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz
+wget -q https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz
 tar xzf data.tar.gz
 rm data.tar.gz
 grep "${TCSH_ROOT}" netMHCpan
@@ -117,7 +117,7 @@ NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/"
 mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC}
 tar xzf ${NET_MHC_TWO_PAN_4_0_TARBALL}
 cd netMHCIIpan-4.0
-wget https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz 
+wget -q https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz 
 tar xzf data.tar.gz
 rm data.tar.gz
 grep "${TCSH_ROOT}" netMHCIIpan

From 7f2300de6caa30b53cc3a5d4c9d5b6c39bb4748c Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:41:19 +0000
Subject: [PATCH 165/191] set hardcoded full paths in NeoFox config file, as
 NeoFox tests for file existence of the binary, not binary presence in path

---
 workflow/rules/annotate_neoantigens.smk | 29 +++++++++++++++----------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index 5e071228..5206b93d 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -9,18 +9,23 @@ rule prepare_neo_fox_config_and_resources:
     shell:
         """
         # environment variables necessary for neofox-configure
+        # NOTE: we have to provide all binaries with hard-coded
+        # paths, because NeoFox checks that the file at the path
+        # exists (and not simply that a binary exists):
+        # https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L90
+        CONDA_BIN=$CONDA_PREFIX/bin
 
         ## pre-installed via conda
-        export NEOFOX_MAKEBLASTDB=makeblastdb
-        echo 'NEOFOX_MAKEBLASTDB=makeblastdb' > {output.config}
-        export NEOFOX_RSCRIPT=Rscript
-        echo 'NEOFOX_RSCRIPT=Rscript' >> {output.config}
+        export NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb
+        echo 'NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb' > {output.config}
+        export NEOFOX_RSCRIPT=$CONDA_BIN/Rscript
+        echo 'NEOFOX_RSCRIPT=$CONDA_BIN/Rscript' >> {output.config}
 
         ## pre-installed into conda environment via post-deploy script
-        export NEOFOX_NETMHCPAN=netMHCpan
-        echo 'NEOFOX_NETMHCPAN=netMHCpan' >> {output.config}
-        export NEOFOX_NETMHC2PAN=netMHCIIpan
-        echo 'NEOFOX_NETMHC2PAN=netMHCIIpan' >> {output.config}
+        export NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan
+        echo 'NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan' >> {output.config}
+        export NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan
+        echo 'NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan' >> {output.config}
 
         ## specification of hla_allele link via config.yaml
         export NEOFOX_HLA_DATABASE={params.hla_alleles}
@@ -31,12 +36,12 @@ rule prepare_neo_fox_config_and_resources:
         # further environment variables needed for the config file
 
         ## pre-installed via conda
-        echo 'NEOFOX_BLASTP=blastp' >> {output.config}
+        echo 'NEOFOX_BLASTP=$CONDA_BIN/blastp' >> {output.config}
         
         ## pre-installed into conda environment via post-deploy script
-        echo 'NEOFOX_MIXMHCPRED=MixMHCpred' >> {output.config}
-        echo 'NEOFOX_MIXMHC2PRED=MixMHC2pred_unix' >> {output.config}
-        echo 'NEOFOX_PRIME=PRIME' >> {output.config}
+        echo 'NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred' >> {output.config}
+        echo 'NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix' >> {output.config}
+        echo 'NEOFOX_PRIME=$CONDA_BIN/PRIME' >> {output.config}
         """
 
 

From 304610f5333ad03128c6635fc6b7e5de278eacf9 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:42:12 +0000
Subject: [PATCH 166/191] adapt naming of stuff in `rule
 adjust_microphaser_output_for_neo_fox`

---
 workflow/rules/annotate_neoantigens.smk | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index 5206b93d..17a07d65 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -47,9 +47,11 @@ rule prepare_neo_fox_config_and_resources:
 
 rule adjust_microphaser_output_for_neo_fox:
     input:
-        candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+        microphaser="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
     output:
-        candidates="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+        neo_fox="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+    log:
+        "logs/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log",
     threads: 1
     conda:
         "../envs/polars.yaml"

From 6aeb681a2260518c82ed640a87c0e9b5efc8278a Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:43:24 +0000
Subject: [PATCH 167/191] rely only on neo_fox_config.txt to ensure `rule
 prepare_neo_fox_config_and_resources` is run, add logs to neo_fox rules

---
 workflow/rules/annotate_neoantigens.smk | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index 17a07d65..bb3e2c02 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -1,7 +1,14 @@
 rule prepare_neo_fox_config_and_resources:
     output:
         config="resources/neo_fox/neo_fox_config.txt",
+        # we cannot put the exact files generated into the
+        # output, as snakemake will generate the respective
+        # subdirectories and NeoFox has default exist_ok=False
+        # set for os.makedirs:
+        # https://github.com/TRON-Bioinformatics/neofox/blob/fb6cdf9f10e77c409d0fa44657ef520eedca6994/neofox/references/installer.py#L221
         references=directory("resources/neo_fox/references/"),
+    log:
+        "logs/neo_fox/neo_fox_config.log",
     conda:
         "../envs/neo_fox_deps.yaml"
     params:
@@ -62,7 +69,6 @@ rule adjust_microphaser_output_for_neo_fox:
 rule neo_fox:
     input:
         config="resources/neo_fox/neo_fox_config.txt",
-        references=directory("resources/neo_fox/references/"),
         candidates=expand(
             "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
             peptide_length=config["params"]["neo_fox"]["peptide_len"],
@@ -72,6 +78,8 @@ rule neo_fox:
         tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
         json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json",
         meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json",
+    log:
+        "logs/neo_fox/annotated/{group}.{tumor_alias}.log",
     threads: 8
     conda:
         "../envs/neo_fox_deps.yaml"

From dfd5760a87bc660a02ffcdc76dacdb913179d12b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:56:45 +0000
Subject: [PATCH 168/191] initial version of
 adjust_microphaser_output_for_neo_fox.py

---
 .../adjust_microphaser_output_for_neo_fox.py      | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
index cdccddcb..879994da 100644
--- a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
+++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
@@ -4,4 +4,17 @@
 
 import polars as pl
 
-candidates = pl.read_tsv(snakemake.input.candidates, sep="\t")
\ No newline at end of file
+columns_mapping = {
+    "gene_name": "gene",
+    "normal_peptide": "mutation.wildTypeXmer",
+    "tumor_peptide": "mutation.mutatedXmer",
+    "freq": "dnaVariantAlleleFrequency",
+}
+
+candidates = (
+    pl.read_tsv(snakemake.input.microphaser, sep="\t", quote="")
+    .rename(columns_mapping)
+    .with_column(pl.lit(snakemake.wildcards.group).alias("patientIdentifier"))
+)
+
+candidates.write_csv(snakemake.output.neo_fox, sep="\t", quote="")

From c78f5dd3d45e60921b0059955e3b011f821c6120 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 1 Sep 2022 15:59:01 +0000
Subject: [PATCH 169/191] Revert "further unpin NeoFox blast dep from `2.10` to
 `2`, to avoid unsolvable situations in some settings"

This reverts commit 5e2c180181eafb4ace73c597f1e07716338dd96d.
---
 workflow/envs/neo_fox_deps.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml
index 780af1d6..c4cb4a9b 100644
--- a/workflow/envs/neo_fox_deps.yaml
+++ b/workflow/envs/neo_fox_deps.yaml
@@ -8,7 +8,7 @@ dependencies:
   # implicit unmentioned dependency of MixMHCpred and PRIME
   - perl
   # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp
-  - blast =2
+  - blast =2.10
   # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64
   - cxx-compiler
   # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts

From e07d5dc9122bed38768724b7aefc67e00318b8b5 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 2 Sep 2022 08:11:44 +0000
Subject: [PATCH 170/191] update wrapper and tool versions

---
 workflow/envs/bcftools.yaml      |  2 +-
 workflow/envs/rbt.yaml           |  4 ++--
 workflow/envs/varlociraptor.yaml |  2 +-
 workflow/rules/microphaser.smk   |  4 ++--
 workflow/rules/phylogeny.smk     |  2 +-
 workflow/rules/ref.smk           | 12 ++++++------
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml
index 70e39da1..35bf7643 100644
--- a/workflow/envs/bcftools.yaml
+++ b/workflow/envs/bcftools.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - bcftools =1.10
+  - bcftools =1.14
diff --git a/workflow/envs/rbt.yaml b/workflow/envs/rbt.yaml
index 6c8be1fb..94ff5bae 100644
--- a/workflow/envs/rbt.yaml
+++ b/workflow/envs/rbt.yaml
@@ -1,6 +1,6 @@
 channels:
-  - bioconda
   - conda-forge
+  - bioconda
 dependencies:
   - rust-bio-tools =0.19
-  - bcftools =1.10
+  - bcftools =1.14
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
index a76d765a..270dc831 100644
--- a/workflow/envs/varlociraptor.yaml
+++ b/workflow/envs/varlociraptor.yaml
@@ -3,4 +3,4 @@ channels:
   - bioconda
 dependencies:
   - varlociraptor =2.3.0
-  - bcftools =1.10
+  - bcftools =1.14
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index a0bfc6c6..b0570b0b 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -9,7 +9,7 @@ rule norm_bcf:
     params:
         lambda w, input: "-f {} -O b -m-".format(input.genome),  # optional parameters for bcftools norm (except -o)
     wrapper:
-        "0.65.0/bio/bcftools/norm"
+        "v1.12.0/bio/bcftools/norm"
 
 
 rule add_somatic_flag:
@@ -58,7 +58,7 @@ rule merge_tumor_normal:
     params:
         extra="-O b -a",
     wrapper:
-        "0.64.0/bio/bcftools/concat"
+        "v1.12.0/bio/bcftools/concat"
 
 
 rule microphaser_tumor:
diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk
index 63d2ad4a..d7279042 100644
--- a/workflow/rules/phylogeny.smk
+++ b/workflow/rules/phylogeny.smk
@@ -15,7 +15,7 @@ rule merge_snvs:
     params:
         "--use-header final-calls/sampleheader.txt --force-samples",
     wrapper:
-        "0.36.0/bio/bcftools/merge"
+        "v1.12.0/bio/bcftools/merge"
 
 
 rule query:
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index f6be3bb0..80e62380 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -10,7 +10,7 @@ rule get_genome:
         release=config["ref"]["release"],
     cache: True
     wrapper:
-        "0.45.1/bio/reference/ensembl-sequence"
+        "v1.12.0/bio/reference/ensembl-sequence"
 
 
 rule get_cdna:
@@ -25,7 +25,7 @@ rule get_cdna:
         release=config["ref"]["release"],
     cache: True
     wrapper:
-        "0.45.1/bio/reference/ensembl-sequence"
+        "v1.12.0/bio/reference/ensembl-sequence"
 
 
 rule get_annotation:
@@ -41,7 +41,7 @@ rule get_annotation:
     log:
         "logs/get-annotation.log",
     wrapper:
-        "0.45.1/bio/reference/ensembl-annotation"
+        "v1.12.0/bio/reference/ensembl-annotation"
 
 
 # TODO: remove this rule, once microphaser is fixed to make gene_name optional
@@ -76,7 +76,7 @@ rule genome_faidx:
         "logs/genome-faidx.log",
     cache: True
     wrapper:
-        "0.45.1/bio/samtools/faidx"
+        "v1.12.0/bio/samtools/faidx"
 
 
 rule create_somatic_flag_header_line:
@@ -114,7 +114,7 @@ rule bgzip_genome_somatic_flag_bed:
     log:
         "logs/bgzip/genome.somatic_flag.log",
     wrapper:
-        "v1.7.0/bio/bgzip"
+        "v1.12.0/bio/bgzip"
 
 
 rule tabix_genome_somatic_flag_bed:
@@ -139,7 +139,7 @@ rule genome_dict:
         "logs/picard/create-dict.log",
     cache: True
     wrapper:
-        "0.45.1/bio/picard/createsequencedictionary"
+        "v1.12.0/bio/picard/createsequencedictionary"
 
 
 rule download_hla_la_graph:

From 6beada8b8301cd4cb4445d8d11e73ca74461975b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 2 Sep 2022 19:14:27 +0000
Subject: [PATCH 171/191] get polars to work on older cpu architectures:
 https://github.com/pola-rs/polars/blob/622c92470f81cff2626b2c130c35d38bf4c66be9/README.md?plain=1#L213-L214

---
 workflow/envs/polars.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/envs/polars.yaml b/workflow/envs/polars.yaml
index c5a678f4..c26d1af2 100644
--- a/workflow/envs/polars.yaml
+++ b/workflow/envs/polars.yaml
@@ -2,4 +2,6 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - polars =0.14
\ No newline at end of file
+  - pip
+  - pip:
+    - polars-lts-cpu==0.14.8
\ No newline at end of file

From 2453302ee44409e19166f3fd206a7f3dbac7b5da Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 08:50:35 +0000
Subject: [PATCH 172/191] add in groups.tsv for group_annotation

---
 .test/config/groups.tsv   |  2 ++
 config/config.yaml        |  1 +
 config/groups.tsv         |  3 +++
 workflow/rules/common.smk | 13 +++++++++++++
 4 files changed, 19 insertions(+)
 create mode 100644 .test/config/groups.tsv
 create mode 100644 config/groups.tsv

diff --git a/.test/config/groups.tsv b/.test/config/groups.tsv
new file mode 100644
index 00000000..c0a77bbd
--- /dev/null
+++ b/.test/config/groups.tsv
@@ -0,0 +1,2 @@
+group	tumorType
+A	LUSC
diff --git a/config/config.yaml b/config/config.yaml
index 083fc651..4d447645 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,5 +1,6 @@
 samples: "config/samples.tsv"
 units: "config/units.tsv"
+groups: "config/groups.tsv"
 
 
 neoantigen_prediction:
diff --git a/config/groups.tsv b/config/groups.tsv
new file mode 100644
index 00000000..11353453
--- /dev/null
+++ b/config/groups.tsv
@@ -0,0 +1,3 @@
+group	tumorType
+A	LUSC
+B	LUAD
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 68725904..d4d8dafb 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -48,6 +48,19 @@ units = (
 )
 validate(units, schema="../schemas/units.schema.yaml")
 
+groups = samples["group"].unique()
+
+if "groups" in config:
+    group_annotation = (
+        pd.read_csv(config["groups"], sep="\t", dtype={"group": str})
+        .set_index("group")
+        .sort_index()
+    )
+    group_annotation = group_annotation.loc[groups]
+else:
+    group_annotation = pd.DataFrame({"group": groups}).set_index("group")
+
+
 contigs = [c for c in range(1, 23)]
 contigs.extend(["X", "Y"])
 

From cd1688c4b400e10a1b755fbc8bd6472a652215fb Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 08:51:27 +0000
Subject: [PATCH 173/191] replace polars with pandas, because polars doesn't
 play well with old CPUs (and thus hampers workflow portability)

---
 workflow/envs/pandas.yaml                              |  4 ++++
 workflow/envs/polars.yaml                              |  7 -------
 workflow/rules/annotate_neoantigens.smk                |  2 +-
 .../scripts/adjust_microphaser_output_for_neo_fox.py   | 10 +++++-----
 4 files changed, 10 insertions(+), 13 deletions(-)
 create mode 100644 workflow/envs/pandas.yaml
 delete mode 100644 workflow/envs/polars.yaml

diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml
new file mode 100644
index 00000000..5ced20fa
--- /dev/null
+++ b/workflow/envs/pandas.yaml
@@ -0,0 +1,4 @@
+channels:
+  - conda-forge
+dependencies:
+  - pandas=1.4
\ No newline at end of file
diff --git a/workflow/envs/polars.yaml b/workflow/envs/polars.yaml
deleted file mode 100644
index c26d1af2..00000000
--- a/workflow/envs/polars.yaml
+++ /dev/null
@@ -1,7 +0,0 @@
-channels:
-  - conda-forge
-  - bioconda
-dependencies:
-  - pip
-  - pip:
-    - polars-lts-cpu==0.14.8
\ No newline at end of file
diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index bb3e2c02..d1c09c39 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -61,7 +61,7 @@ rule adjust_microphaser_output_for_neo_fox:
         "logs/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log",
     threads: 1
     conda:
-        "../envs/polars.yaml"
+        "../envs/pandas.yaml"
     script:
         "../scripts/adjust_microphaser_output_for_neo_fox.py"
 
diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
index 879994da..0d5581bb 100644
--- a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
+++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
@@ -2,7 +2,7 @@
 
 sys.stderr = open(snakemake.log[0], "w")
 
-import polars as pl
+import pandas as pd
 
 columns_mapping = {
     "gene_name": "gene",
@@ -12,9 +12,9 @@
 }
 
 candidates = (
-    pl.read_tsv(snakemake.input.microphaser, sep="\t", quote="")
-    .rename(columns_mapping)
-    .with_column(pl.lit(snakemake.wildcards.group).alias("patientIdentifier"))
+    pd.read_csv(snakemake.input.microphaser, sep="\t", quoting=3)
+    .rename(columns=columns_mapping)
+    .assign(patientIdentifier=snakemake.wildcards.group)
 )
 
-candidates.write_csv(snakemake.output.neo_fox, sep="\t", quote="")
+candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3)

From f6e21a09dc95cdda89094269ba92dd6495dfb7cc Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 08:52:40 +0000
Subject: [PATCH 174/191] create NeoFox group / patient sheet

---
 workflow/rules/annotate_neoantigens.smk       | 19 +++++-
 .../scripts/create_neo_fox_group_sheet.py     | 60 +++++++++++++++++++
 2 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 workflow/scripts/create_neo_fox_group_sheet.py

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index d1c09c39..d844b4b2 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -66,6 +66,21 @@ rule adjust_microphaser_output_for_neo_fox:
         "../scripts/adjust_microphaser_output_for_neo_fox.py"
 
 
+rule create_neo_fox_group_sheet:
+    input:
+        hla_la_bestguess="results/hla_la/output/{group}_{tumor_alias}/hla/R1_bestguess_G.txt",
+    output:
+        group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv"
+    log:
+        "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log"
+    conda:
+        "../envs/pandas.yaml"
+    params:
+        group=lambda wc: group_annotation.loc[wc.group]
+    script:
+        "../scripts/create_neo_fox_group_sheet.py"
+    
+
 rule neo_fox:
     input:
         config="resources/neo_fox/neo_fox_config.txt",
@@ -73,7 +88,7 @@ rule neo_fox:
             "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
             peptide_length=config["params"]["neo_fox"]["peptide_len"],
         ),
-        patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
+        group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
     output:
         tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
         json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json",
@@ -92,7 +107,7 @@ rule neo_fox:
         "  --num_cpus {threads} "
         "  --config {input.config} "
         "  --candidate-file {input.candidates} "
-        "  --patient-data {input.patient_annotation} "
+        "  --patient-data {input.group_sheet} "
         "  --with-table "
         "  --with-json "
         "  --organism {params.organism} "
diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py
new file mode 100644
index 00000000..252b6fc4
--- /dev/null
+++ b/workflow/scripts/create_neo_fox_group_sheet.py
@@ -0,0 +1,60 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+
+HLA_SUFFIXES_REGEX = r"[NLSCAQ]?"
+
+# allowed loci according to NeoFox input data documentation:
+# https://neofox.readthedocs.io/en/latest/03_01_input_data.html#file-with-patient-data
+# * mhcIAlleles: comma separated MHC I alleles of the patient for HLA-A, HLA-B and
+#   HLA-C. If homozygous, the allele should be added twice.
+# * mhcIIAlleles: comma separated MHC II alleles of the patient for HLA-DRB1, HLA-DQA1,
+#   HLA-DQB1, HLA-DPA1 and HLA-DPB1. If homozygous, the allele should be added twice.
+ALLOWED_LOCI = {
+    "A",
+    "B",
+    "C",
+    "DRB1",
+    "DQA1",
+    "DQB1",
+    "DPA1",
+    "DPB1",
+}
+
+mhc_alleles = pd.read_csv(
+        snakemake.input.hla_la_bestguess,
+        sep="\t",
+    )
+# the Allele column can contain multiple ";"-separated entries for the
+# same locus
+mhc_alleles.loc[:, "Allele"] = mhc_alleles["Allele"].str.split(pat=";")
+mhc_alleles = mhc_alleles.explode(["Allele"])
+mhc_alleles = mhc_alleles[mhc_alleles["Locus"].isin(ALLOWED_LOCI)]
+mhc_alleles.loc[:, "Allele"] = (
+    mhc_alleles["Allele"]
+    .str
+    .replace(
+        r"([A-Z]+\d?)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+        r"HLA-\1*\2:\3\5",
+        regex=True,
+    )
+)
+# the multiple ";"-separated entries from above can be identical after reducing
+# to the allele group (1st number) and specific HLA protein (2nd number)
+mhc_alleles = mhc_alleles.drop_duplicates(subset=["Chromosome", "Allele"])
+
+mhc_one_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() == 1, "Allele"] )
+mhc_two_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() > 1, "Allele"] )
+
+patient_info = pd.DataFrame(
+    data={
+        "identifier": [ snakemake.params.group.name ],
+        "tumorType": [ snakemake.params.group["tumorType"] ],
+        "mhcIAlleles": [ mhc_one_alleles ],
+        "mhcIIAlleles": [ mhc_two_alleles ],
+    }
+)
+
+patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3)

From 1895d0f74af9853b4a9fa55fcc81fde3d5ce6765 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 08:54:38 +0000
Subject: [PATCH 175/191] snakefmt

---
 workflow/Snakefile                      |  3 +++
 workflow/rules/annotate_neoantigens.smk | 16 ++++++++++------
 workflow/rules/microphaser.smk          |  4 ++--
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index ee333ba2..6f3c2caf 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -11,8 +11,10 @@ configfile: "config/config.yaml"
 scattergather:
     calling=24,
 
+
 ##### required envvars #####
 
+
 envvars:
     # For NeoFox installation:
     # The tarballs for both netMHCpan and netMHCIIpan
@@ -26,6 +28,7 @@ envvars:
     # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.0
     "NET_MHC_TWO_PAN_4_0_TARBALL",
 
+
 ##### setup report #####
 
 
diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index d844b4b2..406ca0d7 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -44,7 +44,7 @@ rule prepare_neo_fox_config_and_resources:
 
         ## pre-installed via conda
         echo 'NEOFOX_BLASTP=$CONDA_BIN/blastp' >> {output.config}
-        
+
         ## pre-installed into conda environment via post-deploy script
         echo 'NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred' >> {output.config}
         echo 'NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix' >> {output.config}
@@ -70,16 +70,16 @@ rule create_neo_fox_group_sheet:
     input:
         hla_la_bestguess="results/hla_la/output/{group}_{tumor_alias}/hla/R1_bestguess_G.txt",
     output:
-        group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv"
+        group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
     log:
-        "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log"
+        "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log",
     conda:
         "../envs/pandas.yaml"
     params:
-        group=lambda wc: group_annotation.loc[wc.group]
+        group=lambda wc: group_annotation.loc[wc.group],
     script:
         "../scripts/create_neo_fox_group_sheet.py"
-    
+
 
 rule neo_fox:
     input:
@@ -101,7 +101,11 @@ rule neo_fox:
     params:
         folder=lambda wc, output: path.dirname(output.annotated),
         prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0],
-        organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported",
+        organism="human"
+        if config["ref"]["species"] == "homo_sapiens"
+        else "mouse"
+        if config["ref"]["species"] == "mus_musculus"
+        else "unsupported",
     shell:
         "(neofox "
         "  --num_cpus {threads} "
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index b0570b0b..83061530 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -77,7 +77,7 @@ rule microphaser_tumor:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda wc: int(wc.peptide_length) * 3
+        window_length=lambda wc: int(wc.peptide_length) * 3,
     shell:
         "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
         "< {input.track} > {output.mt_fasta} 2> {log}"
@@ -102,7 +102,7 @@ rule microphaser_normal:
     conda:
         "../envs/microphaser.yaml"
     params:
-        window_length=lambda wc: int(wc.peptide_length) * 3
+        window_length=lambda wc: int(wc.peptide_length) * 3,
     shell:
         "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
         "< {input.track} > {output.wt_fasta} 2> {log}"

From 657e982a6a6685bbc42e0ec4c6eba66112c233cc Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 09:14:16 +0000
Subject: [PATCH 176/191] request NeoFox output instead of original netMHC
 outputs

---
 workflow/rules/common.smk | 43 +++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index d4d8dafb..a604bc55 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -112,30 +112,37 @@ def get_final_output():
             "alias",
         ]
         if config["neoantigen_prediction"]["activate"]:
-            sequencing_types = pd.unique(
-                units.loc[units["sample_name"].isin(smps), "sequencing_type"]
-            )
             final_output.extend(
                 expand(
-                    "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv",
+                    "results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
                     group=group,
                     tumor_alias=tumor_aliases,
-                    mhc=list(
-                        filter(
-                            None,
-                            [
-                                "net_mhc_pan"
-                                if is_activated("params/net_mhc_pan")
-                                else None,
-                                "net_mhc_two_pan"
-                                if is_activated("params/net_mhc_two_pan")
-                                else None,
-                            ],
-                        )
-                    ),
-                    seqtype=sequencing_types,
                 )
             )
+            #sequencing_types = pd.unique(
+            #    units.loc[units["sample_name"].isin(smps), "sequencing_type"]
+            #)
+            #final_output.extend(
+            #    expand(
+            #        "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv",
+            #        group=group,
+            #        tumor_alias=tumor_aliases,
+            #        mhc=list(
+            #            filter(
+            #                None,
+            #                [
+            #                    "net_mhc_pan"
+            #                    if is_activated("params/net_mhc_pan")
+            #                    else None,
+            #                    "net_mhc_two_pan"
+            #                    if is_activated("params/net_mhc_two_pan")
+            #                    else None,
+            #                ],
+            #            )
+            #        ),
+            #        seqtype=sequencing_types,
+            #    )
+            #)
         else:
             final_output = expand(
                 [

From 539da379e3e5a4c5e32e1bb6c1a1fc7c725456a9 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 09:14:27 +0000
Subject: [PATCH 177/191] small fixes

---
 workflow/rules/annotate_neoantigens.smk | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index 406ca0d7..1d805267 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -99,8 +99,8 @@ rule neo_fox:
     conda:
         "../envs/neo_fox_deps.yaml"
     params:
-        folder=lambda wc, output: path.dirname(output.annotated),
-        prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0],
+        folder=lambda wc, output: path.dirname(output.tsv),
+        prefix=lambda wc, output: path.splitext(path.basename(output.tsv))[0],
         organism="human"
         if config["ref"]["species"] == "homo_sapiens"
         else "mouse"
@@ -117,7 +117,7 @@ rule neo_fox:
         "  --organism {params.organism} "
         "  --output-folder {params.folder} "
         "  --output-prefix {params.prefix} ; "
-        " mv {params_folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; "
-        " mv {params_folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; "
-        " mv {params_folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; "
+        " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; "
+        " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; "
+        " mv {params.folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; "
         ") 2> {log} "

From 21a62fca00212e70ae8c3d2462e6391f164aec92 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 09:35:19 +0000
Subject: [PATCH 178/191] snakefmt

---
 workflow/rules/common.smk | 48 +++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index a604bc55..37a72984 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -119,30 +119,30 @@ def get_final_output():
                     tumor_alias=tumor_aliases,
                 )
             )
-            #sequencing_types = pd.unique(
-            #    units.loc[units["sample_name"].isin(smps), "sequencing_type"]
-            #)
-            #final_output.extend(
-            #    expand(
-            #        "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv",
-            #        group=group,
-            #        tumor_alias=tumor_aliases,
-            #        mhc=list(
-            #            filter(
-            #                None,
-            #                [
-            #                    "net_mhc_pan"
-            #                    if is_activated("params/net_mhc_pan")
-            #                    else None,
-            #                    "net_mhc_two_pan"
-            #                    if is_activated("params/net_mhc_two_pan")
-            #                    else None,
-            #                ],
-            #            )
-            #        ),
-            #        seqtype=sequencing_types,
-            #    )
-            #)
+        #    sequencing_types = pd.unique(
+        #       units.loc[units["sample_name"].isin(smps), "sequencing_type"]
+        #    )
+        #    final_output.extend(
+        #       expand(
+        #           "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv",
+        #           group=group,
+        #           tumor_alias=tumor_aliases,
+        #           mhc=list(
+        #               filter(
+        #                   None,
+        #                   [
+        #                       "net_mhc_pan"
+        #                       if is_activated("params/net_mhc_pan")
+        #                       else None,
+        #                       "net_mhc_two_pan"
+        #                       if is_activated("params/net_mhc_two_pan")
+        #                       else None,
+        #                   ],
+        #               )
+        #           ),
+        #           seqtype=sequencing_types,
+        #       )
+        #    )
         else:
             final_output = expand(
                 [

From 63f618d1bf82931f97982ad0a4a7516e49c202e6 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Tue, 6 Sep 2022 12:25:16 +0000
Subject: [PATCH 179/191] unpin blast from `2.10` to `2`, so it installs on our
 server

---
 workflow/envs/neo_fox_deps.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml
index c4cb4a9b..780af1d6 100644
--- a/workflow/envs/neo_fox_deps.yaml
+++ b/workflow/envs/neo_fox_deps.yaml
@@ -8,7 +8,7 @@ dependencies:
   # implicit unmentioned dependency of MixMHCpred and PRIME
   - perl
   # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp
-  - blast =2.10
+  - blast =2
   # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64
   - cxx-compiler
   # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts

From 50591114fe92c0ee9abf7cb088c48a26a86fa125 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 7 Sep 2022 09:13:50 +0000
Subject: [PATCH 180/191] fix quoting in neo_fox prep rule, so that bash
 variables get expanded upon writing into config file

---
 workflow/rules/annotate_neoantigens.smk | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index 1d805267..64557ffc 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -24,31 +24,31 @@ rule prepare_neo_fox_config_and_resources:
 
         ## pre-installed via conda
         export NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb
-        echo 'NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb' > {output.config}
+        echo "NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb" > {output.config}
         export NEOFOX_RSCRIPT=$CONDA_BIN/Rscript
-        echo 'NEOFOX_RSCRIPT=$CONDA_BIN/Rscript' >> {output.config}
+        echo "NEOFOX_RSCRIPT=$CONDA_BIN/Rscript" >> {output.config}
 
         ## pre-installed into conda environment via post-deploy script
         export NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan
-        echo 'NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan' >> {output.config}
+        echo "NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan" >> {output.config}
         export NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan
-        echo 'NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan' >> {output.config}
+        echo "NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan" >> {output.config}
 
         ## specification of hla_allele link via config.yaml
         export NEOFOX_HLA_DATABASE={params.hla_alleles}
 
         neofox-configure --reference-folder {output.references}
-        echo 'NEOFOX_REFERENCE_FOLDER={output.references}' >> {output.config}
+        echo "NEOFOX_REFERENCE_FOLDER={output.references}" >> {output.config}
 
         # further environment variables needed for the config file
 
         ## pre-installed via conda
-        echo 'NEOFOX_BLASTP=$CONDA_BIN/blastp' >> {output.config}
+        echo "NEOFOX_BLASTP=$CONDA_BIN/blastp" >> {output.config}
 
         ## pre-installed into conda environment via post-deploy script
-        echo 'NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred' >> {output.config}
-        echo 'NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix' >> {output.config}
-        echo 'NEOFOX_PRIME=$CONDA_BIN/PRIME' >> {output.config}
+        echo "NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred" >> {output.config}
+        echo "NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix" >> {output.config}
+        echo "NEOFOX_PRIME=$CONDA_BIN/PRIME" >> {output.config}
         """
 
 
@@ -108,7 +108,7 @@ rule neo_fox:
         else "unsupported",
     shell:
         "(neofox "
-        "  --num_cpus {threads} "
+        "  --num-cpus {threads} "
         "  --config {input.config} "
         "  --candidate-file {input.candidates} "
         "  --patient-data {input.group_sheet} "

From 70ce592f8c9343c949621f29ec91f05a493db8ed Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 7 Sep 2022 12:06:32 +0000
Subject: [PATCH 181/191] move allele lists of MixMHC tools and PRIME where
 NeoFox expects them (hard-coded in the tool)

---
 workflow/envs/neo_fox_deps.post-deploy.sh | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh
index c17a3400..c344af3e 100755
--- a/workflow/envs/neo_fox_deps.post-deploy.sh
+++ b/workflow/envs/neo_fox_deps.post-deploy.sh
@@ -4,6 +4,9 @@ set -euo pipefail
 # set all the necessary conda paths and
 # ensure they exist
 CONDA_BIN="${CONDA_PREFIX}/bin/"
+# this is needed for the allele list files of MixMHCpred, MixMHC2pred
+# and PRIME, where NeoFox expects some of the files in that directory
+mkdir -p ${CONDA_BIN}/lib/
 CONDA_MAN1="${CONDA_PREFIX}/share/man/man1/"
 mkdir -p $CONDA_MAN1
 CONDA_INFO="${CONDA_PREFIX}/share/info/"
@@ -26,8 +29,11 @@ g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x
 MMP_PLACEHOLDER="YOUR PATH TO MixMHCpred/lib FOLDER"
 grep "${MMP_PLACEHOLDER}" MixMHCpred
 sed -i "s%${MMP_PLACEHOLDER}%${MIX_MHC_PRED_LIB_PATH}%" MixMHCpred
-mv lib $MIX_MHC_PRED_LIB_PATH
 mv MixMHCpred ${CONDA_BIN}
+# The allele_list.txt file needs to be in a `lib/` subdirectory of the `bin/` dir; this is hard-coded here:
+# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L123
+cp lib/allele_list.txt ${CONDA_BIN}/lib/
+mv lib $MIX_MHC_PRED_LIB_PATH
 # TODO: when updating to v2.2, change this line to:
 # mv MixMHCpred_license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf
 mv license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf
@@ -46,6 +52,9 @@ wget -q https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_T
 tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
 cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION}
 mv -t ${CONDA_BIN} MixMHC2pred MixMHC2pred_unix
+# The Alleles_list.txt file needs to be in the same directory as the binaries, this is hard-coded here:
+# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L114
+mv -t ${CONDA_BIN} Alleles_list.txt
 mv rpep ${CONDA_ETC}
 ln -s ${CONDA_ETC}/rpep ${CONDA_BIN}/rpep
 mv LICENSE ${CONDA_INFO}/MixMHC2pred_unix_LICENSE
@@ -65,8 +74,11 @@ cd PRIME-${PRIME_VERSION}
 PRIME_PLACEHOLDER="/app/PRIME/lib"
 grep "${PRIME_PLACEHOLDER}" PRIME
 sed -i "s%${PRIME_PLACEHOLDER}%${PRIME_LIB_PATH}%" PRIME
-mv lib $PRIME_LIB_PATH
 mv PRIME ${CONDA_BIN}
+# The alleles.txt file needs to be in a `lib/` subdirectory of the `bin/` dir; this is hard-coded here:
+# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L132
+cp lib/alleles.txt ${CONDA_BIN}/lib/
+mv lib $PRIME_LIB_PATH
 mv PRIME_license.pdf ${CONDA_INFO}
 PRIME -i test/test.txt -o test/out.txt -a A0201,A0101
 diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt)

From a70f2aea95109cdd85302b3aa389bb896f47e725 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Wed, 7 Sep 2022 12:07:02 +0000
Subject: [PATCH 182/191] _neoantigen_features.json does not seem to exist

---
 workflow/rules/annotate_neoantigens.smk | 2 --
 1 file changed, 2 deletions(-)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index 64557ffc..d4ce35fa 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -92,7 +92,6 @@ rule neo_fox:
     output:
         tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
         json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json",
-        meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json",
     log:
         "logs/neo_fox/annotated/{group}.{tumor_alias}.log",
     threads: 8
@@ -119,5 +118,4 @@ rule neo_fox:
         "  --output-prefix {params.prefix} ; "
         " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; "
         " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; "
-        " mv {params.folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; "
         ") 2> {log} "

From 49edf28e2949fa651f42110138cc8e23cbb3978a Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 8 Sep 2022 08:57:50 +0000
Subject: [PATCH 183/191] remove implicit NeoFox logs, as they are incomplete
 and accumulate across runs

---
 workflow/rules/annotate_neoantigens.smk | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
index d4ce35fa..70e1a492 100644
--- a/workflow/rules/annotate_neoantigens.smk
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -118,4 +118,7 @@ rule neo_fox:
         "  --output-prefix {params.prefix} ; "
         " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; "
         " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; "
+        # this implicitly created log does not seem to contain all of stderr,
+        # so we rather do our own capturing below
+        " rm {params.folder}/{params.prefix}.log; "
         ") 2> {log} "

From de676ae5d6ac8bfe2027fa327882b73de56008b5 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 8 Sep 2022 09:45:16 +0000
Subject: [PATCH 184/191] remove indexes from pandas.to_csv output

---
 workflow/scripts/adjust_microphaser_output_for_neo_fox.py | 2 +-
 workflow/scripts/create_neo_fox_group_sheet.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
index 0d5581bb..4806d56e 100644
--- a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
+++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
@@ -17,4 +17,4 @@
     .assign(patientIdentifier=snakemake.wildcards.group)
 )
 
-candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3)
+candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3, index=False)
diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py
index 252b6fc4..b989ebf6 100644
--- a/workflow/scripts/create_neo_fox_group_sheet.py
+++ b/workflow/scripts/create_neo_fox_group_sheet.py
@@ -57,4 +57,4 @@
     }
 )
 
-patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3)
+patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3, index=False)

From 594fe3a0382a5249be0eb1f6b08e74738b491a59 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 8 Sep 2022 09:46:47 +0000
Subject: [PATCH 185/191] remove extra possible MHC alleles, because NeoFox
 will only parse one per chromosome

---
 workflow/scripts/create_neo_fox_group_sheet.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py
index b989ebf6..1ee7193a 100644
--- a/workflow/scripts/create_neo_fox_group_sheet.py
+++ b/workflow/scripts/create_neo_fox_group_sheet.py
@@ -28,9 +28,11 @@
         sep="\t",
     )
 # the Allele column can contain multiple ";"-separated entries for the
-# same locus
+# same locus -- NeoFox does a hard assertion that only two alleles per
+# gene exist, so we chose to only ever keep the first of such multiple
+# possibilities
 mhc_alleles.loc[:, "Allele"] = mhc_alleles["Allele"].str.split(pat=";")
-mhc_alleles = mhc_alleles.explode(["Allele"])
+mhc_alleles = mhc_alleles.explode(["Allele"]).drop_duplicates(subset=["Locus", "Chromosome"])
 mhc_alleles = mhc_alleles[mhc_alleles["Locus"].isin(ALLOWED_LOCI)]
 mhc_alleles.loc[:, "Allele"] = (
     mhc_alleles["Allele"]

From e867bf8a785d17decd6da485f5b53a86c2c8d9d6 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Thu, 8 Sep 2022 09:47:22 +0000
Subject: [PATCH 186/191] remove tumorType column from group_sheet if empty, as
 NeoFox cannot handle empty entries, here

---
 workflow/scripts/create_neo_fox_group_sheet.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py
index 1ee7193a..4242d684 100644
--- a/workflow/scripts/create_neo_fox_group_sheet.py
+++ b/workflow/scripts/create_neo_fox_group_sheet.py
@@ -57,6 +57,8 @@
         "mhcIAlleles": [ mhc_one_alleles ],
         "mhcIIAlleles": [ mhc_two_alleles ],
     }
-)
+# This is required for cases where no tumorType is available, as NeoFox does not
+# seem to be able to handle empty entries, here -- so we remove the whole column
+).dropna(axis="columns")
 
 patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3, index=False)

From 68e0b963e17ac35cfb8a7271d63f9444b11fdb21 Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 23 Sep 2022 10:05:04 +0000
Subject: [PATCH 187/191] add datavzrd report for NeoFox output

---
 workflow/Snakefile                            |   1 +
 workflow/envs/datavzrd.yaml                   |   4 +
 workflow/report/neopeptides.rst               |   9 ++
 ...neo_fox-neoantigens-template.datavzrd.yaml |  16 +++
 workflow/rules/common.smk                     |   3 +-
 workflow/rules/datavzrd.smk                   |  48 +++++++
 workflow/rules/mhc_binding.smk                |   4 +-
 workflow/scripts/prepare_neoprint.py          | 118 ++++++++++++++++++
 8 files changed, 200 insertions(+), 3 deletions(-)
 create mode 100644 workflow/envs/datavzrd.yaml
 create mode 100644 workflow/report/neopeptides.rst
 create mode 100644 workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
 create mode 100644 workflow/rules/datavzrd.smk
 create mode 100644 workflow/scripts/prepare_neoprint.py

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6f3c2caf..7282e12f 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -49,6 +49,7 @@ include: "rules/microphaser.smk"
 include: "rules/hla_typing.smk"
 include: "rules/mhc_binding.smk"
 include: "rules/annotate_neoantigens.smk"
+include: "rules/datavzrd.smk"
 
 
 rule all:
diff --git a/workflow/envs/datavzrd.yaml b/workflow/envs/datavzrd.yaml
new file mode 100644
index 00000000..e3f74967
--- /dev/null
+++ b/workflow/envs/datavzrd.yaml
@@ -0,0 +1,4 @@
+channels:
+  - conda-forge
+dependencies:
+  - datavzrd =2.1
diff --git a/workflow/report/neopeptides.rst b/workflow/report/neopeptides.rst
new file mode 100644
index 00000000..1e6d6625
--- /dev/null
+++ b/workflow/report/neopeptides.rst
@@ -0,0 +1,9 @@
+Neopeptides and corresponding normal peptides as phased and determined by
+microphaser, with various annotation scores gathered and provided by NeoFox,
+using the HLA alleles determined by HLA-LA.
+
+===================
+Column descriptions
+===================
+
+TODO: transform spreadsheet into RST table
diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
new file mode 100644
index 00000000..2f3e1773
--- /dev/null
+++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
@@ -0,0 +1,16 @@
+name: ?f"Neopeptide candidates for {wildcards.group}, tumor sample {wildcards.tumor_alias}"
+
+default-view: "overview"
+
+datasets:
+  neoprint:
+    path: ?input.neopeptides
+    separator: "\t"
+
+views:
+  overview:
+    desc: ?f"Neopeptide candidates for {wildcards.group} tumor sample {wildcards.tumor_alias}, with annotations gathered and provided by NeoFox."
+    dataset: neoprint
+    render-table:
+      gene:
+        link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value}
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 37a72984..9a71c4d0 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -114,9 +114,10 @@ def get_final_output():
         if config["neoantigen_prediction"]["activate"]:
             final_output.extend(
                 expand(
-                    "results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
+                    "results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}",
                     group=group,
                     tumor_alias=tumor_aliases,
+                    mhc=["I", "II"],
                 )
             )
         #    sequencing_types = pd.unique(
diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
new file mode 100644
index 00000000..799616cc
--- /dev/null
+++ b/workflow/rules/datavzrd.smk
@@ -0,0 +1,48 @@
+rule prepare_neoprint:
+    input:
+        neopeptides="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
+    output:
+        mhc_one="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.I.sorted.tsv",
+        mhc_two="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.II.sorted.tsv",
+    log:
+        "logs/prepare_neoprint/{group}.{tumor_alias}.log",
+    params:
+        purity = lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze()
+    conda:
+        "../envs/pandas.yaml"
+    script:
+        "../scripts/prepare_neoprint.py"
+
+
+rule render_datavzrd_neoprint_config:
+    input:
+        template=workflow.source_path(
+            "../resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml"
+        ),
+        neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv",
+    output:
+        "resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml",
+    log:
+        "logs/datavzrd_render_neoprint/{group}.{tumor_alias}.{mhc}.log",
+    template_engine:
+        "yte"
+
+
+rule datavzrd_neoprint:
+    input:
+        neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv",
+        config="resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml",
+    output:
+        report(
+            directory("results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}"),
+            htmlindex="index.html",
+            caption="../report/neopeptides.rst",
+            category="Neopeptides",
+            labels=lambda wc: {"group": wc.group, "sample_type": wc.tumor_alias, "MHC": wc.mhc},
+        ),
+    conda:
+        "../envs/datavzrd.yaml"
+    log:
+        "logs/datavzrd_neoprint/{group}.{tumor_alias}.{mhc}.log",
+    shell:
+        "datavzrd {input.config} --output {output} &> {log}"
diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk
index 0a6079ca..535015fc 100644
--- a/workflow/rules/mhc_binding.smk
+++ b/workflow/rules/mhc_binding.smk
@@ -83,7 +83,7 @@ rule merge_neoantigen_info:
         report(
             "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
             caption="../report/neoantigens.dna.rst",
-            category="Neoantigens",
+            category="Neopeptides",
         ),
     log:
         "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
@@ -99,7 +99,7 @@ rule add_rna_info:
         report(
             "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv",
             caption="../report/neoantigens.rna.rst",
-            category="Neoantigens",
+            category="Neopeptides",
         ),
     params:
         abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
diff --git a/workflow/scripts/prepare_neoprint.py b/workflow/scripts/prepare_neoprint.py
new file mode 100644
index 00000000..d9a01928
--- /dev/null
+++ b/workflow/scripts/prepare_neoprint.py
@@ -0,0 +1,118 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+from typing import Tuple
+
+def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
+    """
+    Highlight the difference between mutated neopeptide and normal peptide
+    """
+    if normal_p == "nan" or normal_p == "NA" or normal_p == "":
+        return (tumor_p, normal_p)
+    assert len(tumor_p) == len(
+        normal_p
+    ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths."
+    diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
+    tp_changed = tumor_p
+    np_changed = normal_p
+    for p in diff_pos:
+        tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :]
+        np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :]
+    return (tp_changed, np_changed)
+
+all_neopeptides = pd.read_csv(snakemake.input.neopeptides, sep="\t")
+
+# If we leave in any dots, datavzrd will interpret this as attributes
+all_neopeptides.columns = all_neopeptides.columns.str.replace(".", "_", regex=False)
+
+# Aggregate multiple identical entries that differ only in 'id' and 'transcript'
+# into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated
+# list.
+# TODO: Remove this redundancy from microphaser output before passing it along to other
+# tools.
+cols = [c for c in all_neopeptides.columns if c not in ["id", "transcript"]]
+aggregation_functions = {
+    "id": lambda i: list(i)[0],
+    "transcript": lambda t: "|".join(list(t)),
+}
+all_neopeptides = (
+    all_neopeptides.groupby(cols, dropna=False)
+    .agg(aggregation_functions)
+    .reset_index()
+    .explode("id")
+)
+
+# highlight mutations in the original Xmers
+all_neopeptides[["mutation_mutatedXmer", "mutation_wildTypeXmer"]] = (
+    pd.DataFrame(
+        all_neopeptides
+        .fillna({"mutation_wildTypeXmer": ""})
+        .apply(lambda row: highlight_peptides_diff(row["mutation_mutatedXmer"], row["mutation_wildTypeXmer"]), axis="columns")
+        .tolist()
+    )
+)
+
+# create new purity-adjusted DNA variant allele frequency
+all_neopeptides["purity_adjusted_DNA_VAF"] = all_neopeptides["dnaVariantAlleleFrequency"] / snakemake.params.purity
+
+# round all floats to the specified decimals
+all_neopeptides = all_neopeptides.round(decimals=5)
+
+# define important columns to move to the left of the table
+
+important_cols_general = [
+            "gene",
+            "mutation_mutatedXmer",
+            "mutation_wildTypeXmer",
+            "purity_adjusted_DNA_VAF",
+            "imputedGeneExpression",
+        ]
+
+important_cols_one = [
+            "PRIME_best_rank",
+            "PRIME_best_score",
+            "PRIME_best_peptide",
+            "PRIME_best_allele",
+            "Best_rank_MHCI_9mer_score",
+            "Best_rank_MHCI_9mer_score_WT",
+            "Best_rank_MHCI_9mer_epitope",
+            "Best_rank_MHCI_9mer_epitope_WT",
+            "Best_rank_MHCI_9mer_allele",
+            "Best_rank_MHCI_9mer_allele_WT",
+        ]
+
+important_cols_two = [
+            "MixMHC2pred_best_rank",
+            "MixMHC2pred_best_peptide",
+            "MixMHC2pred_best_allele",
+            "Best_rank_MHCII_score",
+            "Best_rank_MHCII_score_WT",
+            "Best_rank_MHCII_score_epitope",
+            "Best_rank_MHCII_score_epitope_WT",
+            "Best_rank_MHCII_score_allele",
+            "Best_rank_MHCII_score_allele_WT",
+        ]
+
+important_cols = important_cols_general + important_cols_one + important_cols_two
+
+
+mhc_one = (
+    all_neopeptides[ important_cols + [ col for col in all_neopeptides.columns if col not in important_cols ] ]
+    .sort_values(by = ["PRIME_best_rank", "MixMHC2pred_best_rank"])
+    .groupby("PRIME_best_rank")
+    .head(n=1)
+)
+
+mhc_one.to_csv(snakemake.output.mhc_one, sep="\t", index=False)
+
+# move important columns to the left of the table
+mhc_two = (
+    all_neopeptides[ important_cols_general + important_cols_two + important_cols_one + [ col for col in all_neopeptides.columns if col not in important_cols ] ]
+    .sort_values(by = ["MixMHC2pred_best_rank", "PRIME_best_rank"])
+    .groupby("MixMHC2pred_best_rank")
+    .head(n=1)
+)
+
+mhc_two.to_csv(snakemake.output.mhc_two, sep="\t", index=False)

From 67ed16ae07b79c11573bcb22befd85dcfa324985 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?= <johannes.koester@tu-dortmund.de>
Date: Tue, 27 Sep 2022 16:47:09 +0000
Subject: [PATCH 188/191] fix: adapt to wrapper changes

---
 workflow/rules/microphaser.smk | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 83061530..4c0da1c9 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -1,13 +1,13 @@
 rule norm_bcf:
     input:
         "results/final-calls/{group}.{set}.bcf",
-        genome="resources/genome.fasta",
+        ref="resources/genome.fasta",
     output:
         "results/final-calls/{group}.{set}.norm.bcf",
     log:
         "logs/bcftools/norm/{group}.{set}.log",
     params:
-        lambda w, input: "-f {} -O b -m-".format(input.genome),  # optional parameters for bcftools norm (except -o)
+        extra="-m-"
     wrapper:
         "v1.12.0/bio/bcftools/norm"
 
@@ -58,7 +58,7 @@ rule merge_tumor_normal:
     params:
         extra="-O b -a",
     wrapper:
-        "v1.12.0/bio/bcftools/concat"
+        "v1.14.1/bio/bcftools/concat"
 
 
 rule microphaser_tumor:

From 5ffe288d60ed6ce049888e5dbbfa0ec35193b535 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?= <johannes.koester@tu-dortmund.de>
Date: Wed, 28 Sep 2022 13:23:15 +0000
Subject: [PATCH 189/191] polish datavzrd report

---
 workflow/envs/datavzrd.yaml                   |   2 +-
 ...neo_fox-neoantigens-template.datavzrd.yaml | 325 +++++++++++++++++-
 workflow/rules/common.smk                     |  39 +++
 workflow/rules/datavzrd.smk                   |   5 +-
 workflow/rules/microphaser.smk                |   2 +-
 workflow/scripts/prepare_neoprint.py          |  40 +--
 6 files changed, 373 insertions(+), 40 deletions(-)

diff --git a/workflow/envs/datavzrd.yaml b/workflow/envs/datavzrd.yaml
index e3f74967..859add75 100644
--- a/workflow/envs/datavzrd.yaml
+++ b/workflow/envs/datavzrd.yaml
@@ -1,4 +1,4 @@
 channels:
   - conda-forge
 dependencies:
-  - datavzrd =2.1
+  - datavzrd =2.2
diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
index 2f3e1773..40398001 100644
--- a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
+++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
@@ -1,3 +1,313 @@
+__definitions__:
+  - import pandas as pd
+  - from itertools import chain
+  - from copy import deepcopy
+__variables__:
+  important_cols: ?set(params.neofox_important_cols["general"]) | set(params.neofox_important_cols[wildcards.mhc])
+  cols: ?set(pd.read_csv(input.neopeptides, sep="\t").columns.values)
+  coldefs:
+    gene:
+      link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value}
+    mutation_mutatedXmer:
+      custom: |
+        function(value, row) {
+          return value.split("").map(function(a) {
+            if (a === a.toLowerCase()) {
+              return `<b style="color: red">${a}</b>`
+            } else {
+              return a
+            }
+          }).join("");
+        }
+    purity_adjusted_DNA_VAF:
+      plot:
+        ticks:
+          scale: linear
+    imputedGeneExpression:
+      plot:
+        ticks:
+          scale: linear
+    PRIME_best_rank:
+      plot:
+        heatmap:
+          scale: linear
+          domain:
+            - 0.0
+            - 100.0
+          range:
+            - "#EC0000"
+            - white
+    PRIME_best_score:
+      plot:
+        ticks:
+          scale: linear
+    Best_rank_MHCI_9mer_score:
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_rank_MHCI_9mer_score_WT
+    Best_rank_MHCI_9mer_score_WT:
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_rank_MHCI_9mer_score
+    MixMHC2pred_best_rank:
+      plot:
+        heatmap:
+          scale: linear
+          domain:
+            - 0.0
+            - 100.0
+          range:
+            - "#EC0000"
+            - "#ffffff"
+    Best_rank_MHCII_score:
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_rank_MHCII_score_WT
+    Best_rank_MHCII_score_WT:
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_rank_MHCII_score
+    Recognition_Potential_MHCI_9mer:
+      plot:
+        ticks:
+          scale: linear
+    Improved_Binder_MHCI:
+      plot:
+        heatmap:
+          scale: ordinal
+          domain:
+            - 0
+            - 1
+          range:
+            - "#ffffff"
+            - "#2CA02C"
+    Selfsimilarity_MHCI_conserved_binder:
+      plot:
+        ticks:
+          scale: linear
+    mutation_position:
+      display-mode: detail
+    dnaVariantAlleleFrequency:
+      display-mode: detail
+    rnaVariantAlleleFrequency:
+      display-mode: hidden
+    rnaExpression:
+      display-mode: hidden
+    ADN_MHCI:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+    ADN_MHCII:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+    Amplitude_MHCII_rank:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: linear
+          domain:
+            - 0.0
+            - 100.0
+          range:
+            - "#EC0000"
+            - "#ffffff"
+    Amplitude_MHCI_affinity:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+    Amplitude_MHCI_affinity_9mer:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+    MixMHC2pred_best_allele:
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - MixMHC2pred_best_allele
+            - Best_rank_MHCII_score_allele
+            - Best_rank_MHCII_score_allele_WT
+            - Best_affinity_MHCII_allele
+            - Best_affinity_MHCII_allele_WT
+    Best_rank_MHCII_score_allele:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - MixMHC2pred_best_allele
+            - Best_rank_MHCII_score_allele
+            - Best_rank_MHCII_score_allele_WT
+            - Best_affinity_MHCII_allele
+            - Best_affinity_MHCII_allele_WT
+    Best_rank_MHCII_score_allele_WT:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - MixMHC2pred_best_allele
+            - Best_rank_MHCII_score_allele
+            - Best_rank_MHCII_score_allele_WT
+            - Best_rank_MHCII_allele_WT
+            - Best_affinity_MHCII_allele
+            - Best_affinity_MHCII_allele_WT
+    Best_affinity_MHCII_allele:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - MixMHC2pred_best_allele
+            - Best_rank_MHCII_score_allele
+            - Best_rank_MHCII_score_allele_WT
+            - Best_rank_MHCII_allele_WT
+            - Best_affinity_MHCII_allele
+            - Best_affinity_MHCII_allele_WT
+    Best_affinity_MHCII_allele_WT:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - MixMHC2pred_best_allele
+            - Best_rank_MHCII_score_allele
+            - Best_rank_MHCII_score_allele_WT
+            - Best_rank_MHCII_allele_WT
+            - Best_affinity_MHCII_allele
+            - Best_affinity_MHCII_allele_WT
+    Best_affinity_MHCII_score:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_affinity_MHCII_score_WT
+    Best_affinity_MHCII_score_WT:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_affinity_MHCII_score
+    PRIME_best_allele:
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - PRIME_best_allele
+            - Best_rank_MHCI_9mer_allele
+            - Best_rank_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_9mer_allele
+            - Best_affinity_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_allele
+            - Best_affinity_MHCI_allele_WT
+    Best_rank_MHCI_9mer_allele:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - PRIME_best_allele
+            - Best_rank_MHCI_9mer_allele
+            - Best_rank_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_9mer_allele
+            - Best_affinity_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_allele
+            - Best_affinity_MHCI_allele_WT
+    Best_rank_MHCI_9mer_allele_WT:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:
+            - PRIME_best_allele
+            - Best_rank_MHCI_9mer_allele
+            - Best_rank_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_9mer_allele
+            - Best_affinity_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_allele
+            - Best_affinity_MHCI_allele_WT
+    Best_affinity_MHCI_9mer_score:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_affinity_MHCI_9mer_score_WT
+    Best_affinity_MHCI_9mer_score_WT:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_affinity_MHCI_9mer_score
+    Best_affinity_MHCI_allele:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:  # same list as the other MHC-I allele columns (PRIME_best_allele etc.)
+            - PRIME_best_allele
+            - Best_rank_MHCI_9mer_allele
+            - Best_rank_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_9mer_allele
+            - Best_affinity_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_allele
+            - Best_affinity_MHCI_allele_WT
+    Best_affinity_MHCI_allele_WT:
+      display-mode: detail
+      plot:
+        heatmap:
+          scale: ordinal
+          color-scheme: category20
+          aux-domain-columns:  # same list as the other MHC-I allele columns (PRIME_best_allele etc.)
+            - PRIME_best_allele
+            - Best_rank_MHCI_9mer_allele
+            - Best_rank_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_9mer_allele
+            - Best_affinity_MHCI_9mer_allele_WT
+            - Best_affinity_MHCI_allele
+            - Best_affinity_MHCI_allele_WT
+    Best_affinity_MHCI_score:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_affinity_MHCI_score_WT
+    Best_affinity_MHCI_score_WT:
+      display-mode: detail
+      plot:
+        ticks:
+          scale: linear
+          aux-domain-columns:
+            - Best_affinity_MHCI_score
+
 name: ?f"Neopeptide candidates for {wildcards.group}, tumor sample {wildcards.tumor_alias}"
 
 default-view: "overview"
@@ -11,6 +321,17 @@ views:
   overview:
     desc: ?f"Neopeptide candidates for {wildcards.group} tumor sample {wildcards.tumor_alias}, with annotations gathered and provided by NeoFox."
     dataset: neoprint
+    
     render-table:
-      gene:
-        link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value}
+      columns:
+        ?for col, coldef in coldefs.items():
+          __definitions__:
+            - |
+              coldef_ = deepcopy(coldef)
+              if col not in important_cols:
+                coldef_["display-mode"] = "detail"
+          ?col: ?coldef_
+        ?for col in cols:
+          ?if col not in coldefs and col not in important_cols:
+            ?col:
+              display-mode: detail
\ No newline at end of file
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 9a71c4d0..23a4b97e 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -219,3 +219,42 @@ def get_alleles_MHCII(wildcards):
         group=wildcards.group,
         alias=alias,
     )
+
+
+##### Important NeoFox columns (used for table column order and the datavzrd report) ####
+
+neofox_important_cols = {
+    "general": [
+            "gene",
+            "mutation_mutatedXmer",
+            "mutation_wildTypeXmer",
+            "purity_adjusted_DNA_VAF",
+            "imputedGeneExpression",
+        ],
+    "I": [
+            "PRIME_best_rank",
+            "PRIME_best_score",
+            "PRIME_best_peptide",
+            "PRIME_best_allele",
+            "Recognition_Potential_MHCI_9mer",
+            "Improved_Binder_MHCI",
+            "Selfsimilarity_MHCI_conserved_binder",
+            "Best_rank_MHCI_9mer_score",
+            "Best_rank_MHCI_9mer_score_WT",
+            "Best_rank_MHCI_9mer_epitope",
+            "Best_rank_MHCI_9mer_epitope_WT",
+            "Best_rank_MHCI_9mer_allele",
+            "Best_rank_MHCI_9mer_allele_WT",
+        ],
+    "II": [
+            "MixMHC2pred_best_rank",
+            "MixMHC2pred_best_peptide",
+            "MixMHC2pred_best_allele",
+            "Best_rank_MHCII_score",
+            "Best_rank_MHCII_score_WT",
+            "Best_rank_MHCII_score_epitope",
+            "Best_rank_MHCII_score_epitope_WT",
+            "Best_rank_MHCII_score_allele",
+            "Best_rank_MHCII_score_allele_WT",
+        ]
+}
\ No newline at end of file
diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
index 799616cc..d09c9cd7 100644
--- a/workflow/rules/datavzrd.smk
+++ b/workflow/rules/datavzrd.smk
@@ -7,7 +7,8 @@ rule prepare_neoprint:
     log:
         "logs/prepare_neoprint/{group}.{tumor_alias}.log",
     params:
-        purity = lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze()
+        purity = lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze(),
+        neofox_important_cols=neofox_important_cols,
     conda:
         "../envs/pandas.yaml"
     script:
@@ -22,6 +23,8 @@ rule render_datavzrd_neoprint_config:
         neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv",
     output:
         "resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml",
+    params:
+        neofox_important_cols=neofox_important_cols,
     log:
         "logs/datavzrd_render_neoprint/{group}.{tumor_alias}.{mhc}.log",
     template_engine:
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index 4c0da1c9..a52d72ef 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -56,7 +56,7 @@ rule merge_tumor_normal:
     log:
         "logs/bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log",
     params:
-        extra="-O b -a",
+        extra="-a",
     wrapper:
         "v1.14.1/bio/bcftools/concat"
 
diff --git a/workflow/scripts/prepare_neoprint.py b/workflow/scripts/prepare_neoprint.py
index d9a01928..0a535d14 100644
--- a/workflow/scripts/prepare_neoprint.py
+++ b/workflow/scripts/prepare_neoprint.py
@@ -1,4 +1,5 @@
 import sys
+import json
 
 sys.stderr = open(snakemake.log[0], "w")
 
@@ -62,40 +63,9 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
 
 # define important columns to move to the left of the table
 
-important_cols_general = [
-            "gene",
-            "mutation_mutatedXmer",
-            "mutation_wildTypeXmer",
-            "purity_adjusted_DNA_VAF",
-            "imputedGeneExpression",
-        ]
-
-important_cols_one = [
-            "PRIME_best_rank",
-            "PRIME_best_score",
-            "PRIME_best_peptide",
-            "PRIME_best_allele",
-            "Best_rank_MHCI_9mer_score",
-            "Best_rank_MHCI_9mer_score_WT",
-            "Best_rank_MHCI_9mer_epitope",
-            "Best_rank_MHCI_9mer_epitope_WT",
-            "Best_rank_MHCI_9mer_allele",
-            "Best_rank_MHCI_9mer_allele_WT",
-        ]
-
-important_cols_two = [
-            "MixMHC2pred_best_rank",
-            "MixMHC2pred_best_peptide",
-            "MixMHC2pred_best_allele",
-            "Best_rank_MHCII_score",
-            "Best_rank_MHCII_score_WT",
-            "Best_rank_MHCII_score_epitope",
-            "Best_rank_MHCII_score_epitope_WT",
-            "Best_rank_MHCII_score_allele",
-            "Best_rank_MHCII_score_allele_WT",
-        ]
-
-important_cols = important_cols_general + important_cols_one + important_cols_two
+neofox_important_cols = snakemake.params.neofox_important_cols
+
+important_cols = neofox_important_cols["general"] + neofox_important_cols["I"] + neofox_important_cols["II"]
 
 
 mhc_one = (
@@ -109,7 +79,7 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
 
 # move important columns to the left of the table
 mhc_two = (
-    all_neopeptides[ important_cols_general + important_cols_two + important_cols_one + [ col for col in all_neopeptides.columns if col not in important_cols ] ]
+    all_neopeptides[ neofox_important_cols["general"] + neofox_important_cols["II"] + neofox_important_cols["I"] + [ col for col in all_neopeptides.columns if col not in important_cols ] ]
     .sort_values(by = ["MixMHC2pred_best_rank", "PRIME_best_rank"])
     .groupby("MixMHC2pred_best_rank")
     .head(n=1)

From 9257537e11c9d66893d643f66c85ad7c1926ffc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?= <johannes.koester@tu-dortmund.de>
Date: Wed, 28 Sep 2022 13:58:57 +0000
Subject: [PATCH 190/191] polish template

---
 .../neo_fox-neoantigens-template.datavzrd.yaml        | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
index 40398001..270c77a7 100644
--- a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
+++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
@@ -30,7 +30,7 @@ __variables__:
     PRIME_best_rank:
       plot:
         heatmap:
-          scale: linear
+          scale: symlog
           domain:
             - 0.0
             - 100.0
@@ -319,7 +319,14 @@ datasets:
 
 views:
   overview:
-    desc: ?f"Neopeptide candidates for {wildcards.group} tumor sample {wildcards.tumor_alias}, with annotations gathered and provided by NeoFox."
+    desc: |
+      Neopeptide candidates with annotations gathered and provided by NeoFox.
+
+      ### Column descriptions
+      * **Recognition_Potential_MHCI_9mer:** The recognition potential of a neoantigen is the likelihood that it is effectively recognized by the TCR repertoire (definition: Amplitude_MHCI_affinity_9mer x Pathogensimiliarity_MHCI_affinity_9mer) \[[runup to equation (5) in Luksza et al. 2017](https://doi.org/10.1038/nature24473)\].
+      * **Selfsimilarity_MHCI_conserved_binder:** Score for k-mer based similarity between Best_rank_MHCI_score_epitope and Best_affinity_MHCI_epitope_WT. For conserved binders only (i.e. NOT improved binders), where this score is considered indicative of immunogenicity \[[page 3 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\].
+      * **Improved_Binder_MHCI:** Cutoff of 1.2 on the ratio between normal and mutated peptide rank scores to designate a peptide as an improved binder (as opposed to a conserved binder) (definition: (Best_rank_MHCI_score_WT / Best_rank_MHCI_score ) > 1.2) \[[page 4 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\]
+
     dataset: neoprint
     
     render-table:

From d40cffe13676441fc29d8e93db2e52a5ce85138b Mon Sep 17 00:00:00 2001
From: dlaehnemann <david.laehnemann@hhu.de>
Date: Fri, 18 Nov 2022 15:58:26 +0000
Subject: [PATCH 191/191] update to Ensembl release 108 (GTFs in 105 and 106
 had genes partly unsorted)

---
 .test/config/config.yaml | 2 +-
 config/config.yaml       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index 80c0e438..7eb854e1 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -13,7 +13,7 @@ ref:
   # Ensembl species name
   species: homo_sapiens
   # Ensembl release
-  release: 100
+  release: 108
   # Genome build
   build: GRCh38
 
diff --git a/config/config.yaml b/config/config.yaml
index 4d447645..e00ae4d9 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -14,7 +14,7 @@ ref:
   # Ensembl species name
   species: homo_sapiens
   # Ensembl release
-  release: 100
+  release: 108
   # Genome build
   build: GRCh38