diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 503d4a5e..364e02a4 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -8,33 +8,42 @@ on:
jobs:
+ Formatting:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Formatting
+ uses: github/super-linter@v4
+ env:
+ VALIDATE_ALL_CODEBASE: false
+ DEFAULT_BRANCH: master
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ VALIDATE_SNAKEMAKE_SNAKEFMT: true
+
+
Linting:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v1
+ - uses: actions/checkout@v2
- name: Lint workflow
- uses: snakemake/snakemake-github-action@v1.17.0
+ uses: snakemake/snakemake-github-action@v1.22.0
with:
directory: .
snakefile: workflow/Snakefile
args: "--lint"
- stagein: |
- export TMPDIR=/tmp
Testing:
runs-on: ubuntu-latest
- needs: Linting
+ needs:
+ - Linting
+ - Formatting
steps:
- - uses: actions/checkout@v1
- - name: Checkout submodules
- uses: textbook/git-checkout-submodule-action@2.0.0
+ - uses: actions/checkout@v2
- name: Test workflow (local FASTQs)
- uses: snakemake/snakemake-github-action@v1.17.0
+ uses: snakemake/snakemake-github-action@v1.22.0
with:
directory: .test
snakefile: workflow/Snakefile
args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba"
- stagein: |
- export TMPDIR=/tmp
diff --git a/.gitignore b/.gitignore
index ee6ed52e..02a9591f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,23 +1,21 @@
*
-!scripts
-!scripts/*
-!scripts/common
-!scripts/common/*
-scripts/.snakemake*
-!Snakefile
-!config.yaml
-!samples.tsv
-!resources
-!resources/*
-!envs
-!envs/*
-!environment.yaml
+!.github/workflows/*
+!.test
+!.test/*
+!.test/config/*
+!.test/data/*
+!config
+!config/*
+!config/HLA_Data/*
+!workflow
+!workflow/*
+!workflow/report/*
+!workflow/envs/*
+!workflow/schemas/*
+!workflow/scripts/*
+!workflow/rules/*
+!workflow/rules/annotation/*
+!workflow/resources/*
+!.gitignore
!LICENSE
!README.md
-!rules
-!rules/*
-!.gitignore
-!.editorconfig
-!.gitattributes
-!.test
-!.test/data
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
index d10fc559..7eb854e1 100644
--- a/.test/config/config.yaml
+++ b/.test/config/config.yaml
@@ -1,84 +1,10 @@
samples: "config/samples.tsv"
units: "config/units.tsv"
-# boolean if read trimming should be skipped
-trimming:
- activate: false
-remove_duplicates:
+neoantigen_prediction:
activate: true
-calling:
- freebayes:
- activate: false
- # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling
- scenario: config/scenario.yaml
- filter:
- # Filter candidate variants (this filter helps to keep the number of evaluated candidates small).
- # It should ideally generate a superset of all other filters defined below.
- # Annotation of candidate variants tries to be as fast as possible, only using VEP
- # default parameters.
- candidates: ""
- # Add any number of named filters here. They will be applied independenty,
- # and can be referred in FDR control below to generate calls for different events.
- # In particular, you can also filter by ID or dbsnp annotations here.
- # See http://snpeff.sourceforge.net/SnpSift.html#filter
- filtername: "ANN['IMPACT'] != 'MODIFIER'"
- fdr-control:
- threshold: 0.05
- events:
- complete:
- varlociraptor:
- - "somatic"
- - "germline"
- somatic:
- varlociraptor:
- - "somatic"
- germline:
- varlociraptor:
- - "germline"
-
-fusion:
- arriba:
- activate: false
- blacklist:
- "arriba_blacklist"
- params:
- "-T -P"
-
-tmb:
- activate: false
- coding_genome_size: 3e7
- # Name of the tumor sample in the scenario.yaml.
- tumor_sample: tumor
- somatic_events:
- - somatic
-
-
-epitope_prediction:
- activate: false
-
-
-affinity:
- netMHCpan:
- activate: false
- params: "-BA -l 9 -s -xls"
- location: "../netMHCpan-4.0"
- netMHCIIpan:
- activate: false
- params: "-length 15 -s -xls"
- location: "../netMHCIIpan-4.0"
-
-
-HLAtyping:
- # activate to use razers3 to pre-filter reads before using optitype
- optitype_prefiltering:
- activate: false
- optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
- # activate to predict MHC-I and MHC-II alleles with HLA-LA
- HLA_LA:
- activate: false
-
ref:
# Number of chromosomes to consider for calling.
@@ -87,49 +13,42 @@ ref:
# Ensembl species name
species: homo_sapiens
# Ensembl release
- release: 100
+ release: 108
# Genome build
build: GRCh38
-annotations:
- vep:
- params: "--everything"
- plugins:
- # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
- # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt".
- - LoFtool
-
params:
- cutadapt: ""
- bwa:
- "-M"
- picard:
- MarkDuplicates:
- "VALIDATION_STRINGENCY=LENIENT"
- gatk:
- BaseRecalibrator: "--tmp-dir tmp"
- applyBQSR: ""
- strelka:
- config:
- "--exome"
- run:
- "--mode local"
- razers3:
- "-i 95 -m 1 -dr 0"
- optitype:
- ""
microphaser:
- window_len:
- 33
- peptide_len:
- netMHCpan:
- 9
- netMHCIIpan:
- 15
- kallisto:
- "-b 100"
- star: >-
- --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip
- --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0
- --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3
+ events:
+ tumor: "tumor_only"
+ normal: "normal_only"
+ net_mhc_pan:
+ activate: true
+ peptide_len: 9
+ extra: ""
+ # Please download netMHCpan manually from:
+ # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+    # To make the `netMHCpan` script work, you need to fix its first line,
+    # in addition to the other edits described for a complete install. To use
+ # the conda-provided tcsh installation, it needs to read (without quotes):
+ # "#!/usr/bin/env tcsh"
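+    # A possible one-liner for this shebang fix (assuming GNU sed and running
+    # inside the unpacked netMHCpan-4.1 directory) would be:
+    #   sed -i '1s%^#! /bin/tcsh -f$%#!/usr/bin/env tcsh%' netMHCpan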
+ location: "../netMHCpan-4.1"
+ net_mhc_two_pan:
+ activate: false
+ peptide_len: 15
+ extra: ""
+ # Please download netMHCIIpan manually from:
+ # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+    # To make the `netMHCIIpan` script work, you need to fix its first line,
+    # in addition to the other edits described for a complete install. To use
+ # the conda-provided tcsh installation, it needs to read (without quotes):
+ # "#!/usr/bin/env tcsh"
+ location: "../netMHCIIpan-4.1"
+ neo_fox:
+ activate: false
+ # This should be at least as long as the desired net_mhc_two_pan peptide length
+ peptide_len: 15
+    # update the version number to get a newer release of the reference set of HLA alleles
+ hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt"
+ extra: ""
diff --git a/.test/config/groups.tsv b/.test/config/groups.tsv
new file mode 100644
index 00000000..c0a77bbd
--- /dev/null
+++ b/.test/config/groups.tsv
@@ -0,0 +1,2 @@
+group tumorType
+A LUSC
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
index 044002d7..2aca82dc 100644
--- a/.test/config/samples.tsv
+++ b/.test/config/samples.tsv
@@ -1,3 +1,3 @@
-sample type matched_normal purity platform
-A_normal normal ILLUMINA
-A_tumor tumor A_normal 1 ILLUMINA
+sample_name group alias purity platform
+A_normal A normal 1 ILLUMINA
+A_tumor A tumor .99 ILLUMINA
diff --git a/.test/config/units.tsv b/.test/config/units.tsv
index cdb8af8e..4984edd8 100644
--- a/.test/config/units.tsv
+++ b/.test/config/units.tsv
@@ -1,3 +1,3 @@
-sample sequencing_type unit fq1 fq2 sra adapters
+sample_name sequencing_type unit_name fq1 fq2 sra adapters
A_normal DNA lane1 data/reads/A_normal.1.fastq.gz data/reads/A_normal.2.fastq.gz
A_tumor DNA lane1 data/reads/A_tumor.1.fastq.gz data/reads/A_tumor.2.fastq.gz
diff --git a/config/config.yaml b/config/config.yaml
index f1e6818a..e00ae4d9 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,73 +1,11 @@
samples: "config/samples.tsv"
units: "config/units.tsv"
+groups: "config/groups.tsv"
-# boolean if read trimming should be skipped
-trimming:
- activate: false
-remove_duplicates:
+neoantigen_prediction:
activate: true
-calling:
- freebayes:
- activate: false
- # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling
- scenario: config/scenario.yaml
- fdr-control:
- threshold: 0.05
- events:
- complete:
- varlociraptor:
- - "somatic"
- - "germline"
- somatic:
- varlociraptor:
- - "somatic"
- germline:
- varlociraptor:
- - "germline"
-
-fusion:
- arriba:
- activate: false
- blacklist:
- "arriba_blacklist"
- params:
- "-T -P"
-
-tmb:
- activate: false
- coding_genome_size: 3e7
- # Name of the tumor sample in the scenario.yaml.
- tumor_sample: tumor
- somatic_events:
- - somatic
-
-
-epitope_prediction:
- activate: true
-
-
-affinity:
- netMHCpan:
- activate: true
- params: "-BA -l 9 -s -xls"
- location: "../netMHCpan-4.0"
- netMHCIIpan:
- activate: false
- params: "-length 15 -s -xls"
- location: "../netMHCIIpan-4.0"
-
-
-HLAtyping:
- # activate to use razers3 to pre-filter reads before using optitype
- optitype_prefiltering:
- activate: false
- optitype_data: "config/HLA_Data/hla_reference_dna.fasta"
- # activate to predict MHC-I and MHC-II alleles with HLA-LA
- HLA_LA:
- activate: false
-
ref:
# Number of chromosomes to consider for calling.
@@ -76,49 +14,44 @@ ref:
# Ensembl species name
species: homo_sapiens
# Ensembl release
- release: 100
+ release: 108
# Genome build
build: GRCh38
-annotations:
- vep:
- params: "--everything"
- plugins:
- # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
- # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt".
- - LoFtool
-
params:
- cutadapt: ""
- bwa:
- "-M"
- picard:
- MarkDuplicates:
- "VALIDATION_STRINGENCY=LENIENT"
- gatk:
- BaseRecalibrator: "--tmp-dir tmp"
- applyBQSR: ""
- strelka:
- config:
- "--exome"
- run:
- "--mode local"
- razers3:
- "-i 95 -m 1 -dr 0"
- optitype:
- ""
microphaser:
- window_len:
- 33
- peptide_len:
- netMHCpan:
- 9
- netMHCIIpan:
- 15
- kallisto:
- "-b 100"
- star: >-
- --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip
- --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0
- --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3
+ # window_len should be at least 3 times the longest peptide_len specified below
+ events:
+ normal: "normal_only"
+ tumor: "tumor_only"
+ net_mhc_pan:
+ activate: true
+ peptide_len: 9
+ extra: ""
+ # Please download netMHCpan manually from:
+ # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+    # To make the `netMHCpan` script work, you need to fix its first line,
+    # in addition to the other edits described for a complete install. To use
+ # the conda-provided tcsh installation, it needs to read (without quotes):
+ # "#!/usr/bin/env tcsh"
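+    # A possible one-liner for this shebang fix (assuming GNU sed and running
+    # inside the unpacked netMHCpan-4.1 directory) would be:
+    #   sed -i '1s%^#! /bin/tcsh -f$%#!/usr/bin/env tcsh%' netMHCpan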
+ location: "../netMHCpan-4.1"
+ net_mhc_two_pan:
+ activate: false
+ peptide_len: 15
+ extra: ""
+ # Please download netMHCIIpan manually from:
+ # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1
+    # To make the `netMHCIIpan` script work, you need to fix its first line,
+    # in addition to the other edits described for a complete install. To use
+ # the conda-provided tcsh installation, it needs to read (without quotes):
+ # "#!/usr/bin/env tcsh"
+ location: "../netMHCIIpan-4.1"
+ neo_fox:
+ activate: false
+ # This should be at least as long as the desired net_mhc_two_pan peptide length
+ peptide_len: 15
+    # update the version number to get a newer release of the reference set of HLA alleles
+ hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt"
+ extra: ""
+
diff --git a/config/groups.tsv b/config/groups.tsv
new file mode 100644
index 00000000..11353453
--- /dev/null
+++ b/config/groups.tsv
@@ -0,0 +1,3 @@
+group tumorType
+A LUSC
+B LUAD
diff --git a/config/samples.tsv b/config/samples.tsv
index 6e291f15..249e0521 100644
--- a/config/samples.tsv
+++ b/config/samples.tsv
@@ -1,4 +1,5 @@
-sample type matched_normal purity platform
-A_normal normal ILLUMINA
-A_tumor tumor A_normal 1 ILLUMINA
-B_tumor tumor A_normal 1 ILLUMINA
+sample_name group alias purity platform
+A_normal A normal 1 ILLUMINA
+A_tumor A tumor .99 ILLUMINA
+B_normal B normal 1 ILLUMINA
+B_tumor B tumor .98 ILLUMINA
diff --git a/config/units.tsv b/config/units.tsv
index f8758dfb..c417a45c 100644
--- a/config/units.tsv
+++ b/config/units.tsv
@@ -1,5 +1,6 @@
-sample sequencing_type unit fq1 fq2 sra adapters
+sample_name sequencing_type unit_name fq1 fq2 sra adapters
A_normal DNA lane1 A_normal_1.fastq.gz A_normal_2.fastq.gz
A_tumor DNA lane2 A_tumor_1.fastq.gz A_tumor_2.fastq.gz
+B_normal DNA lane1 B_normal_1.fastq.gz B_normal_2.fastq.gz
B_tumor DNA lane1 B_tumor_1.fastq.gz B_tumor_2.fastq.gz
B_tumor RNA lane1 B_tumor_RNA_1.fastq.gz B_tumor_RNA_2.fastq.gz
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 2b881b44..7282e12f 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -12,6 +12,23 @@ scattergather:
calling=24,
+##### required envvars #####
+
+
+envvars:
+ # For NeoFox installation:
+    # The tarballs for both netMHCpan and netMHCIIpan
+    # need to be downloaded manually after registering
+    # online, and deposited at a filesystem location
+    # that is then provided via the respective shell
+    # environment variable when running snakemake.
+ # For downloading, see the "Downloads" tab at:
+ # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+ "NET_MHC_PAN_4_1_TARBALL",
+ # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.0
+ "NET_MHC_TWO_PAN_4_0_TARBALL",
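+    # Example of setting these before running snakemake (the paths below
+    # are hypothetical):
+    #   export NET_MHC_PAN_4_1_TARBALL=/path/to/netMHCpan-4.1b.Linux.tar.gz
+    #   export NET_MHC_TWO_PAN_4_0_TARBALL=/path/to/netMHCIIpan-4.0.Linux.tar.gz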
+
+
##### setup report #####
@@ -27,24 +44,14 @@ container: "docker://continuumio/miniconda3"
include: "rules/common.smk"
include: "rules/utils.smk"
-include: "rules/trim.smk"
include: "rules/ref.smk"
-include: "rules/mapping.smk"
-include: "rules/calling.smk"
-include: "rules/candidate_calling.smk"
-include: "rules/varlociraptor.smk"
-include: "rules/annotation.smk"
-include: "rules/filtering.smk"
include: "rules/microphaser.smk"
-include: "rules/HLAtyping.smk"
-include: "rules/MHC_binding.smk"
-include: "rules/RNA.smk"
-include: "rules/tmb.smk"
-include: "rules/vega.smk"
+include: "rules/hla_typing.smk"
+include: "rules/mhc_binding.smk"
+include: "rules/annotate_neoantigens.smk"
+include: "rules/datavzrd.smk"
rule all:
input:
get_final_output(),
- get_fusion_output(),
- get_tmb_targets(),
diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml
index 70e39da1..35bf7643 100644
--- a/workflow/envs/bcftools.yaml
+++ b/workflow/envs/bcftools.yaml
@@ -2,4 +2,4 @@ channels:
- conda-forge
- bioconda
dependencies:
- - bcftools =1.10
+ - bcftools =1.14
diff --git a/workflow/envs/datavzrd.yaml b/workflow/envs/datavzrd.yaml
new file mode 100644
index 00000000..859add75
--- /dev/null
+++ b/workflow/envs/datavzrd.yaml
@@ -0,0 +1,4 @@
+channels:
+ - conda-forge
+dependencies:
+ - datavzrd =2.2
diff --git a/workflow/envs/gawk.yaml b/workflow/envs/gawk.yaml
new file mode 100644
index 00000000..fa09e590
--- /dev/null
+++ b/workflow/envs/gawk.yaml
@@ -0,0 +1,4 @@
+channels:
+ - conda-forge
+dependencies:
+ - gawk =5.1
diff --git a/workflow/envs/hla_la.yaml b/workflow/envs/hla_la.yaml
index 6c2487fb..6a24bc29 100644
--- a/workflow/envs/hla_la.yaml
+++ b/workflow/envs/hla_la.yaml
@@ -1,8 +1,5 @@
channels:
- conda-forge
- bioconda
- - jafors
dependencies:
- - hla-la ==1.0.5
- - samtools ==1.10
- - boost-cpp ==1.73.0
+ - hla-la ==1.0.3
diff --git a/workflow/envs/htslib.yaml b/workflow/envs/htslib.yaml
new file mode 100644
index 00000000..b5d7959d
--- /dev/null
+++ b/workflow/envs/htslib.yaml
@@ -0,0 +1,5 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - htslib =1.14
diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml
index 84d2ad83..90698479 100644
--- a/workflow/envs/microphaser.yaml
+++ b/workflow/envs/microphaser.yaml
@@ -2,4 +2,4 @@ channels:
- bioconda
- conda-forge
dependencies:
- - microphaser =0.2
+ - microphaser =0.7
diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh
new file mode 100755
index 00000000..c344af3e
--- /dev/null
+++ b/workflow/envs/neo_fox_deps.post-deploy.sh
@@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# set all the necessary conda paths and
+# ensure they exist
+CONDA_BIN="${CONDA_PREFIX}/bin/"
+# this is needed for the allele list files of MixMHCpred, MixMHC2pred
+# and PRIME, as NeoFox expects some of these files in that directory
+mkdir -p ${CONDA_BIN}/lib/
+CONDA_MAN1="${CONDA_PREFIX}/share/man/man1/"
+mkdir -p $CONDA_MAN1
+CONDA_INFO="${CONDA_PREFIX}/share/info/"
+mkdir -p $CONDA_INFO
+CONDA_LIB="${CONDA_PREFIX}/lib/"
+mkdir -p ${CONDA_LIB}
+CONDA_ETC="${CONDA_PREFIX}/etc/"
+mkdir -p $CONDA_ETC
+
+# install MixMHCpred, following:
+# https://github.com/GfellerLab/MixMHCpred/blob/v2.1/README
+MIX_MHC_PRED_VERSION="2.1"
+MIX_MHC_PRED_LIB_PATH="$CONDA_PREFIX/lib/mix_mhc_pred/"
+wget -q https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz
+tar xzf v${MIX_MHC_PRED_VERSION}.tar.gz
+cd MixMHCpred-${MIX_MHC_PRED_VERSION}
+g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x
+# TODO: when updating to v2.2, change this line to:
+# MMP_PLACEHOLDER="/PATH_TO_MIXMHCPRED/lib"
+MMP_PLACEHOLDER="YOUR PATH TO MixMHCpred/lib FOLDER"
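+# grep for the placeholder first: with the `set -euo pipefail` above, this
+# makes the script fail loudly if the line to be patched ever changes
+# (sed alone would silently replace nothing)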
+grep "${MMP_PLACEHOLDER}" MixMHCpred
+sed -i "s%${MMP_PLACEHOLDER}%${MIX_MHC_PRED_LIB_PATH}%" MixMHCpred
+mv MixMHCpred ${CONDA_BIN}
+# The allele_list.txt file needs to be in a `lib/` subdirectory of the `bin/` dir; this is hard-coded here:
+# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L123
+cp lib/allele_list.txt ${CONDA_BIN}/lib/
+mv lib $MIX_MHC_PRED_LIB_PATH
+# TODO: when updating to v2.2, change this line to:
+# mv MixMHCpred_license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf
+mv license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf
+MixMHCpred -i test/test.fa -o test/out.txt -a A0101,A2501,B0801,B1801
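+# line 4 of the test output apparently contains run-specific information
+# (e.g. the invocation path), so drop it on both sides before comparing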
+diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt)
+cd ..
+rm v${MIX_MHC_PRED_VERSION}.tar.gz
+rm -r MixMHCpred-${MIX_MHC_PRED_VERSION}
+
+# install MixMHC2pred, mostly following (we use the default GitHub-created
+# .tar.gz file instead of the hand-crafted .zip file for future-proofing):
+# https://github.com/GfellerLab/MixMHC2pred/blob/v1.2/README.md
+MIX_MHC_TWO_PRED_VERSION="1.2"
+MIX_MHC_TWO_PRED_LIB_PATH="${CONDA_LIB}/mix_mhc_two_pred/"
+wget -q https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
+tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
+cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION}
+mv -t ${CONDA_BIN} MixMHC2pred MixMHC2pred_unix
+# The Alleles_list.txt file needs to be in the same directory as the binaries; this is hard-coded here:
+# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L114
+mv -t ${CONDA_BIN} Alleles_list.txt
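+# MixMHC2pred seems to expect the rpep/ folder right next to its binary,
+# so keep the data under etc/ and symlink it into bin/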
+mv rpep ${CONDA_ETC}
+ln -s ${CONDA_ETC}/rpep ${CONDA_BIN}/rpep
+mv LICENSE ${CONDA_INFO}/MixMHC2pred_unix_LICENSE
+MixMHC2pred_unix -i test/testData.txt -o test/out.txt -a DRB1_11_01 DRB3_02_02 DPA1_01_03__DPB1_04_01 DQA1_05_05__DQB1_03_01
+diff test/out.txt test/out_compare.txt
+cd ..
+rm v${MIX_MHC_TWO_PRED_VERSION}.tar.gz
+rm -r MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION}
+
+# install PRIME, mostly following (minor corrections):
+# https://github.com/GfellerLab/PRIME/blob/v1.0/README
+PRIME_VERSION="1.0"
+PRIME_LIB_PATH="${CONDA_LIB}/prime/"
+wget -q https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz
+tar xzf v${PRIME_VERSION}.tar.gz
+cd PRIME-${PRIME_VERSION}
+PRIME_PLACEHOLDER="/app/PRIME/lib"
+grep "${PRIME_PLACEHOLDER}" PRIME
+sed -i "s%${PRIME_PLACEHOLDER}%${PRIME_LIB_PATH}%" PRIME
+mv PRIME ${CONDA_BIN}
+# The alleles.txt file needs to be in a `lib/` subdirectory of the `bin/` dir; this is hard-coded here:
+# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L132
+cp lib/alleles.txt ${CONDA_BIN}/lib/
+mv lib $PRIME_LIB_PATH
+mv PRIME_license.pdf ${CONDA_INFO}
+PRIME -i test/test.txt -o test/out.txt -a A0201,A0101
+diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt)
+cd ..
+rm v${PRIME_VERSION}.tar.gz
+rm -r PRIME-${PRIME_VERSION}
+
+# This is the non-portable version of the 1st line
+# in both netMHCpan and netMHCIIpan, assuming a
+# root install of tcsh.
+TCSH_ROOT="#! /bin/tcsh -f"
+# For the scripts to work with any tcsh in the
+# $PATH, we need to change this to:
+TCSH_PATH="#!/usr/bin/env tcsh"
+
+# install netMHCpan version 4.1
+# requires tcsh to have been installed via conda dependencies
+NET_MHC_PAN_4_1_LIB="${CONDA_LIB}/netMHCpan_4_1/"
+mkdir -p ${NET_MHC_PAN_4_1_LIB}
+NET_MHC_PAN_4_1_ETC="${CONDA_ETC}/netMHCpan_4_1/"
+mkdir -p ${NET_MHC_PAN_4_1_ETC}
+tar xzf ${NET_MHC_PAN_4_1_TARBALL}
+cd netMHCpan-4.1
+wget -q https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz
+tar xzf data.tar.gz
+rm data.tar.gz
+grep "${TCSH_ROOT}" netMHCpan
+sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCpan
+grep -P "setenv\s+NMHOME" netMHCpan
+sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_PAN_4_1_LIB}%" netMHCpan
+mv -t ${CONDA_BIN} netMHCpan
+mv -t ${CONDA_MAN1} netMHCpan.1
+mv -t ${CONDA_INFO} netMHCpan-4.1.readme
+mv -t ${NET_MHC_PAN_4_1_LIB} Linux_x86_64
+mv -t ${NET_MHC_PAN_4_1_ETC} data
+ln -s ${NET_MHC_PAN_4_1_ETC}/data ${NET_MHC_PAN_4_1_LIB}/data
+cd test
+netMHCpan -p test.pep -BA -xls -a HLA-A01:01,HLA-A02:01 -xlsfile my_NetMHCpan_out.xls
+diff NetMHCpan_out.xls my_NetMHCpan_out.xls
+cd ../..
+rm -r netMHCpan-4.1
+
+# install netMHCIIpan version 4.0
+# requires tcsh to have been installed via conda dependencies
+NET_MHC_TWO_PAN_4_0_LIB="${CONDA_LIB}/netMHCIIpan_4_0/"
+mkdir -p ${NET_MHC_TWO_PAN_4_0_LIB}
+NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/"
+mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC}
+tar xzf ${NET_MHC_TWO_PAN_4_0_TARBALL}
+cd netMHCIIpan-4.0
+wget -q https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz
+tar xzf data.tar.gz
+rm data.tar.gz
+grep "${TCSH_ROOT}" netMHCIIpan
+sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCIIpan
+grep -P "setenv\s+NMHOME" netMHCIIpan
+sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_TWO_PAN_4_0_LIB}%" netMHCIIpan
+mv -t ${CONDA_BIN} netMHCIIpan
+mv -t ${CONDA_MAN1} netMHCIIpan.1
+mv -t ${CONDA_INFO} netMHCIIpan-4.0.readme
+mv -t ${NET_MHC_TWO_PAN_4_0_LIB} Linux_x86_64 NetMHCIIpan-4.0.pl
+mv -t ${NET_MHC_TWO_PAN_4_0_ETC} data
+ln -s ${NET_MHC_TWO_PAN_4_0_ETC}/data ${NET_MHC_TWO_PAN_4_0_LIB}/data
+cd test
+netMHCIIpan -f example.fsa -a DRB1_0101 > example.fsa.myout
+diff example.fsa.out example.fsa.myout
+netMHCIIpan -f example.pep -inptype 1 -a DRB1_0101 > example.pep.myout
+diff example.pep.out example.pep.myout
+netMHCIIpan -f example.fsa -a H-2-IAb -s -u > example.fsa.sorted.myout
+diff example.fsa.sorted.out example.fsa.sorted.myout
+netMHCIIpan -f example.fsa -hlaseq DRB10101.fsa > example.fsa_hlaseq.myout
+diff example.fsa_hlaseq.out example.fsa_hlaseq.myout
+netMHCIIpan -f example.fsa -hlaseqA alpha.dat -hlaseq beta.dat > example.fsa_hlaseq_A+B.myout
+diff example.fsa_hlaseq_A+B.out example.fsa_hlaseq_A+B.myout
+cd ../..
+rm -r netMHCIIpan-4.0
diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml
new file mode 100644
index 00000000..780af1d6
--- /dev/null
+++ b/workflow/envs/neo_fox_deps.yaml
@@ -0,0 +1,30 @@
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ # https://neofox.readthedocs.io/en/latest/02_installation.html#step-by-step-guide-without-docker
+ - r-base =3.6
+ - python >=3.7, <=3.8
+ # implicit unmentioned dependency of MixMHCpred and PRIME
+ - perl
+ # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp
+ - blast =2
+ # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64
+ - cxx-compiler
+ # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts
+ - tcsh =6.24
+ # just make sure this is available for the post-deploy.sh script
+ - sed =4.8
+ # R packages mentioned at the end of this section:
+ # https://neofox.readthedocs.io/en/latest/02_installation.html#configuration-of-the-reference-folder
+ - r-lattice
+ - r-ggplot2
+ - r-caret
+ - r-peptides
+ - r-doparallel
+ - r-gbm
+ - bioconductor-biostrings
+ # https://neofox.readthedocs.io/en/latest/02_installation.html#install-neofox
+ - pip
+ - pip:
+ - neofox==0.6.4
diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml
new file mode 100644
index 00000000..5ced20fa
--- /dev/null
+++ b/workflow/envs/pandas.yaml
@@ -0,0 +1,4 @@
+channels:
+ - conda-forge
+dependencies:
+ - pandas=1.4
\ No newline at end of file
diff --git a/workflow/envs/rbt.yaml b/workflow/envs/rbt.yaml
index 6c8be1fb..94ff5bae 100644
--- a/workflow/envs/rbt.yaml
+++ b/workflow/envs/rbt.yaml
@@ -1,6 +1,6 @@
channels:
- - bioconda
- conda-forge
+ - bioconda
dependencies:
- rust-bio-tools =0.19
- - bcftools =1.10
+ - bcftools =1.14
diff --git a/workflow/envs/tcsh.yaml b/workflow/envs/tcsh.yaml
new file mode 100644
index 00000000..9bc374fd
--- /dev/null
+++ b/workflow/envs/tcsh.yaml
@@ -0,0 +1,4 @@
+channels:
+ - conda-forge
+dependencies:
+ - tcsh =6.24
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
index a76d765a..270dc831 100644
--- a/workflow/envs/varlociraptor.yaml
+++ b/workflow/envs/varlociraptor.yaml
@@ -3,4 +3,4 @@ channels:
- bioconda
dependencies:
- varlociraptor =2.3.0
- - bcftools =1.10
+ - bcftools =1.14
diff --git a/workflow/report/HLA_Types.rst b/workflow/report/HLA_Types.rst
deleted file mode 100644
index 999fe494..00000000
--- a/workflow/report/HLA_Types.rst
+++ /dev/null
@@ -1 +0,0 @@
-Typing of HLA profile.
diff --git a/workflow/report/WES_results.rst b/workflow/report/WES_results.rst
deleted file mode 100644
index 4a699bdf..00000000
--- a/workflow/report/WES_results.rst
+++ /dev/null
@@ -1 +0,0 @@
-"Results - neoantigen candidate table"
diff --git a/workflow/report/hla_alleles.rst b/workflow/report/hla_alleles.rst
new file mode 100644
index 00000000..a793a914
--- /dev/null
+++ b/workflow/report/hla_alleles.rst
@@ -0,0 +1 @@
+HLA allele profile as determined by HLA-LA.
diff --git a/workflow/report/neoantigens.dna.rst b/workflow/report/neoantigens.dna.rst
new file mode 100644
index 00000000..17ba537e
--- /dev/null
+++ b/workflow/report/neoantigens.dna.rst
@@ -0,0 +1,60 @@
+Neoantigens and corresponding normal peptides as phased and determined by
+microphaser, with eluted ligand / binding affinity predictions by netMHC
+(netMHCpan and netMHCIIpan) to the HLA alleles determined by HLA-LA.
+
+===================
+Column descriptions
+===================
+
+* **id**: Peptide ID assigned by microphaser.
+* **pep_seq**: Sequence of the full peptide that was given to netMHC(II)pan. Amino acids that
+ are different between the normal and the tumor sample are highlighted in lower case.
+* **pos_in_id_seq**: Position in pep_seq of the peptide that was used for prediction.
+  Note that netMHCpan positions appear to be 0-based, while netMHCIIpan positions appear to be 1-based.
+* **alias**: Indicator of the type of sample (normal vs. some tumor sample).
+* **num_binders**: Total number of peptide-HLA allele pairs from pep_seq that are considered binders,
+ either weak or strong. Cutoffs are applied to el_rank and are:
+ * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder)
+ * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder)
+* **freq**: Allelic frequency of the peptide as predicted by microphaser. For a
+ credible allele frequency interval, see column freq_credible_interval.
+* **depth**: Read depth at the peptide position.
+* **num_var_sites**: Number of variant sites on the peptide's haplotype in that sample (alias).
+* **num_var_in_pep**: Number of variant sites within the peptide sequence.
+* **top_el_rank_allele**: HLA allele with the best eluted ligand prediction score percentile rank.
+* **top_el_rank_bind_core**: Binding core of the peptide for the HLA allele with the best
+  eluted ligand score percentile rank.
+* **top_el_rank_el_rank**: Percentile rank of the eluted ligand score, i.e. the rank of the predicted binding
+ score when compared to a set of random natural peptides. This measure is not
+ affected by inherent bias of certain molecules towards higher or lower mean
+ predicted affinities. It is the recommended value for determining likely
+ binders / neoantigens of interest. Cutoffs recommended by netMHC(II)pan authors
+  are:
+
+  * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder)
+  * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder)
+
+* **top_el_rank_el_score**: The raw eluted ligand prediction score.
+* **ave_el_score**: Average across the eluted ligand prediction scores of all alleles for this
+ peptide in the particular sample (alias).
+* **top_ba_rank_allele**: HLA allele with the best binding affinity prediction score percentile rank.
+* **top_ba_rank_bind_core**: Binding core of the peptide for the HLA allele with the best
+ binding affinity score percentile rank.
+* **top_ba_rank_ba_rank**: Percentile rank of the predicted binding affinity compared to a set of 100,000
+ random natural peptides. This measure is not affected by inherent bias of certain
+ molecules towards higher or lower mean predicted affinities.
+* **top_ba_rank_ba_score**: Predicted binding affinity in log-scale.
+* **aa_changes**: List of amino acid changes. For normal samples, only germline changes are listed. For
+ tumor samples, only somatic tumor changes are listed, even though the germline
+ changes also affect the tumor peptide.
+* **genomic_pos**: Genomic position of the nucleotide change.
+* **nt_seq**: Nucleotide sequence underlying the peptide. Nucleotide changes underlying the
+ amino acid changes are highlighted as lower case letters.
+* **gene_name**: Common gene name / gene symbol of the peptide's gene of origin.
+* **gene_id**: Ensembl gene id of the peptide's gene of origin.
+* **transcript**: List of Ensembl transcript ids in which the peptide occurs.
+* **chrom**: Chromosome on which the peptide's gene of origin is located.
+* **offset**: Chromosomal position of the peptide's gene of origin.
+* **frame**: Open reading frame that the peptide originates from. 0 indicates the regular
+ reading frame, non-zero values indicate frame shifts.
+* **strand**: Strand of the gene / transcript.
+* **freq_credible_interval**: Credible interval for freq, the allelic frequency of the peptide as predicted
+ by microphaser.
diff --git a/workflow/report/RNA_results.rst b/workflow/report/neoantigens.rna.rst
similarity index 100%
rename from workflow/report/RNA_results.rst
rename to workflow/report/neoantigens.rna.rst
diff --git a/workflow/report/neopeptides.rst b/workflow/report/neopeptides.rst
new file mode 100644
index 00000000..1e6d6625
--- /dev/null
+++ b/workflow/report/neopeptides.rst
@@ -0,0 +1,9 @@
+Neopeptides and corresponding normal peptides as phased and determined by
+microphaser, with various annotation scores gathered and provided by NeoFox,
+using the HLA alleles determined by HLA-LA.
+
+===================
+Column descriptions
+===================
+
+TODO: transform spreadsheet into RST table
diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
new file mode 100644
index 00000000..270c77a7
--- /dev/null
+++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml
@@ -0,0 +1,344 @@
+__definitions__:
+ - import pandas as pd
+ - from itertools import chain
+ - from copy import deepcopy
+__variables__:
+ important_cols: ?set(params.neofox_important_cols["general"]) | set(params.neofox_important_cols[wildcards.mhc])
+ cols: ?set(pd.read_csv(input.neopeptides, sep="\t").columns.values)
+ coldefs:
+ gene:
+ link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value}
+ mutation_mutatedXmer:
+ custom: |
+ function(value, row) {
+ return value.split("").map(function(a) {
+ if (a === a.toLowerCase()) {
+ return `${a}`
+ } else {
+ return a
+ }
+ }).join("");
+ }
+ purity_adjusted_DNA_VAF:
+ plot:
+ ticks:
+ scale: linear
+ imputedGeneExpression:
+ plot:
+ ticks:
+ scale: linear
+ PRIME_best_rank:
+ plot:
+ heatmap:
+ scale: symlog
+ domain:
+ - 0.0
+ - 100.0
+ range:
+ - "#EC0000"
+ - white
+ PRIME_best_score:
+ plot:
+ ticks:
+ scale: linear
+ Best_rank_MHCI_9mer_score:
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_rank_MHCI_9mer_score_WT
+ Best_rank_MHCI_9mer_score_WT:
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_rank_MHCI_9mer_score
+ MixMHC2pred_best_rank:
+ plot:
+ heatmap:
+ scale: linear
+ domain:
+ - 0.0
+ - 100.0
+ range:
+ - "#EC0000"
+ - "#ffffff"
+ Best_rank_MHCII_score:
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_rank_MHCII_score_WT
+ Best_rank_MHCII_score_WT:
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_rank_MHCII_score
+ Recognition_Potential_MHCI_9mer:
+ plot:
+ ticks:
+ scale: linear
+ Improved_Binder_MHCI:
+ plot:
+ heatmap:
+ scale: ordinal
+ domain:
+ - 0
+ - 1
+ range:
+ - "#ffffff"
+ - "#2CA02C"
+ Selfsimilarity_MHCI_conserved_binder:
+ plot:
+ ticks:
+ scale: linear
+ mutation_position:
+ display-mode: detail
+ dnaVariantAlleleFrequency:
+ display-mode: detail
+ rnaVariantAlleleFrequency:
+ display-mode: hidden
+ rnaExpression:
+ display-mode: hidden
+ ADN_MHCI:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ ADN_MHCII:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ Amplitude_MHCII_rank:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: linear
+ domain:
+ - 0.0
+ - 100.0
+ range:
+ - "#EC0000"
+ - "#ffffff"
+ Amplitude_MHCI_affinity:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ Amplitude_MHCI_affinity_9mer:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ MixMHC2pred_best_allele:
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - MixMHC2pred_best_allele
+ - Best_rank_MHCII_score_allele
+ - Best_rank_MHCII_score_allele_WT
+ - Best_affinity_MHCII_allele
+ - Best_affinity_MHCII_allele_WT
+ Best_rank_MHCII_score_allele:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - MixMHC2pred_best_allele
+ - Best_rank_MHCII_score_allele
+ - Best_rank_MHCII_score_allele_WT
+ - Best_affinity_MHCII_allele
+ - Best_affinity_MHCII_allele_WT
+ Best_rank_MHCII_score_allele_WT:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - MixMHC2pred_best_allele
+ - Best_rank_MHCII_score_allele
+ - Best_rank_MHCII_score_allele_WT
+ - Best_rank_MHCII_allele_WT
+ - Best_affinity_MHCII_allele
+ - Best_affinity_MHCII_allele_WT
+ Best_affinity_MHCII_allele:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - MixMHC2pred_best_allele
+ - Best_rank_MHCII_score_allele
+ - Best_rank_MHCII_score_allele_WT
+ - Best_rank_MHCII_allele_WT
+ - Best_affinity_MHCII_allele
+ - Best_affinity_MHCII_allele_WT
+ Best_affinity_MHCII_allele_WT:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - MixMHC2pred_best_allele
+ - Best_rank_MHCII_score_allele
+ - Best_rank_MHCII_score_allele_WT
+ - Best_rank_MHCII_allele_WT
+ - Best_affinity_MHCII_allele
+ - Best_affinity_MHCII_allele_WT
+ Best_affinity_MHCII_score:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_affinity_MHCII_score_WT
+ Best_affinity_MHCII_score_WT:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_affinity_MHCII_score
+ PRIME_best_allele:
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - PRIME_best_allele
+ - Best_rank_MHCI_9mer_allele
+ - Best_rank_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_9mer_allele
+ - Best_affinity_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_allele
+ - Best_affinity_MHCI_allele_WT
+ Best_rank_MHCI_9mer_allele:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - PRIME_best_allele
+ - Best_rank_MHCI_9mer_allele
+ - Best_rank_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_9mer_allele
+ - Best_affinity_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_allele
+ - Best_affinity_MHCI_allele_WT
+ Best_rank_MHCI_9mer_allele_WT:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - PRIME_best_allele
+ - Best_rank_MHCI_9mer_allele
+ - Best_rank_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_9mer_allele
+ - Best_affinity_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_allele
+ - Best_affinity_MHCI_allele_WT
+ Best_affinity_MHCI_9mer_score:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_affinity_MHCI_9mer_score_WT
+ Best_affinity_MHCI_9mer_score_WT:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_affinity_MHCI_9mer_score
+ Best_affinity_MHCI_allele:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - PRIME_best_allele
+ - Best_affinity_MHCII_allele
+ - Best_affinity_MHCII_allele_WT
+ - Best_affinity_MHCI_9mer_allele
+ - Best_affinity_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_allele
+ - Best_affinity_MHCI_allele_WT
+ Best_affinity_MHCI_allele_WT:
+ display-mode: detail
+ plot:
+ heatmap:
+ scale: ordinal
+ color-scheme: category20
+ aux-domain-columns:
+ - PRIME_best_allele
+ - Best_affinity_MHCII_allele
+ - Best_affinity_MHCII_allele_WT
+ - Best_affinity_MHCI_9mer_allele
+ - Best_affinity_MHCI_9mer_allele_WT
+ - Best_affinity_MHCI_allele
+ - Best_affinity_MHCI_allele_WT
+ Best_affinity_MHCI_score:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_affinity_MHCI_score_WT
+ Best_affinity_MHCI_score_WT:
+ display-mode: detail
+ plot:
+ ticks:
+ scale: linear
+ aux-domain-columns:
+ - Best_affinity_MHCI_score
+
+name: ?f"Neopeptide candidates for {wildcards.group}, tumor sample {wildcards.tumor_alias}"
+
+default-view: "overview"
+
+datasets:
+ neoprint:
+ path: ?input.neopeptides
+ separator: "\t"
+
+views:
+ overview:
+ desc: |
+ Neopeptide candidates with annotations gathered and provided by NeoFox.
+
+ ### Column descriptions
+      * **Recognition_Potential_MHCI_9mer:** The recognition potential of a neoantigen is the likelihood that it is effectively recognized by the TCR repertoire (definition: Amplitude_MHCI_affinity_9mer x Pathogensimiliarity_MHCI_affinity_9mer) \[[run-up to equation (5) in Luksza et al. 2017](https://doi.org/10.1038/nature24473)\].
+      * **Selfsimilarity_MHCI_conserved_binder:** Score for k-mer based similarity between Best_rank_MHCI_score_epitope and Best_affinity_MHCI_epitope_WT. Computed for conserved binders only (i.e. NOT improved binders), for which this score is considered indicative of immunogenicity \[[page 3 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\].
+ * **Improved_Binder_MHCI:** Cutoff of 1.2 on the ratio between normal and mutated peptide rank scores to designate a peptide as an improved binder (as opposed to a conserved binder) (definition: (Best_rank_MHCI_score_WT / Best_rank_MHCI_score ) > 1.2) \[[page 4 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\]
+
+ dataset: neoprint
+
+ render-table:
+ columns:
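+        # YTE templating: every column with a coldef above is emitted (demoted
+        # to the detail view unless listed as important for this MHC class),
+        # and all remaining input columns default to the detail view as well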
+ ?for col, coldef in coldefs.items():
+ __definitions__:
+ - |
+ coldef_ = deepcopy(coldef)
+ if col not in important_cols:
+ coldef_["display-mode"] = "detail"
+ ?col: ?coldef_
+ ?for col in cols:
+ ?if col not in coldefs and col not in important_cols:
+ ?col:
+ display-mode: detail
\ No newline at end of file
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk
deleted file mode 100644
index 682808ef..00000000
--- a/workflow/rules/HLAtyping.smk
+++ /dev/null
@@ -1,98 +0,0 @@
-rule HLA_LA:
- input:
- bam="results/recal/{sample}.sorted.bam",
- bai="results/recal/{sample}.sorted.bam.bai",
- index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
- output:
- "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt",
- threads: 7
- log:
- "logs/HLA-LA/{sample}.log",
- params:
- graph=lambda w, input: os.path.basename(os.path.dirname(input.index)),
- graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)),
- conda:
- "../envs/hla_la.yaml"
- shell:
- "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.sample} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"
-
-
-rule parse_HLA_LA:
- input:
- "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt",
- output:
- report(
- "results/HLA-LA/hlaI_{sample}.tsv",
- caption="../report/HLA_Types.rst",
- category="HLA-Typing(HLA-LA)",
- ),
- report(
- "results/HLA-LA/hlaII_{sample}.tsv",
- caption="../report/HLA_Types.rst",
- category="HLA-Typing(HLA-LA)",
- ),
- log:
- "logs/parse-HLA-LA/{sample}.log",
- script:
- "../scripts/parse_HLA_types.py"
-
-
-rule razers3:
- input:
- reads="results/merged/DNA/{sample}_{fq}.fastq.gz",
- output:
- bam="results/razers3/bam/{sample}_{fq}.bam",
- threads: 8
- log:
- "logs/razers3/{sample}_{fq}.log",
- params:
- genome=config["HLAtyping"]["optitype_data"],
- extra=config["params"]["razers3"],
- wrapper:
- "0.61.0/bio/razers3"
-
-
-rule bam2fq:
- input:
- "results/razers3/bam/{sample}_{fq}.bam",
- output:
- "results/razers3/fastq/{sample}_{fq}.fished.fastq",
- params:
- "",
- log:
- "logs/razers3-bam2fq/{sample}-{fq}.log",
- threads: 1
- wrapper:
- "0.61.0/bio/samtools/bam2fq/interleaved"
-
-
-rule OptiType:
- input:
- reads=get_optitype_reads_input,
- output:
- multiext(
- "results/optitype/{sample}/{sample}", "_coverage_plot.pdf", "_result.tsv"
- ),
- log:
- "logs/optitype/{sample}.log",
- params:
- extra=config["params"]["optitype"],
- sequencing_type="dna",
- wrapper:
- "0.63.0/bio/optitype"
-
-
-rule parse_Optitype:
- input:
- "results/optitype/{sample}/{sample}_result.tsv",
- output:
- report(
- "results/optitype/{sample}/hla_alleles_{sample}.tsv",
- caption="../report/HLA_Types.rst",
- category="HLA-Typing(Optitype)",
- ),
- log:
- "logs/parse-optitype/{sample}.log",
- shell:
- "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' "
- '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}'
diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk
deleted file mode 100644
index 2d95e343..00000000
--- a/workflow/rules/MHC_binding.smk
+++ /dev/null
@@ -1,127 +0,0 @@
-# rule mhcflurry:
-# input:
-# peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{group}.fa",
-# alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv",
-# wt_alleles=get_germline_optitype
-# output:
-# "results/mhcflurry/{sample}/{chr}/output.{group}.csv"
-# log:
-# "logs/mhcflurry/{sample}-{chr}-{group}.log"
-# run:
-# if "wt" in input.peptides:
-# alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0])
-# else:
-# alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0])
-# cmd = "if [ -s {input.peptides} ]; then mhctools --mhc-predictor mhcflurry --mhc-alleles {alleles} --input-fasta-file {input.peptides} --output-csv {output} > {log}; else touch {output}; fi"
-# shell(cmd)
-
-
-rule netMHCpan:
- input:
- peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{group}.fa",
- alleles=get_alleles_MHCI,
- output:
- "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{group}.xls",
- log:
- "logs/netMHCpan/{sample}-{chr}-{group}.log",
- params:
- extra=config["affinity"]["netMHCpan"]["params"],
- netMHC=config["affinity"]["netMHCpan"]["location"],
- run:
- alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0])
- cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
- shell(cmd)
-
-
-rule netMHCIIpan:
- input:
- peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{group}.fa",
- alleles=get_alleles_MHCII,
- output:
- "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{group}.xls",
- log:
- "logs/netMHCIIpan/{sample}-{chr}-{group}.log",
- params:
- extra=config["affinity"]["netMHCIIpan"]["params"],
- netMHC=config["affinity"]["netMHCIIpan"]["location"],
- run:
- alleles = ",".join(pd.read_csv(input.alleles, sep="\t")["Allele"].tolist())
- cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
- shell(cmd)
-
-
-rule parse_mhc_out:
- input:
- expand(
- "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{group}}.xls",
- chr=contigs,
- ),
- output:
- "results/{mhc}/{sample}/{sample}.mhc.{group}.tsv",
- log:
- "logs/parse-mhc/{mhc}-{sample}-{group}.log",
- wildcard_constraints:
- group="wt|mt",
- script:
- "../scripts/group_mhc_output.py"
-
-
-# rule parse_mhcflurry:
-# input:
-# expand("results/mhcflurry/{{sample}}/{chr}/output.{{group}}.csv", chr=contigs)
-# output:
-# "results/mhcflurry/{sample}/{sample}.mhc.{group}.csv"
-# wildcard_constraints:
-# group="wt|mt"
-# log:
-# "logs/parse-mhc/mhcflurry-{sample}-{group}.log"
-# conda:
-# "../envs/xsv.yaml"
-# shell:
-# "xsv cat rows -d ',' {input} | cut --complement -f2,7,8 > {output}"
-
-
-rule mhc_csv_table:
- input:
- info="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv",
- mt="results/{mhc}/{sample}/{sample}.mhc.mt.tsv",
- wt="results/{mhc}/{sample}/{sample}.mhc.wt.tsv",
- output:
- report(
- "results/neoantigens/{mhc}/{sample}.DNA.tsv",
- caption="../report/WES_results.rst",
- category="Results WES (netMHC)",
- ),
- log:
- "logs/create-mhc-table/{mhc}-{sample}.log",
- script:
- "../scripts/merge_data.py"
-
-
-# rule mhcflurry_table:
-# input:
-# info="results/microphaser/info/{sample}/filtered/mhcflurry/{sample}.tsv",
-# mt="results/mhcflurry/{sample}/{sample}.mhc.mt.tsv",
-# wt="results/mhcflurry/{sample}/{sample}.mhc.wt.tsv"
-# output:
-# report("results/neoantigens/mhcflurry/{sample}.WES.tsv", caption="../report/WES_results.rst", category="Results WES (MHCFlurry)")
-# script:
-# "../scripts/merge_mhcflurry.py"
-
-
-rule add_RNA_info:
- input:
- counts="results/kallisto/{sample}",
- table="results/neoantigens/{mhc}/{sample}.DNA.tsv",
- output:
- report(
- "results/neoantigens/{mhc}/{sample}.RNA.tsv",
- caption="../report/RNA_results.rst",
- category="Results RNA",
- ),
- params:
- abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
- log:
- "logs/add-RNA/{mhc}-{sample}.log",
- script:
- "../scripts/add_rna_info.py"
diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk
deleted file mode 100644
index 940d7c02..00000000
--- a/workflow/rules/RNA.smk
+++ /dev/null
@@ -1,71 +0,0 @@
-rule kallisto_quant:
- input:
- fastq=get_quant_reads_input,
- index="resources/kallisto/transcripts.idx",
- output:
- directory("results/kallisto/{sample}"),
- params:
- extra=kallisto_params,
- log:
- "results/logs/kallisto/quant/{sample}.log",
- wrapper:
- "0.60.1/bio/kallisto/quant"
-
-
-rule STAR_align:
- input:
- "resources/STAR_index",
- fq1=lambda wc: units.loc[(wc.sample, "RNA"), "fq1"],
- fq2=lambda wc: units.loc[(wc.sample, "RNA"), "fq2"],
- gtf="resources/genome.gtf",
- output:
- # see STAR manual for additional output files
- "results/star/{sample}/Aligned.out.bam",
- "results/star/{sample}/ReadsPerGene.out.tab",
- log:
- "logs/star/{sample}.log",
- params:
- # path to STAR reference genome index
- index="STAR_index",
- # optional parameters - designed to get chimeric alignments for fusion detection
- extra=lambda wc, input: "--quantMode GeneCounts --sjdbGTFfile {} {}".format(
- input.gtf, config["params"]["star"]
- ),
- threads: 8
- wrapper:
- "0.42.0/bio/star/align"
-
-
-rule arriba:
- input:
- bam="results/star/{sample}/Aligned.out.bam",
- genome="resources/genome.fasta",
- annotation="resources/genome.gtf",
- output:
- fusions="results/arriba/{sample}.fusions.tsv",
- discarded="results/arriba/{sample}.fusions.discarded.tsv",
- params:
- blacklist=config["fusion"]["arriba"]["blacklist"],
- extra=config["fusion"]["arriba"]["params"],
- log:
- "results/logs/arriba/{sample}.log",
- threads: 1
- wrapper:
- "0.60.1/bio/arriba"
-
-
-## TODO: Update
-# rule fusioncatcher:
-# input:
-# fq1=lambda w: units.loc[(w.sample, "RNA"), "fq1"],
-# fq2=lambda w: units.loc[(w.sample, "RNA"), "fq2"]
-# output:
-# directory("fusioncatcher/{sample}")
-# params:
-# extra="-T tmp -d ../../fusioncatcher_data"
-# log:
-# "logs/fusioncatcher/{sample}.log"
-# threads:
-# 8
-# shell:
-# "fusioncatcher -i {input.fq1},{input.fq2} -o {output} {params.extra} -p {threads} > {log}"
diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk
new file mode 100644
index 00000000..70e1a492
--- /dev/null
+++ b/workflow/rules/annotate_neoantigens.smk
@@ -0,0 +1,124 @@
+rule prepare_neo_fox_config_and_resources:
+ output:
+ config="resources/neo_fox/neo_fox_config.txt",
+ # we cannot put the exact files generated into the
+ # output, as snakemake will generate the respective
+ # subdirectories and NeoFox has default exist_ok=False
+ # set for os.makedirs:
+ # https://github.com/TRON-Bioinformatics/neofox/blob/fb6cdf9f10e77c409d0fa44657ef520eedca6994/neofox/references/installer.py#L221
+ references=directory("resources/neo_fox/references/"),
+ log:
+ "logs/neo_fox/neo_fox_config.log",
+ conda:
+ "../envs/neo_fox_deps.yaml"
+ params:
+ hla_alleles=config["params"]["neo_fox"]["hla_alleles"],
+ shell:
+ """
+ # environment variables necessary for neofox-configure
+ # NOTE: we have to provide all binaries with hard-coded
+ # paths, because NeoFox checks that the file at the path
+ # exists (and not simply that a binary exists):
+ # https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L90
+ CONDA_BIN=$CONDA_PREFIX/bin
+
+ ## pre-installed via conda
+ export NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb
+ echo "NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb" > {output.config}
+ export NEOFOX_RSCRIPT=$CONDA_BIN/Rscript
+ echo "NEOFOX_RSCRIPT=$CONDA_BIN/Rscript" >> {output.config}
+
+ ## pre-installed into conda environment via post-deploy script
+ export NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan
+ echo "NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan" >> {output.config}
+ export NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan
+ echo "NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan" >> {output.config}
+
+ ## specification of hla_allele link via config.yaml
+ export NEOFOX_HLA_DATABASE={params.hla_alleles}
+
+ neofox-configure --reference-folder {output.references}
+ echo "NEOFOX_REFERENCE_FOLDER={output.references}" >> {output.config}
+
+ # further environment variables needed for the config file
+
+ ## pre-installed via conda
+ echo "NEOFOX_BLASTP=$CONDA_BIN/blastp" >> {output.config}
+
+ ## pre-installed into conda environment via post-deploy script
+ echo "NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred" >> {output.config}
+ echo "NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix" >> {output.config}
+ echo "NEOFOX_PRIME=$CONDA_BIN/PRIME" >> {output.config}
+ """
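+
+
+# For reference, the neo_fox_config.txt generated above should end up looking
+# roughly like this sketch (the bin/ paths depend on the conda env prefix):
+#   NEOFOX_MAKEBLASTDB=<conda_prefix>/bin/makeblastdb
+#   NEOFOX_RSCRIPT=<conda_prefix>/bin/Rscript
+#   NEOFOX_NETMHCPAN=<conda_prefix>/bin/netMHCpan
+#   NEOFOX_NETMHC2PAN=<conda_prefix>/bin/netMHCIIpan
+#   NEOFOX_REFERENCE_FOLDER=resources/neo_fox/references/
+#   NEOFOX_BLASTP=<conda_prefix>/bin/blastp
+#   NEOFOX_MIXMHCPRED=<conda_prefix>/bin/MixMHCpred
+#   NEOFOX_MIXMHC2PRED=<conda_prefix>/bin/MixMHC2pred_unix
+#   NEOFOX_PRIME=<conda_prefix>/bin/PRIME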
+
+
+rule adjust_microphaser_output_for_neo_fox:
+ input:
+ microphaser="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+ output:
+ neo_fox="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+ log:
+ "logs/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log",
+ threads: 1
+ conda:
+ "../envs/pandas.yaml"
+ script:
+ "../scripts/adjust_microphaser_output_for_neo_fox.py"
+
+
+rule create_neo_fox_group_sheet:
+ input:
+ hla_la_bestguess="results/hla_la/output/{group}_{tumor_alias}/hla/R1_bestguess_G.txt",
+ output:
+ group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
+ log:
+ "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log",
+ conda:
+ "../envs/pandas.yaml"
+ params:
+ group=lambda wc: group_annotation.loc[wc.group],
+ script:
+ "../scripts/create_neo_fox_group_sheet.py"
+
+
+rule neo_fox:
+ input:
+ config="resources/neo_fox/neo_fox_config.txt",
+ candidates=expand(
+ "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
+ peptide_length=config["params"]["neo_fox"]["peptide_len"],
+ ),
+ group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv",
+ output:
+ tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
+ json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json",
+ log:
+ "logs/neo_fox/annotated/{group}.{tumor_alias}.log",
+ threads: 8
+ conda:
+ "../envs/neo_fox_deps.yaml"
+ params:
+ folder=lambda wc, output: path.dirname(output.tsv),
+ prefix=lambda wc, output: path.splitext(path.basename(output.tsv))[0],
+ organism="human"
+ if config["ref"]["species"] == "homo_sapiens"
+ else "mouse"
+ if config["ref"]["species"] == "mus_musculus"
+ else "unsupported",
+ shell:
+ "(neofox "
+ " --num-cpus {threads} "
+ " --config {input.config} "
+ " --candidate-file {input.candidates} "
+ " --patient-data {input.group_sheet} "
+ " --with-table "
+ " --with-json "
+ " --organism {params.organism} "
+ " --output-folder {params.folder} "
+ " --output-prefix {params.prefix} ; "
+ " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; "
+ " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; "
+ # this implicitly created log does not seem to contain all of stderr,
+        # so we do our own capturing below instead
+ " rm {params.folder}/{params.prefix}.log; "
+ ") 2> {log} "
diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
deleted file mode 100644
index 64088375..00000000
--- a/workflow/rules/annotation.smk
+++ /dev/null
@@ -1,44 +0,0 @@
-rule annotate_variants:
- input:
- calls="results/calls/{group}.{scatteritem}.bcf",
- cache="resources/vep/cache",
- plugins="resources/vep/plugins",
- output:
- calls="results/calls/{group}.{scatteritem}.annotated.bcf",
- stats=report(
- "results/calls/{group}.{scatteritem}.stats.html",
- caption="../report/stats.rst",
- category="QC",
- ),
- params:
- # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
- # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
- plugins=config["annotations"]["vep"]["plugins"],
- extra="{} --vcf_info_field ANN".format(config["annotations"]["vep"]["params"]),
- log:
- "logs/vep/{group}.{scatteritem}.annotate.log",
- wrapper:
- "0.59.2/bio/vep/annotate"
-
-
-rule annotate_strelka_variants:
- input:
- calls="results/strelka/{calls}.bcf",
- cache="resources/vep/cache",
- plugins="resources/vep/plugins",
- output:
- calls="results/strelka/{calls}.annotated.bcf",
- stats=report(
- "results/strelka/{calls}.annotated.stats.html",
- caption="../report/stats.rst",
- category="QC",
- ),
- params:
- # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
- # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
- plugins=[], #config["annotations"]["vep"]["plugins"],
- extra="--vcf_info_field ANN --hgvs --symbol --canonical",
- log:
- "logs/vep/{calls}.strelka.annotate.log",
- wrapper:
- "0.59.2/bio/vep/annotate"
diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk
deleted file mode 100644
index 98ef992f..00000000
--- a/workflow/rules/calling.smk
+++ /dev/null
@@ -1,147 +0,0 @@
-rule strelka_somatic:
- input:
- normal=get_normal_bam,
- normal_index=get_normal_bai,
- tumor="results/recal/{sample}.sorted.bam",
- tumor_index="results/recal/{sample}.sorted.bam.bai",
- fasta="resources/genome.fasta",
- fasta_index="resources/genome.fasta.fai",
- callregions="resources/genome.callregions.bed.gz",
- output:
- "results/strelka/somatic/{sample}/results/variants/somatic.snvs.vcf.gz",
- "results/strelka/somatic/{sample}/results/variants/somatic.indels.vcf.gz",
- log:
- "logs/calling/strelka_somatic/{sample}.log",
- params:
- config_extra="--callRegions {} {}".format(
- "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"]
- ),
- run_extra=config["params"]["strelka"]["run"],
- threads: 22
- wrapper:
- "0.65.0/bio/strelka/somatic"
-
-
-rule strelka_germline:
- input:
- bam="results/recal/{normal}.sorted.bam",
- normal_index="results/recal/{normal}.sorted.bam.bai",
- fasta="resources/genome.fasta",
- fasta_index="resources/genome.fasta.fai",
- callregions="resources/genome.callregions.bed.gz",
- output:
- "results/strelka/germline/{normal}/results/variants/variants.vcf.gz",
- log:
- "logs/calling/strelka_germline/{normal}.log",
- params:
- config_extra="--callRegions {} {}".format(
- "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"]
- ),
- run_extra="",
- threads: 22
- wrapper:
- "0.65.0/bio/strelka/germline"
-
-
-rule vcf_to_bcf:
- input:
- "{variants}.vcf.gz",
- output:
- "{variants}.output.bcf",
- log:
- "logs/bcftools/to-bcf/{variants}.log",
- params:
- "-O b -f PASS",
- wrapper:
- "0.60.0/bio/bcftools/view"
-
-
-rule concat_somatic:
- input:
- calls=expand(
- "results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf",
- type=["snvs", "indels"],
- ),
- indices=expand(
- "results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf.csi",
- type=["snvs", "indels"],
- ),
- output:
- "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf",
- log:
- "bcftools/concat-somatic/{sample}.log",
- params:
- "-O b -a",
- wrapper:
- "0.60.0/bio/bcftools/concat"
-
-
-rule get_tumor_from_somatic:
- input:
- "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf",
- output:
- "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf",
- log:
- "logs/bcftools/view-TUMOR/{sample}.log",
- params:
- "-O b -s TUMOR",
- wrapper:
- "0.60.0/bio/bcftools/view"
-
-
-rule reheader_germline:
- input:
- vcf="{germline}/variants.output.bcf",
- samples="resources/sampleheader.txt",
- output:
- "{germline}/variants.reheader.bcf",
- log:
- "logs/bcftools/reheader/{germline}.log",
- params:
- extra="",
- view_extra="-O b",
- wrapper:
- "0.60.0/bio/bcftools/reheader"
-
-
-rule concat_variants:
- input:
- calls=lambda w: get_pair_variants(w, index=False),
- index=lambda w: get_pair_variants(w, index=True),
- output:
- "results/strelka/merged/{sample}/all_variants.bcf",
- log:
- "bcftools/concat-all/{sample}.log",
- params:
- extra="-O b -a",
- wrapper:
- "0.64.0/bio/bcftools/concat"
-
-
-rule preprocess_variants:
- input:
- variants="{variants}.bcf",
- output:
- "{variants}.prepy.bcf",
- params:
- extra="-L --somatic",
- genome="resources/genome.fasta",
- log:
- "logs/prepy/{variants}.log",
- threads: 2
- wrapper:
- "0.60.0/bio/hap.py/pre.py"
-
-
-rule norm_vcf:
- input:
- "{prefix}.bcf",
- genome="resources/genome.fasta",
- output:
- "{prefix}.norm.bcf",
- log:
- "logs/bcftools/norm/{prefix}.log",
- params:
- "-f {} -O b -m-".format("resources/genome.fasta"), # optional parameters for bcftools norm (except -o)
- wrapper:
- "0.65.0/bio/bcftools/norm"
diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk
deleted file mode 100644
index 74386763..00000000
--- a/workflow/rules/candidate_calling.smk
+++ /dev/null
@@ -1,28 +0,0 @@
-rule freebayes:
- input:
- ref="resources/genome.fasta",
- # you can have a list of samples here
- samples=get_paired_bams,
- output:
- "results/candidate-calls/{pair}.freebayes.bcf",
- log:
- "logs/{pair}.log",
- params:
- extra=config["params"].get("freebayes", ""),
- chunksize=100000,
- threads: 60
- wrapper:
- "0.65.0/bio/freebayes"
-
-
-rule scatter_candidates:
- input:
- "results/candidate-calls/{pair}.{caller}.bcf",
- output:
- scatter.calling("results/candidate-calls/{{pair}}.{{caller}}.{scatteritem}.bcf"),
- log:
- "logs/scatter-candidates/{pair}.{caller}.log",
- conda:
- "../envs/rbt.yaml"
- shell:
- "rbt vcf-split {input} {output}"
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 3bc6c60d..23a4b97e 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -1,6 +1,7 @@
import glob
import pandas as pd
+from os import path
from snakemake.remote import FTP
from snakemake.utils import validate
@@ -10,16 +11,56 @@ ftp = FTP.RemoteProvider()
validate(config, schema="../schemas/config.schema.yaml")
-##### sample sheets #####
-
-samples = pd.read_csv(config["samples"], sep="\t").set_index("sample", drop=False)
+##### samples sheet #####
+
+samples = (
+ pd.read_csv(
+ config["samples"],
+ sep="\t",
+ dtype={
+ "sample_name": str,
+ "group": str,
+ "alias": str,
+ },
+ comment="#",
+ )
+ .set_index("sample_name", drop=False)
+ .sort_index()
+)
validate(samples, schema="../schemas/samples.schema.yaml")
-units = pd.read_csv(config["units"], dtype=str, sep="\t").set_index(
- ["sample", "sequencing_type", "unit"], drop=False
+##### units sheet #####
+
+units = (
+ pd.read_csv(
+ config["units"],
+ sep="\t",
+ dtype={
+ "sample_name": str,
+ "sequencing_type": str,
+ "unit_name": str,
+ "adapters": str,
+ },
+ comment="#",
+ )
+ .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False)
+ .sort_index()
)
validate(units, schema="../schemas/units.schema.yaml")
+groups = samples["group"].unique()
+
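+# group annotations are optional: without a groups sheet, fall back to a table that only contains the group names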
+if "groups" in config:
+ group_annotation = (
+ pd.read_csv(config["groups"], sep="\t", dtype={"group": str})
+ .set_index("group")
+ .sort_index()
+ )
+ group_annotation = group_annotation.loc[groups]
+else:
+ group_annotation = pd.DataFrame({"group": groups}).set_index("group")
+
+
contigs = [c for c in range(1, 23)]
contigs.extend(["X", "Y"])
@@ -27,10 +68,29 @@ contigs.extend(["X", "Y"])
wildcard_constraints:
- pair="|".join(samples[samples.type == "tumor"]["sample"]),
- sample="|".join(samples["sample"]),
+ cancer_sample="|".join(samples[samples.alias != "normal"]["sample_name"]),
+ normal_sample="|".join(samples[samples.alias == "normal"]["sample_name"]),
+ sample="|".join(samples["sample_name"]),
+ unit="|".join(units["unit_name"]),
+ alias="|".join(pd.unique(samples["alias"])),
+ tumor_alias="|".join(
+ pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])
+ ),
+ normal_alias="normal",
+ tumor_set=config["params"]["microphaser"]["events"]["tumor"],
+ normal_set=config["params"]["microphaser"]["events"]["normal"],
+ set="|".join(
+ [
+ config["params"]["microphaser"]["events"]["tumor"],
+ config["params"]["microphaser"]["events"]["normal"],
+ ]
+ ),
+ group="|".join(pd.unique(samples["group"])),
caller="|".join(["freebayes", "delly"]),
- event="somatic|germline|complete",
+ peptide_type="|".join(["normal", "neo"]),
+ event="|".join(["somatic", "germline", "complete"]),
+ read="|".join(["single", "R1", "R2"]),
+ seqtype="|".join(["DNA", "RNA"]),
### Output generation ###
@@ -44,60 +104,57 @@ def is_activated(xpath):
def get_final_output():
- if config["epitope_prediction"]["activate"]:
- final_output = expand(
- "results/neoantigens/{mhc}/{S.sample}.{S.sequencing_type}.xlsx",
- S=units.loc[samples[samples.type == "tumor"]["sample"]]
- .drop_duplicates(["sample", "sequencing_type"])
- .itertuples(),
- mhc=list(
- filter(
- None,
- [
- "netMHCpan" if is_activated("affinity/netMHCpan") else None,
- "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None,
- ],
+ final_output = []
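+ # collect outputs per group: datavzrd neoantigen reports if prediction is activated, otherwise just the HLA typing results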
+ for group in pd.unique(samples["group"]):
+ smps = samples.loc[samples["group"] == group, "sample_name"]
+ tumor_aliases = samples.loc[
+ (samples["group"] == group) & (samples["alias"].str.match("tumor")),
+ "alias",
+ ]
+ if config["neoantigen_prediction"]["activate"]:
+ final_output.extend(
+ expand(
+ "results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}",
+ group=group,
+ tumor_alias=tumor_aliases,
+ mhc=["I", "II"],
)
- ),
- )
- else:
- if config["HLAtyping"]["HLA_LA"]["activate"]:
- final_output = expand(
- [
- "results/optitype/{sample}/hla_alleles_{sample}.tsv",
- "results/HLA-LA/hlaI_{sample}.tsv",
- "results/HLA-LA/hlaII_{sample}.tsv",
- ],
- sample=samples["sample"],
)
+ # sequencing_types = pd.unique(
+ # units.loc[units["sample_name"].isin(smps), "sequencing_type"]
+ # )
+ # final_output.extend(
+ # expand(
+ # "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv",
+ # group=group,
+ # tumor_alias=tumor_aliases,
+ # mhc=list(
+ # filter(
+ # None,
+ # [
+ # "net_mhc_pan"
+ # if is_activated("params/net_mhc_pan")
+ # else None,
+ # "net_mhc_two_pan"
+ # if is_activated("params/net_mhc_two_pan")
+ # else None,
+ # ],
+ # )
+ # ),
+ # seqtype=sequencing_types,
+ # )
+ # )
else:
final_output = expand(
- "results/optitype/{sample}/hla_alleles_{sample}.tsv",
- sample=samples["sample"],
+ [
+ "results/hla_la/{group}.{tumor_alias}.hlaI.tsv",
+ "results/hla_la/{group}.{tumor_alias}.hlaII.tsv",
+ ],
+ group=group,
+ tumor_alias=tumor_aliases,
)
- return final_output
-
-
-def get_fusion_output():
- if config["fusion"]["arriba"]["activate"]:
- fusion_output = expand(
- "results/fusion/arriba/{sample}.fusions.tsv",
- sample=units[units["sequencing_type"] == "RNA"]["sample"],
- )
- else:
- fusion_output = []
- return fusion_output
-
-def get_tmb_targets():
- if is_activated("tmb"):
- return expand(
- "results/plots/tmb/{group}.{mode}.svg",
- group=samples[(samples.type == "tumor")]["sample"],
- mode=config["tmb"].get("mode", "curve"),
- )
- else:
- return []
+ return final_output
caller = list(
@@ -106,47 +163,6 @@ caller = list(
### helper functions ###
-## alignment ##
-
-
-def get_cutadapt_input(wildcards):
- unit = units.loc[wildcards.sample].loc[wildcards.unit].loc[wildcards.seqtype]
-
- if pd.isna(unit["fq1"]):
- # SRA sample (always paired-end for now)
- accession = unit["sra"]
- return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2])
-
- if unit["fq1"].endswith("gz"):
- ending = ".gz"
- else:
- ending = ""
-
- if pd.isna(unit["fq2"]):
- # single end local sample
- return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format(
- S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending
- )
- else:
- # paired end local sample
- return expand(
- "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format(
- S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending
- ),
- read=["fq1", "fq2"],
- )
-
-
-def get_cutadapt_pipe_input(wildcards):
- pattern = (
- units.loc[wildcards.sample]
- .loc[wildcards.seqtype]
- .loc[wildcards.unit, wildcards.fq]
- )
- files = list(sorted(glob.glob(pattern)))
- assert len(files) > 0, "no files found at {}".format(pattern)
- return files
-
def is_paired_end(sample, seqtype):
sample_units = units.loc[sample].loc[seqtype]
@@ -163,249 +179,82 @@ def is_paired_end(sample, seqtype):
return all_paired
-def get_fastqs(wc):
- if config["trimming"]["activate"]:
- return expand(
- "results/trimmed/{sample}/{seqtype}/{unit}_{read}.fastq.gz",
- unit=units.loc[wc.seqtype].loc[wc.sample, "unit_name"],
- sample=wc.sample,
- read=wc.read,
- seqtype=wc.seqtype,
- )
- unit = units.loc[wc.sample].loc[wc.seqtype]
- if all(pd.isna(unit["fq1"])):
- # SRA sample (always paired-end for now)
- accession = unit["sra"]
- return expand(
- "sra/{accession}_{read}.fastq", accession=accession, read=wc.read[-1]
- )
- fq = "fq{}".format(wc.read[-1])
- return units.loc[wc.sample].loc[wc.seqtype, fq].tolist()
-
-
-def get_map_reads_input(wildcards):
- if is_paired_end(wildcards.sample, "DNA"):
- return [
- "results/merged/DNA/{sample}_R1.fastq.gz",
- "results/merged/DNA/{sample}_R2.fastq.gz",
- ]
- return "results/merged/DNA/{sample}_single.fastq.gz"
-
-
-def get_read_group(wildcards):
- """Denote sample name and platform in read group."""
- return r"-R '@RG\tID:{sample}\tSM:{sample}\tPL:{platform}'".format(
- sample=wildcards.sample, platform=samples.loc[wildcards.sample, "platform"]
- )
-
-
-def get_recalibrate_quality_input(wildcards, bai=False):
- ext = ".bai" if bai else ""
- if is_activated("remove_duplicates"):
- return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext)
- else:
- return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext)
-
-
-## HLA Typing ##
-
-
-def get_optitype_reads_input(wildcards):
- if is_activated("HLAtyping/optitype_prefiltering"):
- if is_paired_end(wildcards.sample, "DNA"):
- return expand(
- "results/razers3/fastq/{sample}_{fq}.fished.fastq",
- sample=wildcards.sample,
- fq=["R1", "R2"],
- )
- return "results/razers3/fastq/{sample}_single.fastq"
- else:
- return get_map_reads_input(wildcards)
-
-
-def get_oncoprint_batch(wildcards):
- if wildcards.batch == "all":
- groups = samples[samples["type"] == "tumor"]["sample"].unique()
- else:
- groups = samples.loc[
- samples[config["oncoprint"]["stratify"]["by-column"]] == wildcards.batch,
- "group",
- ].unique()
- return expand(
- "results/merged-calls/{group}.{{event}}.fdr-controlled.bcf", group=groups
- )
-
-
-## variant calls ##
-
-
-def get_annotated_bcf(wildcards):
- selection = ".annotated"
- return "results/calls/{pair}.{scatteritem}{selection}.bcf".format(
- pair=wildcards.pair, selection=selection, scatteritem=wildcards.scatteritem
- )
-
-
-def get_scattered_calls(ext=".bcf"):
- def inner(wildcards):
- return expand(
- "results/calls/{{pair}}.{caller}.{{scatteritem}}.sorted{ext}",
- caller=caller,
- ext=ext,
- )
-
- return inner
-
-
-def get_fdr_control_params(wildcards):
- query = config["calling"]["fdr-control"]["events"][wildcards.event]
- threshold = query.get(
- "threshold", config["calling"]["fdr-control"].get("threshold", 0.05)
- )
- events = query["varlociraptor"]
- return {"threshold": threshold, "events": events}
-
-
-def get_pair_variants(wildcards, index):
- if index:
- ext = ".csi"
- else:
- ext = ""
- variants = [
- "results/strelka/somatic/{}/results/variants/somatic.complete.tumor.bcf{}".format(
- wildcards.sample, ext
- )
- ]
- variants.append(
- "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format(
- get_normal(wildcards), ext
- )
- )
- return variants
-
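+# look up the unique sample_name for a (group, alias) combination; .squeeze() reduces the single-entry result to a scalar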
+def get_sample_from_group_and_alias(group, alias):
+ sample = samples.loc[
+ (samples["group"] == group) & (samples["alias"] == alias), "sample_name"
+ ].squeeze()
+ return sample
-def get_pair_observations(wildcards):
- return expand(
- "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf",
- caller=wildcards.caller,
- pair=wildcards.pair,
- scatteritem=wildcards.scatteritem,
- sample=get_paired_samples(wildcards),
- )
-
-def get_merge_input(ext=".bcf"):
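+# factory that returns an input function, so that rules can request either the BAM itself (default) or an accompanying file such as the index via ext=".bai"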
+def get_bam_from_group_and_alias(ext=".bam"):
def inner(wildcards):
- return expand(
- "results/calls/{{pair}}.{vartype}.{{event}}.fdr-controlled{ext}",
- ext=ext,
- vartype=["SNV", "INS", "DEL", "MNV"],
- filter=config["calling"]["fdr-control"]["events"][wildcards.event],
+ alias = wildcards.get(
+ "alias",
+ wildcards.get("tumor_alias", wildcards.get("normal_alias", "unknown")),
)
+ if alias == "unknown":
+ raise ValueError(
+ "get_bam_from_group_and_alias() requires one of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'."
+ )
+ sample = get_sample_from_group_and_alias(wildcards.group, alias)
+ return f"results/recal/{sample}.sorted{ext}"
return inner
-def get_pair_aliases(wildcards):
- return [
- samples.loc[samples.loc[wildcards.pair, "matched_normal"], "type"],
- samples.loc[wildcards.pair, "type"],
- ]
-
-
-def get_tabix_params(wildcards):
- if wildcards.format == "vcf":
- return "-p vcf"
- if wildcards.format == "txt":
- return "-s 1 -b 2 -e 2"
- raise ValueError("Invalid format for tabix: {}".format(wildcards.format))
-
-
-## RNA ##
-
-
-def get_quant_reads_input(wildcards):
- if is_paired_end(wildcards.sample, "RNA"):
- return [
- "results/merged/RNA/{sample}_R1.fastq.gz",
- "results/merged/RNA/{sample}_R2.fastq.gz",
- ]
- return "results/merged/RNA/{sample}_single.fastq.gz"
-
-
-def kallisto_params(wildcards, input):
- extra = config["params"]["kallisto"]
- if len(input.fastq) == 1:
- extra += " --single"
- extra += (
- " --fragment-length {unit.fragment_len_mean} " "--sd {unit.fragment_len_sd}"
- ).format(unit=units.loc[(wildcards.sample, wildcards.unit)])
- else:
- extra += " --fusion"
- return extra
-
-
-## helper functions ##
-
-
-def get_paired_samples(wildcards):
- return [
- samples.loc[(wildcards.pair), "matched_normal"],
- samples.loc[wildcards.pair, "sample"],
- ]
-
-
-def get_paired_bams(wildcards):
- return expand(
- "results/recal/{sample}.sorted.bam", sample=get_paired_samples(wildcards)
- )
-
-
-def get_paired_bais(wildcards):
+def get_alleles_MHCI(wildcards):
+ alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
return expand(
- "results/recal/{sample}.sorted.bam.bai", sample=get_paired_samples(wildcards)
+ "results/hla_la/{group}.{alias}.hlaI.tsv",
+ group=wildcards.group,
+ alias=alias,
)
-def get_normal(wildcards):
- return samples.loc[(wildcards.sample), "matched_normal"]
-
-
-def get_reads(wildcards):
- return get_seperate(wildcards.sample, wildcards.group)
-
-
-def get_seperate(sample, group):
- return units.loc[(sample, "DNA"), "fq{}".format(str(group))]
-
-
-def get_proteome(wildcards):
+def get_alleles_MHCII(wildcards):
+ alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
return expand(
- "results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin",
- normal=get_normal(wildcards),
- mhc=wildcards.mhc,
+ # TODO: check that hlaII is correct here, and not hlaI (which it previously was)
+ "results/hla_la/{group}.{alias}.hlaII.tsv",
+ group=wildcards.group,
+ alias=alias,
)
-def get_alleles_MHCI(wildcards):
- if wildcards.group == "wt":
- return "results/optitype/{S}/hla_alleles_{S}.tsv".format(
- S=get_normal(wildcards)
- )
- else:
- return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.sample)
-
-
-def get_alleles_MHCII(wildcards):
- if wildcards.group == "wt":
- return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards))
- else:
- return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.sample)
-
-
-def get_normal_bam(wildcards):
- return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards))
-
-
-def get_normal_bai(wildcards):
- return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards))
+##### Other stuff ####
+
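+# NeoFox output columns to display in the neoantigen reports, split into general columns and MHC class-specific ones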
+neofox_important_cols = {
+ "general": [
+ "gene",
+ "mutation_mutatedXmer",
+ "mutation_wildTypeXmer",
+ "purity_adjusted_DNA_VAF",
+ "imputedGeneExpression",
+ ],
+ "I": [
+ "PRIME_best_rank",
+ "PRIME_best_score",
+ "PRIME_best_peptide",
+ "PRIME_best_allele",
+ "Recognition_Potential_MHCI_9mer",
+ "Improved_Binder_MHCI",
+ "Selfsimilarity_MHCI_conserved_binder",
+ "Best_rank_MHCI_9mer_score",
+ "Best_rank_MHCI_9mer_score_WT",
+ "Best_rank_MHCI_9mer_epitope",
+ "Best_rank_MHCI_9mer_epitope_WT",
+ "Best_rank_MHCI_9mer_allele",
+ "Best_rank_MHCI_9mer_allele_WT",
+ ],
+ "II": [
+ "MixMHC2pred_best_rank",
+ "MixMHC2pred_best_peptide",
+ "MixMHC2pred_best_allele",
+ "Best_rank_MHCII_score",
+ "Best_rank_MHCII_score_WT",
+ "Best_rank_MHCII_score_epitope",
+ "Best_rank_MHCII_score_epitope_WT",
+ "Best_rank_MHCII_score_allele",
+ "Best_rank_MHCII_score_allele_WT",
+ ],
+}
\ No newline at end of file
diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
new file mode 100644
index 00000000..d09c9cd7
--- /dev/null
+++ b/workflow/rules/datavzrd.smk
@@ -0,0 +1,51 @@
+rule prepare_neoprint:
+ input:
+ neopeptides="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv",
+ output:
+ mhc_one="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.I.sorted.tsv",
+ mhc_two="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.II.sorted.tsv",
+ log:
+ "logs/prepare_neoprint/{group}.{tumor_alias}.log",
+ params:
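+ # tumor purity of the requested group and tumor alias, taken from the samples sheet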
+ purity=lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze(),
+ neofox_important_cols=neofox_important_cols,
+ conda:
+ "../envs/pandas.yaml"
+ script:
+ "../scripts/prepare_neoprint.py"
+
+
+rule render_datavzrd_neoprint_config:
+ input:
+ template=workflow.source_path(
+ "../resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml"
+ ),
+ neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv",
+ output:
+ "resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml",
+ params:
+ neofox_important_cols=neofox_important_cols,
+ log:
+ "logs/datavzrd_render_neoprint/{group}.{tumor_alias}.{mhc}.log",
+ template_engine:
+ "yte"
+
+
+rule datavzrd_neoprint:
+ input:
+ neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv",
+ config="resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml",
+ output:
+ report(
+ directory("results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}"),
+ htmlindex="index.html",
+ caption="../report/neopeptides.rst",
+ category="Neopeptides",
+ labels=lambda wc: {"group": wc.group, "sample_type": wc.tumor_alias, "MHC": wc.mhc},
+ ),
+ conda:
+ "../envs/datavzrd.yaml"
+ log:
+ "logs/datavzrd_neoprint/{group}.{tumor_alias}.{mhc}.log",
+ shell:
+ "datavzrd {input.config} --output {output} &> {log}"
diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk
deleted file mode 100644
index f32c203d..00000000
--- a/workflow/rules/filtering.smk
+++ /dev/null
@@ -1,106 +0,0 @@
-rule filter_by_annotation:
- input:
- "{prefix}.bcf",
- output:
- "{prefix}.{filter}.filtered_ann.bcf",
- log:
- "logs/filter-calls/annotation/{prefix}.{filter}.log",
- params:
- filter=lambda w: config["calling"]["filter"][w.filter],
- conda:
- "../envs/vembrane.yaml"
- shell:
- 'vembrane filter --output-fmt bcf --output {output} "{params.filter}" {input} &> {log}'
-
-
-rule filter_odds:
- input:
- get_annotated_bcf,
- output:
- "results/calls/{pair}.{event}.{scatteritem}.filtered_odds.bcf",
- params:
- events=lambda wc: config["calling"]["fdr-control"]["events"][wc.event][
- "varlociraptor"
- ],
- log:
- "logs/filter-calls/posterior_odds/{pair}.{scatteritem}.{event}.log",
- conda:
- "../envs/varlociraptor.yaml"
- shell:
- "varlociraptor filter-calls posterior-odds --events {params.events} --odds barely < {input} > {output} 2> {log}"
-
-
-rule gather_calls:
- input:
- calls=gather.calling(
- "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf"
- ),
- idx=gather.calling(
- "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi"
- ),
- output:
- "results/calls/{pair}.{event}.filtered_odds.bcf",
- log:
- "logs/gather-calls/{pair}.{event}.log",
- params:
- "-a -Ob",
- wrapper:
- "0.67.0/bio/bcftools/concat"
-
-
-rule control_fdr:
- input:
- "results/calls/{pair}.{event}.filtered_odds.bcf",
- output:
- "results/calls/{pair}.{vartype}.{event}.fdr-controlled.bcf",
- log:
- "logs/control-fdr/{pair}.{vartype}.{event}.log",
- params:
- query=get_fdr_control_params,
- conda:
- "../envs/varlociraptor.yaml"
- shell:
- "varlociraptor filter-calls control-fdr {input} --var {wildcards.vartype} "
- "--events {params.query[events]} --fdr {params.query[threshold]} > {output} 2> {log}"
-
-
-rule merge_calls:
- input:
- calls=get_merge_input(".bcf"),
- idx=get_merge_input(".bcf.csi"),
- output:
- "results/merged-calls/{pair}.{event}.fdr-controlled.bcf",
- log:
- "logs/merge-calls/{pair}.{event}.log",
- params:
- "-a -Ob",
- wrapper:
- "0.59.2/bio/bcftools/concat"
-
-
-rule change_samplenames:
- input:
- call="results/merged-calls/{pair}.{event}.fdr-controlled.bcf",
- output:
- temp("results/merged-calls/{pair}.{event}.renaming.txt"),
- log:
- "logs/change-samplenames/{pair}.{event}.log",
- params:
- prefix=lambda w, input: os.path.basename(input["call"]).split(".")[0],
- shell:
- "echo -e 'normal {params.prefix}_N\ntumor {params.prefix}_T' > {output}"
-
-
-rule reheader_varlociraptor:
- input:
- vcf="results/merged-calls/{pair}.{event}.fdr-controlled.bcf",
- samples="results/merged-calls/{pair}.{event}.renaming.txt",
- output:
- "results/merged-calls/{pair}.{event}.reheader.bcf",
- log:
- "logs/reheader-calls/{pair}.{event}.log",
- params:
- extra="",
- view_extra="-O b",
- wrapper:
- "0.60.0/bio/bcftools/reheader"
diff --git a/workflow/rules/hla_typing.smk b/workflow/rules/hla_typing.smk
new file mode 100644
index 00000000..98034abf
--- /dev/null
+++ b/workflow/rules/hla_typing.smk
@@ -0,0 +1,69 @@
+rule hla_la:
+ input:
+ bam=get_bam_from_group_and_alias(),
+ bai=get_bam_from_group_and_alias(ext=".bai"),
+ index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
+ ext_idx="resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac",
+ output:
+ "results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt",
+ threads: 7
+ log:
+ "logs/hla_la/{group}_{alias}.log",
+ params:
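+ # derive the graph name, the custom graph directory and the HLA-LA working directory from the index and output paths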
+ graph=lambda w, input: os.path.basename(os.path.dirname(input.index)),
+ graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)),
+ workdir=lambda w, output: os.path.dirname(
+ os.path.dirname(os.path.dirname(output[0]))
+ ),
+ conda:
+ "../envs/hla_la.yaml"
+ shell:
+ "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1"
+
+
+rule net_mhc_pan_alleles:
+ output:
+ mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt",
+ conda:
+ "../envs/tcsh.yaml"
+ log:
+ "logs/net_mhc_pan/available_alleles.net_mhc_pan.log",
+ params:
+ net_mhc=config["params"]["net_mhc_pan"]["location"],
+ shell:
+ "{params.net_mhc}/netMHCpan -listMHC > {output.mhc_one_alleles} 2> {log}"
+
+
+rule net_mhc_two_pan_alleles:
+ output:
+ mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt",
+ conda:
+ "../envs/tcsh.yaml"
+ log:
+ "logs/net_mhc_pan/available_alleles.net_mhc_two_pan.log",
+ params:
+ net_mhc=config["params"]["net_mhc_two_pan"]["location"],
+ shell:
+ "{params.net_mhc}/netMHCIIpan -list > {output.mhc_two_alleles} 2> {log}"
+
+
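+# reduce the HLA-LA best-guess calls to alleles that the netMHC tools actually support, split into MHC class I and class II lists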
+rule parse_and_filter_hla_alleles_for_netmhc:
+ input:
+ hla_la_bestguess="results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt",
+ mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt",
+ mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt",
+ output:
+ hlaI=report(
+ "results/hla_la/{group}.{alias}.hlaI.tsv",
+ caption="../report/hla_alleles.rst",
+ category="HLA alleles",
+ ),
+ hlaII=report(
+ "results/hla_la/{group}.{alias}.hlaII.tsv",
+ caption="../report/hla_alleles.rst",
+ category="HLA alleles",
+ ),
+ log:
+ "logs/parse_hla_la/{group}.{alias}.log",
+ script:
+ "../scripts/parse_and_filter_hla_alleles_for_netmhc.py"
diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
deleted file mode 100644
index 8cfcb51c..00000000
--- a/workflow/rules/mapping.smk
+++ /dev/null
@@ -1,70 +0,0 @@
-rule map_reads:
- input:
- reads=get_map_reads_input,
- idx=rules.bwa_index.output,
- output:
- temp("results/mapped/{sample}.sorted.bam"),
- log:
- "logs/bwa_mem/{sample}.log",
- params:
- index=lambda w, input: os.path.splitext(input.idx[0])[0],
- extra=get_read_group,
- sort="samtools",
- sort_order="coordinate",
- threads: 8
- wrapper:
- "0.56.0/bio/bwa/mem"
-
-
-rule mark_duplicates:
- input:
- "results/mapped/{sample}.sorted.bam",
- output:
- bam=temp("results/dedup/{sample}.sorted.bam"),
- metrics="results/qc/dedup/{sample}.metrics.txt",
- log:
- "logs/picard/dedup/{sample}.log",
- params:
- config["params"]["picard"]["MarkDuplicates"],
- wrapper:
- "0.39.0/bio/picard/markduplicates"
-
-
-rule recalibrate_base_qualities:
- input:
- bam=get_recalibrate_quality_input,
- bai=lambda w: get_recalibrate_quality_input(w, bai=True),
- ref="resources/genome.fasta",
- ref_dict="resources/genome.dict",
- ref_fai="resources/genome.fasta.fai",
- known="resources/variation.noiupac.vcf.gz",
- tbi="resources/variation.noiupac.vcf.gz.tbi",
- output:
- recal_table=temp("results/recal/{sample}.grp"),
- params:
- extra=config["params"]["gatk"]["BaseRecalibrator"],
- java_opts="",
- log:
- "logs/gatk/baserecalibrator/{sample}.log",
- threads: 8
- wrapper:
- "0.62.0/bio/gatk/baserecalibratorspark"
-
-
-rule apply_bqsr:
- input:
- bam=get_recalibrate_quality_input,
- bai=lambda w: get_recalibrate_quality_input(w, bai=True),
- ref="resources/genome.fasta",
- ref_dict="resources/genome.dict",
- ref_fai="resources/genome.fasta.fai",
- recal_table="results/recal/{sample}.grp",
- output:
- bam=protected("results/recal/{sample}.sorted.bam"),
- log:
- "logs/gatk/gatk_applybqsr/{sample}.log",
- params:
- extra=config["params"]["gatk"]["applyBQSR"], # optional
- java_opts="", # optional
- wrapper:
- "0.62.0/bio/gatk/applybqsr"
diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk
new file mode 100644
index 00000000..535015fc
--- /dev/null
+++ b/workflow/rules/mhc_binding.smk
@@ -0,0 +1,109 @@
+rule net_mhc_pan:
+ input:
+ peptides=expand(
+ "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa",
+ peptide_length=config["params"]["net_mhc_pan"]["peptide_len"],
+ ),
+ alleles=get_alleles_MHCI,
+ output:
+ "results/net_mhc_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
+ log:
+ "logs/net_mhc_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+ conda:
+ "../envs/tcsh.yaml"
+ params:
+ extra=config["params"]["net_mhc_pan"]["extra"],
+ netMHC=config["params"]["net_mhc_pan"]["location"],
+ length=config["params"]["net_mhc_pan"]["peptide_len"],
+ alleles=lambda wc, input: ",".join(
+ pd.read_csv(input.alleles[0], header=None)[0]
+ ),
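+ # guard against empty peptide FASTAs: only run netMHCpan if there is input, otherwise just create an empty output file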
+ shell:
+ "( "
+ "if [ -s {input.peptides} ]; "
+ "then "
+ " {params.netMHC}/netMHCpan {params.extra} -BA -s -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+ "else "
+ " touch {output}; "
+ "fi "
+ " ) 2> {log}"
+
+
+rule net_mhc_two_pan:
+ input:
+ peptides=expand(
+ "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa",
+ peptide_length=config["params"]["net_mhc_two_pan"]["peptide_len"],
+ ),
+ alleles=get_alleles_MHCII,
+ output:
+ "results/net_mhc_two_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv",
+ log:
+ "logs/net_mhc_two_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log",
+ conda:
+ "../envs/tcsh.yaml"
+ params:
+ extra=config["params"]["net_mhc_two_pan"]["extra"],
+ netMHC=config["params"]["net_mhc_two_pan"]["location"],
+ length=config["params"]["net_mhc_two_pan"]["peptide_len"],
+ alleles=lambda wc, input: ",".join(
+ pd.read_csv(input.alleles[0], header=None)[0]
+ ),
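+ # same guard as for netMHCpan above: skip netMHCIIpan on empty peptide FASTAs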
+ shell:
+ "( "
+ "if [ -s {input.peptides} ]; "
+ "then "
+ " {params.netMHC}/netMHCIIpan {params.extra} -BA -s -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; "
+ "else "
+ " touch {output}; "
+ "fi "
+ " ) 2> {log}"
+
+
+rule tidy_mhc_out:
+ input:
+ expand(
+ "results/{{mhc}}/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv",
+ contig=contigs,
+ ),
+ output:
+ joined_mhc_out="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv",
+ log:
+ "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log",
+ script:
+ "../scripts/tidy_mhc_output.py"
+
+
+rule merge_neoantigen_info:
+ input:
+ info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv",
+ neo="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.neo.tsv",
+ normal="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.normal.tsv",
+ output:
+ report(
+ "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
+ caption="../report/neoantigens.dna.rst",
+ category="Neopeptides",
+ ),
+ log:
+ "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
+ script:
+ "../scripts/merge_neoantigen_info.py"
+
+
+rule add_rna_info:
+ input:
+ counts="results/kallisto/{group}.{tumor_alias}",
+ table="results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv",
+ output:
+ report(
+ "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv",
+ caption="../report/neoantigens.rna.rst",
+ category="Neopeptides",
+ ),
+ params:
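+ # kallisto writes its quantification into abundance.tsv inside the per-sample output directory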
+ abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts),
+ log:
+ "logs/add_rna/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log",
+ script:
+ "../scripts/add_rna_info.py"
diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk
index bd9e6510..a52d72ef 100644
--- a/workflow/rules/microphaser.smk
+++ b/workflow/rules/microphaser.smk
@@ -1,117 +1,175 @@
-rule microphaser_somatic:
+rule norm_bcf:
input:
- vcf="results/strelka/merged/{sample}/all_variants.norm.annotated.bcf",
- bam="results/recal/{sample}.sorted.bam",
- bai="results/recal/{sample}.sorted.bam.bai",
+ "results/final-calls/{group}.{set}.bcf",
+ ref="resources/genome.fasta",
+ output:
+ "results/final-calls/{group}.{set}.norm.bcf",
+ log:
+ "logs/bcftools/norm/{group}.{set}.log",
+ params:
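+ # -m- splits multiallelic records into biallelic ones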
+ extra="-m-"
+ wrapper:
+ "v1.12.0/bio/bcftools/norm"
+
+
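+# mark all records of the tumor call set with an INFO/SOMATIC flag, so that somatic and germline variants stay distinguishable after merging with the normal call set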
+rule add_somatic_flag:
+ input:
+ bcf="results/final-calls/{group}.{set}.norm.bcf",
+ header_line="resources/somatic_flag_header_line.txt",
+ flag_bed="resources/genome.somatic_flag.bed.gz",
+ flag_bed_idx="resources/genome.somatic_flag.bed.gz.tbi",
+ output:
+ "results/final-calls/{group}.{set}.somatic_flag.norm.bcf",
+ log:
+ "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log",
+ conda:
+ "../envs/bcftools.yaml"
+ shell:
+ "( bcftools annotate "
+ " --annotations {input.flag_bed} "
+ " --mark-sites +SOMATIC "
+ " --columns CHROM,FROM,TO "
+ " --header-lines {input.header_line} "
+ " -O b -o {output} "
+ " {input.bcf} "
+ ") 2> {log}"
+
+
+rule merge_tumor_normal:
+ input:
+ calls=expand(
+ "results/final-calls/{{group}}.{sets}.norm.bcf",
+ sets=[
+ config["params"]["microphaser"]["events"]["normal"],
+ config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag",
+ ],
+ ),
+ index=expand(
+ "results/final-calls/{{group}}.{sets}.norm.bcf.csi",
+ sets=[
+ config["params"]["microphaser"]["events"]["normal"],
+ config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag",
+ ],
+ ),
+ output:
+ "results/final-calls/{group}.merged_tumor_normal.norm.bcf",
+ log:
+ "logs/bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log",
+ params:
+ extra="-a",
+ wrapper:
+ "v1.14.1/bio/bcftools/concat"
+
+
+rule microphaser_tumor:
+ input:
+ bcf="results/final-calls/{group}.merged_tumor_normal.norm.annotated.bcf",
+ bam=get_bam_from_group_and_alias(),
+ bai=get_bam_from_group_and_alias(ext=".bai"),
track="resources/annotation/{contig}.gtf",
ref="resources/genome.fasta",
output:
- mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.mt.fa",
- wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.wt.fa",
- tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv",
+ mt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa",
+ wt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa",
+ tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv",
log:
- "logs/microphaser/somatic/{sample}-{contig}.log",
+ "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log",
conda:
"../envs/microphaser.yaml"
params:
- window_length=config["params"]["microphaser"]["window_len"],
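+ # microphaser expects the window length in nucleotides: three per amino acid of the desired peptide length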
+ window_length=lambda wc: int(wc.peptide_length) * 3,
shell:
- "microphaser somatic {input.bam} --variants {input.vcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
+ "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
"< {input.track} > {output.mt_fasta} 2> {log}"
-rule microphaser_germline:
+rule microphaser_normal:
input:
- vcf="results/strelka/germline/{normal}/results/variants/variants.reheader.norm.bcf",
- bam="results/recal/{normal}.sorted.bam",
- bai="results/recal/{normal}.sorted.bam.bai",
+ bcf="results/final-calls/{group}.{normal_set}.variants.reheader.norm.bcf",
+ bam=get_bam_from_group_and_alias(),
+ bai=get_bam_from_group_and_alias(ext=".bai"),
track="resources/annotation/{contig}.gtf",
ref="resources/genome.fasta",
output:
wt_fasta=(
- "results/microphaser/fasta/germline/{normal}/{normal}.germline.{contig}.fa"
+ "results/microphaser/fasta/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.fa"
),
wt_tsv=(
- "results/microphaser/info/germline/{normal}/{normal}.germline.{contig}.tsv"
+ "results/microphaser/info/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.tsv"
),
log:
- "logs/microphaser/germline/{normal}-{contig}.log",
+ "logs/microphaser_normal/contigs/{group}/{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.log",
conda:
"../envs/microphaser.yaml"
params:
- window_length=config["params"]["microphaser"]["window_len"],
+ window_length=lambda wc: int(wc.peptide_length) * 3,
shell:
- "microphaser normal {input.bam} --variants {input.vcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
+ "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
"< {input.track} > {output.wt_fasta} 2> {log}"
-rule concat_proteome:
+rule concat_normal_proteome:
input:
expand(
- "results/microphaser/fasta/germline/{{normal}}/{{normal}}.germline.{contig}.fa",
+ "results/microphaser/fasta/contigs/{{group}}.normal.{{normal_set}}.pep_len_{{peptide_length}}.{contig}.fa",
contig=contigs,
),
output:
- "results/microphaser/fasta/germline/{normal}/reference_proteome.fa",
+ "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa",
log:
- "logs/microphaser/concat-ref-proteome/{normal}.log",
+ "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.pep_len_{peptide_length}.log",
shell:
"cat {input} > {output} 2> {log}"
-rule build_germline_proteome:
+rule build_normal_proteome_db:
input:
- "results/microphaser/fasta/germline/{normal}/reference_proteome.fa",
+ "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa",
output:
- bin="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin",
- fasta="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.peptides.fasta",
+ bin="results/microphaser/bin/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.bin",
+ fasta="results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.peptides.fasta",
log:
- "logs/microphaser/build-ref-proteome-db/{normal}-{mhc}.log",
+ "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}.pep_len_{peptide_length}.log",
conda:
"../envs/microphaser.yaml"
- params:
- length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][
- wildcards.mhc
- ],
shell:
- "microphaser build_reference -r {input} -o {output.bin} -l {params.length} --peptides {output.fasta} > {log} 2>&1"
+ "( microphaser build_reference -r {input} -o {output.bin} -l {wildcards.peptide_length} > {output.fasta} ) 2> {log}"
rule microphaser_filter:
input:
- tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv",
- proteome=get_proteome,
+ tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv",
+ proteome=expand(
+ "results/microphaser/bin/{{group}}.{normal_set}.normal_proteome.pep_len_{{peptide_length}}.bin",
+ normal_set=config["params"]["microphaser"]["events"]["normal"],
+ ),
output:
mt_fasta=(
- "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.mt.fa"
+ "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa"
),
wt_fasta=(
- "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.wt.fa"
+ "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa"
),
- tsv="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.{contig}.tsv",
- removed="results/microphaser/info/{sample}/removed/{mhc}/{sample}.{contig}.removed.tsv",
+ tsv="results/microphaser/info/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv",
+ removed="results/microphaser/info/removed/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.removed.tsv",
log:
- "logs/microphaser/filter/{sample}-{mhc}-{contig}.log",
+ "logs/microphaser_filter/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log",
conda:
"../envs/microphaser.yaml"
- params:
- length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][
- wildcards.mhc
- ],
shell:
- "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}"
+ "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {wildcards.peptide_length} > {output.mt_fasta} 2>{log}"
rule concat_tsvs:
input:
expand(
- "results/microphaser/info/{{sample}}/filtered/{{mhc}}/{{sample}}.{contig}.tsv",
+ "results/microphaser/info/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{{peptide_length}}.{contig}.tsv",
contig=contigs,
),
output:
- "results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv",
+ "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv",
log:
- "logs/concat-tsv/{sample}-{mhc}.log",
+ "logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log",
conda:
"../envs/xsv.yaml"
shell:
diff --git a/workflow/rules/oncoprint.smk b/workflow/rules/oncoprint.smk
deleted file mode 100644
index aa670044..00000000
--- a/workflow/rules/oncoprint.smk
+++ /dev/null
@@ -1,28 +0,0 @@
-rule build_oncoprint_table:
- input:
- bcf=get_oncoprint_batch,
- output:
- "plots/oncoprint/{batch}.{event}.tsv",
- log:
- "logs/oncoprint/{batch}.{event}.table.log",
- conda:
- "../envs/oncoprinttable.yaml"
- script:
- "../scripts/build_oncoprint_matrix.py"
-
-
-rule plot_oncoprint:
- input:
- "plots/oncoprint/{batch}.{event}.tsv",
- output:
- report(
- "plots/oncoprint/{batch}.{event}.pdf",
- category="Oncoprint",
- caption="../report/oncoprint.rst",
- ),
- log:
- "logs/oncoprint/{batch}.{event}.plot.log",
- conda:
- "../envs/oncoprint.yaml"
- script:
- "../scripts/oncoprint.R"
diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk
index 872247ac..d7279042 100644
--- a/workflow/rules/phylogeny.smk
+++ b/workflow/rules/phylogeny.smk
@@ -1,7 +1,7 @@
def get_somatic_calls(wildcards):
return expand(
- "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf",
- sample=samples[samples.type == "tumor"]["sample"],
+ "results/final-calls/somatic/{sample}/results/variants/somatic.complete.tumor.bcf",
+ sample=samples[samples.alias == "tumor"]["sample_name"],
)
@@ -9,20 +9,20 @@ rule merge_snvs:
input:
calls=get_somatic_calls,
output:
- "results/strelka/merged_calls.vcf",
+ "results/final-calls/merged_calls.vcf",
log:
"results/logs/bcftools/merge.log",
params:
- "--use-header strelka/sampleheader.txt --force-samples",
+ "--use-header final-calls/sampleheader.txt --force-samples",
wrapper:
- "0.36.0/bio/bcftools/merge"
+ "v1.12.0/bio/bcftools/merge"
rule query:
input:
- "results/strelka/merged_calls.vcf",
+ "results/final-calls/merged_calls.vcf",
output:
- "results/strelka/call_matrix.tsv",
+ "results/final-calls/call_matrix.tsv",
log:
"results/logs/bcftools/query.log",
params:
@@ -35,7 +35,7 @@ rule query:
rule nj_tree:
input:
- matrix="results/strelka/call_matrix.tsv",
+ matrix="results/final-calls/call_matrix.tsv",
output:
pdf="results/plots/phylogeny_njtree.pdf",
log:
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 86b209c3..80e62380 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -10,7 +10,7 @@ rule get_genome:
release=config["ref"]["release"],
cache: True
wrapper:
- "0.45.1/bio/reference/ensembl-sequence"
+ "v1.12.0/bio/reference/ensembl-sequence"
rule get_cdna:
@@ -25,21 +25,7 @@ rule get_cdna:
release=config["ref"]["release"],
cache: True
wrapper:
- "0.45.1/bio/reference/ensembl-sequence"
-
-
-rule kallisto_index:
- input:
- "resources/genome.cdna.fasta",
- output:
- "resources/kallisto/transcripts.idx",
- params:
- extra="",
- log:
- "logs/kallisto/index.log",
- cache: True
- wrapper:
- "0.60.1/bio/kallisto/index"
+ "v1.12.0/bio/reference/ensembl-sequence"
rule get_annotation:
@@ -55,29 +41,24 @@ rule get_annotation:
log:
"logs/get-annotation.log",
wrapper:
- "0.45.1/bio/reference/ensembl-annotation"
+ "v1.12.0/bio/reference/ensembl-annotation"
-rule STAR_index:
+# TODO: remove this rule once microphaser is fixed to make gene_name optional
+rule remove_records_with_gene_name_missing:
input:
- fasta="resources/genome.fasta",
- gtf="resources/genome.gtf",
+ "resources/genome.gtf",
output:
- directory("resources/STAR_index"),
- params:
- sjdb_overhang="100",
- extra="",
+ "resources/genome.records_with_gene_name.gtf",
log:
- "logs/star/index.log",
- threads: 32
- cache: True
- wrapper:
- "0.42.0/bio/star/index"
+ "logs/remove_records_with_gene_name_missing.log",
+ shell:
+ '( grep "gene_name" {input} > {output} ) 2> {log}'
rule split_annotation:
input:
- "resources/genome.gtf",
+ "resources/genome.records_with_gene_name.gtf",
output:
"resources/annotation/{contig}.gtf",
log:
@@ -95,142 +76,126 @@ rule genome_faidx:
"logs/genome-faidx.log",
cache: True
wrapper:
- "0.45.1/bio/samtools/faidx"
+ "v1.12.0/bio/samtools/faidx"
-rule genome_dict:
- input:
- "resources/genome.fasta",
+rule create_somatic_flag_header_line:
output:
- "resources/genome.dict",
+ "resources/somatic_flag_header_line.txt",
log:
- "logs/picard/create-dict.log",
- cache: True
- wrapper:
- "0.45.1/bio/picard/createsequencedictionary"
+ "logs/create_somatic_flag_header_line.log",
+ shell:
+ """
+ ( echo '##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Variant marked as somatic">' > {output} ) 2> {log}
+ """
-rule get_callregions:
+rule create_genome_somatic_flag_bed:
input:
"resources/genome.fasta.fai",
output:
- "resources/genome.callregions.bed.gz",
+ "resources/genome.somatic_flag.bed",
log:
- "logs/get-callregions.log",
- params:
- n_contigs=config["ref"]["n_chromosomes"],
+ "logs/create_genome_somatic_flag_bed.log",
conda:
- "../envs/index.yaml"
+ "../envs/gawk.yaml"
+ cache: True
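+ # turn the fasta index into a BED file that covers every contig from start to end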
shell:
- "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})"
- " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}"
+ """
+ ( awk 'BEGIN {{ OFS="\\t" }} {{ print $1,0,$2 }}' {input} > {output} ) 2> {log}
+ """
-rule get_known_variants:
+rule bgzip_genome_somatic_flag_bed:
input:
- # use fai to annotate contig lengths for GATK BQSR
- fai="resources/genome.fasta.fai",
+ "resources/genome.somatic_flag.bed",
output:
- vcf="resources/variation.vcf.gz",
+ "resources/genome.somatic_flag.bed.gz",
log:
- "logs/get-known-variants.log",
- params:
- species=config["ref"]["species"],
- release=config["ref"]["release"],
- build=config["ref"]["build"],
- type="all",
- cache: True
+ "logs/bgzip/genome.somatic_flag.log",
wrapper:
- "0.59.2/bio/reference/ensembl-variation"
+ "v1.12.0/bio/bgzip"
-rule remove_iupac_codes:
+rule tabix_genome_somatic_flag_bed:
input:
- "resources/variation.vcf.gz",
+ "resources/genome.somatic_flag.bed.gz",
output:
- "resources/variation.noiupac.vcf.gz",
- log:
- "logs/fix-iupac-alleles.log",
+ "resources/genome.somatic_flag.bed.gz.tbi",
conda:
- "../envs/rbt.yaml"
- cache: True
+ "../envs/htslib.yaml"
+ log:
+ "logs/tabix/genome.somatic_flag.log",
shell:
- "rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}"
+ "( tabix -p bed {input} ) 2> {log}"
-rule bwa_index:
+rule genome_dict:
input:
"resources/genome.fasta",
output:
- multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
+ "resources/genome.dict",
log:
- "logs/bwa_index.log",
+ "logs/picard/create-dict.log",
cache: True
wrapper:
- "0.45.1/bio/bwa/index"
+ "v1.12.0/bio/picard/createsequencedictionary"
-rule download_HLALA_graph:
+rule download_hla_la_graph:
output:
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"),
- directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome"),
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/knownReferences"),
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping"),
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping_PRGonly"),
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/referenceGenomeSimulations"),
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/sampledReferenceGenomes"),
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"),
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa",
"resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
+ params:
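+ # derive the parent graphs directory (to download and unpack into) from the first output path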
+ graphs_dir=lambda w, output: output[0].replace(
+ "graphs/PRG_MHC_GRCh38_withIMGT/PRG", "graphs"
+ ),
log:
- "logs/download-HLA-LA-graph.log",
- cache: True
+ "logs/download_hla_la_graph.log",
shell:
- "cd resources/graphs && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
- "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz"
+ "( cd {params.graphs_dir} && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
+ "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz && rm PRG_MHC_GRCh38_withIMGT.tar.gz ) 2> {log}"
-rule index_HLALA:
+rule index_hla_la:
input:
"resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt",
output:
"resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH",
- "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathindex",
- cache: True
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathIndex",
conda:
"../envs/hla_la.yaml"
params:
path=lambda wc, input: os.path.dirname(os.path.dirname(input[0])),
graph=lambda wc, input: os.path.basename(os.path.dirname(input[0])),
log:
- "logs/index-HLA-LA-graph.log",
+ "logs/index_hla_la_graph.log",
shell:
"HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
-rule get_vep_cache:
- output:
- directory("resources/vep/cache"),
- params:
- species=config["ref"]["species"],
- build=config["ref"]["build"],
- release=config["ref"]["release"],
- log:
- "logs/vep/cache.log",
- cache: True
- wrapper:
- "0.59.2/bio/vep/cache"
-
-
-rule get_vep_plugins:
+rule index_hla_la_extended_ref:
+ input:
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa",
output:
- directory("resources/vep/plugins"),
- params:
- release=config["ref"]["release"],
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.amb",
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.ann",
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.bwt",
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac",
+ "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.sa",
+ conda:
+ "../envs/hla_la.yaml"
log:
- "logs/vep/plugins.log",
- cache: True
- wrapper:
- "0.59.2/bio/vep/plugins"
+ "logs/index_hla_la_extended_ref.log",
+ shell:
+ "bwa index {input} > {log} 2>&1"
rule make_sampleheader:
diff --git a/workflow/rules/tmb.smk b/workflow/rules/tmb.smk
deleted file mode 100644
index 2ef064a1..00000000
--- a/workflow/rules/tmb.smk
+++ /dev/null
@@ -1,20 +0,0 @@
-if config["tmb"]["activate"]:
-
- rule estimate_tmb:
- input:
- "results/merged-calls/{pair}.somatic.fdr-controlled.bcf",
- output:
- "results/plots/tmb/{pair}.{plotmode}.vl.json",
- conda:
- "../envs/varlociraptor.yaml"
- log:
- "logs/tmb/{pair}-{plotmode}.log",
- params:
- **config["tmb"],
- shell:
- "varlociraptor estimate tmb "
- " --plot-mode {wildcards.plotmode} "
- "--coding-genome-size {params.coding_genome_size} "
- "--somatic-tumor-events {params.somatic_events} "
- "--tumor-sample {params.tumor_sample} "
- "< {input} > {output} 2> {log}"
diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk
deleted file mode 100644
index f26ed6f3..00000000
--- a/workflow/rules/trim.smk
+++ /dev/null
@@ -1,71 +0,0 @@
-rule get_sra:
- output:
- "sra/{accession}_1.fastq",
- "sra/{accession}_2.fastq",
- log:
- "logs/get-sra/{accession}.log",
- wrapper:
- "0.56.0/bio/sra-tools/fasterq-dump"
-
-
-rule cutadapt_pipe:
- input:
- get_cutadapt_pipe_input,
- output:
- pipe("pipe/cutadapt/{sample}/{seqtype}/{unit}.{fq}.{ext}"),
- log:
- "logs/pipe-fastqs/cutadapt/{sample}-{seqtype}-{unit}.{fq}.{ext}.log",
- wildcard_constraints:
- ext=r"fastq|fastq\.gz",
- threads: 0 # this does not need CPU
- shell:
- "cat {input} > {output} 2> {log}"
-
-
-rule cutadapt_pe:
- input:
- get_cutadapt_input,
- output:
- fastq1="results/trimmed/{sample}/{seqtype}/{unit}_R1.fastq.gz",
- fastq2="results/trimmed/{sample}/{seqtype}/{unit}_R2.fastq.gz",
- qc="results/trimmed/{sample}/{seqtype}/{unit}.paired.qc.txt",
- log:
- "logs/cutadapt/{sample}-{seqtype}-{unit}.log",
- params:
- others=config["params"]["cutadapt"],
- adapters=lambda w: str(
- units.loc[w.sample].loc[w.seqtype].loc[w.unit, "adapters"]
- ),
- threads: 8
- wrapper:
- "0.59.2/bio/cutadapt/pe"
-
-
-rule cutadapt_se:
- input:
- get_cutadapt_input,
- output:
- fastq="results/trimmed/{sample}/{seqtype}/{unit}.single.fastq.gz",
- qc="results/trimmed/{sample}/{seqtype}/{unit}.single.qc.txt",
- log:
- "logs/cutadapt/{sample}-{seqtype}-{unit}.log",
- params:
- others=config["params"]["cutadapt"],
- adapters_r1=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]),
- threads: 8
- wrapper:
- "0.59.2/bio/cutadapt/se"
-
-
-rule merge_fastqs:
- input:
- get_fastqs,
- output:
- "results/merged/{seqtype}/{sample}_{read}.fastq.gz",
- log:
- "logs/merge-fastqs/{seqtype}_{sample}_{read}.log",
- wildcard_constraints:
- read="single|R1|R2",
- seqtype="DNA|RNA",
- shell:
- "cat {input} > {output} 2> {log}"
diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk
index e7586ef0..c8b11474 100644
--- a/workflow/rules/utils.smk
+++ b/workflow/rules/utils.smk
@@ -13,38 +13,13 @@ rule bam_index:
input:
"{prefix}.bam",
output:
- "{prefix}.bam.bai",
+ "{prefix}.bai",
log:
"logs/bam-index/{prefix}.log",
wrapper:
"0.59.2/bio/samtools/index"
-rule tabix_known_variants:
- input:
- "resources/{prefix}.{format}.gz",
- output:
- "resources/{prefix}.{format}.gz.tbi",
- log:
- "logs/tabix/{prefix}.{format}.log",
- params:
- get_tabix_params,
- cache: True
- wrapper:
- "0.59.2/bio/tabix"
-
-
-rule gzip_fastq:
- input:
- "{prefix}.fastq",
- output:
- "{prefix}.fastq.gz",
- log:
- "logs/gz-fastq/{prefix}.log",
- shell:
- "gzip < {input} > {output}"
-
-
rule tsv_to_excel:
input:
tsv="results/{x}.tsv",
diff --git a/workflow/rules/varlociraptor.smk b/workflow/rules/varlociraptor.smk
deleted file mode 100644
index 4cb6013a..00000000
--- a/workflow/rules/varlociraptor.smk
+++ /dev/null
@@ -1,89 +0,0 @@
-rule render_scenario:
- input:
- config["calling"]["scenario"],
- output:
- report(
- "results/scenarios/{pair}.yaml",
- caption="../report/scenario.rst",
- category="Variant calling scenarios",
- ),
- params:
- samples=samples,
- log:
- "logs/scenarious/{pair}.log",
- conda:
- "../envs/render_scenario.yaml"
- script:
- "../scripts/render-scenario.py"
-
-
-rule varlociraptor_preprocess:
- input:
- ref="resources/genome.fasta",
- ref_idx="resources/genome.fasta.fai",
- candidates="results/candidate-calls/{pair}.{caller}.{scatteritem}.bcf",
- bam="results/recal/{sample}.sorted.bam",
- bai="results/recal/{sample}.sorted.bai",
- output:
- "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf",
- params:
- omit_isize="",
- log:
- "logs/varlociraptor/preprocess/{pair}/{sample}.{caller}.{scatteritem}.log",
- conda:
- "../envs/varlociraptor.yaml"
- shell:
- "varlociraptor preprocess variants {params.omit_isize} --candidates {input.candidates} "
- "{input.ref} --bam {input.bam} --output {output} 2> {log}"
-
-
-rule varlociraptor_call:
- input:
- obs=get_pair_observations,
- scenario="results/scenarios/{pair}.yaml",
- output:
- temp("results/calls/{pair}.{caller}.{scatteritem}.bcf"),
- log:
- "logs/varlociraptor/call/{pair}.{caller}.{scatteritem}.log",
- params:
- obs=lambda w, input: [
- "{}={}".format(s, f) for s, f in zip(get_pair_aliases(w), input.obs)
- ],
- conda:
- "../envs/varlociraptor.yaml"
- benchmark:
- "benchmarks/varlociraptor/call/{pair}.{caller}.{scatteritem}.tsv"
- shell:
- "varlociraptor "
- "call variants generic --obs {params.obs} "
- "--scenario {input.scenario} > {output} 2> {log}"
-
-
-rule sort_calls:
- input:
- "results/calls/{pair}.{caller}.{scatteritem}.bcf",
- output:
- temp("results/calls/{pair}.{caller}.{scatteritem}.sorted.bcf"),
- log:
- "logs/bcf-sort/{pair}.{caller}.{scatteritem}.log",
- conda:
- "../envs/bcftools.yaml"
- resources:
- mem_mb=8000,
- shell:
- "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` "
- "-Ob {input} > {output} 2> {log}"
-
-
-rule bcftools_concat:
- input:
- calls=get_scattered_calls(),
- indexes=get_scattered_calls(ext=".bcf.csi"),
- output:
- "results/calls/{pair}.{scatteritem}.bcf",
- log:
- "logs/concat-calls/{pair}.{scatteritem}.log",
- params:
- "-a -Ob", # TODO Check this
- wrapper:
- "0.59.2/bio/bcftools/concat"
diff --git a/workflow/rules/vega.smk b/workflow/rules/vega.smk
deleted file mode 100644
index 4199e8fc..00000000
--- a/workflow/rules/vega.smk
+++ /dev/null
@@ -1,15 +0,0 @@
-rule vg2svg:
- input:
- "{prefix}.vl.json",
- output:
- report(
- "{prefix}.svg",
- caption="../report/tmb.rst",
- category="Tumor Mutational Burden",
- ),
- log:
- "logs/vega/{prefix}.log",
- conda:
- "../envs/vega.yaml"
- shell:
- "vl2svg {input} {output} > {log} 2>&1"
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
index 6fafb49f..05e55b83 100644
--- a/workflow/schemas/config.schema.yaml
+++ b/workflow/schemas/config.schema.yaml
@@ -4,21 +4,6 @@ description: snakemake configuration file
type: object
-definitions:
- filterentry:
- type: object
- additionalProperties:
- type: string
- evententry:
- type: object
- properties:
- varlociraptor:
- type: array
- items:
- type: string
- filter:
- type: string
-
properties:
samples:
type: string
@@ -42,198 +27,61 @@ properties:
- build
- n_chromosomes
-
- tmb:
- type: object
- properties:
- activate:
- type: boolean
- coding_genome_size:
- # TODO allow integer here!
- type: string
- tumor_sample:
- type: string
- somatic_events:
- type: array
- items:
- type: string
- required:
- - activate
- - coding_genome_size
- - tumor_sample
- - somatic_events
-
-
- calling:
- type: object
- properties:
- freebayes:
- type: object
- properties:
- activate:
- type: boolean
- scenario:
- type: string
- fdr-control:
- type: object
- properties:
- threshold:
- type: number
- minimum: 0.0
- maximum: 1.0
- events:
- $ref: "#/definitions/evententry"
- description: "a map of pairs"
- required:
- - threshold
- - events
- required:
- - freebayes
- - scenario
- - fdr-control
-
- remove_duplicates:
- type: object
- properties:
- activate:
- type: boolean
-
- trimming:
- type: object
- properties:
- activate:
- type: boolean
-
- epitope_prediction:
+ neoantigen_prediction:
type: object
properties:
activate:
type: boolean
- affinity:
+ params:
type: object
properties:
- netMHCpan:
+ microphaser:
type: object
properties:
- activate:
- type: boolean
- params:
- type: string
- netMHCIIpan:
+ events:
+ type: object
+ properties:
+ normal:
+ type: string
+ tumor:
+ type: string
+ required:
+ - normal
+ - tumor
+ net_mhc_pan:
type: object
properties:
activate:
type: boolean
- params:
+ peptide_len:
+ type: integer
+ extra:
type: string
-
- HLAtyping:
- type: object
- properties:
- HLA_LA:
- type: object
- properties:
- activate:
- type: boolean
-
- annotations:
- type: object
- properties:
- vep:
- properties:
- params:
+ location:
type: string
- plugins:
- type: array
- items:
- type: string
required:
- - params
- - plugins
- required:
- - vep
-
- fusion:
- type: object
- properties:
- arriba:
+ - activate
+ net_mhc_two_pan:
type: object
properties:
activate:
type: boolean
- blacklist:
- type: string
- params:
- type: string
-
- params:
- type: object
- properties:
- cutadapt:
- type: string
- bwa:
- type: string
- gatk:
- type: object
- properties:
- BaseRecalibrator:
- type: string
- applyBQSR:
- type: string
- required:
- - BaseRecalibrator
- - applyBQSR
- picard:
- type: object
- properties:
- MarkDuplicates:
+ peptide_len:
+ type: integer
+ extra:
type: string
required:
- - MarkDuplicates
- strelka:
- type: object
- properties:
- config:
- type: string
- run:
- type: string
- razers3:
- type: string
- optitype:
- type: string
- microphaser:
- type: object
- properties:
- window_len:
- type: integer
- peptide_len:
- type: object
- properties:
- netMHCpan:
- type: integer
- netMHCIIpan:
- type: integer
- kallisto:
- type: string
- star:
- type: string
+ - activate
required:
- - bwa
- - gatk
- - picard
- - strelka
- - razers3
- microphaser
- - optitype
+ - net_mhc_pan
+ - net_mhc_two_pan
+
required:
- samples
- units
- ref
- - tmb
- - calling
- params
- - annotations
- - epitope_prediction
- - affinity
+ - neoantigen_prediction
diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml
index e626f60f..e795cdee 100644
--- a/workflow/schemas/samples.schema.yaml
+++ b/workflow/schemas/samples.schema.yaml
@@ -2,12 +2,18 @@ $schema: "http://json-schema.org/draft-04/schema#"
description: an entry in the sample sheet
properties:
- sample:
+ sample_name:
type: string
- description: sample name/identifier
- type:
+ description: sample name/identifier (alphanumeric string that may additionally contain '_' and '-')
+ pattern: "^[a-zA-Z_0-9-]+$"
+ alias:
type: string
- description: healthy or tumor sample
+ description: sample name within the VCF/BCF files generated for a group (e.g. tumor or normal; alphanumeric string that may additionally contain '_' and '-')
+ pattern: "^[a-zA-Z_0-9-]+$"
+ group:
+ type: string
+ description: group of samples called jointly (alphanumeric string that may additionally contain '_' and '-')
+ pattern: "^[a-zA-Z_0-9-]+$"
matched_normal:
type: string
description: the corresponding healthy control to this tumor sample
@@ -18,11 +24,25 @@ properties:
description: Purity to use for tumor/normal groups.
platform:
type: string
- enum: ["CAPILLARY", "LS454", "ILLUMINA", "SOLID", "HELICOS", "IONTORRENT", "ONT", "PACBIO"]
-
+ enum:
+ - "CAPILLARY"
+ - "LS454"
+ - "ILLUMINA"
+ - "SOLID"
+ - "HELICOS"
+ - "IONTORRENT"
+ - "ONT"
+ - "PACBIO"
+ description: sequencing platform used
+ purity:
+ type: number
+ minimum: 0.0
+ maximum: 1.0
+ description: Purity to use for tumor/normal groups.
required:
- - sample
- - type
+ - sample_name
+ - alias
+ - group
- platform
diff --git a/workflow/schemas/units.schema.yaml b/workflow/schemas/units.schema.yaml
index 6fb9a6cf..4fec2074 100644
--- a/workflow/schemas/units.schema.yaml
+++ b/workflow/schemas/units.schema.yaml
@@ -2,18 +2,22 @@ $schema: "http://json-schema.org/draft-04/schema#"
description: row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data
type: object
properties:
- sample:
+ sample_name:
type: string
- description: sample name/id the unit has been sequenced from
+ pattern: "^[a-zA-Z_0-9-]+$"
+ description: sample name/id the unit has been sequenced from (alphanumeric string that may additionally contain '_' and '-')
sequencing_type:
type: string
enum: ["DNA", "RNA"]
- unit:
+ description: type of sequenced material ('DNA' or 'RNA')
+ unit_name:
type: string
- description: unit or lane name
+ pattern: "^[a-zA-Z_0-9-]+$"
+ description: unit id (alphanumeric string that may additionally contain '_' and '-')
fq1:
type: string
- description: path to FASTQ file
+ pattern: "^[^ \t]+$"
+ description: path to FASTQ file (may not contain whitespace)
fq2:
type: string
description: path to second FASTQ file (leave empty in case of single-end)
@@ -22,9 +26,9 @@ properties:
description: SRA id for automatic download of unit
adapters:
type: string
- description: adapter trimming settings to use (for cutadapt)
+ description: cutadapt adapter trimming settings to use (see https://cutadapt.readthedocs.io)
required:
- - sample
- - unit
+ - sample_name
+ - unit_name
- sequencing_type
diff --git a/workflow/scripts/add_rna_info.py b/workflow/scripts/add_rna_info.py
index bdcdcac2..0eb2298e 100644
--- a/workflow/scripts/add_rna_info.py
+++ b/workflow/scripts/add_rna_info.py
@@ -1,13 +1,19 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
import pandas as pd
## load data table
-data = pd.read_csv(snakemake.input["table"], sep='\t')
+data = pd.read_csv(snakemake.input["table"], sep="\t")
## Merge transcript count
-transcript_count = pd.read_csv(snakemake.params["abundance"], sep='\t')
+transcript_count = pd.read_csv(snakemake.params["abundance"], sep="\t")
transcript_count = transcript_count[["target_id", "tpm"]]
transcript_count.columns = ["Transcript_ID", "TPM"]
-transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split('.', expand=True)[0]
+transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split(
+ ".", expand=True
+)[0]
data = data.merge(transcript_count, on="Transcript_ID", how="left")
-data.to_csv(snakemake.output[0], sep='\t', index=False)
+data.to_csv(snakemake.output[0], sep="\t", index=False)
diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
new file mode 100644
index 00000000..4806d56e
--- /dev/null
+++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py
@@ -0,0 +1,20 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+
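+# map microphaser output column names (left) to the NeoFox input column names (right)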
+columns_mapping = {
+ "gene_name": "gene",
+ "normal_peptide": "mutation.wildTypeXmer",
+ "tumor_peptide": "mutation.mutatedXmer",
+ "freq": "dnaVariantAlleleFrequency",
+}
+
+candidates = (
+ pd.read_csv(snakemake.input.microphaser, sep="\t", quoting=3)
+ .rename(columns=columns_mapping)
+ .assign(patientIdentifier=snakemake.wildcards.group)
+)
+
+candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3, index=False)
diff --git a/workflow/scripts/build_oncoprint_matrix.py b/workflow/scripts/build_oncoprint_matrix.py
deleted file mode 100644
index 2b65721b..00000000
--- a/workflow/scripts/build_oncoprint_matrix.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import pysam
-import pandas as pd
-import os
-import sys
-
-# redirect output to log file
-log = open(snakemake.log[0], "w")
-sys.stdout = log
-sys.stderr = log
-
-input_files = snakemake.input
-
-df = pd.DataFrame(columns=["Sample"])
-for sample_file in input_files:
- variant_file = pysam.VariantFile(sample_file)
- sample_name = os.path.basename(sample_file).split(".")[0]
- gene_variant_dict = {"Sample": [sample_name]}
- for rec in variant_file.fetch():
- for sample in rec.samples:
- allele_frequencies = rec.samples[sample]["AF"] #Can be multiple entries
- for allele_frequency in allele_frequencies:
- variant = rec.info["SVLEN"]
- if variant[0]:
- variant_type = "INDEL"
- else:
- variant_type = "SNV"
- transcripts = rec.info["ANN"]
- for transcript in transcripts:
- gene = transcript.split("|")[3]
- impact = transcript.split("|")[2]
- if gene and impact != "MODIFIER":
- if gene not in gene_variant_dict:
- gene_variant_dict[gene] = set()
- gene_variant_dict[gene].add(variant_type)
- break
- for key, value in gene_variant_dict.items():
- gene_variant_dict[key] = ','.join(value)
- sample_df = pd.DataFrame(gene_variant_dict, index=[0])
- df = pd.concat([df, sample_df], join="outer", ignore_index=False, sort=False)
-df.set_index("Sample", inplace=True)
-with open(snakemake.output[0], 'w') as output_f:
- print(df.to_csv(sep="\t", index=True), file=output_f)
diff --git a/workflow/scripts/count_neoantigen_occurrences.py b/workflow/scripts/count_neoantigen_occurrences.py
index 1b6050e4..eb9c917a 100644
--- a/workflow/scripts/count_neoantigen_occurrences.py
+++ b/workflow/scripts/count_neoantigen_occurrences.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
import pandas as pd
import glob
@@ -5,16 +9,21 @@
dfs, dfs_id = dict(), []
-for f in files:
- df = pd.read_csv(f, sep = '\t')
- dfs[f] = df
- df = df[["ID_tumor"]]
- dfs_id.append(df)
+for f in files:
+ df = pd.read_csv(f, sep="\t")
+ dfs[f] = df
+ df = df[["ID_tumor"]]
+ dfs_id.append(df)
ids = pd.concat(dfs_id)
-ids=ids.groupby(ids.columns.tolist(), as_index = False).size().reset_index().rename(columns = {0:"occurrences"})
-ids.to_csv("out/tables/candidate_occurrences.tsv", sep = '\t', index = False)
+ids = (
+ ids.groupby(ids.columns.tolist(), as_index=False)
+ .size()
+ .reset_index()
+ .rename(columns={0: "occurrences"})
+)
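+# e.g. an "ID_tumor" value that appears in three of the input tables gets occurrences == 3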
+ids.to_csv("out/tables/candidate_occurrences.tsv", sep="\t", index=False)
for k, v in dfs.items():
- df = v.merge(ids, on = "ID_tumor")
- df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep = ',', index = False)
+ df = v.merge(ids, on="ID_tumor")
+ df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep=",", index=False)
diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py
new file mode 100644
index 00000000..4242d684
--- /dev/null
+++ b/workflow/scripts/create_neo_fox_group_sheet.py
@@ -0,0 +1,64 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+
+HLA_SUFFIXES_REGEX = r"[NLSCAQ]?"
+
+# allowed loci according to NeoFox input data documentation:
+# https://neofox.readthedocs.io/en/latest/03_01_input_data.html#file-with-patient-data
+# * mhcIAlleles: comma separated MHC I alleles of the patient for HLA-A, HLA-B and
+# HLA-C. If homozygous, the allele should be added twice.
+# * mhcIIAlleles: comma separated MHC II alleles of the patient for HLA-DRB1, HLA-DQA1,
+# HLA-DQB1, HLA-DPA1 and HLA-DPB1. If homozygous, the allele should be added twice.
+ALLOWED_LOCI = {
+ "A",
+ "B",
+ "C",
+ "DRB1",
+ "DQA1",
+ "DQB1",
+ "DPA1",
+ "DPB1",
+}
+
+mhc_alleles = pd.read_csv(
+ snakemake.input.hla_la_bestguess,
+ sep="\t",
+ )
+# the Allele column can contain multiple ";"-separated entries for the
+# same locus -- NeoFox does a hard assertion that only two alleles per
+# gene exist, so we chose to only ever keep the first of such multiple
+# possibilities
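+# e.g. "DQA1*01:01:01G;DQA1*01:02" is reduced to just "DQA1*01:01:01G" (illustrative)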
+mhc_alleles.loc[:, "Allele"] = mhc_alleles["Allele"].str.split(pat=";")
+mhc_alleles = mhc_alleles.explode(["Allele"]).drop_duplicates(subset=["Locus", "Chromosome"])
+mhc_alleles = mhc_alleles[mhc_alleles["Locus"].isin(ALLOWED_LOCI)]
+mhc_alleles.loc[:, "Allele"] = (
+ mhc_alleles["Allele"]
+ .str
+ .replace(
+ r"([A-Z]+\d?)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+ r"HLA-\1*\2:\3\5",
+ regex=True,
+ )
+)
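+# e.g. this turns "A*02:01:01G" into "HLA-A*02:01", keeping any [NLSCAQ] suffix (illustrative)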
+# the multiple ";"-separated entries from above can be identical after reducing
+# to the allele group (1st number) and specific HLA protein (2nd number)
+mhc_alleles = mhc_alleles.drop_duplicates(subset=["Chromosome", "Allele"])
+
+mhc_one_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() == 1, "Allele"] )
+mhc_two_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() > 1, "Allele"] )
+
+patient_info = pd.DataFrame(
+ data={
+ "identifier": [ snakemake.params.group.name ],
+ "tumorType": [ snakemake.params.group["tumorType"] ],
+ "mhcIAlleles": [ mhc_one_alleles ],
+ "mhcIIAlleles": [ mhc_two_alleles ],
+ }
+# This is required for cases where no tumorType is available, as NeoFox does not
+# seem to be able to handle empty entries here, so we remove the whole column
+).dropna(axis="columns")
+
+patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3, index=False)
diff --git a/workflow/scripts/group_mhc_output.py b/workflow/scripts/group_mhc_output.py
deleted file mode 100644
index 5ce0cf15..00000000
--- a/workflow/scripts/group_mhc_output.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import sys
-import os
-import pandas as pd
-import numpy as np
-
-first = True
-out = open(snakemake.output[0], "w")
-for e in snakemake.input:
- if os.path.getsize(e) > 0:
- mhcout = open(e, 'r')
- alleles = next(mhcout).split('\t')
- header = next(mhcout).rstrip().split('\t')
- if first:
- allele = ''
- for i in range(0, len(header)):
- #print(header[i].rstrip())
- if i < len(alleles):
- #print(alleles[i])
- if alleles[i] != '':
- allele = alleles[i].rstrip() + '_'
- header[i] = allele + header[i].rstrip()
- header[len(header) -1] = "NB"
- first = False
- #print(header)
- out.write('\t'.join(header) + '\n')
- for line in mhcout:
- out.write(line)
-out.close()
diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py
deleted file mode 100644
index b90af1cb..00000000
--- a/workflow/scripts/merge_data.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import sys
-import os
-import pandas as pd
-import numpy as np
-
-
-def select_columns(mhc):
- rank_cols = [c for c in mhc.columns if "Rank" in c]
- affinity_cols = [c for c in mhc.columns if "nM" in c]
- mhc_cols = ["Pos"] + ["ID"] + ["Peptide"] + rank_cols + affinity_cols + ["NB"]
- mhc = mhc[mhc_cols]
- mhc["Rank_min"] = mhc[rank_cols].min(axis=1)
- mhc["Aff_min"] = mhc[affinity_cols].min(axis=1)
- mhc["Top_rank_HLA"] = mhc[rank_cols].idxmin(axis=1)
- mhc["Top_affinity_HLA"] = mhc[affinity_cols].idxmin(axis=1)
- mhc["Top_rank_HLA"] = mhc["Top_rank_HLA"].str.replace("_Rank","")
- mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","")
- return mhc
-
-def merge(info, tumor, normal, outfile):
- tumor = select_columns(tumor)
- normal = select_columns(normal)
- id_length = len(tumor.ID[0])
- print(info.columns)
- info["ID"] = info["id"].astype(str).str[:id_length]
- merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID'])
- merged_mhc = merged_mhc.rename(columns={col: col.replace("_y","_normal") for col in merged_mhc.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns})
- info = info.rename(columns={"gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"})
- merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID')
-
- merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1)
- ## Are all possible variants in the peptide ("Cis") or not ("Trans")
- merged_dataframe["Variant_Orientation"] = "Cis"
- trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar
- merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans"
-
- ## check misssense/silent mutation status
- nonsilent = merged_dataframe.Peptide_tumor != merged_dataframe.Peptide_normal
- merged_dataframe = merged_dataframe[nonsilent]
- merged_dataframe = merged_dataframe.drop_duplicates(subset=["transcript","offset","Peptide_tumor","Somatic_AminoAcid_Change"])
-
- data = merged_dataframe[["ID","transcript","Gene_ID","Gene_Symbol","Chromosome","offset","freq","depth",
- "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","NB_tumor","Rank_min_tumor","Aff_min_tumor",
- "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","NB_normal",
- "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]]
-
- data.columns = ["ID","Transcript_ID","Gene_ID","Gene_Symbol","Chromosome","Position","Frequency","Read_Depth",
- "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","BindingHLAs_tumor","Rank_min_tumor","Affinity_min_tumor",
- "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","BindingHLAs_normal",
- "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]
-
- # data = data[data.BindingHLAs_tumor > 0]
- # data = data[(data.NB_normal.isna()) | (data.NB_normal == 0)]
- #data = data[(data.BindingHLAs_normal == 0)]
-
- ### Delete Stop-Codon including peptides
- data = data[data.Peptide_tumor.str.count("x") == 0]
- data = data[data.Peptide_tumor.str.count("X") == 0]
- data.sort_values(["Chromosome", "somatic_positions"], inplace=True)
- ### Remove Duplicate kmers
- data = data.drop_duplicates(["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"])
-
- data.to_csv(outfile, index=False, sep = '\t')
-
-
-## highlight the difference between mutated neopeptide and wildtype
-def diffEpitope(e1,e2):
- if str(e2) == 'nan' or str(e2) == '':
- return(e1)
- e1 = str(e1)
- e2 = str(e2)
- diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]]
- e_new = e1
- e2_new = e2
- for p in diff_pos:
- e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:]
- e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:]
- return(e_new)
-
-
-def main():
- info = pd.read_csv(snakemake.input[0], sep = '\t', dtype=str)
- tumor = pd.read_csv(snakemake.input[1], sep = '\t')
- normal = pd.read_csv(snakemake.input[2], sep = '\t')
- outfile = snakemake.output[0]
- merge(info, tumor, normal, outfile)
-
-if __name__ == '__main__':
- sys.exit(main())
diff --git a/workflow/scripts/merge_mhcflurry.py b/workflow/scripts/merge_mhcflurry.py
deleted file mode 100644
index fcc6aba4..00000000
--- a/workflow/scripts/merge_mhcflurry.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import sys
-import os
-import pandas as pd
-import numpy as np
-
-
-## highlight the difference between mutated neopeptide and wildtype
-def diffEpitope(e1,e2):
- if str(e2) == 'nan':
- return(e1)
- e1 = str(e1)
- e2 = str(e2)
- diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]]
- e_new = e1
- e2_new = e2
- for p in diff_pos:
- e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:]
- e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:]
- return(e_new)
-
-info = pd.read_csv(snakemake.input[0])
-tumor = pd.read_csv(snakemake.input[1])
-normal = pd.read_csv(snakemake.input[2])
-outfile = snakemake.output[0]
-
-
-tumor = tumor[["source_sequence_name","peptide","allele","affinity","percentile_rank"]]
-tumor = tumor.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index()
-tumor.columns = tumor.columns.map("-".join)
-tumor = tumor.rename(columns={col: col.replace("-","") for col in tumor.columns if col.endswith("-")})
-
-normal = normal[["source_sequence_name","peptide","allele","affinity","percentile_rank"]]
-normal = normal.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index()
-normal.columns = normal.columns.map("-".join)
-normal = normal.rename(columns={col: col.replace("-","") for col in normal.columns if col.endswith("-")})
-
-merged = tumor.merge(normal, how="left", on=["source_sequence_name"])
-
-merged = merged.rename(columns={col: col.replace("_y","_normal") for col in merged.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged.columns})
-## add info
-info = info.rename(columns={"id":"ID","gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"})
-merged_dataframe = merged.merge(info, how="left", left_on="source_sequence_name", right_on="ID")
-
-merged_dataframe["peptide_tumor"]=merged_dataframe[["peptide_tumor","peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1)
-## Are all possible variants in the peptide ("Cis") or not ("Trans")
-merged_dataframe["Variant_Orientation"] = "Cis"
-trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar
-merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans"
-
-## check misssense/silent mutation status
-nonsilent = merged_dataframe.peptide_tumor != merged_dataframe.peptide_normal
-merged_dataframe = merged_dataframe[nonsilent]
-data = merged_dataframe.drop_duplicates(subset=["Gene_ID","offset","peptide_tumor","Somatic_AminoAcid_Change"])
-
-### Delete Stop-Codon including peptides
-data = data[data.peptide_tumor.str.count("x") == 0]
-data = data[data.peptide_tumor.str.count("X") == 0]
-data.to_csv(outfile, index=False, sep = '\t')
-
-
-
-
-
diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
new file mode 100644
index 00000000..669a7716
--- /dev/null
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -0,0 +1,295 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+from typing import List, Tuple
+
+
+def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
+ df = df.set_index(["id", "pos_in_id_seq", "pep_seq", "allele"])
+ rank_col = f"{rank_type}_rank"
+ score_col = f"{rank_type}_score"
+ prefix = f"top_{rank_col}_"
+ columns_to_keep = ["bind_core", rank_col, score_col]
+ return (
+ df.loc[df.groupby(["pep_seq", "id"])[rank_col].idxmin(), columns_to_keep]
+ .reset_index(level="allele")
+ .add_prefix(prefix)
+ .reset_index()
+ .drop_duplicates()
+ )
+
+
+def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
+ merge_cols = ["id", "pos_in_id_seq", "pep_seq"]
+ common_info = sample.loc[
+ :, merge_cols + ["ave_el_score", "num_binders"]
+ ].drop_duplicates()
+ sample_el = get_best_rank_per_peptide(sample, "el")
+ sample_ba = get_best_rank_per_peptide(sample, "ba")
+ sample_filtered = sample_el.merge(sample_ba, on=merge_cols)
+ return sample_filtered.merge(common_info, how="left", on=merge_cols).assign(
+ alias=alias
+ )
+
+
+def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
+ """
+ Highlight the difference between mutated neopeptide and normal peptide
+ """
+ if normal_p == "nan" or normal_p == "":
+ return (tumor_p, normal_p)
+ assert len(tumor_p) == len(
+ normal_p
+ ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths."
+ diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
+ tp_changed = tumor_p
+ np_changed = normal_p
+ for p in diff_pos:
+ tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :]
+ np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :]
+ return (tp_changed, np_changed)
+
+
+def diff_tumor_normal_peptides(
+ group: pd.DataFrame, column: str, tumor_alias: str
+) -> pd.DataFrame:
+ group = group.copy()
+ normal_pep = group.loc[group["alias"] == "normal", column].fillna("")
+ if normal_pep.empty:
+ normal_pep = ""
+ else:
+ normal_pep = normal_pep.squeeze()
+ tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze()
+ # Remove groups where the tumor peptide contains a stop codon.
+ # TODO: Maybe this should be a hard fail complaining to fix this upstream?
+ if "X" in tumor_pep:
+ print(f"Warning: ", file=sys.stderr)
+ return group.loc[[], :]
+ # Silent mutations should not be included in microphaser output.
+ if normal_pep == tumor_pep:
+ raise ValueError(
+ f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n"
+ "Please fix this upstream or comment out this check to ignore this problem.\n"
+ )
+ t_diff, n_diff = highlight_peptides_diff(tumor_pep, normal_pep)
+ group.loc[group["alias"] == tumor_alias, column] = t_diff
+ group.loc[group["alias"] == "normal", column] = n_diff
+ return group
+
+
+def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
+ """
+ Get the -o info output of the microphaser filter command into tidy data format.
+ """
+ info = info.rename(columns={"credible_interval": "freq_credible_interval"})
+ # Aggregate multiple identical entries that differ only in 'id' and 'transcript'
+ # into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated
+ # list.
+ cols = [c for c in info.columns if c not in ["id", "transcript", "depth", "freq", "freq_credible_interval"]]
+ aggregation_functions = {
+ "id": lambda i: list(i),
+ "transcript": lambda t: "|".join(list(t)),
+ "depth": lambda d: "|".join(list(d)),
+ "freq": lambda f: "|".join(list(f)),
+ "freq_credible_interval": lambda c: "|".join(list(c)),
+ }
+ info = (
+ info.groupby(cols, dropna=False)
+ .agg(aggregation_functions)
+ .reset_index()
+ .explode("id")
+ .set_index(
+ [
+ "id",
+ "transcript",
+ "gene_id",
+ "gene_name",
+ "chrom",
+ "offset",
+ "frame",
+ "freq",
+ "freq_credible_interval",
+ "depth",
+ "strand",
+ ]
+ )
+ )
+ int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"]
+ info[int_cols] = info[int_cols].astype("int32")
+ # TODO: Ensure that microphaser output contains only one entry per id.
+ # If there is more than one entry per index, ensure that they are identical
+ if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0:
+ raise ValueError(
+ f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n"
+ )
+ # Always take the first entry for each index.
+ info = info.groupby(info.index).head(1)
+ # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal.
+ # TODO: factor out tidying of column pairs into a function.
+ num_var_in_pep_tidy = (
+ info.assign(ngermline=lambda x: x.nvar - x.nsomatic)
+ .melt(
+ var_name="alias",
+ value_name="num_var_in_pep",
+ value_vars=["ngermline", "nsomatic"],
+ ignore_index=False,
+ )
+ .replace({"ngermline": "normal", "nsomatic": tumor_alias})
+ .set_index("alias", append=True)
+ )
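+ # e.g. nvar=3 and nsomatic=1 yield one row (alias="normal", num_var_in_pep=2)
+ # and one row (alias=tumor_alias, num_var_in_pep=1)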
+ num_var_sites_tidy = (
+ info.assign(ngermvariant_sites=lambda x: x.nvariant_sites - x.nsomvariant_sites)
+ .melt(
+ var_name="alias",
+ value_name="num_var_sites",
+ value_vars=["ngermvariant_sites", "nsomvariant_sites"],
+ ignore_index=False,
+ )
+ .replace({"ngermvariant_sites": "normal", "nsomvariant_sites": tumor_alias})
+ .set_index("alias", append=True)
+ )
+ genomic_pos_tidy = (
+ info.melt(
+ var_name="alias",
+ value_name="genomic_pos",
+ value_vars=["somatic_positions", "germline_positions"],
+ ignore_index=False,
+ )
+ .replace({"somatic_positions": tumor_alias, "germline_positions": "normal"})
+ .set_index("alias", append=True)
+ )
+ aa_changes_tidy = (
+ info.melt(
+ var_name="alias",
+ value_name="aa_changes",
+ value_vars=["somatic_aa_change", "germline_aa_change"],
+ ignore_index=False,
+ )
+ .replace({"somatic_aa_change": tumor_alias, "germline_aa_change": "normal"})
+ .set_index("alias", append=True)
+ )
+ nt_seq_tidy = (
+ info.melt(
+ var_name="alias",
+ value_name="nt_seq",
+ value_vars=["normal_sequence", "mutant_sequence"],
+ ignore_index=False,
+ )
+ .replace({"normal_sequence": "normal", "mutant_sequence": tumor_alias})
+ .set_index("alias", append=True)
+ )
+ all_tidy = num_var_in_pep_tidy.join(
+ [num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy]
+ )
+ return all_tidy.reset_index()
+
+
+def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = ""):
+ if sum(df.duplicated(subset=cols)) > 0:
+ duplicates = df[
+ df.duplicated(
+ subset=cols,
+ keep=False,
+ )
+ ]
+ cols_str = '", "'.join(cols)
+ raise ValueError(
+ f'Found multiple rows with identical [ "{cols_str}" ] entries.\n'
+ "This indicates an upstream issue, please fix this.\n"
+ f"{specific_error}"
+ "The offending entries are:\n"
+ f"{duplicates}\n"
+ )
+
+
+def merge_data_frames(
+ info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str
+) -> pd.DataFrame:
+ # get and merge tumor and normal
+ tumor_filtered = get_filtered_per_alias(tumor, tumor_alias)
+ normal_filtered = get_filtered_per_alias(normal, "normal")
+ all_filtered = (
+ pd.concat([tumor_filtered, normal_filtered])
+ .groupby("id", group_keys=False)
+ .apply(diff_tumor_normal_peptides, column="pep_seq", tumor_alias=tumor_alias)
+ )
+ info_tidy = tidy_info(info, tumor_alias)
+
+ # netMHCpan 4.1 truncates the fasta entry IDs, so we have to cut down the IDs
+ # that microphaser originally provided to make the following .merge() work
+ len_tumor_id = len(tumor_filtered["id"][0])
+ len_normal_id = len(normal_filtered["id"][0])
+ assert (
+ len_tumor_id == len_normal_id
+ ), f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n"
+ info_tidy["id"] = info_tidy["id"].str[:len_tumor_id]
+ # Double-check for duplicates resulting from the id truncation
+ check_duplicates(
+ info_tidy,
+ ["id", "alias"],
+ specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n",
+ )
+
+ all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"])
+
+ # Double-check for weird duplicates, as previously done in Jan's code.
+ # Jan's code was only checking for ["transcript", "offset", "pep_seq", "aa_changes"],
+ # we check for everything except id.
+ cols_without_id = [c for c in all_annotated.columns if c not in ["id"]]
+ all_annotated = all_annotated.drop_duplicates(subset=cols_without_id)
+ check_duplicates(all_annotated, cols_without_id)
+
+ column_order = [
+ "id",
+ "pep_seq",
+ "pos_in_id_seq",
+ "alias",
+ "num_binders",
+ "freq",
+ "depth",
+ "num_var_sites",
+ "num_var_in_pep",
+ "top_el_rank_allele",
+ "top_el_rank_bind_core",
+ "top_el_rank_el_rank",
+ "top_el_rank_el_score",
+ "ave_el_score",
+ "top_ba_rank_allele",
+ "top_ba_rank_bind_core",
+ "top_ba_rank_ba_rank",
+ "top_ba_rank_ba_score",
+ "aa_changes",
+ "genomic_pos",
+ "nt_seq",
+ "gene_name",
+ "gene_id",
+ "transcript",
+ "chrom",
+ "offset",
+ "frame",
+ "strand",
+ "freq_credible_interval",
+ ]
+
+ def get_id_rank(group: pd.DataFrame, tumor_alias: str):
+ return group.loc[group["alias"] == tumor_alias, "top_el_rank_el_rank"].squeeze()
+
+ sort_rank = (
+ all_annotated.groupby("id").apply(get_id_rank, tumor_alias).rename("sort_rank")
+ )
+ all_sorted = (
+ all_annotated.merge(sort_rank, on=["id"], how="left")
+ .sort_values(["sort_rank", "id", "alias"], ascending=[True, True, False])
+ .drop(columns="sort_rank")
+ )
+
+ return all_sorted.reindex(columns=column_order)
+
+
+info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str)
+tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str})
+normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str})
+data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias)
+data.to_csv(snakemake.output[0], index=False, sep="\t")
diff --git a/workflow/scripts/oncoprint.R b/workflow/scripts/oncoprint.R
deleted file mode 100644
index 5476b305..00000000
--- a/workflow/scripts/oncoprint.R
+++ /dev/null
@@ -1,56 +0,0 @@
-# log to file
-log <- file(snakemake@log[[1]], open="wt")
-sink(log)
-sink(log, type="message")
-
-library(ComplexHeatmap)
-library(ggplot2)
-
-table = read.table(snakemake@input[[1]], sep="\t", header=TRUE, row.names=1)
-mat = as.matrix(table)
-mat = t(mat)
-## remove "full" lines
-mat = mat[rowSums(mat == "") > 0,]
-## remove single lines
-mat = mat[rowSums(mat != "") > 0,]
-
-col = c(SNV = "blue", INDEL = "red")
-
-alter_fun = list(
- SNV = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.9,
- gp = gpar(fill = col["SNV"], col = NA)),
- INDEL = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.4,
- gp = gpar(fill = col["INDEL"], col = NA))
- )
-
-
-heatmap_legend_param = list(title = "Alterations", at = c("SNV", "INDEL"),
- labels = c("SNV", "INDEL"))
-if (ncol(mat) > 1 ){
- mat <- mat[order(apply(mat, 1, function(row) sum(row != "")), decreasing = T), ]
-}
-
-i = 0
-c = 0
-matlist = list()
-while (i + 2000 < nrow(mat)) {
- m <- mat[(i + 1):(i + 2000), , drop=FALSE]
- rows_matrix <- nrow(m)
- height_plot <- (rows_matrix/5)
- if (height_plot < 4) {
- height_plot <- 4
- }
- pdf(file = sub("0.pdf", paste(c, ".pdf", sep=''), snakemake@output[[1]]), height=height_plot)
- if (rows_matrix > 0) {
- oncoprint <- oncoPrint(m,
- alter_fun = alter_fun, col = col,
- remove_empty_columns = FALSE, remove_empty_rows = TRUE,
- pct_side = "right", row_names_side = "left",
- show_column_names=T,
- column_title = "OncoPrint", heatmap_legend_param = heatmap_legend_param)
- draw(oncoprint, newpage=F)
- }
- dev.off()
- i = i + 2000
- c = c + 1
-}
\ No newline at end of file
diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py
deleted file mode 100644
index ff1906f3..00000000
--- a/workflow/scripts/parse_HLA_types.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pandas as pd
-
-hlaI = ["A","B","C"]
-
-hlaII = ["DRB1", "DPA1", "DPB1", "DQA1", "DQB1"]
-
-hlas = pd.read_csv(snakemake.input[0], sep='\t')
-
-hlasI = hlas[hlas.Locus.isin(hlaI)]
-hlasI["Allele"]="HLA-" + hlasI.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','')
-hlasI = hlasI[["Allele"]].drop_duplicates()
-hlasI.to_csv(snakemake.output[0], sep='\t', index=False)
-
-hlasII = hlas[hlas.Locus.isin(hlaII)]
-hlasII["HLA"] = hlasII.Locus.str[0:2]
-hlasII["Allele"] = hlasII.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','')
-
-hlasII = pd.DataFrame("HLA-" + hlasII.groupby(["HLA","Chromosome"])["Allele"].apply(lambda x: "-".join(x)).reset_index()["Allele"]).drop_duplicates()
-hlasII.loc[hlasII.Allele.str.contains("DRB"), "Allele"] = hlasII[hlasII.Allele.str.contains("DRB")]["Allele"].str.replace("HLA-DRB1","DRB1_")
-hlasII.to_csv(snakemake.output[1], sep='\t', index=False)
diff --git a/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py
new file mode 100644
index 00000000..79c6994c
--- /dev/null
+++ b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py
@@ -0,0 +1,145 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+
+# read in available alleles
+
+
+def read_allele_list(filename: str):
+ with open(filename, "r") as alleles_in:
+ alleles = set()
+ for line in alleles_in:
+ if not (line.startswith("#") or line == "\n"):
+ alleles.add(line.strip())
+ return alleles
+
+
+HLA_SUFFIXES_REGEX = r"[NLSCAQ]?"
+
+# netMHCpan alleles and loci
+HLA_ONE_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_one_alleles)
+hla_one_net_mhc_alleles = pd.Series(list(HLA_ONE_NET_MHC_ALLELES))
+HLA_ONE_LOCI = set(
+ hla_one_net_mhc_alleles[hla_one_net_mhc_alleles.str.startswith("HLA-")]
+ .str.replace(r"HLA-([A-Z])[\d:]+" + HLA_SUFFIXES_REGEX, r"\1", regex=True)
+ .drop_duplicates()
+)
+
+
+# netMHCIIpan alleles and loci
+HLA_TWO_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_two_alleles)
+hla_two_net_mhc_alleles = pd.Series(list(HLA_TWO_NET_MHC_ALLELES))
+# DRB alleles need to be formatted differently from DP and DQ alleles,
+# so we extract them separately.
+DRB_LOCI = set(
+ hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("DRB")]
+ .str.replace(r"(DRB\d)_\d+" + HLA_SUFFIXES_REGEX, r"\1", regex=True)
+ .drop_duplicates()
+)
+
+ALPHA_BETA_LOCI = set(
+ hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("HLA-")]
+ .str.replace(
+ r"HLA-(D[A-Z]A\d)\d+"
+ + HLA_SUFFIXES_REGEX
+ + r"-(D[A-Z]B\d)\d+"
+ + HLA_SUFFIXES_REGEX,
+ r"\1_\2",
+ regex=True,
+ )
+ .str.split("_")
+ .explode()
+ .drop_duplicates()
+)
+
+# read in alleles as determined by HLA-LA
+hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep="\t")
+
+# the Allele column can contain multiple ";"-separated entries for the
+# same locus
+hlas.loc[:, "Allele"] = hlas["Allele"].str.split(pat=";")
+hlas["alternative"] = hlas["Allele"].apply(lambda x: range(len(x)))
+hlas = hlas.explode(["Allele", "alternative"])
+
+# reformat to netMHCpan allele list format:
+# * https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/allele.list
+# it needs to be in the format of the first column of the above list, as explained in
+# the "Instructions" tab under "MHC SELECTION" point "2)" at:
+# * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+hla_one_alleles = (
+ hlas.loc[hlas["Locus"].isin(HLA_ONE_LOCI), "Allele"]
+ .str.replace(
+ r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+ r"HLA-\1\2:\3\5",
+ regex=True,
+ )
+ .drop_duplicates()
+)
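+# e.g. "A*02:01:01G" as reported by HLA-LA becomes "HLA-A02:01" (illustrative)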
+hla_one_alleles[hla_one_alleles.isin(HLA_ONE_NET_MHC_ALLELES)].to_csv(
+ snakemake.output.hlaI, sep="\t", index=False, header=False
+)
+
+# reformat to netMHCIIpan allele list format:
+# * https://services.healthtech.dtu.dk/services/netMHCIIpan-4.1/alleles_name.list
+# contrary to the format in that list, alleles actually need to be formatted like this,
+# with <Locus>s as found in the HLA-LA "Locus" column and the syntax for the sub-numbering (only
+# the 1st and 2nd sub-number are used) according to the official nomenclature (see:
+# http://www.hla.alleles.org/nomenclature/naming.html ):
+# * DRB alleles: "_"
+# * DP and DQ alleles (alpha means A and beta means B in the gene name, for example DPA):
+# "HLA--"
+# This format was determined by manually selecting combinations above the
+# "type a list of molecules names" field of the "Submission" tab at:
+# * https://services.healthtech.dtu.dk/service.php?netMHCIIpan-4.1
+
+# TODO: check whether Jan's previous parsing of DRB alleles into this format is necessary:
+# * example: DRB1_1501-DRB30101-DRB40301N
+# * "DRB1_-DRB3-DRB4"
+drb_alleles = hlas.loc[hlas["Locus"].isin(DRB_LOCI)]
+hla_two_alleles = (
+ drb_alleles["Allele"]
+ .str.replace(
+ r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+ r"\1_\2\3\5",
+ regex=True,
+ )
+ .drop_duplicates()
+)
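+# e.g. "DRB1*15:01" becomes "DRB1_1501" (illustrative)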
+
+# handle alleles where a combination of alpha and beta always exists
+alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA_LOCI)]
+alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace(
+ r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+ r"\1\2\3\5",
+ regex=True,
+)
+# we need a variable to group alpha and beta of the same gene combination together
+alpha_beta_alleles.loc[:, "gene_group"] = alpha_beta_alleles["Locus"].str.replace(
+ r"(D[A-Z])[AB]\d", r"\1", regex=True
+)
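+# e.g. both "DPA1" and "DPB1" map to the gene_group "DP"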
+# we need to handle cases where we had multiple allele entries in an
+# alpha or beta locus, adding in a duplicate of the corresponding locus
+alleles_to_duplicate = alpha_beta_alleles.loc[
+ alpha_beta_alleles["alternative"] > 0,
+ ["Locus", "Chromosome", "alternative"],
+].replace(regex={"Locus": {"(D[A-Z])A(\d)": r"\1B\2", "(D[A-Z])B(\d)": r"\1A\2"}})
+alleles_to_insert = alleles_to_duplicate.merge(
+ alpha_beta_alleles.drop("alternative", axis="columns"),
+ on=["Locus", "Chromosome"],
+ how="left",
+)
+alpha_beta_alleles = pd.concat(
+ [alpha_beta_alleles, alleles_to_insert]
+).drop_duplicates()
+
+hla_two_alleles = pd.concat(
+ [
+ hla_two_alleles,
+ alpha_beta_alleles.groupby(["gene_group", "Chromosome", "alternative"])["Allele"]
+ .transform(lambda x: f"HLA-{'-'.join(x)}")
+ .drop_duplicates(),
+ ]
+)
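+# e.g. "DPA10103" and "DPB10401" on the same chromosome combine into "HLA-DPA10103-DPB10401"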
+
+hla_two_alleles[hla_two_alleles.isin(HLA_TWO_NET_MHC_ALLELES)].to_csv(
+ snakemake.output.hlaII, sep="\t", index=False, header=False
+)
diff --git a/workflow/scripts/phylogeny.R b/workflow/scripts/phylogeny.R
index 7f26becf..9e32c131 100644
--- a/workflow/scripts/phylogeny.R
+++ b/workflow/scripts/phylogeny.R
@@ -1,3 +1,7 @@
+log <- file(snakemake@log[[1]], open="wt")
+sink(log)
+sink(log, type="message")
+
library(phangorn)
## read the variant matrix
diff --git a/workflow/scripts/prepare_neoprint.py b/workflow/scripts/prepare_neoprint.py
new file mode 100644
index 00000000..0a535d14
--- /dev/null
+++ b/workflow/scripts/prepare_neoprint.py
@@ -0,0 +1,88 @@
+import sys
+import json
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+from typing import Tuple
+
+def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
+ """
+ Highlight the difference between mutated neopeptide and normal peptide
+ """
+ if normal_p == "nan" or normal_p == "NA" or normal_p == "":
+ return (tumor_p, normal_p)
+ assert len(tumor_p) == len(
+ normal_p
+ ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths."
+ diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
+ tp_changed = tumor_p
+ np_changed = normal_p
+ for p in diff_pos:
+ tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :]
+ np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :]
+ return (tp_changed, np_changed)
+
+all_neopeptides = pd.read_csv(snakemake.input.neopeptides, sep="\t")
+
+# If we leave in any dots, datavzrd will interpret this as attributes
+all_neopeptides.columns = all_neopeptides.columns.str.replace(".", "_", regex=False)
+
+# Aggregate multiple identical entries that differ only in 'id' and 'transcript'
+# into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated
+# list.
+# TODO: Remove this redundancy from microphaser output before passing it along to other
+# tools.
+cols = [c for c in all_neopeptides.columns if c not in ["id", "transcript"]]
+aggregation_functions = {
+ "id": lambda i: list(i)[0],
+ "transcript": lambda t: "|".join(list(t)),
+}
+all_neopeptides = (
+ all_neopeptides.groupby(cols, dropna=False)
+ .agg(aggregation_functions)
+ .reset_index()
+ .explode("id")
+)
+
+# highlight mutations in the original Xmers
+all_neopeptides[["mutation_mutatedXmer", "mutation_wildTypeXmer"]] = (
+ pd.DataFrame(
+ all_neopeptides
+ .fillna({"mutation_wildTypeXmer": ""})
+ .apply(lambda row: highlight_peptides_diff(row["mutation_mutatedXmer"], row["mutation_wildTypeXmer"]), axis="columns")
+ .tolist()
+ )
+)
+
+# create new purity-adjusted DNA variant allele frequency
+all_neopeptides["purity_adjusted_DNA_VAF"] = all_neopeptides["dnaVariantAlleleFrequency"] / snakemake.params.purity
+
+# round all floats to the specified decimals
+all_neopeptides = all_neopeptides.round(decimals=5)
+
+# define important columns to move to the left of the table
+
+neofox_important_cols = snakemake.params.neofox_important_cols
+
+important_cols = neofox_important_cols["general"] + neofox_important_cols["I"] + neofox_important_cols["II"]
+
+
+mhc_one = (
+ all_neopeptides[important_cols + [col for col in all_neopeptides.columns if col not in important_cols]]
+ .sort_values(by=["PRIME_best_rank", "MixMHC2pred_best_rank"])
+ .groupby("PRIME_best_rank")
+ .head(n=1)
+)
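+# i.e. after sorting, keep only the first row per distinct PRIME_best_rank value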
+
+mhc_one.to_csv(snakemake.output.mhc_one, sep="\t", index=False)
+
+# move important columns to the left of the table
+mhc_two = (
+ all_neopeptides[neofox_important_cols["general"] + neofox_important_cols["II"] + neofox_important_cols["I"] + [col for col in all_neopeptides.columns if col not in important_cols]]
+ .sort_values(by=["MixMHC2pred_best_rank", "PRIME_best_rank"])
+ .groupby("MixMHC2pred_best_rank")
+ .head(n=1)
+)
+
+mhc_two.to_csv(snakemake.output.mhc_two, sep="\t", index=False)
diff --git a/workflow/scripts/render-scenario.py b/workflow/scripts/render-scenario.py
deleted file mode 100644
index 692b0ee5..00000000
--- a/workflow/scripts/render-scenario.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from jinja2 import Template
-import pandas as pd
-
-with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out:
- samples = snakemake.params.samples
- out.write(Template(template.read()).render(
- samples=samples[(samples["sample"] == snakemake.wildcards.pair) | (samples["sample"] == samples.loc[snakemake.wildcards.pair, "matched_normal"])]
- ))
diff --git a/workflow/scripts/sample_comp_plot.py b/workflow/scripts/sample_comp_plot.py
index 3c7c287d..ffbd7aaa 100644
--- a/workflow/scripts/sample_comp_plot.py
+++ b/workflow/scripts/sample_comp_plot.py
@@ -1,3 +1,7 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
import os
import glob
import pandas as pd
@@ -6,8 +10,10 @@
import matplotlib.pyplot as plt
from pysam import VariantFile
-variant_df = pd.read_csv(snakemake.input[0], sep='\t').fillna(0.0)
-variant_df = variant_df[["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")]]
+variant_df = pd.read_csv(snakemake.input[0], sep="\t").fillna(0.0)
+variant_df = variant_df[
+ ["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")]
+]
## tidy data - for facet plot
tidy_df = variant_df.melt(id_vars=["CHROM", "POS"], var_name="Sample", value_name="VAF")
g = sns.FacetGrid(tidy_df, col="Sample")
@@ -15,25 +21,27 @@
g.savefig(snakemake.output["facet"])
plt.close()
## pairplot
-sns.pairplot(variant_df.drop(["CHROM", "POS"],axis=1), diag_kind="kde")
+sns.pairplot(variant_df.drop(["CHROM", "POS"], axis=1), diag_kind="kde")
plt.savefig(snakemake.output["pairplot"])
plt.close()
+
def overlap_pct(x, y, **kws):
n = 0
- for i in range(0,len(x)):
+ for i in range(0, len(x)):
if (x[i] > 0) & (y[i] > 0):
n += 1
- overlap=n/len([e for e in x if e > 0])
+ overlap = n / len([e for e in x if e > 0])
ax = plt.gca()
- ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4))
- ax.annotate("Shared Variants: {}".format(n), xy=(.2, .6))
+ ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(0.2, 0.4))
+ ax.annotate("Shared Variants: {}".format(n), xy=(0.2, 0.6))
+
def variants(x, **kws):
positive = len([e for e in x if e > 0])
- ax = plt.gca()
- ax.annotate("#Variants: {}".format(positive), xy=(.2, .5),
-xycoords=ax.transAxes)
+ ax = plt.gca()
+ ax.annotate("#Variants: {}".format(positive), xy=(0.2, 0.5), xycoords=ax.transAxes)
+
g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1))
@@ -42,7 +50,7 @@ def variants(x, **kws):
g.savefig(snakemake.output["grid"])
plt.close()
-#def neg_overlap_pct(x, y, **kws):
+# def neg_overlap_pct(x, y, **kws):
# n = 0
# for i in range(0,len(x)):
# if (x[i] == 0) & (y[i] == 0):
@@ -52,25 +60,24 @@ def variants(x, **kws):
# ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4))
# ax.annotate("Shared missing variants: {}".format(n), xy=(.2, .6))
-#def neg_variants(x, **kws):
+# def neg_variants(x, **kws):
# zero = len([e for e in x if e == 0])
-# ax = plt.gca()
+# ax = plt.gca()
# ax.annotate("#Missing Variants: {}".format(zero), xy=(.2, .5),
-#xycoords=ax.transAxes)
+# xycoords=ax.transAxes)
-#g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1))
+# g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1))
-#g.map_offdiag(neg_overlap_pct)
-#g.map_diag(neg_variants)
-#g.savefig("plots/Mssing_Variant_table.pdf")
-#plt.close()
+# g.map_offdiag(neg_overlap_pct)
+# g.map_diag(neg_variants)
+# g.savefig("plots/Mssing_Variant_table.pdf")
+# plt.close()
-for c in variant_df.drop(["CHROM", "POS"], axis=1).columns:
- sns.distplot(variant_df[[c]][variant_df[c] > 0])
- plt.title(c)
- plt.savefig("plots/positive_" + c +".distplot.pdf")
+for c in variant_df.drop(["CHROM", "POS"], axis=1).columns:
+ sns.distplot(variant_df[[c]][variant_df[c] > 0])
+ plt.title(c)
+ plt.savefig("plots/positive_" + c + ".distplot.pdf")
plt.close()
sns.distplot(variant_df[[c]])
- plt.savefig("plots/all_" + c +".distplot.pdf")
+ plt.savefig("plots/all_" + c + ".distplot.pdf")
plt.close()
-
diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py
new file mode 100644
index 00000000..fa129c0a
--- /dev/null
+++ b/workflow/scripts/tidy_mhc_output.py
@@ -0,0 +1,125 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import os
+import pandas as pd
+
+from itertools import cycle
+
+# assumptions of this script about netMHCpan or netMHCIIpan:
+# * version 4.1
+# * output generated via `-xls` option
+# * generated with the `-BA` option to include binding affinity prediction
+
+# The mapping of index column names used here to original names in netMHCpan files
+# is (please excuse the pd.NA tuples, they make header and index handling
+# easier further down the line):
+INDEX_NAMES = {
+ (pd.NA, "Pos"): "pos_in_id_seq",
+ (pd.NA, "Peptide"): "pep_seq",
+ (pd.NA, "ID"): "id",
+ (pd.NA, "Ave"): "ave_el_score",
+ (pd.NA, "NB"): "num_binders",
+}
+
+if snakemake.wildcards.mhc == "net_mhc_pan":
+ # The mapping of column names used here to original names in netMHCpan files is:
+ COLUMN_NAMES = {
+ "BA-score": "ba_score",
+ "BA_Rank": "ba_rank",
+ "EL-score": "el_score",
+ "EL_Rank": "el_rank",
+ "core": "bind_core",
+ }
+elif snakemake.wildcards.mhc == "net_mhc_two_pan":
+ # The mapping of column names used here to original names in netMHCIIpan files is:
+ COLUMN_NAMES = {
+ "Score_BA": "ba_score",
+ "Rank_BA": "ba_rank",
+ "Score": "el_score",
+ "Rank": "el_rank",
+ "Core": "bind_core",
+ }
+else:
+ sys.exit(f"Wildcard `mhc` has unknown value: {snakemake.wildcards.mhc}")
+
+COLUMNS_TO_DROP = {
+ # I have not found any docs or indication in the manuscript, what the column
+ # `Target` is. It might be the column `Exp_Bind` from Figure 1B here (it's
+ # the only column that only appears in netMHCIIpan output and not netMHCpan
+ # output):
+ # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7319546/figure/F1/
+ # If so, it is only for benchmarking purposes according to the docs. And it
+ # is always NA, even in their manuscript. So we simply remove it here.
+ "Target",
+ # There doesn't seem to be an `icore` equivalent in netMHCIIpan output.
+ "icore",
+ # There doesn't seem to be an `nM` equivalent in netMHCpan output.
+ "nM",
+}
+
+
+def parse_file(mhc_in: str):
+ """
+ Parse a netMHCpan or netMHCIIpan output file from the `-xls -xlsfile <filename>`
+ directive into a tidy pandas data frame.
+ """
+ if os.path.getsize(mhc_in) == 0:
+ # Short-circuit empty files, but generate correct header.
+ return pd.DataFrame(
+ columns=list(COLUMN_NAMES.values())
+ + ["allele"]
+ + list(INDEX_NAMES.values())
+ )
+
+ # We need to fix the utterly broken headers first
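+ # (illustrative sketch of the two header rows, as handled below:
+ # row 1 names the allele above each per-allele column block and is blank elsewhere,
+ # row 2 holds the per-column names, e.g. Pos, Peptide, ID, EL-score, ..., Ave, NB)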
+
+ # parse first header line into pandas.Series and name it
+ first_header_line = pd.read_csv(mhc_in, nrows=1, header=None, sep="\t").iloc[0, :]
+ first_header_line.name = "allele"
+
+ # parse second header line into pandas.Series and name it
+ second_header_line = pd.read_csv(
+ mhc_in, skiprows=1, nrows=1, header=None, sep="\t"
+ ).iloc[0, :]
+ second_header_line.name = "column_name"
+
+ header = pd.concat([first_header_line, second_header_line], axis="columns")
+ header = header.fillna(method="ffill")
+ header.loc[
+ header.column_name.isin([index_col for (_, index_col) in INDEX_NAMES.keys()]),
+ "allele",
+ ] = pd.NA
+
+ # It's a compound header over two rows and a compound row index in the initial
+ # three and final two columns of the table. For some reason, the final two
+ # columns are added to the index but not removed from the table, so we do this
+ # manually with `.iloc[]`.
+ data = pd.read_csv(mhc_in, sep="\t", skiprows=2, header=None)
+
+ data.columns = pd.MultiIndex.from_frame(header)
+
+ # remove columns only present in one of the two tools' output
+ columns_to_keep = [
+ col
+ for col in list(data.columns.get_level_values("column_name"))
+ if col not in COLUMNS_TO_DROP
+ ]
+ idx = pd.IndexSlice
+ data = data.loc[:, idx[:, columns_to_keep]]
+
+ # properly set row index columns (values to repeat in every row while doing the stack below)
+ data = data.set_index(list(INDEX_NAMES.keys()))
+ data.index.set_names(INDEX_NAMES, inplace=True)
+
+ # Turn into longer table with one HLA Allele per row instead of MultiIndex
+ # header, rename columns to something readable and turn index into columns.
+ data = data.stack(level="allele").rename(columns=COLUMN_NAMES).reset_index()
+
+ return data
+
+
+all_data = pd.concat((parse_file(f) for f in snakemake.input), axis="index")
+
+all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False)
diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py
index 28a24594..f99e77ec 100644
--- a/workflow/scripts/tsv_to_xlsx.py
+++ b/workflow/scripts/tsv_to_xlsx.py
@@ -1,7 +1,8 @@
import sys
-import pandas as pd
sys.stderr = open(snakemake.log[0], "w")
+import pandas as pd
+
data = pd.read_csv(snakemake.input.tsv, sep="\t")
-data.to_excel(snakemake.output.xlsx, index=False)
\ No newline at end of file
+data.to_excel(snakemake.output.xlsx, index=False)