diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 503d4a5e..364e02a4 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -8,33 +8,42 @@ on: jobs: + Formatting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Formatting + uses: github/super-linter@v4 + env: + VALIDATE_ALL_CODEBASE: false + DEFAULT_BRANCH: master + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VALIDATE_SNAKEMAKE_SNAKEFMT: true + + Linting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 - name: Lint workflow - uses: snakemake/snakemake-github-action@v1.17.0 + uses: snakemake/snakemake-github-action@v1.22.0 with: directory: . snakefile: workflow/Snakefile args: "--lint" - stagein: | - export TMPDIR=/tmp Testing: runs-on: ubuntu-latest - needs: Linting + needs: + - Linting + - Formatting steps: - - uses: actions/checkout@v1 - - name: Checkout submodules - uses: textbook/git-checkout-submodule-action@2.0.0 + - uses: actions/checkout@v2 - name: Test workflow (local FASTQs) - uses: snakemake/snakemake-github-action@v1.17.0 + uses: snakemake/snakemake-github-action@v1.22.0 with: directory: .test snakefile: workflow/Snakefile args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba" - stagein: | - export TMPDIR=/tmp diff --git a/.gitignore b/.gitignore index ee6ed52e..02a9591f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,23 +1,21 @@ * -!scripts -!scripts/* -!scripts/common -!scripts/common/* -scripts/.snakemake* -!Snakefile -!config.yaml -!samples.tsv -!resources -!resources/* -!envs -!envs/* -!environment.yaml +!.github/workflows/* +!.test +!.test/* +!.test/config/* +!.test/data/* +!config +!config/* +!config/HLA_Data/* +!workflow +!workflow/* +!workflow/report/* +!workflow/envs/* +!workflow/schemas/* +!workflow/scripts/* +!workflow/rules/* +!workflow/rules/annotation/* +!workflow/resources/* +!.gitignore !LICENSE !README.md -!rules -!rules/* -!.gitignore -!.editorconfig -!.gitattributes -!.test -!.test/data diff --git a/.test/config/config.yaml b/.test/config/config.yaml index d10fc559..7eb854e1 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,84 +1,10 @@ samples: "config/samples.tsv" units: "config/units.tsv" -# boolean if read trimming should be skipped -trimming: - activate: false -remove_duplicates: +neoantigen_prediction: activate: true -calling: - freebayes: - activate: false - # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling - scenario: config/scenario.yaml - filter: - # Filter candidate variants (this filter helps to keep the number of evaluated candidates small). - # It should ideally generate a superset of all other filters defined below. - # Annotation of candidate variants tries to be as fast as possible, only using VEP - # default parameters. - candidates: "" - # Add any number of named filters here. They will be applied independenty, - # and can be referred in FDR control below to generate calls for different events. - # In particular, you can also filter by ID or dbsnp annotations here. 
- # See http://snpeff.sourceforge.net/SnpSift.html#filter - filtername: "ANN['IMPACT'] != 'MODIFIER'" - fdr-control: - threshold: 0.05 - events: - complete: - varlociraptor: - - "somatic" - - "germline" - somatic: - varlociraptor: - - "somatic" - germline: - varlociraptor: - - "germline" - -fusion: - arriba: - activate: false - blacklist: - "arriba_blacklist" - params: - "-T -P" - -tmb: - activate: false - coding_genome_size: 3e7 - # Name of the tumor sample in the scenario.yaml. - tumor_sample: tumor - somatic_events: - - somatic - - -epitope_prediction: - activate: false - - -affinity: - netMHCpan: - activate: false - params: "-BA -l 9 -s -xls" - location: "../netMHCpan-4.0" - netMHCIIpan: - activate: false - params: "-length 15 -s -xls" - location: "../netMHCIIpan-4.0" - - -HLAtyping: - # activate to use razers3 to pre-filter reads before using optitype - optitype_prefiltering: - activate: false - optitype_data: "config/HLA_Data/hla_reference_dna.fasta" - # activate to predict MHC-I and MHC-II alleles with HLA-LA - HLA_LA: - activate: false - ref: # Number of chromosomes to consider for calling. @@ -87,49 +13,42 @@ ref: # Ensembl species name species: homo_sapiens # Ensembl release - release: 100 + release: 108 # Genome build build: GRCh38 -annotations: - vep: - params: "--everything" - plugins: - # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt". - - LoFtool - params: - cutadapt: "" - bwa: - "-M" - picard: - MarkDuplicates: - "VALIDATION_STRINGENCY=LENIENT" - gatk: - BaseRecalibrator: "--tmp-dir tmp" - applyBQSR: "" - strelka: - config: - "--exome" - run: - "--mode local" - razers3: - "-i 95 -m 1 -dr 0" - optitype: - "" microphaser: - window_len: - 33 - peptide_len: - netMHCpan: - 9 - netMHCIIpan: - 15 - kallisto: - "-b 100" - star: >- - --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip - --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0 - --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3 + events: + tumor: "tumor_only" + normal: "normal_only" + net_mhc_pan: + activate: true + peptide_len: 9 + extra: "" + # Please download netMHCpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + # To make the `netMHCpan` script work, you need to fix its first line + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCpan-4.1" + net_mhc_two_pan: + activate: false + peptide_len: 15 + extra: "" + # Please download netMHCIIpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 + # To make the `netMHCIIpan` script work, you need to fix its first line + # in addition to the other edits described for a complete install.
To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCIIpan-4.1" + neo_fox: + activate: false + # This should be at least as long as the desired net_mhc_two_pan peptide length + peptide_len: 15 + # update the version number to get a newer release of the reference set of HLA Alleles + hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt" + extra: "" diff --git a/.test/config/groups.tsv b/.test/config/groups.tsv new file mode 100644 index 00000000..c0a77bbd --- /dev/null +++ b/.test/config/groups.tsv @@ -0,0 +1,2 @@ +group tumorType +A LUSC diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv index 044002d7..2aca82dc 100644 --- a/.test/config/samples.tsv +++ b/.test/config/samples.tsv @@ -1,3 +1,3 @@ -sample type matched_normal purity platform -A_normal normal ILLUMINA -A_tumor tumor A_normal 1 ILLUMINA +sample_name group alias purity platform +A_normal A normal 1 ILLUMINA +A_tumor A tumor .99 ILLUMINA diff --git a/.test/config/units.tsv b/.test/config/units.tsv index cdb8af8e..4984edd8 100644 --- a/.test/config/units.tsv +++ b/.test/config/units.tsv @@ -1,3 +1,3 @@ -sample sequencing_type unit fq1 fq2 sra adapters +sample_name sequencing_type unit_name fq1 fq2 sra adapters A_normal DNA lane1 data/reads/A_normal.1.fastq.gz data/reads/A_normal.2.fastq.gz A_tumor DNA lane1 data/reads/A_tumor.1.fastq.gz data/reads/A_tumor.2.fastq.gz diff --git a/config/config.yaml b/config/config.yaml index f1e6818a..e00ae4d9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,73 +1,11 @@ samples: "config/samples.tsv" units: "config/units.tsv" +groups: "config/groups.tsv" -# boolean if read trimming should be skipped -trimming: - activate: false -remove_duplicates: +neoantigen_prediction: activate: true -calling: - freebayes: - activate: false - # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling - scenario: config/scenario.yaml - fdr-control: - threshold: 0.05 - events: - complete: - varlociraptor: - - "somatic" - - "germline" - somatic: - varlociraptor: - - "somatic" - germline: - varlociraptor: - - "germline" - -fusion: - arriba: - activate: false - blacklist: - "arriba_blacklist" - params: - "-T -P" - -tmb: - activate: false - coding_genome_size: 3e7 - # Name of the tumor sample in the scenario.yaml. - tumor_sample: tumor - somatic_events: - - somatic - - -epitope_prediction: - activate: true - - -affinity: - netMHCpan: - activate: true - params: "-BA -l 9 -s -xls" - location: "../netMHCpan-4.0" - netMHCIIpan: - activate: false - params: "-length 15 -s -xls" - location: "../netMHCIIpan-4.0" - - -HLAtyping: - # activate to use razers3 to pre-filter reads before using optitype - optitype_prefiltering: - activate: false - optitype_data: "config/HLA_Data/hla_reference_dna.fasta" - # activate to predict MHC-I and MHC-II alleles with HLA-LA - HLA_LA: - activate: false - ref: # Number of chromosomes to consider for calling. @@ -76,49 +14,44 @@ ref: # Ensembl species name species: homo_sapiens # Ensembl release - release: 100 + release: 108 # Genome build build: GRCh38 -annotations: - vep: - params: "--everything" - plugins: - # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt". 
- - LoFtool - params: - cutadapt: "" - bwa: - "-M" - picard: - MarkDuplicates: - "VALIDATION_STRINGENCY=LENIENT" - gatk: - BaseRecalibrator: "--tmp-dir tmp" - applyBQSR: "" - strelka: - config: - "--exome" - run: - "--mode local" - razers3: - "-i 95 -m 1 -dr 0" - optitype: - "" microphaser: - window_len: - 33 - peptide_len: - netMHCpan: - 9 - netMHCIIpan: - 15 - kallisto: - "-b 100" - star: >- - --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip - --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0 - --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3 + # window_len should be at least 3 times the longest peptide_len specified below + events: + normal: "normal_only" + tumor: "tumor_only" + net_mhc_pan: + activate: true + peptide_len: 9 + extra: "" + # Please download netMHCpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + # To make the `netMHCpan` script work, you need to fix its first line + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCpan-4.1" + net_mhc_two_pan: + activate: false + peptide_len: 15 + extra: "" + # Please download netMHCIIpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 + # To make the `netMHCIIpan` script work, you need to fix its first line + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCIIpan-4.1" + neo_fox: + activate: false + # This should be at least as long as the desired net_mhc_two_pan peptide length + peptide_len: 15 + # update the version number to get a newer release of the reference set of HLA Alleles + hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt" + extra: "" + diff --git a/config/groups.tsv b/config/groups.tsv new file mode 100644 index 00000000..11353453 --- /dev/null +++ b/config/groups.tsv @@ -0,0 +1,3 @@ +group tumorType +A LUSC +B LUAD diff --git a/config/samples.tsv b/config/samples.tsv index 6e291f15..249e0521 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1,4 +1,5 @@ -sample type matched_normal purity platform -A_normal normal ILLUMINA -A_tumor tumor A_normal 1 ILLUMINA -B_tumor tumor A_normal 1 ILLUMINA +sample_name group alias purity platform +A_normal A normal 1 ILLUMINA +A_tumor A tumor .99 ILLUMINA +B_normal B normal 1 ILLUMINA +B_tumor B tumor .98 ILLUMINA diff --git a/config/units.tsv b/config/units.tsv index f8758dfb..c417a45c 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1,5 +1,6 @@ -sample sequencing_type unit fq1 fq2 sra adapters +sample_name sequencing_type unit_name fq1 fq2 sra adapters A_normal DNA lane1 A_normal_1.fastq.gz A_normal_2.fastq.gz A_tumor DNA lane2 A_tumor_1.fastq.gz A_tumor_2.fastq.gz +B_normal DNA lane1 B_normal_1.fastq.gz B_normal_2.fastq.gz B_tumor DNA lane1 B_tumor_1.fastq.gz B_tumor_2.fastq.gz B_tumor RNA lane1 B_tumor_RNA_1.fastq.gz B_tumor_RNA_2.fastq.gz diff --git a/workflow/Snakefile b/workflow/Snakefile index 2b881b44..7282e12f 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -12,6 +12,23 @@ scattergather: calling=24, + +##### required envvars ##### + + +envvars: + # For NeoFox installation: + # The tarballs for both
netMHCpan and netMHCIIpan + # need to be downloaded manually after registering + # online and deposited at a filesystem location + # whose path must be provided via a shell environment + # variable when running snakemake. + # For downloading, see the "Downloads" tab at: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + "NET_MHC_PAN_4_1_TARBALL", + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.0 + "NET_MHC_TWO_PAN_4_0_TARBALL", + + ##### setup report ##### @@ -27,24 +44,14 @@ container: "docker://continuumio/miniconda3" include: "rules/common.smk" include: "rules/utils.smk" -include: "rules/trim.smk" include: "rules/ref.smk" -include: "rules/mapping.smk" -include: "rules/calling.smk" -include: "rules/candidate_calling.smk" -include: "rules/varlociraptor.smk" -include: "rules/annotation.smk" -include: "rules/filtering.smk" include: "rules/microphaser.smk" -include: "rules/HLAtyping.smk" -include: "rules/MHC_binding.smk" -include: "rules/RNA.smk" -include: "rules/tmb.smk" -include: "rules/vega.smk" +include: "rules/hla_typing.smk" +include: "rules/mhc_binding.smk" +include: "rules/annotate_neoantigens.smk" +include: "rules/datavzrd.smk" rule all: input: get_final_output(), - get_fusion_output(), - get_tmb_targets(), diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml index 70e39da1..35bf7643 100644 --- a/workflow/envs/bcftools.yaml +++ b/workflow/envs/bcftools.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - bcftools =1.10 + - bcftools =1.14 diff --git a/workflow/envs/datavzrd.yaml b/workflow/envs/datavzrd.yaml new file mode 100644 index 00000000..859add75 --- /dev/null +++ b/workflow/envs/datavzrd.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - datavzrd =2.2 diff --git a/workflow/envs/gawk.yaml b/workflow/envs/gawk.yaml new file mode 100644 index 00000000..fa09e590 --- /dev/null +++ b/workflow/envs/gawk.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - gawk =5.1 diff --git a/workflow/envs/hla_la.yaml b/workflow/envs/hla_la.yaml index 6c2487fb..6a24bc29 100644 --- a/workflow/envs/hla_la.yaml +++ b/workflow/envs/hla_la.yaml @@ -1,8 +1,5 @@ channels: - conda-forge - bioconda - - jafors dependencies: - - hla-la ==1.0.5 - - samtools ==1.10 - - boost-cpp ==1.73.0 + - hla-la ==1.0.3 diff --git a/workflow/envs/htslib.yaml b/workflow/envs/htslib.yaml new file mode 100644 index 00000000..b5d7959d --- /dev/null +++ b/workflow/envs/htslib.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - htslib =1.14 diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml index 84d2ad83..90698479 100644 --- a/workflow/envs/microphaser.yaml +++ b/workflow/envs/microphaser.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - microphaser =0.2 + - microphaser =0.7 diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh new file mode 100755 index 00000000..c344af3e --- /dev/null +++ b/workflow/envs/neo_fox_deps.post-deploy.sh @@ -0,0 +1,157 @@ +#!/usr/bin/env bash +set -euo pipefail + +# set all the necessary conda paths and +# ensure they exist +CONDA_BIN="${CONDA_PREFIX}/bin/" +# this is needed for the allele list files of MixMHCpred, MixMHC2pred +# and PRIME, as NeoFox expects some of their files in that directory +mkdir -p ${CONDA_BIN}/lib/ +CONDA_MAN1="${CONDA_PREFIX}/share/man/man1/" +mkdir -p $CONDA_MAN1 +CONDA_INFO="${CONDA_PREFIX}/share/info/" +mkdir -p $CONDA_INFO
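+# NOTE (editor's usage sketch, not from the upstream docs): this script
+# consumes ${NET_MHC_PAN_4_1_TARBALL} and ${NET_MHC_TWO_PAN_4_0_TARBALL} below,
+# so both envvars declared in workflow/Snakefile must be exported before
+# snakemake triggers this post-deploy script; the archive names here are
+# hypothetical examples and may differ from what the DTU site ships:
+#   export NET_MHC_PAN_4_1_TARBALL=/path/to/netMHCpan-4.1.Linux.tar.gz
+#   export NET_MHC_TWO_PAN_4_0_TARBALL=/path/to/netMHCIIpan-4.0.Linux.tar.gz
+#   snakemake --use-conda -j 10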
+CONDA_LIB="${CONDA_PREFIX}/lib/" +mkdir -p ${CONDA_LIB} +CONDA_ETC="${CONDA_PREFIX}/etc/" +mkdir -p $CONDA_ETC + +# install MixMHCpred, following: +# https://github.com/GfellerLab/MixMHCpred/blob/v2.1/README +MIX_MHC_PRED_VERSION="2.1" +MIX_MHC_PRED_LIB_PATH="$CONDA_PREFIX/lib/mix_mhc_pred/" +wget -q https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz +tar xzf v${MIX_MHC_PRED_VERSION}.tar.gz +cd MixMHCpred-${MIX_MHC_PRED_VERSION} +g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x +# TODO: when updating to v2.2, change this line to: +# MMP_PLACEHOLDER="/PATH_TO_MIXMHCPRED/lib" +MMP_PLACEHOLDER="YOUR PATH TO MixMHCpred/lib FOLDER" +grep "${MMP_PLACEHOLDER}" MixMHCpred +sed -i "s%${MMP_PLACEHOLDER}%${MIX_MHC_PRED_LIB_PATH}%" MixMHCpred +mv MixMHCpred ${CONDA_BIN} +# The allele_list.txt file needs to be in a `lib/` subdirectory of the `bin/` dir; this is hard-coded here: +# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L123 +cp lib/allele_list.txt ${CONDA_BIN}/lib/ +mv lib $MIX_MHC_PRED_LIB_PATH +# TODO: when updating to v2.2, change this line to: +# mv MixMHCpred_license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf +mv license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf +MixMHCpred -i test/test.fa -o test/out.txt -a A0101,A2501,B0801,B1801 +diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt) +cd .. +rm v${MIX_MHC_PRED_VERSION}.tar.gz +rm -r MixMHCpred-${MIX_MHC_PRED_VERSION} + +# install MixMHC2pred, mostly following (we use the default GitHub-created +# .tar.gz file instead of the hand-crafted .zip file for future-proofing): +# https://github.com/GfellerLab/MixMHC2pred/blob/v1.2/README.md +MIX_MHC_TWO_PRED_VERSION="1.2" +MIX_MHC_TWO_PRED_LIB_PATH="${CONDA_LIB}/mix_mhc_two_pred/" +wget -q https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz +tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz +cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION} +mv -t ${CONDA_BIN} MixMHC2pred MixMHC2pred_unix +# The Alleles_list.txt file needs to be in the same directory as the binaries; this is hard-coded here: +# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L114 +mv -t ${CONDA_BIN} Alleles_list.txt +mv rpep ${CONDA_ETC} +ln -s ${CONDA_ETC}/rpep ${CONDA_BIN}/rpep +mv LICENSE ${CONDA_INFO}/MixMHC2pred_unix_LICENSE +MixMHC2pred_unix -i test/testData.txt -o test/out.txt -a DRB1_11_01 DRB3_02_02 DPA1_01_03__DPB1_04_01 DQA1_05_05__DQB1_03_01 +diff test/out.txt test/out_compare.txt +cd ..
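+# NOTE (editor's comment): each install block above finishes by running the
+# tool on its bundled test data and diff-ing the result against the expected
+# output; since this script runs under `set -euo pipefail`, any mismatch
+# aborts the post-deploy step and thus fails the conda environment creation
+# early instead of leaving a half-working NeoFox setup behind.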
+rm v${MIX_MHC_TWO_PRED_VERSION}.tar.gz +rm -r MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION} + +# install PRIME, mostly following (minor corrections): +# https://github.com/GfellerLab/PRIME/blob/v1.0/README +PRIME_VERSION="1.0" +PRIME_LIB_PATH="${CONDA_LIB}/prime/" +wget -q https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz +tar xzf v${PRIME_VERSION}.tar.gz +cd PRIME-${PRIME_VERSION} +PRIME_PLACEHOLDER="/app/PRIME/lib" +grep "${PRIME_PLACEHOLDER}" PRIME +sed -i "s%${PRIME_PLACEHOLDER}%${PRIME_LIB_PATH}%" PRIME +mv PRIME ${CONDA_BIN} +# The alleles.txt file needs to be in a `lib/` subdirectory of the `bin/` dir; this is hard-coded here: +# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L132 +cp lib/alleles.txt ${CONDA_BIN}/lib/ +mv lib $PRIME_LIB_PATH +mv PRIME_license.pdf ${CONDA_INFO} +PRIME -i test/test.txt -o test/out.txt -a A0201,A0101 +diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt) +cd .. +rm v${PRIME_VERSION}.tar.gz +rm -r PRIME-${PRIME_VERSION} + +# This is the non-portable version of the 1st line +# in both netMHCpan and netMHCIIpan, assuming a +# root install of tcsh. +TCSH_ROOT="#! /bin/tcsh -f" +# For the scripts to work with any tcsh in the +# $PATH, we need to change this to: +TCSH_PATH="#!/usr/bin/env tcsh" + +# install netMHCpan version 4.1 +# requires tcsh to have been installed via conda dependencies +NET_MHC_PAN_4_1_LIB="${CONDA_LIB}/netMHCpan_4_1/" +mkdir -p ${NET_MHC_PAN_4_1_LIB} +NET_MHC_PAN_4_1_ETC="${CONDA_ETC}/netMHCpan_4_1/" +mkdir -p ${NET_MHC_PAN_4_1_ETC} +tar xzf ${NET_MHC_PAN_4_1_TARBALL} +cd netMHCpan-4.1 +wget -q https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz +tar xzf data.tar.gz +rm data.tar.gz +grep "${TCSH_ROOT}" netMHCpan +sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCpan +grep -P "setenv\s+NMHOME" netMHCpan +sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_PAN_4_1_LIB}%" netMHCpan +mv -t ${CONDA_BIN} netMHCpan +mv -t ${CONDA_MAN1} netMHCpan.1 +mv -t ${CONDA_INFO} netMHCpan-4.1.readme +mv -t ${NET_MHC_PAN_4_1_LIB} Linux_x86_64 +mv -t ${NET_MHC_PAN_4_1_ETC} data +ln -s ${NET_MHC_PAN_4_1_ETC}/data ${NET_MHC_PAN_4_1_LIB}/data +cd test +netMHCpan -p test.pep -BA -xls -a HLA-A01:01,HLA-A02:01 -xlsfile my_NetMHCpan_out.xls +diff NetMHCpan_out.xls my_NetMHCpan_out.xls +cd ../..
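+# NOTE (editor's comment): after the two sed edits above, the installed
+# `netMHCpan` script starts roughly like this (illustrative, with the
+# variables expanded):
+#   #!/usr/bin/env tcsh
+#   setenv NMHOME ${CONDA_PREFIX}/lib/netMHCpan_4_1/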
+rm -r netMHCpan-4.1 + +# install netMHCIIpan version 4.0 +# requires tcsh to have been installed via conda dependencies +NET_MHC_TWO_PAN_4_0_LIB="${CONDA_LIB}/netMHCIIpan_4_0/" +mkdir -p ${NET_MHC_TWO_PAN_4_0_LIB} +NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/" +mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC} +tar xzf ${NET_MHC_TWO_PAN_4_0_TARBALL} +cd netMHCIIpan-4.0 +wget -q https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz +tar xzf data.tar.gz +rm data.tar.gz +grep "${TCSH_ROOT}" netMHCIIpan +sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCIIpan +grep -P "setenv\s+NMHOME" netMHCIIpan +sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_TWO_PAN_4_0_LIB}%" netMHCIIpan +mv -t ${CONDA_BIN} netMHCIIpan +mv -t ${CONDA_MAN1} netMHCIIpan.1 +mv -t ${CONDA_INFO} netMHCIIpan-4.0.readme +mv -t ${NET_MHC_TWO_PAN_4_0_LIB} Linux_x86_64 NetMHCIIpan-4.0.pl +mv -t ${NET_MHC_TWO_PAN_4_0_ETC} data +ln -s ${NET_MHC_TWO_PAN_4_0_ETC}/data ${NET_MHC_TWO_PAN_4_0_LIB}/data +cd test +netMHCIIpan -f example.fsa -a DRB1_0101 > example.fsa.myout +diff example.fsa.out example.fsa.myout +netMHCIIpan -f example.pep -inptype 1 -a DRB1_0101 > example.pep.myout +diff example.pep.out example.pep.myout +netMHCIIpan -f example.fsa -a H-2-IAb -s -u > example.fsa.sorted.myout +diff example.fsa.sorted.out example.fsa.sorted.myout +netMHCIIpan -f example.fsa -hlaseq DRB10101.fsa > example.fsa_hlaseq.myout +diff example.fsa_hlaseq.out example.fsa_hlaseq.myout +netMHCIIpan -f example.fsa -hlaseqA alpha.dat -hlaseq beta.dat > example.fsa_hlaseq_A+B.myout +diff example.fsa_hlaseq_A+B.out example.fsa_hlaseq_A+B.myout +cd ../.. +rm -r netMHCIIpan-4.0 diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml new file mode 100644 index 00000000..780af1d6 --- /dev/null +++ b/workflow/envs/neo_fox_deps.yaml @@ -0,0 +1,30 @@ +channels: + - conda-forge + - bioconda +dependencies: + # https://neofox.readthedocs.io/en/latest/02_installation.html#step-by-step-guide-without-docker + - r-base =3.6 + - python >=3.7, <=3.8 + # implicit unmentioned dependency of MixMHCpred and PRIME + - perl + # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp + - blast =2 + # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64 + - cxx-compiler + # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts + - tcsh =6.24 + # just make sure this is available for the post-deploy.sh script + - sed =4.8 + # R packages mentioned at the end of this section: + # https://neofox.readthedocs.io/en/latest/02_installation.html#configuration-of-the-reference-folder + - r-lattice + - r-ggplot2 + - r-caret + - r-peptides + - r-doparallel + - r-gbm + - bioconductor-biostrings + # https://neofox.readthedocs.io/en/latest/02_installation.html#install-neofox + - pip + - pip: + - neofox==0.6.4 diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml new file mode 100644 index 00000000..5ced20fa --- /dev/null +++ b/workflow/envs/pandas.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - pandas=1.4 \ No newline at end of file diff --git a/workflow/envs/rbt.yaml b/workflow/envs/rbt.yaml index 6c8be1fb..94ff5bae 100644 --- a/workflow/envs/rbt.yaml +++ b/workflow/envs/rbt.yaml @@ -1,6 +1,6 @@ channels: - - bioconda - conda-forge + - bioconda dependencies: - rust-bio-tools =0.19 - - bcftools =1.10 + - bcftools =1.14 diff --git a/workflow/envs/tcsh.yaml b/workflow/envs/tcsh.yaml new file mode 100644 index 00000000..9bc374fd --- /dev/null 
+++ b/workflow/envs/tcsh.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - tcsh =6.24 diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml index a76d765a..270dc831 100644 --- a/workflow/envs/varlociraptor.yaml +++ b/workflow/envs/varlociraptor.yaml @@ -3,4 +3,4 @@ channels: - bioconda dependencies: - varlociraptor =2.3.0 - - bcftools =1.10 + - bcftools =1.14 diff --git a/workflow/report/HLA_Types.rst b/workflow/report/HLA_Types.rst deleted file mode 100644 index 999fe494..00000000 --- a/workflow/report/HLA_Types.rst +++ /dev/null @@ -1 +0,0 @@ -Typing of HLA profile. diff --git a/workflow/report/WES_results.rst b/workflow/report/WES_results.rst deleted file mode 100644 index 4a699bdf..00000000 --- a/workflow/report/WES_results.rst +++ /dev/null @@ -1 +0,0 @@ -"Results - neoantigen candidate table" diff --git a/workflow/report/hla_alleles.rst b/workflow/report/hla_alleles.rst new file mode 100644 index 00000000..a793a914 --- /dev/null +++ b/workflow/report/hla_alleles.rst @@ -0,0 +1 @@ +HLA allele profile as determined by HLA-LA. diff --git a/workflow/report/neoantigens.dna.rst b/workflow/report/neoantigens.dna.rst new file mode 100644 index 00000000..17ba537e --- /dev/null +++ b/workflow/report/neoantigens.dna.rst @@ -0,0 +1,60 @@ +Neoantigens and corresponding normal peptides as phased and determined by +microphaser, with elution ligand / binding affinity predictions by netMHC +(netMHCpan and netMHCIIpan) to the HLA alleles determined by HLA-LA. + +=================== +Column descriptions +=================== + +* **id**: Peptide ID assigned by microphaser. +* **pep_seq**: Sequence of the full peptide that was given to netMHC(II)pan. Amino acids that + are different between the normal and the tumor sample are highlighted in lower case. +* **pos_in_id_seq**: Position in pep_seq of the peptide that was used for prediction. + It seems like netMHCpan positions start at 0 and netMHCIIpan positions at 1. +* **alias**: Indicator of the type of sample (normal vs. some tumor sample). +* **num_binders**: Total number of peptide-HLA allele pairs from pep_seq that are considered binders, + either weak or strong. Cutoffs are applied to el_rank and are: + * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder) + * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder) +* **freq**: Allelic frequency of the peptide as predicted by microphaser. For a + credible allele frequency interval, see column freq_credible_interval. +* **depth**: Read depth at the peptide position. +* **num_var_sites**: Number of variant sites on the peptide's haplotype in that sample (alias). +* **num_var_in_pep**: Number of variant sites within the peptide sequence. +* **top_el_rank_allele**: HLA allele with the best eluted ligand prediction score percentile rank. +* **top_el_rank_bind_core**: Binding core of the peptide for the HLA allele with the best + elution ligand score percentile rank. +* **top_el_rank_el_rank**: Percentile rank of the elution ligand score. The rank of the predicted binding + score when compared to a set of random natural peptides. This measure is not + affected by inherent bias of certain molecules towards higher or lower mean + predicted affinities. It is the recommended value for determining likely + binders / neoantigens of interest.
Cutoffs recommended by netMHC(II)pan authors + are: + * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder) + * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder) +* **top_el_rank_el_score**: The raw eluted ligand prediction score. +* **ave_el_score**: Average across the eluted ligand prediction scores of all alleles for this + peptide in the particular sample (alias). +* **top_ba_rank_allele**: HLA allele with the best binding affinity prediction score percentile rank. +* **top_ba_rank_bind_core**: Binding core of the peptide for the HLA allele with the best + binding affinity score percentile rank. +* **top_ba_rank_ba_rank**: Percentile rank of the predicted binding affinity compared to a set of 100,000 + random natural peptides. This measure is not affected by inherent bias of certain + molecules towards higher or lower mean predicted affinities. +* **top_ba_rank_ba_score**: Predicted binding affinity in log-scale. +* **aa_changes**: List of amino acid (aa) changes. For normal samples, only germline changes are listed. For + tumor samples, only somatic tumor changes are listed, even though the germline + changes also affect the tumor peptide. +* **genomic_pos**: Genomic position of the nucleotide change. +* **nt_seq**: Nucleotide sequence underlying the peptide. Nucleotide changes underlying the + amino acid changes are highlighted as lower case letters. +* **gene_name**: Common gene name / gene symbol of the peptide's gene of origin. +* **gene_id**: Ensembl gene id of the peptide's gene of origin. +* **transcript**: List of Ensembl transcript ids in which the peptide occurs. +* **chrom**: Chromosome on which the peptide's gene of origin is located. +* **offset**: Chromosomal position of the peptide's gene of origin. +* **frame**: Open reading frame that the peptide originates from. 0 indicates the regular + reading frame, non-zero values indicate frame shifts. +* **strand**: Strand of the gene / transcript. +* **freq_credible_interval**: Credible interval for freq, the allelic frequency of the peptide as predicted + by microphaser. diff --git a/workflow/report/RNA_results.rst b/workflow/report/neoantigens.rna.rst similarity index 100% rename from workflow/report/RNA_results.rst rename to workflow/report/neoantigens.rna.rst diff --git a/workflow/report/neopeptides.rst b/workflow/report/neopeptides.rst new file mode 100644 index 00000000..1e6d6625 --- /dev/null +++ b/workflow/report/neopeptides.rst @@ -0,0 +1,9 @@ +Neopeptides and corresponding normal peptides as phased and determined by +microphaser, with various annotation scores gathered and provided by NeoFox, +using the HLA alleles determined by HLA-LA.
+ +=================== +Column descriptions +=================== + +TODO: transform spreadsheet into RST table diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml new file mode 100644 index 00000000..270c77a7 --- /dev/null +++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml @@ -0,0 +1,344 @@ +__definitions__: + - import pandas as pd + - from itertools import chain + - from copy import deepcopy +__variables__: + important_cols: ?set(params.neofox_important_cols["general"]) | set(params.neofox_important_cols[wildcards.mhc]) + cols: ?set(pd.read_csv(input.neopeptides, sep="\t").columns.values) + coldefs: + gene: + link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value} + mutation_mutatedXmer: + custom: | + function(value, row) { + return value.split("").map(function(a) { + if (a === a.toLowerCase()) { + return `${a}` + } else { + return a + } + }).join(""); + } + purity_adjusted_DNA_VAF: + plot: + ticks: + scale: linear + imputedGeneExpression: + plot: + ticks: + scale: linear + PRIME_best_rank: + plot: + heatmap: + scale: symlog + domain: + - 0.0 + - 100.0 + range: + - "#EC0000" + - white + PRIME_best_score: + plot: + ticks: + scale: linear + Best_rank_MHCI_9mer_score: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCI_9mer_score_WT + Best_rank_MHCI_9mer_score_WT: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCI_9mer_score + MixMHC2pred_best_rank: + plot: + heatmap: + scale: linear + domain: + - 0.0 + - 100.0 + range: + - "#EC0000" + - "#ffffff" + Best_rank_MHCII_score: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCII_score_WT + Best_rank_MHCII_score_WT: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCII_score + Recognition_Potential_MHCI_9mer: + plot: + ticks: + scale: linear + Improved_Binder_MHCI: + plot: + heatmap: + scale: ordinal + domain: + - 0 + - 1 + range: + - "#ffffff" + - "#2CA02C" + Selfsimilarity_MHCI_conserved_binder: + plot: + ticks: + scale: linear + mutation_position: + display-mode: detail + dnaVariantAlleleFrequency: + display-mode: detail + rnaVariantAlleleFrequency: + display-mode: hidden + rnaExpression: + display-mode: hidden + ADN_MHCI: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + ADN_MHCII: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + Amplitude_MHCII_rank: + display-mode: detail + plot: + heatmap: + scale: linear + domain: + - 0.0 + - 100.0 + range: + - "#EC0000" + - "#ffffff" + Amplitude_MHCI_affinity: + display-mode: detail + plot: + ticks: + scale: linear + Amplitude_MHCI_affinity_9mer: + display-mode: detail + plot: + ticks: + scale: linear + MixMHC2pred_best_allele: + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_rank_MHCII_score_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_rank_MHCII_score_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + 
aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_rank_MHCII_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_affinity_MHCII_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_rank_MHCII_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_affinity_MHCII_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_rank_MHCII_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_affinity_MHCII_score: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCII_score_WT + Best_affinity_MHCII_score_WT: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCII_score + PRIME_best_allele: + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_rank_MHCI_9mer_allele + - Best_rank_MHCI_9mer_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_rank_MHCI_9mer_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_rank_MHCI_9mer_allele + - Best_rank_MHCI_9mer_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_rank_MHCI_9mer_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_rank_MHCI_9mer_allele + - Best_rank_MHCI_9mer_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_affinity_MHCI_9mer_score: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_9mer_score_WT + Best_affinity_MHCI_9mer_score_WT: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_9mer_score + Best_affinity_MHCI_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_affinity_MHCI_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_affinity_MHCI_score: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_score_WT + Best_affinity_MHCI_score_WT: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_score + +name: ?f"Neopeptide candidates for 
{wildcards.group}, tumor sample {wildcards.tumor_alias}" + +default-view: "overview" + +datasets: + neoprint: + path: ?input.neopeptides + separator: "\t" + +views: + overview: + desc: | + Neopeptide candidates with annotations gathered and provided by NeoFox. + + ### Column descriptions + * **Recognition_Potential_MHCI_9mer:** The recognition potential of a neoantigen is the likelihood that it is effectively recognized by the TCR repertoire (definition: Amplitude_MHCI_affinity_9mer x Pathogensimiliarity_MHCI_affinity_9mer) \[[runup to equation (5) in Luksza et al. 2017](https://doi.org/10.1038/nature24473)\]. + * **Selfsimilarity_MHCI_conserved_binder:** Score for k-mer based similarity between Best_rank_MHCI_score_epitope and Best_affinity_MHCI_epitope_WT. For conserved binders only (i.e. NOT improved binders), where this score is considered indicative of immunogenicity \[[page 3 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\]. + * **Improved_Binder_MHCI:** Cutoff of 1.2 on the ratio between normal and mutated peptide rank scores to designate a peptide as an improved binder (as opposed to a conserved binder) (definition: (Best_rank_MHCI_score_WT / Best_rank_MHCI_score ) > 1.2) \[[page 4 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\] + + dataset: neoprint + + render-table: + columns: + ?for col, coldef in coldefs.items(): + __definitions__: + - | + coldef_ = deepcopy(coldef) + if col not in important_cols: + coldef_["display-mode"] = "detail" + ?col: ?coldef_ + ?for col in cols: + ?if col not in coldefs and col not in important_cols: + ?col: + display-mode: detail \ No newline at end of file diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk deleted file mode 100644 index 682808ef..00000000 --- a/workflow/rules/HLAtyping.smk +++ /dev/null @@ -1,98 +0,0 @@ -rule HLA_LA: - input: - bam="results/recal/{sample}.sorted.bam", - bai="results/recal/{sample}.sorted.bam.bai", - index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", - output: - "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt", - threads: 7 - log: - "logs/HLA-LA/{sample}.log", - params: - graph=lambda w, input: os.path.basename(os.path.dirname(input.index)), - graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)), - conda: - "../envs/hla_la.yaml" - shell: - "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.sample} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1" - - -rule parse_HLA_LA: - input: - "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt", - output: - report( - "results/HLA-LA/hlaI_{sample}.tsv", - caption="../report/HLA_Types.rst", - category="HLA-Typing(HLA-LA)", - ), - report( - "results/HLA-LA/hlaII_{sample}.tsv", - caption="../report/HLA_Types.rst", - category="HLA-Typing(HLA-LA)", - ), - log: - "logs/parse-HLA-LA/{sample}.log", - script: - "../scripts/parse_HLA_types.py" - - -rule razers3: - input: - reads="results/merged/DNA/{sample}_{fq}.fastq.gz", - output: - bam="results/razers3/bam/{sample}_{fq}.bam", - threads: 8 - log: - "logs/razers3/{sample}_{fq}.log", - params: - genome=config["HLAtyping"]["optitype_data"], - extra=config["params"]["razers3"], - wrapper: - "0.61.0/bio/razers3" - - -rule bam2fq: - input: - "results/razers3/bam/{sample}_{fq}.bam", - output: - "results/razers3/fastq/{sample}_{fq}.fished.fastq", - params: - "", - log: - "logs/razers3-bam2fq/{sample}-{fq}.log", - threads: 1 
- wrapper: - "0.61.0/bio/samtools/bam2fq/interleaved" - - -rule OptiType: - input: - reads=get_optitype_reads_input, - output: - multiext( - "results/optitype/{sample}/{sample}", "_coverage_plot.pdf", "_result.tsv" - ), - log: - "logs/optitype/{sample}.log", - params: - extra=config["params"]["optitype"], - sequencing_type="dna", - wrapper: - "0.63.0/bio/optitype" - - -rule parse_Optitype: - input: - "results/optitype/{sample}/{sample}_result.tsv", - output: - report( - "results/optitype/{sample}/hla_alleles_{sample}.tsv", - caption="../report/HLA_Types.rst", - category="HLA-Typing(Optitype)", - ), - log: - "logs/parse-optitype/{sample}.log", - shell: - "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' " - '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}' diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk deleted file mode 100644 index 2d95e343..00000000 --- a/workflow/rules/MHC_binding.smk +++ /dev/null @@ -1,127 +0,0 @@ -# rule mhcflurry: -# input: -# peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{group}.fa", -# alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv", -# wt_alleles=get_germline_optitype -# output: -# "results/mhcflurry/{sample}/{chr}/output.{group}.csv" -# log: -# "logs/mhcflurry/{sample}-{chr}-{group}.log" -# run: -# if "wt" in input.peptides: -# alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0]) -# else: -# alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0]) -# cmd = "if [ -s {input.peptides} ]; then mhctools --mhc-predictor mhcflurry --mhc-alleles {alleles} --input-fasta-file {input.peptides} --output-csv {output} > {log}; else touch {output}; fi" -# shell(cmd) - - -rule netMHCpan: - input: - peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{group}.fa", - alleles=get_alleles_MHCI, - output: - "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{group}.xls", - log: - "logs/netMHCpan/{sample}-{chr}-{group}.log", - params: - extra=config["affinity"]["netMHCpan"]["params"], - netMHC=config["affinity"]["netMHCpan"]["location"], - run: - alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0]) - cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi" - shell(cmd) - - -rule netMHCIIpan: - input: - peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{group}.fa", - alleles=get_alleles_MHCII, - output: - "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{group}.xls", - log: - "logs/netMHCIIpan/{sample}-{chr}-{group}.log", - params: - extra=config["affinity"]["netMHCIIpan"]["params"], - netMHC=config["affinity"]["netMHCIIpan"]["location"], - run: - alleles = ",".join(pd.read_csv(input.alleles, sep="\t")["Allele"].tolist()) - cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi" - shell(cmd) - - -rule parse_mhc_out: - input: - expand( - "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{group}}.xls", - chr=contigs, - ), - output: - "results/{mhc}/{sample}/{sample}.mhc.{group}.tsv", - log: - "logs/parse-mhc/{mhc}-{sample}-{group}.log", - wildcard_constraints: - group="wt|mt", - script: - "../scripts/group_mhc_output.py" - - -# rule parse_mhcflurry: -# input: -# expand("results/mhcflurry/{{sample}}/{chr}/output.{{group}}.csv", chr=contigs) -# 
output: -# "results/mhcflurry/{sample}/{sample}.mhc.{group}.csv" -# wildcard_constraints: -# group="wt|mt" -# log: -# "logs/parse-mhc/mhcflurry-{sample}-{group}.log" -# conda: -# "../envs/xsv.yaml" -# shell: -# "xsv cat rows -d ',' {input} | cut --complement -f2,7,8 > {output}" - - -rule mhc_csv_table: - input: - info="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv", - mt="results/{mhc}/{sample}/{sample}.mhc.mt.tsv", - wt="results/{mhc}/{sample}/{sample}.mhc.wt.tsv", - output: - report( - "results/neoantigens/{mhc}/{sample}.DNA.tsv", - caption="../report/WES_results.rst", - category="Results WES (netMHC)", - ), - log: - "logs/create-mhc-table/{mhc}-{sample}.log", - script: - "../scripts/merge_data.py" - - -# rule mhcflurry_table: -# input: -# info="results/microphaser/info/{sample}/filtered/mhcflurry/{sample}.tsv", -# mt="results/mhcflurry/{sample}/{sample}.mhc.mt.tsv", -# wt="results/mhcflurry/{sample}/{sample}.mhc.wt.tsv" -# output: -# report("results/neoantigens/mhcflurry/{sample}.WES.tsv", caption="../report/WES_results.rst", category="Results WES (MHCFlurry)") -# script: -# "../scripts/merge_mhcflurry.py" - - -rule add_RNA_info: - input: - counts="results/kallisto/{sample}", - table="results/neoantigens/{mhc}/{sample}.DNA.tsv", - output: - report( - "results/neoantigens/{mhc}/{sample}.RNA.tsv", - caption="../report/RNA_results.rst", - category="Results RNA", - ), - params: - abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), - log: - "logs/add-RNA/{mhc}-{sample}.log", - script: - "../scripts/add_rna_info.py" diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk deleted file mode 100644 index 940d7c02..00000000 --- a/workflow/rules/RNA.smk +++ /dev/null @@ -1,71 +0,0 @@ -rule kallisto_quant: - input: - fastq=get_quant_reads_input, - index="resources/kallisto/transcripts.idx", - output: - directory("results/kallisto/{sample}"), - params: - extra=kallisto_params, - log: - "results/logs/kallisto/quant/{sample}.log", - wrapper: - "0.60.1/bio/kallisto/quant" - - -rule STAR_align: - input: - "resources/STAR_index", - fq1=lambda wc: units.loc[(wc.sample, "RNA"), "fq1"], - fq2=lambda wc: units.loc[(wc.sample, "RNA"), "fq2"], - gtf="resources/genome.gtf", - output: - # see STAR manual for additional output files - "results/star/{sample}/Aligned.out.bam", - "results/star/{sample}/ReadsPerGene.out.tab", - log: - "logs/star/{sample}.log", - params: - # path to STAR reference genome index - index="STAR_index", - # optional parameters - designed to get chimeric alignments for fusion detection - extra=lambda wc, input: "--quantMode GeneCounts --sjdbGTFfile {} {}".format( - input.gtf, config["params"]["star"] - ), - threads: 8 - wrapper: - "0.42.0/bio/star/align" - - -rule arriba: - input: - bam="results/star/{sample}/Aligned.out.bam", - genome="resources/genome.fasta", - annotation="resources/genome.gtf", - output: - fusions="results/arriba/{sample}.fusions.tsv", - discarded="results/arriba/{sample}.fusions.discarded.tsv", - params: - blacklist=config["fusion"]["arriba"]["blacklist"], - extra=config["fusion"]["arriba"]["params"], - log: - "results/logs/arriba/{sample}.log", - threads: 1 - wrapper: - "0.60.1/bio/arriba" - - -## TODO: Update -# rule fusioncatcher: -# input: -# fq1=lambda w: units.loc[(w.sample, "RNA"), "fq1"], -# fq2=lambda w: units.loc[(w.sample, "RNA"), "fq2"] -# output: -# directory("fusioncatcher/{sample}") -# params: -# extra="-T tmp -d ../../fusioncatcher_data" -# log: -# "logs/fusioncatcher/{sample}.log" -# threads: -# 8 -# shell: -# 
"fusioncatcher -i {input.fq1},{input.fq2} -o {output} {params.extra} -p {threads} > {log}" diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk new file mode 100644 index 00000000..70e1a492 --- /dev/null +++ b/workflow/rules/annotate_neoantigens.smk @@ -0,0 +1,124 @@ +rule prepare_neo_fox_config_and_resources: + output: + config="resources/neo_fox/neo_fox_config.txt", + # we cannot put the exact files generated into the + # output, as snakemake will generate the respective + # subdirectories and NeoFox has default exist_ok=False + # set for os.makedirs: + # https://github.com/TRON-Bioinformatics/neofox/blob/fb6cdf9f10e77c409d0fa44657ef520eedca6994/neofox/references/installer.py#L221 + references=directory("resources/neo_fox/references/"), + log: + "logs/neo_fox/neo_fox_config.log", + conda: + "../envs/neo_fox_deps.yaml" + params: + hla_alleles=config["params"]["neo_fox"]["hla_alleles"], + shell: + """ + # environment variables necessary for neofox-configure + # NOTE: we have to provide all binaries with hard-coded + # paths, because NeoFox checks that the file at the path + # exists (and not simply that a binary exists): + # https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L90 + CONDA_BIN=$CONDA_PREFIX/bin + + ## pre-installed via conda + export NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb + echo "NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb" > {output.config} + export NEOFOX_RSCRIPT=$CONDA_BIN/Rscript + echo "NEOFOX_RSCRIPT=$CONDA_BIN/Rscript" >> {output.config} + + ## pre-installed into conda environment via post-deploy script + export NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan + echo "NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan" >> {output.config} + export NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan + echo "NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan" >> {output.config} + + ## specification of hla_allele link via config.yaml + export NEOFOX_HLA_DATABASE={params.hla_alleles} + + neofox-configure --reference-folder {output.references} + echo "NEOFOX_REFERENCE_FOLDER={output.references}" >> {output.config} + + # further environment variables needed for the config file + + ## pre-installed via conda + echo "NEOFOX_BLASTP=$CONDA_BIN/blastp" >> {output.config} + + ## pre-installed into conda environment via post-deploy script + echo "NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred" >> {output.config} + echo "NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix" >> {output.config} + echo "NEOFOX_PRIME=$CONDA_BIN/PRIME" >> {output.config} + """ + + +rule adjust_microphaser_output_for_neo_fox: + input: + microphaser="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + output: + neo_fox="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + log: + "logs/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log", + threads: 1 + conda: + "../envs/pandas.yaml" + script: + "../scripts/adjust_microphaser_output_for_neo_fox.py" + + +rule create_neo_fox_group_sheet: + input: + hla_la_bestguess="results/hla_la/output/{group}_{tumor_alias}/hla/R1_bestguess_G.txt", + output: + group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", + log: + "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log", + conda: + "../envs/pandas.yaml" + params: + group=lambda wc: group_annotation.loc[wc.group], + script: + 
"../scripts/create_neo_fox_group_sheet.py" + + +rule neo_fox: + input: + config="resources/neo_fox/neo_fox_config.txt", + candidates=expand( + "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + peptide_length=config["params"]["neo_fox"]["peptide_len"], + ), + group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", + output: + tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", + json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json", + log: + "logs/neo_fox/annotated/{group}.{tumor_alias}.log", + threads: 8 + conda: + "../envs/neo_fox_deps.yaml" + params: + folder=lambda wc, output: path.dirname(output.tsv), + prefix=lambda wc, output: path.splitext(path.basename(output.tsv))[0], + organism="human" + if config["ref"]["species"] == "homo_sapiens" + else "mouse" + if config["ref"]["species"] == "mus_musculus" + else "unsupported", + shell: + "(neofox " + " --num-cpus {threads} " + " --config {input.config} " + " --candidate-file {input.candidates} " + " --patient-data {input.group_sheet} " + " --with-table " + " --with-json " + " --organism {params.organism} " + " --output-folder {params.folder} " + " --output-prefix {params.prefix} ; " + " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; " + " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; " + # this implicitly created log does not seem to contain all of stderr, + # so we rather do our own capturing below + " rm {params.folder}/{params.prefix}.log; " + ") 2> {log} " diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk deleted file mode 100644 index 64088375..00000000 --- a/workflow/rules/annotation.smk +++ /dev/null @@ -1,44 +0,0 @@ -rule annotate_variants: - input: - calls="results/calls/{group}.{scatteritem}.bcf", - cache="resources/vep/cache", - plugins="resources/vep/plugins", - output: - calls="results/calls/{group}.{scatteritem}.annotated.bcf", - stats=report( - "results/calls/{group}.{scatteritem}.stats.html", - caption="../report/stats.rst", - category="QC", - ), - params: - # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs. - plugins=config["annotations"]["vep"]["plugins"], - extra="{} --vcf_info_field ANN".format(config["annotations"]["vep"]["params"]), - log: - "logs/vep/{group}.{scatteritem}.annotate.log", - wrapper: - "0.59.2/bio/vep/annotate" - - -rule annotate_strelka_variants: - input: - calls="results/strelka/{calls}.bcf", - cache="resources/vep/cache", - plugins="resources/vep/plugins", - output: - calls="results/strelka/{calls}.annotated.bcf", - stats=report( - "results/strelka/{calls}.annotated.stats.html", - caption="../report/stats.rst", - category="QC", - ), - params: - # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs. 
- plugins=[], #config["annotations"]["vep"]["plugins"], - extra="--vcf_info_field ANN --hgvs --symbol --canonical", - log: - "logs/vep/{calls}.strelka.annotate.log", - wrapper: - "0.59.2/bio/vep/annotate" diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk deleted file mode 100644 index 98ef992f..00000000 --- a/workflow/rules/calling.smk +++ /dev/null @@ -1,147 +0,0 @@ -rule strelka_somatic: - input: - normal=get_normal_bam, - normal_index=get_normal_bai, - tumor="results/recal/{sample}.sorted.bam", - tumor_index="results/recal/{sample}.sorted.bam.bai", - fasta="resources/genome.fasta", - fasta_index="resources/genome.fasta.fai", - callregions="resources/genome.callregions.bed.gz", - output: - "results/strelka/somatic/{sample}/results/variants/somatic.snvs.vcf.gz", - "results/strelka/somatic/{sample}/results/variants/somatic.indels.vcf.gz", - log: - "logs/calling/strelka_somatic/{sample}.log", - params: - config_extra="--callRegions {} {}".format( - "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"] - ), - run_extra=config["params"]["strelka"]["run"], - threads: 22 - wrapper: - "0.65.0/bio/strelka/somatic" - - -rule strelka_germline: - input: - bam="results/recal/{normal}.sorted.bam", - normal_index="results/recal/{normal}.sorted.bam.bai", - fasta="resources/genome.fasta", - fasta_index="resources/genome.fasta.fai", - callregions="resources/genome.callregions.bed.gz", - output: - "results/strelka/germline/{normal}/results/variants/variants.vcf.gz", - log: - "logs/calling/strelka_germline/{normal}.log", - params: - config_extra="--callRegions {} {}".format( - "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"] - ), - run_extra="", - threads: 22 - wrapper: - "0.65.0/bio/strelka/germline" - - -rule vcf_to_bcf: - input: - "{variants}.vcf.gz", - output: - "{variants}.output.bcf", - log: - "logs/bcftools/to-bcf/{variants}.log", - params: - "-O b -f PASS", - wrapper: - "0.60.0/bio/bcftools/view" - - -rule concat_somatic: - input: - calls=expand( - "results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf", - type=["snvs", "indels"], - ), - indices=expand( - "results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf.csi", - type=["snvs", "indels"], - ), - output: - "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf", - log: - "bcftools/concat-somatic/{sample}.log", - params: - "-O b -a", - wrapper: - "0.60.0/bio/bcftools/concat" - - -rule get_tumor_from_somatic: - input: - "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf", - output: - "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf", - log: - "logs/bcftools/view-TUMOR/{sample}.log", - params: - "-O b -s TUMOR", - wrapper: - "0.60.0/bio/bcftools/view" - - -rule reheader_germline: - input: - vcf="{germline}/variants.output.bcf", - samples="resources/sampleheader.txt", - output: - "{germline}/variants.reheader.bcf", - log: - "logs/bcftools/reheader/{germline}.log", - params: - extra="", - view_extra="-O b", - wrapper: - "0.60.0/bio/bcftools/reheader" - - -rule concat_variants: - input: - calls=lambda w: get_pair_variants(w, index=False), - index=lambda w: get_pair_variants(w, index=True), - output: - "results/strelka/merged/{sample}/all_variants.bcf", - log: - "bcftools/concat-all/{sample}.log", - params: - extra="-O b -a", - wrapper: - "0.64.0/bio/bcftools/concat" - - -rule preprocess_variants: - input: - variants="{variants}.bcf", - output: - "{variants}.prepy.bcf", - 
params: - extra="-L --somatic", - genome="resources/genome.fasta", - log: - "logs/prepy/{variants}.log", - threads: 2 - wrapper: - "0.60.0/bio/hap.py/pre.py" - - -rule norm_vcf: - input: - "{prefix}.bcf", - genome="resources/genome.fasta", - output: - "{prefix}.norm.bcf", - log: - "logs/bcftools/norm/{prefix}.log", - params: - "-f {} -O b -m-".format("resources/genome.fasta"), # optional parameters for bcftools norm (except -o) - wrapper: - "0.65.0/bio/bcftools/norm" diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk deleted file mode 100644 index 74386763..00000000 --- a/workflow/rules/candidate_calling.smk +++ /dev/null @@ -1,28 +0,0 @@ -rule freebayes: - input: - ref="resources/genome.fasta", - # you can have a list of samples here - samples=get_paired_bams, - output: - "results/candidate-calls/{pair}.freebayes.bcf", - log: - "logs/{pair}.log", - params: - extra=config["params"].get("freebayes", ""), - chunksize=100000, - threads: 60 - wrapper: - "0.65.0/bio/freebayes" - - -rule scatter_candidates: - input: - "results/candidate-calls/{pair}.{caller}.bcf", - output: - scatter.calling("results/candidate-calls/{{pair}}.{{caller}}.{scatteritem}.bcf"), - log: - "logs/scatter-candidates/{pair}.{caller}.log", - conda: - "../envs/rbt.yaml" - shell: - "rbt vcf-split {input} {output}" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3bc6c60d..23a4b97e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,6 +1,7 @@ import glob import pandas as pd +from os import path from snakemake.remote import FTP from snakemake.utils import validate @@ -10,16 +11,56 @@ ftp = FTP.RemoteProvider() validate(config, schema="../schemas/config.schema.yaml") -##### sample sheets ##### - -samples = pd.read_csv(config["samples"], sep="\t").set_index("sample", drop=False) +##### samples sheet ##### + +samples = ( + pd.read_csv( + config["samples"], + sep="\t", + dtype={ + "sample_name": str, + "group": str, + "alias": str, + }, + comment="#", + ) + .set_index("sample_name", drop=False) + .sort_index() +) validate(samples, schema="../schemas/samples.schema.yaml") -units = pd.read_csv(config["units"], dtype=str, sep="\t").set_index( - ["sample", "sequencing_type", "unit"], drop=False +##### units sheet ##### + +units = ( + pd.read_csv( + config["units"], + sep="\t", + dtype={ + "sample_name": str, + "sequencing_type": str, + "unit_name": str, + "adapters": str, + }, + comment="#", + ) + .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False) + .sort_index() ) validate(units, schema="../schemas/units.schema.yaml") +groups = samples["group"].unique() + +if "groups" in config: + group_annotation = ( + pd.read_csv(config["groups"], sep="\t", dtype={"group": str}) + .set_index("group") + .sort_index() + ) + group_annotation = group_annotation.loc[groups] +else: + group_annotation = pd.DataFrame({"group": groups}).set_index("group") + + contigs = [c for c in range(1, 23)] contigs.extend(["X", "Y"]) @@ -27,10 +68,29 @@ contigs.extend(["X", "Y"]) wildcard_constraints: - pair="|".join(samples[samples.type == "tumor"]["sample"]), - sample="|".join(samples["sample"]), + cancer_sample="|".join(samples[samples.alias != "normal"]["sample_name"]), + normal_sample="|".join(samples[samples.alias == "normal"]["sample_name"]), + sample="|".join(samples["sample_name"]), + unit="|".join(units["unit_name"]), + alias="|".join(pd.unique(samples["alias"])), + tumor_alias="|".join( + pd.unique(samples.loc[samples["alias"].str.match("tumor"), 
"alias"]) + ), + normal_alias="normal", + tumor_set=config["params"]["microphaser"]["events"]["tumor"], + normal_set=config["params"]["microphaser"]["events"]["normal"], + set="|".join( + [ + config["params"]["microphaser"]["events"]["tumor"], + config["params"]["microphaser"]["events"]["normal"], + ] + ), + group="|".join(pd.unique(samples["group"])), caller="|".join(["freebayes", "delly"]), - event="somatic|germline|complete", + peptide_type="|".join(["normal", "neo"]), + event="|".join(["somatic", "germline", "complete"]), + read="|".join(["single", "R1", "R2"]), + seqtype="|".join(["DNA", "RNA"]), ### Output generation ### @@ -44,60 +104,57 @@ def is_activated(xpath): def get_final_output(): - if config["epitope_prediction"]["activate"]: - final_output = expand( - "results/neoantigens/{mhc}/{S.sample}.{S.sequencing_type}.xlsx", - S=units.loc[samples[samples.type == "tumor"]["sample"]] - .drop_duplicates(["sample", "sequencing_type"]) - .itertuples(), - mhc=list( - filter( - None, - [ - "netMHCpan" if is_activated("affinity/netMHCpan") else None, - "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None, - ], + final_output = [] + for group in pd.unique(samples["group"]): + smps = samples.loc[samples["group"] == group, "sample_name"] + tumor_aliases = samples.loc[ + (samples["group"] == group) & (samples["alias"].str.match("tumor")), + "alias", + ] + if config["neoantigen_prediction"]["activate"]: + final_output.extend( + expand( + "results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}", + group=group, + tumor_alias=tumor_aliases, + mhc=["I", "II"], ) - ), - ) - else: - if config["HLAtyping"]["HLA_LA"]["activate"]: - final_output = expand( - [ - "results/optitype/{sample}/hla_alleles_{sample}.tsv", - "results/HLA-LA/hlaI_{sample}.tsv", - "results/HLA-LA/hlaII_{sample}.tsv", - ], - sample=samples["sample"], ) + # sequencing_types = pd.unique( + # units.loc[units["sample_name"].isin(smps), "sequencing_type"] + # ) + # final_output.extend( + # expand( + # "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv", + # group=group, + # tumor_alias=tumor_aliases, + # mhc=list( + # filter( + # None, + # [ + # "net_mhc_pan" + # if is_activated("params/net_mhc_pan") + # else None, + # "net_mhc_two_pan" + # if is_activated("params/net_mhc_two_pan") + # else None, + # ], + # ) + # ), + # seqtype=sequencing_types, + # ) + # ) else: final_output = expand( - "results/optitype/{sample}/hla_alleles_{sample}.tsv", - sample=samples["sample"], + [ + "results/hla_la/{group}.{tumor_alias}.hlaI.tsv", + "results/hla_la/{group}.{tumor_alias}.hlaII.tsv", + ], + group=group, + tumor_alias=tumor_aliases, ) - return final_output - - -def get_fusion_output(): - if config["fusion"]["arriba"]["activate"]: - fusion_output = expand( - "results/fusion/arriba/{sample}.fusions.tsv", - sample=units[units["sequencing_type"] == "RNA"]["sample"], - ) - else: - fusion_output = [] - return fusion_output - -def get_tmb_targets(): - if is_activated("tmb"): - return expand( - "results/plots/tmb/{group}.{mode}.svg", - group=samples[(samples.type == "tumor")]["sample"], - mode=config["tmb"].get("mode", "curve"), - ) - else: - return [] + return final_output caller = list( @@ -106,47 +163,6 @@ caller = list( ### helper functions ### -## alignment ## - - -def get_cutadapt_input(wildcards): - unit = units.loc[wildcards.sample].loc[wildcards.unit].loc[wildcards.seqtype] - - if pd.isna(unit["fq1"]): - # SRA sample (always paired-end for now) - accession = unit["sra"] - return 
expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]) - - if unit["fq1"].endswith("gz"): - ending = ".gz" - else: - ending = "" - - if pd.isna(unit["fq2"]): - # single end local sample - return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format( - S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending - ) - else: - # paired end local sample - return expand( - "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format( - S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending - ), - read=["fq1", "fq2"], - ) - - -def get_cutadapt_pipe_input(wildcards): - pattern = ( - units.loc[wildcards.sample] - .loc[wildcards.seqtype] - .loc[wildcards.unit, wildcards.fq] - ) - files = list(sorted(glob.glob(pattern))) - assert len(files) > 0, "no files found at {}".format(pattern) - return files - def is_paired_end(sample, seqtype): sample_units = units.loc[sample].loc[seqtype] @@ -163,249 +179,82 @@ def is_paired_end(sample, seqtype): return all_paired -def get_fastqs(wc): - if config["trimming"]["activate"]: - return expand( - "results/trimmed/{sample}/{seqtype}/{unit}_{read}.fastq.gz", - unit=units.loc[wc.seqtype].loc[wc.sample, "unit_name"], - sample=wc.sample, - read=wc.read, - seqtype=wc.seqtype, - ) - unit = units.loc[wc.sample].loc[wc.seqtype] - if all(pd.isna(unit["fq1"])): - # SRA sample (always paired-end for now) - accession = unit["sra"] - return expand( - "sra/{accession}_{read}.fastq", accession=accession, read=wc.read[-1] - ) - fq = "fq{}".format(wc.read[-1]) - return units.loc[wc.sample].loc[wc.seqtype, fq].tolist() - - -def get_map_reads_input(wildcards): - if is_paired_end(wildcards.sample, "DNA"): - return [ - "results/merged/DNA/{sample}_R1.fastq.gz", - "results/merged/DNA/{sample}_R2.fastq.gz", - ] - return "results/merged/DNA/{sample}_single.fastq.gz" - - -def get_read_group(wildcards): - """Denote sample name and platform in read group.""" - return r"-R '@RG\tID:{sample}\tSM:{sample}\tPL:{platform}'".format( - sample=wildcards.sample, platform=samples.loc[wildcards.sample, "platform"] - ) - - -def get_recalibrate_quality_input(wildcards, bai=False): - ext = ".bai" if bai else "" - if is_activated("remove_duplicates"): - return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext) - else: - return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext) - - -## HLA Typing ## - - -def get_optitype_reads_input(wildcards): - if is_activated("HLAtyping/optitype_prefiltering"): - if is_paired_end(wildcards.sample, "DNA"): - return expand( - "results/razers3/fastq/{sample}_{fq}.fished.fastq", - sample=wildcards.sample, - fq=["R1", "R2"], - ) - return "results/razers3/fastq/{sample}_single.fastq" - else: - return get_map_reads_input(wildcards) - - -def get_oncoprint_batch(wildcards): - if wildcards.batch == "all": - groups = samples[samples["type"] == "tumor"]["sample"].unique() - else: - groups = samples.loc[ - samples[config["oncoprint"]["stratify"]["by-column"]] == wildcards.batch, - "group", - ].unique() - return expand( - "results/merged-calls/{group}.{{event}}.fdr-controlled.bcf", group=groups - ) - - -## variant calls ## - - -def get_annotated_bcf(wildcards): - selection = ".annotated" - return "results/calls/{pair}.{scatteritem}{selection}.bcf".format( - pair=wildcards.pair, selection=selection, scatteritem=wildcards.scatteritem - ) - - -def get_scattered_calls(ext=".bcf"): - def inner(wildcards): - return expand( - "results/calls/{{pair}}.{caller}.{{scatteritem}}.sorted{ext}", - caller=caller, - ext=ext, - ) - - return inner - - -def 
get_fdr_control_params(wildcards):
-    query = config["calling"]["fdr-control"]["events"][wildcards.event]
-    threshold = query.get(
-        "threshold", config["calling"]["fdr-control"].get("threshold", 0.05)
-    )
-    events = query["varlociraptor"]
-    return {"threshold": threshold, "events": events}
-
-
-def get_pair_variants(wildcards, index):
-    if index:
-        ext = ".csi"
-    else:
-        ext = ""
-    variants = [
-        "results/strelka/somatic/{}/results/variants/somatic.complete.tumor.bcf{}".format(
-            wildcards.sample, ext
-        )
-    ]
-    variants.append(
-        "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format(
-            get_normal(wildcards), ext
-        )
-    )
-    return variants
-
+def get_sample_from_group_and_alias(group, alias):
+    sample = samples.loc[
+        (samples["group"] == group) & (samples["alias"] == alias), "sample_name"
+    ].squeeze()
+    return sample
 
-def get_pair_observations(wildcards):
-    return expand(
-        "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf",
-        caller=wildcards.caller,
-        pair=wildcards.pair,
-        scatteritem=wildcards.scatteritem,
-        sample=get_paired_samples(wildcards),
-    )
 
-
-def get_merge_input(ext=".bcf"):
+def get_bam_from_group_and_alias(ext=".bam"):
     def inner(wildcards):
-        return expand(
-            "results/calls/{{pair}}.{vartype}.{{event}}.fdr-controlled{ext}",
-            ext=ext,
-            vartype=["SNV", "INS", "DEL", "MNV"],
-            filter=config["calling"]["fdr-control"]["events"][wildcards.event],
+        alias = wildcards.get(
+            "alias",
+            wildcards.get("tumor_alias", wildcards.get("normal_alias", "unknown")),
        )
+        if alias == "unknown":
+            raise ValueError(
+                "get_bam_from_group_and_alias() requires one of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'."
+            )
+        sample = get_sample_from_group_and_alias(wildcards.group, alias)
+        return f"results/recal/{sample}.sorted{ext}"
 
     return inner
 
 
-def get_pair_aliases(wildcards):
-    return [
-        samples.loc[samples.loc[wildcards.pair, "matched_normal"], "type"],
-        samples.loc[wildcards.pair, "type"],
-    ]
-
-
-def get_tabix_params(wildcards):
-    if wildcards.format == "vcf":
-        return "-p vcf"
-    if wildcards.format == "txt":
-        return "-s 1 -b 2 -e 2"
-    raise ValueError("Invalid format for tabix: {}".format(wildcards.format))
-
-
-## RNA ##
-
-
-def get_quant_reads_input(wildcards):
-    if is_paired_end(wildcards.sample, "RNA"):
-        return [
-            "results/merged/RNA/{sample}_R1.fastq.gz",
-            "results/merged/RNA/{sample}_R2.fastq.gz",
-        ]
-    return "results/merged/RNA/{sample}_single.fastq.gz"
-
-
-def kallisto_params(wildcards, input):
-    extra = config["params"]["kallisto"]
-    if len(input.fastq) == 1:
-        extra += " --single"
-        extra += (
-            " --fragment-length {unit.fragment_len_mean} " "--sd {unit.fragment_len_sd}"
-        ).format(unit=units.loc[(wildcards.sample, wildcards.unit)])
-    else:
-        extra += " --fusion"
-    return extra
-
-
-## helper functions ##
-
-
-def get_paired_samples(wildcards):
-    return [
-        samples.loc[(wildcards.pair), "matched_normal"],
-        samples.loc[wildcards.pair, "sample"],
-    ]
-
-
-def get_paired_bams(wildcards):
-    return expand(
-        "results/recal/{sample}.sorted.bam", sample=get_paired_samples(wildcards)
-    )
-
-
-def get_paired_bais(wildcards):
+def get_alleles_MHCI(wildcards):
+    alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias
     return expand(
-        "results/recal/{sample}.sorted.bam.bai", sample=get_paired_samples(wildcards)
+        "results/hla_la/{group}.{alias}.hlaI.tsv",
+        group=wildcards.group,
+        alias=alias,
    )
 
 
-def get_normal(wildcards):
-    return samples.loc[(wildcards.sample), "matched_normal"]
-
-
-def 
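
The group/alias helpers above replace the old `{pair}`/`matched_normal` lookups: a rule now resolves its BAM by naming the sample's group and alias instead of relying on hardcoded tumor/normal pairings. A minimal standalone sketch of that lookup, using a toy samples sheet with the new column layout (the sample names here are made up):

```python
import pandas as pd

# Toy samples sheet following the new samples.schema.yaml columns.
samples = pd.DataFrame(
    {
        "sample_name": ["patientA_T", "patientA_N"],
        "alias": ["tumor", "normal"],
        "group": ["patientA", "patientA"],
    }
).set_index("sample_name", drop=False)


def get_sample_from_group_and_alias(group, alias):
    # .squeeze() collapses the single matching row into a scalar sample name.
    return samples.loc[
        (samples["group"] == group) & (samples["alias"] == alias), "sample_name"
    ].squeeze()


assert get_sample_from_group_and_alias("patientA", "tumor") == "patientA_T"
# get_bam_from_group_and_alias() then only interpolates the resolved name:
print(f"results/recal/{get_sample_from_group_and_alias('patientA', 'normal')}.sorted.bam")
```
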
get_reads(wildcards): - return get_seperate(wildcards.sample, wildcards.group) - - -def get_seperate(sample, group): - return units.loc[(sample, "DNA"), "fq{}".format(str(group))] - - -def get_proteome(wildcards): +def get_alleles_MHCII(wildcards): + alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( - "results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin", - normal=get_normal(wildcards), - mhc=wildcards.mhc, + # TODO: check that hlaII is correct here, and not hlaI which it previously was + "results/hla_la/{group}.{alias}.hlaII.tsv", + group=wildcards.group, + alias=alias, ) -def get_alleles_MHCI(wildcards): - if wildcards.group == "wt": - return "results/optitype/{S}/hla_alleles_{S}.tsv".format( - S=get_normal(wildcards) - ) - else: - return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.sample) - - -def get_alleles_MHCII(wildcards): - if wildcards.group == "wt": - return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards)) - else: - return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.sample) - - -def get_normal_bam(wildcards): - return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards)) - - -def get_normal_bai(wildcards): - return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards)) +##### Other stuff #### + +neofox_important_cols = { + "general": [ + "gene", + "mutation_mutatedXmer", + "mutation_wildTypeXmer", + "purity_adjusted_DNA_VAF", + "imputedGeneExpression", + ], + "I": [ + "PRIME_best_rank", + "PRIME_best_score", + "PRIME_best_peptide", + "PRIME_best_allele", + "Recognition_Potential_MHCI_9mer", + "Improved_Binder_MHCI", + "Selfsimilarity_MHCI_conserved_binder", + "Best_rank_MHCI_9mer_score", + "Best_rank_MHCI_9mer_score_WT", + "Best_rank_MHCI_9mer_epitope", + "Best_rank_MHCI_9mer_epitope_WT", + "Best_rank_MHCI_9mer_allele", + "Best_rank_MHCI_9mer_allele_WT", + ], + "II": [ + "MixMHC2pred_best_rank", + "MixMHC2pred_best_peptide", + "MixMHC2pred_best_allele", + "Best_rank_MHCII_score", + "Best_rank_MHCII_score_WT", + "Best_rank_MHCII_score_epitope", + "Best_rank_MHCII_score_epitope_WT", + "Best_rank_MHCII_score_allele", + "Best_rank_MHCII_score_allele_WT", + ] +} \ No newline at end of file diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk new file mode 100644 index 00000000..d09c9cd7 --- /dev/null +++ b/workflow/rules/datavzrd.smk @@ -0,0 +1,51 @@ +rule prepare_neoprint: + input: + neopeptides="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", + output: + mhc_one="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.I.sorted.tsv", + mhc_two="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.II.sorted.tsv", + log: + "logs/prepare_neoprint/{group}.{tumor_alias}.log", + params: + purity = lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze(), + neofox_important_cols=neofox_important_cols, + conda: + "../envs/pandas.yaml" + script: + "../scripts/prepare_neoprint.py" + + +rule render_datavzrd_neoprint_config: + input: + template=workflow.source_path( + "../resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml" + ), + neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv", + output: + "resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml", + params: + neofox_important_cols=neofox_important_cols, + log: + 
"logs/datavzrd_render_neoprint/{group}.{tumor_alias}.{mhc}.log", + template_engine: + "yte" + + +rule datavzrd_neoprint: + input: + neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv", + config="resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml", + output: + report( + directory("results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}"), + htmlindex="index.html", + caption="../report/neopeptides.rst", + category="Neopeptides", + labels=lambda wc: {"group": wc.group, "sample_type": wc.tumor_alias, "MHC": wc.mhc}, + ), + conda: + "../envs/datavzrd.yaml" + log: + "logs/datavzrd_neoprint/{group}.{tumor_alias}.{mhc}.log", + shell: + "datavzrd {input.config} --output {output} &> {log}" diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk deleted file mode 100644 index f32c203d..00000000 --- a/workflow/rules/filtering.smk +++ /dev/null @@ -1,106 +0,0 @@ -rule filter_by_annotation: - input: - "{prefix}.bcf", - output: - "{prefix}.{filter}.filtered_ann.bcf", - log: - "logs/filter-calls/annotation/{prefix}.{filter}.log", - params: - filter=lambda w: config["calling"]["filter"][w.filter], - conda: - "../envs/vembrane.yaml" - shell: - 'vembrane filter --output-fmt bcf --output {output} "{params.filter}" {input} &> {log}' - - -rule filter_odds: - input: - get_annotated_bcf, - output: - "results/calls/{pair}.{event}.{scatteritem}.filtered_odds.bcf", - params: - events=lambda wc: config["calling"]["fdr-control"]["events"][wc.event][ - "varlociraptor" - ], - log: - "logs/filter-calls/posterior_odds/{pair}.{scatteritem}.{event}.log", - conda: - "../envs/varlociraptor.yaml" - shell: - "varlociraptor filter-calls posterior-odds --events {params.events} --odds barely < {input} > {output} 2> {log}" - - -rule gather_calls: - input: - calls=gather.calling( - "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf" - ), - idx=gather.calling( - "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi" - ), - output: - "results/calls/{pair}.{event}.filtered_odds.bcf", - log: - "logs/gather-calls/{pair}.{event}.log", - params: - "-a -Ob", - wrapper: - "0.67.0/bio/bcftools/concat" - - -rule control_fdr: - input: - "results/calls/{pair}.{event}.filtered_odds.bcf", - output: - "results/calls/{pair}.{vartype}.{event}.fdr-controlled.bcf", - log: - "logs/control-fdr/{pair}.{vartype}.{event}.log", - params: - query=get_fdr_control_params, - conda: - "../envs/varlociraptor.yaml" - shell: - "varlociraptor filter-calls control-fdr {input} --var {wildcards.vartype} " - "--events {params.query[events]} --fdr {params.query[threshold]} > {output} 2> {log}" - - -rule merge_calls: - input: - calls=get_merge_input(".bcf"), - idx=get_merge_input(".bcf.csi"), - output: - "results/merged-calls/{pair}.{event}.fdr-controlled.bcf", - log: - "logs/merge-calls/{pair}.{event}.log", - params: - "-a -Ob", - wrapper: - "0.59.2/bio/bcftools/concat" - - -rule change_samplenames: - input: - call="results/merged-calls/{pair}.{event}.fdr-controlled.bcf", - output: - temp("results/merged-calls/{pair}.{event}.renaming.txt"), - log: - "logs/change-samplenames/{pair}.{event}.log", - params: - prefix=lambda w, input: os.path.basename(input["call"]).split(".")[0], - shell: - "echo -e 'normal {params.prefix}_N\ntumor {params.prefix}_T' > {output}" - - -rule reheader_varlociraptor: - input: - vcf="results/merged-calls/{pair}.{event}.fdr-controlled.bcf", - samples="results/merged-calls/{pair}.{event}.renaming.txt", - output: - 
"results/merged-calls/{pair}.{event}.reheader.bcf", - log: - "logs/reheader-calls/{pair}.{event}.log", - params: - extra="", - view_extra="-O b", - wrapper: - "0.60.0/bio/bcftools/reheader" diff --git a/workflow/rules/hla_typing.smk b/workflow/rules/hla_typing.smk new file mode 100644 index 00000000..98034abf --- /dev/null +++ b/workflow/rules/hla_typing.smk @@ -0,0 +1,69 @@ +rule hla_la: + input: + bam=get_bam_from_group_and_alias(), + bai=get_bam_from_group_and_alias(ext=".bai"), + index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", + ext_idx="resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac", + output: + "results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt", + threads: 7 + log: + "logs/hla_la/{group}_{alias}.log", + params: + graph=lambda w, input: os.path.basename(os.path.dirname(input.index)), + graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)), + workdir=lambda w, output: os.path.dirname( + os.path.dirname(os.path.dirname(output[0])) + ), + conda: + "../envs/hla_la.yaml" + shell: + "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1" + + +rule net_mhc_pan_alleles: + output: + mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt", + conda: + "../envs/tcsh.yaml" + log: + "logs/net_mhc_pan/available_alleles.net_mhc_pan.log", + params: + net_mhc=config["params"]["net_mhc_pan"]["location"], + shell: + "{params.net_mhc}/netMHCpan -listMHC > {output.mhc_one_alleles} 2> {log}" + + +rule net_mhc_two_pan_alleles: + output: + mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt", + conda: + "../envs/tcsh.yaml" + log: + "logs/net_mhc_pan/available_alleles.net_mhc_two_pan.log", + params: + net_mhc=config["params"]["net_mhc_two_pan"]["location"], + shell: + "{params.net_mhc}/netMHCIIpan -list > {output.mhc_two_alleles} 2> {log}" + + +rule parse_and_filter_hla_alleles_for_netmhc: + input: + hla_la_bestguess="results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt", + mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt", + mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt", + output: + hlaI=report( + "results/hla_la/{group}.{alias}.hlaI.tsv", + caption="../report/hla_alleles.rst", + category="HLA alleles", + ), + hlaII=report( + "results/hla_la/{group}.{alias}.hlaII.tsv", + caption="../report/hla_alleles.rst", + category="HLA alleles", + ), + log: + "logs/parse_hla_la/{group}.{alias}.log", + script: + "../scripts/parse_and_filter_hla_alleles_for_netmhc.py" diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk deleted file mode 100644 index 8cfcb51c..00000000 --- a/workflow/rules/mapping.smk +++ /dev/null @@ -1,70 +0,0 @@ -rule map_reads: - input: - reads=get_map_reads_input, - idx=rules.bwa_index.output, - output: - temp("results/mapped/{sample}.sorted.bam"), - log: - "logs/bwa_mem/{sample}.log", - params: - index=lambda w, input: os.path.splitext(input.idx[0])[0], - extra=get_read_group, - sort="samtools", - sort_order="coordinate", - threads: 8 - wrapper: - "0.56.0/bio/bwa/mem" - - -rule mark_duplicates: - input: - "results/mapped/{sample}.sorted.bam", - output: - bam=temp("results/dedup/{sample}.sorted.bam"), - metrics="results/qc/dedup/{sample}.metrics.txt", - log: - "logs/picard/dedup/{sample}.log", - params: - 
config["params"]["picard"]["MarkDuplicates"], - wrapper: - "0.39.0/bio/picard/markduplicates" - - -rule recalibrate_base_qualities: - input: - bam=get_recalibrate_quality_input, - bai=lambda w: get_recalibrate_quality_input(w, bai=True), - ref="resources/genome.fasta", - ref_dict="resources/genome.dict", - ref_fai="resources/genome.fasta.fai", - known="resources/variation.noiupac.vcf.gz", - tbi="resources/variation.noiupac.vcf.gz.tbi", - output: - recal_table=temp("results/recal/{sample}.grp"), - params: - extra=config["params"]["gatk"]["BaseRecalibrator"], - java_opts="", - log: - "logs/gatk/baserecalibrator/{sample}.log", - threads: 8 - wrapper: - "0.62.0/bio/gatk/baserecalibratorspark" - - -rule apply_bqsr: - input: - bam=get_recalibrate_quality_input, - bai=lambda w: get_recalibrate_quality_input(w, bai=True), - ref="resources/genome.fasta", - ref_dict="resources/genome.dict", - ref_fai="resources/genome.fasta.fai", - recal_table="results/recal/{sample}.grp", - output: - bam=protected("results/recal/{sample}.sorted.bam"), - log: - "logs/gatk/gatk_applybqsr/{sample}.log", - params: - extra=config["params"]["gatk"]["applyBQSR"], # optional - java_opts="", # optional - wrapper: - "0.62.0/bio/gatk/applybqsr" diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk new file mode 100644 index 00000000..535015fc --- /dev/null +++ b/workflow/rules/mhc_binding.smk @@ -0,0 +1,109 @@ +rule net_mhc_pan: + input: + peptides=expand( + "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa", + peptide_length=config["params"]["net_mhc_pan"]["peptide_len"], + ), + alleles=get_alleles_MHCI, + output: + "results/net_mhc_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", + log: + "logs/net_mhc_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + conda: + "../envs/tcsh.yaml" + params: + extra=config["params"]["net_mhc_pan"]["extra"], + netMHC=config["params"]["net_mhc_pan"]["location"], + length=config["params"]["net_mhc_pan"]["peptide_len"], + alleles=lambda wc, input: ",".join( + pd.read_csv(input.alleles[0], header=None)[0] + ), + shell: + "( " + "if [ -s {input.peptides} ]; " + "then " + " {params.netMHC}/netMHCpan {params.extra} -BA -s -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + "else " + " touch {output}; " + "fi " + " ) 2> {log}" + + +rule net_mhc_two_pan: + input: + peptides=expand( + "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa", + peptide_length=config["params"]["net_mhc_two_pan"]["peptide_len"], + ), + alleles=get_alleles_MHCII, + output: + "results/net_mhc_two_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", + log: + "logs/net_mhc_two_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + conda: + "../envs/tcsh.yaml" + params: + extra=config["params"]["net_mhc_two_pan"]["extra"], + netMHC=config["params"]["net_mhc_two_pan"]["location"], + length=config["params"]["net_mhc_two_pan"]["peptide_len"], + alleles=lambda wc, input: ",".join( + pd.read_csv(input.alleles[0], header=None)[0] + ), + shell: + "( " + "if [ -s {input.peptides} ]; " + "then " + " {params.netMHC}/netMHCIIpan {params.extra} -BA -s -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + "else " + " touch {output}; 
" + "fi " + " ) 2> {log}" + + +rule tidy_mhc_out: + input: + expand( + "results/{{mhc}}/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv", + contig=contigs, + ), + output: + joined_mhc_out="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv", + log: + "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log", + script: + "../scripts/tidy_mhc_output.py" + + +rule merge_neoantigen_info: + input: + info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv", + neo="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.neo.tsv", + normal="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.normal.tsv", + output: + report( + "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", + caption="../report/neoantigens.dna.rst", + category="Neopeptides", + ), + log: + "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", + script: + "../scripts/merge_neoantigen_info.py" + + +rule add_rna_info: + input: + counts="results/kallisto/{group}.{tumor_alias}", + table="results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", + output: + report( + "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv", + caption="../report/neoantigens.rna.rst", + category="Neopeptides", + ), + params: + abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), + log: + "logs/add_rna/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", + script: + "../scripts/add_rna_info.py" diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index bd9e6510..a52d72ef 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -1,117 +1,175 @@ -rule microphaser_somatic: +rule norm_bcf: input: - vcf="results/strelka/merged/{sample}/all_variants.norm.annotated.bcf", - bam="results/recal/{sample}.sorted.bam", - bai="results/recal/{sample}.sorted.bam.bai", + "results/final-calls/{group}.{set}.bcf", + ref="resources/genome.fasta", + output: + "results/final-calls/{group}.{set}.norm.bcf", + log: + "logs/bcftools/norm/{group}.{set}.log", + params: + extra="-m-" + wrapper: + "v1.12.0/bio/bcftools/norm" + + +rule add_somatic_flag: + input: + bcf="results/final-calls/{group}.{set}.norm.bcf", + header_line="resources/somatic_flag_header_line.txt", + flag_bed="resources/genome.somatic_flag.bed.gz", + flag_bed_idx="resources/genome.somatic_flag.bed.gz.tbi", + output: + "results/final-calls/{group}.{set}.somatic_flag.norm.bcf", + log: + "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log", + conda: + "../envs/bcftools.yaml" + shell: + "( bcftools annotate " + " --annotations {input.flag_bed} " + " --mark-sites +SOMATIC " + " --columns CHROM,FROM,TO " + " --header-lines {input.header_line} " + " -O b -o {output} " + " {input.bcf} " + ") 2> {log}" + + +rule merge_tumor_normal: + input: + calls=expand( + "results/final-calls/{{group}}.{sets}.norm.bcf", + sets=[ + config["params"]["microphaser"]["events"]["normal"], + config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag", + ], + ), + index=expand( + "results/final-calls/{{group}}.{sets}.norm.bcf.csi", + sets=[ + config["params"]["microphaser"]["events"]["normal"], + config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag", + ], + ), + output: + "results/final-calls/{group}.merged_tumor_normal.norm.bcf", + log: + "logs/bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log", + params: + 
extra="-a", + wrapper: + "v1.14.1/bio/bcftools/concat" + + +rule microphaser_tumor: + input: + bcf="results/final-calls/{group}.merged_tumor_normal.norm.annotated.bcf", + bam=get_bam_from_group_and_alias(), + bai=get_bam_from_group_and_alias(ext=".bai"), track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.mt.fa", - wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.wt.fa", - tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv", + mt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa", + wt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa", + tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv", log: - "logs/microphaser/somatic/{sample}-{contig}.log", + "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log", conda: "../envs/microphaser.yaml" params: - window_length=config["params"]["microphaser"]["window_len"], + window_length=lambda wc: int(wc.peptide_length) * 3, shell: - "microphaser somatic {input.bam} --variants {input.vcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " + "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " "< {input.track} > {output.mt_fasta} 2> {log}" -rule microphaser_germline: +rule microphaser_normal: input: - vcf="results/strelka/germline/{normal}/results/variants/variants.reheader.norm.bcf", - bam="results/recal/{normal}.sorted.bam", - bai="results/recal/{normal}.sorted.bam.bai", + bcf="results/final-calls/{group}.{normal_set}.variants.reheader.norm.bcf", + bam=get_bam_from_group_and_alias(), + bai=get_bam_from_group_and_alias(ext=".bai"), track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: wt_fasta=( - "results/microphaser/fasta/germline/{normal}/{normal}.germline.{contig}.fa" + "results/microphaser/fasta/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.fa" ), wt_tsv=( - "results/microphaser/info/germline/{normal}/{normal}.germline.{contig}.tsv" + "results/microphaser/info/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.tsv" ), log: - "logs/microphaser/germline/{normal}-{contig}.log", + "logs/microphaser_normal/contigs/{group}/{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.log", conda: "../envs/microphaser.yaml" params: - window_length=config["params"]["microphaser"]["window_len"], + window_length=lambda wc: int(wc.peptide_length) * 3, shell: - "microphaser normal {input.bam} --variants {input.vcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} " + "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} " "< {input.track} > {output.wt_fasta} 2> {log}" -rule concat_proteome: +rule concat_normal_proteome: input: expand( - "results/microphaser/fasta/germline/{{normal}}/{{normal}}.germline.{contig}.fa", + "results/microphaser/fasta/contigs/{{group}}.normal.{{normal_set}}.pep_len_{{peptide_length}}.{contig}.fa", contig=contigs, ), output: - "results/microphaser/fasta/germline/{normal}/reference_proteome.fa", + 
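
Via the `add_somatic_flag` and `merge_tumor_normal` rules above, every record that originates from the tumor set carries a `SOMATIC` INFO flag in the merged BCF, so downstream steps can separate somatic from germline calls within a single file. A small inspection sketch, assuming `pysam` is available and using an illustrative group name:

```python
import pysam

# Illustrative path following the merge_tumor_normal output pattern.
bcf = pysam.VariantFile("results/final-calls/patientA.merged_tumor_normal.norm.bcf")

somatic = germline = 0
for rec in bcf:
    # A Flag-type INFO field is simply present or absent on each record.
    if "SOMATIC" in rec.info:
        somatic += 1
    else:
        germline += 1
print(f"somatic: {somatic}, germline: {germline}")
```
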
"results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa", log: - "logs/microphaser/concat-ref-proteome/{normal}.log", + "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.pep_len_{peptide_length}.log", shell: "cat {input} > {output} 2> {log}" -rule build_germline_proteome: +rule build_normal_proteome_db: input: - "results/microphaser/fasta/germline/{normal}/reference_proteome.fa", + "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa", output: - bin="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin", - fasta="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.peptides.fasta", + bin="results/microphaser/bin/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.bin", + fasta="results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.peptides.fasta", log: - "logs/microphaser/build-ref-proteome-db/{normal}-{mhc}.log", + "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}.pep_len_{peptide_length}.log", conda: "../envs/microphaser.yaml" - params: - length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][ - wildcards.mhc - ], shell: - "microphaser build_reference -r {input} -o {output.bin} -l {params.length} --peptides {output.fasta} > {log} 2>&1" + "( microphaser build_reference -r {input} -o {output.bin} -l {wildcards.peptide_length} > {output.fasta} ) 2> {log}" rule microphaser_filter: input: - tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv", - proteome=get_proteome, + tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv", + proteome=expand( + "results/microphaser/bin/{{group}}.{normal_set}.normal_proteome.pep_len_{{peptide_length}}.bin", + normal_set=config["params"]["microphaser"]["events"]["normal"], + ), output: mt_fasta=( - "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.mt.fa" + "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.wt.fa" + "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa" ), - tsv="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.{contig}.tsv", - removed="results/microphaser/info/{sample}/removed/{mhc}/{sample}.{contig}.removed.tsv", + tsv="results/microphaser/info/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv", + removed="results/microphaser/info/removed/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.removed.tsv", log: - "logs/microphaser/filter/{sample}-{mhc}-{contig}.log", + "logs/microphaser_filter/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log", conda: "../envs/microphaser.yaml" - params: - length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][ - wildcards.mhc - ], shell: - "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}" + "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {wildcards.peptide_length} > {output.mt_fasta} 2>{log}" rule concat_tsvs: input: expand( - 
"results/microphaser/info/{{sample}}/filtered/{{mhc}}/{{sample}}.{contig}.tsv", + "results/microphaser/info/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{{peptide_length}}.{contig}.tsv", contig=contigs, ), output: - "results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv", + "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", log: - "logs/concat-tsv/{sample}-{mhc}.log", + "logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log", conda: "../envs/xsv.yaml" shell: diff --git a/workflow/rules/oncoprint.smk b/workflow/rules/oncoprint.smk deleted file mode 100644 index aa670044..00000000 --- a/workflow/rules/oncoprint.smk +++ /dev/null @@ -1,28 +0,0 @@ -rule build_oncoprint_table: - input: - bcf=get_oncoprint_batch, - output: - "plots/oncoprint/{batch}.{event}.tsv", - log: - "logs/oncoprint/{batch}.{event}.table.log", - conda: - "../envs/oncoprinttable.yaml" - script: - "../scripts/build_oncoprint_matrix.py" - - -rule plot_oncoprint: - input: - "plots/oncoprint/{batch}.{event}.tsv", - output: - report( - "plots/oncoprint/{batch}.{event}.pdf", - category="Oncoprint", - caption="../report/oncoprint.rst", - ), - log: - "logs/oncoprint/{batch}.{event}.plot.log", - conda: - "../envs/oncoprint.yaml" - script: - "../scripts/oncoprint.R" diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk index 872247ac..d7279042 100644 --- a/workflow/rules/phylogeny.smk +++ b/workflow/rules/phylogeny.smk @@ -1,7 +1,7 @@ def get_somatic_calls(wildcards): return expand( - "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf", - sample=samples[samples.type == "tumor"]["sample"], + "results/final-calls/somatic/{sample}/results/variants/somatic.complete.tumor.bcf", + sample=samples[samples.alias == "tumor"]["sample_name"], ) @@ -9,20 +9,20 @@ rule merge_snvs: input: calls=get_somatic_calls, output: - "results/strelka/merged_calls.vcf", + "results/final-calls/merged_calls.vcf", log: "results/logs/bcftools/merge.log", params: - "--use-header strelka/sampleheader.txt --force-samples", + "--use-header final-calls/sampleheader.txt --force-samples", wrapper: - "0.36.0/bio/bcftools/merge" + "v1.12.0/bio/bcftools/merge" rule query: input: - "results/strelka/merged_calls.vcf", + "results/final-calls/merged_calls.vcf", output: - "results/strelka/call_matrix.tsv", + "results/final-calls/call_matrix.tsv", log: "results/logs/bcftools/query.log", params: @@ -35,7 +35,7 @@ rule query: rule nj_tree: input: - matrix="results/strelka/call_matrix.tsv", + matrix="results/final-calls/call_matrix.tsv", output: pdf="results/plots/phylogeny_njtree.pdf", log: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 86b209c3..80e62380 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -10,7 +10,7 @@ rule get_genome: release=config["ref"]["release"], cache: True wrapper: - "0.45.1/bio/reference/ensembl-sequence" + "v1.12.0/bio/reference/ensembl-sequence" rule get_cdna: @@ -25,21 +25,7 @@ rule get_cdna: release=config["ref"]["release"], cache: True wrapper: - "0.45.1/bio/reference/ensembl-sequence" - - -rule kallisto_index: - input: - "resources/genome.cdna.fasta", - output: - "resources/kallisto/transcripts.idx", - params: - extra="", - log: - "logs/kallisto/index.log", - cache: True - wrapper: - "0.60.1/bio/kallisto/index" + "v1.12.0/bio/reference/ensembl-sequence" rule get_annotation: @@ -55,29 +41,24 @@ rule get_annotation: log: 
"logs/get-annotation.log", wrapper: - "0.45.1/bio/reference/ensembl-annotation" + "v1.12.0/bio/reference/ensembl-annotation" -rule STAR_index: +# TODO: remove this rule, once microphaser is fixed to make gene_name optional +rule remove_records_with_gene_name_missing: input: - fasta="resources/genome.fasta", - gtf="resources/genome.gtf", + "resources/genome.gtf", output: - directory("resources/STAR_index"), - params: - sjdb_overhang="100", - extra="", + "resources/genome.records_with_gene_name.gtf", log: - "logs/star/index.log", - threads: 32 - cache: True - wrapper: - "0.42.0/bio/star/index" + "logs/remove_records_with_gene_name_missing.log", + shell: + '( grep "gene_name" {input} > {output} ) 2> {log}' rule split_annotation: input: - "resources/genome.gtf", + "resources/genome.records_with_gene_name.gtf", output: "resources/annotation/{contig}.gtf", log: @@ -95,142 +76,126 @@ rule genome_faidx: "logs/genome-faidx.log", cache: True wrapper: - "0.45.1/bio/samtools/faidx" + "v1.12.0/bio/samtools/faidx" -rule genome_dict: - input: - "resources/genome.fasta", +rule create_somatic_flag_header_line: output: - "resources/genome.dict", + "resources/somatic_flag_header_line.txt", log: - "logs/picard/create-dict.log", - cache: True - wrapper: - "0.45.1/bio/picard/createsequencedictionary" + "logs/create_somatic_flag_header_line.log", + shell: + """ + ( echo '##INFO=' > {output} ) 2> {log} + """ -rule get_callregions: +rule create_genome_somatic_flag_bed: input: "resources/genome.fasta.fai", output: - "resources/genome.callregions.bed.gz", + "resources/genome.somatic_flag.bed", log: - "logs/get-callregions.log", - params: - n_contigs=config["ref"]["n_chromosomes"], + "logs/create_genome_somatic_flag_bed.log", conda: - "../envs/index.yaml" + "../envs/gawk.yaml" + cache: True shell: - "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})" - " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}" + """ + ( awk 'BEGIN {{ OFS="\\t" }} {{ print $1,0,$2 }}' {input} > {output} ) 2> {log} + """ -rule get_known_variants: +rule bgzip_genome_somatic_flag_bed: input: - # use fai to annotate contig lengths for GATK BQSR - fai="resources/genome.fasta.fai", + "resources/genome.somatic_flag.bed", output: - vcf="resources/variation.vcf.gz", + "resources/genome.somatic_flag.bed.gz", log: - "logs/get-known-variants.log", - params: - species=config["ref"]["species"], - release=config["ref"]["release"], - build=config["ref"]["build"], - type="all", - cache: True + "logs/bgzip/genome.somatic_flag.log", wrapper: - "0.59.2/bio/reference/ensembl-variation" + "v1.12.0/bio/bgzip" -rule remove_iupac_codes: +rule tabix_genome_somatic_flag_bed: input: - "resources/variation.vcf.gz", + "resources/genome.somatic_flag.bed.gz", output: - "resources/variation.noiupac.vcf.gz", - log: - "logs/fix-iupac-alleles.log", + "resources/genome.somatic_flag.bed.gz.tbi", conda: - "../envs/rbt.yaml" - cache: True + "../envs/htslib.yaml" + log: + "logs/tabix/genome.somatic_flag.log", shell: - "rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}" + "( tabix -p bed {input} ) 2> {log}" -rule bwa_index: +rule genome_dict: input: "resources/genome.fasta", output: - multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"), + "resources/genome.dict", log: - "logs/bwa_index.log", + "logs/picard/create-dict.log", cache: True wrapper: - "0.45.1/bio/bwa/index" + "v1.12.0/bio/picard/createsequencedictionary" -rule download_HLALA_graph: +rule download_hla_la_graph: output: 
directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"), - directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/knownReferences"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping_PRGonly"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/referenceGenomeSimulations"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/sampledReferenceGenomes"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"), + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa", "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", + params: + graphs_dir=lambda w, output: output[0].replace( + "graphs/PRG_MHC_GRCh38_withIMGT/PRG", "graphs" + ), log: - "logs/download-HLA-LA-graph.log", - cache: True + "logs/download_hla_la_graph.log", shell: - "cd resources/graphs && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz " - "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz" + "( cd {params.graphs_dir} && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz " + "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz && rm PRG_MHC_GRCh38_withIMGT.tar.gz ) 2> {log}" -rule index_HLALA: +rule index_hla_la: input: "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", output: "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", - "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathindex", - cache: True + "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathIndex", conda: "../envs/hla_la.yaml" params: path=lambda wc, input: os.path.dirname(os.path.dirname(input[0])), graph=lambda wc, input: os.path.basename(os.path.dirname(input[0])), log: - "logs/index-HLA-LA-graph.log", + "logs/index_hla_la_graph.log", shell: "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1" -rule get_vep_cache: - output: - directory("resources/vep/cache"), - params: - species=config["ref"]["species"], - build=config["ref"]["build"], - release=config["ref"]["release"], - log: - "logs/vep/cache.log", - cache: True - wrapper: - "0.59.2/bio/vep/cache" - - -rule get_vep_plugins: +rule index_hla_la_extended_ref: + input: + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa", output: - directory("resources/vep/plugins"), - params: - release=config["ref"]["release"], + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.amb", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.ann", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.bwt", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.sa", + conda: + "../envs/hla_la.yaml" log: - "logs/vep/plugins.log", - cache: True - wrapper: - "0.59.2/bio/vep/plugins" + "logs/index_hla_la_extended_ref.log", + shell: + "bwa index {input} > {log} 2>&1" rule make_sampleheader: diff --git a/workflow/rules/tmb.smk b/workflow/rules/tmb.smk deleted file mode 100644 index 2ef064a1..00000000 --- a/workflow/rules/tmb.smk +++ /dev/null @@ -1,20 +0,0 @@ -if config["tmb"]["activate"]: - - rule estimate_tmb: - input: - "results/merged-calls/{pair}.somatic.fdr-controlled.bcf", - output: - 
"results/plots/tmb/{pair}.{plotmode}.vl.json", - conda: - "../envs/varlociraptor.yaml" - log: - "logs/tmb/{pair}-{plotmode}.log", - params: - **config["tmb"], - shell: - "varlociraptor estimate tmb " - " --plot-mode {wildcards.plotmode} " - "--coding-genome-size {params.coding_genome_size} " - "--somatic-tumor-events {params.somatic_events} " - "--tumor-sample {params.tumor_sample} " - "< {input} > {output} 2> {log}" diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk deleted file mode 100644 index f26ed6f3..00000000 --- a/workflow/rules/trim.smk +++ /dev/null @@ -1,71 +0,0 @@ -rule get_sra: - output: - "sra/{accession}_1.fastq", - "sra/{accession}_2.fastq", - log: - "logs/get-sra/{accession}.log", - wrapper: - "0.56.0/bio/sra-tools/fasterq-dump" - - -rule cutadapt_pipe: - input: - get_cutadapt_pipe_input, - output: - pipe("pipe/cutadapt/{sample}/{seqtype}/{unit}.{fq}.{ext}"), - log: - "logs/pipe-fastqs/cutadapt/{sample}-{seqtype}-{unit}.{fq}.{ext}.log", - wildcard_constraints: - ext=r"fastq|fastq\.gz", - threads: 0 # this does not need CPU - shell: - "cat {input} > {output} 2> {log}" - - -rule cutadapt_pe: - input: - get_cutadapt_input, - output: - fastq1="results/trimmed/{sample}/{seqtype}/{unit}_R1.fastq.gz", - fastq2="results/trimmed/{sample}/{seqtype}/{unit}_R2.fastq.gz", - qc="results/trimmed/{sample}/{seqtype}/{unit}.paired.qc.txt", - log: - "logs/cutadapt/{sample}-{seqtype}-{unit}.log", - params: - others=config["params"]["cutadapt"], - adapters=lambda w: str( - units.loc[w.sample].loc[w.seqtype].loc[w.unit, "adapters"] - ), - threads: 8 - wrapper: - "0.59.2/bio/cutadapt/pe" - - -rule cutadapt_se: - input: - get_cutadapt_input, - output: - fastq="results/trimmed/{sample}/{seqtype}/{unit}.single.fastq.gz", - qc="results/trimmed/{sample}/{seqtype}/{unit}.single.qc.txt", - log: - "logs/cutadapt/{sample}-{seqtype}-{unit}.log", - params: - others=config["params"]["cutadapt"], - adapters_r1=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), - threads: 8 - wrapper: - "0.59.2/bio/cutadapt/se" - - -rule merge_fastqs: - input: - get_fastqs, - output: - "results/merged/{seqtype}/{sample}_{read}.fastq.gz", - log: - "logs/merge-fastqs/{seqtype}_{sample}_{read}.log", - wildcard_constraints: - read="single|R1|R2", - seqtype="DNA|RNA", - shell: - "cat {input} > {output} 2> {log}" diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index e7586ef0..c8b11474 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -13,38 +13,13 @@ rule bam_index: input: "{prefix}.bam", output: - "{prefix}.bam.bai", + "{prefix}.bai", log: "logs/bam-index/{prefix}.log", wrapper: "0.59.2/bio/samtools/index" -rule tabix_known_variants: - input: - "resources/{prefix}.{format}.gz", - output: - "resources/{prefix}.{format}.gz.tbi", - log: - "logs/tabix/{prefix}.{format}.log", - params: - get_tabix_params, - cache: True - wrapper: - "0.59.2/bio/tabix" - - -rule gzip_fastq: - input: - "{prefix}.fastq", - output: - "{prefix}.fastq.gz", - log: - "logs/gz-fastq/{prefix}.log", - shell: - "gzip < {input} > {output}" - - rule tsv_to_excel: input: tsv="results/{x}.tsv", diff --git a/workflow/rules/varlociraptor.smk b/workflow/rules/varlociraptor.smk deleted file mode 100644 index 4cb6013a..00000000 --- a/workflow/rules/varlociraptor.smk +++ /dev/null @@ -1,89 +0,0 @@ -rule render_scenario: - input: - config["calling"]["scenario"], - output: - report( - "results/scenarios/{pair}.yaml", - caption="../report/scenario.rst", - category="Variant calling scenarios", - ), - params: - 
samples=samples, - log: - "logs/scenarious/{pair}.log", - conda: - "../envs/render_scenario.yaml" - script: - "../scripts/render-scenario.py" - - -rule varlociraptor_preprocess: - input: - ref="resources/genome.fasta", - ref_idx="resources/genome.fasta.fai", - candidates="results/candidate-calls/{pair}.{caller}.{scatteritem}.bcf", - bam="results/recal/{sample}.sorted.bam", - bai="results/recal/{sample}.sorted.bai", - output: - "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf", - params: - omit_isize="", - log: - "logs/varlociraptor/preprocess/{pair}/{sample}.{caller}.{scatteritem}.log", - conda: - "../envs/varlociraptor.yaml" - shell: - "varlociraptor preprocess variants {params.omit_isize} --candidates {input.candidates} " - "{input.ref} --bam {input.bam} --output {output} 2> {log}" - - -rule varlociraptor_call: - input: - obs=get_pair_observations, - scenario="results/scenarios/{pair}.yaml", - output: - temp("results/calls/{pair}.{caller}.{scatteritem}.bcf"), - log: - "logs/varlociraptor/call/{pair}.{caller}.{scatteritem}.log", - params: - obs=lambda w, input: [ - "{}={}".format(s, f) for s, f in zip(get_pair_aliases(w), input.obs) - ], - conda: - "../envs/varlociraptor.yaml" - benchmark: - "benchmarks/varlociraptor/call/{pair}.{caller}.{scatteritem}.tsv" - shell: - "varlociraptor " - "call variants generic --obs {params.obs} " - "--scenario {input.scenario} > {output} 2> {log}" - - -rule sort_calls: - input: - "results/calls/{pair}.{caller}.{scatteritem}.bcf", - output: - temp("results/calls/{pair}.{caller}.{scatteritem}.sorted.bcf"), - log: - "logs/bcf-sort/{pair}.{caller}.{scatteritem}.log", - conda: - "../envs/bcftools.yaml" - resources: - mem_mb=8000, - shell: - "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` " - "-Ob {input} > {output} 2> {log}" - - -rule bcftools_concat: - input: - calls=get_scattered_calls(), - indexes=get_scattered_calls(ext=".bcf.csi"), - output: - "results/calls/{pair}.{scatteritem}.bcf", - log: - "logs/concat-calls/{pair}.{scatteritem}.log", - params: - "-a -Ob", # TODO Check this - wrapper: - "0.59.2/bio/bcftools/concat" diff --git a/workflow/rules/vega.smk b/workflow/rules/vega.smk deleted file mode 100644 index 4199e8fc..00000000 --- a/workflow/rules/vega.smk +++ /dev/null @@ -1,15 +0,0 @@ -rule vg2svg: - input: - "{prefix}.vl.json", - output: - report( - "{prefix}.svg", - caption="../report/tmb.rst", - category="Tumor Mutational Burden", - ), - log: - "logs/vega/{prefix}.log", - conda: - "../envs/vega.yaml" - shell: - "vl2svg {input} {output} > {log} 2>&1" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 6fafb49f..05e55b83 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -4,21 +4,6 @@ description: snakemake configuration file type: object -definitions: - filterentry: - type: object - additionalProperties: - type: string - evententry: - type: object - properties: - varlociraptor: - type: array - items: - type: string - filter: - type: string - properties: samples: type: string @@ -42,198 +27,61 @@ properties: - build - n_chromosomes - - tmb: - type: object - properties: - activate: - type: boolean - coding_genome_size: - # TODO allow integer here! 
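
The config schema below is reduced to `samples`/`units`/`ref`, `neoantigen_prediction`, and `params` (the microphaser events plus the two netMHC tools). The workflow validates the config with `snakemake.utils.validate` (see `common.smk`); the same check can be reproduced standalone with `jsonschema`. A minimal sketch against an abridged stand-in schema, not the full `config.schema.yaml`:

```python
from jsonschema import validate

# Abridged stand-in for workflow/schemas/config.schema.yaml.
schema = {
    "type": "object",
    "properties": {
        "neoantigen_prediction": {
            "type": "object",
            "properties": {"activate": {"type": "boolean"}},
        },
        "params": {
            "type": "object",
            "properties": {
                "net_mhc_pan": {
                    "type": "object",
                    "properties": {
                        "activate": {"type": "boolean"},
                        "peptide_len": {"type": "integer"},
                        "location": {"type": "string"},
                    },
                    "required": ["activate"],
                },
            },
        },
    },
    "required": ["neoantigen_prediction", "params"],
}

config = {
    "neoantigen_prediction": {"activate": True},
    "params": {
        "net_mhc_pan": {"activate": True, "peptide_len": 9, "location": "../netMHCpan-4.1"},
    },
}

validate(instance=config, schema=schema)  # raises ValidationError on a mismatch
print("config fragment is valid")
```
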
- type: string - tumor_sample: - type: string - somatic_events: - type: array - items: - type: string - required: - - activate - - coding_genome_size - - tumor_sample - - somatic_events - - - calling: - type: object - properties: - freebayes: - type: object - properties: - activate: - type: boolean - scenario: - type: string - fdr-control: - type: object - properties: - threshold: - type: number - minimum: 0.0 - maximum: 1.0 - events: - $ref: "#/definitions/evententry" - description: "a map of pairs" - required: - - threshold - - events - required: - - freebayes - - scenario - - fdr-control - - remove_duplicates: - type: object - properties: - activate: - type: boolean - - trimming: - type: object - properties: - activate: - type: boolean - - epitope_prediction: + neoantigen_prediction: type: object properties: activate: type: boolean - affinity: + params: type: object properties: - netMHCpan: + microphaser: type: object properties: - activate: - type: boolean - params: - type: string - netMHCIIpan: + events: + type: object + properties: + normal: + type: string + tumor: + type: string + required: + - normal + - tumor + net_mhc_pan: type: object properties: activate: type: boolean - params: + peptide_len: + type: integer + extra: type: string - - HLAtyping: - type: object - properties: - HLA_LA: - type: object - properties: - activate: - type: boolean - - annotations: - type: object - properties: - vep: - properties: - params: + location: type: string - plugins: - type: array - items: - type: string required: - - params - - plugins - required: - - vep - - fusion: - type: object - properties: - arriba: + - activate + net_mhc_two_pan: type: object properties: activate: type: boolean - blacklist: - type: string - params: - type: string - - params: - type: object - properties: - cutadapt: - type: string - bwa: - type: string - gatk: - type: object - properties: - BaseRecalibrator: - type: string - applyBQSR: - type: string - required: - - BaseRecalibrator - - applyBQSR - picard: - type: object - properties: - MarkDuplicates: + peptide_len: + type: integer + extra: type: string required: - - MarkDuplicates - strelka: - type: object - properties: - config: - type: string - run: - type: string - razers3: - type: string - optitype: - type: string - microphaser: - type: object - properties: - window_len: - type: integer - peptide_len: - type: object - properties: - netMHCpan: - type: integer - netMHCIIpan: - type: integer - kallisto: - type: string - star: - type: string + - activate required: - - bwa - - gatk - - picard - - strelka - - razers3 - microphaser - - optitype + - net_mhc_pan + - net_mhc_two_pan + required: - samples - units - ref - - tmb - - calling - params - - annotations - - epitope_prediction - - affinity + - neoantigen_prediction diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml index e626f60f..e795cdee 100644 --- a/workflow/schemas/samples.schema.yaml +++ b/workflow/schemas/samples.schema.yaml @@ -2,12 +2,18 @@ $schema: "http://json-schema.org/draft-04/schema#" description: an entry in the sample sheet properties: - sample: + sample_name: type: string - description: sample name/identifier - type: + description: sample name/identifier (alphanumeric string, that may additionally contain '_' and '-') + pattern: "^[a-zA-Z_0-9-]+$" + alias: type: string - description: healthy or tumor sample + description: sample name within the VCF/BCF files generated for a group (e.g. tumor, normal, etc.) 
(alphanumeric string, that may additionally contain '_' and '-') + pattern: "^[a-zA-Z_0-9-]+$" + group: + type: string + description: group of samples called jointly (alphanumeric string, that may additionally contain '_' and '-') + pattern: "^[a-zA-Z_0-9-]+$" matched_normal: type: string description: the corresponding healthy control to this tumor sample @@ -18,11 +24,25 @@ properties: description: Purity to use for tumor/normal groups. platform: type: string - enum: ["CAPILLARY", "LS454", "ILLUMINA", "SOLID", "HELICOS", "IONTORRENT", "ONT", "PACBIO"] - + enum: + - "CAPILLARY" + - "LS454" + - "ILLUMINA" + - "SOLID" + - "HELICOS" + - "IONTORRENT" + - "ONT" + - "PACBIO" + description: used sequencing platform + purity: + type: number + minimum: 0.0 + maximum: 1.0 + description: Purity to use for tumor/normal groups. required: - - sample - - type + - sample_name + - alias + - group - platform diff --git a/workflow/schemas/units.schema.yaml b/workflow/schemas/units.schema.yaml index 6fb9a6cf..4fec2074 100644 --- a/workflow/schemas/units.schema.yaml +++ b/workflow/schemas/units.schema.yaml @@ -2,18 +2,22 @@ $schema: "http://json-schema.org/draft-04/schema#" description: row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data type: object properties: - sample: + sample_name: type: string - description: sample name/id the unit has been sequenced from + pattern: "^[a-zA-Z_0-9-]+$" + description: sample name/id the unit has been sequenced from (alphanumeric string, that may additionally contain '_' and '-') sequencing_type: type: string enum: ["DNA", "RNA"] - unit: + description: type of sequenced material ('DNA' or 'RNA') + unit_name: type: string - description: unit or lane name + pattern: "^[a-zA-Z_0-9-]+$" + description: unit id (alphanumeric string, that may additionally contain '_' and '-') fq1: type: string - description: path to FASTQ file + pattern: "^[^ \t]+$" + description: path to FASTQ file (may not contain whitespace) fq2: type: string description: path to second FASTQ file (leave empty in case of single-end) @@ -22,9 +26,9 @@ properties: description: SRA id for automatic download of unit adapters: type: string - description: adapter trimming settings to use (for cutadapt) + description: cutadapt adapter trimming settings to use (see https://cutadapt.readthedocs.io) required: - - sample - - unit + - sample_name + - unit_name - sequencing_type diff --git a/workflow/scripts/add_rna_info.py b/workflow/scripts/add_rna_info.py index bdcdcac2..0eb2298e 100644 --- a/workflow/scripts/add_rna_info.py +++ b/workflow/scripts/add_rna_info.py @@ -1,13 +1,19 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + import pandas as pd ## load data table -data = pd.read_csv(snakemake.input["table"], sep='\t') +data = pd.read_csv(snakemake.input["table"], sep="\t") ## Merge transcript count -transcript_count = pd.read_csv(snakemake.params["abundance"], sep='\t') +transcript_count = pd.read_csv(snakemake.params["abundance"], sep="\t") transcript_count = transcript_count[["target_id", "tpm"]] transcript_count.columns = ["Transcript_ID", "TPM"] -transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split('.', expand=True)[0] +transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split( + ".", expand=True +)[0] data = data.merge(transcript_count, on="Transcript_ID", how="left") -data.to_csv(snakemake.output[0], sep='\t', index=False) +data.to_csv(snakemake.output[0], sep="\t", index=False) diff --git 
a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py new file mode 100644 index 00000000..4806d56e --- /dev/null +++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py @@ -0,0 +1,20 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import pandas as pd + +columns_mapping = { + "gene_name": "gene", + "normal_peptide": "mutation.wildTypeXmer", + "tumor_peptide": "mutation.mutatedXmer", + "freq": "dnaVariantAlleleFrequency", +} + +candidates = ( + pd.read_csv(snakemake.input.microphaser, sep="\t", quoting=3) + .rename(columns=columns_mapping) + .assign(patientIdentifier=snakemake.wildcards.group) +) + +candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3, index=False) diff --git a/workflow/scripts/build_oncoprint_matrix.py b/workflow/scripts/build_oncoprint_matrix.py deleted file mode 100644 index 2b65721b..00000000 --- a/workflow/scripts/build_oncoprint_matrix.py +++ /dev/null @@ -1,42 +0,0 @@ -import pysam -import pandas as pd -import os -import sys - -# redirect output to log file -log = open(snakemake.log[0], "w") -sys.stdout = log -sys.stderr = log - -input_files = snakemake.input - -df = pd.DataFrame(columns=["Sample"]) -for sample_file in input_files: - variant_file = pysam.VariantFile(sample_file) - sample_name = os.path.basename(sample_file).split(".")[0] - gene_variant_dict = {"Sample": [sample_name]} - for rec in variant_file.fetch(): - for sample in rec.samples: - allele_frequencies = rec.samples[sample]["AF"] #Can be multiple entries - for allele_frequency in allele_frequencies: - variant = rec.info["SVLEN"] - if variant[0]: - variant_type = "INDEL" - else: - variant_type = "SNV" - transcripts = rec.info["ANN"] - for transcript in transcripts: - gene = transcript.split("|")[3] - impact = transcript.split("|")[2] - if gene and impact != "MODIFIER": - if gene not in gene_variant_dict: - gene_variant_dict[gene] = set() - gene_variant_dict[gene].add(variant_type) - break - for key, value in gene_variant_dict.items(): - gene_variant_dict[key] = ','.join(value) - sample_df = pd.DataFrame(gene_variant_dict, index=[0]) - df = pd.concat([df, sample_df], join="outer", ignore_index=False, sort=False) -df.set_index("Sample", inplace=True) -with open(snakemake.output[0], 'w') as output_f: - print(df.to_csv(sep="\t", index=True), file=output_f) diff --git a/workflow/scripts/count_neoantigen_occurrences.py b/workflow/scripts/count_neoantigen_occurrences.py index 1b6050e4..eb9c917a 100644 --- a/workflow/scripts/count_neoantigen_occurrences.py +++ b/workflow/scripts/count_neoantigen_occurrences.py @@ -1,3 +1,7 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + import pandas as pd import glob @@ -5,16 +9,21 @@ dfs, dfs_id = dict(), [] -for f in files: - df = pd.read_csv(f, sep = '\t') - dfs[f] = df - df = df[["ID_tumor"]] - dfs_id.append(df) +for f in files: + df = pd.read_csv(f, sep="\t") + dfs[f] = df + df = df[["ID_tumor"]] + dfs_id.append(df) ids = pd.concat(dfs_id) -ids=ids.groupby(ids.columns.tolist(), as_index = False).size().reset_index().rename(columns = {0:"occurrences"}) -ids.to_csv("out/tables/candidate_occurrences.tsv", sep = '\t', index = False) +ids = ( + ids.groupby(ids.columns.tolist(), as_index=False) + .size() + .reset_index() + .rename(columns={0: "occurrences"}) +) +ids.to_csv("out/tables/candidate_occurrences.tsv", sep="\t", index=False) for k, v in dfs.items(): - df = v.merge(ids, on = "ID_tumor") - df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep = ',', 
index = False) + df = v.merge(ids, on="ID_tumor") + df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep=",", index=False) diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py new file mode 100644 index 00000000..4242d684 --- /dev/null +++ b/workflow/scripts/create_neo_fox_group_sheet.py @@ -0,0 +1,64 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import pandas as pd + +HLA_SUFFIXES_REGEX = r"[NLSCAQ]?" + +# allowed loci according to NeoFox input data documentation: +# https://neofox.readthedocs.io/en/latest/03_01_input_data.html#file-with-patient-data +# * mhcIAlleles: comma separated MHC I alleles of the patient for HLA-A, HLA-B and +# HLA-C. If homozygous, the allele should be added twice. +# * mhcIIAlleles: comma separated MHC II alleles of the patient for HLA-DRB1, HLA-DQA1, +# HLA-DQB1, HLA-DPA1 and HLA-DPB1. If homozygous, the allele should be added twice. +ALLOWED_LOCI = { + "A", + "B", + "C", + "DRB1", + "DQA1", + "DQB1", + "DPA1", + "DPB1", +} + +mhc_alleles = pd.read_csv( + snakemake.input.hla_la_bestguess, + sep="\t", + ) +# the Allele column can contain multiple ";"-separated entries for the +# same locus -- NeoFox does a hard assertion that only two alleles per +# gene exist, so we chose to only ever keep the first of such multiple +# possibilities +mhc_alleles.loc[:, "Allele"] = mhc_alleles["Allele"].str.split(pat=";") +mhc_alleles = mhc_alleles.explode(["Allele"]).drop_duplicates(subset=["Locus", "Chromosome"]) +mhc_alleles = mhc_alleles[mhc_alleles["Locus"].isin(ALLOWED_LOCI)] +mhc_alleles.loc[:, "Allele"] = ( + mhc_alleles["Allele"] + .str + .replace( + r"([A-Z]+\d?)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")", + r"HLA-\1*\2:\3\5", + regex=True, + ) +) +# the multiple ";"-separated entries from above can be identical after reducing +# to the allele group (1st number) and specific HLA protein (2nd number) +mhc_alleles = mhc_alleles.drop_duplicates(subset=["Chromosome", "Allele"]) + +mhc_one_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() == 1, "Allele"] ) +mhc_two_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() > 1, "Allele"] ) + +patient_info = pd.DataFrame( + data={ + "identifier": [ snakemake.params.group.name ], + "tumorType": [ snakemake.params.group["tumorType"] ], + "mhcIAlleles": [ mhc_one_alleles ], + "mhcIIAlleles": [ mhc_two_alleles ], + } +# This is required for cases where no tumorType is available, as NeoFox does not +# seem to be able to handle empty entries, here -- so we remove the whole column +).dropna(axis="columns") + +patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3, index=False) diff --git a/workflow/scripts/group_mhc_output.py b/workflow/scripts/group_mhc_output.py deleted file mode 100644 index 5ce0cf15..00000000 --- a/workflow/scripts/group_mhc_output.py +++ /dev/null @@ -1,28 +0,0 @@ -import sys -import os -import pandas as pd -import numpy as np - -first = True -out = open(snakemake.output[0], "w") -for e in snakemake.input: - if os.path.getsize(e) > 0: - mhcout = open(e, 'r') - alleles = next(mhcout).split('\t') - header = next(mhcout).rstrip().split('\t') - if first: - allele = '' - for i in range(0, len(header)): - #print(header[i].rstrip()) - if i < len(alleles): - #print(alleles[i]) - if alleles[i] != '': - allele = alleles[i].rstrip() + '_' - header[i] = allele + header[i].rstrip() - header[len(header) -1] = "NB" - first = False - #print(header) - out.write('\t'.join(header) + '\n') - for 
line in mhcout: - out.write(line) -out.close() diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py deleted file mode 100644 index b90af1cb..00000000 --- a/workflow/scripts/merge_data.py +++ /dev/null @@ -1,89 +0,0 @@ -import sys -import os -import pandas as pd -import numpy as np - - -def select_columns(mhc): - rank_cols = [c for c in mhc.columns if "Rank" in c] - affinity_cols = [c for c in mhc.columns if "nM" in c] - mhc_cols = ["Pos"] + ["ID"] + ["Peptide"] + rank_cols + affinity_cols + ["NB"] - mhc = mhc[mhc_cols] - mhc["Rank_min"] = mhc[rank_cols].min(axis=1) - mhc["Aff_min"] = mhc[affinity_cols].min(axis=1) - mhc["Top_rank_HLA"] = mhc[rank_cols].idxmin(axis=1) - mhc["Top_affinity_HLA"] = mhc[affinity_cols].idxmin(axis=1) - mhc["Top_rank_HLA"] = mhc["Top_rank_HLA"].str.replace("_Rank","") - mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","") - return mhc - -def merge(info, tumor, normal, outfile): - tumor = select_columns(tumor) - normal = select_columns(normal) - id_length = len(tumor.ID[0]) - print(info.columns) - info["ID"] = info["id"].astype(str).str[:id_length] - merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID']) - merged_mhc = merged_mhc.rename(columns={col: col.replace("_y","_normal") for col in merged_mhc.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns}) - info = info.rename(columns={"gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"}) - merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID') - - merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1) - ## Are all possible variants in the peptide ("Cis") or not ("Trans") - merged_dataframe["Variant_Orientation"] = "Cis" - trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar - merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans" - - ## check misssense/silent mutation status - nonsilent = merged_dataframe.Peptide_tumor != merged_dataframe.Peptide_normal - merged_dataframe = merged_dataframe[nonsilent] - merged_dataframe = merged_dataframe.drop_duplicates(subset=["transcript","offset","Peptide_tumor","Somatic_AminoAcid_Change"]) - - data = merged_dataframe[["ID","transcript","Gene_ID","Gene_Symbol","Chromosome","offset","freq","depth", - "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","NB_tumor","Rank_min_tumor","Aff_min_tumor", - "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","NB_normal", - "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]] - - data.columns = ["ID","Transcript_ID","Gene_ID","Gene_Symbol","Chromosome","Position","Frequency","Read_Depth", - "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","BindingHLAs_tumor","Rank_min_tumor","Affinity_min_tumor", - "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","BindingHLAs_normal", - "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"] - - # data = data[data.BindingHLAs_tumor > 0] - # data = data[(data.NB_normal.isna()) | (data.NB_normal == 0)] - #data = data[(data.BindingHLAs_normal == 0)] - - ### Delete Stop-Codon including peptides - data = data[data.Peptide_tumor.str.count("x") == 0] - data = data[data.Peptide_tumor.str.count("X") == 0] - data.sort_values(["Chromosome", "somatic_positions"], inplace=True) 
- ### Remove Duplicate kmers - data = data.drop_duplicates(["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"]) - - data.to_csv(outfile, index=False, sep = '\t') - - -## highlight the difference between mutated neopeptide and wildtype -def diffEpitope(e1,e2): - if str(e2) == 'nan' or str(e2) == '': - return(e1) - e1 = str(e1) - e2 = str(e2) - diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]] - e_new = e1 - e2_new = e2 - for p in diff_pos: - e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:] - e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:] - return(e_new) - - -def main(): - info = pd.read_csv(snakemake.input[0], sep = '\t', dtype=str) - tumor = pd.read_csv(snakemake.input[1], sep = '\t') - normal = pd.read_csv(snakemake.input[2], sep = '\t') - outfile = snakemake.output[0] - merge(info, tumor, normal, outfile) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/workflow/scripts/merge_mhcflurry.py b/workflow/scripts/merge_mhcflurry.py deleted file mode 100644 index fcc6aba4..00000000 --- a/workflow/scripts/merge_mhcflurry.py +++ /dev/null @@ -1,63 +0,0 @@ -import sys -import os -import pandas as pd -import numpy as np - - -## highlight the difference between mutated neopeptide and wildtype -def diffEpitope(e1,e2): - if str(e2) == 'nan': - return(e1) - e1 = str(e1) - e2 = str(e2) - diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]] - e_new = e1 - e2_new = e2 - for p in diff_pos: - e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:] - e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:] - return(e_new) - -info = pd.read_csv(snakemake.input[0]) -tumor = pd.read_csv(snakemake.input[1]) -normal = pd.read_csv(snakemake.input[2]) -outfile = snakemake.output[0] - - -tumor = tumor[["source_sequence_name","peptide","allele","affinity","percentile_rank"]] -tumor = tumor.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index() -tumor.columns = tumor.columns.map("-".join) -tumor = tumor.rename(columns={col: col.replace("-","") for col in tumor.columns if col.endswith("-")}) - -normal = normal[["source_sequence_name","peptide","allele","affinity","percentile_rank"]] -normal = normal.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index() -normal.columns = normal.columns.map("-".join) -normal = normal.rename(columns={col: col.replace("-","") for col in normal.columns if col.endswith("-")}) - -merged = tumor.merge(normal, how="left", on=["source_sequence_name"]) - -merged = merged.rename(columns={col: col.replace("_y","_normal") for col in merged.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged.columns}) -## add info -info = info.rename(columns={"id":"ID","gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"}) -merged_dataframe = merged.merge(info, how="left", left_on="source_sequence_name", right_on="ID") - -merged_dataframe["peptide_tumor"]=merged_dataframe[["peptide_tumor","peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1) -## Are all possible variants in the peptide ("Cis") or not ("Trans") -merged_dataframe["Variant_Orientation"] = "Cis" -trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar -merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans" - -## check misssense/silent mutation status -nonsilent = merged_dataframe.peptide_tumor != merged_dataframe.peptide_normal -merged_dataframe = 
merged_dataframe[nonsilent]
-data = merged_dataframe.drop_duplicates(subset=["Gene_ID","offset","peptide_tumor","Somatic_AminoAcid_Change"])
-
-### Delete Stop-Codon including peptides
-data = data[data.peptide_tumor.str.count("x") == 0]
-data = data[data.peptide_tumor.str.count("X") == 0]
-data.to_csv(outfile, index=False, sep = '\t')
-
-
-
-
-
diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py
new file mode 100644
index 00000000..669a7716
--- /dev/null
+++ b/workflow/scripts/merge_neoantigen_info.py
@@ -0,0 +1,295 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pandas as pd
+from typing import List, Tuple
+
+
+def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame:
+    df = df.set_index(["id", "pos_in_id_seq", "pep_seq", "allele"])
+    rank_col = f"{rank_type}_rank"
+    score_col = f"{rank_type}_score"
+    prefix = f"top_{rank_col}_"
+    columns_to_keep = ["bind_core", rank_col, score_col]
+    return (
+        df.loc[df.groupby(["pep_seq", "id"])[rank_col].idxmin(), columns_to_keep]
+        .reset_index(level="allele")
+        .add_prefix(prefix)
+        .reset_index()
+        .drop_duplicates()
+    )
+
+
+def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame:
+    merge_cols = ["id", "pos_in_id_seq", "pep_seq"]
+    common_info = sample.loc[
+        :, merge_cols + ["ave_el_score", "num_binders"]
+    ].drop_duplicates()
+    sample_el = get_best_rank_per_peptide(sample, "el")
+    sample_ba = get_best_rank_per_peptide(sample, "ba")
+    sample_filtered = sample_el.merge(sample_ba, on=merge_cols)
+    return sample_filtered.merge(common_info, how="left", on=merge_cols).assign(
+        alias=alias
+    )
+
+
+def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]:
+    """
+    Highlight the difference between mutated neopeptide and normal peptide
+    """
+    if normal_p == "nan" or normal_p == "":
+        return (tumor_p, normal_p)
+    assert len(tumor_p) == len(
+        normal_p
+    ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths."
+    diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]]
+    tp_changed = tumor_p
+    np_changed = normal_p
+    for p in diff_pos:
+        tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :]
+        np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :]
+    return (tp_changed, np_changed)
+
+
+def diff_tumor_normal_peptides(
+    group: pd.DataFrame, column: str, tumor_alias: str
+) -> pd.DataFrame:
+    normal_pep = group.loc[group["alias"] == "normal", column].fillna("")
+    if normal_pep.empty:
+        normal_pep = ""
+    else:
+        normal_pep = normal_pep.squeeze()
+    tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze()
+    # Remove groups where the tumor peptide contains a stop codon.
+    # TODO: Maybe this should be a hard fail complaining to fix this upstream?
+    if "X" in tumor_pep:
+        print(
+            f"Warning: tumor peptide '{tumor_pep}' contains a stop codon, "
+            "removing it from the further analysis of this group.",
+            file=sys.stderr,
+        )
+        return group.loc[[], :]
+    # Silent mutations should not be included in microphaser output.
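+    # (If one slipped through, normal_pep and tumor_pep below would be identical.)
+    # For reference, a sketch of the highlight step applied further below, using
+    # the well-known example epitope SYFPEITHI as a purely illustrative value:
+    #   highlight_peptides_diff("SYFPEITHL", "SYFPEITHI")
+    #   returns ("SYFPEITHl", "SYFPEITHi"), lower-casing only differing positions.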
+    if normal_pep == tumor_pep:
+        raise ValueError(
+            f"For peptide '{group['id'].iloc[0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n"
+            "Please fix this upstream or comment out this check to ignore this problem.\n"
+        )
+    t_diff, n_diff = highlight_peptides_diff(tumor_pep, normal_pep)
+    group.loc[group["alias"] == tumor_alias, column] = t_diff
+    group.loc[group["alias"] == "normal", column] = n_diff
+    return group
+
+
+def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame:
+    """
+    Get the -o info output of the microphaser filter command into tidy data format.
+    """
+    info = info.rename(columns={"credible_interval": "freq_credible_interval"})
+    # Aggregate multiple identical entries that differ only in the remaining columns
+    # into one, '|'-joining the 'transcript', 'depth', 'freq' and 'freq_credible_interval'
+    # values and re-expanding the collected 'id's into one row per 'id' via explode().
+    cols = [c for c in info.columns if c not in ["id", "transcript", "depth", "freq", "freq_credible_interval"]]
+    aggregation_functions = {
+        "id": lambda i: list(i),
+        "transcript": lambda t: "|".join(list(t)),
+        "depth": lambda d: "|".join(list(d)),
+        "freq": lambda f: "|".join(list(f)),
+        "freq_credible_interval": lambda c: "|".join(list(c)),
+    }
+    info = (
+        info.groupby(cols, dropna=False)
+        .agg(aggregation_functions)
+        .reset_index()
+        .explode("id")
+        .set_index(
+            [
+                "id",
+                "transcript",
+                "gene_id",
+                "gene_name",
+                "chrom",
+                "offset",
+                "frame",
+                "freq",
+                "freq_credible_interval",
+                "depth",
+                "strand",
+            ]
+        )
+    )
+    int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"]
+    info[int_cols] = info[int_cols].astype("int32")
+    # TODO: Ensure that microphaser output contains only one entry per id.
+    # If there is more than one entry per index, ensure that they are identical
+    if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0:
+        raise ValueError(
+            f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n"
+        )
+    # Always take the first entry for each index.
+    info = info.groupby(info.index).head(1)
+    # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal.
+    # TODO: factor out tidying of column pairs into a function.
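+    # One possible shape for that helper (a sketch only, not used below; the name
+    # `melt_column_pair` and the `value_to_alias` mapping are illustrative):
+    # def melt_column_pair(df, value_name, value_to_alias):
+    #     return (
+    #         df.melt(
+    #             var_name="alias",
+    #             value_name=value_name,
+    #             value_vars=list(value_to_alias),
+    #             ignore_index=False,
+    #         )
+    #         .replace(value_to_alias)
+    #         .set_index("alias", append=True)
+    #     )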
+ num_var_in_pep_tidy = ( + info.assign(ngermline=lambda x: x.nvar - x.nsomatic) + .melt( + var_name="alias", + value_name="num_var_in_pep", + value_vars=["ngermline", "nsomatic"], + ignore_index=False, + ) + .replace({"ngermline": "normal", "nsomatic": tumor_alias}) + .set_index("alias", append=True) + ) + num_var_sites_tidy = ( + info.assign(ngermvariant_sites=lambda x: x.nvariant_sites - x.nsomvariant_sites) + .melt( + var_name="alias", + value_name="num_var_sites", + value_vars=["ngermvariant_sites", "nsomvariant_sites"], + ignore_index=False, + ) + .replace({"ngermvariant_sites": "normal", "nsomvariant_sites": tumor_alias}) + .set_index("alias", append=True) + ) + genomic_pos_tidy = ( + info.melt( + var_name="alias", + value_name="genomic_pos", + value_vars=["somatic_positions", "germline_positions"], + ignore_index=False, + ) + .replace({"somatic_positions": tumor_alias, "germline_positions": "normal"}) + .set_index("alias", append=True) + ) + aa_changes_tidy = ( + info.melt( + var_name="alias", + value_name="aa_changes", + value_vars=["somatic_aa_change", "germline_aa_change"], + ignore_index=False, + ) + .replace({"somatic_aa_change": tumor_alias, "germline_aa_change": "normal"}) + .set_index("alias", append=True) + ) + nt_seq_tidy = ( + info.melt( + var_name="alias", + value_name="nt_seq", + value_vars=["normal_sequence", "mutant_sequence"], + ignore_index=False, + ) + .replace({"normal_sequence": "normal", "mutant_sequence": tumor_alias}) + .set_index("alias", append=True) + ) + all_tidy = num_var_in_pep_tidy.join( + [num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy] + ) + return all_tidy.reset_index() + + +def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = ""): + if sum(df.duplicated(subset=cols)) > 0: + duplicates = df[ + df.duplicated( + subset=cols, + keep=False, + ) + ] + cols_str = '", "'.join(cols) + raise ValueError( + f'Found multiple rows with identical [ "{cols_str}" ] entries.\n' + "This indicates an upstream issue, please fix this.\n" + f"{specific_error}" + "The offending entries are:\n" + f"{duplicates}\n" + ) + + +def merge_data_frames( + info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str +) -> pd.DataFrame: + # get and merge tumor and normal + tumor_filtered = get_filtered_per_alias(tumor, tumor_alias) + normal_filtered = get_filtered_per_alias(normal, "normal") + all_filtered = ( + pd.concat([tumor_filtered, normal_filtered]) + .groupby("id", group_keys=False) + .apply(diff_tumor_normal_peptides, column="pep_seq", tumor_alias=tumor_alias) + ) + info_tidy = tidy_info(info, tumor_alias) + + # netMHCpan 4.1 truncates the fasta entry IDs, so we have to cut down the IDs + # that microphaser originally provided to make the following .join() work + len_tumor_id = len(tumor_filtered["id"][0]) + len_normal_id = len(normal_filtered["id"][0]) + assert ( + len_tumor_id == len_normal_id + ), f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n" + info_tidy["id"] = info_tidy["id"].str[:len_tumor_id] + # Double-check for duplicates resulting from the id truncation + check_duplicates( + info_tidy, + ["id", "alias"], + specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n", + ) + + all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"]) + + # Double-check for weird duplicates, as previously done in Jan's code. 
+ # Jan's code was only checking for ["transcript", "offset", "pep_seq", "aa_changes"], + # we check for everything except id. + cols_without_id = [c for c in all_annotated.columns if c not in ["id"]] + all_annotated = all_annotated.drop_duplicates(subset=cols_without_id) + check_duplicates(all_annotated, cols_without_id) + + column_order = [ + "id", + "pep_seq", + "pos_in_id_seq", + "alias", + "num_binders", + "freq", + "depth", + "num_var_sites", + "num_var_in_pep", + "top_el_rank_allele", + "top_el_rank_bind_core", + "top_el_rank_el_rank", + "top_el_rank_el_score", + "ave_el_score", + "top_ba_rank_allele", + "top_ba_rank_bind_core", + "top_ba_rank_ba_rank", + "top_ba_rank_ba_score", + "aa_changes", + "genomic_pos", + "nt_seq", + "gene_name", + "gene_id", + "transcript", + "chrom", + "offset", + "frame", + "strand", + "freq_credible_interval", + ] + + def get_id_rank(group: pd.DataFrame, tumor_alias: str): + return group.loc[group["alias"] == tumor_alias, "top_el_rank_el_rank"].squeeze() + + sort_rank = ( + all_annotated.groupby("id").apply(get_id_rank, tumor_alias).rename("sort_rank") + ) + all_sorted = ( + all_annotated.merge(sort_rank, on=["id"], how="left") + .sort_values(["sort_rank", "id", "alias"], ascending=[True, True, False]) + .drop(columns="sort_rank") + ) + + return all_sorted.reindex(columns=column_order) + + +info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str) +tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str}) +normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str}) +data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias) +data.to_csv(snakemake.output[0], index=False, sep="\t") diff --git a/workflow/scripts/oncoprint.R b/workflow/scripts/oncoprint.R deleted file mode 100644 index 5476b305..00000000 --- a/workflow/scripts/oncoprint.R +++ /dev/null @@ -1,56 +0,0 @@ -# log to file -log <- file(snakemake@log[[1]], open="wt") -sink(log) -sink(log, type="message") - -library(ComplexHeatmap) -library(ggplot2) - -table = read.table(snakemake@input[[1]], sep="\t", header=TRUE, row.names=1) -mat = as.matrix(table) -mat = t(mat) -## remove "full" lines -mat = mat[rowSums(mat == "") > 0,] -## remove single lines -mat = mat[rowSums(mat != "") > 0,] - -col = c(SNV = "blue", INDEL = "red") - -alter_fun = list( - SNV = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.9, - gp = gpar(fill = col["SNV"], col = NA)), - INDEL = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.4, - gp = gpar(fill = col["INDEL"], col = NA)) - ) - - -heatmap_legend_param = list(title = "Alterations", at = c("SNV", "INDEL"), - labels = c("SNV", "INDEL")) -if (ncol(mat) > 1 ){ - mat <- mat[order(apply(mat, 1, function(row) sum(row != "")), decreasing = T), ] -} - -i = 0 -c = 0 -matlist = list() -while (i + 2000 < nrow(mat)) { - m <- mat[(i + 1):(i + 2000), , drop=FALSE] - rows_matrix <- nrow(m) - height_plot <- (rows_matrix/5) - if (height_plot < 4) { - height_plot <- 4 - } - pdf(file = sub("0.pdf", paste(c, ".pdf", sep=''), snakemake@output[[1]]), height=height_plot) - if (rows_matrix > 0) { - oncoprint <- oncoPrint(m, - alter_fun = alter_fun, col = col, - remove_empty_columns = FALSE, remove_empty_rows = TRUE, - pct_side = "right", row_names_side = "left", - show_column_names=T, - column_title = "OncoPrint", heatmap_legend_param = heatmap_legend_param) - draw(oncoprint, newpage=F) - } - dev.off() - i = i + 2000 - c = c + 1 -} \ No newline at end of file diff --git a/workflow/scripts/parse_HLA_types.py 
b/workflow/scripts/parse_HLA_types.py deleted file mode 100644 index ff1906f3..00000000 --- a/workflow/scripts/parse_HLA_types.py +++ /dev/null @@ -1,20 +0,0 @@ -import pandas as pd - -hlaI = ["A","B","C"] - -hlaII = ["DRB1", "DPA1", "DPB1", "DQA1", "DQB1"] - -hlas = pd.read_csv(snakemake.input[0], sep='\t') - -hlasI = hlas[hlas.Locus.isin(hlaI)] -hlasI["Allele"]="HLA-" + hlasI.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','') -hlasI = hlasI[["Allele"]].drop_duplicates() -hlasI.to_csv(snakemake.output[0], sep='\t', index=False) - -hlasII = hlas[hlas.Locus.isin(hlaII)] -hlasII["HLA"] = hlasII.Locus.str[0:2] -hlasII["Allele"] = hlasII.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','') - -hlasII = pd.DataFrame("HLA-" + hlasII.groupby(["HLA","Chromosome"])["Allele"].apply(lambda x: "-".join(x)).reset_index()["Allele"]).drop_duplicates() -hlasII.loc[hlasII.Allele.str.contains("DRB"), "Allele"] = hlasII[hlasII.Allele.str.contains("DRB")]["Allele"].str.replace("HLA-DRB1","DRB1_") -hlasII.to_csv(snakemake.output[1], sep='\t', index=False) diff --git a/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py new file mode 100644 index 00000000..79c6994c --- /dev/null +++ b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py @@ -0,0 +1,145 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import pandas as pd + +# # read in available alleles + + +def read_allele_list(filename: str): + with open(filename, "r") as alleles_in: + alleles = set() + for line in alleles_in: + if not (line.startswith("#") or line == "\n"): + alleles.add(line.strip()) + return alleles + + +HLA_SUFFIXES_REGEX = r"[NLSCAQ]?" + +# netMHCpan alleles and loci +HLA_ONE_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_one_alleles) +hla_one_net_mhc_alleles = pd.Series(list(HLA_ONE_NET_MHC_ALLELES)) +HLA_ONE_LOCI = set( + hla_one_net_mhc_alleles[hla_one_net_mhc_alleles.str.startswith("HLA-")] + .str.replace(r"HLA-([A-Z])[\d:]+" + HLA_SUFFIXES_REGEX, r"\1", regex=True) + .drop_duplicates() +) + + +# netMHCIIpan alleles and loci +HLA_TWO_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_two_alleles) +hla_two_net_mhc_alleles = pd.Series(list(HLA_TWO_NET_MHC_ALLELES)) +# DRB alleles need to be formatted differently from DP and DQ alleles, +# so we extract them separately. 
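+# For instance (allele values purely illustrative): an HLA-LA call like
+# "DRB1*15:01" is rendered as "DRB1_1501" for netMHCIIpan, whereas an alpha/beta
+# pair like "DQA1*01:02" plus "DQB1*06:02" becomes "HLA-DQA10102-DQB10602".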
+DRB_LOCI = set(
+    hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("DRB")]
+    .str.replace(r"(DRB\d)_\d+" + HLA_SUFFIXES_REGEX, r"\1", regex=True)
+    .drop_duplicates()
+)
+
+ALPHA_BETA_LOCI = set(
+    hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("HLA-")]
+    .str.replace(
+        r"HLA-(D[A-Z]A\d)\d+"
+        + HLA_SUFFIXES_REGEX
+        + r"-(D[A-Z]B\d)\d+"
+        + HLA_SUFFIXES_REGEX,
+        r"\1_\2",
+        regex=True,
+    )
+    .str.split("_")
+    .explode()
+    .drop_duplicates()
+)
+
+# read in alleles as determined by HLA-LA
+hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep="\t")
+
+# the Allele column can contain multiple ";"-separated entries for the
+# same locus
+hlas.loc[:, "Allele"] = hlas["Allele"].str.split(pat=";")
+hlas["alternative"] = hlas["Allele"].apply(lambda x: range(len(x)))
+hlas = hlas.explode(["Allele", "alternative"])
+
+# reformat to netMHCpan allele list format:
+# * https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/allele.list
+# it needs to be in the format of the first column of the above list, as explained in
+# the "Instructions" tab under "MHC SELECTION" point "2)" at:
+# * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1
+hla_one_alleles = (
+    hlas.loc[hlas["Locus"].isin(HLA_ONE_LOCI), "Allele"]
+    .str.replace(
+        r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+        r"HLA-\1\2:\3\5",
+        regex=True,
+    )
+    .drop_duplicates()
+)
+hla_one_alleles[hla_one_alleles.isin(HLA_ONE_NET_MHC_ALLELES)].to_csv(
+    snakemake.output.hlaI, sep="\t", index=False, header=False
+)
+
+# reformat to netMHCIIpan allele list format:
+# * https://services.healthtech.dtu.dk/services/netMHCIIpan-4.1/alleles_name.list
+# contrary to the format in that list, alleles actually need to be formatted like this,
+# with <gene name>s found in the HLA-LA "Locus" column and syntax for the sub-numbering (only
+# the 1st and 2nd sub-number are used) according to the official nomenclature (see:
+# http://www.hla.alleles.org/nomenclature/naming.html ):
+# * DRB alleles: "<gene name>_<allele group><protein>"
+# * DP and DQ alleles (alpha means A and beta means B in the gene name, for example DPA):
+#   "HLA-<alpha gene name><allele group><protein>-<beta gene name><allele group><protein>"
+# This format was determined by manually selecting combinations above the
+# "type a list of molecules names" field of the "Submission" tab at:
+# * https://services.healthtech.dtu.dk/service.php?netMHCIIpan-4.1
+
+# TODO: check whether Jan's previous parsing of DRB alleles into this format is necessary:
+# * example: DRB1_1501-DRB30101-DRB40301N
+# * "DRB1_<allele group><protein>-DRB3<allele group><protein>-DRB4<allele group><protein>"
+drb_alleles = hlas.loc[hlas["Locus"].isin(DRB_LOCI)]
+hla_two_alleles = (
+    drb_alleles["Allele"]
+    .str.replace(
+        r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+        r"\1_\2\3\5",
+        regex=True,
+    )
+    .drop_duplicates()
+)
+
+# handle alleles where a combination of alpha and beta always exists
+alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA_LOCI)]
+alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace(
+    r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")",
+    r"\1\2\3\5",
+    regex=True,
+)
+# we need a variable to group alpha and beta of the same gene combination together
+alpha_beta_alleles.loc[:, "gene_group"] = alpha_beta_alleles["Locus"].str.replace(
+    r"(D[A-Z])[AB]\d", r"\1", regex=True
+)
+# we need to handle cases where we had multiple allele entries in an
+# alpha or beta locus, adding in a duplicate of the corresponding locus
+alleles_to_duplicate = alpha_beta_alleles.loc[
+    alpha_beta_alleles["alternative"] > 0,
+    ["Locus", "Chromosome", "alternative"],
+].replace(regex={"Locus": {"(D[A-Z])A(\d)": r"\1B\2", "(D[A-Z])B(\d)": r"\1A\2"}}) +alleles_to_insert = alleles_to_duplicate.merge( + alpha_beta_alleles.drop("alternative", axis="columns"), + on=["Locus", "Chromosome"], + how="left", +) +alpha_beta_alleles = pd.concat( + [alpha_beta_alleles, alleles_to_insert] +).drop_duplicates() + +hla_two_alleles = hla_two_alleles.append( + alpha_beta_alleles.groupby(["gene_group", "Chromosome", "alternative"])["Allele"] + .transform(lambda x: f"HLA-{'-'.join(x)}") + .drop_duplicates() +) + +hla_two_alleles[hla_two_alleles.isin(HLA_TWO_NET_MHC_ALLELES)].to_csv( + snakemake.output.hlaII, sep="\t", index=False, header=False +) diff --git a/workflow/scripts/phylogeny.R b/workflow/scripts/phylogeny.R index 7f26becf..9e32c131 100644 --- a/workflow/scripts/phylogeny.R +++ b/workflow/scripts/phylogeny.R @@ -1,3 +1,7 @@ +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + library(phangorn) ## read the variant matrix diff --git a/workflow/scripts/prepare_neoprint.py b/workflow/scripts/prepare_neoprint.py new file mode 100644 index 00000000..0a535d14 --- /dev/null +++ b/workflow/scripts/prepare_neoprint.py @@ -0,0 +1,88 @@ +import sys +import json + +sys.stderr = open(snakemake.log[0], "w") + +import pandas as pd +from typing import Tuple + +def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: + """ + Highlight the difference between mutated neopeptide and normal peptide + """ + if normal_p == "nan" or normal_p == "NA" or normal_p == "": + return (tumor_p, normal_p) + assert len(tumor_p) == len( + normal_p + ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths." + diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] + tp_changed = tumor_p + np_changed = normal_p + for p in diff_pos: + tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :] + np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :] + return (tp_changed, np_changed) + +all_neopeptides = pd.read_csv(snakemake.input.neopeptides, sep="\t") + +# If we leave in any dots, datavzrd will interpret this as attributes +all_neopeptides.columns = all_neopeptides.columns.str.replace(".", "_", regex=False) + +# Aggregate multiple identical entries that differ only in 'id' and 'transcript' +# into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated +# list. +# TODO: Remove this redundancy from microphaser output before passing it along to other +# tools. 
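+# Toy example of the aggregation below (values hypothetical): two rows identical
+# in every other column, one with (id="x1", transcript="ENST0001") and one with
+# (id="x2", transcript="ENST0002"), collapse into a single row with id "x1" and
+# transcript "ENST0001|ENST0002".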
+cols = [c for c in all_neopeptides.columns if c not in ["id", "transcript"]] +aggregation_functions = { + "id": lambda i: list(i)[0], + "transcript": lambda t: "|".join(list(t)), +} +all_neopeptides = ( + all_neopeptides.groupby(cols, dropna=False) + .agg(aggregation_functions) + .reset_index() + .explode("id") +) + +# highlight mutations in the original Xmers +all_neopeptides[["mutation_mutatedXmer", "mutation_wildTypeXmer"]] = ( + pd.DataFrame( + all_neopeptides + .fillna({"mutation_wildTypeXmer": ""}) + .apply(lambda row: highlight_peptides_diff(row["mutation_mutatedXmer"], row["mutation_wildTypeXmer"]), axis="columns") + .tolist() + ) +) + +# create new purity-adjusted DNA variant allele frequency +all_neopeptides["purity_adjusted_DNA_VAF"] = all_neopeptides["dnaVariantAlleleFrequency"] / snakemake.params.purity + +# round all floats to the specified decimals +all_neopeptides = all_neopeptides.round(decimals=5) + +# define important columns to move to the left of the table + +neofox_important_cols = snakemake.params.neofox_important_cols + +important_cols = neofox_important_cols["general"] + neofox_important_cols["I"] + neofox_important_cols["II"] + + +mhc_one = ( + all_neopeptides[ important_cols + [ col for col in all_neopeptides.columns if col not in important_cols ] ] + .sort_values(by = ["PRIME_best_rank", "MixMHC2pred_best_rank"]) + .groupby("PRIME_best_rank") + .head(n=1) +) + +mhc_one.to_csv(snakemake.output.mhc_one, sep="\t", index=False) + +# move important columns to the left of the table +mhc_two = ( + all_neopeptides[ neofox_important_cols["general"] + neofox_important_cols["II"] + neofox_important_cols["I"] + [ col for col in all_neopeptides.columns if col not in important_cols ] ] + .sort_values(by = ["MixMHC2pred_best_rank", "PRIME_best_rank"]) + .groupby("MixMHC2pred_best_rank") + .head(n=1) +) + +mhc_two.to_csv(snakemake.output.mhc_two, sep="\t", index=False) diff --git a/workflow/scripts/render-scenario.py b/workflow/scripts/render-scenario.py deleted file mode 100644 index 692b0ee5..00000000 --- a/workflow/scripts/render-scenario.py +++ /dev/null @@ -1,8 +0,0 @@ -from jinja2 import Template -import pandas as pd - -with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out: - samples = snakemake.params.samples - out.write(Template(template.read()).render( - samples=samples[(samples["sample"] == snakemake.wildcards.pair) | (samples["sample"] == samples.loc[snakemake.wildcards.pair, "matched_normal"])] - )) diff --git a/workflow/scripts/sample_comp_plot.py b/workflow/scripts/sample_comp_plot.py index 3c7c287d..ffbd7aaa 100644 --- a/workflow/scripts/sample_comp_plot.py +++ b/workflow/scripts/sample_comp_plot.py @@ -1,3 +1,7 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + import os import glob import pandas as pd @@ -6,8 +10,10 @@ import matplotlib.pyplot as plt from pysam import VariantFile -variant_df = pd.read_csv(snakemake.input[0], sep='\t').fillna(0.0) -variant_df = variant_df[["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")]] +variant_df = pd.read_csv(snakemake.input[0], sep="\t").fillna(0.0) +variant_df = variant_df[ + ["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")] +] ## tidy data - for facet plot tidy_df = variant_df.melt(id_vars=["CHROM", "POS"], var_name="Sample", value_name="VAF") g = sns.FacetGrid(tidy_df, col="Sample") @@ -15,25 +21,27 @@ g.savefig(snakemake.output["facet"]) plt.close() ## pairplot -sns.pairplot(variant_df.drop(["CHROM", "POS"],axis=1), diag_kind="kde") 
+sns.pairplot(variant_df.drop(["CHROM", "POS"], axis=1), diag_kind="kde") plt.savefig(snakemake.output["pairplot"]) plt.close() + def overlap_pct(x, y, **kws): n = 0 - for i in range(0,len(x)): + for i in range(0, len(x)): if (x[i] > 0) & (y[i] > 0): n += 1 - overlap=n/len([e for e in x if e > 0]) + overlap = n / len([e for e in x if e > 0]) ax = plt.gca() - ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4)) - ax.annotate("Shared Variants: {}".format(n), xy=(.2, .6)) + ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(0.2, 0.4)) + ax.annotate("Shared Variants: {}".format(n), xy=(0.2, 0.6)) + def variants(x, **kws): positive = len([e for e in x if e > 0]) - ax = plt.gca() - ax.annotate("#Variants: {}".format(positive), xy=(.2, .5), -xycoords=ax.transAxes) + ax = plt.gca() + ax.annotate("#Variants: {}".format(positive), xy=(0.2, 0.5), xycoords=ax.transAxes) + g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1)) @@ -42,7 +50,7 @@ def variants(x, **kws): g.savefig(snakemake.output["grid"]) plt.close() -#def neg_overlap_pct(x, y, **kws): +# def neg_overlap_pct(x, y, **kws): # n = 0 # for i in range(0,len(x)): # if (x[i] == 0) & (y[i] == 0): @@ -52,25 +60,24 @@ def variants(x, **kws): # ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4)) # ax.annotate("Shared missing variants: {}".format(n), xy=(.2, .6)) -#def neg_variants(x, **kws): +# def neg_variants(x, **kws): # zero = len([e for e in x if e == 0]) -# ax = plt.gca() +# ax = plt.gca() # ax.annotate("#Missing Variants: {}".format(zero), xy=(.2, .5), -#xycoords=ax.transAxes) +# xycoords=ax.transAxes) -#g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1)) +# g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1)) -#g.map_offdiag(neg_overlap_pct) -#g.map_diag(neg_variants) -#g.savefig("plots/Mssing_Variant_table.pdf") -#plt.close() +# g.map_offdiag(neg_overlap_pct) +# g.map_diag(neg_variants) +# g.savefig("plots/Mssing_Variant_table.pdf") +# plt.close() -for c in variant_df.drop(["CHROM", "POS"], axis=1).columns: - sns.distplot(variant_df[[c]][variant_df[c] > 0]) - plt.title(c) - plt.savefig("plots/positive_" + c +".distplot.pdf") +for c in variant_df.drop(["CHROM", "POS"], axis=1).columns: + sns.distplot(variant_df[[c]][variant_df[c] > 0]) + plt.title(c) + plt.savefig("plots/positive_" + c + ".distplot.pdf") plt.close() sns.distplot(variant_df[[c]]) - plt.savefig("plots/all_" + c +".distplot.pdf") + plt.savefig("plots/all_" + c + ".distplot.pdf") plt.close() - diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py new file mode 100644 index 00000000..fa129c0a --- /dev/null +++ b/workflow/scripts/tidy_mhc_output.py @@ -0,0 +1,125 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import os +import pandas as pd + +from itertools import cycle + +# assumptions of this script about netMHCpan or netMHCIIpan: +# * version 4.1 +# * output generated via `-xls` option +# * generated with the `-BA` option to include binding affinity prediction + +# The mapping of index column names used here to original names in netMHCpan files +# is (please excuse the pd.NA tuples, they make header and index handling +# easier further down the line): +INDEX_NAMES = { + (pd.NA, "Pos"): "pos_in_id_seq", + (pd.NA, "Peptide"): "pep_seq", + (pd.NA, "ID"): "id", + (pd.NA, "Ave"): "ave_el_score", + (pd.NA, "NB"): "num_binders", +} + +if snakemake.wildcards.mhc == "net_mhc_pan": + # The mapping of column names used here to original names in netMHCpan files is: + COLUMN_NAMES = { 
+        "BA-score": "ba_score",
+        "BA_Rank": "ba_rank",
+        "EL-score": "el_score",
+        "EL_Rank": "el_rank",
+        "core": "bind_core",
+    }
+elif snakemake.wildcards.mhc == "net_mhc_two_pan":
+    # The mapping of column names used here to original names in netMHCIIpan files is:
+    COLUMN_NAMES = {
+        "Score_BA": "ba_score",
+        "Rank_BA": "ba_rank",
+        "Score": "el_score",
+        "Rank": "el_rank",
+        "Core": "bind_core",
+    }
+else:
+    sys.exit(f"Wildcard `mhc` has unknown value: {snakemake.wildcards.mhc}")
+
+COLUMNS_TO_DROP = {
+    # I have not found any docs or indication in the manuscript, what the column
+    # `Target` is. It might be the column `Exp_Bind` from Figure 1B here (it's
+    # the only column that only appears in netMHCIIpan output and not netMHCpan
+    # output):
+    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7319546/figure/F1/
+    # If so, it is only for benchmarking purposes according to the docs. And it
+    # is always NA, even in their manuscript. So we simply remove it here.
+    "Target",
+    # There doesn't seem to be an `icore` equivalent in netMHCIIpan output.
+    "icore",
+    # There doesn't seem to be an `nM` equivalent in netMHCpan output.
+    "nM",
+}
+
+
+def parse_file(mhc_in: str):
+    """
+    Parse a netMHCpan or netMHCIIpan output file from the `-xls -xlsfile <filename>`
+    directive into a tidy pandas data frame.
+    """
+    if os.path.getsize(mhc_in) == 0:
+        # Short-circuit empty files, but generate correct header.
+        return pd.DataFrame(
+            columns=list(COLUMN_NAMES.values())
+            + ["allele"]
+            + list(INDEX_NAMES.values())
+        )
+
+    # We need to fix the utterly broken headers first
+
+    # parse first header line into pandas.Series and name it
+    first_header_line = pd.read_csv(mhc_in, nrows=1, header=None, sep="\t").iloc[0, :]
+    first_header_line.name = "allele"
+
+    # parse second header line into pandas.Series and name it
+    second_header_line = pd.read_csv(
+        mhc_in, skiprows=1, nrows=1, header=None, sep="\t"
+    ).iloc[0, :]
+    second_header_line.name = "column_name"
+
+    header = pd.concat([first_header_line, second_header_line], axis="columns")
+    header = header.fillna(method="ffill")
+    header.loc[
+        header.column_name.isin([index_col for (_, index_col) in INDEX_NAMES.keys()]),
+        "allele",
+    ] = pd.NA
+
+    # It's a compound header over two rows and a compound row index in the initial
+    # three and final two columns of the table. For some reason, the final two
+    # columns are added to the index but not removed from the table, so we read
+    # the table without an index here and set the row index manually below.
+    data = pd.read_csv(mhc_in, sep="\t", skiprows=2, header=None)
+
+    data.columns = pd.MultiIndex.from_frame(header)
+
+    # remove columns only present in one of the two tools' output
+    columns_to_keep = [
+        col
+        for col in list(data.columns.get_level_values("column_name"))
+        if col not in COLUMNS_TO_DROP
+    ]
+    idx = pd.IndexSlice
+    data = data.loc[:, idx[:, columns_to_keep]]
+
+    # properly set row index columns (values to repeat in every row while doing the stack below)
+    data = data.set_index(list(INDEX_NAMES.keys()))
+    data.index.set_names(INDEX_NAMES, inplace=True)
+
+    # Turn into longer table with one HLA Allele per row instead of MultiIndex
+    # header, rename columns to something readable and turn index into columns.
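+    # Schematically (allele names purely illustrative), a single wide row
+    #   (pos_in_id_seq, pep_seq, id, ...) | (HLA-A*02:01, el_rank) | (HLA-B*07:02, el_rank) | ...
+    # becomes one long-format row per allele:
+    #   pos_in_id_seq, pep_seq, id, ..., allele="HLA-A*02:01", el_rank=...
+    #   pos_in_id_seq, pep_seq, id, ..., allele="HLA-B*07:02", el_rank=...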
+ data = data.stack(level="allele").rename(columns=COLUMN_NAMES).reset_index() + + return data + + +all_data = pd.concat((parse_file(f) for f in snakemake.input), axis="index") + +all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False) diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py index 28a24594..f99e77ec 100644 --- a/workflow/scripts/tsv_to_xlsx.py +++ b/workflow/scripts/tsv_to_xlsx.py @@ -1,7 +1,8 @@ import sys -import pandas as pd sys.stderr = open(snakemake.log[0], "w") +import pandas as pd + data = pd.read_csv(snakemake.input.tsv, sep="\t") -data.to_excel(snakemake.output.xlsx, index=False) \ No newline at end of file +data.to_excel(snakemake.output.xlsx, index=False)
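+# Note: importing pandas only after redirecting sys.stderr (as done above) means
+# that any import-time warnings are also captured in the Snakemake log file.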