From 6e6b0c34e98654a77dc8f6c4d15057eeef65ddbe Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 18:42:04 +0100 Subject: [PATCH 001/191] update gitignore file to current folder layout --- .gitignore | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index ee6ed52e..8fe284d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,23 +1,20 @@ * -!scripts -!scripts/* -!scripts/common -!scripts/common/* -scripts/.snakemake* -!Snakefile -!config.yaml -!samples.tsv -!resources -!resources/* -!envs -!envs/* -!environment.yaml +!.test +!.test/* +!.test/config/* +!.test/data/* +!config +!config/* +!config/HLA_Data/* +!workflow +!workflow/* +!workflow/report/* +!workflow/envs/* +!workflow/schemas/* +!workflow/scripts/* +!workflow/rules/* +!workflow/rules/annotation/* +!workflow/resources/* +!.gitignore !LICENSE !README.md -!rules -!rules/* -!.gitignore -!.editorconfig -!.gitattributes -!.test -!.test/data From e73b8d8f8c8532c909a362b87165ec37858d3b89 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 18:42:59 +0100 Subject: [PATCH 002/191] switch schemas and tsvs for samples and units to same column names as dna-seq-varlociraptor workflow --- .test/config/samples.tsv | 6 ++--- .test/config/units.tsv | 2 +- config/samples.tsv | 9 +++---- config/units.tsv | 3 ++- workflow/schemas/samples.schema.yaml | 36 +++++++++++++++++++++------- workflow/schemas/units.schema.yaml | 20 +++++++++------- 6 files changed, 51 insertions(+), 25 deletions(-) diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv index 044002d7..2aca82dc 100644 --- a/.test/config/samples.tsv +++ b/.test/config/samples.tsv @@ -1,3 +1,3 @@ -sample type matched_normal purity platform -A_normal normal ILLUMINA -A_tumor tumor A_normal 1 ILLUMINA +sample_name group alias purity platform +A_normal A normal 1 ILLUMINA +A_tumor A tumor .99 ILLUMINA diff --git a/.test/config/units.tsv 
b/.test/config/units.tsv index cdb8af8e..4984edd8 100644 --- a/.test/config/units.tsv +++ b/.test/config/units.tsv @@ -1,3 +1,3 @@ -sample sequencing_type unit fq1 fq2 sra adapters +sample_name sequencing_type unit_name fq1 fq2 sra adapters A_normal DNA lane1 data/reads/A_normal.1.fastq.gz data/reads/A_normal.2.fastq.gz A_tumor DNA lane1 data/reads/A_tumor.1.fastq.gz data/reads/A_tumor.2.fastq.gz diff --git a/config/samples.tsv b/config/samples.tsv index 6e291f15..249e0521 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1,4 +1,5 @@ -sample type matched_normal purity platform -A_normal normal ILLUMINA -A_tumor tumor A_normal 1 ILLUMINA -B_tumor tumor A_normal 1 ILLUMINA +sample_name group alias purity platform +A_normal A normal 1 ILLUMINA +A_tumor A tumor .99 ILLUMINA +B_normal B normal 1 ILLUMINA +B_tumor B tumor .98 ILLUMINA diff --git a/config/units.tsv b/config/units.tsv index f8758dfb..c417a45c 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1,5 +1,6 @@ -sample sequencing_type unit fq1 fq2 sra adapters +sample_name sequencing_type unit_name fq1 fq2 sra adapters A_normal DNA lane1 A_normal_1.fastq.gz A_normal_2.fastq.gz A_tumor DNA lane2 A_tumor_1.fastq.gz A_tumor_2.fastq.gz +B_normal DNA lane1 B_normal_1.fastq.gz B_normal_2.fastq.gz B_tumor DNA lane1 B_tumor_1.fastq.gz B_tumor_2.fastq.gz B_tumor RNA lane1 B_tumor_RNA_1.fastq.gz B_tumor_RNA_2.fastq.gz diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml index e626f60f..e795cdee 100644 --- a/workflow/schemas/samples.schema.yaml +++ b/workflow/schemas/samples.schema.yaml @@ -2,12 +2,18 @@ $schema: "http://json-schema.org/draft-04/schema#" description: an entry in the sample sheet properties: - sample: + sample_name: type: string - description: sample name/identifier - type: + description: sample name/identifier (alphanumeric string, that may additionally contain '_' and '-') + pattern: "^[a-zA-Z_0-9-]+$" + alias: type: string - description: healthy or 
tumor sample + description: sample name within the VCF/BCF files generated for a group (e.g. tumor, normal, etc.) (alphanumeric string, that may additionally contain '_' and '-') + pattern: "^[a-zA-Z_0-9-]+$" + group: + type: string + description: group of samples called jointly (alphanumeric string, that may additionally contain '_' and '-') + pattern: "^[a-zA-Z_0-9-]+$" matched_normal: type: string description: the corresponding healthy control to this tumor sample @@ -18,11 +24,25 @@ properties: description: Purity to use for tumor/normal groups. platform: type: string - enum: ["CAPILLARY", "LS454", "ILLUMINA", "SOLID", "HELICOS", "IONTORRENT", "ONT", "PACBIO"] - + enum: + - "CAPILLARY" + - "LS454" + - "ILLUMINA" + - "SOLID" + - "HELICOS" + - "IONTORRENT" + - "ONT" + - "PACBIO" + description: used sequencing platform + purity: + type: number + minimum: 0.0 + maximum: 1.0 + description: Purity to use for tumor/normal groups. required: - - sample - - type + - sample_name + - alias + - group - platform diff --git a/workflow/schemas/units.schema.yaml b/workflow/schemas/units.schema.yaml index 6fb9a6cf..4fec2074 100644 --- a/workflow/schemas/units.schema.yaml +++ b/workflow/schemas/units.schema.yaml @@ -2,18 +2,22 @@ $schema: "http://json-schema.org/draft-04/schema#" description: row of the units.tsv, representing a sequencing unit, i.e. 
single-end or paired-end data type: object properties: - sample: + sample_name: type: string - description: sample name/id the unit has been sequenced from + pattern: "^[a-zA-Z_0-9-]+$" + description: sample name/id the unit has been sequenced from (alphanumeric string, that may additionally contain '_' and '-') sequencing_type: type: string enum: ["DNA", "RNA"] - unit: + description: type of sequenced material ('DNA' or 'RNA') + unit_name: type: string - description: unit or lane name + pattern: "^[a-zA-Z_0-9-]+$" + description: unit id (alphanumeric string, that may additionally contain '_' and '-') fq1: type: string - description: path to FASTQ file + pattern: "^[^ \t]+$" + description: path to FASTQ file (may not contain whitespace) fq2: type: string description: path to second FASTQ file (leave empty in case of single-end) @@ -22,9 +26,9 @@ properties: description: SRA id for automatic download of unit adapters: type: string - description: adapter trimming settings to use (for cutadapt) + description: cutadapt adapter trimming settings to use (see https://cutadapt.readthedocs.io) required: - - sample - - unit + - sample_name + - unit_name - sequencing_type From 23f6738eeeb1dab7fdfca15bb10c601252909d77 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 18:46:05 +0100 Subject: [PATCH 003/191] adapt parsing and wildcards to new samples.tsv and units.tsv column names, make wildcard names more descriptive --- workflow/rules/MHC_binding.smk | 38 +++++----- workflow/rules/annotation.smk | 8 +- workflow/rules/calling.smk | 18 ++--- workflow/rules/candidate_calling.smk | 10 +-- workflow/rules/common.smk | 105 +++++++++++++++++---------- workflow/rules/filtering.smk | 36 ++++----- workflow/rules/microphaser.smk | 8 +- workflow/rules/phylogeny.smk | 2 +- workflow/rules/tmb.smk | 6 +- workflow/rules/varlociraptor.smk | 30 ++++---- 10 files changed, 145 insertions(+), 116 deletions(-) diff --git a/workflow/rules/MHC_binding.smk 
b/workflow/rules/MHC_binding.smk index 2d95e343..e43f248d 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -1,12 +1,12 @@ # rule mhcflurry: # input: -# peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{group}.fa", +# peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{peptide_type}.fa", # alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv", # wt_alleles=get_germline_optitype # output: -# "results/mhcflurry/{sample}/{chr}/output.{group}.csv" +# "results/mhcflurry/{sample}/{chr}/output.{peptide_type}.csv" # log: -# "logs/mhcflurry/{sample}-{chr}-{group}.log" +# "logs/mhcflurry/{sample}-{chr}-{peptide_type}.log" # run: # if "wt" in input.peptides: # alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0]) @@ -18,12 +18,12 @@ rule netMHCpan: input: - peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{group}.fa", + peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: - "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{group}.xls", + "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls", log: - "logs/netMHCpan/{sample}-{chr}-{group}.log", + "logs/netMHCpan/{sample}-{chr}-{peptide_type}.log", params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], @@ -35,12 +35,12 @@ rule netMHCpan: rule netMHCIIpan: input: - peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{group}.fa", + peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{group}.xls", + "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls", log: - "logs/netMHCIIpan/{sample}-{chr}-{group}.log", + "logs/netMHCIIpan/{sample}-{chr}-{peptide_type}.log", params: 
extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], @@ -53,13 +53,13 @@ rule netMHCIIpan: rule parse_mhc_out: input: expand( - "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{group}}.xls", + "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{peptide_type}}.xls", chr=contigs, ), output: - "results/{mhc}/{sample}/{sample}.mhc.{group}.tsv", + "results/{mhc}/{sample}/{sample}.mhc.{peptide_type}.tsv", log: - "logs/parse-mhc/{mhc}-{sample}-{group}.log", + "logs/parse-mhc/{mhc}-{sample}-{peptide_type}.log", wildcard_constraints: group="wt|mt", script: @@ -68,13 +68,13 @@ rule parse_mhc_out: # rule parse_mhcflurry: # input: -# expand("results/mhcflurry/{{sample}}/{chr}/output.{{group}}.csv", chr=contigs) +# expand("results/mhcflurry/{{sample}}/{chr}/output.{{peptide_type}}.csv", chr=contigs) # output: -# "results/mhcflurry/{sample}/{sample}.mhc.{group}.csv" +# "results/mhcflurry/{sample}/{sample}.mhc.{peptide_type}.csv" # wildcard_constraints: # group="wt|mt" # log: -# "logs/parse-mhc/mhcflurry-{sample}-{group}.log" +# "logs/parse-mhc/mhcflurry-{sample}-{peptide_type}.log" # conda: # "../envs/xsv.yaml" # shell: @@ -84,8 +84,8 @@ rule parse_mhc_out: rule mhc_csv_table: input: info="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv", - mt="results/{mhc}/{sample}/{sample}.mhc.mt.tsv", - wt="results/{mhc}/{sample}/{sample}.mhc.wt.tsv", + neo="results/{mhc}/{sample}/{sample}.mhc.neo.tsv", + normal="results/{mhc}/{sample}/{sample}.mhc.normal.tsv", output: report( "results/neoantigens/{mhc}/{sample}.DNA.tsv", @@ -101,8 +101,8 @@ rule mhc_csv_table: # rule mhcflurry_table: # input: # info="results/microphaser/info/{sample}/filtered/mhcflurry/{sample}.tsv", -# mt="results/mhcflurry/{sample}/{sample}.mhc.mt.tsv", -# wt="results/mhcflurry/{sample}/{sample}.mhc.wt.tsv" +# neo="results/mhcflurry/{sample}/{sample}.mhc.neo.tsv", +# normal="results/mhcflurry/{sample}/{sample}.mhc.normal.tsv" # output: # 
report("results/neoantigens/mhcflurry/{sample}.WES.tsv", caption="../report/WES_results.rst", category="Results WES (MHCFlurry)") # script: diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 64088375..8a504e98 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -1,12 +1,12 @@ rule annotate_variants: input: - calls="results/calls/{group}.{scatteritem}.bcf", + calls="results/calls/{cancer_sample}.{scatteritem}.bcf", cache="resources/vep/cache", plugins="resources/vep/plugins", output: - calls="results/calls/{group}.{scatteritem}.annotated.bcf", + calls="results/calls/{cancer_sample}.{scatteritem}.annotated.bcf", stats=report( - "results/calls/{group}.{scatteritem}.stats.html", + "results/calls/{cancer_sample}.{scatteritem}.stats.html", caption="../report/stats.rst", category="QC", ), @@ -16,7 +16,7 @@ rule annotate_variants: plugins=config["annotations"]["vep"]["plugins"], extra="{} --vcf_info_field ANN".format(config["annotations"]["vep"]["params"]), log: - "logs/vep/{group}.{scatteritem}.annotate.log", + "logs/vep/{cancer_sample}.{scatteritem}.annotate.log", wrapper: "0.59.2/bio/vep/annotate" diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index 98ef992f..70a228a4 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -2,16 +2,16 @@ rule strelka_somatic: input: normal=get_normal_bam, normal_index=get_normal_bai, - tumor="results/recal/{sample}.sorted.bam", - tumor_index="results/recal/{sample}.sorted.bam.bai", + tumor="results/recal/{cancer_sample}.sorted.bam", + tumor_index="results/recal/{cancer_sample}.sorted.bam.bai", fasta="resources/genome.fasta", fasta_index="resources/genome.fasta.fai", callregions="resources/genome.callregions.bed.gz", output: - "results/strelka/somatic/{sample}/results/variants/somatic.snvs.vcf.gz", - "results/strelka/somatic/{sample}/results/variants/somatic.indels.vcf.gz", + 
"results/strelka/somatic/{cancer_sample}/results/variants/somatic.snvs.vcf.gz", + "results/strelka/somatic/{cancer_sample}/results/variants/somatic.indels.vcf.gz", log: - "logs/calling/strelka_somatic/{sample}.log", + "logs/calling/strelka_somatic/{cancer_sample}.log", params: config_extra="--callRegions {} {}".format( "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"] @@ -24,15 +24,15 @@ rule strelka_somatic: rule strelka_germline: input: - bam="results/recal/{normal}.sorted.bam", - normal_index="results/recal/{normal}.sorted.bam.bai", + bam="results/recal/{normal_sample}.sorted.bam", + normal_index="results/recal/{normal_sample}.sorted.bam.bai", fasta="resources/genome.fasta", fasta_index="resources/genome.fasta.fai", callregions="resources/genome.callregions.bed.gz", output: - "results/strelka/germline/{normal}/results/variants/variants.vcf.gz", + "results/strelka/germline/{normal_sample}/results/variants/variants.vcf.gz", log: - "logs/calling/strelka_germline/{normal}.log", + "logs/calling/strelka_germline/{normal_sample}.log", params: config_extra="--callRegions {} {}".format( "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"] diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk index 74386763..2cdfbb2c 100644 --- a/workflow/rules/candidate_calling.smk +++ b/workflow/rules/candidate_calling.smk @@ -4,9 +4,9 @@ rule freebayes: # you can have a list of samples here samples=get_paired_bams, output: - "results/candidate-calls/{pair}.freebayes.bcf", + "results/candidate-calls/{cancer_sample}.freebayes.bcf", log: - "logs/{pair}.log", + "logs/{cancer_sample}.log", params: extra=config["params"].get("freebayes", ""), chunksize=100000, @@ -17,11 +17,11 @@ rule freebayes: rule scatter_candidates: input: - "results/candidate-calls/{pair}.{caller}.bcf", + "results/candidate-calls/{cancer_sample}.{caller}.bcf", output: - 
scatter.calling("results/candidate-calls/{{pair}}.{{caller}}.{scatteritem}.bcf"), + scatter.calling("results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf"), log: - "logs/scatter-candidates/{pair}.{caller}.log", + "logs/scatter-candidates/{cancer_sample}.{caller}.log", conda: "../envs/rbt.yaml" shell: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3bc6c60d..5b371175 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -10,13 +10,31 @@ ftp = FTP.RemoteProvider() validate(config, schema="../schemas/config.schema.yaml") -##### sample sheets ##### - -samples = pd.read_csv(config["samples"], sep="\t").set_index("sample", drop=False) +##### samples sheet ##### + +samples = ( + pd.read_csv( + config["samples"], + sep="\t", + dtype={"sample_name": str, "group": str}, + comment="#", + ) + .set_index("sample_name", drop=False) + .sort_index() +) validate(samples, schema="../schemas/samples.schema.yaml") -units = pd.read_csv(config["units"], dtype=str, sep="\t").set_index( - ["sample", "sequencing_type", "unit"], drop=False +##### units sheet ##### + +units = ( + pd.read_csv( + config["units"], + sep="\t", + dtype={"sample_name": str, "sequencing_type": str, "unit_name": str}, + comment="#", + ) + .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False) + .sort_index() ) validate(units, schema="../schemas/units.schema.yaml") @@ -27,10 +45,14 @@ contigs.extend(["X", "Y"]) wildcard_constraints: - pair="|".join(samples[samples.type == "tumor"]["sample"]), - sample="|".join(samples["sample"]), + cancer_sample="|".join(samples[samples.alias != "normal"]["sample_name"]), + sample="|".join(samples["sample_name"]), + unit="|".join(units["unit_name"]), + alias="|".join(pd.unique(samples["alias"])), + group="|".join(pd.unique(samples["group"])), caller="|".join(["freebayes", "delly"]), - event="somatic|germline|complete", + peptide_type="|".join(["normal", "neo"]), + event="|".join(["somatic", "germline", 
"complete"]), ### Output generation ### @@ -46,9 +68,9 @@ def is_activated(xpath): def get_final_output(): if config["epitope_prediction"]["activate"]: final_output = expand( - "results/neoantigens/{mhc}/{S.sample}.{S.sequencing_type}.xlsx", - S=units.loc[samples[samples.type == "tumor"]["sample"]] - .drop_duplicates(["sample", "sequencing_type"]) + "results/neoantigens/{mhc}/{S.sample_name}.{S.sequencing_type}.xlsx", + S=units.loc[samples[samples.alias == "tumor"]["sample_name"]] + .drop_duplicates(["sample_name", "sequencing_type"]) .itertuples(), mhc=list( filter( @@ -68,12 +90,12 @@ def get_final_output(): "results/HLA-LA/hlaI_{sample}.tsv", "results/HLA-LA/hlaII_{sample}.tsv", ], - sample=samples["sample"], + sample=samples["sample_name"], ) else: final_output = expand( "results/optitype/{sample}/hla_alleles_{sample}.tsv", - sample=samples["sample"], + sample=samples["sample_name"], ) return final_output @@ -82,7 +104,7 @@ def get_fusion_output(): if config["fusion"]["arriba"]["activate"]: fusion_output = expand( "results/fusion/arriba/{sample}.fusions.tsv", - sample=units[units["sequencing_type"] == "RNA"]["sample"], + sample=units[units["sequencing_type"] == "RNA"]["sample_name"], ) else: fusion_output = [] @@ -93,7 +115,7 @@ def get_tmb_targets(): if is_activated("tmb"): return expand( "results/plots/tmb/{group}.{mode}.svg", - group=samples[(samples.type == "tumor")]["sample"], + group=samples[(samples.alias == "tumor")]["sample_name"], mode=config["tmb"].get("mode", "curve"), ) else: @@ -125,13 +147,13 @@ def get_cutadapt_input(wildcards): if pd.isna(unit["fq2"]): # single end local sample return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format( - S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending + S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending ) else: # paired end local sample return expand( "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format( - S=unit.sample, U=unit.unit, T=unit.sequencing_type, E=ending + 
S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending ), read=["fq1", "fq2"], ) @@ -225,7 +247,7 @@ def get_optitype_reads_input(wildcards): def get_oncoprint_batch(wildcards): if wildcards.batch == "all": - groups = samples[samples["type"] == "tumor"]["sample"].unique() + groups = samples[samples["alias"] == "tumor"]["sample_name"].unique() else: groups = samples.loc[ samples[config["oncoprint"]["stratify"]["by-column"]] == wildcards.batch, @@ -241,15 +263,15 @@ def get_oncoprint_batch(wildcards): def get_annotated_bcf(wildcards): selection = ".annotated" - return "results/calls/{pair}.{scatteritem}{selection}.bcf".format( - pair=wildcards.pair, selection=selection, scatteritem=wildcards.scatteritem + return "results/calls/{cancer_sample}.{scatteritem}{selection}.bcf".format( + cancer_sample=wildcards.cancer_sample, selection=selection, scatteritem=wildcards.scatteritem ) def get_scattered_calls(ext=".bcf"): def inner(wildcards): return expand( - "results/calls/{{pair}}.{caller}.{{scatteritem}}.sorted{ext}", + "results/calls/{{cancer_sample}}.{caller}.{{scatteritem}}.sorted{ext}", caller=caller, ext=ext, ) @@ -278,7 +300,7 @@ def get_pair_variants(wildcards, index): ] variants.append( "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format( - get_normal(wildcards), ext + get_normal(wildcards.sample), ext ) ) return variants @@ -286,9 +308,9 @@ def get_pair_variants(wildcards, index): def get_pair_observations(wildcards): return expand( - "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf", + "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf", caller=wildcards.caller, - pair=wildcards.pair, + cancer_sample=wildcards.cancer_sample, scatteritem=wildcards.scatteritem, sample=get_paired_samples(wildcards), ) @@ -297,7 +319,7 @@ def get_pair_observations(wildcards): def get_merge_input(ext=".bcf"): def inner(wildcards): return expand( - 
"results/calls/{{pair}}.{vartype}.{{event}}.fdr-controlled{ext}", + "results/calls/{{cancer_sample}}.{vartype}.{{event}}.fdr-controlled{ext}", ext=ext, vartype=["SNV", "INS", "DEL", "MNV"], filter=config["calling"]["fdr-control"]["events"][wildcards.event], @@ -308,8 +330,11 @@ def get_merge_input(ext=".bcf"): def get_pair_aliases(wildcards): return [ - samples.loc[samples.loc[wildcards.pair, "matched_normal"], "type"], - samples.loc[wildcards.pair, "type"], + samples.loc[ + get_normal(wildcards.cancer_sample), + "alias" + ], + samples.loc[wildcards.cancer_sample, "alias"], ] @@ -350,8 +375,8 @@ def kallisto_params(wildcards, input): def get_paired_samples(wildcards): return [ - samples.loc[(wildcards.pair), "matched_normal"], - samples.loc[wildcards.pair, "sample"], + get_normal(wildcards.cancer_sample), + samples.loc[wildcards.cancer_sample, "sample_name"], ] @@ -367,8 +392,12 @@ def get_paired_bais(wildcards): ) -def get_normal(wildcards): - return samples.loc[(wildcards.sample), "matched_normal"] +def get_normal(sample_name): + normal_sample = samples.loc[ + (samples["group"] == samples.loc[sample_name, "group"]) & (samples["alias"] == "normal"), + "sample_name" + ].iat[0] + return normal_sample def get_reads(wildcards): @@ -382,30 +411,30 @@ def get_seperate(sample, group): def get_proteome(wildcards): return expand( "results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin", - normal=get_normal(wildcards), + normal=get_normal(wildcards.sample), mhc=wildcards.mhc, ) def get_alleles_MHCI(wildcards): - if wildcards.group == "wt": + if wildcards.peptide_type == "wt": return "results/optitype/{S}/hla_alleles_{S}.tsv".format( - S=get_normal(wildcards) + S=get_normal(wildcards.sample) ) else: return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.sample) def get_alleles_MHCII(wildcards): - if wildcards.group == "wt": - return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards)) + if wildcards.peptide_type == "wt": + return 
"results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.sample)) else: return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.sample) def get_normal_bam(wildcards): - return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards)) + return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample)) def get_normal_bai(wildcards): - return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards)) + return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards.cancer_sample)) diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index f32c203d..169265f0 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -17,13 +17,13 @@ rule filter_odds: input: get_annotated_bcf, output: - "results/calls/{pair}.{event}.{scatteritem}.filtered_odds.bcf", + "results/calls/{cancer_sample}.{event}.{scatteritem}.filtered_odds.bcf", params: events=lambda wc: config["calling"]["fdr-control"]["events"][wc.event][ "varlociraptor" ], log: - "logs/filter-calls/posterior_odds/{pair}.{scatteritem}.{event}.log", + "logs/filter-calls/posterior_odds/{cancer_sample}.{scatteritem}.{event}.log", conda: "../envs/varlociraptor.yaml" shell: @@ -33,15 +33,15 @@ rule filter_odds: rule gather_calls: input: calls=gather.calling( - "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf" + "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf" ), idx=gather.calling( - "results/calls/{{pair}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi" + "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi" ), output: - "results/calls/{pair}.{event}.filtered_odds.bcf", + "results/calls/{cancer_sample}.{event}.filtered_odds.bcf", log: - "logs/gather-calls/{pair}.{event}.log", + "logs/gather-calls/{cancer_sample}.{event}.log", params: "-a -Ob", wrapper: @@ -50,11 +50,11 @@ rule gather_calls: rule control_fdr: input: - 
"results/calls/{pair}.{event}.filtered_odds.bcf", + "results/calls/{cancer_sample}.{event}.filtered_odds.bcf", output: - "results/calls/{pair}.{vartype}.{event}.fdr-controlled.bcf", + "results/calls/{cancer_sample}.{vartype}.{event}.fdr-controlled.bcf", log: - "logs/control-fdr/{pair}.{vartype}.{event}.log", + "logs/control-fdr/{cancer_sample}.{vartype}.{event}.log", params: query=get_fdr_control_params, conda: @@ -69,9 +69,9 @@ rule merge_calls: calls=get_merge_input(".bcf"), idx=get_merge_input(".bcf.csi"), output: - "results/merged-calls/{pair}.{event}.fdr-controlled.bcf", + "results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf", log: - "logs/merge-calls/{pair}.{event}.log", + "logs/merge-calls/{cancer_sample}.{event}.log", params: "-a -Ob", wrapper: @@ -80,11 +80,11 @@ rule merge_calls: rule change_samplenames: input: - call="results/merged-calls/{pair}.{event}.fdr-controlled.bcf", + call="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf", output: - temp("results/merged-calls/{pair}.{event}.renaming.txt"), + temp("results/merged-calls/{cancer_sample}.{event}.renaming.txt"), log: - "logs/change-samplenames/{pair}.{event}.log", + "logs/change-samplenames/{cancer_sample}.{event}.log", params: prefix=lambda w, input: os.path.basename(input["call"]).split(".")[0], shell: @@ -93,12 +93,12 @@ rule change_samplenames: rule reheader_varlociraptor: input: - vcf="results/merged-calls/{pair}.{event}.fdr-controlled.bcf", - samples="results/merged-calls/{pair}.{event}.renaming.txt", + vcf="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf", + samples="results/merged-calls/{cancer_sample}.{event}.renaming.txt", output: - "results/merged-calls/{pair}.{event}.reheader.bcf", + "results/merged-calls/{cancer_sample}.{event}.reheader.bcf", log: - "logs/reheader-calls/{pair}.{event}.log", + "logs/reheader-calls/{cancer_sample}.{event}.log", params: extra="", view_extra="-O b", diff --git a/workflow/rules/microphaser.smk 
b/workflow/rules/microphaser.smk index bd9e6510..582ac7c1 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -6,8 +6,8 @@ rule microphaser_somatic: track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.mt.fa", - wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.wt.fa", + mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.neo.fa", + wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.normal.fa", tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv", log: "logs/microphaser/somatic/{sample}-{contig}.log", @@ -83,10 +83,10 @@ rule microphaser_filter: proteome=get_proteome, output: mt_fasta=( - "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.mt.fa" + "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.wt.fa" + "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.normal.fa" ), tsv="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.{contig}.tsv", removed="results/microphaser/info/{sample}/removed/{mhc}/{sample}.{contig}.removed.tsv", diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk index 872247ac..d7401bf8 100644 --- a/workflow/rules/phylogeny.smk +++ b/workflow/rules/phylogeny.smk @@ -1,7 +1,7 @@ def get_somatic_calls(wildcards): return expand( "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf", - sample=samples[samples.type == "tumor"]["sample"], + sample=samples[samples.alias == "tumor"]["sample_name"], ) diff --git a/workflow/rules/tmb.smk b/workflow/rules/tmb.smk index 2ef064a1..e2b2a43d 100644 --- a/workflow/rules/tmb.smk +++ b/workflow/rules/tmb.smk @@ -2,13 +2,13 @@ if config["tmb"]["activate"]: rule estimate_tmb: input: - "results/merged-calls/{pair}.somatic.fdr-controlled.bcf", + 
"results/merged-calls/{cancer_sample}.somatic.fdr-controlled.bcf", output: - "results/plots/tmb/{pair}.{plotmode}.vl.json", + "results/plots/tmb/{cancer_sample}.{plotmode}.vl.json", conda: "../envs/varlociraptor.yaml" log: - "logs/tmb/{pair}-{plotmode}.log", + "logs/tmb/{cancer_sample}-{plotmode}.log", params: **config["tmb"], shell: diff --git a/workflow/rules/varlociraptor.smk b/workflow/rules/varlociraptor.smk index 4cb6013a..f94052fd 100644 --- a/workflow/rules/varlociraptor.smk +++ b/workflow/rules/varlociraptor.smk @@ -3,14 +3,14 @@ rule render_scenario: config["calling"]["scenario"], output: report( - "results/scenarios/{pair}.yaml", + "results/scenarios/{cancer_sample}.yaml", caption="../report/scenario.rst", category="Variant calling scenarios", ), params: samples=samples, log: - "logs/scenarious/{pair}.log", + "logs/scenarious/{cancer_sample}.log", conda: "../envs/render_scenario.yaml" script: @@ -21,15 +21,15 @@ rule varlociraptor_preprocess: input: ref="resources/genome.fasta", ref_idx="resources/genome.fasta.fai", - candidates="results/candidate-calls/{pair}.{caller}.{scatteritem}.bcf", + candidates="results/candidate-calls/{cancer_sample}.{caller}.{scatteritem}.bcf", bam="results/recal/{sample}.sorted.bam", - bai="results/recal/{sample}.sorted.bai", + bai="results/recal/{sample}.sorted.bam.bai", output: - "results/observations/{pair}/{sample}.{caller}.{scatteritem}.bcf", + "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf", params: omit_isize="", log: - "logs/varlociraptor/preprocess/{pair}/{sample}.{caller}.{scatteritem}.log", + "logs/varlociraptor/preprocess/{cancer_sample}/{sample}.{caller}.{scatteritem}.log", conda: "../envs/varlociraptor.yaml" shell: @@ -40,11 +40,11 @@ rule varlociraptor_preprocess: rule varlociraptor_call: input: obs=get_pair_observations, - scenario="results/scenarios/{pair}.yaml", + scenario="results/scenarios/{cancer_sample}.yaml", output: - temp("results/calls/{pair}.{caller}.{scatteritem}.bcf"), + 
temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf"), log: - "logs/varlociraptor/call/{pair}.{caller}.{scatteritem}.log", + "logs/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.log", params: obs=lambda w, input: [ "{}={}".format(s, f) for s, f in zip(get_pair_aliases(w), input.obs) @@ -52,7 +52,7 @@ rule varlociraptor_call: conda: "../envs/varlociraptor.yaml" benchmark: - "benchmarks/varlociraptor/call/{pair}.{caller}.{scatteritem}.tsv" + "benchmarks/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.tsv" shell: "varlociraptor " "call variants generic --obs {params.obs} " @@ -61,11 +61,11 @@ rule varlociraptor_call: rule sort_calls: input: - "results/calls/{pair}.{caller}.{scatteritem}.bcf", + "results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf", output: - temp("results/calls/{pair}.{caller}.{scatteritem}.sorted.bcf"), + temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.sorted.bcf"), log: - "logs/bcf-sort/{pair}.{caller}.{scatteritem}.log", + "logs/bcf-sort/{cancer_sample}.{caller}.{scatteritem}.log", conda: "../envs/bcftools.yaml" resources: @@ -80,9 +80,9 @@ rule bcftools_concat: calls=get_scattered_calls(), indexes=get_scattered_calls(ext=".bcf.csi"), output: - "results/calls/{pair}.{scatteritem}.bcf", + "results/calls/{cancer_sample}.{scatteritem}.bcf", log: - "logs/concat-calls/{pair}.{scatteritem}.log", + "logs/concat-calls/{cancer_sample}.{scatteritem}.log", params: "-a -Ob", # TODO Check this wrapper: From caafd0131e4bd46ca263a8c4fefbe332866621c9 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 19:57:43 +0100 Subject: [PATCH 004/191] exclude .github/workflows/* from .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8fe284d1..02a9591f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ * +!.github/workflows/* !.test !.test/* !.test/config/* From b0651b1bd56e9b5ed08c18aa24e34a460936cd50 Mon Sep 17 00:00:00 2001 From: David Laehnemann 
Date: Thu, 13 Jan 2022 19:58:28 +0100 Subject: [PATCH 005/191] update github actions workflow --- .github/workflows/main.yaml | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 503d4a5e..364e02a4 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -8,33 +8,42 @@ on: jobs: + Formatting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Formatting + uses: github/super-linter@v4 + env: + VALIDATE_ALL_CODEBASE: false + DEFAULT_BRANCH: master + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VALIDATE_SNAKEMAKE_SNAKEFMT: true + + Linting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 - name: Lint workflow - uses: snakemake/snakemake-github-action@v1.17.0 + uses: snakemake/snakemake-github-action@v1.22.0 with: directory: . snakefile: workflow/Snakefile args: "--lint" - stagein: | - export TMPDIR=/tmp Testing: runs-on: ubuntu-latest - needs: Linting + needs: + - Linting + - Formatting steps: - - uses: actions/checkout@v1 - - name: Checkout submodules - uses: textbook/git-checkout-submodule-action@2.0.0 + - uses: actions/checkout@v2 - name: Test workflow (local FASTQs) - uses: snakemake/snakemake-github-action@v1.17.0 + uses: snakemake/snakemake-github-action@v1.22.0 with: directory: .test snakefile: workflow/Snakefile args: "--use-conda --show-failed-logs -j 10 --conda-cleanup-pkgs cache --conda-frontend mamba" - stagein: | - export TMPDIR=/tmp From e65f344afd0d50e57644ba5476297b159be78301 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 20:05:09 +0100 Subject: [PATCH 006/191] snakefmt --- workflow/rules/calling.smk | 6 ++++-- workflow/rules/candidate_calling.smk | 4 +++- workflow/rules/common.smk | 23 ++++++++++++++--------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index 
70a228a4..a3889467 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -14,7 +14,8 @@ rule strelka_somatic: "logs/calling/strelka_somatic/{cancer_sample}.log", params: config_extra="--callRegions {} {}".format( - "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"] + "resources/genome.callregions.bed.gz", + config["params"]["strelka"]["config"], ), run_extra=config["params"]["strelka"]["run"], threads: 22 @@ -35,7 +36,8 @@ rule strelka_germline: "logs/calling/strelka_germline/{normal_sample}.log", params: config_extra="--callRegions {} {}".format( - "resources/genome.callregions.bed.gz", config["params"]["strelka"]["config"] + "resources/genome.callregions.bed.gz", + config["params"]["strelka"]["config"], ), run_extra="", threads: 22 diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk index 2cdfbb2c..d6b6942d 100644 --- a/workflow/rules/candidate_calling.smk +++ b/workflow/rules/candidate_calling.smk @@ -19,7 +19,9 @@ rule scatter_candidates: input: "results/candidate-calls/{cancer_sample}.{caller}.bcf", output: - scatter.calling("results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf"), + scatter.calling( + "results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf" + ), log: "logs/scatter-candidates/{cancer_sample}.{caller}.log", conda: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 5b371175..54befabf 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -264,7 +264,9 @@ def get_oncoprint_batch(wildcards): def get_annotated_bcf(wildcards): selection = ".annotated" return "results/calls/{cancer_sample}.{scatteritem}{selection}.bcf".format( - cancer_sample=wildcards.cancer_sample, selection=selection, scatteritem=wildcards.scatteritem + cancer_sample=wildcards.cancer_sample, + selection=selection, + scatteritem=wildcards.scatteritem, ) @@ -330,10 +332,7 @@ def get_merge_input(ext=".bcf"): def 
get_pair_aliases(wildcards): return [ - samples.loc[ - get_normal(wildcards.cancer_sample), - "alias" - ], + samples.loc[get_normal(wildcards.cancer_sample), "alias"], samples.loc[wildcards.cancer_sample, "alias"], ] @@ -394,8 +393,9 @@ def get_paired_bais(wildcards): def get_normal(sample_name): normal_sample = samples.loc[ - (samples["group"] == samples.loc[sample_name, "group"]) & (samples["alias"] == "normal"), - "sample_name" + (samples["group"] == samples.loc[sample_name, "group"]) + & (samples["alias"] == "normal"), + "sample_name", ].iat[0] return normal_sample @@ -433,8 +433,13 @@ def get_alleles_MHCII(wildcards): def get_normal_bam(wildcards): - return expand("results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample)) + return expand( + "results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample) + ) def get_normal_bai(wildcards): - return expand("results/recal/{normal}.sorted.bam.bai", normal=get_normal(wildcards.cancer_sample)) + return expand( + "results/recal/{normal}.sorted.bam.bai", + normal=get_normal(wildcards.cancer_sample), + ) From c7b6897ef75cd72e53d1b27cc53f46cd5f2106d8 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 20:47:46 +0100 Subject: [PATCH 007/191] remove unused rule gzip_fastq that confuses adapter trimming mode --- workflow/rules/utils.smk | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index e7586ef0..65161772 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -34,17 +34,6 @@ rule tabix_known_variants: "0.59.2/bio/tabix" -rule gzip_fastq: - input: - "{prefix}.fastq", - output: - "{prefix}.fastq.gz", - log: - "logs/gz-fastq/{prefix}.log", - shell: - "gzip < {input} > {output}" - - rule tsv_to_excel: input: tsv="results/{x}.tsv", From 688c027c2f69882b6cb941a59ecd04dd0480d037 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 20:48:07 +0100 Subject: [PATCH 008/191] 
fix trimming rules syntax --- workflow/rules/common.smk | 30 +++++++++++++++++++++++------- workflow/rules/trim.smk | 3 --- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 54befabf..273704e3 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -53,6 +53,8 @@ wildcard_constraints: caller="|".join(["freebayes", "delly"]), peptide_type="|".join(["normal", "neo"]), event="|".join(["somatic", "germline", "complete"]), + read="|".join(["single", "R1", "R2"]), + seqtype="|".join(["DNA", "RNA"]), ### Output generation ### @@ -132,28 +134,38 @@ caller = list( def get_cutadapt_input(wildcards): - unit = units.loc[wildcards.sample].loc[wildcards.unit].loc[wildcards.seqtype] + unit = units.loc[ + (units["sample_name"] == wildcards.sample) + & (units["unit_name"] == wildcards.unit) + & (units["sequencing_type"] == wildcards.seqtype) + ] - if pd.isna(unit["fq1"]): + if pd.isna(unit["fq1"].iat[0]): # SRA sample (always paired-end for now) accession = unit["sra"] return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]) - if unit["fq1"].endswith("gz"): + if unit["fq1"].iat[0].endswith("gz"): ending = ".gz" else: ending = "" - if pd.isna(unit["fq2"]): + if pd.isna(unit["fq2"].iat[0]): # single end local sample return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format( - S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending + S=unit["sample_name"].iat[0], + U=unit["unit_name"].iat[0], + T=unit["sequencing_type"].iat[0], + E=ending, ) else: # paired end local sample return expand( "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format( - S=unit.sample_name, U=unit.unit_name, T=unit.sequencing_type, E=ending + S=unit["sample_name"].iat[0], + U=unit["unit_name"].iat[0], + T=unit["sequencing_type"].iat[0], + E=ending, ), read=["fq1", "fq2"], ) @@ -189,7 +201,11 @@ def get_fastqs(wc): if config["trimming"]["activate"]: return expand( 
"results/trimmed/{sample}/{seqtype}/{unit}_{read}.fastq.gz", - unit=units.loc[wc.seqtype].loc[wc.sample, "unit_name"], + unit=units.loc[ + (units["sequencing_type"] == wc.seqtype) + & (units["sample_name"] == wc.sample), + "unit_name", + ], sample=wc.sample, read=wc.read, seqtype=wc.seqtype, diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk index f26ed6f3..cf09cba1 100644 --- a/workflow/rules/trim.smk +++ b/workflow/rules/trim.smk @@ -64,8 +64,5 @@ rule merge_fastqs: "results/merged/{seqtype}/{sample}_{read}.fastq.gz", log: "logs/merge-fastqs/{seqtype}_{sample}_{read}.log", - wildcard_constraints: - read="single|R1|R2", - seqtype="DNA|RNA", shell: "cat {input} > {output} 2> {log}" From f228219c91a97b980634aa8ca6731efa9b94f4e6 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 20:49:12 +0100 Subject: [PATCH 009/191] turn EVERYTHING on in the GitHub Actions test workflow and run... --- .test/config/config.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index d10fc559..85e69827 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -3,14 +3,14 @@ units: "config/units.tsv" # boolean if read trimming should be skipped trimming: - activate: false + activate: true remove_duplicates: activate: true calling: freebayes: - activate: false + activate: true # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling scenario: config/scenario.yaml filter: @@ -40,14 +40,14 @@ calling: fusion: arriba: - activate: false + activate: true blacklist: "arriba_blacklist" params: "-T -P" tmb: - activate: false + activate: true coding_genome_size: 3e7 # Name of the tumor sample in the scenario.yaml. 
tumor_sample: tumor @@ -56,16 +56,16 @@ tmb: epitope_prediction: - activate: false + activate: true affinity: netMHCpan: - activate: false + activate: true params: "-BA -l 9 -s -xls" location: "../netMHCpan-4.0" netMHCIIpan: - activate: false + activate: true params: "-length 15 -s -xls" location: "../netMHCIIpan-4.0" @@ -73,11 +73,11 @@ affinity: HLAtyping: # activate to use razers3 to pre-filter reads before using optitype optitype_prefiltering: - activate: false + activate: true optitype_data: "config/HLA_Data/hla_reference_dna.fasta" # activate to predict MHC-I and MHC-II alleles with HLA-LA HLA_LA: - activate: false + activate: true ref: From ed496c4a352991d499c7c52d99182c099238a0ef Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 13 Jan 2022 22:13:09 +0100 Subject: [PATCH 010/191] add missing conda env for rule get_callregions --- workflow/envs/htslib.yaml | 5 +++++ workflow/rules/ref.smk | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 workflow/envs/htslib.yaml diff --git a/workflow/envs/htslib.yaml b/workflow/envs/htslib.yaml new file mode 100644 index 00000000..b5d7959d --- /dev/null +++ b/workflow/envs/htslib.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - htslib =1.14 diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 86b209c3..7e3a3d18 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -120,7 +120,7 @@ rule get_callregions: params: n_contigs=config["ref"]["n_chromosomes"], conda: - "../envs/index.yaml" + "../envs/htslib.yaml" shell: "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})" " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}" From 530ef666c3e86c129b32f82939ed55801259e73e Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 14 Jan 2022 18:03:43 +0100 Subject: [PATCH 011/191] update cutadapt wrappers and try fixing adapter handling --- workflow/rules/common.smk | 12 +++++++++++- 
workflow/rules/trim.smk | 14 ++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 273704e3..bc922529 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -30,7 +30,7 @@ units = ( pd.read_csv( config["units"], sep="\t", - dtype={"sample_name": str, "sequencing_type": str, "unit_name": str}, + dtype={"sample_name": str, "sequencing_type": str, "unit_name": str, "adapters": str}, comment="#", ) .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False) @@ -181,6 +181,16 @@ def get_cutadapt_pipe_input(wildcards): assert len(files) > 0, "no files found at {}".format(pattern) return files +def get_cutadapt_adapters(wildcards): + unit = units.loc[wildcards.sample].loc[wildcards.unit] + try: + adapters = unit["adapters"] + if isinstance(adapters, str): + return adapters + return "" + except KeyError: + return "" + def is_paired_end(sample, seqtype): sample_units = units.loc[sample].loc[seqtype] diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk index cf09cba1..b0145449 100644 --- a/workflow/rules/trim.smk +++ b/workflow/rules/trim.smk @@ -32,13 +32,11 @@ rule cutadapt_pe: log: "logs/cutadapt/{sample}-{seqtype}-{unit}.log", params: - others=config["params"]["cutadapt"], - adapters=lambda w: str( - units.loc[w.sample].loc[w.seqtype].loc[w.unit, "adapters"] - ), + extra=config["params"]["cutadapt"], + adapters=get_cutadapt_adapters, threads: 8 wrapper: - "0.59.2/bio/cutadapt/pe" + "0.85.1/bio/cutadapt/pe" rule cutadapt_se: @@ -50,11 +48,11 @@ rule cutadapt_se: log: "logs/cutadapt/{sample}-{seqtype}-{unit}.log", params: - others=config["params"]["cutadapt"], - adapters_r1=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), + extra=config["params"]["cutadapt"], + adapters_r1=get_cutadapt_adapters, threads: 8 wrapper: - "0.59.2/bio/cutadapt/se" + "0.85.1/bio/cutadapt/se" rule merge_fastqs: From 8c5d952d9fe63585a923210c3ae164902550a7b1 Mon Sep 
17 00:00:00 2001 From: David Laehnemann Date: Fri, 14 Jan 2022 18:08:27 +0100 Subject: [PATCH 012/191] snakefmt --- workflow/rules/common.smk | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index bc922529..14c7e6be 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -30,7 +30,12 @@ units = ( pd.read_csv( config["units"], sep="\t", - dtype={"sample_name": str, "sequencing_type": str, "unit_name": str, "adapters": str}, + dtype={ + "sample_name": str, + "sequencing_type": str, + "unit_name": str, + "adapters": str, + }, comment="#", ) .set_index(["sample_name", "sequencing_type", "unit_name"], drop=False) @@ -181,6 +186,7 @@ def get_cutadapt_pipe_input(wildcards): assert len(files) > 0, "no files found at {}".format(pattern) return files + def get_cutadapt_adapters(wildcards): unit = units.loc[wildcards.sample].loc[wildcards.unit] try: From 48300a4448a5220e80668e05594e75e663a4666c Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 14 Jan 2022 18:31:02 +0100 Subject: [PATCH 013/191] fix cutadapt wrappers' version number --- workflow/rules/trim.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk index b0145449..b5573c61 100644 --- a/workflow/rules/trim.smk +++ b/workflow/rules/trim.smk @@ -36,7 +36,7 @@ rule cutadapt_pe: adapters=get_cutadapt_adapters, threads: 8 wrapper: - "0.85.1/bio/cutadapt/pe" + "v0.85.1/bio/cutadapt/pe" rule cutadapt_se: @@ -52,7 +52,7 @@ rule cutadapt_se: adapters_r1=get_cutadapt_adapters, threads: 8 wrapper: - "0.85.1/bio/cutadapt/se" + "v0.85.1/bio/cutadapt/se" rule merge_fastqs: From 7800664cc3a4d5e030d59ed0355ced86113eb5d0 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 14 Jan 2022 20:21:25 +0100 Subject: [PATCH 014/191] deactivate HLA_LA in test workflow, as it includes huge graph download that times out the workflow --- .test/config/config.yaml | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 85e69827..839a126c 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -77,7 +77,7 @@ HLAtyping: optitype_data: "config/HLA_Data/hla_reference_dna.fasta" # activate to predict MHC-I and MHC-II alleles with HLA-LA HLA_LA: - activate: true + activate: false ref: From 5d35a0979d81f885bf36c0190ccf04e3afa1cabc Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 14 Jan 2022 21:36:31 +0100 Subject: [PATCH 015/191] also turn off netMHCIIpan to avoid HLA-LA download --- .test/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 839a126c..4add6179 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -65,7 +65,7 @@ affinity: params: "-BA -l 9 -s -xls" location: "../netMHCpan-4.0" netMHCIIpan: - activate: true + activate: false params: "-length 15 -s -xls" location: "../netMHCIIpan-4.0" From 680d976969d62a164169eca32dd1594eef34529e Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Tue, 18 Jan 2022 19:44:53 +0100 Subject: [PATCH 016/191] try dna-seq-varlociraptor workflow formulations of cutadapt_input function definitions --- workflow/rules/common.smk | 43 ++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 14c7e6be..838f6641 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -139,37 +139,33 @@ caller = list( def get_cutadapt_input(wildcards): - unit = units.loc[ - (units["sample_name"] == wildcards.sample) - & (units["unit_name"] == wildcards.unit) - & (units["sequencing_type"] == wildcards.seqtype) - ] + unit = units.loc[wildcards.sample].loc[wildcards.seqtype].loc[wildcards.unit] - if pd.isna(unit["fq1"].iat[0]): + if pd.isna(unit["fq1"]): # SRA sample (always paired-end for now) accession 
= unit["sra"] return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]) - if unit["fq1"].iat[0].endswith("gz"): + if unit["fq1"].endswith("gz"): ending = ".gz" else: ending = "" - if pd.isna(unit["fq2"].iat[0]): + if pd.isna(unit["fq2"]): # single end local sample return "pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format( - S=unit["sample_name"].iat[0], - U=unit["unit_name"].iat[0], - T=unit["sequencing_type"].iat[0], + S=unit.sample_name, + U=unit.unit_name, + T=unit.sequencing_type, E=ending, ) else: # paired end local sample return expand( "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format( - S=unit["sample_name"].iat[0], - U=unit["unit_name"].iat[0], - T=unit["sequencing_type"].iat[0], + S=unit.sample_name, + U=unit.unit_name, + T=unit.sequencing_type, E=ending, ), read=["fq1", "fq2"], @@ -182,8 +178,23 @@ def get_cutadapt_pipe_input(wildcards): .loc[wildcards.seqtype] .loc[wildcards.unit, wildcards.fq] ) - files = list(sorted(glob.glob(pattern))) - assert len(files) > 0, "no files found at {}".format(pattern) + if "*" in pattern: + files = sorted( + glob.glob( + units.loc[wildcards.sample] + .loc[wildcards.seqtype] + .loc[wildcards.unit, wildcards.fq] + ) + ) + if not files: + raise ValueError( + "No raw fastq files found for unit pattern {} (sample {}, sequencing type {}). 
" + "Please check the your sample sheet.".format( + wildcards.unit, wildcards.sample, wildcards.seqtype + ) + ) + else: + files = [pattern] return files From cde64d8cd42bf5e51b3257437a96e1f9fc4eaac5 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Tue, 18 Jan 2022 19:45:16 +0100 Subject: [PATCH 017/191] update cutadapt rules to very latest wrapper release --- workflow/rules/trim.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk index b5573c61..0ada209d 100644 --- a/workflow/rules/trim.smk +++ b/workflow/rules/trim.smk @@ -36,7 +36,7 @@ rule cutadapt_pe: adapters=get_cutadapt_adapters, threads: 8 wrapper: - "v0.85.1/bio/cutadapt/pe" + "v0.86.0/bio/cutadapt/pe" rule cutadapt_se: @@ -49,10 +49,10 @@ rule cutadapt_se: "logs/cutadapt/{sample}-{seqtype}-{unit}.log", params: extra=config["params"]["cutadapt"], - adapters_r1=get_cutadapt_adapters, + adapters=get_cutadapt_adapters, threads: 8 wrapper: - "v0.85.1/bio/cutadapt/se" + "v0.86.0/bio/cutadapt/se" rule merge_fastqs: From c390783e0ae2a7abe58ba99017423d47a28a6144 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 19 Jan 2022 17:51:01 +0100 Subject: [PATCH 018/191] fix render_scenario after changes to wildcards, samples.tsv and units.tsv --- workflow/scripts/render-scenario.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/render-scenario.py b/workflow/scripts/render-scenario.py index 692b0ee5..8a1d06eb 100644 --- a/workflow/scripts/render-scenario.py +++ b/workflow/scripts/render-scenario.py @@ -3,6 +3,7 @@ with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out: samples = snakemake.params.samples + group = samples.loc[samples["sample_name"] == snakemake.wildcards.cancer_sample, "group"] out.write(Template(template.read()).render( - samples=samples[(samples["sample"] == snakemake.wildcards.pair) | (samples["sample"] == samples.loc[snakemake.wildcards.pair, 
"matched_normal"])] + samples=samples[samples["group"] == group] )) From ffd5547a0a336cfe9221948f68f7891893a156e4 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 19 Jan 2022 17:52:35 +0100 Subject: [PATCH 019/191] for now, deactivate stuff not needed for dna-seq-varlociraptor compatibility --- .test/config/config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 4add6179..e432f441 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -3,14 +3,14 @@ units: "config/units.tsv" # boolean if read trimming should be skipped trimming: - activate: true + activate: false remove_duplicates: - activate: true + activate: false calling: freebayes: - activate: true + activate: false # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling scenario: config/scenario.yaml filter: From beba26e39ab413464094ecce5634ac6d8cfa3e28 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 19 Jan 2022 18:33:59 +0100 Subject: [PATCH 020/191] activate freebayes in .test to try to get tests to run further --- .test/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index e432f441..1d5f7b1b 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -10,7 +10,7 @@ remove_duplicates: calling: freebayes: - activate: false + activate: true # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling scenario: config/scenario.yaml filter: From 74e7c359f5d355dea66c67be6ea2555afd9ee006 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 14:44:22 +0100 Subject: [PATCH 021/191] make razers3 rule wildcards compatible with dna-seq-varlociraptor workflow to get entry point to work seamlessly --- workflow/rules/HLAtyping.smk | 8 ++++---- workflow/rules/common.smk | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 682808ef..c519a48f 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -39,9 +39,9 @@ rule parse_HLA_LA: rule razers3: input: - reads="results/merged/DNA/{sample}_{fq}.fastq.gz", + reads="results/merged/{sample}_{read}.fastq.gz" output: - bam="results/razers3/bam/{sample}_{fq}.bam", + bam="results/razers3/bam/{sample}_{read}.bam", threads: 8 log: "logs/razers3/{sample}_{fq}.log", @@ -54,9 +54,9 @@ rule razers3: rule bam2fq: input: - "results/razers3/bam/{sample}_{fq}.bam", + "results/razers3/bam/{sample}_{read}.bam", output: - "results/razers3/fastq/{sample}_{fq}.fished.fastq", + "results/razers3/fastq/{sample}_{read}.fished.fastq", params: "", log: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 838f6641..eb59c416 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -279,7 +279,7 @@ def get_optitype_reads_input(wildcards): if is_activated("HLAtyping/optitype_prefiltering"): if is_paired_end(wildcards.sample, "DNA"): return expand( - "results/razers3/fastq/{sample}_{fq}.fished.fastq", + "results/razers3/fastq/{sample}_{read}.fished.fastq", sample=wildcards.sample, fq=["R1", "R2"], ) From d04e2422a1752037bdcc003d9ec32f46e0c8b4a4 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 14:49:22 +0100 Subject: [PATCH 022/191] fix formatting and linting error in rule razers3 --- workflow/rules/HLAtyping.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index c519a48f..46b16f1e 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -39,12 +39,12 @@ rule parse_HLA_LA: rule razers3: input: - reads="results/merged/{sample}_{read}.fastq.gz" + reads="results/merged/{sample}_{read}.fastq.gz", output: bam="results/razers3/bam/{sample}_{read}.bam", threads: 8 log: - "logs/razers3/{sample}_{fq}.log", + 
"logs/razers3/{sample}_{read}.log", params: genome=config["HLAtyping"]["optitype_data"], extra=config["params"]["razers3"], From 2cd85a1473fc814755c1af166a23f59e1ea15eeb Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 15:53:01 +0100 Subject: [PATCH 023/191] also fix rule bam2fq wildcards --- workflow/rules/HLAtyping.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 46b16f1e..26f84454 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -60,7 +60,7 @@ rule bam2fq: params: "", log: - "logs/razers3-bam2fq/{sample}-{fq}.log", + "logs/razers3-bam2fq/{sample}-{read}.log", threads: 1 wrapper: "0.61.0/bio/samtools/bam2fq/interleaved" From 020e44ba9905949a3f65940b62381cefcd108687 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 16:13:16 +0100 Subject: [PATCH 024/191] fix last occurence of fq wildcard --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index eb59c416..aedcb7f8 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -281,7 +281,7 @@ def get_optitype_reads_input(wildcards): return expand( "results/razers3/fastq/{sample}_{read}.fished.fastq", sample=wildcards.sample, - fq=["R1", "R2"], + read=["R1", "R2"], ) return "results/razers3/fastq/{sample}_single.fastq" else: From 2fe0c7ee61177985aa57213fa9e35cfad28ab00a Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 16:34:20 +0100 Subject: [PATCH 025/191] fix rule razers3 input path --- workflow/rules/HLAtyping.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 26f84454..e68cd89b 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -39,7 +39,7 @@ rule parse_HLA_LA: rule razers3: input: - 
reads="results/merged/{sample}_{read}.fastq.gz", + reads="results/merged/DNA/{sample}_{read}.fastq.gz", output: bam="results/razers3/bam/{sample}_{read}.bam", threads: 8 From 0771ccaee8c781f21f85238ac42b6479afdc2bfb Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 16:35:28 +0100 Subject: [PATCH 026/191] clearly distinguish cancer_samples and normal_samples via globally restricted wildcards --- workflow/rules/MHC_binding.smk | 36 ++++++++++----------- workflow/rules/common.smk | 13 ++++---- workflow/rules/microphaser.smk | 58 +++++++++++++++++----------------- 3 files changed, 54 insertions(+), 53 deletions(-) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index e43f248d..50185ca4 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -18,12 +18,12 @@ rule netMHCpan: input: - peptides="results/microphaser/fasta/{sample}/filtered/netMHCpan/{sample}.{chr}.{peptide_type}.fa", + peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCpan/{cancer_sample}.{chr}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: - "results/netMHCpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls", + "results/netMHCpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls", log: - "logs/netMHCpan/{sample}-{chr}-{peptide_type}.log", + "logs/netMHCpan/{cancer_sample}-{chr}-{peptide_type}.log", params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], @@ -35,12 +35,12 @@ rule netMHCpan: rule netMHCIIpan: input: - peptides="results/microphaser/fasta/{sample}/filtered/netMHCIIpan/{sample}.{chr}.{peptide_type}.fa", + peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCIIpan/{cancer_sample}.{chr}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{sample}/{chr}/{sample}.{chr}.{peptide_type}.xls", + "results/netMHCIIpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls", log: - 
"logs/netMHCIIpan/{sample}-{chr}-{peptide_type}.log", + "logs/netMHCIIpan/{cancer_sample}-{chr}-{peptide_type}.log", params: extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], @@ -53,13 +53,13 @@ rule netMHCIIpan: rule parse_mhc_out: input: expand( - "results/{{mhc}}/{{sample}}/{chr}/{{sample}}.{chr}.{{peptide_type}}.xls", + "results/{{mhc}}/{{cancer_sample}}/{chr}/{{cancer_sample}}.{chr}.{{peptide_type}}.xls", chr=contigs, ), output: - "results/{mhc}/{sample}/{sample}.mhc.{peptide_type}.tsv", + "results/{mhc}/{cancer_sample}/{sample}.mhc.{peptide_type}.tsv", log: - "logs/parse-mhc/{mhc}-{sample}-{peptide_type}.log", + "logs/parse-mhc/{mhc}-{cancer_sample}-{peptide_type}.log", wildcard_constraints: group="wt|mt", script: @@ -83,17 +83,17 @@ rule parse_mhc_out: rule mhc_csv_table: input: - info="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv", - neo="results/{mhc}/{sample}/{sample}.mhc.neo.tsv", - normal="results/{mhc}/{sample}/{sample}.mhc.normal.tsv", + info="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv", + neo="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.neo.tsv", + normal="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.normal.tsv", output: report( - "results/neoantigens/{mhc}/{sample}.DNA.tsv", + "results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv", caption="../report/WES_results.rst", category="Results WES (netMHC)", ), log: - "logs/create-mhc-table/{mhc}-{sample}.log", + "logs/create-mhc-table/{mhc}-{cancer_sample}.log", script: "../scripts/merge_data.py" @@ -111,17 +111,17 @@ rule mhc_csv_table: rule add_RNA_info: input: - counts="results/kallisto/{sample}", - table="results/neoantigens/{mhc}/{sample}.DNA.tsv", + counts="results/kallisto/{cancer_sample}", + table="results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv", output: report( - "results/neoantigens/{mhc}/{sample}.RNA.tsv", + "results/neoantigens/{mhc}/{cancer_sample}.RNA.tsv", 
caption="../report/RNA_results.rst", category="Results RNA", ), params: abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), log: - "logs/add-RNA/{mhc}-{sample}.log", + "logs/add-RNA/{mhc}-{cancer_sample}.log", script: "../scripts/add_rna_info.py" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index aedcb7f8..d1b2c109 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -51,6 +51,7 @@ contigs.extend(["X", "Y"]) wildcard_constraints: cancer_sample="|".join(samples[samples.alias != "normal"]["sample_name"]), + normal_sample="|".join(samples[samples.alias == "normal"]["sample_name"]), sample="|".join(samples["sample_name"]), unit="|".join(units["unit_name"]), alias="|".join(pd.unique(samples["alias"])), @@ -453,8 +454,8 @@ def get_seperate(sample, group): def get_proteome(wildcards): return expand( - "results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin", - normal=get_normal(wildcards.sample), + "results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin", + normal_sample=get_normal(wildcards.cancer_sample), mhc=wildcards.mhc, ) @@ -462,17 +463,17 @@ def get_proteome(wildcards): def get_alleles_MHCI(wildcards): if wildcards.peptide_type == "wt": return "results/optitype/{S}/hla_alleles_{S}.tsv".format( - S=get_normal(wildcards.sample) + S=get_normal(wildcards.cancer_sample) ) else: - return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.sample) + return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.cancer_sample) def get_alleles_MHCII(wildcards): if wildcards.peptide_type == "wt": - return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.sample)) + return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.cancer_sample)) else: - return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.sample) + return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.cancer_sample) def get_normal_bam(wildcards): diff --git 
a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 582ac7c1..caef83cf 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -1,16 +1,16 @@ rule microphaser_somatic: input: - vcf="results/strelka/merged/{sample}/all_variants.norm.annotated.bcf", - bam="results/recal/{sample}.sorted.bam", - bai="results/recal/{sample}.sorted.bam.bai", + vcf="results/strelka/merged/{cancer_sample}/all_variants.norm.annotated.bcf", + bam="results/recal/{cancer_sample}.sorted.bam", + bai="results/recal/{cancer_sample}.sorted.bam.bai", track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - mt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.neo.fa", - wt_fasta="results/microphaser/fasta/{sample}/{sample}.{contig}.normal.fa", - tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv", + mt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.neo.fa", + wt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.normal.fa", + tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv", log: - "logs/microphaser/somatic/{sample}-{contig}.log", + "logs/microphaser/somatic/{cancer_sample}-{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -22,20 +22,20 @@ rule microphaser_somatic: rule microphaser_germline: input: - vcf="results/strelka/germline/{normal}/results/variants/variants.reheader.norm.bcf", - bam="results/recal/{normal}.sorted.bam", - bai="results/recal/{normal}.sorted.bam.bai", + vcf="results/strelka/germline/{normal_sample}/results/variants/variants.reheader.norm.bcf", + bam="results/recal/{normal_sample}.sorted.bam", + bai="results/recal/{normal_sample}.sorted.bam.bai", track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: wt_fasta=( - "results/microphaser/fasta/germline/{normal}/{normal}.germline.{contig}.fa" + "results/microphaser/fasta/germline/{normal_sample}/{normal_sample}.germline.{contig}.fa" 
), wt_tsv=( - "results/microphaser/info/germline/{normal}/{normal}.germline.{contig}.tsv" + "results/microphaser/info/germline/{normal_sample}/{normal_sample}.germline.{contig}.tsv" ), log: - "logs/microphaser/germline/{normal}-{contig}.log", + "logs/microphaser/germline/{normal_sample}-{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -48,25 +48,25 @@ rule microphaser_germline: rule concat_proteome: input: expand( - "results/microphaser/fasta/germline/{{normal}}/{{normal}}.germline.{contig}.fa", + "results/microphaser/fasta/germline/{{normal_sample}}/{{normal_sample}}.germline.{contig}.fa", contig=contigs, ), output: - "results/microphaser/fasta/germline/{normal}/reference_proteome.fa", + "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa", log: - "logs/microphaser/concat-ref-proteome/{normal}.log", + "logs/microphaser/concat-ref-proteome/{normal_sample}.log", shell: "cat {input} > {output} 2> {log}" rule build_germline_proteome: input: - "results/microphaser/fasta/germline/{normal}/reference_proteome.fa", + "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa", output: - bin="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.bin", - fasta="results/microphaser/fasta/germline/{normal}/{mhc}/reference_proteome.peptides.fasta", + bin="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin", + fasta="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.peptides.fasta", log: - "logs/microphaser/build-ref-proteome-db/{normal}-{mhc}.log", + "logs/microphaser/build-ref-proteome-db/{normal_sample}-{mhc}.log", conda: "../envs/microphaser.yaml" params: @@ -79,19 +79,19 @@ rule build_germline_proteome: rule microphaser_filter: input: - tsv="results/microphaser/info/{sample}/{sample}.{contig}.tsv", + tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv", proteome=get_proteome, output: mt_fasta=( - 
"results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.neo.fa" + "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/{sample}/filtered/{mhc}/{sample}.{contig}.normal.fa" + "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.normal.fa" ), - tsv="results/microphaser/info/{sample}/filtered/{mhc}/{sample}.{contig}.tsv", - removed="results/microphaser/info/{sample}/removed/{mhc}/{sample}.{contig}.removed.tsv", + tsv="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.tsv", + removed="results/microphaser/info/{cancer_sample}/removed/{mhc}/{cancer_sample}.{contig}.removed.tsv", log: - "logs/microphaser/filter/{sample}-{mhc}-{contig}.log", + "logs/microphaser/filter/{cancer_sample}-{mhc}-{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -105,13 +105,13 @@ rule microphaser_filter: rule concat_tsvs: input: expand( - "results/microphaser/info/{{sample}}/filtered/{{mhc}}/{{sample}}.{contig}.tsv", + "results/microphaser/info/{{cancer_sample}}/filtered/{{mhc}}/{{cancer_sample}}.{contig}.tsv", contig=contigs, ), output: - "results/microphaser/info/{sample}/filtered/{mhc}/{sample}.tsv", + "results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv", log: - "logs/concat-tsv/{sample}-{mhc}.log", + "logs/concat-tsv/{cancer_sample}-{mhc}.log", conda: "../envs/xsv.yaml" shell: From 16631009f2ff7fc5d3464ae6a0de6b53aebec5a4 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 16:38:28 +0100 Subject: [PATCH 027/191] fix missing sample -> cancer_sample refactoring --- workflow/rules/MHC_binding.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 50185ca4..ef80b8fb 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -57,7 +57,7 @@ rule parse_mhc_out: chr=contigs, ), output: 
- "results/{mhc}/{cancer_sample}/{sample}.mhc.{peptide_type}.tsv", + "results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.{peptide_type}.tsv", log: "logs/parse-mhc/{mhc}-{cancer_sample}-{peptide_type}.log", wildcard_constraints: From c6606427a57232d2d1d34f89d065f06b41286b10 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Fri, 21 Jan 2022 16:39:29 +0100 Subject: [PATCH 028/191] snakefmt --- workflow/rules/common.smk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d1b2c109..33fd5918 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -466,12 +466,16 @@ def get_alleles_MHCI(wildcards): S=get_normal(wildcards.cancer_sample) ) else: - return "results/optitype/{S}/hla_alleles_{S}.tsv".format(S=wildcards.cancer_sample) + return "results/optitype/{S}/hla_alleles_{S}.tsv".format( + S=wildcards.cancer_sample + ) def get_alleles_MHCII(wildcards): if wildcards.peptide_type == "wt": - return "results/HLA-LA/hlaI_{S}.tsv".format(S=get_normal(wildcards.cancer_sample)) + return "results/HLA-LA/hlaI_{S}.tsv".format( + S=get_normal(wildcards.cancer_sample) + ) else: return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.cancer_sample) From 82bb46986ea5b0e4bf92849994881225234f4527 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 16 Mar 2022 14:07:55 +0100 Subject: [PATCH 029/191] remove (download) and fix (index) HLALA caching --- workflow/rules/ref.smk | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 7e3a3d18..4443311a 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -183,7 +183,6 @@ rule download_HLALA_graph: "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", log: "logs/download-HLA-LA-graph.log", - cache: True shell: "cd resources/graphs && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz " "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz" 
@@ -193,8 +192,11 @@ rule index_HLALA: input: "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", output: - "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", - "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathindex", + multiext( + "resources/graphs/PRG_MHC_GRCh38_withIMGT/", + "serializedGRAPH", + "serializedGRAPH_preGapPathindex" + ) cache: True conda: "../envs/hla_la.yaml" From 5524f1d73abc5d8878ebeb88f6cd6332918ffaed Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Thu, 31 Mar 2022 13:48:18 +0200 Subject: [PATCH 030/191] snakefmt --- workflow/rules/ref.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 4443311a..64cd6e06 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -195,8 +195,8 @@ rule index_HLALA: multiext( "resources/graphs/PRG_MHC_GRCh38_withIMGT/", "serializedGRAPH", - "serializedGRAPH_preGapPathindex" - ) + "serializedGRAPH_preGapPathindex", + ), cache: True conda: "../envs/hla_la.yaml" From 08dab98f999b7b321f5c9e64cf295cbc4f7ba359 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Wed, 6 Apr 2022 14:50:51 +0200 Subject: [PATCH 031/191] WIP: switch to a proper focus on groups of samples for one individual, not yet tested --- .test/config/config.yaml | 3 + workflow/rules/MHC_binding.smk | 40 ++++++------- workflow/rules/calling.smk | 71 ++++++++++------------- workflow/rules/common.smk | 103 ++++++++++++++++++++------------- workflow/rules/microphaser.smk | 68 +++++++++++----------- 5 files changed, 150 insertions(+), 135 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 1d5f7b1b..78bf043b 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -127,6 +127,9 @@ params: 9 netMHCIIpan: 15 + events: + tumor: "strelka_somatic" + normal: "strelka_germline" kallisto: "-b 100" star: >- diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 
ef80b8fb..893d403f 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -1,12 +1,12 @@ # rule mhcflurry: # input: -# peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{chr}.{peptide_type}.fa", +# peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{contig}.{peptide_type}.fa", # alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv", # wt_alleles=get_germline_optitype # output: -# "results/mhcflurry/{sample}/{chr}/output.{peptide_type}.csv" +# "results/mhcflurry/{sample}/{contig}/output.{peptide_type}.csv" # log: -# "logs/mhcflurry/{sample}-{chr}-{peptide_type}.log" +# "logs/mhcflurry/{sample}-{contig}-{peptide_type}.log" # run: # if "wt" in input.peptides: # alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0]) @@ -18,12 +18,12 @@ rule netMHCpan: input: - peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCpan/{cancer_sample}.{chr}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa" alleles=get_alleles_MHCI, output: - "results/netMHCpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls", + "results/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls", log: - "logs/netMHCpan/{cancer_sample}-{chr}-{peptide_type}.log", + "logs/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], @@ -35,12 +35,12 @@ rule netMHCpan: rule netMHCIIpan: input: - peptides="results/microphaser/fasta/{cancer_sample}/filtered/netMHCIIpan/{cancer_sample}.{chr}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa" alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{cancer_sample}/{chr}/{cancer_sample}.{chr}.{peptide_type}.xls", + "results/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls", log: - 
"logs/netMHCIIpan/{cancer_sample}-{chr}-{peptide_type}.log", + "logs/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], @@ -53,22 +53,20 @@ rule netMHCIIpan: rule parse_mhc_out: input: expand( - "results/{{mhc}}/{{cancer_sample}}/{chr}/{{cancer_sample}}.{chr}.{{peptide_type}}.xls", - chr=contigs, + "results/{{mhc}}/{{group}}/{{tumor_event}}.{contig}.{{peptide_type}}.xls", + contig=contigs, ), output: - "results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.{peptide_type}.tsv", + "results/{mhc}/{group}.{tumor_event}.mhc.{peptide_type}.tsv", log: - "logs/parse-mhc/{mhc}-{cancer_sample}-{peptide_type}.log", - wildcard_constraints: - group="wt|mt", + "logs/parse_mhc_out/{mhc}/{group}.{tumor_event}.{peptide_type}.log", script: "../scripts/group_mhc_output.py" # rule parse_mhcflurry: # input: -# expand("results/mhcflurry/{{sample}}/{chr}/output.{{peptide_type}}.csv", chr=contigs) +# expand("results/mhcflurry/{{sample}}/{contig}/output.{{peptide_type}}.csv", contig=contigs) # output: # "results/mhcflurry/{sample}/{sample}.mhc.{peptide_type}.csv" # wildcard_constraints: @@ -83,17 +81,17 @@ rule parse_mhc_out: rule mhc_csv_table: input: - info="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv", - neo="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.neo.tsv", - normal="results/{mhc}/{cancer_sample}/{cancer_sample}.mhc.normal.tsv", + info="results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv", + neo="results/{mhc}/{group}.{tumor_event}.mhc.neo.tsv", + normal="results/{mhc}/{group}.{tumor_event}.mhc.normal.tsv", output: report( - "results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv", + "results/neoantigens/{group}.{tumor_event}.{mhc}.DNA.tsv", caption="../report/WES_results.rst", category="Results WES (netMHC)", ), log: - "logs/create-mhc-table/{mhc}-{cancer_sample}.log", + 
"logs/mhc_csv_table/{group}.{mhc}.{tumor_event}.log", script: "../scripts/merge_data.py" diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index a3889467..48e51d5f 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -1,17 +1,17 @@ -rule strelka_somatic: +rule strelka_tumor: input: - normal=get_normal_bam, - normal_index=get_normal_bai, - tumor="results/recal/{cancer_sample}.sorted.bam", - tumor_index="results/recal/{cancer_sample}.sorted.bam.bai", + normal=get_normal_bam(), + normal_index=get_normal_bam(ext=".bam.bai"), + tumor=get_tumor_bam(), + tumor_index=get_tumor_bam(ext=".bam.bai"), fasta="resources/genome.fasta", fasta_index="resources/genome.fasta.fai", callregions="resources/genome.callregions.bed.gz", output: - "results/strelka/somatic/{cancer_sample}/results/variants/somatic.snvs.vcf.gz", - "results/strelka/somatic/{cancer_sample}/results/variants/somatic.indels.vcf.gz", + "results/strelka/{group}.strelka_somatic.snvs.vcf.gz", + "results/strelka/{group}.strelka_somatic.indels.vcf.gz", log: - "logs/calling/strelka_somatic/{cancer_sample}.log", + "logs/calling/strelka/{group}.strelka_somatic.log", params: config_extra="--callRegions {} {}".format( "resources/genome.callregions.bed.gz", @@ -25,13 +25,13 @@ rule strelka_somatic: rule strelka_germline: input: - bam="results/recal/{normal_sample}.sorted.bam", - normal_index="results/recal/{normal_sample}.sorted.bam.bai", + bam=get_normal_bam(), + normal_index=get_normal_bam(ext=".bam.bai"), fasta="resources/genome.fasta", fasta_index="resources/genome.fasta.fai", callregions="resources/genome.callregions.bed.gz", output: - "results/strelka/germline/{normal_sample}/results/variants/variants.vcf.gz", + "results/strelka/{group}.strelka_germline.variants.vcf.gz", log: "logs/calling/strelka_germline/{normal_sample}.log", params: @@ -61,17 +61,17 @@ rule vcf_to_bcf: rule concat_somatic: input: calls=expand( - 
"results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf", + "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf", type=["snvs", "indels"], ), indices=expand( - "results/strelka/somatic/{{sample}}/results/variants/somatic.{type}.output.bcf.csi", + "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf.csi", type=["snvs", "indels"], ), output: - "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf", + "results/strelka/{group}.strelka_somatic.bcf", log: - "bcftools/concat-somatic/{sample}.log", + "bcftools/concat_somatic/{group}.log", params: "-O b -a", wrapper: @@ -80,11 +80,11 @@ rule concat_somatic: rule get_tumor_from_somatic: input: - "results/strelka/somatic/{sample}/results/variants/somatic.complete.bcf", + "results/strelka/{group}.strelka_somatic.bcf", output: - "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf", + "results/strelka/{group}.strelka_somatic.tumor.bcf", log: - "logs/bcftools/view-TUMOR/{sample}.log", + "logs/bcftools/get_tumor_from_somatic/{group}.strelka_somatic.tumor.log", params: "-O b -s TUMOR", wrapper: @@ -93,12 +93,12 @@ rule get_tumor_from_somatic: rule reheader_germline: input: - vcf="{germline}/variants.output.bcf", + vcf="results/strelka/{group}.strelka_germline.variants.output.bcf", samples="resources/sampleheader.txt", output: - "{germline}/variants.reheader.bcf", + "results/strelka/{group}.strelka_germline.variants.reheader.bcf", log: - "logs/bcftools/reheader/{germline}.log", + "logs/bcftools/reheader_germline/{group}.log", params: extra="", view_extra="-O b", @@ -108,33 +108,24 @@ rule reheader_germline: rule concat_variants: input: - calls=lambda w: get_pair_variants(w, index=False), - index=lambda w: get_pair_variants(w, index=True), + calls=[ + "results/strelka/{group}.strelka_somatic.tumor.bcf", + "results/strelka/{group}.strelka_germline.variants.reheader.bcf", + ], + index=[ + "results/strelka/{group}.strelka_somatic.tumor.bcf.csi", + 
"results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi", + ], output: - "results/strelka/merged/{sample}/all_variants.bcf", + "results/strelka/merged/{group}.strelka_somatic.strelka_germline.bcf", log: - "bcftools/concat-all/{sample}.log", + "bcftools/concat_variants/{group}.strelka_somatic.strelka_germline.log", params: extra="-O b -a", wrapper: "0.64.0/bio/bcftools/concat" -rule preprocess_variants: - input: - variants="{variants}.bcf", - output: - "{variants}.prepy.bcf", - params: - extra="-L --somatic", - genome="resources/genome.fasta", - log: - "logs/prepy/{variants}.log", - threads: 2 - wrapper: - "0.60.0/bio/hap.py/pre.py" - - rule norm_vcf: input: "{prefix}.bcf", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 33fd5918..9e2f88b5 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -74,22 +74,27 @@ def is_activated(xpath): def get_final_output(): + final_output = [] if config["epitope_prediction"]["activate"]: - final_output = expand( - "results/neoantigens/{mhc}/{S.sample_name}.{S.sequencing_type}.xlsx", - S=units.loc[samples[samples.alias == "tumor"]["sample_name"]] - .drop_duplicates(["sample_name", "sequencing_type"]) - .itertuples(), - mhc=list( - filter( - None, - [ - "netMHCpan" if is_activated("affinity/netMHCpan") else None, - "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None, - ], - ) - ), - ) + for group in pd.unique(samples["group"]): + samples = samples.loc[samples["group"] == group, "sample_name"] + sequencing_types = pd.unique(units.loc[units.sample_name in samples, "sequencing_type"]) + final_output.extend( + expand( + "results/neoantigens/{group}.{tumor_event}.{mhc}.{seqtype}.tsv", + group=group, + tumor_event=config["params"]["microphaser"]["events"]["tumor"], + mhc=list( + filter( + None, + [ + "netMHCpan" if is_activated("affinity/netMHCpan") else None, + "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None, + ], + ) + ), + seqtype=sequencing_types, + ) 
else: if config["HLAtyping"]["HLA_LA"]["activate"]: final_output = expand( @@ -340,13 +345,13 @@ def get_pair_variants(wildcards, index): else: ext = "" variants = [ - "results/strelka/somatic/{}/results/variants/somatic.complete.tumor.bcf{}".format( + "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf{ext}".format( wildcards.sample, ext ) ] variants.append( "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format( - get_normal(wildcards.sample), ext + get_normal_from_group(wildcards.group), ext ) ) return variants @@ -376,7 +381,7 @@ def get_merge_input(ext=".bcf"): def get_pair_aliases(wildcards): return [ - samples.loc[get_normal(wildcards.cancer_sample), "alias"], + samples.loc[get_normal_from_sample(wildcards.cancer_sample), "alias"], samples.loc[wildcards.cancer_sample, "alias"], ] @@ -389,6 +394,16 @@ def get_tabix_params(wildcards): raise ValueError("Invalid format for tabix: {}".format(wildcards.format)) +def get_normal_bam(wildcards, ext=".bam"): + normal_sample = get_normal_from_group(wildcards.group) + return f"results/recal/{normal_sample}.sorted{ext}" + + +def get_tumor_bam(wildcards, ext=".bam"): + tumor_sample = get_tumor_from_group(wildcards.group) + return f"results/recal/{tumor_sample}.sorted{ext}" + + ## RNA ## @@ -418,7 +433,7 @@ def kallisto_params(wildcards, input): def get_paired_samples(wildcards): return [ - get_normal(wildcards.cancer_sample), + get_normal_from_sample(wildcards.cancer_sample), samples.loc[wildcards.cancer_sample, "sample_name"], ] @@ -435,7 +450,7 @@ def get_paired_bais(wildcards): ) -def get_normal(sample_name): +def get_normal_from_sample(sample_name): normal_sample = samples.loc[ (samples["group"] == samples.loc[sample_name, "group"]) & (samples["alias"] == "normal"), @@ -444,6 +459,24 @@ def get_normal(sample_name): return normal_sample +def get_normal_from_group(group): + normal_sample = samples.loc[ + (samples["group"] == group) + & (samples["alias"] == "normal"), + 
"sample_name", + ].iat[0] + return normal_sample + + +def get_tumor_from_group(group): + tumor_sample = samples.loc[ + (samples["group"] == group) + & (samples["alias"] == "tumor"), + "sample_name", + ].iat[0] + return tumor_sample + + def get_reads(wildcards): return get_seperate(wildcards.sample, wildcards.group) @@ -454,40 +487,30 @@ def get_seperate(sample, group): def get_proteome(wildcards): return expand( - "results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin", - normal_sample=get_normal(wildcards.cancer_sample), + "results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", + normal_event=config["params"]["microphaser"]["events"]["normal"], mhc=wildcards.mhc, ) def get_alleles_MHCI(wildcards): - if wildcards.peptide_type == "wt": + if wildcards.peptide_type == "normal": return "results/optitype/{S}/hla_alleles_{S}.tsv".format( - S=get_normal(wildcards.cancer_sample) + S=get_normal_from_group(wildcards.group) ) else: return "results/optitype/{S}/hla_alleles_{S}.tsv".format( - S=wildcards.cancer_sample + S=get_tumor_from_group(wildcards.group) ) def get_alleles_MHCII(wildcards): - if wildcards.peptide_type == "wt": + if wildcards.peptide_type == "normal": return "results/HLA-LA/hlaI_{S}.tsv".format( - S=get_normal(wildcards.cancer_sample) + S=get_normal_from_group(wildcards.group) ) else: - return "results/HLA-LA/hlaI_{S}.tsv".format(S=wildcards.cancer_sample) - - -def get_normal_bam(wildcards): - return expand( - "results/recal/{normal}.sorted.bam", normal=get_normal(wildcards.cancer_sample) - ) - + return "results/HLA-LA/hlaI_{S}.tsv".format( + S=get_tumor_from_group(wildcards.group) + ) -def get_normal_bai(wildcards): - return expand( - "results/recal/{normal}.sorted.bam.bai", - normal=get_normal(wildcards.cancer_sample), - ) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index caef83cf..695fac1b 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -1,16 
+1,16 @@ -rule microphaser_somatic: +rule microphaser_tumor: input: - vcf="results/strelka/merged/{cancer_sample}/all_variants.norm.annotated.bcf", - bam="results/recal/{cancer_sample}.sorted.bam", - bai="results/recal/{cancer_sample}.sorted.bam.bai", + vcf="results/strelka/merged/{group}.{tumor_event}.{normal_event}.norm.annotated.bcf", + bam=get_tumor_bam(), + bai=get_tumor_bam(ext=".bam.bai"), track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - mt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.neo.fa", - wt_fasta="results/microphaser/fasta/{cancer_sample}/{cancer_sample}.{contig}.normal.fa", - tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv", + mt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.neo.fa", + wt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.normal.fa", + tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{normal_event}.{contig}.tsv", log: - "logs/microphaser/somatic/{cancer_sample}-{contig}.log", + "logs/microphaser_tumor/{group}/{tumor_event}-{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -20,22 +20,22 @@ rule microphaser_somatic: "< {input.track} > {output.mt_fasta} 2> {log}" -rule microphaser_germline: +rule microphaser_normal: input: - vcf="results/strelka/germline/{normal_sample}/results/variants/variants.reheader.norm.bcf", - bam="results/recal/{normal_sample}.sorted.bam", - bai="results/recal/{normal_sample}.sorted.bam.bai", + vcf="results/strelka/normal/{group}.{normal_event}.variants.reheader.norm.bcf", + bam=get_normal_bam(), + bai=get_normal_bam(ext=".bam.bai"), track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: wt_fasta=( - "results/microphaser/fasta/germline/{normal_sample}/{normal_sample}.germline.{contig}.fa" + "results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa" ), wt_tsv=( - 
"results/microphaser/info/germline/{normal_sample}/{normal_sample}.germline.{contig}.tsv" + "results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv" ), log: - "logs/microphaser/germline/{normal_sample}-{contig}.log", + "logs/microphaser_germline/{group}/{normal_event}-{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -45,28 +45,28 @@ rule microphaser_germline: "< {input.track} > {output.wt_fasta} 2> {log}" -rule concat_proteome: +rule concat_normal_proteome: input: expand( - "results/microphaser/fasta/germline/{{normal_sample}}/{{normal_sample}}.germline.{contig}.fa", + "results/microphaser/fasta/{{group}}/normal.{{normal_event}}.{contig}.fa", contig=contigs, ), output: - "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa", + "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa", log: - "logs/microphaser/concat-ref-proteome/{normal_sample}.log", + "logs/microphaser/concat_normal_proteome/{group}.{normal_event}.log", shell: "cat {input} > {output} 2> {log}" -rule build_germline_proteome: +rule build_normal_proteome_db: input: - "results/microphaser/fasta/germline/{normal_sample}/reference_proteome.fa", + "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa", output: - bin="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.bin", - fasta="results/microphaser/fasta/germline/{normal_sample}/{mhc}/reference_proteome.peptides.fasta", + bin="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", + fasta="results/microphaser/fasta/{group}.{normal_event}.{mhc}.normal_proteome.peptides.fasta", log: - "logs/microphaser/build-ref-proteome-db/{normal_sample}-{mhc}.log", + "logs/microphaser/build_normal_proteome_db/{group}.{normal_event}-{mhc}.log", conda: "../envs/microphaser.yaml" params: @@ -79,19 +79,19 @@ rule build_germline_proteome: rule microphaser_filter: input: - tsv="results/microphaser/info/{cancer_sample}/{cancer_sample}.{contig}.tsv", - 
proteome=get_proteome, + tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv", + proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", output: mt_fasta=( - "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.neo.fa" + "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.normal.fa" + "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.normal.fa" ), - tsv="results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.{contig}.tsv", - removed="results/microphaser/info/{cancer_sample}/removed/{mhc}/{cancer_sample}.{contig}.removed.tsv", + tsv="results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv", + removed="results/microphaser/info/removed/{group}/{mhc}.{tumor_event}.{contig}.removed.tsv", log: - "logs/microphaser/filter/{cancer_sample}-{mhc}-{contig}.log", + "logs/microphaser_filter/{group}/{mhc}.{tumor_event}.{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -105,13 +105,13 @@ rule microphaser_filter: rule concat_tsvs: input: expand( - "results/microphaser/info/{{cancer_sample}}/filtered/{{mhc}}/{{cancer_sample}}.{contig}.tsv", + "results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv", contig=contigs, ), output: - "results/microphaser/info/{cancer_sample}/filtered/{mhc}/{cancer_sample}.tsv", + "results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv", log: - "logs/concat-tsv/{cancer_sample}-{mhc}.log", + "logs/concat_tsvs/{group}.{mhc}.{tumor_event}.log", conda: "../envs/xsv.yaml" shell: From 8cc18bb6fba3015711135afc86afc4617a3d3664 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Mon, 9 May 2022 15:52:16 +0200 Subject: [PATCH 032/191] add missing parenthesis --- workflow/rules/common.smk | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/rules/common.smk 
b/workflow/rules/common.smk index 9e2f88b5..7472c96a 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -94,6 +94,7 @@ def get_final_output(): ) ), seqtype=sequencing_types, + ) ) else: if config["HLAtyping"]["HLA_LA"]["activate"]: From 14d4eb93f50d4820347e156395473918870be20a Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 14:33:59 +0000 Subject: [PATCH 033/191] remove unused function get_pair_variants() in common.smk --- workflow/rules/common.smk | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7472c96a..6b20358f 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -340,24 +340,6 @@ def get_fdr_control_params(wildcards): return {"threshold": threshold, "events": events} -def get_pair_variants(wildcards, index): - if index: - ext = ".csi" - else: - ext = "" - variants = [ - "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf{ext}".format( - wildcards.sample, ext - ) - ] - variants.append( - "results/strelka/germline/{}/results/variants/variants.reheader.bcf{}".format( - get_normal_from_group(wildcards.group), ext - ) - ) - return variants - - def get_pair_observations(wildcards): return expand( "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf", From d57cac359311f589eeebdb46e58d0ae7784df10c Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 14:35:06 +0000 Subject: [PATCH 034/191] switch input functions with extra args to def inner(wildcards): syntax --- workflow/rules/common.smk | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 6b20358f..8e5b8d08 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -271,12 +271,14 @@ def get_read_group(wildcards): ) -def get_recalibrate_quality_input(wildcards, bai=False): +def 
get_recalibrate_quality_input(bai=False): ext = ".bai" if bai else "" - if is_activated("remove_duplicates"): - return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext) - else: - return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext) + def inner(wildcards): + if is_activated("remove_duplicates"): + return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext) + else: + return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext) + return inner ## HLA Typing ## @@ -377,14 +379,18 @@ def get_tabix_params(wildcards): raise ValueError("Invalid format for tabix: {}".format(wildcards.format)) -def get_normal_bam(wildcards, ext=".bam"): - normal_sample = get_normal_from_group(wildcards.group) - return f"results/recal/{normal_sample}.sorted{ext}" +def get_normal_bam(ext=".bam"): + def inner(wildcards): + normal_sample = get_normal_from_group(wildcards.group) + return f"results/recal/{normal_sample}.sorted{ext}" + return inner -def get_tumor_bam(wildcards, ext=".bam"): - tumor_sample = get_tumor_from_group(wildcards.group) - return f"results/recal/{tumor_sample}.sorted{ext}" +def get_tumor_bam(ext=".bam"): + def inner(wildcards): + tumor_sample = get_tumor_from_group(wildcards.group) + return f"results/recal/{tumor_sample}.sorted{ext}" + return inner ## RNA ## From 569aaadbc9451202f487077a9b9bc481a76d95d3 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 14:36:53 +0000 Subject: [PATCH 035/191] fix overlooked wildcard rename substitution --- workflow/rules/calling.smk | 2 +- workflow/rules/microphaser.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index 48e51d5f..4193d528 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -33,7 +33,7 @@ rule strelka_germline: output: "results/strelka/{group}.strelka_germline.variants.vcf.gz", log: - "logs/calling/strelka_germline/{normal_sample}.log", + 
"logs/calling/strelka_germline/{group}.log", params: config_extra="--callRegions {} {}".format( "resources/genome.callregions.bed.gz", diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 695fac1b..939b2526 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -10,7 +10,7 @@ rule microphaser_tumor: wt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.normal.fa", tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{normal_event}.{contig}.tsv", log: - "logs/microphaser_tumor/{group}/{tumor_event}-{contig}.log", + "logs/microphaser_tumor/{group}/{tumor_event}.{normal_event}.{contig}.log", conda: "../envs/microphaser.yaml" params: From 3c9e0e73e6b3855f1dfdeb0c5bc96d2f47ab9f6f Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 14:39:40 +0000 Subject: [PATCH 036/191] rule concat_tsvs: fix wildcard pass-through via double brackets --- workflow/rules/microphaser.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 939b2526..548e39e5 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -105,7 +105,7 @@ rule microphaser_filter: rule concat_tsvs: input: expand( - "results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv", + "results/microphaser/info/filtered/{{group}}/{{mhc}}.{{tumor_event}}.{contig}.tsv", contig=contigs, ), output: From b785271752faf1ad6038a3b29c73e6aa4cf51a3d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 14:40:11 +0000 Subject: [PATCH 037/191] missing comma --- workflow/rules/MHC_binding.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 893d403f..eca8f056 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -18,7 +18,7 @@ rule netMHCpan: input: - 
peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa" + peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: "results/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls", @@ -35,7 +35,7 @@ rule netMHCpan: rule netMHCIIpan: input: - peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa" + peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: "results/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls", From 67c328662faed68fbc8f240d096e577637775470 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 14:59:01 +0000 Subject: [PATCH 038/191] snakefmt --- workflow/rules/calling.smk | 4 ++-- workflow/rules/common.smk | 23 +++++++++++++++-------- workflow/rules/microphaser.smk | 8 ++------ 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index 4193d528..3ac76cb4 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -108,11 +108,11 @@ rule reheader_germline: rule concat_variants: input: - calls=[ + calls=[ "results/strelka/{group}.strelka_somatic.tumor.bcf", "results/strelka/{group}.strelka_germline.variants.reheader.bcf", ], - index=[ + index=[ "results/strelka/{group}.strelka_somatic.tumor.bcf.csi", "results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi", ], diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 8e5b8d08..3eb2fd65 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -78,7 +78,9 @@ def get_final_output(): if config["epitope_prediction"]["activate"]: for group in pd.unique(samples["group"]): samples = samples.loc[samples["group"] == group, "sample_name"] - sequencing_types = 
pd.unique(units.loc[units.sample_name in samples, "sequencing_type"]) + sequencing_types = pd.unique( + units.loc[units.sample_name in samples, "sequencing_type"] + ) final_output.extend( expand( "results/neoantigens/{group}.{tumor_event}.{mhc}.{seqtype}.tsv", @@ -88,8 +90,12 @@ def get_final_output(): filter( None, [ - "netMHCpan" if is_activated("affinity/netMHCpan") else None, - "netMHCIIpan" if is_activated("affinity/netMHCIIpan") else None, + "netMHCpan" + if is_activated("affinity/netMHCpan") + else None, + "netMHCIIpan" + if is_activated("affinity/netMHCIIpan") + else None, ], ) ), @@ -273,11 +279,13 @@ def get_read_group(wildcards): def get_recalibrate_quality_input(bai=False): ext = ".bai" if bai else "" + def inner(wildcards): if is_activated("remove_duplicates"): return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext) else: return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext) + return inner @@ -383,6 +391,7 @@ def get_normal_bam(ext=".bam"): def inner(wildcards): normal_sample = get_normal_from_group(wildcards.group) return f"results/recal/{normal_sample}.sorted{ext}" + return inner @@ -390,6 +399,7 @@ def get_tumor_bam(ext=".bam"): def inner(wildcards): tumor_sample = get_tumor_from_group(wildcards.group) return f"results/recal/{tumor_sample}.sorted{ext}" + return inner @@ -450,8 +460,7 @@ def get_normal_from_sample(sample_name): def get_normal_from_group(group): normal_sample = samples.loc[ - (samples["group"] == group) - & (samples["alias"] == "normal"), + (samples["group"] == group) & (samples["alias"] == "normal"), "sample_name", ].iat[0] return normal_sample @@ -459,8 +468,7 @@ def get_normal_from_group(group): def get_tumor_from_group(group): tumor_sample = samples.loc[ - (samples["group"] == group) - & (samples["alias"] == "tumor"), + (samples["group"] == group) & (samples["alias"] == "tumor"), "sample_name", ].iat[0] return tumor_sample @@ -502,4 +510,3 @@ def get_alleles_MHCII(wildcards): return 
"results/HLA-LA/hlaI_{S}.tsv".format( S=get_tumor_from_group(wildcards.group) ) - diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 548e39e5..0c07d588 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -28,12 +28,8 @@ rule microphaser_normal: track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - wt_fasta=( - "results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa" - ), - wt_tsv=( - "results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv" - ), + wt_fasta=("results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa"), + wt_tsv=("results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv"), log: "logs/microphaser_germline/{group}/{normal_event}-{contig}.log", conda: From 5e7e7e9e6dac4e6d2d1e32b8f5c4a22b8d5ee251 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 15:56:01 +0000 Subject: [PATCH 039/191] fix samples handling for get_final_output() --- workflow/rules/common.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3eb2fd65..70897797 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -77,9 +77,9 @@ def get_final_output(): final_output = [] if config["epitope_prediction"]["activate"]: for group in pd.unique(samples["group"]): - samples = samples.loc[samples["group"] == group, "sample_name"] + smps = samples.loc[samples["group"] == group, "sample_name"] sequencing_types = pd.unique( - units.loc[units.sample_name in samples, "sequencing_type"] + units.loc[units["sample_name"].isin(smps), "sequencing_type"] ) final_output.extend( expand( From 59ef419872076a6919aac31e9dd749545eb60afd Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 15:56:46 +0000 Subject: [PATCH 040/191] include required params: microphase: events: entry "tumor:" --- config/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff 
--git a/config/config.yaml b/config/config.yaml index f1e6818a..f6e5d8c3 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -116,6 +116,8 @@ params: 9 netMHCIIpan: 15 + events: + tumor: "tumor_unfiltered" kallisto: "-b 100" star: >- From 4cfa6c1d5ec8fb9fe2e805473b33d12b3865ca80 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 16:18:19 +0000 Subject: [PATCH 041/191] set and use microphaser normal event definition in config.yaml --- config/config.yaml | 1 + workflow/rules/common.smk | 4 +--- workflow/rules/microphaser.smk | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index f6e5d8c3..b031d126 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -118,6 +118,7 @@ params: 15 events: tumor: "tumor_unfiltered" + normal: "normal_unfiltered" kallisto: "-b 100" star: >- diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 70897797..b956b741 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -484,12 +484,10 @@ def get_seperate(sample, group): def get_proteome(wildcards): return expand( - "results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", + "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin", normal_event=config["params"]["microphaser"]["events"]["normal"], - mhc=wildcards.mhc, ) - def get_alleles_MHCI(wildcards): if wildcards.peptide_type == "normal": return "results/optitype/{S}/hla_alleles_{S}.tsv".format( diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 0c07d588..1437d866 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -76,7 +76,7 @@ rule build_normal_proteome_db: rule microphaser_filter: input: tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv", - proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", + proteome=get_proteome(), output: mt_fasta=( 
"results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa" From d6c1287fb780fa7b42e10d6c038103913854c68b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 20:57:31 +0000 Subject: [PATCH 042/191] fix input function syntax --- workflow/rules/common.smk | 10 ++-------- workflow/rules/mapping.smk | 8 ++++---- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b956b741..600b9fce 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -282,9 +282,9 @@ def get_recalibrate_quality_input(bai=False): def inner(wildcards): if is_activated("remove_duplicates"): - return "results/dedup/{}.sorted.bam{}".format(wildcards.sample, ext) + return f"results/dedup/{wildcards.sample}.sorted.bam{ext}" else: - return "results/mapped/{}.sorted.bam{}".format(wildcards.sample, ext) + return f"results/mapped/{wildcards.sample}.sorted.bam{ext}" return inner @@ -482,12 +482,6 @@ def get_seperate(sample, group): return units.loc[(sample, "DNA"), "fq{}".format(str(group))] -def get_proteome(wildcards): - return expand( - "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin", - normal_event=config["params"]["microphaser"]["events"]["normal"], - ) - def get_alleles_MHCI(wildcards): if wildcards.peptide_type == "normal": return "results/optitype/{S}/hla_alleles_{S}.tsv".format( diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 8cfcb51c..13be68ad 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -32,8 +32,8 @@ rule mark_duplicates: rule recalibrate_base_qualities: input: - bam=get_recalibrate_quality_input, - bai=lambda w: get_recalibrate_quality_input(w, bai=True), + bam=get_recalibrate_quality_input(), + bai=get_recalibrate_quality_input(bai=True), ref="resources/genome.fasta", ref_dict="resources/genome.dict", ref_fai="resources/genome.fasta.fai", @@ -53,8 +53,8 @@ rule recalibrate_base_qualities: 
rule apply_bqsr: input: - bam=get_recalibrate_quality_input, - bai=lambda w: get_recalibrate_quality_input(w, bai=True), + bam=get_recalibrate_quality_input(), + bai=get_recalibrate_quality_input(bai=True), ref="resources/genome.fasta", ref_dict="resources/genome.dict", ref_fai="resources/genome.fasta.fai", From 75cfee31de99731d6bc40aca59c36da5b985c181 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 9 May 2022 20:58:21 +0000 Subject: [PATCH 043/191] fix input of rule microphaser_filter --- workflow/rules/microphaser.smk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 1437d866..b8d0dae9 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -76,7 +76,10 @@ rule build_normal_proteome_db: rule microphaser_filter: input: tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv", - proteome=get_proteome(), + proteome=expand( + "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin", + normal_event=config["params"]["microphaser"]["events"]["normal"], + ), output: mt_fasta=( "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa" From 71790e3b041977916495588739c9768119525323 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 10 May 2022 16:02:14 +0000 Subject: [PATCH 044/191] reworked to consistently allow for multiple tumor aliases per group --- config/config.yaml | 4 +- workflow/rules/HLAtyping.smk | 26 ++++----- workflow/rules/MHC_binding.smk | 36 ++++++------- workflow/rules/RNA.smk | 4 +- workflow/rules/calling.smk | 44 +++++++-------- workflow/rules/common.smk | 99 +++++++++++++++++----------------- workflow/rules/microphaser.smk | 34 ++++++------ 7 files changed, 122 insertions(+), 125 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index b031d126..e5b9670a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -117,8 +117,8 @@ params: netMHCIIpan: 15 
events: - tumor: "tumor_unfiltered" - normal: "normal_unfiltered" + tumor: "strelka_somatic" + normal: "strelka_germline" kallisto: "-b 100" star: >- diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index e68cd89b..bacfa45a 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -1,13 +1,13 @@ rule HLA_LA: input: - bam="results/recal/{sample}.sorted.bam", - bai="results/recal/{sample}.sorted.bam.bai", + bam=get_bam_from_group_and_alias(), + bai=get_bam_from_group_and_alias(ext=".bai"), index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", output: - "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt", + "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt", threads: 7 log: - "logs/HLA-LA/{sample}.log", + "logs/HLA-LA/{group}.{alias}.log", params: graph=lambda w, input: os.path.basename(os.path.dirname(input.index)), graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)), @@ -19,20 +19,20 @@ rule HLA_LA: rule parse_HLA_LA: input: - "results/HLA-LA/output/{sample}/hla/R1_bestguess_G.txt", + "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt", output: report( - "results/HLA-LA/hlaI_{sample}.tsv", + "results/HLA-LA/{group}.{alias}.hlaI.tsv", caption="../report/HLA_Types.rst", category="HLA-Typing(HLA-LA)", ), report( - "results/HLA-LA/hlaII_{sample}.tsv", + "results/HLA-LA/{group}.{alias}.hlaII.tsv", caption="../report/HLA_Types.rst", category="HLA-Typing(HLA-LA)", ), log: - "logs/parse-HLA-LA/{sample}.log", + "logs/parse-HLA-LA/{group}.{alias}.log", script: "../scripts/parse_HLA_types.py" @@ -71,10 +71,10 @@ rule OptiType: reads=get_optitype_reads_input, output: multiext( - "results/optitype/{sample}/{sample}", "_coverage_plot.pdf", "_result.tsv" + "results/optitype/{group}/{group}.{alias}", ".coverage_plot.pdf", ".result.tsv" ), log: - "logs/optitype/{sample}.log", + "logs/optitype/{group}.{alias}.log", params: extra=config["params"]["optitype"], sequencing_type="dna", 
@@ -84,15 +84,15 @@ rule OptiType: rule parse_Optitype: input: - "results/optitype/{sample}/{sample}_result.tsv", + "results/optitype/{group}/{group}.{alias}.result.tsv", output: report( - "results/optitype/{sample}/hla_alleles_{sample}.tsv", + "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", caption="../report/HLA_Types.rst", category="HLA-Typing(Optitype)", ), log: - "logs/parse-optitype/{sample}.log", + "logs/parse-optitype/{group}.{alias}.log", shell: "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' " '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}' diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index eca8f056..d33b0f82 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -18,12 +18,12 @@ rule netMHCpan: input: - peptides="results/microphaser/fasta/filtered/{group}/netMHCpan.{tumor_event}.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: - "results/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls", + "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls", log: - "logs/netMHCpan/{group}/{tumor_event}.{contig}.{peptide_type}.log", + "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], @@ -35,12 +35,12 @@ rule netMHCpan: rule netMHCIIpan: input: - peptides="results/microphaser/fasta/filtered/{group}/netMHCIIpan.{tumor_event}.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCIIpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.xls", + 
"results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls", log: - "logs/netMHCIIpan/{group}/{tumor_event}.{contig}.{peptide_type}.log", + "logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], @@ -53,13 +53,13 @@ rule netMHCIIpan: rule parse_mhc_out: input: expand( - "results/{{mhc}}/{{group}}/{{tumor_event}}.{contig}.{{peptide_type}}.xls", + "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{contig}.{{peptide_type}}.xls", contig=contigs, ), output: - "results/{mhc}/{group}.{tumor_event}.mhc.{peptide_type}.tsv", + "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.{peptide_type}.tsv", log: - "logs/parse_mhc_out/{mhc}/{group}.{tumor_event}.{peptide_type}.log", + "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{peptide_type}.log", script: "../scripts/group_mhc_output.py" @@ -81,17 +81,17 @@ rule parse_mhc_out: rule mhc_csv_table: input: - info="results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv", - neo="results/{mhc}/{group}.{tumor_event}.mhc.neo.tsv", - normal="results/{mhc}/{group}.{tumor_event}.mhc.normal.tsv", + info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv", + neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.neo.tsv", + normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.normal.tsv", output: report( - "results/neoantigens/{group}.{tumor_event}.{mhc}.DNA.tsv", + "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv", caption="../report/WES_results.rst", category="Results WES (netMHC)", ), log: - "logs/mhc_csv_table/{group}.{mhc}.{tumor_event}.log", + "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{mhc}.log", script: "../scripts/merge_data.py" @@ -109,17 +109,17 @@ rule mhc_csv_table: rule add_RNA_info: input: - counts="results/kallisto/{cancer_sample}", - 
table="results/neoantigens/{mhc}/{cancer_sample}.DNA.tsv", + counts="results/kallisto/{group}.{tumor_alias}", + table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv", output: report( - "results/neoantigens/{mhc}/{cancer_sample}.RNA.tsv", + "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.RNA.tsv", caption="../report/RNA_results.rst", category="Results RNA", ), params: abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), log: - "logs/add-RNA/{mhc}-{cancer_sample}.log", + "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{mhc}.log", script: "../scripts/add_rna_info.py" diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk index 940d7c02..2a927c18 100644 --- a/workflow/rules/RNA.smk +++ b/workflow/rules/RNA.smk @@ -3,11 +3,11 @@ rule kallisto_quant: fastq=get_quant_reads_input, index="resources/kallisto/transcripts.idx", output: - directory("results/kallisto/{sample}"), + directory("results/kallisto/{group}.{tumor_alias}"), params: extra=kallisto_params, log: - "results/logs/kallisto/quant/{sample}.log", + "results/logs/kallisto/quant/{group}.{tumor_alias}.log", wrapper: "0.60.1/bio/kallisto/quant" diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index 3ac76cb4..9fca7233 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -1,17 +1,17 @@ rule strelka_tumor: input: normal=get_normal_bam(), - normal_index=get_normal_bam(ext=".bam.bai"), - tumor=get_tumor_bam(), - tumor_index=get_tumor_bam(ext=".bam.bai"), + normal_index=get_normal_bam(ext=".bai"), + tumor=get_tumor_bam_from_group_and_alias(), + tumor_index=get_tumor_bam_from_group_and_alias(ext=".bai"), fasta="resources/genome.fasta", fasta_index="resources/genome.fasta.fai", callregions="resources/genome.callregions.bed.gz", output: - "results/strelka/{group}.strelka_somatic.snvs.vcf.gz", - "results/strelka/{group}.strelka_somatic.indels.vcf.gz", + "results/strelka/{group}.{tumor_alias}.strelka_somatic.snvs.vcf.gz", 
+ "results/strelka/{group}.{tumor_alias}.strelka_somatic.indels.vcf.gz", log: - "logs/calling/strelka/{group}.strelka_somatic.log", + "logs/calling/strelka/{group}.{tumor_alias}.strelka_somatic.log", params: config_extra="--callRegions {} {}".format( "resources/genome.callregions.bed.gz", @@ -26,12 +26,12 @@ rule strelka_tumor: rule strelka_germline: input: bam=get_normal_bam(), - normal_index=get_normal_bam(ext=".bam.bai"), + normal_index=get_normal_bam(ext=".bai"), fasta="resources/genome.fasta", fasta_index="resources/genome.fasta.fai", callregions="resources/genome.callregions.bed.gz", output: - "results/strelka/{group}.strelka_germline.variants.vcf.gz", + "results/strelka/{group}.normal.strelka_germline.variants.vcf.gz", log: "logs/calling/strelka_germline/{group}.log", params: @@ -61,17 +61,17 @@ rule vcf_to_bcf: rule concat_somatic: input: calls=expand( - "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf", + "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf", type=["snvs", "indels"], ), indices=expand( - "results/strelka/{{group}}.strelka_somatic.{type}.output.bcf.csi", + "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf.csi", type=["snvs", "indels"], ), output: - "results/strelka/{group}.strelka_somatic.bcf", + "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf", log: - "bcftools/concat_somatic/{group}.log", + "bcftools/concat_somatic/{group}.{tumor_alias}.log", params: "-O b -a", wrapper: @@ -80,11 +80,11 @@ rule concat_somatic: rule get_tumor_from_somatic: input: - "results/strelka/{group}.strelka_somatic.bcf", + "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf", output: - "results/strelka/{group}.strelka_somatic.tumor.bcf", + "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf", log: - "logs/bcftools/get_tumor_from_somatic/{group}.strelka_somatic.tumor.log", + "logs/bcftools/get_tumor_from_somatic/{group}.{tumor_alias}.strelka_somatic.tumor.log", params: "-O b 
-s TUMOR", wrapper: @@ -93,12 +93,12 @@ rule get_tumor_from_somatic: rule reheader_germline: input: - vcf="results/strelka/{group}.strelka_germline.variants.output.bcf", + vcf="results/strelka/{group}.normal.strelka_germline.variants.output.bcf", samples="resources/sampleheader.txt", output: - "results/strelka/{group}.strelka_germline.variants.reheader.bcf", + "results/strelka/{group}.normal.strelka_germline.variants.reheader.bcf", log: - "logs/bcftools/reheader_germline/{group}.log", + "logs/bcftools/reheader_germline/{group}.normal.log", params: extra="", view_extra="-O b", @@ -109,17 +109,17 @@ rule reheader_germline: rule concat_variants: input: calls=[ - "results/strelka/{group}.strelka_somatic.tumor.bcf", + "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf", "results/strelka/{group}.strelka_germline.variants.reheader.bcf", ], index=[ - "results/strelka/{group}.strelka_somatic.tumor.bcf.csi", + "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf.csi", "results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi", ], output: - "results/strelka/merged/{group}.strelka_somatic.strelka_germline.bcf", + "results/strelka/merged/{group}.{tumor_alias}.strelka_somatic.strelka_germline.bcf", log: - "bcftools/concat_variants/{group}.strelka_somatic.strelka_germline.log", + "bcftools/concat_variants/{group}.{tumor_alias}.strelka_somatic.strelka_germline.log", params: extra="-O b -a", wrapper: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 600b9fce..0e159e7c 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -16,7 +16,11 @@ samples = ( pd.read_csv( config["samples"], sep="\t", - dtype={"sample_name": str, "group": str}, + dtype={ + "sample_name": str, + "group": str, + "alias": str, + }, comment="#", ) .set_index("sample_name", drop=False) @@ -55,6 +59,7 @@ wildcard_constraints: sample="|".join(samples["sample_name"]), unit="|".join(units["unit_name"]), 
alias="|".join(pd.unique(samples["alias"])), + tumor_alias="|".join(pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])), group="|".join(pd.unique(samples["group"])), caller="|".join(["freebayes", "delly"]), peptide_type="|".join(["normal", "neo"]), @@ -81,10 +86,12 @@ def get_final_output(): sequencing_types = pd.unique( units.loc[units["sample_name"].isin(smps), "sequencing_type"] ) + tumor_aliases = samples.loc[(samples["group"] == group) & (samples["alias"].str.match("tumor")), "alias"] final_output.extend( expand( - "results/neoantigens/{group}.{tumor_event}.{mhc}.{seqtype}.tsv", + "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.{seqtype}.tsv", group=group, + tumor_alias=tumor_aliases, tumor_event=config["params"]["microphaser"]["events"]["tumor"], mhc=list( filter( @@ -261,8 +268,8 @@ def get_fastqs(wc): return units.loc[wc.sample].loc[wc.seqtype, fq].tolist() -def get_map_reads_input(wildcards): - if is_paired_end(wildcards.sample, "DNA"): +def get_map_reads_input(sample): + if is_paired_end(sample, "DNA"): return [ "results/merged/DNA/{sample}_R1.fastq.gz", "results/merged/DNA/{sample}_R2.fastq.gz", @@ -293,16 +300,17 @@ def get_recalibrate_quality_input(bai=False): def get_optitype_reads_input(wildcards): + sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias) if is_activated("HLAtyping/optitype_prefiltering"): - if is_paired_end(wildcards.sample, "DNA"): + if is_paired_end(sample, "DNA"): return expand( "results/razers3/fastq/{sample}_{read}.fished.fastq", - sample=wildcards.sample, + sample=sample, read=["R1", "R2"], ) return "results/razers3/fastq/{sample}_single.fastq" else: - return get_map_reads_input(wildcards) + return get_map_reads_input(sample) def get_oncoprint_batch(wildcards): @@ -387,27 +395,45 @@ def get_tabix_params(wildcards): raise ValueError("Invalid format for tabix: {}".format(wildcards.format)) +def get_sample_from_group_and_alias(group, alias): + sample = samples.loc[ + 
(samples["group"] == group) & (samples["alias"] == alias), + "sample_name" + ] + return sample + + + def get_normal_bam(ext=".bam"): def inner(wildcards): - normal_sample = get_normal_from_group(wildcards.group) + normal_sample = get_sample_from_group_and_alias(wildcards.group, "normal") return f"results/recal/{normal_sample}.sorted{ext}" return inner -def get_tumor_bam(ext=".bam"): +def get_tumor_bam_from_group_and_alias(ext=".bam"): def inner(wildcards): - tumor_sample = get_tumor_from_group(wildcards.group) + tumor_sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias) return f"results/recal/{tumor_sample}.sorted{ext}" return inner +def get_bam_from_group_and_alias(ext=".bam"): + def inner(wildcards): + sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias) + return f"results/recal/{sample}.sorted{ext}" + + return inner + + ## RNA ## def get_quant_reads_input(wildcards): - if is_paired_end(wildcards.sample, "RNA"): + sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias) + if is_paired_end(sample, "RNA"): return [ "results/merged/RNA/{sample}_R1.fastq.gz", "results/merged/RNA/{sample}_R2.fastq.gz", @@ -449,31 +475,6 @@ def get_paired_bais(wildcards): ) -def get_normal_from_sample(sample_name): - normal_sample = samples.loc[ - (samples["group"] == samples.loc[sample_name, "group"]) - & (samples["alias"] == "normal"), - "sample_name", - ].iat[0] - return normal_sample - - -def get_normal_from_group(group): - normal_sample = samples.loc[ - (samples["group"] == group) & (samples["alias"] == "normal"), - "sample_name", - ].iat[0] - return normal_sample - - -def get_tumor_from_group(group): - tumor_sample = samples.loc[ - (samples["group"] == group) & (samples["alias"] == "tumor"), - "sample_name", - ].iat[0] - return tumor_sample - - def get_reads(wildcards): return get_seperate(wildcards.sample, wildcards.group) @@ -483,22 +484,18 @@ def get_seperate(sample, group): def 
get_alleles_MHCI(wildcards): - if wildcards.peptide_type == "normal": - return "results/optitype/{S}/hla_alleles_{S}.tsv".format( - S=get_normal_from_group(wildcards.group) - ) - else: - return "results/optitype/{S}/hla_alleles_{S}.tsv".format( - S=get_tumor_from_group(wildcards.group) - ) + alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias + return expand( + "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", + group=wildcards.group, + alias=alias, + ) def get_alleles_MHCII(wildcards): - if wildcards.peptide_type == "normal": - return "results/HLA-LA/hlaI_{S}.tsv".format( - S=get_normal_from_group(wildcards.group) - ) - else: - return "results/HLA-LA/hlaI_{S}.tsv".format( - S=get_tumor_from_group(wildcards.group) + alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias + return expand( + "results/HLA-LA/{group}.{alias}.hlaI.tsv", + group=wildcards.group, + alias=alias ) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index b8d0dae9..fb685cf8 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -1,16 +1,16 @@ rule microphaser_tumor: input: - vcf="results/strelka/merged/{group}.{tumor_event}.{normal_event}.norm.annotated.bcf", - bam=get_tumor_bam(), - bai=get_tumor_bam(ext=".bam.bai"), + vcf="results/strelka/merged/{group}.{tumor_alias}.{tumor_event}.{normal_event}.norm.annotated.bcf", + bam=get_tumor_bam_from_group_and_alias(), + bai=get_tumor_bam_from_group_and_alias(ext=".bai"), track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - mt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.neo.fa", - wt_fasta="results/microphaser/fasta/{group}/tumor.{tumor_event}.{normal_event}.{contig}.normal.fa", - tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{normal_event}.{contig}.tsv", + 
mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.neo.fa", + wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.normal.fa", + tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv", log: - "logs/microphaser_tumor/{group}/{tumor_event}.{normal_event}.{contig}.log", + "logs/microphaser_tumor/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -22,7 +22,7 @@ rule microphaser_tumor: rule microphaser_normal: input: - vcf="results/strelka/normal/{group}.{normal_event}.variants.reheader.norm.bcf", + vcf="results/strelka/{group}.normal.{normal_event}.variants.reheader.norm.bcf", bam=get_normal_bam(), bai=get_normal_bam(ext=".bam.bai"), track="resources/annotation/{contig}.gtf", @@ -75,22 +75,22 @@ rule build_normal_proteome_db: rule microphaser_filter: input: - tsv="results/microphaser/info/{group}/tumor.{tumor_event}.{contig}.tsv", + tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{contig}.tsv", proteome=expand( "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin", normal_event=config["params"]["microphaser"]["events"]["normal"], ), output: mt_fasta=( - "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.neo.fa" + "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/filtered/{group}/{mhc}.{tumor_event}.{contig}.normal.fa" + "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.normal.fa" ), - tsv="results/microphaser/info/filtered/{group}/{mhc}.{tumor_event}.{contig}.tsv", - removed="results/microphaser/info/removed/{group}/{mhc}.{tumor_event}.{contig}.removed.tsv", + tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.tsv", + 
removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.removed.tsv", log: - "logs/microphaser_filter/{group}/{mhc}.{tumor_event}.{contig}.log", + "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -104,13 +104,13 @@ rule microphaser_filter: rule concat_tsvs: input: expand( - "results/microphaser/info/filtered/{{group}}/{{mhc}}.{{tumor_event}}.{contig}.tsv", + "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{mhc}}.{contig}.tsv", contig=contigs, ), output: - "results/microphaser/info/filtered/{group}.{mhc}.{tumor_event}.tsv", + "results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv", log: - "logs/concat_tsvs/{group}.{mhc}.{tumor_event}.log", + "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{mhc}.log", conda: "../envs/xsv.yaml" shell: From 6a07ec612915b6769ac9467ed35ed122355fc72d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 10 May 2022 16:04:37 +0000 Subject: [PATCH 045/191] snakefmt --- workflow/rules/HLAtyping.smk | 4 +++- workflow/rules/common.smk | 23 +++++++++++++---------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index bacfa45a..d8513699 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -71,7 +71,9 @@ rule OptiType: reads=get_optitype_reads_input, output: multiext( - "results/optitype/{group}/{group}.{alias}", ".coverage_plot.pdf", ".result.tsv" + "results/optitype/{group}/{group}.{alias}", + ".coverage_plot.pdf", + ".result.tsv", ), log: "logs/optitype/{group}.{alias}.log", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0e159e7c..ffdde739 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -59,7 +59,9 @@ wildcard_constraints: sample="|".join(samples["sample_name"]), unit="|".join(units["unit_name"]), 
alias="|".join(pd.unique(samples["alias"])), - tumor_alias="|".join(pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"])), + tumor_alias="|".join( + pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"]) + ), group="|".join(pd.unique(samples["group"])), caller="|".join(["freebayes", "delly"]), peptide_type="|".join(["normal", "neo"]), @@ -86,7 +88,10 @@ def get_final_output(): sequencing_types = pd.unique( units.loc[units["sample_name"].isin(smps), "sequencing_type"] ) - tumor_aliases = samples.loc[(samples["group"] == group) & (samples["alias"].str.match("tumor")), "alias"] + tumor_aliases = samples.loc[ + (samples["group"] == group) & (samples["alias"].str.match("tumor")), + "alias", + ] final_output.extend( expand( "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.{seqtype}.tsv", @@ -397,13 +402,11 @@ def get_tabix_params(wildcards): def get_sample_from_group_and_alias(group, alias): sample = samples.loc[ - (samples["group"] == group) & (samples["alias"] == alias), - "sample_name" + (samples["group"] == group) & (samples["alias"] == alias), "sample_name" ] return sample - def get_normal_bam(ext=".bam"): def inner(wildcards): normal_sample = get_sample_from_group_and_alias(wildcards.group, "normal") @@ -414,7 +417,9 @@ def get_normal_bam(ext=".bam"): def get_tumor_bam_from_group_and_alias(ext=".bam"): def inner(wildcards): - tumor_sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias) + tumor_sample = get_sample_from_group_and_alias( + wildcards.group, wildcards.tumor_alias + ) return f"results/recal/{tumor_sample}.sorted{ext}" return inner @@ -495,7 +500,5 @@ def get_alleles_MHCI(wildcards): def get_alleles_MHCII(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( - "results/HLA-LA/{group}.{alias}.hlaI.tsv", - group=wildcards.group, - alias=alias - ) + "results/HLA-LA/{group}.{alias}.hlaI.tsv", group=wildcards.group, alias=alias + ) From 
f4279b25780826b74e1e32c4e03f6829b0379256 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 08:26:25 +0000 Subject: [PATCH 046/191] fix sample selection from group and alias --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index ffdde739..ef85e5d4 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -403,7 +403,7 @@ def get_tabix_params(wildcards): def get_sample_from_group_and_alias(group, alias): sample = samples.loc[ (samples["group"] == group) & (samples["alias"] == alias), "sample_name" - ] + ].squeeze() return sample From 6843133a6873f03a93e5a11211649cdbef1fd1f0 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 08:26:57 +0000 Subject: [PATCH 047/191] remove unused varlociraptor valling from workflow --- config/config.yaml | 19 ----- workflow/Snakefile | 3 - workflow/rules/annotation.smk | 23 ------ workflow/rules/candidate_calling.smk | 30 -------- workflow/rules/common.smk | 97 ------------------------ workflow/rules/filtering.smk | 106 --------------------------- workflow/rules/varlociraptor.smk | 89 ---------------------- 7 files changed, 367 deletions(-) delete mode 100644 workflow/rules/candidate_calling.smk delete mode 100644 workflow/rules/filtering.smk delete mode 100644 workflow/rules/varlociraptor.smk diff --git a/config/config.yaml b/config/config.yaml index e5b9670a..a4fe01ee 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -8,25 +8,6 @@ trimming: remove_duplicates: activate: true -calling: - freebayes: - activate: false - # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling - scenario: config/scenario.yaml - fdr-control: - threshold: 0.05 - events: - complete: - varlociraptor: - - "somatic" - - "germline" - somatic: - varlociraptor: - - "somatic" - germline: - varlociraptor: - - "germline" - fusion: arriba: activate: false diff --git a/workflow/Snakefile 
b/workflow/Snakefile index 2b881b44..7a5270dc 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -31,10 +31,7 @@ include: "rules/trim.smk" include: "rules/ref.smk" include: "rules/mapping.smk" include: "rules/calling.smk" -include: "rules/candidate_calling.smk" -include: "rules/varlociraptor.smk" include: "rules/annotation.smk" -include: "rules/filtering.smk" include: "rules/microphaser.smk" include: "rules/HLAtyping.smk" include: "rules/MHC_binding.smk" diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 8a504e98..271ab427 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -1,26 +1,3 @@ -rule annotate_variants: - input: - calls="results/calls/{cancer_sample}.{scatteritem}.bcf", - cache="resources/vep/cache", - plugins="resources/vep/plugins", - output: - calls="results/calls/{cancer_sample}.{scatteritem}.annotated.bcf", - stats=report( - "results/calls/{cancer_sample}.{scatteritem}.stats.html", - caption="../report/stats.rst", - category="QC", - ), - params: - # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs. 
- plugins=config["annotations"]["vep"]["plugins"], - extra="{} --vcf_info_field ANN".format(config["annotations"]["vep"]["params"]), - log: - "logs/vep/{cancer_sample}.{scatteritem}.annotate.log", - wrapper: - "0.59.2/bio/vep/annotate" - - rule annotate_strelka_variants: input: calls="results/strelka/{calls}.bcf", diff --git a/workflow/rules/candidate_calling.smk b/workflow/rules/candidate_calling.smk deleted file mode 100644 index d6b6942d..00000000 --- a/workflow/rules/candidate_calling.smk +++ /dev/null @@ -1,30 +0,0 @@ -rule freebayes: - input: - ref="resources/genome.fasta", - # you can have a list of samples here - samples=get_paired_bams, - output: - "results/candidate-calls/{cancer_sample}.freebayes.bcf", - log: - "logs/{cancer_sample}.log", - params: - extra=config["params"].get("freebayes", ""), - chunksize=100000, - threads: 60 - wrapper: - "0.65.0/bio/freebayes" - - -rule scatter_candidates: - input: - "results/candidate-calls/{cancer_sample}.{caller}.bcf", - output: - scatter.calling( - "results/candidate-calls/{{cancer_sample}}.{{caller}}.{scatteritem}.bcf" - ), - log: - "logs/scatter-candidates/{cancer_sample}.{caller}.log", - conda: - "../envs/rbt.yaml" - shell: - "rbt vcf-split {input} {output}" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index ef85e5d4..9c942bf2 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -301,9 +301,6 @@ def get_recalibrate_quality_input(bai=False): return inner -## HLA Typing ## - - def get_optitype_reads_input(wildcards): sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias) if is_activated("HLAtyping/optitype_prefiltering"): @@ -331,67 +328,6 @@ def get_oncoprint_batch(wildcards): ) -## variant calls ## - - -def get_annotated_bcf(wildcards): - selection = ".annotated" - return "results/calls/{cancer_sample}.{scatteritem}{selection}.bcf".format( - cancer_sample=wildcards.cancer_sample, - selection=selection, - scatteritem=wildcards.scatteritem, - ) - - 
-def get_scattered_calls(ext=".bcf"): - def inner(wildcards): - return expand( - "results/calls/{{cancer_sample}}.{caller}.{{scatteritem}}.sorted{ext}", - caller=caller, - ext=ext, - ) - - return inner - - -def get_fdr_control_params(wildcards): - query = config["calling"]["fdr-control"]["events"][wildcards.event] - threshold = query.get( - "threshold", config["calling"]["fdr-control"].get("threshold", 0.05) - ) - events = query["varlociraptor"] - return {"threshold": threshold, "events": events} - - -def get_pair_observations(wildcards): - return expand( - "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf", - caller=wildcards.caller, - cancer_sample=wildcards.cancer_sample, - scatteritem=wildcards.scatteritem, - sample=get_paired_samples(wildcards), - ) - - -def get_merge_input(ext=".bcf"): - def inner(wildcards): - return expand( - "results/calls/{{cancer_sample}}.{vartype}.{{event}}.fdr-controlled{ext}", - ext=ext, - vartype=["SNV", "INS", "DEL", "MNV"], - filter=config["calling"]["fdr-control"]["events"][wildcards.event], - ) - - return inner - - -def get_pair_aliases(wildcards): - return [ - samples.loc[get_normal_from_sample(wildcards.cancer_sample), "alias"], - samples.loc[wildcards.cancer_sample, "alias"], - ] - - def get_tabix_params(wildcards): if wildcards.format == "vcf": return "-p vcf" @@ -433,9 +369,6 @@ def get_bam_from_group_and_alias(ext=".bam"): return inner -## RNA ## - - def get_quant_reads_input(wildcards): sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias) if is_paired_end(sample, "RNA"): @@ -458,36 +391,6 @@ def kallisto_params(wildcards, input): return extra -## helper functions ## - - -def get_paired_samples(wildcards): - return [ - get_normal_from_sample(wildcards.cancer_sample), - samples.loc[wildcards.cancer_sample, "sample_name"], - ] - - -def get_paired_bams(wildcards): - return expand( - "results/recal/{sample}.sorted.bam", sample=get_paired_samples(wildcards) - ) - - -def 
get_paired_bais(wildcards): - return expand( - "results/recal/{sample}.sorted.bam.bai", sample=get_paired_samples(wildcards) - ) - - -def get_reads(wildcards): - return get_seperate(wildcards.sample, wildcards.group) - - -def get_seperate(sample, group): - return units.loc[(sample, "DNA"), "fq{}".format(str(group))] - - def get_alleles_MHCI(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk deleted file mode 100644 index 169265f0..00000000 --- a/workflow/rules/filtering.smk +++ /dev/null @@ -1,106 +0,0 @@ -rule filter_by_annotation: - input: - "{prefix}.bcf", - output: - "{prefix}.{filter}.filtered_ann.bcf", - log: - "logs/filter-calls/annotation/{prefix}.{filter}.log", - params: - filter=lambda w: config["calling"]["filter"][w.filter], - conda: - "../envs/vembrane.yaml" - shell: - 'vembrane filter --output-fmt bcf --output {output} "{params.filter}" {input} &> {log}' - - -rule filter_odds: - input: - get_annotated_bcf, - output: - "results/calls/{cancer_sample}.{event}.{scatteritem}.filtered_odds.bcf", - params: - events=lambda wc: config["calling"]["fdr-control"]["events"][wc.event][ - "varlociraptor" - ], - log: - "logs/filter-calls/posterior_odds/{cancer_sample}.{scatteritem}.{event}.log", - conda: - "../envs/varlociraptor.yaml" - shell: - "varlociraptor filter-calls posterior-odds --events {params.events} --odds barely < {input} > {output} 2> {log}" - - -rule gather_calls: - input: - calls=gather.calling( - "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf" - ), - idx=gather.calling( - "results/calls/{{cancer_sample}}.{{event}}.{scatteritem}.filtered_odds.bcf.csi" - ), - output: - "results/calls/{cancer_sample}.{event}.filtered_odds.bcf", - log: - "logs/gather-calls/{cancer_sample}.{event}.log", - params: - "-a -Ob", - wrapper: - "0.67.0/bio/bcftools/concat" - - -rule control_fdr: - input: - 
"results/calls/{cancer_sample}.{event}.filtered_odds.bcf", - output: - "results/calls/{cancer_sample}.{vartype}.{event}.fdr-controlled.bcf", - log: - "logs/control-fdr/{cancer_sample}.{vartype}.{event}.log", - params: - query=get_fdr_control_params, - conda: - "../envs/varlociraptor.yaml" - shell: - "varlociraptor filter-calls control-fdr {input} --var {wildcards.vartype} " - "--events {params.query[events]} --fdr {params.query[threshold]} > {output} 2> {log}" - - -rule merge_calls: - input: - calls=get_merge_input(".bcf"), - idx=get_merge_input(".bcf.csi"), - output: - "results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf", - log: - "logs/merge-calls/{cancer_sample}.{event}.log", - params: - "-a -Ob", - wrapper: - "0.59.2/bio/bcftools/concat" - - -rule change_samplenames: - input: - call="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf", - output: - temp("results/merged-calls/{cancer_sample}.{event}.renaming.txt"), - log: - "logs/change-samplenames/{cancer_sample}.{event}.log", - params: - prefix=lambda w, input: os.path.basename(input["call"]).split(".")[0], - shell: - "echo -e 'normal {params.prefix}_N\ntumor {params.prefix}_T' > {output}" - - -rule reheader_varlociraptor: - input: - vcf="results/merged-calls/{cancer_sample}.{event}.fdr-controlled.bcf", - samples="results/merged-calls/{cancer_sample}.{event}.renaming.txt", - output: - "results/merged-calls/{cancer_sample}.{event}.reheader.bcf", - log: - "logs/reheader-calls/{cancer_sample}.{event}.log", - params: - extra="", - view_extra="-O b", - wrapper: - "0.60.0/bio/bcftools/reheader" diff --git a/workflow/rules/varlociraptor.smk b/workflow/rules/varlociraptor.smk deleted file mode 100644 index f94052fd..00000000 --- a/workflow/rules/varlociraptor.smk +++ /dev/null @@ -1,89 +0,0 @@ -rule render_scenario: - input: - config["calling"]["scenario"], - output: - report( - "results/scenarios/{cancer_sample}.yaml", - caption="../report/scenario.rst", - category="Variant calling 
scenarios", - ), - params: - samples=samples, - log: - "logs/scenarious/{cancer_sample}.log", - conda: - "../envs/render_scenario.yaml" - script: - "../scripts/render-scenario.py" - - -rule varlociraptor_preprocess: - input: - ref="resources/genome.fasta", - ref_idx="resources/genome.fasta.fai", - candidates="results/candidate-calls/{cancer_sample}.{caller}.{scatteritem}.bcf", - bam="results/recal/{sample}.sorted.bam", - bai="results/recal/{sample}.sorted.bam.bai", - output: - "results/observations/{cancer_sample}/{sample}.{caller}.{scatteritem}.bcf", - params: - omit_isize="", - log: - "logs/varlociraptor/preprocess/{cancer_sample}/{sample}.{caller}.{scatteritem}.log", - conda: - "../envs/varlociraptor.yaml" - shell: - "varlociraptor preprocess variants {params.omit_isize} --candidates {input.candidates} " - "{input.ref} --bam {input.bam} --output {output} 2> {log}" - - -rule varlociraptor_call: - input: - obs=get_pair_observations, - scenario="results/scenarios/{cancer_sample}.yaml", - output: - temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf"), - log: - "logs/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.log", - params: - obs=lambda w, input: [ - "{}={}".format(s, f) for s, f in zip(get_pair_aliases(w), input.obs) - ], - conda: - "../envs/varlociraptor.yaml" - benchmark: - "benchmarks/varlociraptor/call/{cancer_sample}.{caller}.{scatteritem}.tsv" - shell: - "varlociraptor " - "call variants generic --obs {params.obs} " - "--scenario {input.scenario} > {output} 2> {log}" - - -rule sort_calls: - input: - "results/calls/{cancer_sample}.{caller}.{scatteritem}.bcf", - output: - temp("results/calls/{cancer_sample}.{caller}.{scatteritem}.sorted.bcf"), - log: - "logs/bcf-sort/{cancer_sample}.{caller}.{scatteritem}.log", - conda: - "../envs/bcftools.yaml" - resources: - mem_mb=8000, - shell: - "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` " - "-Ob {input} > {output} 2> {log}" - - -rule bcftools_concat: - input: - 
calls=get_scattered_calls(), - indexes=get_scattered_calls(ext=".bcf.csi"), - output: - "results/calls/{cancer_sample}.{scatteritem}.bcf", - log: - "logs/concat-calls/{cancer_sample}.{scatteritem}.log", - params: - "-a -Ob", # TODO Check this - wrapper: - "0.59.2/bio/bcftools/concat" From 98cc4b1bffa66d7c2b2569a37f30bbc4bf153882 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 08:31:06 +0000 Subject: [PATCH 048/191] remove varlociraptor requirements from config.schema.yaml --- workflow/schemas/config.schema.yaml | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 6fafb49f..657407a9 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -63,34 +63,6 @@ properties: - tumor_sample - somatic_events - - calling: - type: object - properties: - freebayes: - type: object - properties: - activate: - type: boolean - scenario: - type: string - fdr-control: - type: object - properties: - threshold: - type: number - minimum: 0.0 - maximum: 1.0 - events: - $ref: "#/definitions/evententry" - description: "a map of pairs" - required: - - threshold - - events - required: - - freebayes - - scenario - - fdr-control remove_duplicates: type: object From 27dc8c82ac30a481c18b41f35ba38138029a42e0 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 08:39:38 +0000 Subject: [PATCH 049/191] remove calling requirement from config.schema.yaml --- workflow/schemas/config.schema.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 657407a9..9f882ac1 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -204,7 +204,6 @@ required: - units - ref - tmb - - calling - params - annotations - epitope_prediction From f902faeb73acaaa7f69336872463b0e8f7920a8c Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 
May 2022 09:07:21 +0000 Subject: [PATCH 050/191] take {normal_event} wildcard through to final output requesting in common.smk --- workflow/rules/MHC_binding.smk | 34 +++++++++++++++++----------------- workflow/rules/common.smk | 3 ++- workflow/rules/microphaser.smk | 23 ++++++++++------------- 3 files changed, 29 insertions(+), 31 deletions(-) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index d33b0f82..e8f2d0aa 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -18,12 +18,12 @@ rule netMHCpan: input: - peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCpan.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: - "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls", + "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls", log: - "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log", + "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], @@ -35,12 +35,12 @@ rule netMHCpan: rule netMHCIIpan: input: - peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.netMHCIIpan.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCIIpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.xls", + "results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls", log: - "logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{contig}.{peptide_type}.log", + 
"logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], @@ -53,13 +53,13 @@ rule netMHCIIpan: rule parse_mhc_out: input: expand( - "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{contig}.{{peptide_type}}.xls", + "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{contig}.{{peptide_type}}.xls", contig=contigs, ), output: - "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.{peptide_type}.tsv", + "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.{peptide_type}.tsv", log: - "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{peptide_type}.log", + "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{peptide_type}.log", script: "../scripts/group_mhc_output.py" @@ -81,17 +81,17 @@ rule parse_mhc_out: rule mhc_csv_table: input: - info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv", - neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.neo.tsv", - normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.mhc.normal.tsv", + info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv", + neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.neo.tsv", + normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.normal.tsv", output: report( - "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv", + "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv", caption="../report/WES_results.rst", category="Results WES (netMHC)", ), log: - "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{mhc}.log", + "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log", script: "../scripts/merge_data.py" @@ -110,16 +110,16 @@ rule mhc_csv_table: 
rule add_RNA_info: input: counts="results/kallisto/{group}.{tumor_alias}", - table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.DNA.tsv", + table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv", output: report( - "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.RNA.tsv", + "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.RNA.tsv", caption="../report/RNA_results.rst", category="Results RNA", ), params: abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), log: - "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{mhc}.log", + "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log", script: "../scripts/add_rna_info.py" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9c942bf2..500bc770 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -94,10 +94,11 @@ def get_final_output(): ] final_output.extend( expand( - "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{mhc}.{seqtype}.tsv", + "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{seqtype}.tsv", group=group, tumor_alias=tumor_aliases, tumor_event=config["params"]["microphaser"]["events"]["tumor"], + normal_event=config["params"]["microphaser"]["events"]["normal"], mhc=list( filter( None, diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index fb685cf8..7bb2588c 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -75,22 +75,19 @@ rule build_normal_proteome_db: rule microphaser_filter: input: - tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{contig}.tsv", - proteome=expand( - "results/microphaser/bin/{{group}}.{normal_event}.{{mhc}}.normal_proteome.bin", - normal_event=config["params"]["microphaser"]["events"]["normal"], - ), + tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv", + 
proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", output: mt_fasta=( - "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.neo.fa" + "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.normal.fa" + "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.normal.fa" ), - tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.tsv", - removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.removed.tsv", + tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.tsv", + removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.removed.tsv", log: - "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{mhc}.{contig}.log", + "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -104,13 +101,13 @@ rule microphaser_filter: rule concat_tsvs: input: expand( - "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{mhc}}.{contig}.tsv", + "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{{mhc}}.{contig}.tsv", contig=contigs, ), output: - "results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{mhc}.tsv", + "results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv", log: - "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{mhc}.log", + "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log", conda: "../envs/xsv.yaml" shell: From a5e854b2c91bd8885e32c8c70664323f0004e48d Mon Sep 17 00:00:00 2001 From: 
dlaehnemann Date: Wed, 11 May 2022 09:17:14 +0000 Subject: [PATCH 051/191] attempt to fix key error --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 500bc770..bb7fe8ff 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -236,7 +236,7 @@ def get_cutadapt_adapters(wildcards): def is_paired_end(sample, seqtype): - sample_units = units.loc[sample].loc[seqtype] + sample_units = units.loc[(units["sample_name"] == sample) & (units["sequencing_type"] == seqtype)] fq2_null = sample_units["fq2"].isnull() sra_null = sample_units["sra"].isnull() paired = ~fq2_null | ~sra_null From 2263cc20a2c0099757382cee6c6d72f1b4983fdf Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 09:23:58 +0000 Subject: [PATCH 052/191] snakefmt --- workflow/rules/common.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index bb7fe8ff..75d18c16 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -236,7 +236,9 @@ def get_cutadapt_adapters(wildcards): def is_paired_end(sample, seqtype): - sample_units = units.loc[(units["sample_name"] == sample) & (units["sequencing_type"] == seqtype)] + sample_units = units.loc[ + (units["sample_name"] == sample) & (units["sequencing_type"] == seqtype) + ] fq2_null = sample_units["fq2"].isnull() sra_null = sample_units["sra"].isnull() paired = ~fq2_null | ~sra_null From f30fab9f6457ad7e8a0d89918e890078c2aecf6f Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 09:42:54 +0000 Subject: [PATCH 053/191] fix final output requests for hla typing and string formatting in helper functions --- workflow/rules/common.smk | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 75d18c16..94b2a9b1 100644 --- a/workflow/rules/common.smk 
+++ b/workflow/rules/common.smk @@ -119,15 +119,15 @@ def get_final_output(): if config["HLAtyping"]["HLA_LA"]["activate"]: final_output = expand( [ - "results/optitype/{sample}/hla_alleles_{sample}.tsv", - "results/HLA-LA/hlaI_{sample}.tsv", - "results/HLA-LA/hlaII_{sample}.tsv", + "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", + "results/HLA-LA/{group}.{alias}.hlaI.tsv", + "results/HLA-LA/{group}.{alias}.hlaII.tsv", ], sample=samples["sample_name"], ) else: final_output = expand( - "results/optitype/{sample}/hla_alleles_{sample}.tsv", + "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", sample=samples["sample_name"], ) return final_output @@ -279,10 +279,10 @@ def get_fastqs(wc): def get_map_reads_input(sample): if is_paired_end(sample, "DNA"): return [ - "results/merged/DNA/{sample}_R1.fastq.gz", - "results/merged/DNA/{sample}_R2.fastq.gz", + f"results/merged/DNA/{sample}_R1.fastq.gz", + f"results/merged/DNA/{sample}_R2.fastq.gz", ] - return "results/merged/DNA/{sample}_single.fastq.gz" + return f"results/merged/DNA/{sample}_single.fastq.gz" def get_read_group(wildcards): @@ -313,7 +313,7 @@ def get_optitype_reads_input(wildcards): sample=sample, read=["R1", "R2"], ) - return "results/razers3/fastq/{sample}_single.fastq" + return f"results/razers3/fastq/{sample}_single.fastq" else: return get_map_reads_input(sample) @@ -376,10 +376,10 @@ def get_quant_reads_input(wildcards): sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias) if is_paired_end(sample, "RNA"): return [ - "results/merged/RNA/{sample}_R1.fastq.gz", - "results/merged/RNA/{sample}_R2.fastq.gz", + f"results/merged/RNA/{sample}_R1.fastq.gz", + f"results/merged/RNA/{sample}_R2.fastq.gz", ] - return "results/merged/RNA/{sample}_single.fastq.gz" + return f"results/merged/RNA/{sample}_single.fastq.gz" def kallisto_params(wildcards, input): From c56b4635e6574cf0c89af57aae504b58eb34ba13 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 
09:43:38 +0000 Subject: [PATCH 054/191] use `group.alias` instead of `sample` for HLA-LA `--sampleID` --- workflow/rules/HLAtyping.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index d8513699..f6151cd4 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -14,7 +14,7 @@ rule HLA_LA: conda: "../envs/hla_la.yaml" shell: - "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.sample} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1" + "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}.{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1" rule parse_HLA_LA: From b1a3c629c1514a4ced1dd3ea356f8298ad8cd48e Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 11 May 2022 13:59:26 +0000 Subject: [PATCH 055/191] keep on trying --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 94b2a9b1..3a46ca53 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -236,7 +236,7 @@ def get_cutadapt_adapters(wildcards): def is_paired_end(sample, seqtype): - sample_units = units.loc[ + sample_units = units[ (units["sample_name"] == sample) & (units["sequencing_type"] == seqtype) ] fq2_null = sample_units["fq2"].isnull() From f1c492f7972675d34c54056cf479cc397361fa94 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 12 May 2022 07:45:31 +0000 Subject: [PATCH 056/191] fix HLA_LA sampleID --- workflow/rules/HLAtyping.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index f6151cd4..70afe0f5 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -4,22 +4,22 @@ rule HLA_LA: 
bai=get_bam_from_group_and_alias(ext=".bai"), index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", output: - "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt", + "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", threads: 7 log: - "logs/HLA-LA/{group}.{alias}.log", + "logs/HLA-LA/{group}_{alias}.log", params: graph=lambda w, input: os.path.basename(os.path.dirname(input.index)), graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)), conda: "../envs/hla_la.yaml" shell: - "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}.{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1" + "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1" rule parse_HLA_LA: input: - "results/HLA-LA/output/{group}/{alias}/hla/R1_bestguess_G.txt", + "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", output: report( "results/HLA-LA/{group}.{alias}.hlaI.tsv", From 8a8286030215b5a37d594f9e2b55444eddb52e7b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 12 May 2022 07:50:18 +0000 Subject: [PATCH 057/191] revert get_map_reads_input to work on wildcards, as it is (also) an input function --- workflow/rules/common.smk | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3a46ca53..dcee2414 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -236,9 +236,7 @@ def get_cutadapt_adapters(wildcards): def is_paired_end(sample, seqtype): - sample_units = units[ - (units["sample_name"] == sample) & (units["sequencing_type"] == seqtype) - ] + sample_units = units.loc[sample].loc[seqtype] fq2_null = sample_units["fq2"].isnull() sra_null = sample_units["sra"].isnull() paired = 
~fq2_null | ~sra_null @@ -276,13 +274,13 @@ def get_fastqs(wc): return units.loc[wc.sample].loc[wc.seqtype, fq].tolist() -def get_map_reads_input(sample): - if is_paired_end(sample, "DNA"): +def get_map_reads_input(wildcards): + if is_paired_end(wildcards.sample, "DNA"): return [ - f"results/merged/DNA/{sample}_R1.fastq.gz", - f"results/merged/DNA/{sample}_R2.fastq.gz", + f"results/merged/DNA/{wildcards.sample}_R1.fastq.gz", + f"results/merged/DNA/{wildcards.sample}_R2.fastq.gz", ] - return f"results/merged/DNA/{sample}_single.fastq.gz" + return f"results/merged/DNA/{wildcards.sample}_single.fastq.gz" def get_read_group(wildcards): @@ -315,7 +313,8 @@ def get_optitype_reads_input(wildcards): ) return f"results/razers3/fastq/{sample}_single.fastq" else: - return get_map_reads_input(sample) + wildcards["sample"] = sample + return get_map_reads_input(wildcards) def get_oncoprint_batch(wildcards): From a456dc52c4ed8a17467f5c35f6b3f40699f9ce4d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 17 May 2022 14:27:21 +0000 Subject: [PATCH 058/191] first attempt to reduce workflow to essentials (excluding everything up to variant calling), currently not working --- config/config.yaml | 53 +---- workflow/Snakefile | 9 - workflow/rules/MHC_binding.smk | 34 +-- workflow/rules/RNA.smk | 71 ------ workflow/rules/annotation.smk | 21 -- workflow/rules/calling.smk | 140 ------------ workflow/rules/common.smk | 237 ++------------------- workflow/rules/mapping.smk | 70 ------ workflow/rules/microphaser.smk | 107 +++++++--- workflow/rules/oncoprint.smk | 28 --- workflow/rules/phylogeny.smk | 12 +- workflow/rules/ref.smk | 117 ---------- workflow/rules/tmb.smk | 20 -- workflow/rules/trim.smk | 66 ------ workflow/rules/utils.smk | 18 +- workflow/rules/vega.smk | 15 -- workflow/schemas/config.schema.yaml | 103 ++------- workflow/scripts/build_oncoprint_matrix.py | 42 ---- workflow/scripts/oncoprint.R | 56 ----- workflow/scripts/render-scenario.py | 9 - 20 files changed, 137 
insertions(+), 1091 deletions(-) delete mode 100644 workflow/rules/RNA.smk delete mode 100644 workflow/rules/annotation.smk delete mode 100644 workflow/rules/calling.smk delete mode 100644 workflow/rules/mapping.smk delete mode 100644 workflow/rules/oncoprint.smk delete mode 100644 workflow/rules/tmb.smk delete mode 100644 workflow/rules/trim.smk delete mode 100644 workflow/rules/vega.smk delete mode 100644 workflow/scripts/build_oncoprint_matrix.py delete mode 100644 workflow/scripts/oncoprint.R delete mode 100644 workflow/scripts/render-scenario.py diff --git a/config/config.yaml b/config/config.yaml index a4fe01ee..3bf1b781 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,29 +1,6 @@ samples: "config/samples.tsv" units: "config/units.tsv" -# boolean if read trimming should be skipped -trimming: - activate: false - -remove_duplicates: - activate: true - -fusion: - arriba: - activate: false - blacklist: - "arriba_blacklist" - params: - "-T -P" - -tmb: - activate: false - coding_genome_size: 3e7 - # Name of the tumor sample in the scenario.yaml. 
- tumor_sample: tumor - somatic_events: - - somatic - epitope_prediction: activate: true @@ -43,11 +20,11 @@ affinity: HLAtyping: # activate to use razers3 to pre-filter reads before using optitype optitype_prefiltering: - activate: false + activate: True optitype_data: "config/HLA_Data/hla_reference_dna.fasta" # activate to predict MHC-I and MHC-II alleles with HLA-LA HLA_LA: - activate: false + activate: true ref: @@ -71,20 +48,6 @@ annotations: - LoFtool params: - cutadapt: "" - bwa: - "-M" - picard: - MarkDuplicates: - "VALIDATION_STRINGENCY=LENIENT" - gatk: - BaseRecalibrator: "--tmp-dir tmp" - applyBQSR: "" - strelka: - config: - "--exome" - run: - "--mode local" razers3: "-i 95 -m 1 -dr 0" optitype: @@ -97,12 +60,6 @@ params: 9 netMHCIIpan: 15 - events: - tumor: "strelka_somatic" - normal: "strelka_germline" - kallisto: - "-b 100" - star: >- - --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip - --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0 - --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3 + variant_sets: + normal: "normal_only" + tumor: "tumor_only" diff --git a/workflow/Snakefile b/workflow/Snakefile index 7a5270dc..f75eba84 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -27,21 +27,12 @@ container: "docker://continuumio/miniconda3" include: "rules/common.smk" include: "rules/utils.smk" -include: "rules/trim.smk" include: "rules/ref.smk" -include: "rules/mapping.smk" -include: "rules/calling.smk" -include: "rules/annotation.smk" include: "rules/microphaser.smk" include: "rules/HLAtyping.smk" include: "rules/MHC_binding.smk" -include: "rules/RNA.smk" -include: "rules/tmb.smk" -include: "rules/vega.smk" rule all: input: get_final_output(), - get_fusion_output(), - get_tmb_targets(), diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index e8f2d0aa..2e80b180 100644 --- 
a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -18,12 +18,12 @@ rule netMHCpan: input: - peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCpan.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: - "results/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls", + "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls", log: - "logs/netMHCpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log", + "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], @@ -35,12 +35,12 @@ rule netMHCpan: rule netMHCIIpan: input: - peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.netMHCIIpan.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCIIpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.xls", + "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls", log: - "logs/netMHCIIpan/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.{peptide_type}.log", + "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", params: extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], @@ -53,13 +53,13 @@ rule netMHCIIpan: rule parse_mhc_out: input: expand( - "results/{{mhc}}/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{contig}.{{peptide_type}}.xls", + 
"results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.xls", contig=contigs, ), output: - "results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.{peptide_type}.tsv", + "results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv", log: - "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{peptide_type}.log", + "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log", script: "../scripts/group_mhc_output.py" @@ -81,17 +81,17 @@ rule parse_mhc_out: rule mhc_csv_table: input: - info="results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv", - neo="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.neo.tsv", - normal="results/{mhc}/{group}.{tumor_alias}.{tumor_event}.{normal_event}.mhc.normal.tsv", + info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv", + neo="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.neo.tsv", + normal="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.normal.tsv", output: report( - "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv", + "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", caption="../report/WES_results.rst", category="Results WES (netMHC)", ), log: - "logs/mhc_csv_table/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log", + "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", script: "../scripts/merge_data.py" @@ -110,16 +110,16 @@ rule mhc_csv_table: rule add_RNA_info: input: counts="results/kallisto/{group}.{tumor_alias}", - table="results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.DNA.tsv", + table="results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", output: report( - 
"results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.RNA.tsv", + "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv", caption="../report/RNA_results.rst", category="Results RNA", ), params: abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), log: - "logs/add-RNA/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log", + "logs/add-RNA/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", script: "../scripts/add_rna_info.py" diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk deleted file mode 100644 index 2a927c18..00000000 --- a/workflow/rules/RNA.smk +++ /dev/null @@ -1,71 +0,0 @@ -rule kallisto_quant: - input: - fastq=get_quant_reads_input, - index="resources/kallisto/transcripts.idx", - output: - directory("results/kallisto/{group}.{tumor_alias}"), - params: - extra=kallisto_params, - log: - "results/logs/kallisto/quant/{group}.{tumor_alias}.log", - wrapper: - "0.60.1/bio/kallisto/quant" - - -rule STAR_align: - input: - "resources/STAR_index", - fq1=lambda wc: units.loc[(wc.sample, "RNA"), "fq1"], - fq2=lambda wc: units.loc[(wc.sample, "RNA"), "fq2"], - gtf="resources/genome.gtf", - output: - # see STAR manual for additional output files - "results/star/{sample}/Aligned.out.bam", - "results/star/{sample}/ReadsPerGene.out.tab", - log: - "logs/star/{sample}.log", - params: - # path to STAR reference genome index - index="STAR_index", - # optional parameters - designed to get chimeric alignments for fusion detection - extra=lambda wc, input: "--quantMode GeneCounts --sjdbGTFfile {} {}".format( - input.gtf, config["params"]["star"] - ), - threads: 8 - wrapper: - "0.42.0/bio/star/align" - - -rule arriba: - input: - bam="results/star/{sample}/Aligned.out.bam", - genome="resources/genome.fasta", - annotation="resources/genome.gtf", - output: - fusions="results/arriba/{sample}.fusions.tsv", - discarded="results/arriba/{sample}.fusions.discarded.tsv", - params: - 
blacklist=config["fusion"]["arriba"]["blacklist"], - extra=config["fusion"]["arriba"]["params"], - log: - "results/logs/arriba/{sample}.log", - threads: 1 - wrapper: - "0.60.1/bio/arriba" - - -## TODO: Update -# rule fusioncatcher: -# input: -# fq1=lambda w: units.loc[(w.sample, "RNA"), "fq1"], -# fq2=lambda w: units.loc[(w.sample, "RNA"), "fq2"] -# output: -# directory("fusioncatcher/{sample}") -# params: -# extra="-T tmp -d ../../fusioncatcher_data" -# log: -# "logs/fusioncatcher/{sample}.log" -# threads: -# 8 -# shell: -# "fusioncatcher -i {input.fq1},{input.fq2} -o {output} {params.extra} -p {threads} > {log}" diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk deleted file mode 100644 index 271ab427..00000000 --- a/workflow/rules/annotation.smk +++ /dev/null @@ -1,21 +0,0 @@ -rule annotate_strelka_variants: - input: - calls="results/strelka/{calls}.bcf", - cache="resources/vep/cache", - plugins="resources/vep/plugins", - output: - calls="results/strelka/{calls}.annotated.bcf", - stats=report( - "results/strelka/{calls}.annotated.stats.html", - caption="../report/stats.rst", - category="QC", - ), - params: - # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs. 
- plugins=[], #config["annotations"]["vep"]["plugins"], - extra="--vcf_info_field ANN --hgvs --symbol --canonical", - log: - "logs/vep/{calls}.strelka.annotate.log", - wrapper: - "0.59.2/bio/vep/annotate" diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk deleted file mode 100644 index 9fca7233..00000000 --- a/workflow/rules/calling.smk +++ /dev/null @@ -1,140 +0,0 @@ -rule strelka_tumor: - input: - normal=get_normal_bam(), - normal_index=get_normal_bam(ext=".bai"), - tumor=get_tumor_bam_from_group_and_alias(), - tumor_index=get_tumor_bam_from_group_and_alias(ext=".bai"), - fasta="resources/genome.fasta", - fasta_index="resources/genome.fasta.fai", - callregions="resources/genome.callregions.bed.gz", - output: - "results/strelka/{group}.{tumor_alias}.strelka_somatic.snvs.vcf.gz", - "results/strelka/{group}.{tumor_alias}.strelka_somatic.indels.vcf.gz", - log: - "logs/calling/strelka/{group}.{tumor_alias}.strelka_somatic.log", - params: - config_extra="--callRegions {} {}".format( - "resources/genome.callregions.bed.gz", - config["params"]["strelka"]["config"], - ), - run_extra=config["params"]["strelka"]["run"], - threads: 22 - wrapper: - "0.65.0/bio/strelka/somatic" - - -rule strelka_germline: - input: - bam=get_normal_bam(), - normal_index=get_normal_bam(ext=".bai"), - fasta="resources/genome.fasta", - fasta_index="resources/genome.fasta.fai", - callregions="resources/genome.callregions.bed.gz", - output: - "results/strelka/{group}.normal.strelka_germline.variants.vcf.gz", - log: - "logs/calling/strelka_germline/{group}.log", - params: - config_extra="--callRegions {} {}".format( - "resources/genome.callregions.bed.gz", - config["params"]["strelka"]["config"], - ), - run_extra="", - threads: 22 - wrapper: - "0.65.0/bio/strelka/germline" - - -rule vcf_to_bcf: - input: - "{variants}.vcf.gz", - output: - "{variants}.output.bcf", - log: - "logs/bcftools/to-bcf/{variants}.log", - params: - "-O b -f PASS", - wrapper: - "0.60.0/bio/bcftools/view" - - 
-rule concat_somatic: - input: - calls=expand( - "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf", - type=["snvs", "indels"], - ), - indices=expand( - "results/strelka/{{group}}.{{tumor_alias}}.strelka_somatic.{type}.output.bcf.csi", - type=["snvs", "indels"], - ), - output: - "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf", - log: - "bcftools/concat_somatic/{group}.{tumor_alias}.log", - params: - "-O b -a", - wrapper: - "0.60.0/bio/bcftools/concat" - - -rule get_tumor_from_somatic: - input: - "results/strelka/{group}.{tumor_alias}.strelka_somatic.bcf", - output: - "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf", - log: - "logs/bcftools/get_tumor_from_somatic/{group}.{tumor_alias}.strelka_somatic.tumor.log", - params: - "-O b -s TUMOR", - wrapper: - "0.60.0/bio/bcftools/view" - - -rule reheader_germline: - input: - vcf="results/strelka/{group}.normal.strelka_germline.variants.output.bcf", - samples="resources/sampleheader.txt", - output: - "results/strelka/{group}.normal.strelka_germline.variants.reheader.bcf", - log: - "logs/bcftools/reheader_germline/{group}.normal.log", - params: - extra="", - view_extra="-O b", - wrapper: - "0.60.0/bio/bcftools/reheader" - - -rule concat_variants: - input: - calls=[ - "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf", - "results/strelka/{group}.strelka_germline.variants.reheader.bcf", - ], - index=[ - "results/strelka/{group}.{tumor_alias}.strelka_somatic.tumor.bcf.csi", - "results/strelka/{group}.strelka_germline.variants.reheader.bcf.csi", - ], - output: - "results/strelka/merged/{group}.{tumor_alias}.strelka_somatic.strelka_germline.bcf", - log: - "bcftools/concat_variants/{group}.{tumor_alias}.strelka_somatic.strelka_germline.log", - params: - extra="-O b -a", - wrapper: - "0.64.0/bio/bcftools/concat" - - -rule norm_vcf: - input: - "{prefix}.bcf", - genome="resources/genome.fasta", - output: - "{prefix}.norm.bcf", - log: - 
"logs/bcftools/norm/{prefix}.log", - params: - "-f {} -O b -m-".format("resources/genome.fasta"), # optional parameters for bcftools norm (except -o) - wrapper: - "0.65.0/bio/bcftools/norm" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index dcee2414..9e25bb50 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -94,11 +94,9 @@ def get_final_output(): ] final_output.extend( expand( - "results/neoantigens/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{seqtype}.tsv", + "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv", group=group, tumor_alias=tumor_aliases, - tumor_event=config["params"]["microphaser"]["events"]["tumor"], - normal_event=config["params"]["microphaser"]["events"]["normal"], mhc=list( filter( None, @@ -133,108 +131,12 @@ def get_final_output(): return final_output -def get_fusion_output(): - if config["fusion"]["arriba"]["activate"]: - fusion_output = expand( - "results/fusion/arriba/{sample}.fusions.tsv", - sample=units[units["sequencing_type"] == "RNA"]["sample_name"], - ) - else: - fusion_output = [] - return fusion_output - - -def get_tmb_targets(): - if is_activated("tmb"): - return expand( - "results/plots/tmb/{group}.{mode}.svg", - group=samples[(samples.alias == "tumor")]["sample_name"], - mode=config["tmb"].get("mode", "curve"), - ) - else: - return [] - - caller = list( filter(None, ["freebayes" if is_activated("calling/freebayes") else None]) ) ### helper functions ### -## alignment ## - - -def get_cutadapt_input(wildcards): - unit = units.loc[wildcards.sample].loc[wildcards.seqtype].loc[wildcards.unit] - - if pd.isna(unit["fq1"]): - # SRA sample (always paired-end for now) - accession = unit["sra"] - return expand("sra/{accession}_{read}.fastq", accession=accession, read=[1, 2]) - - if unit["fq1"].endswith("gz"): - ending = ".gz" - else: - ending = "" - - if pd.isna(unit["fq2"]): - # single end local sample - return 
"pipe/cutadapt/{S}/{T}/{U}.fq1.fastq{E}".format( - S=unit.sample_name, - U=unit.unit_name, - T=unit.sequencing_type, - E=ending, - ) - else: - # paired end local sample - return expand( - "pipe/cutadapt/{S}/{T}/{U}.{{read}}.fastq{E}".format( - S=unit.sample_name, - U=unit.unit_name, - T=unit.sequencing_type, - E=ending, - ), - read=["fq1", "fq2"], - ) - - -def get_cutadapt_pipe_input(wildcards): - pattern = ( - units.loc[wildcards.sample] - .loc[wildcards.seqtype] - .loc[wildcards.unit, wildcards.fq] - ) - if "*" in pattern: - files = sorted( - glob.glob( - units.loc[wildcards.sample] - .loc[wildcards.seqtype] - .loc[wildcards.unit, wildcards.fq] - ) - ) - if not files: - raise ValueError( - "No raw fastq files found for unit pattern {} (sample {}, sequencing type {}). " - "Please check the your sample sheet.".format( - wildcards.unit, wildcards.sample, wildcards.seqtype - ) - ) - else: - files = [pattern] - return files - - -def get_cutadapt_adapters(wildcards): - unit = units.loc[wildcards.sample].loc[wildcards.unit] - try: - adapters = unit["adapters"] - if isinstance(adapters, str): - return adapters - return "" - except KeyError: - return "" - - def is_paired_end(sample, seqtype): sample_units = units.loc[sample].loc[seqtype] fq2_null = sample_units["fq2"].isnull() @@ -250,56 +152,11 @@ def is_paired_end(sample, seqtype): return all_paired -def get_fastqs(wc): - if config["trimming"]["activate"]: - return expand( - "results/trimmed/{sample}/{seqtype}/{unit}_{read}.fastq.gz", - unit=units.loc[ - (units["sequencing_type"] == wc.seqtype) - & (units["sample_name"] == wc.sample), - "unit_name", - ], - sample=wc.sample, - read=wc.read, - seqtype=wc.seqtype, - ) - unit = units.loc[wc.sample].loc[wc.seqtype] - if all(pd.isna(unit["fq1"])): - # SRA sample (always paired-end for now) - accession = unit["sra"] - return expand( - "sra/{accession}_{read}.fastq", accession=accession, read=wc.read[-1] - ) - fq = "fq{}".format(wc.read[-1]) - return 
units.loc[wc.sample].loc[wc.seqtype, fq].tolist() - - -def get_map_reads_input(wildcards): - if is_paired_end(wildcards.sample, "DNA"): - return [ - f"results/merged/DNA/{wildcards.sample}_R1.fastq.gz", - f"results/merged/DNA/{wildcards.sample}_R2.fastq.gz", - ] - return f"results/merged/DNA/{wildcards.sample}_single.fastq.gz" - - -def get_read_group(wildcards): - """Denote sample name and platform in read group.""" - return r"-R '@RG\tID:{sample}\tSM:{sample}\tPL:{platform}'".format( - sample=wildcards.sample, platform=samples.loc[wildcards.sample, "platform"] - ) - - -def get_recalibrate_quality_input(bai=False): - ext = ".bai" if bai else "" - - def inner(wildcards): - if is_activated("remove_duplicates"): - return f"results/dedup/{wildcards.sample}.sorted.bam{ext}" - else: - return f"results/mapped/{wildcards.sample}.sorted.bam{ext}" - - return inner +def get_sample_from_group_and_alias(group, alias): + sample = samples.loc[ + (samples["group"] == group) & (samples["alias"] == alias), "sample_name" + ].squeeze() + return sample def get_optitype_reads_input(wildcards): @@ -317,82 +174,23 @@ def get_optitype_reads_input(wildcards): return get_map_reads_input(wildcards) -def get_oncoprint_batch(wildcards): - if wildcards.batch == "all": - groups = samples[samples["alias"] == "tumor"]["sample_name"].unique() - else: - groups = samples.loc[ - samples[config["oncoprint"]["stratify"]["by-column"]] == wildcards.batch, - "group", - ].unique() - return expand( - "results/merged-calls/{group}.{{event}}.fdr-controlled.bcf", group=groups - ) - - -def get_tabix_params(wildcards): - if wildcards.format == "vcf": - return "-p vcf" - if wildcards.format == "txt": - return "-s 1 -b 2 -e 2" - raise ValueError("Invalid format for tabix: {}".format(wildcards.format)) - - -def get_sample_from_group_and_alias(group, alias): - sample = samples.loc[ - (samples["group"] == group) & (samples["alias"] == alias), "sample_name" - ].squeeze() - return sample - - -def 
get_normal_bam(ext=".bam"): - def inner(wildcards): - normal_sample = get_sample_from_group_and_alias(wildcards.group, "normal") - return f"results/recal/{normal_sample}.sorted{ext}" - - return inner - - -def get_tumor_bam_from_group_and_alias(ext=".bam"): - def inner(wildcards): - tumor_sample = get_sample_from_group_and_alias( - wildcards.group, wildcards.tumor_alias - ) - return f"results/recal/{tumor_sample}.sorted{ext}" - - return inner - - def get_bam_from_group_and_alias(ext=".bam"): def inner(wildcards): - sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias) + alias = wildcards.get("alias", + wildcards.get("tumor_alias", + wildcards.get("normal_alias", "unknown") + ) + ) + if alias == "unknown": + raise CustomException( + "get_bam_from_group_and_alias() requires on of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'." + ) + sample = get_sample_from_group_and_alias(wildcards.group, alias) return f"results/recal/{sample}.sorted{ext}" return inner -def get_quant_reads_input(wildcards): - sample = get_sample_from_group_and_alias(wildcards.group, wildcards.tumor_alias) - if is_paired_end(sample, "RNA"): - return [ - f"results/merged/RNA/{sample}_R1.fastq.gz", - f"results/merged/RNA/{sample}_R2.fastq.gz", - ] - return f"results/merged/RNA/{sample}_single.fastq.gz" - - -def kallisto_params(wildcards, input): - extra = config["params"]["kallisto"] - if len(input.fastq) == 1: - extra += " --single" - extra += ( - " --fragment-length {unit.fragment_len_mean} " "--sd {unit.fragment_len_sd}" - ).format(unit=units.loc[(wildcards.sample, wildcards.unit)]) - else: - extra += " --fusion" - return extra - - def get_alleles_MHCI(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( @@ -405,5 +203,6 @@ def get_alleles_MHCI(wildcards): def get_alleles_MHCII(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( - 
"results/HLA-LA/{group}.{alias}.hlaI.tsv", group=wildcards.group, alias=alias + #TODO: check that hlaII is correct here, and not hlaI which it previously was + "results/HLA-LA/{group}.{alias}.hlaII.tsv", group=wildcards.group, alias=alias ) diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk deleted file mode 100644 index 13be68ad..00000000 --- a/workflow/rules/mapping.smk +++ /dev/null @@ -1,70 +0,0 @@ -rule map_reads: - input: - reads=get_map_reads_input, - idx=rules.bwa_index.output, - output: - temp("results/mapped/{sample}.sorted.bam"), - log: - "logs/bwa_mem/{sample}.log", - params: - index=lambda w, input: os.path.splitext(input.idx[0])[0], - extra=get_read_group, - sort="samtools", - sort_order="coordinate", - threads: 8 - wrapper: - "0.56.0/bio/bwa/mem" - - -rule mark_duplicates: - input: - "results/mapped/{sample}.sorted.bam", - output: - bam=temp("results/dedup/{sample}.sorted.bam"), - metrics="results/qc/dedup/{sample}.metrics.txt", - log: - "logs/picard/dedup/{sample}.log", - params: - config["params"]["picard"]["MarkDuplicates"], - wrapper: - "0.39.0/bio/picard/markduplicates" - - -rule recalibrate_base_qualities: - input: - bam=get_recalibrate_quality_input(), - bai=get_recalibrate_quality_input(bai=True), - ref="resources/genome.fasta", - ref_dict="resources/genome.dict", - ref_fai="resources/genome.fasta.fai", - known="resources/variation.noiupac.vcf.gz", - tbi="resources/variation.noiupac.vcf.gz.tbi", - output: - recal_table=temp("results/recal/{sample}.grp"), - params: - extra=config["params"]["gatk"]["BaseRecalibrator"], - java_opts="", - log: - "logs/gatk/baserecalibrator/{sample}.log", - threads: 8 - wrapper: - "0.62.0/bio/gatk/baserecalibratorspark" - - -rule apply_bqsr: - input: - bam=get_recalibrate_quality_input(), - bai=get_recalibrate_quality_input(bai=True), - ref="resources/genome.fasta", - ref_dict="resources/genome.dict", - ref_fai="resources/genome.fasta.fai", - recal_table="results/recal/{sample}.grp", - output: - 
bam=protected("results/recal/{sample}.sorted.bam"), - log: - "logs/gatk/gatk_applybqsr/{sample}.log", - params: - extra=config["params"]["gatk"]["applyBQSR"], # optional - java_opts="", # optional - wrapper: - "0.62.0/bio/gatk/applybqsr" diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 7bb2588c..044b825d 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -1,68 +1,108 @@ +rule norm_bcf: + input: + "results/final-calls/{group}.{set}.bcf", + genome="resources/genome.fasta", + output: + "results/final-calls/{group}.{set}.norm.bcf", + log: + "logs/bcftools/norm/{group}.{set}.log", + params: + lambda w, input: "-f {} -O b -m-".format(input.genome), # optional parameters for bcftools norm (except -o) + wrapper: + "0.65.0/bio/bcftools/norm" + + +rule merge_tumor_normal: + input: + calls=expand( + "results/final-calls/{{group}}.{sets}.norm.bcf", + sets=[ + config["params"]["microphaser"]["variant_sets"]["normal"], + config["params"]["microphaser"]["variant_sets"]["tumor"], + ], + ), + index=expand( + "results/final-calls/{{group}}.{sets}.norm.csi", + sets=[ + config["params"]["microphaser"]["variant_sets"]["normal"], + config["params"]["microphaser"]["variant_sets"]["tumor"], + ], + ), + output: + "results/final-calls/{group}.merged_tumor_normal.norm.bcf", + log: + "bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log", + params: + extra="-O b -a", + wrapper: + "0.64.0/bio/bcftools/concat" + + rule microphaser_tumor: input: - vcf="results/strelka/merged/{group}.{tumor_alias}.{tumor_event}.{normal_event}.norm.annotated.bcf", - bam=get_tumor_bam_from_group_and_alias(), - bai=get_tumor_bam_from_group_and_alias(ext=".bai"), + bcf="results/final-calls/{group}.merged_tumor_normal.norm.annotated.bcf", + bam=get_bam_from_group_and_alias(), + bai=get_bam_from_group_and_alias(ext=".bai"), track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - 
mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.neo.fa", - wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.normal.fa", - tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv", + mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.neo.fa", + wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.normal.fa", + tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv", log: - "logs/microphaser_tumor/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.log", + "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.{contig}.log", conda: "../envs/microphaser.yaml" params: window_length=config["params"]["microphaser"]["window_len"], shell: - "microphaser somatic {input.bam} --variants {input.vcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " + "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " "< {input.track} > {output.mt_fasta} 2> {log}" rule microphaser_normal: input: - vcf="results/strelka/{group}.normal.{normal_event}.variants.reheader.norm.bcf", - bam=get_normal_bam(), - bai=get_normal_bam(ext=".bam.bai"), + bcf="results/final-calls/{group}.{normal_set}.variants.reheader.norm.bcf", + bam=get_bam_from_group_and_alias(), + bai=get_bam_from_group_and_alias(ext=".bai"), track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - wt_fasta=("results/microphaser/fasta/{group}/normal.{normal_event}.{contig}.fa"), - wt_tsv=("results/microphaser/info/{group}/normal.{normal_event}.{contig}.tsv"), + wt_fasta=("results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa"), + wt_tsv=("results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv"), log: - 
"logs/microphaser_germline/{group}/{normal_event}-{contig}.log", + "logs/microphaser_germline/{group}/{normal_alias}.{normal_set}-{contig}.log", conda: "../envs/microphaser.yaml" params: window_length=config["params"]["microphaser"]["window_len"], shell: - "microphaser normal {input.bam} --variants {input.vcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} " + "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} " "< {input.track} > {output.wt_fasta} 2> {log}" rule concat_normal_proteome: input: expand( - "results/microphaser/fasta/{{group}}/normal.{{normal_event}}.{contig}.fa", + "results/microphaser/fasta/{{group}}/normal.{{normal_set}}.{contig}.fa", contig=contigs, ), output: - "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa", + "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa", log: - "logs/microphaser/concat_normal_proteome/{group}.{normal_event}.log", + "logs/microphaser/concat_normal_proteome/{group}.{normal_set}.log", shell: "cat {input} > {output} 2> {log}" rule build_normal_proteome_db: input: - "results/microphaser/fasta/{group}.{normal_event}.normal_proteome.fa", + "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa", output: - bin="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", - fasta="results/microphaser/fasta/{group}.{normal_event}.{mhc}.normal_proteome.peptides.fasta", + bin="results/microphaser/bin/{group}.{normal_set}.{mhc}.normal_proteome.bin", + fasta="results/microphaser/fasta/{group}.{normal_set}.{mhc}.normal_proteome.peptides.fasta", log: - "logs/microphaser/build_normal_proteome_db/{group}.{normal_event}-{mhc}.log", + "logs/microphaser/build_normal_proteome_db/{group}.{normal_set}-{mhc}.log", conda: "../envs/microphaser.yaml" params: @@ -75,19 +115,22 @@ rule build_normal_proteome_db: rule microphaser_filter: input: - 
tsv="results/microphaser/info/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{contig}.tsv", - proteome="results/microphaser/bin/{group}.{normal_event}.{mhc}.normal_proteome.bin", + tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv", + proteome=expand( + "results/microphaser/bin/{{group}}.{normal_set}.{{mhc}}.normal_proteome.bin", + normal_set=config["params"]["microphaser"]["variant_sets"]["normal"], + ), output: mt_fasta=( - "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.neo.fa" + "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.normal.fa" + "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.normal.fa" ), - tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.tsv", - removed="results/microphaser/info/removed/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.removed.tsv", + tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.tsv", + removed="results/microphaser/info/removed/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.removed.tsv", log: - "logs/microphaser_filter/{group}/{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.{contig}.log", + "logs/microphaser_filter/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -101,13 +144,13 @@ rule microphaser_filter: rule concat_tsvs: input: expand( - "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.{{tumor_event}}.{{normal_event}}.{{mhc}}.{contig}.tsv", + "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.merged_tumor_normal.{{mhc}}.{contig}.tsv", contig=contigs, ), output: - 
"results/microphaser/info/filtered/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.tsv", + "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv", log: - "logs/concat_tsvs/{group}.{tumor_alias}.{tumor_event}.{normal_event}.{mhc}.log", + "logs/concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", conda: "../envs/xsv.yaml" shell: diff --git a/workflow/rules/oncoprint.smk b/workflow/rules/oncoprint.smk deleted file mode 100644 index aa670044..00000000 --- a/workflow/rules/oncoprint.smk +++ /dev/null @@ -1,28 +0,0 @@ -rule build_oncoprint_table: - input: - bcf=get_oncoprint_batch, - output: - "plots/oncoprint/{batch}.{event}.tsv", - log: - "logs/oncoprint/{batch}.{event}.table.log", - conda: - "../envs/oncoprinttable.yaml" - script: - "../scripts/build_oncoprint_matrix.py" - - -rule plot_oncoprint: - input: - "plots/oncoprint/{batch}.{event}.tsv", - output: - report( - "plots/oncoprint/{batch}.{event}.pdf", - category="Oncoprint", - caption="../report/oncoprint.rst", - ), - log: - "logs/oncoprint/{batch}.{event}.plot.log", - conda: - "../envs/oncoprint.yaml" - script: - "../scripts/oncoprint.R" diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk index d7401bf8..63d2ad4a 100644 --- a/workflow/rules/phylogeny.smk +++ b/workflow/rules/phylogeny.smk @@ -1,6 +1,6 @@ def get_somatic_calls(wildcards): return expand( - "results/strelka/somatic/{sample}/results/variants/somatic.complete.tumor.bcf", + "results/final-calls/somatic/{sample}/results/variants/somatic.complete.tumor.bcf", sample=samples[samples.alias == "tumor"]["sample_name"], ) @@ -9,20 +9,20 @@ rule merge_snvs: input: calls=get_somatic_calls, output: - "results/strelka/merged_calls.vcf", + "results/final-calls/merged_calls.vcf", log: "results/logs/bcftools/merge.log", params: - "--use-header strelka/sampleheader.txt --force-samples", + "--use-header final-calls/sampleheader.txt --force-samples", wrapper: "0.36.0/bio/bcftools/merge" rule 
query: input: - "results/strelka/merged_calls.vcf", + "results/final-calls/merged_calls.vcf", output: - "results/strelka/call_matrix.tsv", + "results/final-calls/call_matrix.tsv", log: "results/logs/bcftools/query.log", params: @@ -35,7 +35,7 @@ rule query: rule nj_tree: input: - matrix="results/strelka/call_matrix.tsv", + matrix="results/final-calls/call_matrix.tsv", output: pdf="results/plots/phylogeny_njtree.pdf", log: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 64cd6e06..513ea5dd 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -28,20 +28,6 @@ rule get_cdna: "0.45.1/bio/reference/ensembl-sequence" -rule kallisto_index: - input: - "resources/genome.cdna.fasta", - output: - "resources/kallisto/transcripts.idx", - params: - extra="", - log: - "logs/kallisto/index.log", - cache: True - wrapper: - "0.60.1/bio/kallisto/index" - - rule get_annotation: output: "resources/genome.gtf", @@ -58,23 +44,6 @@ rule get_annotation: "0.45.1/bio/reference/ensembl-annotation" -rule STAR_index: - input: - fasta="resources/genome.fasta", - gtf="resources/genome.gtf", - output: - directory("resources/STAR_index"), - params: - sjdb_overhang="100", - extra="", - log: - "logs/star/index.log", - threads: 32 - cache: True - wrapper: - "0.42.0/bio/star/index" - - rule split_annotation: input: "resources/genome.gtf", @@ -110,66 +79,6 @@ rule genome_dict: "0.45.1/bio/picard/createsequencedictionary" -rule get_callregions: - input: - "resources/genome.fasta.fai", - output: - "resources/genome.callregions.bed.gz", - log: - "logs/get-callregions.log", - params: - n_contigs=config["ref"]["n_chromosomes"], - conda: - "../envs/htslib.yaml" - shell: - "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})" - " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}" - - -rule get_known_variants: - input: - # use fai to annotate contig lengths for GATK BQSR - fai="resources/genome.fasta.fai", - output: - 
vcf="resources/variation.vcf.gz", - log: - "logs/get-known-variants.log", - params: - species=config["ref"]["species"], - release=config["ref"]["release"], - build=config["ref"]["build"], - type="all", - cache: True - wrapper: - "0.59.2/bio/reference/ensembl-variation" - - -rule remove_iupac_codes: - input: - "resources/variation.vcf.gz", - output: - "resources/variation.noiupac.vcf.gz", - log: - "logs/fix-iupac-alleles.log", - conda: - "../envs/rbt.yaml" - cache: True - shell: - "rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}" - - -rule bwa_index: - input: - "resources/genome.fasta", - output: - multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"), - log: - "logs/bwa_index.log", - cache: True - wrapper: - "0.45.1/bio/bwa/index" - - rule download_HLALA_graph: output: directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"), @@ -209,32 +118,6 @@ rule index_HLALA: "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1" -rule get_vep_cache: - output: - directory("resources/vep/cache"), - params: - species=config["ref"]["species"], - build=config["ref"]["build"], - release=config["ref"]["release"], - log: - "logs/vep/cache.log", - cache: True - wrapper: - "0.59.2/bio/vep/cache" - - -rule get_vep_plugins: - output: - directory("resources/vep/plugins"), - params: - release=config["ref"]["release"], - log: - "logs/vep/plugins.log", - cache: True - wrapper: - "0.59.2/bio/vep/plugins" - - rule make_sampleheader: output: "resources/sampleheader.txt", diff --git a/workflow/rules/tmb.smk b/workflow/rules/tmb.smk deleted file mode 100644 index e2b2a43d..00000000 --- a/workflow/rules/tmb.smk +++ /dev/null @@ -1,20 +0,0 @@ -if config["tmb"]["activate"]: - - rule estimate_tmb: - input: - "results/merged-calls/{cancer_sample}.somatic.fdr-controlled.bcf", - output: - "results/plots/tmb/{cancer_sample}.{plotmode}.vl.json", - conda: - "../envs/varlociraptor.yaml" - log: - 
"logs/tmb/{cancer_sample}-{plotmode}.log", - params: - **config["tmb"], - shell: - "varlociraptor estimate tmb " - " --plot-mode {wildcards.plotmode} " - "--coding-genome-size {params.coding_genome_size} " - "--somatic-tumor-events {params.somatic_events} " - "--tumor-sample {params.tumor_sample} " - "< {input} > {output} 2> {log}" diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk deleted file mode 100644 index 0ada209d..00000000 --- a/workflow/rules/trim.smk +++ /dev/null @@ -1,66 +0,0 @@ -rule get_sra: - output: - "sra/{accession}_1.fastq", - "sra/{accession}_2.fastq", - log: - "logs/get-sra/{accession}.log", - wrapper: - "0.56.0/bio/sra-tools/fasterq-dump" - - -rule cutadapt_pipe: - input: - get_cutadapt_pipe_input, - output: - pipe("pipe/cutadapt/{sample}/{seqtype}/{unit}.{fq}.{ext}"), - log: - "logs/pipe-fastqs/cutadapt/{sample}-{seqtype}-{unit}.{fq}.{ext}.log", - wildcard_constraints: - ext=r"fastq|fastq\.gz", - threads: 0 # this does not need CPU - shell: - "cat {input} > {output} 2> {log}" - - -rule cutadapt_pe: - input: - get_cutadapt_input, - output: - fastq1="results/trimmed/{sample}/{seqtype}/{unit}_R1.fastq.gz", - fastq2="results/trimmed/{sample}/{seqtype}/{unit}_R2.fastq.gz", - qc="results/trimmed/{sample}/{seqtype}/{unit}.paired.qc.txt", - log: - "logs/cutadapt/{sample}-{seqtype}-{unit}.log", - params: - extra=config["params"]["cutadapt"], - adapters=get_cutadapt_adapters, - threads: 8 - wrapper: - "v0.86.0/bio/cutadapt/pe" - - -rule cutadapt_se: - input: - get_cutadapt_input, - output: - fastq="results/trimmed/{sample}/{seqtype}/{unit}.single.fastq.gz", - qc="results/trimmed/{sample}/{seqtype}/{unit}.single.qc.txt", - log: - "logs/cutadapt/{sample}-{seqtype}-{unit}.log", - params: - extra=config["params"]["cutadapt"], - adapters=get_cutadapt_adapters, - threads: 8 - wrapper: - "v0.86.0/bio/cutadapt/se" - - -rule merge_fastqs: - input: - get_fastqs, - output: - "results/merged/{seqtype}/{sample}_{read}.fastq.gz", - log: - 
"logs/merge-fastqs/{seqtype}_{sample}_{read}.log", - shell: - "cat {input} > {output} 2> {log}" diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 65161772..92f0ab16 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -2,7 +2,7 @@ rule bcf_index: input: "{prefix}.bcf", output: - "{prefix}.bcf.csi", + "{prefix}.csi", log: "logs/bcf-index/{prefix}.log", wrapper: @@ -13,27 +13,13 @@ rule bam_index: input: "{prefix}.bam", output: - "{prefix}.bam.bai", + "{prefix}.bai", log: "logs/bam-index/{prefix}.log", wrapper: "0.59.2/bio/samtools/index" -rule tabix_known_variants: - input: - "resources/{prefix}.{format}.gz", - output: - "resources/{prefix}.{format}.gz.tbi", - log: - "logs/tabix/{prefix}.{format}.log", - params: - get_tabix_params, - cache: True - wrapper: - "0.59.2/bio/tabix" - - rule tsv_to_excel: input: tsv="results/{x}.tsv", diff --git a/workflow/rules/vega.smk b/workflow/rules/vega.smk deleted file mode 100644 index 4199e8fc..00000000 --- a/workflow/rules/vega.smk +++ /dev/null @@ -1,15 +0,0 @@ -rule vg2svg: - input: - "{prefix}.vl.json", - output: - report( - "{prefix}.svg", - caption="../report/tmb.rst", - category="Tumor Mutational Burden", - ), - log: - "logs/vega/{prefix}.log", - conda: - "../envs/vega.yaml" - shell: - "vl2svg {input} {output} > {log} 2>&1" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 9f882ac1..80f219ff 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -42,40 +42,6 @@ properties: - build - n_chromosomes - - tmb: - type: object - properties: - activate: - type: boolean - coding_genome_size: - # TODO allow integer here! 
- type: string - tumor_sample: - type: string - somatic_events: - type: array - items: - type: string - required: - - activate - - coding_genome_size - - tumor_sample - - somatic_events - - - remove_duplicates: - type: object - properties: - activate: - type: boolean - - trimming: - type: object - properties: - activate: - type: boolean - epitope_prediction: type: object properties: @@ -109,23 +75,6 @@ properties: activate: type: boolean - annotations: - type: object - properties: - vep: - properties: - params: - type: string - plugins: - type: array - items: - type: string - required: - - params - - plugins - required: - - vep - fusion: type: object properties: @@ -142,34 +91,6 @@ properties: params: type: object properties: - cutadapt: - type: string - bwa: - type: string - gatk: - type: object - properties: - BaseRecalibrator: - type: string - applyBQSR: - type: string - required: - - BaseRecalibrator - - applyBQSR - picard: - type: object - properties: - MarkDuplicates: - type: string - required: - - MarkDuplicates - strelka: - type: object - properties: - config: - type: string - run: - type: string razers3: type: string optitype: @@ -186,15 +107,21 @@ properties: type: integer netMHCIIpan: type: integer - kallisto: - type: string - star: - type: string + variant_sets: + type: object + properties: + normal: + type: string + tumor: + type: string + required: + - normal + - tumor + required: + - window_len + - peptide_len + - variant_sets required: - - bwa - - gatk - - picard - - strelka - razers3 - microphaser - optitype @@ -203,8 +130,6 @@ required: - samples - units - ref - - tmb - params - - annotations - epitope_prediction - affinity diff --git a/workflow/scripts/build_oncoprint_matrix.py b/workflow/scripts/build_oncoprint_matrix.py deleted file mode 100644 index 2b65721b..00000000 --- a/workflow/scripts/build_oncoprint_matrix.py +++ /dev/null @@ -1,42 +0,0 @@ -import pysam -import pandas as pd -import os -import sys - -# redirect output to log file -log = 
open(snakemake.log[0], "w") -sys.stdout = log -sys.stderr = log - -input_files = snakemake.input - -df = pd.DataFrame(columns=["Sample"]) -for sample_file in input_files: - variant_file = pysam.VariantFile(sample_file) - sample_name = os.path.basename(sample_file).split(".")[0] - gene_variant_dict = {"Sample": [sample_name]} - for rec in variant_file.fetch(): - for sample in rec.samples: - allele_frequencies = rec.samples[sample]["AF"] #Can be multiple entries - for allele_frequency in allele_frequencies: - variant = rec.info["SVLEN"] - if variant[0]: - variant_type = "INDEL" - else: - variant_type = "SNV" - transcripts = rec.info["ANN"] - for transcript in transcripts: - gene = transcript.split("|")[3] - impact = transcript.split("|")[2] - if gene and impact != "MODIFIER": - if gene not in gene_variant_dict: - gene_variant_dict[gene] = set() - gene_variant_dict[gene].add(variant_type) - break - for key, value in gene_variant_dict.items(): - gene_variant_dict[key] = ','.join(value) - sample_df = pd.DataFrame(gene_variant_dict, index=[0]) - df = pd.concat([df, sample_df], join="outer", ignore_index=False, sort=False) -df.set_index("Sample", inplace=True) -with open(snakemake.output[0], 'w') as output_f: - print(df.to_csv(sep="\t", index=True), file=output_f) diff --git a/workflow/scripts/oncoprint.R b/workflow/scripts/oncoprint.R deleted file mode 100644 index 5476b305..00000000 --- a/workflow/scripts/oncoprint.R +++ /dev/null @@ -1,56 +0,0 @@ -# log to file -log <- file(snakemake@log[[1]], open="wt") -sink(log) -sink(log, type="message") - -library(ComplexHeatmap) -library(ggplot2) - -table = read.table(snakemake@input[[1]], sep="\t", header=TRUE, row.names=1) -mat = as.matrix(table) -mat = t(mat) -## remove "full" lines -mat = mat[rowSums(mat == "") > 0,] -## remove single lines -mat = mat[rowSums(mat != "") > 0,] - -col = c(SNV = "blue", INDEL = "red") - -alter_fun = list( - SNV = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.9, - gp = gpar(fill = col["SNV"], 
col = NA)), - INDEL = function(x, y, w, h) grid.rect(x, y, w*0.9, h*0.4, - gp = gpar(fill = col["INDEL"], col = NA)) - ) - - -heatmap_legend_param = list(title = "Alterations", at = c("SNV", "INDEL"), - labels = c("SNV", "INDEL")) -if (ncol(mat) > 1 ){ - mat <- mat[order(apply(mat, 1, function(row) sum(row != "")), decreasing = T), ] -} - -i = 0 -c = 0 -matlist = list() -while (i + 2000 < nrow(mat)) { - m <- mat[(i + 1):(i + 2000), , drop=FALSE] - rows_matrix <- nrow(m) - height_plot <- (rows_matrix/5) - if (height_plot < 4) { - height_plot <- 4 - } - pdf(file = sub("0.pdf", paste(c, ".pdf", sep=''), snakemake@output[[1]]), height=height_plot) - if (rows_matrix > 0) { - oncoprint <- oncoPrint(m, - alter_fun = alter_fun, col = col, - remove_empty_columns = FALSE, remove_empty_rows = TRUE, - pct_side = "right", row_names_side = "left", - show_column_names=T, - column_title = "OncoPrint", heatmap_legend_param = heatmap_legend_param) - draw(oncoprint, newpage=F) - } - dev.off() - i = i + 2000 - c = c + 1 -} \ No newline at end of file diff --git a/workflow/scripts/render-scenario.py b/workflow/scripts/render-scenario.py deleted file mode 100644 index 8a1d06eb..00000000 --- a/workflow/scripts/render-scenario.py +++ /dev/null @@ -1,9 +0,0 @@ -from jinja2 import Template -import pandas as pd - -with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out: - samples = snakemake.params.samples - group = samples.loc[samples["sample_name"] == snakemake.wildcards.cancer_sample, "group"] - out.write(Template(template.read()).render( - samples=samples[samples["group"] == group] - )) From fcedef9b006696eed92dfafe0e08c08a3dc5d22a Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:07:38 +0000 Subject: [PATCH 059/191] config.yaml: add info where to obtain netMHCpan and netMHCIIpan software --- config/config.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/config/config.yaml b/config/config.yaml index 3bf1b781..9391901f 100644 
--- a/config/config.yaml +++ b/config/config.yaml @@ -10,10 +10,22 @@ affinity: netMHCpan: activate: true params: "-BA -l 9 -s -xls" + # Please download netMHCpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + # To make the `netMHCpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" location: "../netMHCpan-4.0" netMHCIIpan: activate: false params: "-length 15 -s -xls" + # Please download netMHCIIpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 + # To make the `netMHCIIpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" location: "../netMHCIIpan-4.0" From a429875d344e23c5ff20b6f4b076f9af81cf772d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:09:50 +0000 Subject: [PATCH 060/191] switch optitype read fishing from razers3 to yara, to avoid extraneous memory usage --- config/config.yaml | 9 ++++++--- workflow/rules/HLAtyping.smk | 36 ++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 9391901f..587b707f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -30,10 +30,13 @@ affinity: HLAtyping: - # activate to use razers3 to pre-filter reads before using optitype + # activate to use yara to pre-filter reads before using optitype optitype_prefiltering: activate: True optitype_data: "config/HLA_Data/hla_reference_dna.fasta" + # version of the IMGT-IPD repository to use for determining HLA allele + # regions, the repo is at: https://github.com/ANHIG/IMGTHLA + imgt_hla_version: "v3.48.0-alpha" # activate to predict MHC-I and MHC-II 
alleles with HLA-LA HLA_LA: activate: true @@ -60,8 +63,8 @@ annotations: - LoFtool params: - razers3: - "-i 95 -m 1 -dr 0" + yara: + "-e 3" optitype: "" microphaser: diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 70afe0f5..518a621c 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -37,33 +37,37 @@ rule parse_HLA_LA: "../scripts/parse_HLA_types.py" -rule razers3: +rule yara: input: reads="results/merged/DNA/{sample}_{read}.fastq.gz", + index="resources/yara/hla_alleles.index", output: - bam="results/razers3/bam/{sample}_{read}.bam", - threads: 8 + bam=temp("results/yara/{sample}_{read}.bam"), + threads: 12 log: - "logs/razers3/{sample}_{read}.log", + "logs/yara/{sample}_{read}.log", + conda: + "../envs/yara.yaml" params: - genome=config["HLAtyping"]["optitype_data"], - extra=config["params"]["razers3"], - wrapper: - "0.61.0/bio/razers3" + extra=config["params"]["yara"], + shell: + "( yara_mapper {params.extra} -t {threads} -f bam {input.index} {input.reads} > {output.bam} ) 2> {log}" -rule bam2fq: +rule filter_yara: input: - "results/razers3/bam/{sample}_{read}.bam", + "results/yara/{sample}_{read}.bam", output: - "results/razers3/fastq/{sample}_{read}.fished.fastq", - params: - "", + temp("results/yara/{sample}_{read}.filtered.bam"), log: - "logs/razers3-bam2fq/{sample}-{read}.log", - threads: 1 + "logs/filter_yara/{sample}_{read}.filtered.log", + threads: 3 + params: + extra="-h -F 4 -b1" wrapper: - "0.61.0/bio/samtools/bam2fq/interleaved" + "v1.5.0/bio/samtools/view" + + rule OptiType: From c5d3b32f3044d8da0901b7c66d20d24497b881f1 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:11:34 +0000 Subject: [PATCH 061/191] add yara indexing --- workflow/rules/ref.smk | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 513ea5dd..b1922b7a 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -118,6 +118,21 @@ 
rule index_HLALA: "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1" + + +rule yara_hla_index: + input: + config["HLAtyping"]["optitype_data"] + output: + "resources/yara/hla_alleles.index" + log: + "logs/yara_hla_index.log" + conda: + "../envs/yara.yaml" + shell: + "( yara_index {input} -o {output} ) 2> {log}" + + rule make_sampleheader: output: "resources/sampleheader.txt", From 1be3bd226f992f5e3aa10193645604d0cf7cb382 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:13:31 +0000 Subject: [PATCH 062/191] change optitype read fishing to extraction of HLA gene regions from mapped bam, update optitype --- workflow/rules/HLAtyping.smk | 47 ++++++++++++++++------------ workflow/rules/ref.smk | 60 ++++++++++++++++++++++++++++++++---- 2 files changed, 81 insertions(+), 26 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 518a621c..a259a57d 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -37,42 +37,49 @@ rule parse_HLA_LA: "../scripts/parse_HLA_types.py" -rule yara: +rule get_hla_aligning_reads: input: - reads="results/merged/DNA/{sample}_{read}.fastq.gz", - index="resources/yara/hla_alleles.index", + bam=get_bam_from_group_and_alias(), + bai=get_bam_from_group_and_alias(ext=".bai"), + regions="resources/hla_alleles/hla_allele_regions.expanded_1000.bed", output: - bam=temp("results/yara/{sample}_{read}.bam"), - threads: 12 + bam="results/fished/{group}.{alias}.bam", + idx="results/fished/{group}.{alias}.bai", log: - "logs/yara/{sample}_{read}.log", - conda: - "../envs/yara.yaml" + "logs/get_hla_reads/{group}.{alias}.log", params: - extra=config["params"]["yara"], - shell: - "( yara_mapper {params.extra} -t {threads} -f bam {input.index} {input.reads} > {output.bam} ) 2> {log}" + extra=lambda wc, input: f"--regions-file {input.regions}" + wrapper: + "v1.7.0/bio/samtools/view" + + +ruleorder: get_hla_aligning_reads > bam_index -rule 
filter_yara: +rule hla_reads_single_ends: input: - "results/yara/{sample}_{read}.bam", + "results/fished/{group}.{alias}.bam", + "results/fished/{group}.{alias}.bai", output: - temp("results/yara/{sample}_{read}.filtered.bam"), + bam="results/fished/{group}.{alias}.{read}.bam", + idx="results/fished/{group}.{alias}.{read}.bai", log: - "logs/filter_yara/{sample}_{read}.filtered.log", - threads: 3 + "logs/split_hla_reads/{group}.{alias}.{read}.log", params: - extra="-h -F 4 -b1" + extra=lambda wc: "-f 0x80" if wc.read == "R2" else "-f 0x40" wrapper: - "v1.5.0/bio/samtools/view" + "v1.7.0/bio/samtools/view" +ruleorder: hla_reads_single_ends > bam_index rule OptiType: input: - reads=get_optitype_reads_input, + reads=[ + "results/fished/{group}.{alias}.R1.bam", + "results/fished/{group}.{alias}.R2.bam", + ], output: multiext( "results/optitype/{group}/{group}.{alias}", @@ -85,7 +92,7 @@ rule OptiType: extra=config["params"]["optitype"], sequencing_type="dna", wrapper: - "0.63.0/bio/optitype" + "v1.7.0/bio/optitype" rule parse_Optitype: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index b1922b7a..0a374f9d 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -118,19 +118,67 @@ rule index_HLALA: "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1" +rule download_hla_allele_list: + input: + HTTP.remote( + expand( + "raw.githubusercontent.com/ANHIG/IMGTHLA/{version}/Allelelist.txt", + version=config["HLAtyping"]["imgt_hla_version"] + ), + ), + output: + "resources/hla_alleles/Allelelist.txt", + log: + "logs/hla_alleles/download_Allelelist.log", + shell: + "( mv {input} {output} ) 2> {log}" + + +rule get_hla_allele_names: + input: + "resources/hla_alleles/Allelelist.txt", + output: + "resources/hla_alleles/hla_allele_names.txt", + log: + "logs/hla_alleles/hla_allele_names.log", + conda: + "../envs/grep_sed.yaml" + shell: + '( grep -v "^\\(#\\|Allele\\)" {input} | ' + ' cut -d "," -f 2,2 | ' + ' cut -d 
"*" -f 1,1 | ' + " uniq | " + " sed -e 's/^\\([A-Z]\\)$/HLA-\\1/' | " + " sed -e 's/^\\(D[A-Z]\\{{2,2\\}}[1-9]*\\)$/HLA-\\1/' " + " >{output} ) 2> {log}" + + +rule get_hla_regions_from_gtf: + input: + gtf="resources/genome.gtf", + allele_names="resources/hla_alleles/hla_allele_names.txt", + output: + "resources/hla_alleles/hla_allele_regions.bed", + log: + "logs/hla_alleles/hla_allele_regions.log", + conda: + "../envs/rust.yaml" + script: + "../scripts/hla_regions_from_gtf.rs" -rule yara_hla_index: +rule expand_hla_regions: input: - config["HLAtyping"]["optitype_data"] + bed="resources/hla_alleles/hla_allele_regions.bed", + genome="resources/genome.fasta.fai", output: - "resources/yara/hla_alleles.index" + "resources/hla_alleles/hla_allele_regions.expanded_1000.bed", log: - "logs/yara_hla_index.log" + "logs/hla_alleles/hla_allele_regions.expanded_1000.log", conda: - "../envs/yara.yaml" + "../envs/bedtools.yaml" shell: - "( yara_index {input} -o {output} ) 2> {log}" + "( sort {input.bed} | bedtools slop -b 1000 -g {input.genome} | bedtools merge > {output} ) 2> {log}" rule make_sampleheader: From 1943dfbcbfa95e8c42c5d3d70631b3b83eceaed5 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:14:08 +0000 Subject: [PATCH 063/191] add yara.yaml (documentation only) --- workflow/envs/yara.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 workflow/envs/yara.yaml diff --git a/workflow/envs/yara.yaml b/workflow/envs/yara.yaml new file mode 100644 index 00000000..e5489375 --- /dev/null +++ b/workflow/envs/yara.yaml @@ -0,0 +1,5 @@ +channels: + - bioconda + - conda-forge +dependencies: + - yara =1.0.2 From b2deec62e03cf51c695e20e66c7fb65e251f00b1 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:14:40 +0000 Subject: [PATCH 064/191] remove now unused yara.yaml --- workflow/envs/yara.yaml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 workflow/envs/yara.yaml diff --git a/workflow/envs/yara.yaml 
b/workflow/envs/yara.yaml deleted file mode 100644 index e5489375..00000000 --- a/workflow/envs/yara.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - yara =1.0.2 From b2d14f1aa0ef83a429059352521a6aac8010b8de Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:15:27 +0000 Subject: [PATCH 065/191] adjust config.schemal.yaml to yara instead of razers3 --- workflow/schemas/config.schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 80f219ff..54271eca 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -91,7 +91,7 @@ properties: params: type: object properties: - razers3: + yara: type: string optitype: type: string @@ -122,7 +122,7 @@ properties: - peptide_len - variant_sets required: - - razers3 + - yara - microphaser - optitype From 98dd4b1f4b61b41842cdccd265402eb0dff26329 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:17:04 +0000 Subject: [PATCH 066/191] remove yara/razers3 from all config.yaml and schema --- .test/config/config.yaml | 2 -- config/config.yaml | 2 -- workflow/schemas/config.schema.yaml | 3 --- 3 files changed, 7 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 78bf043b..3042b885 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -115,8 +115,6 @@ params: "--exome" run: "--mode local" - razers3: - "-i 95 -m 1 -dr 0" optitype: "" microphaser: diff --git a/config/config.yaml b/config/config.yaml index 587b707f..1172b3a4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,8 +63,6 @@ annotations: - LoFtool params: - yara: - "-e 3" optitype: "" microphaser: diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 54271eca..652e2d72 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ 
-91,8 +91,6 @@ properties: params: type: object properties: - yara: - type: string optitype: type: string microphaser: @@ -122,7 +120,6 @@ properties: - peptide_len - variant_sets required: - - yara - microphaser - optitype From a4f71c69af66dbe6f4d2013970f914eff92a7539 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:19:24 +0000 Subject: [PATCH 067/191] clean up .test/config/config.yaml --- .test/config/config.yaml | 68 ++-------------------------------------- 1 file changed, 2 insertions(+), 66 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 3042b885..524cc23d 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,50 +1,6 @@ samples: "config/samples.tsv" units: "config/units.tsv" -# boolean if read trimming should be skipped -trimming: - activate: false - -remove_duplicates: - activate: false - -calling: - freebayes: - activate: true - # See https://varlociraptor.github.io/docs/calling/#generic-variant-calling - scenario: config/scenario.yaml - filter: - # Filter candidate variants (this filter helps to keep the number of evaluated candidates small). - # It should ideally generate a superset of all other filters defined below. - # Annotation of candidate variants tries to be as fast as possible, only using VEP - # default parameters. - candidates: "" - # Add any number of named filters here. They will be applied independenty, - # and can be referred in FDR control below to generate calls for different events. - # In particular, you can also filter by ID or dbsnp annotations here. 
- # See http://snpeff.sourceforge.net/SnpSift.html#filter - filtername: "ANN['IMPACT'] != 'MODIFIER'" - fdr-control: - threshold: 0.05 - events: - complete: - varlociraptor: - - "somatic" - - "germline" - somatic: - varlociraptor: - - "somatic" - germline: - varlociraptor: - - "germline" - -fusion: - arriba: - activate: true - blacklist: - "arriba_blacklist" - params: - "-T -P" tmb: activate: true @@ -101,20 +57,6 @@ annotations: - LoFtool params: - cutadapt: "" - bwa: - "-M" - picard: - MarkDuplicates: - "VALIDATION_STRINGENCY=LENIENT" - gatk: - BaseRecalibrator: "--tmp-dir tmp" - applyBQSR: "" - strelka: - config: - "--exome" - run: - "--mode local" optitype: "" microphaser: @@ -126,11 +68,5 @@ params: netMHCIIpan: 15 events: - tumor: "strelka_somatic" - normal: "strelka_germline" - kallisto: - "-b 100" - star: >- - --outSAMmapqUnique 60 --outSAMtype BAM Unsorted --chimSegmentMin 10 --chimOutType WithinBAM SoftClip - --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0 - --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3 + tumor: "tumor_only" + normal: "normal_only" From 2500cb6d5cfea69bcf18370496b7fa9cc70f9ae7 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:20:52 +0000 Subject: [PATCH 068/191] get hla_la.yaml to work by fixing further dependencies --- workflow/envs/hla_la.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/envs/hla_la.yaml b/workflow/envs/hla_la.yaml index 6c2487fb..4b94aaef 100644 --- a/workflow/envs/hla_la.yaml +++ b/workflow/envs/hla_la.yaml @@ -5,4 +5,6 @@ channels: dependencies: - hla-la ==1.0.5 - samtools ==1.10 - - boost-cpp ==1.73.0 + - bamtools ==2.5.1 + - boost-cpp ==1.74.0 + - r-base =4 From bd6b24d74b41bca144d4de428d30c0ccfb62db8e Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:21:44 +0000 Subject: [PATCH 069/191] add environments for getting HLA gene regions --- 
workflow/envs/bedtools.yaml | 5 +++++ workflow/envs/grep_sed.yaml | 6 ++++++ workflow/envs/rust.yaml | 8 ++++++++ 3 files changed, 19 insertions(+) create mode 100644 workflow/envs/bedtools.yaml create mode 100644 workflow/envs/grep_sed.yaml create mode 100644 workflow/envs/rust.yaml diff --git a/workflow/envs/bedtools.yaml b/workflow/envs/bedtools.yaml new file mode 100644 index 00000000..00c3a615 --- /dev/null +++ b/workflow/envs/bedtools.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bedtools =2.30 \ No newline at end of file diff --git a/workflow/envs/grep_sed.yaml b/workflow/envs/grep_sed.yaml new file mode 100644 index 00000000..00e5b6aa --- /dev/null +++ b/workflow/envs/grep_sed.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - grep =3.4 + - sed =4.8 diff --git a/workflow/envs/rust.yaml b/workflow/envs/rust.yaml new file mode 100644 index 00000000..16124ac7 --- /dev/null +++ b/workflow/envs/rust.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge +dependencies: + - rust-script >=0.17.0 + - rust >=1.58 + - cryptography >=36.0 + - c-compiler =1.3 + - pkg-config >=0.29 \ No newline at end of file From c31f660c52eed276fc3ae64ca420b4d22aae8380 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:23:09 +0000 Subject: [PATCH 070/191] add imports to use remote resource for HLA allele list --- workflow/rules/common.smk | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9e25bb50..dc653b72 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -2,8 +2,10 @@ import glob import pandas as pd from snakemake.remote import FTP +from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider from snakemake.utils import validate +HTTP = HTTPRemoteProvider() ftp = FTP.RemoteProvider() ##### config file ##### From 08197e05eebc606b4e5b6540745eb043f9bc3b47 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 
09:26:03 +0000 Subject: [PATCH 071/191] remove generation of HLA allele gene regions --- workflow/rules/common.smk | 2 -- workflow/rules/ref.smk | 63 --------------------------------------- 2 files changed, 65 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index dc653b72..9e25bb50 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -2,10 +2,8 @@ import glob import pandas as pd from snakemake.remote import FTP -from snakemake.remote.HTTP import RemoteProvider as HTTPRemoteProvider from snakemake.utils import validate -HTTP = HTTPRemoteProvider() ftp = FTP.RemoteProvider() ##### config file ##### diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 0a374f9d..513ea5dd 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -118,69 +118,6 @@ rule index_HLALA: "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1" -rule download_hla_allele_list: - input: - HTTP.remote( - expand( - "raw.githubusercontent.com/ANHIG/IMGTHLA/{version}/Allelelist.txt", - version=config["HLAtyping"]["imgt_hla_version"] - ), - ), - output: - "resources/hla_alleles/Allelelist.txt", - log: - "logs/hla_alleles/download_Allelelist.log", - shell: - "( mv {input} {output} ) 2> {log}" - - -rule get_hla_allele_names: - input: - "resources/hla_alleles/Allelelist.txt", - output: - "resources/hla_alleles/hla_allele_names.txt", - log: - "logs/hla_alleles/hla_allele_names.log", - conda: - "../envs/grep_sed.yaml" - shell: - '( grep -v "^\\(#\\|Allele\\)" {input} | ' - ' cut -d "," -f 2,2 | ' - ' cut -d "*" -f 1,1 | ' - " uniq | " - " sed -e 's/^\\([A-Z]\\)$/HLA-\\1/' | " - " sed -e 's/^\\(D[A-Z]\\{{2,2\\}}[1-9]*\\)$/HLA-\\1/' " - " >{output} ) 2> {log}" - - -rule get_hla_regions_from_gtf: - input: - gtf="resources/genome.gtf", - allele_names="resources/hla_alleles/hla_allele_names.txt", - output: - "resources/hla_alleles/hla_allele_regions.bed", - log: - 
"logs/hla_alleles/hla_allele_regions.log", - conda: - "../envs/rust.yaml" - script: - "../scripts/hla_regions_from_gtf.rs" - - -rule expand_hla_regions: - input: - bed="resources/hla_alleles/hla_allele_regions.bed", - genome="resources/genome.fasta.fai", - output: - "resources/hla_alleles/hla_allele_regions.expanded_1000.bed", - log: - "logs/hla_alleles/hla_allele_regions.expanded_1000.log", - conda: - "../envs/bedtools.yaml" - shell: - "( sort {input.bed} | bedtools slop -b 1000 -g {input.genome} | bedtools merge > {output} ) 2> {log}" - - rule make_sampleheader: output: "resources/sampleheader.txt", From d527371f50ae4b2c7abe69b56d87b8b444f3109a Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:28:12 +0000 Subject: [PATCH 072/191] remove environment defs for HLA allele region generation --- workflow/envs/bedtools.yaml | 5 ----- workflow/envs/grep_sed.yaml | 6 ------ workflow/envs/rust.yaml | 8 -------- 3 files changed, 19 deletions(-) delete mode 100644 workflow/envs/bedtools.yaml delete mode 100644 workflow/envs/grep_sed.yaml delete mode 100644 workflow/envs/rust.yaml diff --git a/workflow/envs/bedtools.yaml b/workflow/envs/bedtools.yaml deleted file mode 100644 index 00c3a615..00000000 --- a/workflow/envs/bedtools.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - bedtools =2.30 \ No newline at end of file diff --git a/workflow/envs/grep_sed.yaml b/workflow/envs/grep_sed.yaml deleted file mode 100644 index 00e5b6aa..00000000 --- a/workflow/envs/grep_sed.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - grep =3.4 - - sed =4.8 diff --git a/workflow/envs/rust.yaml b/workflow/envs/rust.yaml deleted file mode 100644 index 16124ac7..00000000 --- a/workflow/envs/rust.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - conda-forge -dependencies: - - rust-script >=0.17.0 - - rust >=1.58 - - cryptography >=36.0 - - c-compiler =1.3 - - pkg-config >=0.29 \ No newline at 
end of file From 0a12e794d46044c4d9fc5dd692e9c3e1decedff8 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:34:31 +0000 Subject: [PATCH 073/191] remove optitype, only use HLA-LA --- .test/config/config.yaml | 12 ------------ config/config.yaml | 15 --------------- workflow/rules/HLAtyping.smk | 37 ------------------------------------ workflow/rules/common.smk | 30 +++++------------------------ 4 files changed, 5 insertions(+), 89 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 524cc23d..b09277f7 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -26,16 +26,6 @@ affinity: location: "../netMHCIIpan-4.0" -HLAtyping: - # activate to use razers3 to pre-filter reads before using optitype - optitype_prefiltering: - activate: true - optitype_data: "config/HLA_Data/hla_reference_dna.fasta" - # activate to predict MHC-I and MHC-II alleles with HLA-LA - HLA_LA: - activate: false - - ref: # Number of chromosomes to consider for calling. # The first n entries of the FASTA will be considered. @@ -57,8 +47,6 @@ annotations: - LoFtool params: - optitype: - "" microphaser: window_len: 33 diff --git a/config/config.yaml b/config/config.yaml index 1172b3a4..257f312e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -29,19 +29,6 @@ affinity: location: "../netMHCIIpan-4.0" -HLAtyping: - # activate to use yara to pre-filter reads before using optitype - optitype_prefiltering: - activate: True - optitype_data: "config/HLA_Data/hla_reference_dna.fasta" - # version of the IMGT-IPD repository to use for determining HLA allele - # regions, the repo is at: https://github.com/ANHIG/IMGTHLA - imgt_hla_version: "v3.48.0-alpha" - # activate to predict MHC-I and MHC-II alleles with HLA-LA - HLA_LA: - activate: true - - ref: # Number of chromosomes to consider for calling. # The first n entries of the FASTA will be considered. 
@@ -63,8 +50,6 @@ annotations: - LoFtool params: - optitype: - "" microphaser: window_len: 33 diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index a259a57d..0b4169a5 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -72,40 +72,3 @@ rule hla_reads_single_ends: ruleorder: hla_reads_single_ends > bam_index - - -rule OptiType: - input: - reads=[ - "results/fished/{group}.{alias}.R1.bam", - "results/fished/{group}.{alias}.R2.bam", - ], - output: - multiext( - "results/optitype/{group}/{group}.{alias}", - ".coverage_plot.pdf", - ".result.tsv", - ), - log: - "logs/optitype/{group}.{alias}.log", - params: - extra=config["params"]["optitype"], - sequencing_type="dna", - wrapper: - "v1.7.0/bio/optitype" - - -rule parse_Optitype: - input: - "results/optitype/{group}/{group}.{alias}.result.tsv", - output: - report( - "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", - caption="../report/HLA_Types.rst", - category="HLA-Typing(Optitype)", - ), - log: - "logs/parse-optitype/{group}.{alias}.log", - shell: - "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' " - '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}' diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9e25bb50..2d9c4b35 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -114,20 +114,15 @@ def get_final_output(): ) ) else: - if config["HLAtyping"]["HLA_LA"]["activate"]: final_output = expand( [ - "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", - "results/HLA-LA/{group}.{alias}.hlaI.tsv", - "results/HLA-LA/{group}.{alias}.hlaII.tsv", + "results/HLA-LA/{group}.{tumor_alias}.hlaI.tsv", + "results/HLA-LA/{group}.{tumor_alias}.hlaII.tsv", ], - sample=samples["sample_name"], - ) - else: - final_output = expand( - "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", - sample=samples["sample_name"], + group=group, + 
tumor_alias=tumor_aliases ) + return final_output @@ -159,21 +154,6 @@ def get_sample_from_group_and_alias(group, alias): return sample -def get_optitype_reads_input(wildcards): - sample = get_sample_from_group_and_alias(wildcards.group, wildcards.alias) - if is_activated("HLAtyping/optitype_prefiltering"): - if is_paired_end(sample, "DNA"): - return expand( - "results/razers3/fastq/{sample}_{read}.fished.fastq", - sample=sample, - read=["R1", "R2"], - ) - return f"results/razers3/fastq/{sample}_single.fastq" - else: - wildcards["sample"] = sample - return get_map_reads_input(wildcards) - - def get_bam_from_group_and_alias(ext=".bam"): def inner(wildcards): alias = wildcards.get("alias", From d55d8c6dc2e6834449ac88a2f88cc298f78fbf91 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:35:02 +0000 Subject: [PATCH 074/191] check in rust script used for getting hla regions from GTF file --- workflow/scripts/hla_regions_from_gtf.rs | 49 ++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 workflow/scripts/hla_regions_from_gtf.rs diff --git a/workflow/scripts/hla_regions_from_gtf.rs b/workflow/scripts/hla_regions_from_gtf.rs new file mode 100644 index 00000000..2357216d --- /dev/null +++ b/workflow/scripts/hla_regions_from_gtf.rs @@ -0,0 +1,49 @@ +//! This is a regular crate doc comment, but it also contains a partial +//! Cargo manifest. Note the use of a *fenced* code block, and the +//! `cargo` "language". +//! +//! ```cargo +//! cargo-features = ["edition2021"] +//! [dependencies] +//! bio = { version = "0.41.0" } +//! 
``` +use bio::io::{gff, bed}; + +use std::fs::File; +use std::path::PathBuf; +use std::collections::HashSet; +use std::io::{BufRead, BufReader}; +use std::error::Error; + + +fn main() -> Result<(), Box> { + + snakemake.redirect_stderr(&snakemake.log[0])?; + + let alleles_file = BufReader::new(File::open(PathBuf::from(&snakemake.input.allele_names))?); + let allele_names: HashSet = alleles_file.lines().map(|line| line.unwrap()).collect(); + + let mut gtf_reader = gff::Reader::from_file(PathBuf::from(&snakemake.input.gtf), gff::GffType::GTF2)?; + + let mut bed_writer = bed::Writer::to_file(PathBuf::from(&snakemake.output[0]))?; + + for r in gtf_reader.records() { + let record = r?; + if record.feature_type() == "gene" { + let attr = record.attributes(); + if let Some(name) = attr.get("gene_name") { + if allele_names.contains(name) { + let mut bed_record = bed::Record::new(); + bed_record.set_chrom(record.seqname()); + bed_record.set_start(*record.start()); + bed_record.set_end(*record.end()); + bed_record.set_name(name); + // write out bed record + bed_writer.write(&bed_record)?; + } + } + } + } + + Ok(()) +} \ No newline at end of file From c635b631f755b3b57a2f61883ab1c5d2ae5a39ba Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:35:47 +0000 Subject: [PATCH 075/191] remove checked-in rust script, as it was only needed for optitype read fishing, optitype is now removed --- workflow/scripts/hla_regions_from_gtf.rs | 49 ------------------------ 1 file changed, 49 deletions(-) delete mode 100644 workflow/scripts/hla_regions_from_gtf.rs diff --git a/workflow/scripts/hla_regions_from_gtf.rs b/workflow/scripts/hla_regions_from_gtf.rs deleted file mode 100644 index 2357216d..00000000 --- a/workflow/scripts/hla_regions_from_gtf.rs +++ /dev/null @@ -1,49 +0,0 @@ -//! This is a regular crate doc comment, but it also contains a partial -//! Cargo manifest. Note the use of a *fenced* code block, and the -//! `cargo` "language". -//! -//! ```cargo -//! 
cargo-features = ["edition2021"] -//! [dependencies] -//! bio = { version = "0.41.0" } -//! ``` -use bio::io::{gff, bed}; - -use std::fs::File; -use std::path::PathBuf; -use std::collections::HashSet; -use std::io::{BufRead, BufReader}; -use std::error::Error; - - -fn main() -> Result<(), Box> { - - snakemake.redirect_stderr(&snakemake.log[0])?; - - let alleles_file = BufReader::new(File::open(PathBuf::from(&snakemake.input.allele_names))?); - let allele_names: HashSet = alleles_file.lines().map(|line| line.unwrap()).collect(); - - let mut gtf_reader = gff::Reader::from_file(PathBuf::from(&snakemake.input.gtf), gff::GffType::GTF2)?; - - let mut bed_writer = bed::Writer::to_file(PathBuf::from(&snakemake.output[0]))?; - - for r in gtf_reader.records() { - let record = r?; - if record.feature_type() == "gene" { - let attr = record.attributes(); - if let Some(name) = attr.get("gene_name") { - if allele_names.contains(name) { - let mut bed_record = bed::Record::new(); - bed_record.set_chrom(record.seqname()); - bed_record.set_start(*record.start()); - bed_record.set_end(*record.end()); - bed_record.set_name(name); - // write out bed record - bed_writer.write(&bed_record)?; - } - } - } - } - - Ok(()) -} \ No newline at end of file From 9642129445961429502693210aff984967ea64b9 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:37:33 +0000 Subject: [PATCH 076/191] clean up config.schema.yaml after all the removals --- workflow/schemas/config.schema.yaml | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 652e2d72..8b960444 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -66,33 +66,9 @@ properties: params: type: string - HLAtyping: - type: object - properties: - HLA_LA: - type: object - properties: - activate: - type: boolean - - fusion: - type: object - properties: - arriba: - type: object - properties: - 
activate: - type: boolean - blacklist: - type: string - params: - type: string - params: type: object properties: - optitype: - type: string microphaser: type: object properties: @@ -121,7 +97,6 @@ properties: - variant_sets required: - microphaser - - optitype required: - samples From 5935345c202c7161911ba1c576859b6188c37802 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:38:34 +0000 Subject: [PATCH 077/191] remove last rules for optitype read fishing --- workflow/rules/HLAtyping.smk | 37 ------------------------------------ 1 file changed, 37 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 0b4169a5..db8bcc58 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -35,40 +35,3 @@ rule parse_HLA_LA: "logs/parse-HLA-LA/{group}.{alias}.log", script: "../scripts/parse_HLA_types.py" - - -rule get_hla_aligning_reads: - input: - bam=get_bam_from_group_and_alias(), - bai=get_bam_from_group_and_alias(ext=".bai"), - regions="resources/hla_alleles/hla_allele_regions.expanded_1000.bed", - output: - bam="results/fished/{group}.{alias}.bam", - idx="results/fished/{group}.{alias}.bai", - log: - "logs/get_hla_reads/{group}.{alias}.log", - params: - extra=lambda wc, input: f"--regions-file {input.regions}" - wrapper: - "v1.7.0/bio/samtools/view" - - -ruleorder: get_hla_aligning_reads > bam_index - - -rule hla_reads_single_ends: - input: - "results/fished/{group}.{alias}.bam", - "results/fished/{group}.{alias}.bai", - output: - bam="results/fished/{group}.{alias}.{read}.bam", - idx="results/fished/{group}.{alias}.{read}.bai", - log: - "logs/split_hla_reads/{group}.{alias}.{read}.log", - params: - extra=lambda wc: "-f 0x80" if wc.read == "R2" else "-f 0x40" - wrapper: - "v1.7.0/bio/samtools/view" - - -ruleorder: hla_reads_single_ends > bam_index From 6c06a2a3d81d11e62c93c30b0c891787bfca9c60 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:40:04 +0000 Subject: [PATCH 078/191] 
work around microphaser requirement for `gene_name` in every GTF record by excluding those (only about 150k in a total of something like 3.5m records) --- workflow/rules/ref.smk | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 513ea5dd..86d41dd7 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -44,9 +44,21 @@ rule get_annotation: "0.45.1/bio/reference/ensembl-annotation" -rule split_annotation: +#TODO: remove this rule, once microphaser is fixed to make gene_name optional +rule remove_records_with_gene_name_missing: input: "resources/genome.gtf", + output: + "resources/genome.records_with_gene_name.gtf", + log: + "logs/remove_records_with_gene_name_missing.log", + shell: + '( grep "gene_name" {input} > {output} ) 2> {log}' + + +rule split_annotation: + input: + "resources/genome.records_with_gene_name.gtf", output: "resources/annotation/{contig}.gtf", log: From f12187bad6432c24774d0f6da88c9f2d86d749ca Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:40:27 +0000 Subject: [PATCH 079/191] fix bcf_index output file name --- workflow/rules/utils.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 92f0ab16..c8b11474 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -2,7 +2,7 @@ rule bcf_index: input: "{prefix}.bcf", output: - "{prefix}.csi", + "{prefix}.bcf.csi", log: "logs/bcf-index/{prefix}.log", wrapper: From 42eda3f64e61a2059ffad5aed24c59ed65877dda Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:40:59 +0000 Subject: [PATCH 080/191] remove tmb from config.yaml (previously removed, done in dna-seq-varlociraptor workflow) --- .test/config/config.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index b09277f7..937d9028 100644 --- a/.test/config/config.yaml +++ 
b/.test/config/config.yaml @@ -2,15 +2,6 @@ samples: "config/samples.tsv" units: "config/units.tsv" -tmb: - activate: true - coding_genome_size: 3e7 - # Name of the tumor sample in the scenario.yaml. - tumor_sample: tumor - somatic_events: - - somatic - - epitope_prediction: activate: true From 02d121ff6af926c72d742cb79b4ca93a4076fbcf Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:42:20 +0000 Subject: [PATCH 081/191] fix HLA-LA graph download: generalize to work with module inclusion, rm tar.gz file after extraction --- workflow/rules/ref.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 86d41dd7..d9878f1f 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -102,11 +102,13 @@ rule download_HLALA_graph: directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/sampledReferenceGenomes"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"), "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", + params: + graphs_dir=lambda w, output: output[0].replace("/PRG_MHC_GRCh38_withIMGT/PRG", ""), log: "logs/download-HLA-LA-graph.log", shell: - "cd resources/graphs && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz " - "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz" + "( cd {params.graphs_dir} && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz " + "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz && rm PRG_MHC_GRCh38_withIMGT.tar.gz ) 2> {log}" rule index_HLALA: From 00e42a1f7878e1035886c26b8f5492d5c7533db1 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:43:17 +0000 Subject: [PATCH 082/191] remove multiext() from rule index_HLALA, as snakemake does not accept directories as suffixes any more --- workflow/rules/ref.smk | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index d9878f1f..93daa96a 100644 --- 
a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -115,12 +115,8 @@ rule index_HLALA: input: "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", output: - multiext( - "resources/graphs/PRG_MHC_GRCh38_withIMGT/", - "serializedGRAPH", - "serializedGRAPH_preGapPathindex", - ), - cache: True + "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH_preGapPathIndex", conda: "../envs/hla_la.yaml" params: From b5eda3a50cc3117335d7be3612398401200bcd85 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:45:18 +0000 Subject: [PATCH 083/191] add netMHCpan install instructions in both configs, update to version 4.1 --- .test/config/config.yaml | 16 ++++++++++++++-- config/config.yaml | 4 ++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 937d9028..25ccfa45 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -10,11 +10,23 @@ affinity: netMHCpan: activate: true params: "-BA -l 9 -s -xls" - location: "../netMHCpan-4.0" + # Please download netMHCpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + # To make the `netMHCpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCpan-4.1" netMHCIIpan: activate: false params: "-length 15 -s -xls" - location: "../netMHCIIpan-4.0" + # Please download netMHCIIpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 + # To make the `netMHCIIpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. 
To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCIIpan-4.1" ref: diff --git a/config/config.yaml b/config/config.yaml index 257f312e..31512865 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -16,7 +16,7 @@ affinity: # in addition to the other edits described for a complete install. To use # the conda-provided tcsh installation, it needs to read (without quotes): # "#!/usr/bin/env tcsh" - location: "../netMHCpan-4.0" + location: "../netMHCpan-4.1" netMHCIIpan: activate: false params: "-length 15 -s -xls" @@ -26,7 +26,7 @@ affinity: # in addition to the other edits described for a complete install. To use # the conda-provided tcsh installation, it needs to read (without quotes): # "#!/usr/bin/env tcsh" - location: "../netMHCIIpan-4.0" + location: "../netMHCIIpan-4.1" ref: From aab65ad8c381b22279c5a41ae3270b72a31958b8 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:45:52 +0000 Subject: [PATCH 084/191] update parsing of called HLA types to netMHC(II)pan-4.1 alleles --- workflow/scripts/parse_HLA_types.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py index ff1906f3..66c65b3b 100644 --- a/workflow/scripts/parse_HLA_types.py +++ b/workflow/scripts/parse_HLA_types.py @@ -1,8 +1,8 @@ import pandas as pd -hlaI = ["A","B","C"] +hlaI = ["A","B","C", "E", "G"] -hlaII = ["DRB1", "DPA1", "DPB1", "DQA1", "DQB1"] +hlaII = ["DRB1", "DRB3", "DRB4", "DRB5", "DPA1", "DPB1", "DQA1", "DQB1"] hlas = pd.read_csv(snakemake.input[0], sep='\t') From 8d1acc08ffc9533af1c381b399ebd53ba038022e Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:48:37 +0000 Subject: [PATCH 085/191] add comments on how to list alleles that netMHC(II)pan can each handle --- workflow/scripts/parse_HLA_types.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py index 66c65b3b..1824326d 100644 --- a/workflow/scripts/parse_HLA_types.py +++ b/workflow/scripts/parse_HLA_types.py @@ -1,7 +1,9 @@ import pandas as pd +# to get alleles that netMHCpan can handle, use its -listMHC option hlaI = ["A","B","C", "E", "G"] +# to get alleles that netMHCIIpan can handle, use its -list option hlaII = ["DRB1", "DRB3", "DRB4", "DRB5", "DPA1", "DPB1", "DQA1", "DQB1"] hlas = pd.read_csv(snakemake.input[0], sep='\t') From 0062cfdcf8dd97fbea8a8ba33f9fd8fdae1874d8 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:50:28 +0000 Subject: [PATCH 086/191] netMHC(II)pan rules: use tcsh environments, clean up shell command via params --- workflow/envs/tcsh.yaml | 4 ++++ workflow/rules/MHC_binding.smk | 28 ++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) create mode 100644 workflow/envs/tcsh.yaml diff --git a/workflow/envs/tcsh.yaml b/workflow/envs/tcsh.yaml new file mode 100644 index 00000000..9bc374fd --- /dev/null +++ b/workflow/envs/tcsh.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - tcsh =6.24 diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 2e80b180..c8d1ed9a 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -24,13 +24,19 @@ rule netMHCpan: "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls", log: "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + conda: + "../envs/tcsh.yaml" params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], - run: - alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0]) - cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi" - shell(cmd) + alleles=lambda wc, input: 
",".join(pd.read_csv(input.alleles[0], sep="\t").iloc[0]) + shell: + "if [ -s {input.peptides} ]; " + "then " + " {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + "else " + " touch {output}; " + "fi" rule netMHCIIpan: @@ -41,13 +47,19 @@ rule netMHCIIpan: "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls", log: "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + conda: + "../envs/tcsh.yaml" params: extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], - run: - alleles = ",".join(pd.read_csv(input.alleles, sep="\t")["Allele"].tolist()) - cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi" - shell(cmd) + alleles=lambda wc, input: ",".join(pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist()) + shell: + "if [ -s {input.peptides} ]; " + "then " + " {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + "else " + " touch {output}; " + "fi" rule parse_mhc_out: From 298afbd7220444e9a3dfe1042ca00ee2018c54aa Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:51:13 +0000 Subject: [PATCH 087/191] generalize rule HLA_LA workdir to make it work via module import --- workflow/rules/HLAtyping.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index db8bcc58..12b78705 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -11,10 +11,11 @@ rule HLA_LA: params: graph=lambda w, input: os.path.basename(os.path.dirname(input.index)), graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)), + workdir=lambda w, output: os.path.dirname(os.path.dirname(os.path.dirname(output[0]))), 
conda: "../envs/hla_la.yaml" shell: - "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1" + "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1" rule parse_HLA_LA: From dffb7573beef509639dc3ea6bafb0592aa17dbe3 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:52:26 +0000 Subject: [PATCH 088/191] get_final_output(): move smps extraction to higher level, also get tumor_aliases at this high level --- workflow/rules/common.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 2d9c4b35..34b5c06c 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -82,16 +82,16 @@ def is_activated(xpath): def get_final_output(): final_output = [] - if config["epitope_prediction"]["activate"]: for group in pd.unique(samples["group"]): smps = samples.loc[samples["group"] == group, "sample_name"] - sequencing_types = pd.unique( - units.loc[units["sample_name"].isin(smps), "sequencing_type"] - ) tumor_aliases = samples.loc[ (samples["group"] == group) & (samples["alias"].str.match("tumor")), "alias", ] + if config["epitope_prediction"]["activate"]: + sequencing_types = pd.unique( + units.loc[units["sample_name"].isin(smps), "sequencing_type"] + ) final_output.extend( expand( "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv", From e038952047009b7faabb663172390c827cf8ac37 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:52:54 +0000 Subject: [PATCH 089/191] new wildcard_constraints for normal_alias, tumor_set and normal_set --- workflow/rules/common.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/workflow/rules/common.smk b/workflow/rules/common.smk index 34b5c06c..b831f530 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -62,6 +62,9 @@ wildcard_constraints: tumor_alias="|".join( pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"]) ), + normal_alias="normal", + tumor_set=config["params"]["microphaser"]["variant_sets"]["tumor"], + normal_set=config["params"]["microphaser"]["variant_sets"]["normal"], group="|".join(pd.unique(samples["group"])), caller="|".join(["freebayes", "delly"]), peptide_type="|".join(["normal", "neo"]), From ab79de0bd9c363ad61cbcbd56499d230eab781c9 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:53:45 +0000 Subject: [PATCH 090/191] switch netMHC(II)pan input functions to properly use HLA-LA output --- workflow/rules/common.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b831f530..97e23eb8 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -177,7 +177,7 @@ def get_bam_from_group_and_alias(ext=".bam"): def get_alleles_MHCI(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( - "results/optitype/{group}/{group}.{alias}.hla_alleles.tsv", + "results/HLA-LA/{group}.{alias}.hlaI.tsv", group=wildcards.group, alias=alias, ) @@ -187,5 +187,7 @@ def get_alleles_MHCII(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( #TODO: check that hlaII is correct here, and not hlaI which it previously was - "results/HLA-LA/{group}.{alias}.hlaII.tsv", group=wildcards.group, alias=alias + "results/HLA-LA/{group}.{alias}.hlaII.tsv", + group=wildcards.group, + alias=alias ) From 060285acccc9c568220d0c16e44476650d3eacba Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:53:56 +0000 Subject: [PATCH 091/191] type --- workflow/rules/common.smk | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 97e23eb8..80f183f7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -166,7 +166,7 @@ def get_bam_from_group_and_alias(ext=".bam"): ) if alias == "unknown": raise CustomException( - "get_bam_from_group_and_alias() requires on of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'." + "get_bam_from_group_and_alias() requires one of the following wildcards: 'alias', 'tumor_alias', 'normal_alias'." ) sample = get_sample_from_group_and_alias(wildcards.group, alias) return f"results/recal/{sample}.sorted{ext}" From 0faa787547049b37e1568efa6094182256dd0728 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:54:33 +0000 Subject: [PATCH 092/191] fix indentation --- workflow/rules/common.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 80f183f7..985eb64a 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -116,7 +116,7 @@ def get_final_output(): seqtype=sequencing_types, ) ) - else: + else: final_output = expand( [ "results/HLA-LA/{group}.{tumor_alias}.hlaI.tsv", From 73de9b72b7be413a68c559c63a613b889853ab50 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:54:50 +0000 Subject: [PATCH 093/191] fix further indentation --- workflow/rules/common.smk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 985eb64a..c5e93cac 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -85,12 +85,12 @@ def is_activated(xpath): def get_final_output(): final_output = [] - for group in pd.unique(samples["group"]): - smps = samples.loc[samples["group"] == group, "sample_name"] - tumor_aliases = samples.loc[ - (samples["group"] == group) & (samples["alias"].str.match("tumor")), - "alias", - ] + for group in 
pd.unique(samples["group"]): + smps = samples.loc[samples["group"] == group, "sample_name"] + tumor_aliases = samples.loc[ + (samples["group"] == group) & (samples["alias"].str.match("tumor")), + "alias", + ] if config["epitope_prediction"]["activate"]: sequencing_types = pd.unique( units.loc[units["sample_name"].isin(smps), "sequencing_type"] From b91744f749d9a8e9f818eef78369c07d6a021eb3 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 09:55:27 +0000 Subject: [PATCH 094/191] adjust bcf index file name to standard bcftools naming scheme --- workflow/rules/microphaser.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 044b825d..50e99926 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -22,7 +22,7 @@ rule merge_tumor_normal: ], ), index=expand( - "results/final-calls/{{group}}.{sets}.norm.csi", + "results/final-calls/{{group}}.{sets}.norm.bcf.csi", sets=[ config["params"]["microphaser"]["variant_sets"]["normal"], config["params"]["microphaser"]["variant_sets"]["tumor"], From 8015a151084ab3fb47c39e226e1baa1cbfc434ab Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 24 Jun 2022 10:03:54 +0000 Subject: [PATCH 095/191] snakefmt --- workflow/rules/HLAtyping.smk | 4 +++- workflow/rules/MHC_binding.smk | 8 ++++++-- workflow/rules/common.smk | 14 +++++++------- workflow/rules/microphaser.smk | 8 ++++++-- workflow/rules/ref.smk | 6 ++++-- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 12b78705..6cd39931 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -11,7 +11,9 @@ rule HLA_LA: params: graph=lambda w, input: os.path.basename(os.path.dirname(input.index)), graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)), - workdir=lambda w, output: os.path.dirname(os.path.dirname(os.path.dirname(output[0]))), + 
workdir=lambda w, output: os.path.dirname( + os.path.dirname(os.path.dirname(output[0])) + ), conda: "../envs/hla_la.yaml" shell: diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index c8d1ed9a..eb3724ea 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -29,7 +29,9 @@ rule netMHCpan: params: extra=config["affinity"]["netMHCpan"]["params"], netMHC=config["affinity"]["netMHCpan"]["location"], - alleles=lambda wc, input: ",".join(pd.read_csv(input.alleles[0], sep="\t").iloc[0]) + alleles=lambda wc, input: ",".join( + pd.read_csv(input.alleles[0], sep="\t").iloc[0] + ), shell: "if [ -s {input.peptides} ]; " "then " @@ -52,7 +54,9 @@ rule netMHCIIpan: params: extra=config["affinity"]["netMHCIIpan"]["params"], netMHC=config["affinity"]["netMHCIIpan"]["location"], - alleles=lambda wc, input: ",".join(pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist()) + alleles=lambda wc, input: ",".join( + pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist() + ), shell: "if [ -s {input.peptides} ]; " "then " diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c5e93cac..54a71e58 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -123,7 +123,7 @@ def get_final_output(): "results/HLA-LA/{group}.{tumor_alias}.hlaII.tsv", ], group=group, - tumor_alias=tumor_aliases + tumor_alias=tumor_aliases, ) return final_output @@ -135,6 +135,7 @@ caller = list( ### helper functions ### + def is_paired_end(sample, seqtype): sample_units = units.loc[sample].loc[seqtype] fq2_null = sample_units["fq2"].isnull() @@ -159,10 +160,9 @@ def get_sample_from_group_and_alias(group, alias): def get_bam_from_group_and_alias(ext=".bam"): def inner(wildcards): - alias = wildcards.get("alias", - wildcards.get("tumor_alias", - wildcards.get("normal_alias", "unknown") - ) + alias = wildcards.get( + "alias", + wildcards.get("tumor_alias", wildcards.get("normal_alias", "unknown")), ) if alias == "unknown": 
raise CustomException( @@ -186,8 +186,8 @@ def get_alleles_MHCI(wildcards): def get_alleles_MHCII(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( - #TODO: check that hlaII is correct here, and not hlaI which it previously was + # TODO: check that hlaII is correct here, and not hlaI which it previously was "results/HLA-LA/{group}.{alias}.hlaII.tsv", group=wildcards.group, - alias=alias + alias=alias, ) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 50e99926..a43c7b18 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -68,8 +68,12 @@ rule microphaser_normal: track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - wt_fasta=("results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa"), - wt_tsv=("results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv"), + wt_fasta=( + "results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa" + ), + wt_tsv=( + "results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv" + ), log: "logs/microphaser_germline/{group}/{normal_alias}.{normal_set}-{contig}.log", conda: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 93daa96a..c6474b7f 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -44,7 +44,7 @@ rule get_annotation: "0.45.1/bio/reference/ensembl-annotation" -#TODO: remove this rule, once microphaser is fixed to make gene_name optional +# TODO: remove this rule, once microphaser is fixed to make gene_name optional rule remove_records_with_gene_name_missing: input: "resources/genome.gtf", @@ -103,7 +103,9 @@ rule download_HLALA_graph: directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"), "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", params: - graphs_dir=lambda w, output: output[0].replace("/PRG_MHC_GRCh38_withIMGT/PRG", ""), + graphs_dir=lambda w, output: 
output[0].replace( + "/PRG_MHC_GRCh38_withIMGT/PRG", "" + ), log: "logs/download-HLA-LA-graph.log", shell: From 84e91122cae5de8e2857b4f01649666dc1ff3c94 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 12 Jul 2022 09:58:25 +0000 Subject: [PATCH 096/191] constrain `set` wildcard to tumor and normal variant set names --- workflow/rules/common.smk | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 54a71e58..097509c6 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -65,6 +65,12 @@ wildcard_constraints: normal_alias="normal", tumor_set=config["params"]["microphaser"]["variant_sets"]["tumor"], normal_set=config["params"]["microphaser"]["variant_sets"]["normal"], + set="|".join( + [ + config["params"]["microphaser"]["variant_sets"]["tumor"], + config["params"]["microphaser"]["variant_sets"]["normal"], + ] + ), group="|".join(pd.unique(samples["group"])), caller="|".join(["freebayes", "delly"]), peptide_type="|".join(["normal", "neo"]), From 5f15ecfb50185197f6e80d58ae3ade41938e281f Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 12 Jul 2022 10:00:07 +0000 Subject: [PATCH 097/191] ensure that HLA-LA reference fasta is indexed with hla-la.yaml dependency version of bwa --- workflow/rules/HLAtyping.smk | 1 + workflow/rules/ref.smk | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 6cd39931..b2c389e9 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -3,6 +3,7 @@ rule HLA_LA: bam=get_bam_from_group_and_alias(), bai=get_bam_from_group_and_alias(ext=".bai"), index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", + ext_idx="resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac", output: "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", threads: 7 diff --git a/workflow/rules/ref.smk 
b/workflow/rules/ref.smk index c6474b7f..cb54eeeb 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -94,13 +94,13 @@ rule genome_dict: rule download_HLALA_graph: output: directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"), - directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/knownReferences"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/mapping_PRGonly"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/referenceGenomeSimulations"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/sampledReferenceGenomes"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/translation"), + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa", "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", params: graphs_dir=lambda w, output: output[0].replace( @@ -130,6 +130,23 @@ rule index_HLALA: "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1" +rule index_HLALA_extended_ref: + input: + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa", + output: + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.amb", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.ann", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.bwt", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac", + "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.sa", + conda: + "../envs/hla_la.yaml" + log: + "logs/index_HLA-LA_extended_ref.log", + shell: + "bwa index {input} > {log} 2>&1" + + rule make_sampleheader: output: "resources/sampleheader.txt", From 11cdb4af942ec37bc5f383cf3c8b47d28d7085be Mon Sep 17 00:00:00 
2001 From: dlaehnemann Date: Tue, 12 Jul 2022 10:01:48 +0000 Subject: [PATCH 098/191] update to newest microphaser version --- workflow/envs/microphaser.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml index 84d2ad83..3237608c 100644 --- a/workflow/envs/microphaser.yaml +++ b/workflow/envs/microphaser.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - microphaser =0.2 + - microphaser =0.4 From 29131ce6822c57e6908e310cf0adea8654dc1079 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 12 Jul 2022 10:02:44 +0000 Subject: [PATCH 099/191] fix logs path in rule merge_tumor_normal --- workflow/rules/microphaser.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index a43c7b18..10daf61d 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -31,7 +31,7 @@ rule merge_tumor_normal: output: "results/final-calls/{group}.merged_tumor_normal.norm.bcf", log: - "bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log", + "logs/bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log", params: extra="-O b -a", wrapper: From d9e993c6c8fb10dab40d6ba46a6a91cb9fe3f7f2 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 12 Jul 2022 10:04:06 +0000 Subject: [PATCH 100/191] add INFO flag SOMATIC to somatic tumor calls for microphaser (before merging with normal calls) --- workflow/envs/gawk.yaml | 4 +++ workflow/rules/microphaser.smk | 27 ++++++++++++++++-- workflow/rules/ref.smk | 51 ++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 workflow/envs/gawk.yaml diff --git a/workflow/envs/gawk.yaml b/workflow/envs/gawk.yaml new file mode 100644 index 00000000..fa09e590 --- /dev/null +++ b/workflow/envs/gawk.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - gawk =5.1 diff --git 
a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 10daf61d..2df2e782 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -12,20 +12,43 @@ rule norm_bcf: "0.65.0/bio/bcftools/norm" +rule add_somatic_flag: + input: + bcf="results/final-calls/{group}.{set}.norm.bcf", + header_line="resources/somatic_flag_header_line.txt", + flag_bed="resources/genome.somatic_flag.bed.gz", + flag_bed_idx="resources/genome.somatic_flag.bed.gz.tbi", + output: + "results/final-calls/{group}.{set}.somatic_flag.norm.bcf", + log: + "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log" + conda: + "../envs/bcftools.yaml" + shell: + "( bcftools annotate " + " --annotations {input.flag_bed} " + " --mark-sites +SOMATIC " + " --columns CHROM,FROM,TO " + " --header-lines {input.header_line} " + " -O b -o {output} " + " {input.bcf} " + ") 2> {log}" + + rule merge_tumor_normal: input: calls=expand( "results/final-calls/{{group}}.{sets}.norm.bcf", sets=[ config["params"]["microphaser"]["variant_sets"]["normal"], - config["params"]["microphaser"]["variant_sets"]["tumor"], + config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag", ], ), index=expand( "results/final-calls/{{group}}.{sets}.norm.bcf.csi", sets=[ config["params"]["microphaser"]["variant_sets"]["normal"], - config["params"]["microphaser"]["variant_sets"]["tumor"], + config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag", ], ), output: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index cb54eeeb..0d6324a0 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -79,6 +79,57 @@ rule genome_faidx: "0.45.1/bio/samtools/faidx" +rule create_somatic_flag_header_line: + output: + "resources/somatic_flag_header_line.txt", + log: + "logs/create_somatic_flag_header_line.log" + shell: + """ + ( echo '##INFO=' > {output} ) 2> {log} + """ + + +rule create_genome_somatic_flag_bed: + input: + "resources/genome.fasta.fai", + output: + 
"resources/genome.somatic_flag.bed", + log: + "logs/create_genome_somatic_flag_bed.log" + conda: + "../envs/gawk.yaml" + cache: True + shell: + """ + ( awk 'BEGIN {{ OFS="\\t" }} {{ print $1,0,$2 }}' {input} > {output} ) 2> {log} + """ + + +rule bgzip_genome_somatic_flag_bed: + input: + "resources/genome.somatic_flag.bed", + output: + "resources/genome.somatic_flag.bed.gz", + log: + "logs/bgzip/genome.somatic_flag.log", + wrapper: + "v1.7.0/bio/bgzip" + + +rule tabix_genome_somatic_flag_bed: + input: + "resources/genome.somatic_flag.bed.gz", + output: + "resources/genome.somatic_flag.bed.gz.tbi", + conda: + "../envs/htslib.yaml" + log: + "logs/tabix/genome.somatic_flag.log", + shell: + "( tabix -p bed {input} ) 2> {log}" + + rule genome_dict: input: "resources/genome.fasta", From 218354c4a0db5b01ab3d10c49d9b9d8264ca00ce Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 14 Jul 2022 08:22:26 +0000 Subject: [PATCH 101/191] add proper logging to python scripts --- workflow/scripts/add_rna_info.py | 4 ++++ workflow/scripts/count_neoantigen_occurrences.py | 4 ++++ workflow/scripts/group_mhc_output.py | 3 +++ workflow/scripts/merge_data.py | 3 +++ workflow/scripts/merge_mhcflurry.py | 3 +++ workflow/scripts/parse_HLA_types.py | 4 ++++ workflow/scripts/sample_comp_plot.py | 4 ++++ workflow/scripts/tsv_to_xlsx.py | 3 ++- 8 files changed, 27 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/add_rna_info.py b/workflow/scripts/add_rna_info.py index bdcdcac2..b8690f86 100644 --- a/workflow/scripts/add_rna_info.py +++ b/workflow/scripts/add_rna_info.py @@ -1,3 +1,7 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + import pandas as pd ## load data table diff --git a/workflow/scripts/count_neoantigen_occurrences.py b/workflow/scripts/count_neoantigen_occurrences.py index 1b6050e4..a70caa36 100644 --- a/workflow/scripts/count_neoantigen_occurrences.py +++ b/workflow/scripts/count_neoantigen_occurrences.py @@ -1,3 +1,7 @@ +import sys + +sys.stderr = 
open(snakemake.log[0], "w") + import pandas as pd import glob diff --git a/workflow/scripts/group_mhc_output.py b/workflow/scripts/group_mhc_output.py index 5ce0cf15..fda5b360 100644 --- a/workflow/scripts/group_mhc_output.py +++ b/workflow/scripts/group_mhc_output.py @@ -1,4 +1,7 @@ import sys + +sys.stderr = open(snakemake.log[0], "w") + import os import pandas as pd import numpy as np diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py index b90af1cb..0142ba9f 100644 --- a/workflow/scripts/merge_data.py +++ b/workflow/scripts/merge_data.py @@ -1,4 +1,7 @@ import sys + +sys.stderr = open(snakemake.log[0], "w") + import os import pandas as pd import numpy as np diff --git a/workflow/scripts/merge_mhcflurry.py b/workflow/scripts/merge_mhcflurry.py index fcc6aba4..c0098cb9 100644 --- a/workflow/scripts/merge_mhcflurry.py +++ b/workflow/scripts/merge_mhcflurry.py @@ -1,4 +1,7 @@ import sys + +sys.stderr = open(snakemake.log[0], "w") + import os import pandas as pd import numpy as np diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py index 1824326d..ae6cb69f 100644 --- a/workflow/scripts/parse_HLA_types.py +++ b/workflow/scripts/parse_HLA_types.py @@ -1,3 +1,7 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + import pandas as pd # to get alleles that netMHCpan can handle, use its -listMHC option diff --git a/workflow/scripts/sample_comp_plot.py b/workflow/scripts/sample_comp_plot.py index 3c7c287d..d8dbf6e0 100644 --- a/workflow/scripts/sample_comp_plot.py +++ b/workflow/scripts/sample_comp_plot.py @@ -1,3 +1,7 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + import os import glob import pandas as pd diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py index 28a24594..5d9bf10f 100644 --- a/workflow/scripts/tsv_to_xlsx.py +++ b/workflow/scripts/tsv_to_xlsx.py @@ -1,7 +1,8 @@ import sys -import pandas as pd sys.stderr = open(snakemake.log[0], "w") +import pandas 
as pd + data = pd.read_csv(snakemake.input.tsv, sep="\t") data.to_excel(snakemake.output.xlsx, index=False) \ No newline at end of file From 9d6b58f91cda1ee0881d2de22fcbe6367eb513dc Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 14 Jul 2022 08:23:33 +0000 Subject: [PATCH 102/191] add proper logging to R script --- workflow/scripts/phylogeny.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflow/scripts/phylogeny.R b/workflow/scripts/phylogeny.R index 7f26becf..9e32c131 100644 --- a/workflow/scripts/phylogeny.R +++ b/workflow/scripts/phylogeny.R @@ -1,3 +1,7 @@ +log <- file(snakemake@log[[1]], open="wt") +sink(log) +sink(log, type="message") + library(phangorn) ## read the variant matrix From fdbd9dd4431c86e1df1e4c686ddd531ddfcd69be Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 14 Jul 2022 08:24:52 +0000 Subject: [PATCH 103/191] add logging capture to rules netMHCpan and netMHCIIpan --- workflow/rules/MHC_binding.smk | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index eb3724ea..571a7623 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -33,12 +33,14 @@ rule netMHCpan: pd.read_csv(input.alleles[0], sep="\t").iloc[0] ), shell: + "( " "if [ -s {input.peptides} ]; " "then " " {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " "else " " touch {output}; " - "fi" + "fi " + " ) 2> {log}" rule netMHCIIpan: @@ -58,12 +60,14 @@ rule netMHCIIpan: pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist() ), shell: + "( " "if [ -s {input.peptides} ]; " "then " " {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " "else " " touch {output}; " - "fi" + "fi " + " ) 2> {log}" rule parse_mhc_out: From da392e29f37503be4c89803a5a3d0c85aa5f6c68 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 19 Jul 2022 12:44:55 
+0000 Subject: [PATCH 104/191] harmonize microphaser logging paths --- workflow/rules/microphaser.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 2df2e782..9c747582 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -98,7 +98,7 @@ rule microphaser_normal: "results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv" ), log: - "logs/microphaser_germline/{group}/{normal_alias}.{normal_set}-{contig}.log", + "logs/microphaser_normal/{group}/{normal_alias}.{normal_set}-{contig}.log", conda: "../envs/microphaser.yaml" params: @@ -117,7 +117,7 @@ rule concat_normal_proteome: output: "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa", log: - "logs/microphaser/concat_normal_proteome/{group}.{normal_set}.log", + "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.log", shell: "cat {input} > {output} 2> {log}" @@ -129,7 +129,7 @@ rule build_normal_proteome_db: bin="results/microphaser/bin/{group}.{normal_set}.{mhc}.normal_proteome.bin", fasta="results/microphaser/fasta/{group}.{normal_set}.{mhc}.normal_proteome.peptides.fasta", log: - "logs/microphaser/build_normal_proteome_db/{group}.{normal_set}-{mhc}.log", + "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}-{mhc}.log", conda: "../envs/microphaser.yaml" params: @@ -137,7 +137,7 @@ rule build_normal_proteome_db: wildcards.mhc ], shell: - "microphaser build_reference -r {input} -o {output.bin} -l {params.length} --peptides {output.fasta} > {log} 2>&1" + "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}" rule microphaser_filter: @@ -177,7 +177,7 @@ rule concat_tsvs: output: "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv", log: - "logs/concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", + 
"logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", conda: "../envs/xsv.yaml" shell: From 873fe38753a2924a9d52d242bf85ae74ae810e3e Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 19 Jul 2022 12:46:20 +0000 Subject: [PATCH 105/191] fix default microphaser window_len to 45, 3x the default netMHCIIpan peptide_len of 15 --- config/config.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 31512865..aaa78af8 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -51,13 +51,11 @@ annotations: params: microphaser: - window_len: - 33 + # window_len should be at least 3 times the longest peptide_len specified below + window_len: 45 peptide_len: - netMHCpan: - 9 - netMHCIIpan: - 15 + netMHCpan: 9 + netMHCIIpan: 15 variant_sets: normal: "normal_only" tumor: "tumor_only" From ac8b6849fd0b5aeb3b7f8beed3fbaa938691c5d7 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 25 Jul 2022 11:24:35 +0000 Subject: [PATCH 106/191] clean up config params specification for microphaser --- .test/config/config.yaml | 51 +++++++++++++++------------------- config/config.yaml | 48 +++++++++++++++----------------- workflow/rules/MHC_binding.smk | 22 +++++++-------- workflow/rules/common.smk | 4 +-- workflow/rules/microphaser.smk | 12 +++----- 5 files changed, 61 insertions(+), 76 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 25ccfa45..8efb0dde 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -6,28 +6,6 @@ epitope_prediction: activate: true -affinity: - netMHCpan: - activate: true - params: "-BA -l 9 -s -xls" - # Please download netMHCpan manually from: - # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 - # To make the `netMHCpan` script work, you need to fix its first line in - # in addition to the other edits described for a complete install. 
To use - # the conda-provided tcsh installation, it needs to read (without quotes): - # "#!/usr/bin/env tcsh" - location: "../netMHCpan-4.1" - netMHCIIpan: - activate: false - params: "-length 15 -s -xls" - # Please download netMHCIIpan manually from: - # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 - # To make the `netMHCIIpan` script work, you need to fix its first line in - # in addition to the other edits described for a complete install. To use - # the conda-provided tcsh installation, it needs to read (without quotes): - # "#!/usr/bin/env tcsh" - location: "../netMHCIIpan-4.1" - ref: # Number of chromosomes to consider for calling. @@ -51,13 +29,28 @@ annotations: params: microphaser: - window_len: - 33 - peptide_len: - netMHCpan: - 9 - netMHCIIpan: - 15 events: tumor: "tumor_only" normal: "normal_only" + netMHCpan: + activate: true + peptide_len: 9 + params: "-BA -s" + # Please download netMHCpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + # To make the `netMHCpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCpan-4.1" + netMHCIIpan: + activate: false + peptide_len: 15 + params: "-BA -s" + # Please download netMHCIIpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 + # To make the `netMHCIIpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. 
To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCIIpan-4.1" diff --git a/config/config.yaml b/config/config.yaml index aaa78af8..a9978057 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,27 +7,6 @@ epitope_prediction: affinity: - netMHCpan: - activate: true - params: "-BA -l 9 -s -xls" - # Please download netMHCpan manually from: - # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 - # To make the `netMHCpan` script work, you need to fix its first line in - # in addition to the other edits described for a complete install. To use - # the conda-provided tcsh installation, it needs to read (without quotes): - # "#!/usr/bin/env tcsh" - location: "../netMHCpan-4.1" - netMHCIIpan: - activate: false - params: "-length 15 -s -xls" - # Please download netMHCIIpan manually from: - # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 - # To make the `netMHCIIpan` script work, you need to fix its first line in - # in addition to the other edits described for a complete install. To use - # the conda-provided tcsh installation, it needs to read (without quotes): - # "#!/usr/bin/env tcsh" - location: "../netMHCIIpan-4.1" - ref: # Number of chromosomes to consider for calling. @@ -52,10 +31,29 @@ annotations: params: microphaser: # window_len should be at least 3 times the longest peptide_len specified below - window_len: 45 - peptide_len: - netMHCpan: 9 - netMHCIIpan: 15 variant_sets: normal: "normal_only" tumor: "tumor_only" + netMHCpan: + activate: true + peptide_len: 9 + params: "-BA -s" + # Please download netMHCpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + # To make the `netMHCpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. 
To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCpan-4.1" + netMHCIIpan: + activate: false + peptide_len: 15 + params: "-BA -s" + # Please download netMHCIIpan manually from: + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 + # To make the `netMHCIIpan` script work, you need to fix its first line in + # in addition to the other edits described for a complete install. To use + # the conda-provided tcsh installation, it needs to read (without quotes): + # "#!/usr/bin/env tcsh" + location: "../netMHCIIpan-4.1" + diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 571a7623..fbbf1708 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -27,16 +27,15 @@ rule netMHCpan: conda: "../envs/tcsh.yaml" params: - extra=config["affinity"]["netMHCpan"]["params"], - netMHC=config["affinity"]["netMHCpan"]["location"], - alleles=lambda wc, input: ",".join( - pd.read_csv(input.alleles[0], sep="\t").iloc[0] - ), + extra=config["params"]["netMHCpan"]["params"], + netMHC=config["params"]["netMHCpan"]["location"], + length=config["params"]["netMHCpan"]["peptide_len"], + alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), shell: "( " "if [ -s {input.peptides} ]; " "then " - " {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + " {params.netMHC}/netMHCpan {params.extra} -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " "else " " touch {output}; " "fi " @@ -54,16 +53,15 @@ rule netMHCIIpan: conda: "../envs/tcsh.yaml" params: - extra=config["affinity"]["netMHCIIpan"]["params"], - netMHC=config["affinity"]["netMHCIIpan"]["location"], - alleles=lambda wc, input: ",".join( - pd.read_csv(input.alleles[0], sep="\t")["Allele"].tolist() - ), + extra=config["params"]["netMHCIIpan"]["params"], + 
netMHC=config["params"]["netMHCIIpan"]["location"], + length=config["params"]["netMHCIIpan"]["peptide_len"], + alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), shell: "( " "if [ -s {input.peptides} ]; " "then " - " {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + " {params.netMHC}/netMHCIIpan {params.extra} -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " "else " " touch {output}; " "fi " diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 097509c6..1e976731 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -111,10 +111,10 @@ def get_final_output(): None, [ "netMHCpan" - if is_activated("affinity/netMHCpan") + if is_activated("params/netMHCpan") else None, "netMHCIIpan" - if is_activated("affinity/netMHCIIpan") + if is_activated("params/netMHCIIpan") else None, ], ) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 9c747582..86af23d0 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -77,7 +77,7 @@ rule microphaser_tumor: conda: "../envs/microphaser.yaml" params: - window_length=config["params"]["microphaser"]["window_len"], + window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3, shell: "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " "< {input.track} > {output.mt_fasta} 2> {log}" @@ -102,7 +102,7 @@ rule microphaser_normal: conda: "../envs/microphaser.yaml" params: - window_length=config["params"]["microphaser"]["window_len"], + window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3, shell: "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} 
-w {params.window_length} " "< {input.track} > {output.wt_fasta} 2> {log}" @@ -133,9 +133,7 @@ rule build_normal_proteome_db: conda: "../envs/microphaser.yaml" params: - length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][ - wildcards.mhc - ], + length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"], shell: "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}" @@ -161,9 +159,7 @@ rule microphaser_filter: conda: "../envs/microphaser.yaml" params: - length=lambda wildcards: config["params"]["microphaser"]["peptide_len"][ - wildcards.mhc - ], + length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"], shell: "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}" From f0f3cd2338765fd6bc2a0cde59b1bb7fee1c5c6f Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 25 Jul 2022 11:27:00 +0000 Subject: [PATCH 107/191] debug HLA_LA output parsing, adapting to netMHCpan 4.1, extensive comments with linkouts, named inputs/outputs in rule --- workflow/rules/HLAtyping.smk | 6 +-- workflow/scripts/merge_data.py | 9 ++-- workflow/scripts/parse_HLA_types.py | 73 +++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 24 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index b2c389e9..5482e871 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -23,14 +23,14 @@ rule HLA_LA: rule parse_HLA_LA: input: - "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", + hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", output: - report( + hlaI=report( "results/HLA-LA/{group}.{alias}.hlaI.tsv", caption="../report/HLA_Types.rst", category="HLA-Typing(HLA-LA)", ), - report( + hlaII=report( "results/HLA-LA/{group}.{alias}.hlaII.tsv", caption="../report/HLA_Types.rst", 
category="HLA-Typing(HLA-LA)", diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py index 0142ba9f..ac5d21c1 100644 --- a/workflow/scripts/merge_data.py +++ b/workflow/scripts/merge_data.py @@ -2,10 +2,7 @@ sys.stderr = open(snakemake.log[0], "w") -import os import pandas as pd -import numpy as np - def select_columns(mhc): rank_cols = [c for c in mhc.columns if "Rank" in c] @@ -82,9 +79,9 @@ def diffEpitope(e1,e2): def main(): - info = pd.read_csv(snakemake.input[0], sep = '\t', dtype=str) - tumor = pd.read_csv(snakemake.input[1], sep = '\t') - normal = pd.read_csv(snakemake.input[2], sep = '\t') + info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str) + tumor = pd.read_csv(snakemake.input.neo, sep = '\t') + normal = pd.read_csv(snakemake.input.normal, sep = '\t') outfile = snakemake.output[0] merge(info, tumor, normal, outfile) diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py index ae6cb69f..8523f9ea 100644 --- a/workflow/scripts/parse_HLA_types.py +++ b/workflow/scripts/parse_HLA_types.py @@ -4,23 +4,66 @@ import pandas as pd -# to get alleles that netMHCpan can handle, use its -listMHC option -hlaI = ["A","B","C", "E", "G"] +# To know which alleles netMHCpan can handle, use its -listMHC option. +HLAI = ["A","B","C", "E", "G"] -# to get alleles that netMHCIIpan can handle, use its -list option -hlaII = ["DRB1", "DRB3", "DRB4", "DRB5", "DPA1", "DPB1", "DQA1", "DQB1"] +# To know which alleles netMHCIIpan can handle, use its -list option. +# DRB alleles need to be formatted differently from DP and DQ alleles, +# so we specify them separately. 
+DRB = ["DRB1", "DRB3", "DRB4", "DRB5"] +ALPHA_BETA = ["DPA1", "DPB1", "DQA1", "DQB1"] -hlas = pd.read_csv(snakemake.input[0], sep='\t') +hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep='\t') -hlasI = hlas[hlas.Locus.isin(hlaI)] -hlasI["Allele"]="HLA-" + hlasI.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','') -hlasI = hlasI[["Allele"]].drop_duplicates() -hlasI.to_csv(snakemake.output[0], sep='\t', index=False) +# the Allele column can contain multiple ";"-separated entries for the +# same locus +hlas.loc[:, "Allele"] = hlas.Allele.str.split(pat=";") +hlas["alternative"] = hlas.Allele.apply( lambda x: range( len(x) ) ) +hlas = hlas.explode(["Allele", "alternative"]) -hlasII = hlas[hlas.Locus.isin(hlaII)] -hlasII["HLA"] = hlasII.Locus.str[0:2] -hlasII["Allele"] = hlasII.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','') +# reformat to netMHCpan allele list format: +# * https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/allele.list +# it needs to be in the format of the first column of the above list, as explained in +# the "Instructions" tab under "MHC SELECTION" point "2)" at: +# * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 +hlaI_alleles = hlas.loc[hlas["Locus"].isin(HLAI), "Allele"].str.replace("([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True).drop_duplicates() +hlaI_alleles.to_csv(snakemake.output.hlaI, sep='\t', index=False, header=False) -hlasII = pd.DataFrame("HLA-" + hlasII.groupby(["HLA","Chromosome"])["Allele"].apply(lambda x: "-".join(x)).reset_index()["Allele"]).drop_duplicates() -hlasII.loc[hlasII.Allele.str.contains("DRB"), "Allele"] = hlasII[hlasII.Allele.str.contains("DRB")]["Allele"].str.replace("HLA-DRB1","DRB1_") -hlasII.to_csv(snakemake.output[1], sep='\t', index=False) +# reformat to netMHCIIpan allele list format: +# * https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/alleles_name.list +# 
contrary to the format in that list, alleles actually need to be formatted like this, +# with s found in the HLA-LA "Locus" column and syntax for the sub-numbering (only +# the 1st and 2nd sub-number are used) according to the official nomenclature (see: +# https://ars.els-cdn.com/content/image/1-s2.0-S0006497120405555-gr2.jpg ): +# * DRB alleles: "_" +# * DP and DQ alleles (alpha means A and beta means B in the gene name, for example DPA): +# "HLA--" +# This format was determined by manually selecting combinations above the +# "type a list of molecules names" field of the "Submission" tab at: +# * https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 + +# TODO: check whether Jan's previous parsing of DRB alleles into this format is necessary: +# * example: DRB1_1501-DRB30101-DRB40301N +# * "DRB1_-DRB3-DRB4" +drb_alleles = hlas.loc[hlas["Locus"].isin(DRB)] +hlaII_alleles = drb_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True).drop_duplicates() + +# handle alleles where a combination of alpha and beta always exists +alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA)] +alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True) +# we need a variable to group alpha and beta of the same gene combination together +alpha_beta_alleles["gene_group"] = alpha_beta_alleles.Locus.str[0:2] +# we need to handle cases where we had multiple allele entries in an +# alpha or beta locus, adding in a duplicate of the corresponding locus +alleles_to_duplicate = alpha_beta_alleles.loc[ alpha_beta_alleles["alternative"] > 0 & alpha_beta_alleles["Locus"].str.startswith("D[PQ]"), ["Locus", "Chromosome", "alternative"] ].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}}) +alleles_to_insert = alleles_to_duplicate.merge(alpha_beta_alleles.drop('alternative', axis='columns'), on=['Locus', 'Chromosome'], how='left') 
+alpha_beta_alleles = pd.concat([alpha_beta_alleles, alleles_to_insert]).drop_duplicates() + +hlaII_alleles = hlaII_alleles.append( + alpha_beta_alleles\ + .groupby(["gene_group", "Chromosome", "alternative"])["Allele"]\ + .transform(lambda x: f"HLA-{'-'.join(x)}")\ + .drop_duplicates() + ) + +hlaII_alleles.to_csv(snakemake.output.hlaII, sep='\t', index=False, header=False) From 08bbebd575b11be805ba5ee75d9c422357f3bd6b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 27 Jul 2022 15:12:32 +0000 Subject: [PATCH 108/191] clean up parse_HLA_types.py --- workflow/scripts/parse_HLA_types.py | 61 +++++++++++++++++++---------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_HLA_types.py index 8523f9ea..c93a943e 100644 --- a/workflow/scripts/parse_HLA_types.py +++ b/workflow/scripts/parse_HLA_types.py @@ -5,20 +5,20 @@ import pandas as pd # To know which alleles netMHCpan can handle, use its -listMHC option. -HLAI = ["A","B","C", "E", "G"] +HLAI = {"A", "B", "C", "E", "G"} # To know which alleles netMHCIIpan can handle, use its -list option. # DRB alleles need to be formatted differently from DP and DQ alleles, # so we specify them separately. 
-DRB = ["DRB1", "DRB3", "DRB4", "DRB5"] -ALPHA_BETA = ["DPA1", "DPB1", "DQA1", "DQB1"] +DRB = {"DRB1", "DRB3", "DRB4", "DRB5"} +ALPHA_BETA = {"DPA1", "DPB1", "DQA1", "DQB1"} -hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep='\t') +hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep="\t") # the Allele column can contain multiple ";"-separated entries for the # same locus -hlas.loc[:, "Allele"] = hlas.Allele.str.split(pat=";") -hlas["alternative"] = hlas.Allele.apply( lambda x: range( len(x) ) ) +hlas.loc[:, "Allele"] = hlas["Allele"].str.split(pat=";") +hlas["alternative"] = hlas["Allele"].apply(lambda x: range(len(x))) hlas = hlas.explode(["Allele", "alternative"]) # reformat to netMHCpan allele list format: @@ -26,8 +26,12 @@ # it needs to be in the format of the first column of the above list, as explained in # the "Instructions" tab under "MHC SELECTION" point "2)" at: # * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 -hlaI_alleles = hlas.loc[hlas["Locus"].isin(HLAI), "Allele"].str.replace("([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True).drop_duplicates() -hlaI_alleles.to_csv(snakemake.output.hlaI, sep='\t', index=False, header=False) +hlaI_alleles = ( + hlas.loc[hlas["Locus"].isin(HLAI), "Allele"] + .str.replace(r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True) + .drop_duplicates() +) +hlaI_alleles.to_csv(snakemake.output.hlaI, sep="\t", index=False, header=False) # reformat to netMHCIIpan allele list format: # * https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/alleles_name.list @@ -46,24 +50,41 @@ # * example: DRB1_1501-DRB30101-DRB40301N # * "DRB1_-DRB3-DRB4" drb_alleles = hlas.loc[hlas["Locus"].isin(DRB)] -hlaII_alleles = drb_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True).drop_duplicates() +hlaII_alleles = ( + drb_alleles["Allele"] + .str.replace(r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True) + .drop_duplicates() +) # 
handle alleles where a combination of alpha and beta always exists alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA)] -alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace("([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True) +alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace( + r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True +) # we need a variable to group alpha and beta of the same gene combination together -alpha_beta_alleles["gene_group"] = alpha_beta_alleles.Locus.str[0:2] +alpha_beta_alleles["gene_group"] = alpha_beta_alleles["Locus"].str[0:2] # we need to handle cases where we had multiple allele entries in an # alpha or beta locus, adding in a duplicate of the corresponding locus -alleles_to_duplicate = alpha_beta_alleles.loc[ alpha_beta_alleles["alternative"] > 0 & alpha_beta_alleles["Locus"].str.startswith("D[PQ]"), ["Locus", "Chromosome", "alternative"] ].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}}) -alleles_to_insert = alleles_to_duplicate.merge(alpha_beta_alleles.drop('alternative', axis='columns'), on=['Locus', 'Chromosome'], how='left') -alpha_beta_alleles = pd.concat([alpha_beta_alleles, alleles_to_insert]).drop_duplicates() +select_mult_all = alpha_beta_alleles["alternative"] > 0 +select_dpq_loci = alpha_beta_alleles["Locus"].str.startswith("D[PQ]") +mult_all_per_loc_selection = select_mult_all & select_dpq_loci +alleles_to_duplicate = alpha_beta_alleles.loc[ + mult_all_per_loc_selection, + ["Locus", "Chromosome", "alternative"], +].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}}) +alleles_to_insert = alleles_to_duplicate.merge( + alpha_beta_alleles.drop("alternative", axis="columns"), + on=["Locus", "Chromosome"], + how="left", +) +alpha_beta_alleles = pd.concat( + [alpha_beta_alleles, alleles_to_insert] +).drop_duplicates() hlaII_alleles = hlaII_alleles.append( - alpha_beta_alleles\ - 
.groupby(["gene_group", "Chromosome", "alternative"])["Allele"]\ - .transform(lambda x: f"HLA-{'-'.join(x)}")\ - .drop_duplicates() - ) + alpha_beta_alleles.groupby(["gene_group", "Chromosome", "alternative"])["Allele"] + .transform(lambda x: f"HLA-{'-'.join(x)}") + .drop_duplicates() +) -hlaII_alleles.to_csv(snakemake.output.hlaII, sep='\t', index=False, header=False) +hlaII_alleles.to_csv(snakemake.output.hlaII, sep="\t", index=False, header=False) From 51b17e15d38b59c7b4bfeb38af4823b0a76ea644 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 27 Jul 2022 20:08:20 +0000 Subject: [PATCH 109/191] initial suggestions from Till --- workflow/scripts/merge_data.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_data.py index ac5d21c1..23f92e37 100644 --- a/workflow/scripts/merge_data.py +++ b/workflow/scripts/merge_data.py @@ -4,10 +4,10 @@ import pandas as pd -def select_columns(mhc): +def select_columns(mhc: pd.DataFrame) -> pd.DataFrame: rank_cols = [c for c in mhc.columns if "Rank" in c] affinity_cols = [c for c in mhc.columns if "nM" in c] - mhc_cols = ["Pos"] + ["ID"] + ["Peptide"] + rank_cols + affinity_cols + ["NB"] + mhc_cols = ["Pos", "ID", "Peptide"] + rank_cols + affinity_cols + ["NB"] mhc = mhc[mhc_cols] mhc["Rank_min"] = mhc[rank_cols].min(axis=1) mhc["Aff_min"] = mhc[affinity_cols].min(axis=1) @@ -17,7 +17,7 @@ def select_columns(mhc): mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","") return mhc -def merge(info, tumor, normal, outfile): +def merge(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame: tumor = select_columns(tumor) normal = select_columns(normal) id_length = len(tumor.ID[0]) @@ -60,7 +60,7 @@ def merge(info, tumor, normal, outfile): ### Remove Duplicate kmers data = data.drop_duplicates(["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"]) - data.to_csv(outfile, index=False, 
sep = '\t') + return data ## highlight the difference between mutated neopeptide and wildtype @@ -83,7 +83,8 @@ def main(): tumor = pd.read_csv(snakemake.input.neo, sep = '\t') normal = pd.read_csv(snakemake.input.normal, sep = '\t') outfile = snakemake.output[0] - merge(info, tumor, normal, outfile) + data = merge(info, tumor, normal) + data.to_csv(outfile, index=False, sep = '\t') if __name__ == '__main__': sys.exit(main()) From c75855d10e5e58822c759298ba3907d0898192d2 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 29 Jul 2022 08:55:32 +0000 Subject: [PATCH 110/191] further cleanup of config --- .test/config/config.yaml | 11 +---------- config/config.yaml | 12 +----------- workflow/rules/common.smk | 2 +- 3 files changed, 3 insertions(+), 22 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 8efb0dde..475a31f0 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -2,11 +2,10 @@ samples: "config/samples.tsv" units: "config/units.tsv" -epitope_prediction: +neoantigen_prediction: activate: true - ref: # Number of chromosomes to consider for calling. # The first n entries of the FASTA will be considered. @@ -19,14 +18,6 @@ ref: build: GRCh38 -annotations: - vep: - params: "--everything" - plugins: - # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt". - - LoFtool - params: microphaser: events: diff --git a/config/config.yaml b/config/config.yaml index a9978057..af13f61b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -2,12 +2,10 @@ samples: "config/samples.tsv" units: "config/units.tsv" -epitope_prediction: +neoantigen_prediction: activate: true -affinity: - ref: # Number of chromosomes to consider for calling. # The first n entries of the FASTA will be considered. 
@@ -20,14 +18,6 @@ ref: build: GRCh38 -annotations: - vep: - params: "--everything" - plugins: - # Add any plugin from https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html - # Plugin args can be passed as well, e.g. "LoFtool,path/to/custom/scores.txt". - - LoFtool - params: microphaser: # window_len should be at least 3 times the longest peptide_len specified below diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 1e976731..e6262532 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -97,7 +97,7 @@ def get_final_output(): (samples["group"] == group) & (samples["alias"].str.match("tumor")), "alias", ] - if config["epitope_prediction"]["activate"]: + if config["neoantigen_prediction"]["activate"]: sequencing_types = pd.unique( units.loc[units["sample_name"].isin(smps), "sequencing_type"] ) From 6c6b2bd74f3bec7f765426d508f578bee8bffbe1 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 29 Jul 2022 09:03:13 +0000 Subject: [PATCH 111/191] further config cleanup, including schema --- .test/config/config.yaml | 4 +-- config/config.yaml | 4 +-- workflow/rules/MHC_binding.smk | 8 ++--- workflow/schemas/config.schema.yaml | 55 ++++++++++++++--------------- 4 files changed, 35 insertions(+), 36 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 475a31f0..c5ed610c 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -26,7 +26,7 @@ params: netMHCpan: activate: true peptide_len: 9 - params: "-BA -s" + extra: "" # Please download netMHCpan manually from: # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 # To make the `netMHCpan` script work, you need to fix its first line in @@ -37,7 +37,7 @@ params: netMHCIIpan: activate: false peptide_len: 15 - params: "-BA -s" + extra: "" # Please download netMHCIIpan manually from: # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 # To make the `netMHCIIpan` script work, you need to fix its first line 
in diff --git a/config/config.yaml b/config/config.yaml index af13f61b..29da7260 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -27,7 +27,7 @@ params: netMHCpan: activate: true peptide_len: 9 - params: "-BA -s" + extra: "" # Please download netMHCpan manually from: # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 # To make the `netMHCpan` script work, you need to fix its first line in @@ -38,7 +38,7 @@ params: netMHCIIpan: activate: false peptide_len: 15 - params: "-BA -s" + extra: "" # Please download netMHCIIpan manually from: # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 # To make the `netMHCIIpan` script work, you need to fix its first line in diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index fbbf1708..5c596d76 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -27,7 +27,7 @@ rule netMHCpan: conda: "../envs/tcsh.yaml" params: - extra=config["params"]["netMHCpan"]["params"], + extra=config["params"]["netMHCpan"]["extra"], netMHC=config["params"]["netMHCpan"]["location"], length=config["params"]["netMHCpan"]["peptide_len"], alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), @@ -35,7 +35,7 @@ rule netMHCpan: "( " "if [ -s {input.peptides} ]; " "then " - " {params.netMHC}/netMHCpan {params.extra} -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + " {params.netMHC}/netMHCpan {params.extra} -BA -s -l {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " "else " " touch {output}; " "fi " @@ -53,7 +53,7 @@ rule netMHCIIpan: conda: "../envs/tcsh.yaml" params: - extra=config["params"]["netMHCIIpan"]["params"], + extra=config["params"]["netMHCIIpan"]["extra"], netMHC=config["params"]["netMHCIIpan"]["location"], length=config["params"]["netMHCIIpan"]["peptide_len"], alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), 
@@ -61,7 +61,7 @@ rule netMHCIIpan: "( " "if [ -s {input.peptides} ]; " "then " - " {params.netMHC}/netMHCIIpan {params.extra} -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " + " {params.netMHC}/netMHCIIpan {params.extra} -BA -s -length {params.length} -xls -xlsfile {output} -a {params.alleles} -f {input.peptides} > {log}; " "else " " touch {output}; " "fi " diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 8b960444..dad1a71e 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -42,7 +42,7 @@ properties: - build - n_chromosomes - epitope_prediction: + neoantigen_prediction: type: object properties: activate: @@ -51,36 +51,12 @@ properties: affinity: type: object properties: - netMHCpan: - type: object - properties: - activate: - type: boolean - params: - type: string - netMHCIIpan: - type: object - properties: - activate: - type: boolean - params: - type: string - params: type: object properties: microphaser: type: object properties: - window_len: - type: integer - peptide_len: - type: object - properties: - netMHCpan: - type: integer - netMHCIIpan: - type: integer variant_sets: type: object properties: @@ -91,12 +67,35 @@ properties: required: - normal - tumor + netMHCpan: + type: object + properties: + activate: + type: boolean + peptide_len: + type: integer + extra: + type: string + location: + type: string + required: + - activate + netMHCIIpan: + type: object + properties: + activate: + type: boolean + peptide_len: + type: integer + extra: + type: string required: - - window_len - - peptide_len - - variant_sets + - activate required: - microphaser + - netMHCpan + - netMHCIIpan + required: - samples From 64fc352fa28816d4346c90e06c73138a5fb12957 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 29 Jul 2022 09:03:46 +0000 Subject: [PATCH 112/191] remove traces of attempted mhcflurry implementation --- 
workflow/rules/MHC_binding.smk | 44 ------------------- workflow/scripts/merge_mhcflurry.py | 66 ----------------------------- 2 files changed, 110 deletions(-) delete mode 100644 workflow/scripts/merge_mhcflurry.py diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 5c596d76..50b98a6b 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -1,21 +1,3 @@ -# rule mhcflurry: -# input: -# peptides="results/microphaser/fasta/{sample}/filtered/{sample}.{contig}.{peptide_type}.fa", -# alleles="results/optitype/{sample}/hla_alleles_{sample}.tsv", -# wt_alleles=get_germline_optitype -# output: -# "results/mhcflurry/{sample}/{contig}/output.{peptide_type}.csv" -# log: -# "logs/mhcflurry/{sample}-{contig}-{peptide_type}.log" -# run: -# if "wt" in input.peptides: -# alleles = ",".join(pd.read_csv(input.wt_alleles, sep="\t").iloc[0]) -# else: -# alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0]) -# cmd = "if [ -s {input.peptides} ]; then mhctools --mhc-predictor mhcflurry --mhc-alleles {alleles} --input-fasta-file {input.peptides} --output-csv {output} > {log}; else touch {output}; fi" -# shell(cmd) - - rule netMHCpan: input: peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa", @@ -82,21 +64,6 @@ rule parse_mhc_out: "../scripts/group_mhc_output.py" -# rule parse_mhcflurry: -# input: -# expand("results/mhcflurry/{{sample}}/{contig}/output.{{peptide_type}}.csv", contig=contigs) -# output: -# "results/mhcflurry/{sample}/{sample}.mhc.{peptide_type}.csv" -# wildcard_constraints: -# group="wt|mt" -# log: -# "logs/parse-mhc/mhcflurry-{sample}-{peptide_type}.log" -# conda: -# "../envs/xsv.yaml" -# shell: -# "xsv cat rows -d ',' {input} | cut --complement -f2,7,8 > {output}" - - rule mhc_csv_table: input: info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv", @@ -114,17 +81,6 @@ rule mhc_csv_table: 
"../scripts/merge_data.py" -# rule mhcflurry_table: -# input: -# info="results/microphaser/info/{sample}/filtered/mhcflurry/{sample}.tsv", -# neo="results/mhcflurry/{sample}/{sample}.mhc.neo.tsv", -# normal="results/mhcflurry/{sample}/{sample}.mhc.normal.tsv" -# output: -# report("results/neoantigens/mhcflurry/{sample}.WES.tsv", caption="../report/WES_results.rst", category="Results WES (MHCFlurry)") -# script: -# "../scripts/merge_mhcflurry.py" - - rule add_RNA_info: input: counts="results/kallisto/{group}.{tumor_alias}", diff --git a/workflow/scripts/merge_mhcflurry.py b/workflow/scripts/merge_mhcflurry.py deleted file mode 100644 index c0098cb9..00000000 --- a/workflow/scripts/merge_mhcflurry.py +++ /dev/null @@ -1,66 +0,0 @@ -import sys - -sys.stderr = open(snakemake.log[0], "w") - -import os -import pandas as pd -import numpy as np - - -## highlight the difference between mutated neopeptide and wildtype -def diffEpitope(e1,e2): - if str(e2) == 'nan': - return(e1) - e1 = str(e1) - e2 = str(e2) - diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]] - e_new = e1 - e2_new = e2 - for p in diff_pos: - e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:] - e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:] - return(e_new) - -info = pd.read_csv(snakemake.input[0]) -tumor = pd.read_csv(snakemake.input[1]) -normal = pd.read_csv(snakemake.input[2]) -outfile = snakemake.output[0] - - -tumor = tumor[["source_sequence_name","peptide","allele","affinity","percentile_rank"]] -tumor = tumor.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index() -tumor.columns = tumor.columns.map("-".join) -tumor = tumor.rename(columns={col: col.replace("-","") for col in tumor.columns if col.endswith("-")}) - -normal = normal[["source_sequence_name","peptide","allele","affinity","percentile_rank"]] -normal = normal.pivot_table(["affinity","percentile_rank"],["source_sequence_name","peptide"],"allele").reset_index() -normal.columns = 
normal.columns.map("-".join) -normal = normal.rename(columns={col: col.replace("-","") for col in normal.columns if col.endswith("-")}) - -merged = tumor.merge(normal, how="left", on=["source_sequence_name"]) - -merged = merged.rename(columns={col: col.replace("_y","_normal") for col in merged.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged.columns}) -## add info -info = info.rename(columns={"id":"ID","gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"}) -merged_dataframe = merged.merge(info, how="left", left_on="source_sequence_name", right_on="ID") - -merged_dataframe["peptide_tumor"]=merged_dataframe[["peptide_tumor","peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1) -## Are all possible variants in the peptide ("Cis") or not ("Trans") -merged_dataframe["Variant_Orientation"] = "Cis" -trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar -merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans" - -## check misssense/silent mutation status -nonsilent = merged_dataframe.peptide_tumor != merged_dataframe.peptide_normal -merged_dataframe = merged_dataframe[nonsilent] -data = merged_dataframe.drop_duplicates(subset=["Gene_ID","offset","peptide_tumor","Somatic_AminoAcid_Change"]) - -### Delete Stop-Codon including peptides -data = data[data.peptide_tumor.str.count("x") == 0] -data = data[data.peptide_tumor.str.count("X") == 0] -data.to_csv(outfile, index=False, sep = '\t') - - - - - From 31b9bddd025b595b92ebae204b8fb7eb3d5f78d3 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Sat, 30 Jul 2022 19:09:30 +0000 Subject: [PATCH 113/191] update microphaser to bug-fixed `v0.5.0` --- workflow/envs/microphaser.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml index 3237608c..00657086 100644 --- a/workflow/envs/microphaser.yaml +++ 
b/workflow/envs/microphaser.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - microphaser =0.4 + - microphaser =0.5 From dfbf9a37def516dc4c8dd7a57d6b6b7e5cf91633 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Sat, 30 Jul 2022 19:13:36 +0000 Subject: [PATCH 114/191] use bioconda `hla-la` recipe --- workflow/envs/hla_la.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/workflow/envs/hla_la.yaml b/workflow/envs/hla_la.yaml index 4b94aaef..6a24bc29 100644 --- a/workflow/envs/hla_la.yaml +++ b/workflow/envs/hla_la.yaml @@ -1,10 +1,5 @@ channels: - conda-forge - bioconda - - jafors dependencies: - - hla-la ==1.0.5 - - samtools ==1.10 - - bamtools ==2.5.1 - - boost-cpp ==1.74.0 - - r-base =4 + - hla-la ==1.0.3 From fc80d5c9c35d07a55c2fe529b5e1ebf5738c2c63 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 2 Aug 2022 19:11:18 +0000 Subject: [PATCH 115/191] rename group_mhc_output.py to clean_mhc_output.py and consistently use pandas for parsing --- workflow/rules/MHC_binding.smk | 8 ++--- workflow/scripts/clean_mhc_output.py | 54 ++++++++++++++++++++++++++++ workflow/scripts/group_mhc_output.py | 31 ---------------- 3 files changed, 58 insertions(+), 35 deletions(-) create mode 100644 workflow/scripts/clean_mhc_output.py delete mode 100644 workflow/scripts/group_mhc_output.py diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 50b98a6b..5a0ff9bc 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -50,18 +50,18 @@ rule netMHCIIpan: " ) 2> {log}" -rule parse_mhc_out: +rule clean_mhc_out: input: expand( - "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.xls", + "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv", contig=contigs, ), output: - "results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv", + 
joined_mhc_out="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.{peptide_type}.tsv", log: "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log", script: - "../scripts/group_mhc_output.py" + "../scripts/clean_mhc_output.py" rule mhc_csv_table: diff --git a/workflow/scripts/clean_mhc_output.py b/workflow/scripts/clean_mhc_output.py new file mode 100644 index 00000000..fcf360a6 --- /dev/null +++ b/workflow/scripts/clean_mhc_output.py @@ -0,0 +1,54 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import os +import pandas as pd +# assumptions of this script about netMHCpan or netMHCIIpan: +# * version 4.1 +# * output generated via `-xls` option +# * generated with the `-BA` option to include binding affinity prediction + +# The mapping of index column names used here to original names in netMHCpan files is: +INDEX_NAMES = {'Pos': 'position_in_protein_sequence', 'Peptide': 'peptide_sequence', 'ID': 'peptide_ID', 'Ave': 'average_el_score', 'NB': 'number_of_binders'} +# The mapping of column names used here to original names in netMHCpan files is: +COLUMN_NAMES = {'BA-score': 'binding_affinity_score', 'BA_Rank': 'binding_affinity_percent_rank', 'EL-score': 'elution_ligang_score', 'EL_Rank': 'elution_ligand_percent_rank' , 'core': 'binding_core', 'icore': 'interaction_core'} + +def parse_file(mhc_in: FileIO): + """ + Parse an netMHCpan or netMHCIIpan output file from the `-xls -xlsfile ` + directive into a tidy pandas data frame. + """ + if os.path.getsize(mhc_in) == 0: + # Short-circuit empty files, but generate correct header. + return pd.DataFrame( columns = list(COLUMN_NAMES.values()) + ["allele"] + list(INDEX_NAMES.values()) ) + + # It's a compound header over two rows and a compound row index in the initial + # three and final two columns of the table. For some reason, the final two + # columns are added to the index but not removed from the table, so we do this + # manually with `.iloc[]``. 
+ data = pd.read_csv(mhc_in, sep="\t", header = [0, 1], index_col=[0, 1, 2, -2, -1]).iloc[:, :-2] + + # With two lines of header parsed into a MultiIndex, pandas only uses the + # first column name in index_col as an entry. Obviously the following code + # assumes that these are the first three and last two columns of the data file. + data.index.names = list(INDEX_NAMES.values()) + + # Entries of the header MultiIndex need to be fixed, there doesn't seem to be + # any way to automatically do this during read_csv. + cols = pd.DataFrame(data.columns.to_list(), columns = ['allele', 'info']) + + # fix up the columns and reassign + cols.loc[cols['allele'].str.endswith("_level_0"), 'allele'] = pd.NA + cols = cols.fillna(method="ffill") + data.columns = pd.MultiIndex.from_frame(cols) + + # Turn into longer table with one HLA Allele per row instead of MultiIndex + # header, rename columns to something readable and turn index into columns. + data = data.stack(level='allele').rename(columns = COLUMN_NAMES).reset_index() + + return data + +all_data = pd.concat((parse_file(f) for f in snakemake.input), axis='index') + +all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False) \ No newline at end of file diff --git a/workflow/scripts/group_mhc_output.py b/workflow/scripts/group_mhc_output.py deleted file mode 100644 index fda5b360..00000000 --- a/workflow/scripts/group_mhc_output.py +++ /dev/null @@ -1,31 +0,0 @@ -import sys - -sys.stderr = open(snakemake.log[0], "w") - -import os -import pandas as pd -import numpy as np - -first = True -out = open(snakemake.output[0], "w") -for e in snakemake.input: - if os.path.getsize(e) > 0: - mhcout = open(e, 'r') - alleles = next(mhcout).split('\t') - header = next(mhcout).rstrip().split('\t') - if first: - allele = '' - for i in range(0, len(header)): - #print(header[i].rstrip()) - if i < len(alleles): - #print(alleles[i]) - if alleles[i] != '': - allele = alleles[i].rstrip() + '_' - header[i] = allele + header[i].rstrip() 
- header[len(header) -1] = "NB" - first = False - #print(header) - out.write('\t'.join(header) + '\n') - for line in mhcout: - out.write(line) -out.close() From 4be7c35584b83bde0582ac4fd7fa6d58e3dfa5d9 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 2 Aug 2022 19:11:53 +0000 Subject: [PATCH 116/191] black clean_mhc_output.py --- workflow/scripts/clean_mhc_output.py | 44 +++++++++++++++++++++------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/workflow/scripts/clean_mhc_output.py b/workflow/scripts/clean_mhc_output.py index fcf360a6..8ab28fbe 100644 --- a/workflow/scripts/clean_mhc_output.py +++ b/workflow/scripts/clean_mhc_output.py @@ -4,15 +4,30 @@ import os import pandas as pd + # assumptions of this script about netMHCpan or netMHCIIpan: # * version 4.1 # * output generated via `-xls` option # * generated with the `-BA` option to include binding affinity prediction # The mapping of index column names used here to original names in netMHCpan files is: -INDEX_NAMES = {'Pos': 'position_in_protein_sequence', 'Peptide': 'peptide_sequence', 'ID': 'peptide_ID', 'Ave': 'average_el_score', 'NB': 'number_of_binders'} +INDEX_NAMES = { + "Pos": "position_in_protein_sequence", + "Peptide": "peptide_sequence", + "ID": "peptide_ID", + "Ave": "average_el_score", + "NB": "number_of_binders", +} # The mapping of column names used here to original names in netMHCpan files is: -COLUMN_NAMES = {'BA-score': 'binding_affinity_score', 'BA_Rank': 'binding_affinity_percent_rank', 'EL-score': 'elution_ligang_score', 'EL_Rank': 'elution_ligand_percent_rank' , 'core': 'binding_core', 'icore': 'interaction_core'} +COLUMN_NAMES = { + "BA-score": "binding_affinity_score", + "BA_Rank": "binding_affinity_percent_rank", + "EL-score": "elution_ligang_score", + "EL_Rank": "elution_ligand_percent_rank", + "core": "binding_core", + "icore": "interaction_core", +} + def parse_file(mhc_in: FileIO): """ @@ -21,13 +36,19 @@ def parse_file(mhc_in: FileIO): """ if 
os.path.getsize(mhc_in) == 0: # Short-circuit empty files, but generate correct header. - return pd.DataFrame( columns = list(COLUMN_NAMES.values()) + ["allele"] + list(INDEX_NAMES.values()) ) - + return pd.DataFrame( + columns=list(COLUMN_NAMES.values()) + + ["allele"] + + list(INDEX_NAMES.values()) + ) + # It's a compound header over two rows and a compound row index in the initial # three and final two columns of the table. For some reason, the final two # columns are added to the index but not removed from the table, so we do this # manually with `.iloc[]``. - data = pd.read_csv(mhc_in, sep="\t", header = [0, 1], index_col=[0, 1, 2, -2, -1]).iloc[:, :-2] + data = pd.read_csv( + mhc_in, sep="\t", header=[0, 1], index_col=[0, 1, 2, -2, -1] + ).iloc[:, :-2] # With two lines of header parsed into a MultiIndex, pandas only uses the # first column name in index_col as an entry. Obviously the following code @@ -36,19 +57,20 @@ def parse_file(mhc_in: FileIO): # Entries of the header MultiIndex need to be fixed, there doesn't seem to be # any way to automatically do this during read_csv. - cols = pd.DataFrame(data.columns.to_list(), columns = ['allele', 'info']) + cols = pd.DataFrame(data.columns.to_list(), columns=["allele", "info"]) # fix up the columns and reassign - cols.loc[cols['allele'].str.endswith("_level_0"), 'allele'] = pd.NA + cols.loc[cols["allele"].str.endswith("_level_0"), "allele"] = pd.NA cols = cols.fillna(method="ffill") data.columns = pd.MultiIndex.from_frame(cols) - + # Turn into longer table with one HLA Allele per row instead of MultiIndex # header, rename columns to something readable and turn index into columns. 
- data = data.stack(level='allele').rename(columns = COLUMN_NAMES).reset_index() + data = data.stack(level="allele").rename(columns=COLUMN_NAMES).reset_index() return data -all_data = pd.concat((parse_file(f) for f in snakemake.input), axis='index') -all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False) \ No newline at end of file +all_data = pd.concat((parse_file(f) for f in snakemake.input), axis="index") + +all_data.to_csv(snakemake.output.joined_mhc_out, sep="\t", index=False) From d9e1f0f9dc7f177b7f89a2516a8e5f9b1151c6d5 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 3 Aug 2022 08:33:01 +0000 Subject: [PATCH 117/191] rename clean_mhc_output to tidy_mhc_output --- workflow/rules/MHC_binding.smk | 2 +- workflow/scripts/{clean_mhc_output.py => tidy_mhc_output.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename workflow/scripts/{clean_mhc_output.py => tidy_mhc_output.py} (100%) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 5a0ff9bc..4e8de82d 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -50,7 +50,7 @@ rule netMHCIIpan: " ) 2> {log}" -rule clean_mhc_out: +rule tidy_mhc_out: input: expand( "results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv", diff --git a/workflow/scripts/clean_mhc_output.py b/workflow/scripts/tidy_mhc_output.py similarity index 100% rename from workflow/scripts/clean_mhc_output.py rename to workflow/scripts/tidy_mhc_output.py From a105c04af24ed36107ec817a513e30c325c432f1 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 3 Aug 2022 08:33:53 +0000 Subject: [PATCH 118/191] rename netMHCpan and netMHCIIpan -xls output to `.tsv` to reflect that these are actually tab separated plain text files --- workflow/rules/MHC_binding.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 4e8de82d..b1418d4f 100644 --- 
a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -3,7 +3,7 @@ rule netMHCpan: peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: - "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls", + "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", log: "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", conda: @@ -29,7 +29,7 @@ rule netMHCIIpan: peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCIIpan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.xls", + "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", log: "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", conda: From ea38fe891ca2ae0d6ad6b6d93507e2df697e5b13 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 3 Aug 2022 09:09:46 +0000 Subject: [PATCH 119/191] rename parse_HLA_types to parse_and_filter_hla_alleles_for_netmhc --- workflow/rules/HLAtyping.smk | 4 ++-- ...LA_types.py => parse_and_filter_hla_alleles_for_netmhc.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename workflow/scripts/{parse_HLA_types.py => parse_and_filter_hla_alleles_for_netmhc.py} (100%) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 5482e871..ff10fe49 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -21,7 +21,7 @@ rule HLA_LA: "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1" -rule parse_HLA_LA: +rule parse_and_filter_hla_alleles_for_netmhc: input: 
hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", output: @@ -38,4 +38,4 @@ rule parse_HLA_LA: log: "logs/parse-HLA-LA/{group}.{alias}.log", script: - "../scripts/parse_HLA_types.py" + "../scripts/parse_and_filter_hla_alleles_for_netmhc.py" diff --git a/workflow/scripts/parse_HLA_types.py b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py similarity index 100% rename from workflow/scripts/parse_HLA_types.py rename to workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py From 6312d2e952b939ce89e44322efb9f9f2165ce8f4 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 3 Aug 2022 13:14:25 +0000 Subject: [PATCH 120/191] use net_mhc_pan and net_mhc_two_pan for better readability and consistency wherever possible --- .test/config/config.yaml | 4 ++-- config/config.yaml | 4 ++-- workflow/rules/HLAtyping.smk | 28 ++++++++++++++++++++++++++++ workflow/rules/MHC_binding.smk | 28 ++++++++++++++-------------- workflow/rules/common.smk | 8 ++++---- workflow/rules/microphaser.smk | 4 ++-- workflow/schemas/config.schema.yaml | 8 ++++---- 7 files changed, 56 insertions(+), 28 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index c5ed610c..bcd1a88c 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -23,7 +23,7 @@ params: events: tumor: "tumor_only" normal: "normal_only" - netMHCpan: + net_mhc_pan: activate: true peptide_len: 9 extra: "" @@ -34,7 +34,7 @@ params: # the conda-provided tcsh installation, it needs to read (without quotes): # "#!/usr/bin/env tcsh" location: "../netMHCpan-4.1" - netMHCIIpan: + net_mhc_two_pan: activate: false peptide_len: 15 extra: "" diff --git a/config/config.yaml b/config/config.yaml index 29da7260..a9c006cd 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -24,7 +24,7 @@ params: variant_sets: normal: "normal_only" tumor: "tumor_only" - netMHCpan: + net_mhc_pan: activate: true peptide_len: 9 extra: "" @@ -35,7 +35,7 @@ params: # the 
conda-provided tcsh installation, it needs to read (without quotes): # "#!/usr/bin/env tcsh" location: "../netMHCpan-4.1" - netMHCIIpan: + net_mhc_two_pan: activate: false peptide_len: 15 extra: "" diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index ff10fe49..97e3b0ec 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -21,9 +21,37 @@ rule HLA_LA: "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.group}_{wildcards.alias} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir {params.workdir} --maxThreads {threads} > {log} 2>&1" +rule net_mhc_pan_alleles: + output: + mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt", + conda: + "../envs/tcsh.yaml" + log: + "logs/net_mhc_pan/available_alleles.net_mhc_pan.log", + params: + net_mhc=config["params"]["net_mhc_pan"]["location"], + shell: + "{params.net_mhc}/net_mhc_pan -listMHC > {output.mhc_one_alleles} 2> {log}" + + +rule net_mhc_two_pan_alleles: + output: + mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt", + conda: + "../envs/tcsh.yaml" + log: + "logs/net_mhc_pan/available_alleles.net_mhc_two_pan.log", + params: + net_mhc=config["params"]["net_mhc_two_pan"]["location"], + shell: + "{params.net_mhc}/net_mhc_two_pan -list > {output.mhc_two_alleles} 2> {log}" + + rule parse_and_filter_hla_alleles_for_netmhc: input: hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", + mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt", + mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt", output: hlaI=report( "results/HLA-LA/{group}.{alias}.hlaI.tsv", diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index b1418d4f..33154e72 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -1,17 +1,17 @@ -rule netMHCpan: +rule net_mhc_pan: input: - 
peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCpan.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_pan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCI, output: - "results/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", + "results/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", log: - "logs/netMHCpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + "logs/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", conda: "../envs/tcsh.yaml" params: - extra=config["params"]["netMHCpan"]["extra"], - netMHC=config["params"]["netMHCpan"]["location"], - length=config["params"]["netMHCpan"]["peptide_len"], + extra=config["params"]["net_mhc_pan"]["extra"], + netMHC=config["params"]["net_mhc_pan"]["location"], + length=config["params"]["net_mhc_pan"]["peptide_len"], alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), shell: "( " @@ -24,20 +24,20 @@ rule netMHCpan: " ) 2> {log}" -rule netMHCIIpan: +rule net_mhc_two_pan: input: - peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.netMHCIIpan.{contig}.{peptide_type}.fa", + peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_two_pan.{contig}.{peptide_type}.fa", alleles=get_alleles_MHCII, output: - "results/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", + "results/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", log: - "logs/netMHCIIpan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + "logs/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", conda: "../envs/tcsh.yaml" params: - extra=config["params"]["netMHCIIpan"]["extra"], - 
netMHC=config["params"]["netMHCIIpan"]["location"], - length=config["params"]["netMHCIIpan"]["peptide_len"], + extra=config["params"]["net_mhc_two_pan"]["extra"], + netMHC=config["params"]["net_mhc_two_pan"]["location"], + length=config["params"]["net_mhc_two_pan"]["peptide_len"], alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), shell: "( " diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e6262532..9c928417 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -110,11 +110,11 @@ def get_final_output(): filter( None, [ - "netMHCpan" - if is_activated("params/netMHCpan") + "net_mhc_pan" + if is_activated("params/net_mhc_pan") else None, - "netMHCIIpan" - if is_activated("params/netMHCIIpan") + "net_mhc_two_pan" + if is_activated("params/net_mhc_two_pan") else None, ], ) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 86af23d0..a57340ad 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -77,7 +77,7 @@ rule microphaser_tumor: conda: "../envs/microphaser.yaml" params: - window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3, + window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3, shell: "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " "< {input.track} > {output.mt_fasta} 2> {log}" @@ -102,7 +102,7 @@ rule microphaser_normal: conda: "../envs/microphaser.yaml" params: - window_length=lambda w: max(config["params"]["netMHCpan"]["peptide_len"],config["params"]["netMHCIIpan"]["peptide_len"])*3, + window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3, shell: "microphaser normal {input.bam} --variants {input.bcf} --ref 
{input.ref} -t {output.wt_tsv} -w {params.window_length} " "< {input.track} > {output.wt_fasta} 2> {log}" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index dad1a71e..1186fc44 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -67,7 +67,7 @@ properties: required: - normal - tumor - netMHCpan: + net_mhc_pan: type: object properties: activate: @@ -80,7 +80,7 @@ properties: type: string required: - activate - netMHCIIpan: + net_mhc_two_pan: type: object properties: activate: @@ -93,8 +93,8 @@ properties: - activate required: - microphaser - - netMHCpan - - netMHCIIpan + - net_mhc_pan + - net_mhc_two_pan required: From fff90dc747aa3d70ff038268b43748e5f32cc79f Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 3 Aug 2022 13:15:05 +0000 Subject: [PATCH 121/191] automatically filter HLA-LA alleles down to those that netMHC can handle --- ...parse_and_filter_hla_alleles_for_netmhc.py | 107 +++++++++++++----- 1 file changed, 81 insertions(+), 26 deletions(-) diff --git a/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py index c93a943e..79c6994c 100644 --- a/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py +++ b/workflow/scripts/parse_and_filter_hla_alleles_for_netmhc.py @@ -4,15 +4,57 @@ import pandas as pd -# To know which alleles netMHCpan can handle, use its -listMHC option. -HLAI = {"A", "B", "C", "E", "G"} +# # read in available alleles -# To know which alleles netMHCIIpan can handle, use its -list option. + +def read_allele_list(filename: str): + with open(filename, "r") as alleles_in: + alleles = set() + for line in alleles_in: + if not (line.startswith("#") or line == "\n"): + alleles.add(line.strip()) + return alleles + + +HLA_SUFFIXES_REGEX = r"[NLSCAQ]?" 
+ +# netMHCpan alleles and loci +HLA_ONE_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_one_alleles) +hla_one_net_mhc_alleles = pd.Series(list(HLA_ONE_NET_MHC_ALLELES)) +HLA_ONE_LOCI = set( + hla_one_net_mhc_alleles[hla_one_net_mhc_alleles.str.startswith("HLA-")] + .str.replace(r"HLA-([A-Z])[\d:]+" + HLA_SUFFIXES_REGEX, r"\1", regex=True) + .drop_duplicates() +) + + +# netMHCIIpan alleles and loci +HLA_TWO_NET_MHC_ALLELES = read_allele_list(snakemake.input.mhc_two_alleles) +hla_two_net_mhc_alleles = pd.Series(list(HLA_TWO_NET_MHC_ALLELES)) # DRB alleles need to be formatted differently from DP and DQ alleles, -# so we specify them separately. -DRB = {"DRB1", "DRB3", "DRB4", "DRB5"} -ALPHA_BETA = {"DPA1", "DPB1", "DQA1", "DQB1"} +# so we extract them separately. +DRB_LOCI = set( + hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("DRB")] + .str.replace(r"(DRB\d)_\d+" + HLA_SUFFIXES_REGEX, r"\1", regex=True) + .drop_duplicates() +) + +ALPHA_BETA_LOCI = set( + hla_two_net_mhc_alleles[hla_two_net_mhc_alleles.str.startswith("HLA-")] + .str.replace( + r"HLA-(D[A-Z]A\d)\d+" + + HLA_SUFFIXES_REGEX + + r"-(D[A-Z]B\d)\d+" + + HLA_SUFFIXES_REGEX, + r"\1_\2", + regex=True, + ) + .str.split("_") + .explode() + .drop_duplicates() +) +# read in alleles as determined by HLA-LA hlas = pd.read_csv(snakemake.input.hla_la_bestguess, sep="\t") # the Allele column can contain multiple ";"-separated entries for the @@ -26,52 +68,63 @@ # it needs to be in the format of the first column of the above list, as explained in # the "Instructions" tab under "MHC SELECTION" point "2)" at: # * https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 -hlaI_alleles = ( - hlas.loc[hlas["Locus"].isin(HLAI), "Allele"] - .str.replace(r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(N?)", r"HLA-\1\2:\3\5", regex=True) +hla_one_alleles = ( + hlas.loc[hlas["Locus"].isin(HLA_ONE_LOCI), "Allele"] + .str.replace( + r"([A-Z])\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")", + r"HLA-\1\2:\3\5", + 
regex=True, + ) .drop_duplicates() ) -hlaI_alleles.to_csv(snakemake.output.hlaI, sep="\t", index=False, header=False) +hla_one_alleles[hla_one_alleles.isin(HLA_ONE_NET_MHC_ALLELES)].to_csv( + snakemake.output.hlaI, sep="\t", index=False, header=False +) # reformat to netMHCIIpan allele list format: -# * https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.1/alleles_name.list +# * https://services.healthtech.dtu.dk/services/netMHCIIpan-4.1/alleles_name.list # contrary to the format in that list, alleles actually need to be formatted like this, # with s found in the HLA-LA "Locus" column and syntax for the sub-numbering (only # the 1st and 2nd sub-number are used) according to the official nomenclature (see: -# https://ars.els-cdn.com/content/image/1-s2.0-S0006497120405555-gr2.jpg ): +# http://www.hla.alleles.org/nomenclature/naming.html ): # * DRB alleles: "_" # * DP and DQ alleles (alpha means A and beta means B in the gene name, for example DPA): # "HLA--" # This format was determined by manually selecting combinations above the # "type a list of molecules names" field of the "Submission" tab at: -# * https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.1 +# * https://services.healthtech.dtu.dk/service.php?netMHCIIpan-4.1 # TODO: check whether Jan's previous parsing of DRB alleles into this format is necessary: # * example: DRB1_1501-DRB30101-DRB40301N # * "DRB1_-DRB3-DRB4" -drb_alleles = hlas.loc[hlas["Locus"].isin(DRB)] -hlaII_alleles = ( +drb_alleles = hlas.loc[hlas["Locus"].isin(DRB_LOCI)] +hla_two_alleles = ( drb_alleles["Allele"] - .str.replace(r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1_\2\3\5", regex=True) + .str.replace( + r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")", + r"\1_\2\3\5", + regex=True, + ) .drop_duplicates() ) # handle alleles where a combination of alpha and beta always exists -alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA)] +alpha_beta_alleles = hlas.loc[hlas["Locus"].isin(ALPHA_BETA_LOCI)] 
alpha_beta_alleles.loc[:, "Allele"] = alpha_beta_alleles.Allele.str.replace( - r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(N?)", r"\1\2\3\5", regex=True + r"([A-Z]+\d)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")", + r"\1\2\3\5", + regex=True, ) # we need a variable to group alpha and beta of the same gene combination together -alpha_beta_alleles["gene_group"] = alpha_beta_alleles["Locus"].str[0:2] +alpha_beta_alleles.loc[:, "gene_group"] = alpha_beta_alleles["Locus"].str.replace( + r"(D[A-Z])[AB]\d", r"\1", regex=True +) # we need to handle cases where we had multiple allele entries in an # alpha or beta locus, adding in a duplicate of the corresponding locus -select_mult_all = alpha_beta_alleles["alternative"] > 0 -select_dpq_loci = alpha_beta_alleles["Locus"].str.startswith("D[PQ]") -mult_all_per_loc_selection = select_mult_all & select_dpq_loci alleles_to_duplicate = alpha_beta_alleles.loc[ - mult_all_per_loc_selection, + alpha_beta_alleles["alternative"] > 0, ["Locus", "Chromosome", "alternative"], -].replace(regex={"Locus": {"(D[PQ])A(\d+)": r"\1B\2", "(D[PQ])B(\d+)": r"\1A\2"}}) +].replace(regex={"Locus": {"(D[A-Z])A(\d)": r"\1B\2", "(D[A-Z])B(\d)": r"\1A\2"}}) alleles_to_insert = alleles_to_duplicate.merge( alpha_beta_alleles.drop("alternative", axis="columns"), on=["Locus", "Chromosome"], @@ -81,10 +134,12 @@ [alpha_beta_alleles, alleles_to_insert] ).drop_duplicates() -hlaII_alleles = hlaII_alleles.append( +hla_two_alleles = hla_two_alleles.append( alpha_beta_alleles.groupby(["gene_group", "Chromosome", "alternative"])["Allele"] .transform(lambda x: f"HLA-{'-'.join(x)}") .drop_duplicates() ) -hlaII_alleles.to_csv(snakemake.output.hlaII, sep="\t", index=False, header=False) +hla_two_alleles[hla_two_alleles.isin(HLA_TWO_NET_MHC_ALLELES)].to_csv( + snakemake.output.hlaII, sep="\t", index=False, header=False +) From fd999dea825794885e0a6a452bdee5492c111801 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 3 Aug 2022 13:22:23 +0000 Subject: [PATCH 122/191] 
rename merge_data.py to merge_neoantigen_info.py --- workflow/rules/MHC_binding.smk | 4 ++-- workflow/scripts/{merge_data.py => merge_neoantigen_info.py} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename workflow/scripts/{merge_data.py => merge_neoantigen_info.py} (100%) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 33154e72..7934237b 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -64,7 +64,7 @@ rule tidy_mhc_out: "../scripts/clean_mhc_output.py" -rule mhc_csv_table: +rule merge_neoantigen_info: input: info="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv", neo="results/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.mhc.neo.tsv", @@ -78,7 +78,7 @@ rule mhc_csv_table: log: "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", script: - "../scripts/merge_data.py" + "../scripts/merge_neoantigen_info.py" rule add_RNA_info: diff --git a/workflow/scripts/merge_data.py b/workflow/scripts/merge_neoantigen_info.py similarity index 100% rename from workflow/scripts/merge_data.py rename to workflow/scripts/merge_neoantigen_info.py From 457870428ed278fb55e42c3ee192825f998cad94 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 3 Aug 2022 13:25:13 +0000 Subject: [PATCH 123/191] fix erroneous substitutions --- workflow/rules/HLAtyping.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 97e3b0ec..179a2c67 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -31,7 +31,7 @@ rule net_mhc_pan_alleles: params: net_mhc=config["params"]["net_mhc_pan"]["location"], shell: - "{params.net_mhc}/net_mhc_pan -listMHC > {output.mhc_one_alleles} 2> {log}" + "{params.net_mhc}/netMHCpan -listMHC > {output.mhc_one_alleles} 2> {log}" rule net_mhc_two_pan_alleles: @@ -44,7 +44,7 @@ rule net_mhc_two_pan_alleles: params: 
net_mhc=config["params"]["net_mhc_two_pan"]["location"], shell: - "{params.net_mhc}/net_mhc_two_pan -list > {output.mhc_two_alleles} 2> {log}" + "{params.net_mhc}/netMHCIIpan -list > {output.mhc_two_alleles} 2> {log}" rule parse_and_filter_hla_alleles_for_netmhc: From 8f40088013e597463f998136986d511cd45027b7 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 4 Aug 2022 08:24:11 +0000 Subject: [PATCH 124/191] fix changed script name --- workflow/rules/MHC_binding.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index 7934237b..f6163b26 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -61,7 +61,7 @@ rule tidy_mhc_out: log: "logs/parse_mhc_out/{mhc}/{group}.{tumor_alias}.merged_tumor_normal.{peptide_type}.log", script: - "../scripts/clean_mhc_output.py" + "../scripts/tidy_mhc_output.py" rule merge_neoantigen_info: From 8008228963ec7866d2711585cf58e85bf72313be Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 4 Aug 2022 10:24:31 +0000 Subject: [PATCH 125/191] fix mhc_in type in tidy_mhc_output.py --- workflow/scripts/tidy_mhc_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py index 8ab28fbe..535609ee 100644 --- a/workflow/scripts/tidy_mhc_output.py +++ b/workflow/scripts/tidy_mhc_output.py @@ -29,7 +29,7 @@ } -def parse_file(mhc_in: FileIO): +def parse_file(mhc_in: str): """ Parse an netMHCpan or netMHCIIpan output file from the `-xls -xlsfile ` directive into a tidy pandas data frame. 
From 6376dd5c97aa9cea65db64c083adc951cb718b0a Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 5 Aug 2022 15:48:06 +0000 Subject: [PATCH 126/191] properly clean up netMHC(II)pan headers --- workflow/scripts/tidy_mhc_output.py | 110 ++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 31 deletions(-) diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py index 535609ee..2fee212a 100644 --- a/workflow/scripts/tidy_mhc_output.py +++ b/workflow/scripts/tidy_mhc_output.py @@ -5,27 +5,57 @@ import os import pandas as pd +from itertools import cycle + # assumptions of this script about netMHCpan or netMHCIIpan: # * version 4.1 # * output generated via `-xls` option # * generated with the `-BA` option to include binding affinity prediction -# The mapping of index column names used here to original names in netMHCpan files is: +# The mapping of index column names used here to original names in netMHCpan files +# is (please excuse the pd.NA tuples, they make it easier further down the line): INDEX_NAMES = { - "Pos": "position_in_protein_sequence", - "Peptide": "peptide_sequence", - "ID": "peptide_ID", - "Ave": "average_el_score", - "NB": "number_of_binders", + (pd.NA, "Pos"): "position_in_protein_sequence", + (pd.NA, "Peptide"): "peptide_sequence", + (pd.NA, "ID"): "peptide_ID", + (pd.NA, "Ave"): "average_el_score", + (pd.NA, "NB"): "number_of_binders", } -# The mapping of column names used here to original names in netMHCpan files is: -COLUMN_NAMES = { - "BA-score": "binding_affinity_score", - "BA_Rank": "binding_affinity_percent_rank", - "EL-score": "elution_ligang_score", - "EL_Rank": "elution_ligand_percent_rank", - "core": "binding_core", - "icore": "interaction_core", + +if snakemake.wildcards.mhc == "net_mhc_pan": + # The mapping of column names used here to original names in netMHCpan files is: + COLUMN_NAMES = { + "BA-score": "binding_affinity_score", + "BA_Rank": "binding_affinity_percent_rank", + "EL-score": 
"elution_ligang_score", + "EL_Rank": "elution_ligand_percent_rank", + "core": "binding_core", + } +elif snakemake.wildcards.mhc == "net_mhc_two_pan": + # The mapping of column names used here to original names in netMHCIIpan files is: + COLUMN_NAMES = { + "Score_BA": "binding_affinity_score", + "Rank_BA": "binding_affinity_percent_rank", + "Score": "elution_ligang_score", + "Rank": "elution_ligand_percent_rank", + "Core": "binding_core", + } +else: + sys.exit(f"Wildcard `mhc` has unknown value: {snakemake.wildcards.mhc}") + +COLUMNS_TO_DROP = { + # I have not found any docs or indication in the manuscript, what the column + # `Target` is. It might be the column `Exp_Bind` from Figure 1B here (it's + # the only column that only appears in netMHCIIpan output and not netMHCpan + # output): + # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7319546/figure/F1/ + # If so, it is only for benchmarking purposes according to the docs. And it + # is always NA, even in their manuscript. So we simply remove it here. + "Target", + # There doesn't seem to be an `icore` equivalent in netMHCIIpan output. + "icore", + # There doesn't seem to be an `nM` equivalent in netMHCpan output. 
+ "nM", } @@ -42,27 +72,45 @@ def parse_file(mhc_in: str): + list(INDEX_NAMES.values()) ) + # We need to fix the utterly broken headers first + + # parse first header line into pandas.Series and name it + first_header_line = pd.read_csv(mhc_in, nrows=1, header=None, sep="\t").iloc[0, :] + first_header_line.name = "allele" + + # parse second header line into pandas.Series and name it + second_header_line = pd.read_csv( + mhc_in, skiprows=1, nrows=1, header=None, sep="\t" + ).iloc[0, :] + second_header_line.name = "column_name" + + header = pd.concat([first_header_line, second_header_line], axis="columns") + header = header.fillna(method="ffill") + header.loc[ + header.column_name.isin({"Pos", "Peptide", "ID", "Target", "Ave", "NB"}), + "allele", + ] = pd.NA + # It's a compound header over two rows and a compound row index in the initial # three and final two columns of the table. For some reason, the final two # columns are added to the index but not removed from the table, so we do this # manually with `.iloc[]``. - data = pd.read_csv( - mhc_in, sep="\t", header=[0, 1], index_col=[0, 1, 2, -2, -1] - ).iloc[:, :-2] - - # With two lines of header parsed into a MultiIndex, pandas only uses the - # first column name in index_col as an entry. Obviously the following code - # assumes that these are the first three and last two columns of the data file. - data.index.names = list(INDEX_NAMES.values()) - - # Entries of the header MultiIndex need to be fixed, there doesn't seem to be - # any way to automatically do this during read_csv. 
- cols = pd.DataFrame(data.columns.to_list(), columns=["allele", "info"]) - - # fix up the columns and reassign - cols.loc[cols["allele"].str.endswith("_level_0"), "allele"] = pd.NA - cols = cols.fillna(method="ffill") - data.columns = pd.MultiIndex.from_frame(cols) + data = pd.read_csv(mhc_in, sep="\t", skiprows=2, header=None) + + data.columns = pd.MultiIndex.from_frame(header) + + # remove columns only present in one of the two tools' output + columns_to_keep = [ + col + for col in list(data.columns.get_level_values("column_name")) + if col not in COLUMNS_TO_DROP + ] + idx = pd.IndexSlice + data = data.loc[:, idx[:, columns_to_keep]] + + # properly set row index columns (values to repeat in every row while doing the stack below) + data = data.set_index(list(INDEX_NAMES.keys())) + data.index.set_names(INDEX_NAMES, inplace=True) # Turn into longer table with one HLA Allele per row instead of MultiIndex # header, rename columns to something readable and turn index into columns. From c4c9cd9959e69cba4aa3fe7c184bc1909a79f644 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Mon, 8 Aug 2022 09:35:14 +0000 Subject: [PATCH 127/191] minor comment text update --- workflow/scripts/tidy_mhc_output.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py index 2fee212a..5188d6de 100644 --- a/workflow/scripts/tidy_mhc_output.py +++ b/workflow/scripts/tidy_mhc_output.py @@ -13,7 +13,8 @@ # * generated with the `-BA` option to include binding affinity prediction # The mapping of index column names used here to original names in netMHCpan files -# is (please excuse the pd.NA tuples, they make it easier further down the line): +# is (please excuse the pd.NA tuples, they make header and index handling +# easier further down the line): INDEX_NAMES = { (pd.NA, "Pos"): "position_in_protein_sequence", (pd.NA, "Peptide"): "peptide_sequence", From a98b222500dddbe6f038648707ca179c6bdcaff1 Mon Sep 17 
00:00:00 2001 From: dlaehnemann Date: Thu, 11 Aug 2022 09:23:40 +0000 Subject: [PATCH 128/191] rename to shorter col names, use existing definitions where possible --- workflow/scripts/tidy_mhc_output.py | 34 +++++++++++++++-------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py index 5188d6de..3d29dfb8 100644 --- a/workflow/scripts/tidy_mhc_output.py +++ b/workflow/scripts/tidy_mhc_output.py @@ -16,30 +16,30 @@ # is (please excuse the pd.NA tuples, they make header and index handling # easier further down the line): INDEX_NAMES = { - (pd.NA, "Pos"): "position_in_protein_sequence", - (pd.NA, "Peptide"): "peptide_sequence", - (pd.NA, "ID"): "peptide_ID", - (pd.NA, "Ave"): "average_el_score", - (pd.NA, "NB"): "number_of_binders", + (pd.NA, "Pos"): "pos_in_id_seq", + (pd.NA, "Peptide"): "pep_seq", + (pd.NA, "ID"): "id", + (pd.NA, "Ave"): "ave_el_score", + (pd.NA, "NB"): "num_binders", } if snakemake.wildcards.mhc == "net_mhc_pan": # The mapping of column names used here to original names in netMHCpan files is: COLUMN_NAMES = { - "BA-score": "binding_affinity_score", - "BA_Rank": "binding_affinity_percent_rank", - "EL-score": "elution_ligang_score", - "EL_Rank": "elution_ligand_percent_rank", - "core": "binding_core", + "BA-score": "ba_score", + "BA_Rank": "ba_rank", + "EL-score": "el_score", + "EL_Rank": "el_rank", + "core": "bind_core", } elif snakemake.wildcards.mhc == "net_mhc_two_pan": # The mapping of column names used here to original names in netMHCIIpan files is: COLUMN_NAMES = { - "Score_BA": "binding_affinity_score", - "Rank_BA": "binding_affinity_percent_rank", - "Score": "elution_ligang_score", - "Rank": "elution_ligand_percent_rank", - "Core": "binding_core", + "Score_BA": "ba_score", + "Rank_BA": "ba_rank", + "Score": "el_score", + "Rank": "el_rank", + "Core": "bind_core", } else: sys.exit(f"Wildcard `mhc` has unknown value: {snakemake.wildcards.mhc}") @@ 
-88,7 +88,9 @@ def parse_file(mhc_in: str): header = pd.concat([first_header_line, second_header_line], axis="columns") header = header.fillna(method="ffill") header.loc[ - header.column_name.isin({"Pos", "Peptide", "ID", "Target", "Ave", "NB"}), + header.column_name.isin( + [ index_col for (_, index_col) in INDEX_NAMES.keys() ] + ), "allele", ] = pd.NA From 6bdadca7636968ff1d1fd232dcda21b7736efb32 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 11 Aug 2022 09:24:20 +0000 Subject: [PATCH 129/191] intermediate rework step of merge_neoantigen_info.py with with .apply() approach for getting set columns to work --- workflow/scripts/merge_neoantigen_info.py | 82 ++++++++++++++++++----- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 23f92e37..47526fdd 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -4,6 +4,14 @@ import pandas as pd +def get_minimum_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: + df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele']) + rank_col = f"{rank_type}_rank" + score_col = f"{rank_type}_score" + prefix = f"top_{rank_col}_" + columns_to_keep = ['bind_core', rank_col, score_col] + return df.loc[ df.groupby(['pep_seq', 'id'])[rank_col].idxmin(), columns_to_keep ].reset_index(level='allele').sort_index().drop_duplicates().add_prefix(prefix) + def select_columns(mhc: pd.DataFrame) -> pd.DataFrame: rank_cols = [c for c in mhc.columns if "Rank" in c] affinity_cols = [c for c in mhc.columns if "nM" in c] @@ -17,16 +25,61 @@ def select_columns(mhc: pd.DataFrame) -> pd.DataFrame: mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","") return mhc -def merge(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame: - tumor = select_columns(tumor) - normal = select_columns(normal) - id_length = len(tumor.ID[0]) - 
print(info.columns) - info["ID"] = info["id"].astype(str).str[:id_length] - merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID']) - merged_mhc = merged_mhc.rename(columns={col: col.replace("_y","_normal") for col in merged_mhc.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns}) - info = info.rename(columns={"gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"}) - merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID') +def get_filtered_per_sample(sample: pd.DataFrame, alias: str) -> pd.DataFrame: + common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True) + sample_el = get_minimum_rank_per_peptide(sample, "el") + sample_ba = get_minimum_rank_per_peptide(sample, "ba") + sample_filtered = sample_el.join(sample_ba) + return sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True) + +def tidy_info(info: pd.DataFrame, alias: str) -> pd.DataFrame: + info = info.set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'credible_interval', 'depth', 'strand']) + int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites'] + info[int_cols] = info[int_cols].astype('int32') + num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': alias}).set_index('alias', append=True) + num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], 
ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': alias}).set_index('alias', append=True) + sites_tidy = info.copy() + sites_tidy['sites'] = sites_tidy['variant_sites'].apply(lambda x: set(str(x).split('|'))) + sites_tidy['somatic_sites'] = sites_tidy['somatic_positions'].apply(lambda x: set(str(x).split('|'))) + sites_tidy['germline_sites'] = sites_tidy[['sites', 'somatic_sites']].apply(lambda row: [s for s in row['sites'] if s not in row['somatic_sites']], axis=1) + sites_tidy = sites_tidy.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_sites', 'germline_sites'], ignore_index=False).replace({'somatic_sites': 'normal', 'germline_sites': 'tumor_alias'}).set_index('alias', append=True)['genomic_pos'].apply(lambda r: '|'.join(r)) + sites_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': 'tumor_resection', 'germline_positions': 'normal'}).set_index('alias', append=True) + + seq_tidy = info.melt(var_name='alias', value_name='sequence', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': alias}).set_index('alias', append=True) + +def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame: + # get and merge tumor and normal + tumor_filtered = get_filtered_per_sample(tumor, snakemake.wildcards.tumor_alias) + normal_filtered = get_filtered_per_sample(normal, "normal") + all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').sort_index() + info_tidy = tidy_info(info, snakemake.wildcards.tumor_alias) + + # tidy up info + all_annotated = all_filtered.merge(info, on='id', how='left') +# tumor = select_columns(tumor) +# normal = select_columns(normal) +# id_length = len(tumor.ID[0]) +# print(info.columns) +# info["ID"] = info["id"].astype(str).str[:id_length] 
+# merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID']) +# merged_mhc = merged_mhc +# .rename( +# columns={col: col.replace("_y","_normal") for col in merged_mhc.columns} +# ) +# .rename( +# columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns} +# ) +# info = info.rename( +# columns={ +# "gene_id":"Gene_ID", +# "gene_name":"Gene_Symbol", +# "strand":"Strand", +# "positions":"Variant_Position", +# "chrom":"Chromosome", +# "somatic_aa_change": +# "Somatic_AminoAcid_Change" +# }) +# merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID') merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1) ## Are all possible variants in the peptide ("Cis") or not ("Trans") @@ -80,11 +133,10 @@ def diffEpitope(e1,e2): def main(): info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str) - tumor = pd.read_csv(snakemake.input.neo, sep = '\t') - normal = pd.read_csv(snakemake.input.normal, sep = '\t') - outfile = snakemake.output[0] - data = merge(info, tumor, normal) - data.to_csv(outfile, index=False, sep = '\t') + tumor = pd.read_csv(snakemake.input.neo, sep = '\t', dtype={'pos_in_id_seq': str}) + normal = pd.read_csv(snakemake.input.normal, sep = '\t', dtype={'pos_in_id_seq': str}) + data = merge_data_frames(info, tumor, normal) + data.to_csv(snakemake.output[0], index=False, sep = '\t') if __name__ == '__main__': sys.exit(main()) From 525d406c7032666b2306cf08bf663216e7eb750b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 11 Aug 2022 17:47:04 +0000 Subject: [PATCH 130/191] intermediate version of merge_neoantigen_info.py cleanup --- workflow/scripts/merge_neoantigen_info.py | 84 ++++++++++++++--------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 47526fdd..3fe10399 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ 
b/workflow/scripts/merge_neoantigen_info.py @@ -4,7 +4,7 @@ import pandas as pd -def get_minimum_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: +def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele']) rank_col = f"{rank_type}_rank" score_col = f"{rank_type}_score" @@ -25,37 +25,41 @@ def select_columns(mhc: pd.DataFrame) -> pd.DataFrame: mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","") return mhc -def get_filtered_per_sample(sample: pd.DataFrame, alias: str) -> pd.DataFrame: +def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame: common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True) - sample_el = get_minimum_rank_per_peptide(sample, "el") - sample_ba = get_minimum_rank_per_peptide(sample, "ba") + sample_el = get_best_rank_per_peptide(sample, "el") + sample_ba = get_best_rank_per_peptide(sample, "ba") sample_filtered = sample_el.join(sample_ba) return sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True) -def tidy_info(info: pd.DataFrame, alias: str) -> pd.DataFrame: +def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: + """ + Get the -o info output of the microphaser filter command into tidy data format. 
+ """ info = info.set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'credible_interval', 'depth', 'strand']) int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites'] info[int_cols] = info[int_cols].astype('int32') - num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': alias}).set_index('alias', append=True) - num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': alias}).set_index('alias', append=True) - sites_tidy = info.copy() - sites_tidy['sites'] = sites_tidy['variant_sites'].apply(lambda x: set(str(x).split('|'))) - sites_tidy['somatic_sites'] = sites_tidy['somatic_positions'].apply(lambda x: set(str(x).split('|'))) - sites_tidy['germline_sites'] = sites_tidy[['sites', 'somatic_sites']].apply(lambda row: [s for s in row['sites'] if s not in row['somatic_sites']], axis=1) - sites_tidy = sites_tidy.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_sites', 'germline_sites'], ignore_index=False).replace({'somatic_sites': 'normal', 'germline_sites': 'tumor_alias'}).set_index('alias', append=True)['genomic_pos'].apply(lambda r: '|'.join(r)) - sites_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': 'tumor_resection', 'germline_positions': 'normal'}).set_index('alias', append=True) - - seq_tidy = info.melt(var_name='alias', value_name='sequence', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': 
alias}).set_index('alias', append=True) + # TODO: Ensure that microphaser output contains only one entry per id. + # If there is more than one entry per index, ensure that they are identical + if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0: + sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.") + # Always take the first entry for each index. + info = info.groupby(info.index).head(1) + # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal. + num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True) + num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': tumor_alias}).set_index('alias', append=True) + genomic_pos_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': tumor_alias, 'germline_positions': 'normal'}).set_index('alias', append=True) + aa_changes_tidy = info.melt(var_name='alias', value_name='aa_changes', value_vars=['somatic_aa_change', 'germline_aa_change'], ignore_index=False).replace({'somatic_aa_change': tumor_alias, 'germline_aa_change': 'normal'}).set_index('alias', append=True) + nt_seq_tidy = info.melt(var_name='alias', value_name='nt_seq', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': tumor_alias}).set_index('alias', append=True) + return 
num_var_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy]) def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame) -> pd.DataFrame: # get and merge tumor and normal - tumor_filtered = get_filtered_per_sample(tumor, snakemake.wildcards.tumor_alias) - normal_filtered = get_filtered_per_sample(normal, "normal") - all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').sort_index() + tumor_filtered = get_filtered_per_alias(tumor, snakemake.wildcards.tumor_alias) + normal_filtered = get_filtered_per_alias(normal, "normal") + all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias='tumor_resection').sort_index() info_tidy = tidy_info(info, snakemake.wildcards.tumor_alias) - - # tidy up info - all_annotated = all_filtered.merge(info, on='id', how='left') + all_annotated = all_filtered.merge(info_tidy, on=['id', 'alias'], how='left') # tumor = select_columns(tumor) # normal = select_columns(normal) # id_length = len(tumor.ID[0]) @@ -81,7 +85,7 @@ def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFr # }) # merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID') - merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1) + merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diff_peptides(*x), axis=1) ## Are all possible variants in the peptide ("Cis") or not ("Trans") merged_dataframe["Variant_Orientation"] = "Cis" trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar @@ -116,19 +120,31 @@ def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFr return data -## highlight the difference between mutated neopeptide and wildtype -def diffEpitope(e1,e2): - if str(e2) == 
'nan' or str(e2) == '': - return(e1) - e1 = str(e1) - e2 = str(e2) - diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]] - e_new = e1 - e2_new = e2 +def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame: + group = group.reset_index(level='alias') + normal_pep = group.loc[group['alias'] == 'normal', column].fillna('') + if normal_pep.empty: + normal_pep = '' + else: + normal_pep = normal_pep.squeeze() + tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze() + ( group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= diff_peptides(tumor_pep, normal_pep) + return group.set_index('alias', append=True) + + +def diff_peptides(tumor_p: str, normal_p: str) -> (str, str): + """ + Highlight the difference between mutated neopeptide and normal peptide + """ + if normal_p == 'nan' or normal_p == '': + return (tumor_p, normal_p) + diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] + tp_changed = tumor_p + np_changed = normal_p for p in diff_pos: - e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:] - e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:] - return(e_new) + tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:] + np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:] + return (tp_changed, np_changed) def main(): From f660e21dbda495017a0f6915a6fd872a5e24cd10 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 11 Aug 2022 19:26:40 +0000 Subject: [PATCH 131/191] reworked version of merge_neoantigen_info.py to test --- workflow/scripts/merge_neoantigen_info.py | 201 ++++++++++------------ 1 file changed, 94 insertions(+), 107 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 3fe10399..d2a9334d 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -1,8 +1,11 @@ import sys +from 
xml.sax.handler import all_properties sys.stderr = open(snakemake.log[0], "w") import pandas as pd +from typing import Tuple + def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele']) @@ -12,18 +15,6 @@ def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: columns_to_keep = ['bind_core', rank_col, score_col] return df.loc[ df.groupby(['pep_seq', 'id'])[rank_col].idxmin(), columns_to_keep ].reset_index(level='allele').sort_index().drop_duplicates().add_prefix(prefix) -def select_columns(mhc: pd.DataFrame) -> pd.DataFrame: - rank_cols = [c for c in mhc.columns if "Rank" in c] - affinity_cols = [c for c in mhc.columns if "nM" in c] - mhc_cols = ["Pos", "ID", "Peptide"] + rank_cols + affinity_cols + ["NB"] - mhc = mhc[mhc_cols] - mhc["Rank_min"] = mhc[rank_cols].min(axis=1) - mhc["Aff_min"] = mhc[affinity_cols].min(axis=1) - mhc["Top_rank_HLA"] = mhc[rank_cols].idxmin(axis=1) - mhc["Top_affinity_HLA"] = mhc[affinity_cols].idxmin(axis=1) - mhc["Top_rank_HLA"] = mhc["Top_rank_HLA"].str.replace("_Rank","") - mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","") - return mhc def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame: common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True) @@ -32,126 +23,122 @@ def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame: sample_filtered = sample_el.join(sample_ba) return sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True) + +def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: + """ + Highlight the difference between mutated neopeptide and normal peptide + """ + if normal_p == 'nan' or normal_p == '': + return (tumor_p, normal_p) + 
diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] + tp_changed = tumor_p + np_changed = normal_p + for p in diff_pos: + tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:] + np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:] + return (tp_changed, np_changed) + + +def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame: + group = group.reset_index(level='alias') + normal_pep = group.loc[group['alias'] == 'normal', column].fillna('') + if normal_pep.empty: + normal_pep = '' + else: + normal_pep = normal_pep.squeeze() + tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze() + # Silent mutations should not be included in microphaser output. + if normal_pep == tumor_pep: + sys.exit(f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n" + "Please fix this upstream or comment out this check to ignore this problem.\n" + ) + # Remove groups where the tumor peptide contains a stop codon. + # TODO: Maybe this should be a hard fail complaining to fix this upstream? + if 'X' in tumor_pep: + return group.loc[[], :] + ( group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= highlight_peptides_diff(tumor_pep, normal_pep) + return group.set_index('alias', append=True) + + def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: """ Get the -o info output of the microphaser filter command into tidy data format. 
""" - info = info.set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'credible_interval', 'depth', 'strand']) + info = info.rename(columns={'credible_interval': 'freq_credible_interval'}).set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'freq_credible_interval', 'depth', 'strand']) int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites'] info[int_cols] = info[int_cols].astype('int32') # TODO: Ensure that microphaser output contains only one entry per id. # If there is more than one entry per index, ensure that they are identical if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0: - sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.") + sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n") # Always take the first entry for each index. info = info.groupby(info.index).head(1) # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal. 
- num_var_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True) + num_var_in_pep_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var_in_pep', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True) num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': tumor_alias}).set_index('alias', append=True) genomic_pos_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': tumor_alias, 'germline_positions': 'normal'}).set_index('alias', append=True) aa_changes_tidy = info.melt(var_name='alias', value_name='aa_changes', value_vars=['somatic_aa_change', 'germline_aa_change'], ignore_index=False).replace({'somatic_aa_change': tumor_alias, 'germline_aa_change': 'normal'}).set_index('alias', append=True) nt_seq_tidy = info.melt(var_name='alias', value_name='nt_seq', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 'mutant_sequence': tumor_alias}).set_index('alias', append=True) - return num_var_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy]) + all_tidy = num_var_in_pep_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy]) + return all_tidy.reset_index(level=[i for i in all_tidy.index.names if i not in ['id', 'alias']]) -def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, 
normal: pd.DataFrame) -> pd.DataFrame: + +def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: # get and merge tumor and normal - tumor_filtered = get_filtered_per_alias(tumor, snakemake.wildcards.tumor_alias) + tumor_filtered = get_filtered_per_alias(tumor, tumor_alias) normal_filtered = get_filtered_per_alias(normal, "normal") - all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level='pep_seq').groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias='tumor_resection').sort_index() - info_tidy = tidy_info(info, snakemake.wildcards.tumor_alias) - all_annotated = all_filtered.merge(info_tidy, on=['id', 'alias'], how='left') -# tumor = select_columns(tumor) -# normal = select_columns(normal) -# id_length = len(tumor.ID[0]) -# print(info.columns) -# info["ID"] = info["id"].astype(str).str[:id_length] -# merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID']) -# merged_mhc = merged_mhc -# .rename( -# columns={col: col.replace("_y","_normal") for col in merged_mhc.columns} -# ) -# .rename( -# columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns} -# ) -# info = info.rename( -# columns={ -# "gene_id":"Gene_ID", -# "gene_name":"Gene_Symbol", -# "strand":"Strand", -# "positions":"Variant_Position", -# "chrom":"Chromosome", -# "somatic_aa_change": -# "Somatic_AminoAcid_Change" -# }) -# merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID') - - merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diff_peptides(*x), axis=1) - ## Are all possible variants in the peptide ("Cis") or not ("Trans") - merged_dataframe["Variant_Orientation"] = "Cis" - trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar - merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans" - - ## check misssense/silent mutation status - nonsilent = merged_dataframe.Peptide_tumor != 
merged_dataframe.Peptide_normal - merged_dataframe = merged_dataframe[nonsilent] - merged_dataframe = merged_dataframe.drop_duplicates(subset=["transcript","offset","Peptide_tumor","Somatic_AminoAcid_Change"]) - - data = merged_dataframe[["ID","transcript","Gene_ID","Gene_Symbol","Chromosome","offset","freq","depth", - "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","NB_tumor","Rank_min_tumor","Aff_min_tumor", - "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","NB_normal", - "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]] - - data.columns = ["ID","Transcript_ID","Gene_ID","Gene_Symbol","Chromosome","Position","Frequency","Read_Depth", - "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","BindingHLAs_tumor","Rank_min_tumor","Affinity_min_tumor", - "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","BindingHLAs_normal", - "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"] - - # data = data[data.BindingHLAs_tumor > 0] - # data = data[(data.NB_normal.isna()) | (data.NB_normal == 0)] - #data = data[(data.BindingHLAs_normal == 0)] - - ### Delete Stop-Codon including peptides - data = data[data.Peptide_tumor.str.count("x") == 0] - data = data[data.Peptide_tumor.str.count("X") == 0] - data.sort_values(["Chromosome", "somatic_positions"], inplace=True) - ### Remove Duplicate kmers - data = data.drop_duplicates(["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"]) - - return data - - -def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame: - group = group.reset_index(level='alias') - normal_pep = group.loc[group['alias'] == 'normal', column].fillna('') - if normal_pep.empty: - normal_pep = '' - else: - normal_pep = normal_pep.squeeze() - tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze() - ( 
group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= diff_peptides(tumor_pep, normal_pep) - return group.set_index('alias', append=True) - - -def diff_peptides(tumor_p: str, normal_p: str) -> (str, str): - """ - Highlight the difference between mutated neopeptide and normal peptide - """ - if normal_p == 'nan' or normal_p == '': - return (tumor_p, normal_p) - diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] - tp_changed = tumor_p - np_changed = normal_p - for p in diff_pos: - tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:] - np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:] - return (tp_changed, np_changed) + all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level=['pep_seq', 'pos_in_id_seq']).groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias=tumor_alias).sort_index() + info_tidy = tidy_info(info, tumor_alias) + all_annotated = all_filtered.join(info_tidy, how='left').reset_index(level=['id', 'alias']) + + # Double-check for weird duplicates, as previously done in Jan's code. + if sum(all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])) > 0: + duplicates = all_annotated[all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])] + sys.exit("Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n" + "This indicates an upstream issue, please fix this. 
The offending entries are:\n" + f"{duplicates}\n" + ) + + column_order = [ + 'id', + 'pep_seq', + 'pos_in_id_seq', + 'alias', + 'num_binders', + 'freq', + 'depth', + 'num_var_sites', + 'num_var_in_pep', + 'top_el_rank_allele', + 'top_el_rank_bind_core', + 'top_el_rank_el_rank', + 'top_el_rank_el_score', + 'ave_el_score', + 'top_ba_rank_allele', + 'top_ba_rank_bind_core', + 'top_ba_rank_ba_rank', + 'top_ba_rank_ba_score', + 'aa_changes', + 'genomic_pos', + 'nt_seq', + 'gene_name', + 'gene_id', + 'transcript', + 'chrom', + 'offset', + 'frame', + 'strand', + 'freq_credible_interval', + ] + + return all_annotated.reindex(columns=column_order).sort_values(['chrom', 'genomic_pos', 'id']) def main(): info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str) tumor = pd.read_csv(snakemake.input.neo, sep = '\t', dtype={'pos_in_id_seq': str}) normal = pd.read_csv(snakemake.input.normal, sep = '\t', dtype={'pos_in_id_seq': str}) - data = merge_data_frames(info, tumor, normal) + data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias) data.to_csv(snakemake.output[0], index=False, sep = '\t') if __name__ == '__main__': From ace6130a066205ed63579b5638d55c1a9d5c6f83 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 11 Aug 2022 19:42:06 +0000 Subject: [PATCH 132/191] try fix for sorting --- workflow/scripts/merge_neoantigen_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index d2a9334d..ea861ea8 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -131,7 +131,7 @@ def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFr 'freq_credible_interval', ] - return all_annotated.reindex(columns=column_order).sort_values(['chrom', 'genomic_pos', 'id']) + return all_annotated.reindex(columns=column_order).sort_values(by = ['chrom', 'genomic_pos', 'id']) def main(): From 
92dcdf1e6b37cd1f754d0603e8a55f04406b214e Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 11 Aug 2022 19:43:12 +0000 Subject: [PATCH 133/191] black formatting --- workflow/scripts/merge_neoantigen_info.py | 264 ++++++++++++++++------ 1 file changed, 190 insertions(+), 74 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index ea861ea8..32a50556 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -8,138 +8,254 @@ def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: - df = df.set_index(['id', 'pos_in_id_seq', 'pep_seq', 'allele']) + df = df.set_index(["id", "pos_in_id_seq", "pep_seq", "allele"]) rank_col = f"{rank_type}_rank" score_col = f"{rank_type}_score" prefix = f"top_{rank_col}_" - columns_to_keep = ['bind_core', rank_col, score_col] - return df.loc[ df.groupby(['pep_seq', 'id'])[rank_col].idxmin(), columns_to_keep ].reset_index(level='allele').sort_index().drop_duplicates().add_prefix(prefix) + columns_to_keep = ["bind_core", rank_col, score_col] + return ( + df.loc[df.groupby(["pep_seq", "id"])[rank_col].idxmin(), columns_to_keep] + .reset_index(level="allele") + .sort_index() + .drop_duplicates() + .add_prefix(prefix) + ) def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame: - common_info = sample.set_index(['id', 'pos_in_id_seq', 'pep_seq']).loc[:, ['ave_el_score', 'num_binders']].reset_index(level=['id', 'pos_in_id_seq']).drop_duplicates().set_index(['id', 'pos_in_id_seq'], append=True) + common_info = ( + sample.set_index(["id", "pos_in_id_seq", "pep_seq"]) + .loc[:, ["ave_el_score", "num_binders"]] + .reset_index(level=["id", "pos_in_id_seq"]) + .drop_duplicates() + .set_index(["id", "pos_in_id_seq"], append=True) + ) sample_el = get_best_rank_per_peptide(sample, "el") sample_ba = get_best_rank_per_peptide(sample, "ba") sample_filtered = sample_el.join(sample_ba) - return 
sample_filtered.join(common_info, how='left').assign(alias=alias).set_index('alias', append=True) + return ( + sample_filtered.join(common_info, how="left") + .assign(alias=alias) + .set_index("alias", append=True) + ) def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: """ Highlight the difference between mutated neopeptide and normal peptide """ - if normal_p == 'nan' or normal_p == '': + if normal_p == "nan" or normal_p == "": return (tumor_p, normal_p) diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] tp_changed = tumor_p np_changed = normal_p for p in diff_pos: - tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p+1:] - np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p+1:] + tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :] + np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :] return (tp_changed, np_changed) -def diff_tumor_normal_peptides(group: pd.DataFrame, column: str, tumor_alias: str) -> pd.DataFrame: - group = group.reset_index(level='alias') - normal_pep = group.loc[group['alias'] == 'normal', column].fillna('') +def diff_tumor_normal_peptides( + group: pd.DataFrame, column: str, tumor_alias: str +) -> pd.DataFrame: + group = group.reset_index(level="alias") + normal_pep = group.loc[group["alias"] == "normal", column].fillna("") if normal_pep.empty: - normal_pep = '' + normal_pep = "" else: normal_pep = normal_pep.squeeze() - tumor_pep = group.loc[group['alias'] == tumor_alias, column].fillna('').squeeze() + tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze() # Silent mutations should not be included in microphaser output. 
if normal_pep == tumor_pep: - sys.exit(f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n" - "Please fix this upstream or comment out this check to ignore this problem.\n" + sys.exit( + f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n" + "Please fix this upstream or comment out this check to ignore this problem.\n" ) # Remove groups where the tumor peptide contains a stop codon. # TODO: Maybe this should be a hard fail complaining to fix this upstream? - if 'X' in tumor_pep: + if "X" in tumor_pep: return group.loc[[], :] - ( group.loc[group['alias'] == tumor_alias, column], group.loc[group['alias'] == 'normal', column] )= highlight_peptides_diff(tumor_pep, normal_pep) - return group.set_index('alias', append=True) + ( + group.loc[group["alias"] == tumor_alias, column], + group.loc[group["alias"] == "normal", column], + ) = highlight_peptides_diff(tumor_pep, normal_pep) + return group.set_index("alias", append=True) def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: """ Get the -o info output of the microphaser filter command into tidy data format. """ - info = info.rename(columns={'credible_interval': 'freq_credible_interval'}).set_index(['id', 'transcript', 'gene_id', 'gene_name', 'chrom', 'offset', 'frame', 'freq', 'freq_credible_interval', 'depth', 'strand']) - int_cols = ['nvar', 'nsomatic', 'nvariant_sites', 'nsomvariant_sites'] - info[int_cols] = info[int_cols].astype('int32') + info = info.rename( + columns={"credible_interval": "freq_credible_interval"} + ).set_index( + [ + "id", + "transcript", + "gene_id", + "gene_name", + "chrom", + "offset", + "frame", + "freq", + "freq_credible_interval", + "depth", + "strand", + ] + ) + int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"] + info[int_cols] = info[int_cols].astype("int32") # TODO: Ensure that microphaser output contains only one entry per id. 
# If there is more than one entry per index, ensure that they are identical if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0: - sys.exit(f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n") + sys.exit( + f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n" + ) # Always take the first entry for each index. - info = info.groupby(info.index).head(1) + info = info.groupby(info.index).head(1) # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal. - num_var_in_pep_tidy = info.assign(ngermline = lambda x: x.nvar - x.nsomatic).melt(var_name='alias', value_name='num_var_in_pep', value_vars=['ngermline', 'nsomatic'], ignore_index=False).replace({'ngermline': 'normal', 'nsomatic': tumor_alias}).set_index('alias', append=True) - num_var_sites_tidy = info.assign(ngermvariant_sites = lambda x: x.nvariant_sites - x.nsomvariant_sites).melt(var_name='alias', value_name='num_var_sites', value_vars=['ngermvariant_sites', 'nsomvariant_sites'], ignore_index=False).replace({'ngermvariant_sites': 'normal', 'nsomvariant_sites': tumor_alias}).set_index('alias', append=True) - genomic_pos_tidy = info.melt(var_name='alias', value_name='genomic_pos', value_vars=['somatic_positions', 'germline_positions'], ignore_index=False).replace({'somatic_positions': tumor_alias, 'germline_positions': 'normal'}).set_index('alias', append=True) - aa_changes_tidy = info.melt(var_name='alias', value_name='aa_changes', value_vars=['somatic_aa_change', 'germline_aa_change'], ignore_index=False).replace({'somatic_aa_change': tumor_alias, 'germline_aa_change': 'normal'}).set_index('alias', append=True) - nt_seq_tidy = info.melt(var_name='alias', value_name='nt_seq', value_vars=['normal_sequence', 'mutant_sequence'], ignore_index=False).replace({'normal_sequence': 'normal', 
'mutant_sequence': tumor_alias}).set_index('alias', append=True) - all_tidy = num_var_in_pep_tidy.join([num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy]) - return all_tidy.reset_index(level=[i for i in all_tidy.index.names if i not in ['id', 'alias']]) + num_var_in_pep_tidy = ( + info.assign(ngermline=lambda x: x.nvar - x.nsomatic) + .melt( + var_name="alias", + value_name="num_var_in_pep", + value_vars=["ngermline", "nsomatic"], + ignore_index=False, + ) + .replace({"ngermline": "normal", "nsomatic": tumor_alias}) + .set_index("alias", append=True) + ) + num_var_sites_tidy = ( + info.assign(ngermvariant_sites=lambda x: x.nvariant_sites - x.nsomvariant_sites) + .melt( + var_name="alias", + value_name="num_var_sites", + value_vars=["ngermvariant_sites", "nsomvariant_sites"], + ignore_index=False, + ) + .replace({"ngermvariant_sites": "normal", "nsomvariant_sites": tumor_alias}) + .set_index("alias", append=True) + ) + genomic_pos_tidy = ( + info.melt( + var_name="alias", + value_name="genomic_pos", + value_vars=["somatic_positions", "germline_positions"], + ignore_index=False, + ) + .replace({"somatic_positions": tumor_alias, "germline_positions": "normal"}) + .set_index("alias", append=True) + ) + aa_changes_tidy = ( + info.melt( + var_name="alias", + value_name="aa_changes", + value_vars=["somatic_aa_change", "germline_aa_change"], + ignore_index=False, + ) + .replace({"somatic_aa_change": tumor_alias, "germline_aa_change": "normal"}) + .set_index("alias", append=True) + ) + nt_seq_tidy = ( + info.melt( + var_name="alias", + value_name="nt_seq", + value_vars=["normal_sequence", "mutant_sequence"], + ignore_index=False, + ) + .replace({"normal_sequence": "normal", "mutant_sequence": tumor_alias}) + .set_index("alias", append=True) + ) + all_tidy = num_var_in_pep_tidy.join( + [num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy] + ) + return all_tidy.reset_index( + level=[i for i in all_tidy.index.names if i not in ["id", "alias"]] 
+ ) -def merge_data_frames(info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: +def merge_data_frames( + info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str +) -> pd.DataFrame: # get and merge tumor and normal tumor_filtered = get_filtered_per_alias(tumor, tumor_alias) normal_filtered = get_filtered_per_alias(normal, "normal") - all_filtered = pd.concat([tumor_filtered, normal_filtered]).reset_index(level=['pep_seq', 'pos_in_id_seq']).groupby('id', group_keys=False).apply(diff_tumor_normal_peptides, column='pep_seq', tumor_alias=tumor_alias).sort_index() + all_filtered = ( + pd.concat([tumor_filtered, normal_filtered]) + .reset_index(level=["pep_seq", "pos_in_id_seq"]) + .groupby("id", group_keys=False) + .apply(diff_tumor_normal_peptides, column="pep_seq", tumor_alias=tumor_alias) + .sort_index() + ) info_tidy = tidy_info(info, tumor_alias) - all_annotated = all_filtered.join(info_tidy, how='left').reset_index(level=['id', 'alias']) - + all_annotated = all_filtered.join(info_tidy, how="left").reset_index( + level=["id", "alias"] + ) + # Double-check for weird duplicates, as previously done in Jan's code. - if sum(all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])) > 0: - duplicates = all_annotated[all_annotated.duplicated(subset=['transcript','offset','pep_seq','aa_changes'])] - sys.exit("Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n" - "This indicates an upstream issue, please fix this. 
The offending entries are:\n" - f"{duplicates}\n" + if ( + sum( + all_annotated.duplicated( + subset=["transcript", "offset", "pep_seq", "aa_changes"] + ) + ) + > 0 + ): + duplicates = all_annotated[ + all_annotated.duplicated( + subset=["transcript", "offset", "pep_seq", "aa_changes"] ) + ] + sys.exit( + "Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n" + "This indicates an upstream issue, please fix this. The offending entries are:\n" + f"{duplicates}\n" + ) column_order = [ - 'id', - 'pep_seq', - 'pos_in_id_seq', - 'alias', - 'num_binders', - 'freq', - 'depth', - 'num_var_sites', - 'num_var_in_pep', - 'top_el_rank_allele', - 'top_el_rank_bind_core', - 'top_el_rank_el_rank', - 'top_el_rank_el_score', - 'ave_el_score', - 'top_ba_rank_allele', - 'top_ba_rank_bind_core', - 'top_ba_rank_ba_rank', - 'top_ba_rank_ba_score', - 'aa_changes', - 'genomic_pos', - 'nt_seq', - 'gene_name', - 'gene_id', - 'transcript', - 'chrom', - 'offset', - 'frame', - 'strand', - 'freq_credible_interval', + "id", + "pep_seq", + "pos_in_id_seq", + "alias", + "num_binders", + "freq", + "depth", + "num_var_sites", + "num_var_in_pep", + "top_el_rank_allele", + "top_el_rank_bind_core", + "top_el_rank_el_rank", + "top_el_rank_el_score", + "ave_el_score", + "top_ba_rank_allele", + "top_ba_rank_bind_core", + "top_ba_rank_ba_rank", + "top_ba_rank_ba_score", + "aa_changes", + "genomic_pos", + "nt_seq", + "gene_name", + "gene_id", + "transcript", + "chrom", + "offset", + "frame", + "strand", + "freq_credible_interval", ] - return all_annotated.reindex(columns=column_order).sort_values(by = ['chrom', 'genomic_pos', 'id']) + return all_annotated.reindex(columns=column_order).sort_values( + by=["chrom", "genomic_pos", "id"] + ) def main(): - info = pd.read_csv(snakemake.input.info, sep = '\t', dtype=str) - tumor = pd.read_csv(snakemake.input.neo, sep = '\t', dtype={'pos_in_id_seq': str}) - normal = pd.read_csv(snakemake.input.normal, sep = '\t', 
dtype={'pos_in_id_seq': str}) + info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str) + tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str}) + normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str}) data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias) - data.to_csv(snakemake.output[0], index=False, sep = '\t') + data.to_csv(snakemake.output[0], index=False, sep="\t") + -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) From 52b46cb27edece3651b2d88e33ced7ceacdc57f4 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 16 Aug 2022 09:00:21 +0000 Subject: [PATCH 134/191] code review by @tedil --- workflow/scripts/merge_neoantigen_info.py | 35 +++++++++++------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 32a50556..4599f932 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -1,5 +1,4 @@ import sys -from xml.sax.handler import all_properties sys.stderr = open(snakemake.log[0], "w") @@ -46,6 +45,7 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: """ if normal_p == "nan" or normal_p == "": return (tumor_p, normal_p) + assert len(tumor_p) == len(normal_p), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths." diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] tp_changed = tumor_p np_changed = normal_p @@ -67,18 +67,19 @@ def diff_tumor_normal_peptides( tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze() # Silent mutations should not be included in microphaser output. 
if normal_pep == tumor_pep: - sys.exit( + raise ValueError( f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n" "Please fix this upstream or comment out this check to ignore this problem.\n" ) # Remove groups where the tumor peptide contains a stop codon. # TODO: Maybe this should be a hard fail complaining to fix this upstream? + # TODO: Write out warning. if "X" in tumor_pep: + print(f"Warning: ", file=sys.stderr) return group.loc[[], :] - ( - group.loc[group["alias"] == tumor_alias, column], - group.loc[group["alias"] == "normal", column], - ) = highlight_peptides_diff(tumor_pep, normal_pep) + t_diff, n_diff = highlight_peptides_diff(tumor_pep, normal_pep) + group.loc[group["alias"] == tumor_alias, column] = t_diff + group.loc[group["alias"] == "normal", column] = n_diff return group.set_index("alias", append=True) @@ -108,12 +109,13 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: # TODO: Ensure that microphaser output contains only one entry per id. # If there is more than one entry per index, ensure that they are identical if len(info.groupby(info.index).filter(lambda g: (g.nunique() > 1).any())) > 0: - sys.exit( + raise ValueError( f"Found multiple differing entries for an 'id' in file '{snakemake.input.info}'. Please ensure that entries are unique per 'id'.\n" ) # Always take the first entry for each index. info = info.groupby(info.index).head(1) # TODO: Ensure that microphaser output is tidy data, with one row each for tumor and normal. + # TODO: factor out tidying of column pairs into a function. num_var_in_pep_tidy = ( info.assign(ngermline=lambda x: x.nvar - x.nsomatic) .melt( @@ -206,7 +208,7 @@ def merge_data_frames( subset=["transcript", "offset", "pep_seq", "aa_changes"] ) ] - sys.exit( + raise ValueError( "Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n" "This indicates an upstream issue, please fix this. 
The offending entries are:\n" f"{duplicates}\n" @@ -245,17 +247,12 @@ def merge_data_frames( ] return all_annotated.reindex(columns=column_order).sort_values( - by=["chrom", "genomic_pos", "id"] + by=["chrom", "offset", "id", "alias"] ) -def main(): - info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str) - tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str}) - normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str}) - data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias) - data.to_csv(snakemake.output[0], index=False, sep="\t") - - -if __name__ == "__main__": - sys.exit(main()) +info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str) +tumor = pd.read_csv(snakemake.input.neo, sep="\t", dtype={"pos_in_id_seq": str}) +normal = pd.read_csv(snakemake.input.normal, sep="\t", dtype={"pos_in_id_seq": str}) +data = merge_data_frames(info, tumor, normal, snakemake.wildcards.tumor_alias) +data.to_csv(snakemake.output[0], index=False, sep="\t") From f4960bb5655a7de25b23bf14d68e4bad204a8963 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 16 Aug 2022 11:26:39 +0000 Subject: [PATCH 135/191] truncate ids for netMHCpan join() on id to work --- workflow/scripts/merge_neoantigen_info.py | 54 +++++++++++++++-------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 4599f932..0a2906f2 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -3,7 +3,7 @@ sys.stderr = open(snakemake.log[0], "w") import pandas as pd -from typing import Tuple +from typing import List, Tuple def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: @@ -176,6 +176,29 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: ) +def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str): + if ( + sum( + 
df.duplicated( + subset=cols + ) + ) + > 0 + ): + duplicates = all_annotated[ + df.duplicated( + subset=cols + ) + ] + raise ValueError( + f"Found multiple rows with identical [ \"{'\", \"'.join(cols)}\" ] entries.\n" + "This indicates an upstream issue, please fix this.\n" + f"{specific_error}" + "The offending entries are:\n" + f"{duplicates}\n" + ) + + def merge_data_frames( info: pd.DataFrame, tumor: pd.DataFrame, normal: pd.DataFrame, tumor_alias: str ) -> pd.DataFrame: @@ -190,29 +213,22 @@ def merge_data_frames( .sort_index() ) info_tidy = tidy_info(info, tumor_alias) + + # netMHCpan 4.1 truncates the fasta entry IDs, so we have to cut down the IDs + # that microphaser originally provided to make the following .join() work + len_tumor_id = len(tumor_filtered["id"][0]) + len_normal_id = len(normal_filtered["id"][0]) + assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n" + info_tidy['id'] = info_tidy['id'].str[:len_tumor_id] + # Double-check for duplicates resulting from the id truncation + check_duplicates(info_tidy, ["id", "alias"]) + all_annotated = all_filtered.join(info_tidy, how="left").reset_index( level=["id", "alias"] ) # Double-check for weird duplicates, as previously done in Jan's code. - if ( - sum( - all_annotated.duplicated( - subset=["transcript", "offset", "pep_seq", "aa_changes"] - ) - ) - > 0 - ): - duplicates = all_annotated[ - all_annotated.duplicated( - subset=["transcript", "offset", "pep_seq", "aa_changes"] - ) - ] - raise ValueError( - "Found multiple rows with identical 'transcript', 'offset', 'pep_seq' and 'aa_changes' entries.\n" - "This indicates an upstream issue, please fix this. 
The offending entries are:\n" - f"{duplicates}\n" - ) + check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"]) column_order = [ "id", From da2974d082490ab4d6244360cb04cca2ea78015b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 16 Aug 2022 11:48:57 +0000 Subject: [PATCH 136/191] further fixes, working without index wherever possible --- workflow/scripts/merge_neoantigen_info.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 0a2906f2..f87f23d0 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -35,7 +35,7 @@ def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame: return ( sample_filtered.join(common_info, how="left") .assign(alias=alias) - .set_index("alias", append=True) + .reset_index() ) @@ -171,9 +171,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: all_tidy = num_var_in_pep_tidy.join( [num_var_sites_tidy, genomic_pos_tidy, aa_changes_tidy, nt_seq_tidy] ) - return all_tidy.reset_index( - level=[i for i in all_tidy.index.names if i not in ["id", "alias"]] - ) + return all_tidy.reset_index() def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str): @@ -190,8 +188,9 @@ def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str): subset=cols ) ] + cols_str = '", "'.join(cols) raise ValueError( - f"Found multiple rows with identical [ \"{'\", \"'.join(cols)}\" ] entries.\n" + f'Found multiple rows with identical [ "{cols_str}" ] entries.\n' "This indicates an upstream issue, please fix this.\n" f"{specific_error}" "The offending entries are:\n" @@ -207,10 +206,8 @@ def merge_data_frames( normal_filtered = get_filtered_per_alias(normal, "normal") all_filtered = ( pd.concat([tumor_filtered, normal_filtered]) - .reset_index(level=["pep_seq", "pos_in_id_seq"]) 
.groupby("id", group_keys=False) .apply(diff_tumor_normal_peptides, column="pep_seq", tumor_alias=tumor_alias) - .sort_index() ) info_tidy = tidy_info(info, tumor_alias) @@ -219,13 +216,11 @@ def merge_data_frames( len_tumor_id = len(tumor_filtered["id"][0]) len_normal_id = len(normal_filtered["id"][0]) assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n" - info_tidy['id'] = info_tidy['id'].str[:len_tumor_id] + info_tidy["id"] = info_tidy["id"].str[:len_tumor_id] # Double-check for duplicates resulting from the id truncation check_duplicates(info_tidy, ["id", "alias"]) - all_annotated = all_filtered.join(info_tidy, how="left").reset_index( - level=["id", "alias"] - ) + all_annotated = all_filtered.join(info_tidy, how="left", on=["id", "alias"]) # Double-check for weird duplicates, as previously done in Jan's code. check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"]) From 377bba433a0a1ddd6d981e5e1342ad19f6470d91 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 16 Aug 2022 12:16:40 +0000 Subject: [PATCH 137/191] further fixes to remove indices and for more specific error messages --- workflow/scripts/merge_neoantigen_info.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index f87f23d0..c98a1ed1 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -58,7 +58,7 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: def diff_tumor_normal_peptides( group: pd.DataFrame, column: str, tumor_alias: str ) -> pd.DataFrame: - group = group.reset_index(level="alias") + group = group normal_pep = group.loc[group["alias"] == "normal", column].fillna("") if normal_pep.empty: normal_pep = "" @@ -80,7 +80,7 @@ def diff_tumor_normal_peptides( t_diff, n_diff = 
highlight_peptides_diff(tumor_pep, normal_pep) group.loc[group["alias"] == tumor_alias, column] = t_diff group.loc[group["alias"] == "normal", column] = n_diff - return group.set_index("alias", append=True) + return group def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: @@ -174,7 +174,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: return all_tidy.reset_index() -def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str): +def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = ""): if ( sum( df.duplicated( @@ -218,9 +218,9 @@ def merge_data_frames( assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n" info_tidy["id"] = info_tidy["id"].str[:len_tumor_id] # Double-check for duplicates resulting from the id truncation - check_duplicates(info_tidy, ["id", "alias"]) + check_duplicates(info_tidy, ["id", "alias"], specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n") - all_annotated = all_filtered.join(info_tidy, how="left", on=["id", "alias"]) + all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"]) # Double-check for weird duplicates, as previously done in Jan's code. 
check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"]) From b425ff77bbda11266645ca31e43863215b32bb7b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 12:38:13 +0000 Subject: [PATCH 138/191] sort merged neoantigen infos by el_rank of tumor sample --- workflow/scripts/merge_neoantigen_info.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index c98a1ed1..d50f2a05 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -257,9 +257,13 @@ def merge_data_frames( "freq_credible_interval", ] - return all_annotated.reindex(columns=column_order).sort_values( - by=["chrom", "offset", "id", "alias"] - ) + def get_id_rank(group: pd.DataFrame, tumor_alias: str): + return group.loc[group["alias"] == tumor_alias, 'top_el_rank_el_rank'].squeeze() + + sort_rank = all_annotated.groupby('id').apply(get_id_rank, tumor_alias).rename('sort_rank') + all_sorted = all_annotated.merge(sort_rank, on=['id'], how='left').sort_values(['sort_rank', 'id', 'alias'], ascending=[True, True, False]).drop(columns='sort_rank') + + return all_sorted.reindex(columns=column_order) info = pd.read_csv(snakemake.input.info, sep="\t", dtype=str) From fb98a80ffa88e233a614efdc1fd38034d48dea2d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 12:39:49 +0000 Subject: [PATCH 139/191] get rid of further index usages --- workflow/scripts/merge_neoantigen_info.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index d50f2a05..c2082da7 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -15,27 +15,24 @@ def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: return ( df.loc[df.groupby(["pep_seq", 
"id"])[rank_col].idxmin(), columns_to_keep] .reset_index(level="allele") - .sort_index() - .drop_duplicates() .add_prefix(prefix) + .reset_index() + .drop_duplicates() ) def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame: + merge_cols = ["id", "pos_in_id_seq", "pep_seq"] common_info = ( - sample.set_index(["id", "pos_in_id_seq", "pep_seq"]) - .loc[:, ["ave_el_score", "num_binders"]] - .reset_index(level=["id", "pos_in_id_seq"]) + sample.loc[:, merge_cols + ["ave_el_score", "num_binders"]] .drop_duplicates() - .set_index(["id", "pos_in_id_seq"], append=True) ) sample_el = get_best_rank_per_peptide(sample, "el") sample_ba = get_best_rank_per_peptide(sample, "ba") - sample_filtered = sample_el.join(sample_ba) + sample_filtered = sample_el.merge(sample_ba, on=merge_cols) return ( - sample_filtered.join(common_info, how="left") + sample_filtered.merge(common_info, how="left", on=merge_cols) .assign(alias=alias) - .reset_index() ) From 167ea484898baf0c3eb12df5227bfb8b3b3b6f37 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 12:40:33 +0000 Subject: [PATCH 140/191] remove peptides with stop codon earlier --- workflow/scripts/merge_neoantigen_info.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index c2082da7..c6136f6a 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -62,18 +62,17 @@ def diff_tumor_normal_peptides( else: normal_pep = normal_pep.squeeze() tumor_pep = group.loc[group["alias"] == tumor_alias, column].fillna("").squeeze() + # Remove groups where the tumor peptide contains a stop codon. + # TODO: Maybe this should be a hard fail complaining to fix this upstream? + if "X" in tumor_pep: + print(f"Warning: ", file=sys.stderr) + return group.loc[[], :] # Silent mutations should not be included in microphaser output. 
if normal_pep == tumor_pep: raise ValueError( f"For peptide '{group['id'][0]}' the normal and the tumor peptide have an identical sequence ({normal_pep}).\n" "Please fix this upstream or comment out this check to ignore this problem.\n" ) - # Remove groups where the tumor peptide contains a stop codon. - # TODO: Maybe this should be a hard fail complaining to fix this upstream? - # TODO: Write out warning. - if "X" in tumor_pep: - print(f"Warning: ", file=sys.stderr) - return group.loc[[], :] t_diff, n_diff = highlight_peptides_diff(tumor_pep, normal_pep) group.loc[group["alias"] == tumor_alias, column] = t_diff group.loc[group["alias"] == "normal", column] = n_diff From 650a3500900972b1b2b44c93ec54c631bc0e1224 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 12:41:15 +0000 Subject: [PATCH 141/191] aggregate all transcripts for a peptide into a list column --- workflow/scripts/merge_neoantigen_info.py | 34 +++++++++++++---------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index c6136f6a..cea3c94a 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -85,21 +85,27 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: """ info = info.rename( columns={"credible_interval": "freq_credible_interval"} - ).set_index( - [ - "id", - "transcript", - "gene_id", - "gene_name", - "chrom", - "offset", - "frame", - "freq", - "freq_credible_interval", - "depth", - "strand", - ] ) + # Aggregate multiple identical entries that differ only in 'id' and 'transcript' + # into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated + # list. 
+ cols = [ c for c in info.columns if c not in ['id', 'transcript'] ] + aggregation_functions = {'id': lambda i: list(i), 'transcript': lambda t: '|'.join(set(t)) } + info = info.groupby(cols, dropna=False).agg(aggregation_functions).reset_index().explode('id').set_index( + [ + "id", + "transcript", + "gene_id", + "gene_name", + "chrom", + "offset", + "frame", + "freq", + "freq_credible_interval", + "depth", + "strand", + ] + ) int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"] info[int_cols] = info[int_cols].astype("int32") # TODO: Ensure that microphaser output contains only one entry per id. From 4467eb7d61707c01b36069a67a747615e6fb556b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 12:42:30 +0000 Subject: [PATCH 142/191] remove duplicate entries that differ only in microphaser id --- workflow/scripts/merge_neoantigen_info.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index cea3c94a..07dfce50 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -185,9 +185,10 @@ def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = "" ) > 0 ): - duplicates = all_annotated[ + duplicates = df[ df.duplicated( - subset=cols + subset=cols, + keep=False, ) ] cols_str = '", "'.join(cols) @@ -225,7 +226,11 @@ def merge_data_frames( all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"]) # Double-check for weird duplicates, as previously done in Jan's code. - check_duplicates(all_annotated, ["transcript", "offset", "pep_seq", "aa_changes"]) + # Jan's code was only checking for ["transcript", "offset", "pep_seq", "aa_changes"], + # we check for everything except id. 
+ cols_without_id = [ c for c in all_annotated.columns if c not in ['id'] ] + all_annotated = all_annotated.drop_duplicates(subset=cols_without_id) + check_duplicates(all_annotated, cols_without_id) column_order = [ "id", From 732717fe68722305c30c8e64087ed08346b6a9a3 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 12:46:56 +0000 Subject: [PATCH 143/191] update microphaser to 0.6 with filter TSV header fix --- workflow/envs/microphaser.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml index 00657086..d781c04c 100644 --- a/workflow/envs/microphaser.yaml +++ b/workflow/envs/microphaser.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - microphaser =0.5 + - microphaser =0.6 From d078e9cfd20a3812e00c7107b2a3bf1af700948e Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 18:13:04 +0000 Subject: [PATCH 144/191] detailed description of columns in neoantigen results table --- workflow/report/WES_results.rst | 1 - workflow/report/neoantigens.DNA.rst | 60 +++++++++++++++++++++++++++++ workflow/rules/MHC_binding.smk | 4 +- 3 files changed, 62 insertions(+), 3 deletions(-) delete mode 100644 workflow/report/WES_results.rst create mode 100644 workflow/report/neoantigens.DNA.rst diff --git a/workflow/report/WES_results.rst b/workflow/report/WES_results.rst deleted file mode 100644 index 4a699bdf..00000000 --- a/workflow/report/WES_results.rst +++ /dev/null @@ -1 +0,0 @@ -"Results - neoantigen candidate table" diff --git a/workflow/report/neoantigens.DNA.rst b/workflow/report/neoantigens.DNA.rst new file mode 100644 index 00000000..17ba537e --- /dev/null +++ b/workflow/report/neoantigens.DNA.rst @@ -0,0 +1,60 @@ +Neoantigens and corresponding normal peptides as phased and determined by +microphaser, with elution ligand / binding affinity predictions by netMHC +(netMHCpan and netMHCIIpan) to the HLA alleles determined by HLA-LA. 
+ +=================== +Column descriptions +=================== + +* **id**: Peptide ID assigned by microphaser. +* **pep_seq**: Sequence of the full peptide that was given to netMHC(II)pan. Amino acids that + are different between the normal and the tumor sample are highlighted in lower case. +* **pos_in_id_seq**: Position in pep_seq of the peptide that was used for prediction. + It seems like netMHCpan positions start at 0 and netMHCIIpan positions at 1. +* **alias**: Indicator of the type of sample (normal vs. some tumor sample). +* **num_binders**: Total number of peptide-HLA allele pairs from pep_seq that are considered binders, + either weak or strong. Cutoffs are applied to el_rank and are: + * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder) + * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder) +* **freq**: Allelic frequency of the peptide as predicted by microphaser. For a + credible allele frequency interval, see column freq_credible_interval. +* **depth**: Read depth at the peptide position. +* **num_var_sites**: Number of variant sites on the peptides haplotype in that sample (alias). +* **num_var_in_pep**: Number of variant sites within the peptide sequence. +* **top_el_rank_allele**: HLA allele with the best eluted ligand prediction score percentile rank. +* **top_el_rank_bind_core**: Binding core of the peptide for the HLA allele with the best + elution ligand score percentile rank. +* **top_el_rank_el_rank**: Percentile rank of the elution ligand score. The rank of the predicted binding + score when compared to a set of random natural peptides. This measure is not + affected by inherent bias of certain molecules towards higher or lower mean + predicted affinities. It is the recommended value for determining likely + binders / neoantigens of interest. 
Cutoffs recommended by netMHC(II)pan authors + are: + * netMHCpan 4.1: <0.5% (strong binder), <2.0% (weak binder) + * netMHCIIpan 4.1: <1.0% (strong binder), <5.0% (weak binder) +* **top_el_rank_el_score**: The raw eluted ligand prediction score +* **ave_el_score**: Average across the eluted ligand prediction scores of all alleles for this + peptide in the particular sample (alias). +* **top_ba_rank_allele**: HLA allele with the best binding affinity prediction score percentile rank. +* **top_ba_rank_bind_core**: Binding core of the peptide for the HLA allele with the best + binding affinity score percentile rank. +* **top_ba_rank_ba_rank**: Percentile rank of the predicted binding affinity compared to a set of 100.000 + random natural peptides. This measure is not affected by inherent bias of certain + molecules towards higher or lower mean predicted affinities. +* **top_ba_rank_ba_score**: Predicted binding affinity in log-scale. +* **aa_changes**: List of aa changes. For normal samples, only germline changes are listed. For + tumor samples, only somatic tumor changes are listed, even though the germline + changes also affect the tumor peptide. +* **genomic_pos**: Genomic position of the nucleotide change. +* **nt_seq**: Nucleotide sequence underlying the peptide. Nucleotide changes underlying the + amino acid changes are highlighted as lower case letters. +* **gene_name**: Common gene name / gene symbol of the peptide's gene of origin. +* **gene_id**: Ensembl gene id of the peptide's gene of origin. +* **transcript**: List of Ensembl transcript ids in which the peptide occurs. +* **chrom**: Chromosome on which the peptide's gene of origin is located. +* **offset**: Chromosomal position of the peptide's gene of origin. +* **frame**: Open reading frame that the peptide originates from. 0 indicates the regular + reading frame, non-zero values indicate frame shifts. +* **strand**: Strand of the gene / transcript. 
+* **freq_credible_interval**: Credible interval for freq, the allelic frequency of the peptide as predicted + by microphaser. diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/MHC_binding.smk index f6163b26..4ee0c185 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/MHC_binding.smk @@ -72,8 +72,8 @@ rule merge_neoantigen_info: output: report( "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", - caption="../report/WES_results.rst", - category="Results WES (netMHC)", + caption="../report/neoantigens.DNA.rst", + category="Neoantigens", ), log: "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", From 4463176ead51ab0386bdf387f8c8051736750ab5 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 17 Aug 2022 18:14:12 +0000 Subject: [PATCH 145/191] update HLA allele report description --- workflow/report/HLA_Types.rst | 1 - workflow/report/hla_alleles.rst | 1 + workflow/rules/HLAtyping.smk | 8 ++++---- 3 files changed, 5 insertions(+), 5 deletions(-) delete mode 100644 workflow/report/HLA_Types.rst create mode 100644 workflow/report/hla_alleles.rst diff --git a/workflow/report/HLA_Types.rst b/workflow/report/HLA_Types.rst deleted file mode 100644 index 999fe494..00000000 --- a/workflow/report/HLA_Types.rst +++ /dev/null @@ -1 +0,0 @@ -Typing of HLA profile. diff --git a/workflow/report/hla_alleles.rst b/workflow/report/hla_alleles.rst new file mode 100644 index 00000000..a793a914 --- /dev/null +++ b/workflow/report/hla_alleles.rst @@ -0,0 +1 @@ +HLA allele profile as determined by HLA-LA. 
diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/HLAtyping.smk index 179a2c67..2bbb1435 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/HLAtyping.smk @@ -55,13 +55,13 @@ rule parse_and_filter_hla_alleles_for_netmhc: output: hlaI=report( "results/HLA-LA/{group}.{alias}.hlaI.tsv", - caption="../report/HLA_Types.rst", - category="HLA-Typing(HLA-LA)", + caption="../report/hla_alleles.rst", + category="HLA alleles", ), hlaII=report( "results/HLA-LA/{group}.{alias}.hlaII.tsv", - caption="../report/HLA_Types.rst", - category="HLA-Typing(HLA-LA)", + caption="../report/hla_alleles.rst", + category="HLA alleles", ), log: "logs/parse-HLA-LA/{group}.{alias}.log", From 2e9a0b6664c415417036b63d92b54f2874f618bd Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 13:09:07 +0000 Subject: [PATCH 146/191] snakecase and lower case wherever possible --- workflow/Snakefile | 4 ++-- .../{neoantigens.DNA.rst => neoantigens.dna.rst} | 0 .../{RNA_results.rst => neoantigens.rna.rst} | 0 workflow/rules/common.smk | 8 ++++---- workflow/rules/{HLAtyping.smk => hla_typing.smk} | 14 +++++++------- .../rules/{MHC_binding.smk => mhc_binding.smk} | 10 +++++----- workflow/rules/ref.smk | 12 ++++++------ 7 files changed, 24 insertions(+), 24 deletions(-) rename workflow/report/{neoantigens.DNA.rst => neoantigens.dna.rst} (100%) rename workflow/report/{RNA_results.rst => neoantigens.rna.rst} (100%) rename workflow/rules/{HLAtyping.smk => hla_typing.smk} (86%) rename workflow/rules/{MHC_binding.smk => mhc_binding.smk} (94%) diff --git a/workflow/Snakefile b/workflow/Snakefile index f75eba84..f00c401a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -29,8 +29,8 @@ include: "rules/common.smk" include: "rules/utils.smk" include: "rules/ref.smk" include: "rules/microphaser.smk" -include: "rules/HLAtyping.smk" -include: "rules/MHC_binding.smk" +include: "rules/hla_typing.smk" +include: "rules/mhc_binding.smk" rule all: diff --git 
a/workflow/report/neoantigens.DNA.rst b/workflow/report/neoantigens.dna.rst similarity index 100% rename from workflow/report/neoantigens.DNA.rst rename to workflow/report/neoantigens.dna.rst diff --git a/workflow/report/RNA_results.rst b/workflow/report/neoantigens.rna.rst similarity index 100% rename from workflow/report/RNA_results.rst rename to workflow/report/neoantigens.rna.rst diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9c928417..f25cdb85 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -125,8 +125,8 @@ def get_final_output(): else: final_output = expand( [ - "results/HLA-LA/{group}.{tumor_alias}.hlaI.tsv", - "results/HLA-LA/{group}.{tumor_alias}.hlaII.tsv", + "results/hla_la/{group}.{tumor_alias}.hlaI.tsv", + "results/hla_la/{group}.{tumor_alias}.hlaII.tsv", ], group=group, tumor_alias=tumor_aliases, @@ -183,7 +183,7 @@ def get_bam_from_group_and_alias(ext=".bam"): def get_alleles_MHCI(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( - "results/HLA-LA/{group}.{alias}.hlaI.tsv", + "results/hla_la/{group}.{alias}.hlaI.tsv", group=wildcards.group, alias=alias, ) @@ -193,7 +193,7 @@ def get_alleles_MHCII(wildcards): alias = "normal" if wildcards.peptide_type == "normal" else wildcards.tumor_alias return expand( # TODO: check that hlaII is correct here, and not hlaI which it previously was - "results/HLA-LA/{group}.{alias}.hlaII.tsv", + "results/hla_la/{group}.{alias}.hlaII.tsv", group=wildcards.group, alias=alias, ) diff --git a/workflow/rules/HLAtyping.smk b/workflow/rules/hla_typing.smk similarity index 86% rename from workflow/rules/HLAtyping.smk rename to workflow/rules/hla_typing.smk index 2bbb1435..98034abf 100644 --- a/workflow/rules/HLAtyping.smk +++ b/workflow/rules/hla_typing.smk @@ -1,14 +1,14 @@ -rule HLA_LA: +rule hla_la: input: bam=get_bam_from_group_and_alias(), bai=get_bam_from_group_and_alias(ext=".bai"), 
index="resources/graphs/PRG_MHC_GRCh38_withIMGT/serializedGRAPH", ext_idx="resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.pac", output: - "results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", + "results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt", threads: 7 log: - "logs/HLA-LA/{group}_{alias}.log", + "logs/hla_la/{group}_{alias}.log", params: graph=lambda w, input: os.path.basename(os.path.dirname(input.index)), graphdir=lambda w, input: os.path.dirname(os.path.dirname(input.index)), @@ -49,21 +49,21 @@ rule net_mhc_two_pan_alleles: rule parse_and_filter_hla_alleles_for_netmhc: input: - hla_la_bestguess="results/HLA-LA/output/{group}_{alias}/hla/R1_bestguess_G.txt", + hla_la_bestguess="results/hla_la/output/{group}_{alias}/hla/R1_bestguess_G.txt", mhc_one_alleles="resources/hla_alleles/available_alleles.net_mhc_pan.txt", mhc_two_alleles="resources/hla_alleles/available_alleles.net_mhc_two_pan.txt", output: hlaI=report( - "results/HLA-LA/{group}.{alias}.hlaI.tsv", + "results/hla_la/{group}.{alias}.hlaI.tsv", caption="../report/hla_alleles.rst", category="HLA alleles", ), hlaII=report( - "results/HLA-LA/{group}.{alias}.hlaII.tsv", + "results/hla_la/{group}.{alias}.hlaII.tsv", caption="../report/hla_alleles.rst", category="HLA alleles", ), log: - "logs/parse-HLA-LA/{group}.{alias}.log", + "logs/parse_hla_la/{group}.{alias}.log", script: "../scripts/parse_and_filter_hla_alleles_for_netmhc.py" diff --git a/workflow/rules/MHC_binding.smk b/workflow/rules/mhc_binding.smk similarity index 94% rename from workflow/rules/MHC_binding.smk rename to workflow/rules/mhc_binding.smk index 4ee0c185..0a326942 100644 --- a/workflow/rules/MHC_binding.smk +++ b/workflow/rules/mhc_binding.smk @@ -72,7 +72,7 @@ rule merge_neoantigen_info: output: report( "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", - caption="../report/neoantigens.DNA.rst", + caption="../report/neoantigens.dna.rst", 
category="Neoantigens", ), log: @@ -81,19 +81,19 @@ rule merge_neoantigen_info: "../scripts/merge_neoantigen_info.py" -rule add_RNA_info: +rule add_rna_info: input: counts="results/kallisto/{group}.{tumor_alias}", table="results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", output: report( "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv", - caption="../report/RNA_results.rst", - category="Results RNA", + caption="../report/neoantigens.rna.rst", + category="Neoantigens", ), params: abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), log: - "logs/add-RNA/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", + "logs/add_rna/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", script: "../scripts/add_rna_info.py" diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 0d6324a0..d041c2e6 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -142,7 +142,7 @@ rule genome_dict: "0.45.1/bio/picard/createsequencedictionary" -rule download_HLALA_graph: +rule download_hla_la_graph: output: directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/PRG"), directory("resources/graphs/PRG_MHC_GRCh38_withIMGT/knownReferences"), @@ -158,13 +158,13 @@ rule download_HLALA_graph: "/PRG_MHC_GRCh38_withIMGT/PRG", "" ), log: - "logs/download-HLA-LA-graph.log", + "logs/download_hla_la_graph.log", shell: "( cd {params.graphs_dir} && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz " "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz && rm PRG_MHC_GRCh38_withIMGT.tar.gz ) 2> {log}" -rule index_HLALA: +rule index_hla_la: input: "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", output: @@ -176,12 +176,12 @@ rule index_HLALA: path=lambda wc, input: os.path.dirname(os.path.dirname(input[0])), graph=lambda wc, input: os.path.basename(os.path.dirname(input[0])), log: - "logs/index-HLA-LA-graph.log", + "logs/index_hla_la_graph.log", shell: "HLA-LA.pl --prepareGraph 1 --customGraphDir 
{params.path} --graph {params.graph} > {log} 2>&1" -rule index_HLALA_extended_ref: +rule index_hla_la_extended_ref: input: "resources/graphs/PRG_MHC_GRCh38_withIMGT/extendedReferenceGenome/extendedReferenceGenome.fa", output: @@ -193,7 +193,7 @@ rule index_HLALA_extended_ref: conda: "../envs/hla_la.yaml" log: - "logs/index_HLA-LA_extended_ref.log", + "logs/index_hla_la_extended_ref.log", shell: "bwa index {input} > {log} 2>&1" From 946bbdf8b41faffb706447e8c937065b2466bc9d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 13:10:45 +0000 Subject: [PATCH 147/191] snakefmt --- workflow/rules/mhc_binding.smk | 8 ++++++-- workflow/rules/microphaser.smk | 24 +++++++++++++++++------- workflow/rules/ref.smk | 4 ++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk index 0a326942..db077370 100644 --- a/workflow/rules/mhc_binding.smk +++ b/workflow/rules/mhc_binding.smk @@ -12,7 +12,9 @@ rule net_mhc_pan: extra=config["params"]["net_mhc_pan"]["extra"], netMHC=config["params"]["net_mhc_pan"]["location"], length=config["params"]["net_mhc_pan"]["peptide_len"], - alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), + alleles=lambda wc, input: ",".join( + pd.read_csv(input.alleles[0], header=None)[0] + ), shell: "( " "if [ -s {input.peptides} ]; " @@ -38,7 +40,9 @@ rule net_mhc_two_pan: extra=config["params"]["net_mhc_two_pan"]["extra"], netMHC=config["params"]["net_mhc_two_pan"]["location"], length=config["params"]["net_mhc_two_pan"]["peptide_len"], - alleles=lambda wc, input: ",".join( pd.read_csv(input.alleles[0], header=None)[0] ), + alleles=lambda wc, input: ",".join( + pd.read_csv(input.alleles[0], header=None)[0] + ), shell: "( " "if [ -s {input.peptides} ]; " diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index a57340ad..727d5b99 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -21,7 
+21,7 @@ rule add_somatic_flag: output: "results/final-calls/{group}.{set}.somatic_flag.norm.bcf", log: - "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log" + "logs/bcftools_annotate/{group}.{set}.somatic_flag.norm.log", conda: "../envs/bcftools.yaml" shell: @@ -41,14 +41,16 @@ rule merge_tumor_normal: "results/final-calls/{{group}}.{sets}.norm.bcf", sets=[ config["params"]["microphaser"]["variant_sets"]["normal"], - config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag", + config["params"]["microphaser"]["variant_sets"]["tumor"] + + ".somatic_flag", ], ), index=expand( "results/final-calls/{{group}}.{sets}.norm.bcf.csi", sets=[ config["params"]["microphaser"]["variant_sets"]["normal"], - config["params"]["microphaser"]["variant_sets"]["tumor"] + ".somatic_flag", + config["params"]["microphaser"]["variant_sets"]["tumor"] + + ".somatic_flag", ], ), output: @@ -77,7 +79,11 @@ rule microphaser_tumor: conda: "../envs/microphaser.yaml" params: - window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3, + window_length=lambda w: max( + config["params"]["net_mhc_pan"]["peptide_len"], + config["params"]["net_mhc_two_pan"]["peptide_len"], + ) + * 3, shell: "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " "< {input.track} > {output.mt_fasta} 2> {log}" @@ -102,7 +108,11 @@ rule microphaser_normal: conda: "../envs/microphaser.yaml" params: - window_length=lambda w: max(config["params"]["net_mhc_pan"]["peptide_len"],config["params"]["net_mhc_two_pan"]["peptide_len"])*3, + window_length=lambda w: max( + config["params"]["net_mhc_pan"]["peptide_len"], + config["params"]["net_mhc_two_pan"]["peptide_len"], + ) + * 3, shell: "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} " "< {input.track} > {output.wt_fasta} 2> {log}" @@ 
-133,7 +143,7 @@ rule build_normal_proteome_db: conda: "../envs/microphaser.yaml" params: - length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"], + length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"], shell: "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}" @@ -159,7 +169,7 @@ rule microphaser_filter: conda: "../envs/microphaser.yaml" params: - length=lambda wildcards: config["params"][ wildcards.mhc]["peptide_len"], + length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"], shell: "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}" diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index d041c2e6..6e7db8a5 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -83,7 +83,7 @@ rule create_somatic_flag_header_line: output: "resources/somatic_flag_header_line.txt", log: - "logs/create_somatic_flag_header_line.log" + "logs/create_somatic_flag_header_line.log", shell: """ ( echo '##INFO=' > {output} ) 2> {log} @@ -96,7 +96,7 @@ rule create_genome_somatic_flag_bed: output: "resources/genome.somatic_flag.bed", log: - "logs/create_genome_somatic_flag_bed.log" + "logs/create_genome_somatic_flag_bed.log", conda: "../envs/gawk.yaml" cache: True From 5adc1dfd968efd7d7f4eab3e7068a1518c460d19 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 13:12:42 +0000 Subject: [PATCH 148/191] black on worfklow/scripts/ --- workflow/scripts/add_rna_info.py | 10 +-- .../scripts/count_neoantigen_occurrences.py | 23 +++--- workflow/scripts/merge_neoantigen_info.py | 70 +++++++++++-------- workflow/scripts/sample_comp_plot.py | 53 +++++++------- workflow/scripts/tidy_mhc_output.py | 6 +- workflow/scripts/tsv_to_xlsx.py | 2 +- 6 files changed, 92 insertions(+), 72 deletions(-) diff --git a/workflow/scripts/add_rna_info.py 
b/workflow/scripts/add_rna_info.py index b8690f86..0eb2298e 100644 --- a/workflow/scripts/add_rna_info.py +++ b/workflow/scripts/add_rna_info.py @@ -5,13 +5,15 @@ import pandas as pd ## load data table -data = pd.read_csv(snakemake.input["table"], sep='\t') +data = pd.read_csv(snakemake.input["table"], sep="\t") ## Merge transcript count -transcript_count = pd.read_csv(snakemake.params["abundance"], sep='\t') +transcript_count = pd.read_csv(snakemake.params["abundance"], sep="\t") transcript_count = transcript_count[["target_id", "tpm"]] transcript_count.columns = ["Transcript_ID", "TPM"] -transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split('.', expand=True)[0] +transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split( + ".", expand=True +)[0] data = data.merge(transcript_count, on="Transcript_ID", how="left") -data.to_csv(snakemake.output[0], sep='\t', index=False) +data.to_csv(snakemake.output[0], sep="\t", index=False) diff --git a/workflow/scripts/count_neoantigen_occurrences.py b/workflow/scripts/count_neoantigen_occurrences.py index a70caa36..eb9c917a 100644 --- a/workflow/scripts/count_neoantigen_occurrences.py +++ b/workflow/scripts/count_neoantigen_occurrences.py @@ -9,16 +9,21 @@ dfs, dfs_id = dict(), [] -for f in files: - df = pd.read_csv(f, sep = '\t') - dfs[f] = df - df = df[["ID_tumor"]] - dfs_id.append(df) +for f in files: + df = pd.read_csv(f, sep="\t") + dfs[f] = df + df = df[["ID_tumor"]] + dfs_id.append(df) ids = pd.concat(dfs_id) -ids=ids.groupby(ids.columns.tolist(), as_index = False).size().reset_index().rename(columns = {0:"occurrences"}) -ids.to_csv("out/tables/candidate_occurrences.tsv", sep = '\t', index = False) +ids = ( + ids.groupby(ids.columns.tolist(), as_index=False) + .size() + .reset_index() + .rename(columns={0: "occurrences"}) +) +ids.to_csv("out/tables/candidate_occurrences.tsv", sep="\t", index=False) for k, v in dfs.items(): - df = v.merge(ids, on = "ID_tumor") - 
df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep = ',', index = False) + df = v.merge(ids, on="ID_tumor") + df.to_csv(k.replace("filtered.tsv", "filtered.counts.csv"), sep=",", index=False) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 07dfce50..850792d0 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -23,16 +23,14 @@ def get_best_rank_per_peptide(df: pd.DataFrame, rank_type: str) -> pd.DataFrame: def get_filtered_per_alias(sample: pd.DataFrame, alias: str) -> pd.DataFrame: merge_cols = ["id", "pos_in_id_seq", "pep_seq"] - common_info = ( - sample.loc[:, merge_cols + ["ave_el_score", "num_binders"]] - .drop_duplicates() - ) + common_info = sample.loc[ + :, merge_cols + ["ave_el_score", "num_binders"] + ].drop_duplicates() sample_el = get_best_rank_per_peptide(sample, "el") sample_ba = get_best_rank_per_peptide(sample, "ba") sample_filtered = sample_el.merge(sample_ba, on=merge_cols) - return ( - sample_filtered.merge(common_info, how="left", on=merge_cols) - .assign(alias=alias) + return sample_filtered.merge(common_info, how="left", on=merge_cols).assign( + alias=alias ) @@ -42,7 +40,9 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: """ if normal_p == "nan" or normal_p == "": return (tumor_p, normal_p) - assert len(tumor_p) == len(normal_p), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths." + assert len(tumor_p) == len( + normal_p + ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths." diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] tp_changed = tumor_p np_changed = normal_p @@ -83,15 +83,21 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: """ Get the -o info output of the microphaser filter command into tidy data format. 
""" - info = info.rename( - columns={"credible_interval": "freq_credible_interval"} - ) + info = info.rename(columns={"credible_interval": "freq_credible_interval"}) # Aggregate multiple identical entries that differ only in 'id' and 'transcript' # into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated # list. - cols = [ c for c in info.columns if c not in ['id', 'transcript'] ] - aggregation_functions = {'id': lambda i: list(i), 'transcript': lambda t: '|'.join(set(t)) } - info = info.groupby(cols, dropna=False).agg(aggregation_functions).reset_index().explode('id').set_index( + cols = [c for c in info.columns if c not in ["id", "transcript"]] + aggregation_functions = { + "id": lambda i: list(i), + "transcript": lambda t: "|".join(set(t)), + } + info = ( + info.groupby(cols, dropna=False) + .agg(aggregation_functions) + .reset_index() + .explode("id") + .set_index( [ "id", "transcript", @@ -106,6 +112,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: "strand", ] ) + ) int_cols = ["nvar", "nsomatic", "nvariant_sites", "nsomvariant_sites"] info[int_cols] = info[int_cols].astype("int32") # TODO: Ensure that microphaser output contains only one entry per id. 
@@ -177,14 +184,7 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: def check_duplicates(df: pd.DataFrame, cols: List[str], specific_error: str = ""): - if ( - sum( - df.duplicated( - subset=cols - ) - ) - > 0 - ): + if sum(df.duplicated(subset=cols)) > 0: duplicates = df[ df.duplicated( subset=cols, @@ -218,17 +218,23 @@ def merge_data_frames( # that microphaser originally provided to make the following .join() work len_tumor_id = len(tumor_filtered["id"][0]) len_normal_id = len(normal_filtered["id"][0]) - assert len_tumor_id == len_normal_id, f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n" + assert ( + len_tumor_id == len_normal_id + ), f"'id's' are of different length, tumor: {len_tumor_id}, normal: {len_normal_id}, please check your input data.\n" info_tidy["id"] = info_tidy["id"].str[:len_tumor_id] # Double-check for duplicates resulting from the id truncation - check_duplicates(info_tidy, ["id", "alias"], specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n") + check_duplicates( + info_tidy, + ["id", "alias"], + specific_error="Here, the problem is most likely the truncation of 'id's by netMHCpan.\n", + ) all_annotated = all_filtered.merge(info_tidy, how="left", on=["id", "alias"]) # Double-check for weird duplicates, as previously done in Jan's code. # Jan's code was only checking for ["transcript", "offset", "pep_seq", "aa_changes"], # we check for everything except id. 
- cols_without_id = [ c for c in all_annotated.columns if c not in ['id'] ] + cols_without_id = [c for c in all_annotated.columns if c not in ["id"]] all_annotated = all_annotated.drop_duplicates(subset=cols_without_id) check_duplicates(all_annotated, cols_without_id) @@ -265,10 +271,16 @@ def merge_data_frames( ] def get_id_rank(group: pd.DataFrame, tumor_alias: str): - return group.loc[group["alias"] == tumor_alias, 'top_el_rank_el_rank'].squeeze() - - sort_rank = all_annotated.groupby('id').apply(get_id_rank, tumor_alias).rename('sort_rank') - all_sorted = all_annotated.merge(sort_rank, on=['id'], how='left').sort_values(['sort_rank', 'id', 'alias'], ascending=[True, True, False]).drop(columns='sort_rank') + return group.loc[group["alias"] == tumor_alias, "top_el_rank_el_rank"].squeeze() + + sort_rank = ( + all_annotated.groupby("id").apply(get_id_rank, tumor_alias).rename("sort_rank") + ) + all_sorted = ( + all_annotated.merge(sort_rank, on=["id"], how="left") + .sort_values(["sort_rank", "id", "alias"], ascending=[True, True, False]) + .drop(columns="sort_rank") + ) return all_sorted.reindex(columns=column_order) diff --git a/workflow/scripts/sample_comp_plot.py b/workflow/scripts/sample_comp_plot.py index d8dbf6e0..ffbd7aaa 100644 --- a/workflow/scripts/sample_comp_plot.py +++ b/workflow/scripts/sample_comp_plot.py @@ -10,8 +10,10 @@ import matplotlib.pyplot as plt from pysam import VariantFile -variant_df = pd.read_csv(snakemake.input[0], sep='\t').fillna(0.0) -variant_df = variant_df[["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")]] +variant_df = pd.read_csv(snakemake.input[0], sep="\t").fillna(0.0) +variant_df = variant_df[ + ["CHROM", "POS"] + [c for c in variant_df.columns if c.endswith("Freq")] +] ## tidy data - for facet plot tidy_df = variant_df.melt(id_vars=["CHROM", "POS"], var_name="Sample", value_name="VAF") g = sns.FacetGrid(tidy_df, col="Sample") @@ -19,25 +21,27 @@ g.savefig(snakemake.output["facet"]) plt.close() ## 
pairplot -sns.pairplot(variant_df.drop(["CHROM", "POS"],axis=1), diag_kind="kde") +sns.pairplot(variant_df.drop(["CHROM", "POS"], axis=1), diag_kind="kde") plt.savefig(snakemake.output["pairplot"]) plt.close() + def overlap_pct(x, y, **kws): n = 0 - for i in range(0,len(x)): + for i in range(0, len(x)): if (x[i] > 0) & (y[i] > 0): n += 1 - overlap=n/len([e for e in x if e > 0]) + overlap = n / len([e for e in x if e > 0]) ax = plt.gca() - ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4)) - ax.annotate("Shared Variants: {}".format(n), xy=(.2, .6)) + ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(0.2, 0.4)) + ax.annotate("Shared Variants: {}".format(n), xy=(0.2, 0.6)) + def variants(x, **kws): positive = len([e for e in x if e > 0]) - ax = plt.gca() - ax.annotate("#Variants: {}".format(positive), xy=(.2, .5), -xycoords=ax.transAxes) + ax = plt.gca() + ax.annotate("#Variants: {}".format(positive), xy=(0.2, 0.5), xycoords=ax.transAxes) + g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1)) @@ -46,7 +50,7 @@ def variants(x, **kws): g.savefig(snakemake.output["grid"]) plt.close() -#def neg_overlap_pct(x, y, **kws): +# def neg_overlap_pct(x, y, **kws): # n = 0 # for i in range(0,len(x)): # if (x[i] == 0) & (y[i] == 0): @@ -56,25 +60,24 @@ def variants(x, **kws): # ax.annotate("Shared Fraction: {:.2f}".format(overlap), xy=(.2, .4)) # ax.annotate("Shared missing variants: {}".format(n), xy=(.2, .6)) -#def neg_variants(x, **kws): +# def neg_variants(x, **kws): # zero = len([e for e in x if e == 0]) -# ax = plt.gca() +# ax = plt.gca() # ax.annotate("#Missing Variants: {}".format(zero), xy=(.2, .5), -#xycoords=ax.transAxes) +# xycoords=ax.transAxes) -#g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1)) +# g = sns.PairGrid(variant_df.drop(["CHROM", "POS"], axis=1)) -#g.map_offdiag(neg_overlap_pct) -#g.map_diag(neg_variants) -#g.savefig("plots/Mssing_Variant_table.pdf") -#plt.close() +# g.map_offdiag(neg_overlap_pct) +# 
g.map_diag(neg_variants) +# g.savefig("plots/Mssing_Variant_table.pdf") +# plt.close() -for c in variant_df.drop(["CHROM", "POS"], axis=1).columns: - sns.distplot(variant_df[[c]][variant_df[c] > 0]) - plt.title(c) - plt.savefig("plots/positive_" + c +".distplot.pdf") +for c in variant_df.drop(["CHROM", "POS"], axis=1).columns: + sns.distplot(variant_df[[c]][variant_df[c] > 0]) + plt.title(c) + plt.savefig("plots/positive_" + c + ".distplot.pdf") plt.close() sns.distplot(variant_df[[c]]) - plt.savefig("plots/all_" + c +".distplot.pdf") + plt.savefig("plots/all_" + c + ".distplot.pdf") plt.close() - diff --git a/workflow/scripts/tidy_mhc_output.py b/workflow/scripts/tidy_mhc_output.py index 3d29dfb8..fa129c0a 100644 --- a/workflow/scripts/tidy_mhc_output.py +++ b/workflow/scripts/tidy_mhc_output.py @@ -13,7 +13,7 @@ # * generated with the `-BA` option to include binding affinity prediction # The mapping of index column names used here to original names in netMHCpan files -# is (please excuse the pd.NA tuples, they make header and index handling +# is (please excuse the pd.NA tuples, they make header and index handling # easier further down the line): INDEX_NAMES = { (pd.NA, "Pos"): "pos_in_id_seq", @@ -88,9 +88,7 @@ def parse_file(mhc_in: str): header = pd.concat([first_header_line, second_header_line], axis="columns") header = header.fillna(method="ffill") header.loc[ - header.column_name.isin( - [ index_col for (_, index_col) in INDEX_NAMES.keys() ] - ), + header.column_name.isin([index_col for (_, index_col) in INDEX_NAMES.keys()]), "allele", ] = pd.NA diff --git a/workflow/scripts/tsv_to_xlsx.py b/workflow/scripts/tsv_to_xlsx.py index 5d9bf10f..f99e77ec 100644 --- a/workflow/scripts/tsv_to_xlsx.py +++ b/workflow/scripts/tsv_to_xlsx.py @@ -5,4 +5,4 @@ import pandas as pd data = pd.read_csv(snakemake.input.tsv, sep="\t") -data.to_excel(snakemake.output.xlsx, index=False) \ No newline at end of file +data.to_excel(snakemake.output.xlsx, index=False) From 
b166c9d52591add05860e791535e7b3306759505 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 13:21:15 +0000 Subject: [PATCH 149/191] fix config.schema.yaml --- workflow/schemas/config.schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 1186fc44..60708436 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -102,5 +102,5 @@ required: - units - ref - params - - epitope_prediction + - neoantigen_prediction - affinity From 3b3ed3ec6fb3c4672e0f2635c0a160b9bc1c08f7 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 13:32:34 +0000 Subject: [PATCH 150/191] further config.schema.yaml fixes --- workflow/schemas/config.schema.yaml | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 60708436..eedcb838 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -4,21 +4,6 @@ description: snakemake configuration file type: object -definitions: - filterentry: - type: object - additionalProperties: - type: string - evententry: - type: object - properties: - varlociraptor: - type: array - items: - type: string - filter: - type: string - properties: samples: type: string @@ -48,9 +33,6 @@ properties: activate: type: boolean - affinity: - type: object - properties: params: type: object properties: @@ -103,4 +85,3 @@ required: - ref - params - neoantigen_prediction - - affinity From 20eb61afe4c768d2f4ffda7fdc3527577a4aa195 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 13:44:19 +0000 Subject: [PATCH 151/191] silence misguided lint --- workflow/rules/ref.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 6e7db8a5..f6be3bb0 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ 
-155,7 +155,7 @@ rule download_hla_la_graph: "resources/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt", params: graphs_dir=lambda w, output: output[0].replace( - "/PRG_MHC_GRCh38_withIMGT/PRG", "" + "graphs/PRG_MHC_GRCh38_withIMGT/PRG", "graphs" ), log: "logs/download_hla_la_graph.log", From 3ee90aba269460c0b6dd6e4ea8c11307eb22f6dd Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 14:30:08 +0000 Subject: [PATCH 152/191] fix different config entries --- config/config.yaml | 2 +- workflow/rules/common.smk | 8 ++++---- workflow/rules/microphaser.smk | 10 +++++----- workflow/schemas/config.schema.yaml | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index a9c006cd..10c46191 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -21,7 +21,7 @@ ref: params: microphaser: # window_len should be at least 3 times the longest peptide_len specified below - variant_sets: + events: normal: "normal_only" tumor: "tumor_only" net_mhc_pan: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f25cdb85..0fd753f0 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -63,12 +63,12 @@ wildcard_constraints: pd.unique(samples.loc[samples["alias"].str.match("tumor"), "alias"]) ), normal_alias="normal", - tumor_set=config["params"]["microphaser"]["variant_sets"]["tumor"], - normal_set=config["params"]["microphaser"]["variant_sets"]["normal"], + tumor_set=config["params"]["microphaser"]["events"]["tumor"], + normal_set=config["params"]["microphaser"]["events"]["normal"], set="|".join( [ - config["params"]["microphaser"]["variant_sets"]["tumor"], - config["params"]["microphaser"]["variant_sets"]["normal"], + config["params"]["microphaser"]["events"]["tumor"], + config["params"]["microphaser"]["events"]["normal"], ] ), group="|".join(pd.unique(samples["group"])), diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 727d5b99..6879e32f 100644 --- 
a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -40,16 +40,16 @@ rule merge_tumor_normal: calls=expand( "results/final-calls/{{group}}.{sets}.norm.bcf", sets=[ - config["params"]["microphaser"]["variant_sets"]["normal"], - config["params"]["microphaser"]["variant_sets"]["tumor"] + config["params"]["microphaser"]["events"]["normal"], + config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag", ], ), index=expand( "results/final-calls/{{group}}.{sets}.norm.bcf.csi", sets=[ - config["params"]["microphaser"]["variant_sets"]["normal"], - config["params"]["microphaser"]["variant_sets"]["tumor"] + config["params"]["microphaser"]["events"]["normal"], + config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag", ], ), @@ -153,7 +153,7 @@ rule microphaser_filter: tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv", proteome=expand( "results/microphaser/bin/{{group}}.{normal_set}.{{mhc}}.normal_proteome.bin", - normal_set=config["params"]["microphaser"]["variant_sets"]["normal"], + normal_set=config["params"]["microphaser"]["events"]["normal"], ), output: mt_fasta=( diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index eedcb838..05e55b83 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -39,7 +39,7 @@ properties: microphaser: type: object properties: - variant_sets: + events: type: object properties: normal: From 4323a572cb54ff605c4e46fb502c9004738fac1d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 23 Aug 2022 14:45:24 +0000 Subject: [PATCH 153/191] snakefmt --- workflow/rules/microphaser.smk | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 6879e32f..4e6a3407 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -41,16 +41,14 @@ rule merge_tumor_normal: 
"results/final-calls/{{group}}.{sets}.norm.bcf", sets=[ config["params"]["microphaser"]["events"]["normal"], - config["params"]["microphaser"]["events"]["tumor"] - + ".somatic_flag", + config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag", ], ), index=expand( "results/final-calls/{{group}}.{sets}.norm.bcf.csi", sets=[ config["params"]["microphaser"]["events"]["normal"], - config["params"]["microphaser"]["events"]["tumor"] - + ".somatic_flag", + config["params"]["microphaser"]["events"]["tumor"] + ".somatic_flag", ], ), output: From 4dc72784d52a07636a033f8512332071d3011f94 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 26 Aug 2022 18:37:13 +0000 Subject: [PATCH 154/191] fix deduplication of microphaser filter info tsv --- workflow/scripts/merge_neoantigen_info.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/merge_neoantigen_info.py b/workflow/scripts/merge_neoantigen_info.py index 850792d0..669a7716 100644 --- a/workflow/scripts/merge_neoantigen_info.py +++ b/workflow/scripts/merge_neoantigen_info.py @@ -87,10 +87,13 @@ def tidy_info(info: pd.DataFrame, tumor_alias: str) -> pd.DataFrame: # Aggregate multiple identical entries that differ only in 'id' and 'transcript' # into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated # list. 
- cols = [c for c in info.columns if c not in ["id", "transcript"]] + cols = [c for c in info.columns if c not in ["id", "transcript", "depth", "freq", "freq_credible_interval"]] aggregation_functions = { "id": lambda i: list(i), - "transcript": lambda t: "|".join(set(t)), + "transcript": lambda t: "|".join(list(t)), + "depth": lambda d: "|".join(list(d)), + "freq": lambda f: "|".join(list(f)), + "freq_credible_interval": lambda c: "|".join(list(c)), } info = ( info.groupby(cols, dropna=False) From 6251ac4deff41dd5c879398186e7a460e3a2e41e Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 26 Aug 2022 18:38:14 +0000 Subject: [PATCH 155/191] NeoFox conda environment creation setup with post-deploy --- workflow/Snakefile | 14 +++ workflow/envs/neo_fox_deps.post-deploy.sh | 144 ++++++++++++++++++++++ workflow/envs/neo_fox_deps.yaml | 30 +++++ 3 files changed, 188 insertions(+) create mode 100755 workflow/envs/neo_fox_deps.post-deploy.sh create mode 100644 workflow/envs/neo_fox_deps.yaml diff --git a/workflow/Snakefile b/workflow/Snakefile index f00c401a..dee9f9ff 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -11,6 +11,20 @@ configfile: "config/config.yaml" scattergather: calling=24, +##### required envvars ##### + +envvars: + # For NeoFox installation: + # The tarballs for both netMHCpan and netMHCIIpan + # need to be donwloaded manually after registering + # online and deposited at a filesystem location + # that needs to be available as a shell environment + # variable when running snakemake. 
+ # For downloading, see the "Downloads" tab at: + # https://services.healthtech.dtu.dk/service.php?NetMHCpan-4.1 + "NET_MHC_PAN_4_1_TARBALL", + # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.0 + "NET_MHC_TWO_PAN_4_0_TARBALL", ##### setup report ##### diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh new file mode 100755 index 00000000..939b3c3a --- /dev/null +++ b/workflow/envs/neo_fox_deps.post-deploy.sh @@ -0,0 +1,144 @@ +set -euo pipefail + +# set all the necessary conda paths and +# ensure they exist +CONDA_BIN="${CONDA_PREFIX}/bin/" +CONDA_MAN1="${CONDA_PREFIX}/share/man/man1/" +mkdir -p $CONDA_MAN1 +CONDA_INFO="${CONDA_PREFIX}/share/info/" +mkdir -p $CONDA_INFO +CONDA_LIB="${CONDA_PREFIX}/lib/" +mkdir -p ${CONDA_LIB} +CONDA_ETC="${CONDA_PREFIX}/etc/" +mkdir -p $CONDA_ETC + +# install MixMHCpred, following: +# https://github.com/GfellerLab/MixMHCpred/blob/v2.1/README +MIX_MHC_PRED_VERSION="2.1" +MIX_MHC_PRED_LIB_PATH="$CONDA_PREFIX/lib/mix_mhc_pred/" +wget https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz +tar xzf v${MIX_MHC_PRED_VERSION}.tar.gz +cd MixMHCpred-${MIX_MHC_PRED_VERSION} +g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x +# TODO: when updating to v2.2, change this line to: +# MMP_PLACEHOLDER="/PATH_TO_MIXMHCPRED/lib" +MMP_PLACEHOLDER="YOUR PATH TO MixMHCpred/lib FOLDER" +grep "${MMP_PLACEHOLDER}" MixMHCpred +sed -i "s%${MMP_PLACEHOLDER}%${MIX_MHC_PRED_LIB_PATH}%" MixMHCpred +mv lib $MIX_MHC_PRED_LIB_PATH +mv MixMHCpred ${CONDA_BIN} +# TODO: when updating to v2.2, change this line to: +# mv MixMHCpred_license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf +mv license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf +MixMHCpred -i test/test.fa -o test/out.txt -a A0101,A2501,B0801,B1801 +diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt) +cd .. 
+rm v${MIX_MHC_PRED_VERSION}.tar.gz +rm -r MixMHCpred-${MIX_MHC_PRED_VERSION} + +# install MixMHC2pred, mostly following (we use the default GitHub-created +# .tar.gz file instead of the hand-crafted .zip file for future-proofing): +# https://github.com/GfellerLab/MixMHC2pred/blob/v1.2/README.md +MIX_MHC_TWO_PRED_VERSION="1.2" +MIX_MHC_TWO_PRED_LIB_PATH="${CONDA_LIB}/mix_mhc_two_pred/" +wget https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz +tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz +cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION} +mv -t ${CONDA_BIN} MixMHC2pred MixMHC2pred_unix +mv rpep ${CONDA_ETC} +ln -s ${CONDA_ETC}/rpep ${CONDA_BIN}/rpep +mv LICENSE ${CONDA_INFO}/MixMHC2pred_unix_LICENSE +MixMHC2pred_unix -i test/testData.txt -o test/out.txt -a DRB1_11_01 DRB3_02_02 DPA1_01_03__DPB1_04_01 DQA1_05_05__DQB1_03_01 +diff test/out.txt test/out_compare.txt +cd .. +rm v${MIX_MHC_TWO_PRED_VERSION}.tar.gz +rm -r MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION} + +# install PRIME, mostly following (minor corrections): +# https://github.com/GfellerLab/PRIME/blob/v1.0/README +PRIME_VERSION="1.0" +PRIME_LIB_PATH="${CONDA_LIB}/prime/" +wget https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz +tar xzf v${PRIME_VERSION}.tar.gz +cd PRIME-${PRIME_VERSION} +PRIME_PLACEHOLDER="/app/PRIME/lib" +grep "${PRIME_PLACEHOLDER}" PRIME +sed -i "s%${PRIME_PLACEHOLDER}%${PRIME_LIB_PATH}%" PRIME +mv lib $PRIME_LIB_PATH +mv PRIME ${CONDA_BIN} +mv PRIME_license.pdf ${CONDA_INFO} +PRIME -i test/test.txt -o test/out.txt -a A0201,A0101 +diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt) +cd .. +rm v${PRIME_VERSION}.tar.gz +rm -r PRIME-${PRIME_VERSION} + +# This is the non-portable version of the 1st line +# in both netMHCpan and netMHCIIpan, assuming a +# root install of tcsh. +TCSH_ROOT="#! 
/bin/tcsh -f" +# For the scripts to work with any tcsh in the +# $PATH, we need to change this to: +TCSH_PATH="#!/usr/bin/env tcsh" + +# install netMHCpan version 4.1 +# requires tcsh to have been installed via conda dependencies +NET_MHC_PAN_4_1_LIB="${CONDA_LIB}/netMHCpan_4_1/" +mkdir -p ${NET_MHC_PAN_4_1_LIB} +NET_MHC_PAN_4_1_ETC="${CONDA_ETC}/netMHCpan_4_1/" +mkdir -p ${NET_MHC_PAN_4_1_ETC} +tar xzf ${NET_MHC_PAN_4_1_TARBALL} +cd netMHCpan-4.1 +wget https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz +tar xzf data.tar.gz +rm data.tar.gz +grep "${TCSH_ROOT}" netMHCpan +sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCpan +grep -P "setenv\s+NMHOME" netMHCpan +sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_PAN_4_1_LIB}%" netMHCpan +mv -t ${CONDA_BIN} netMHCpan +mv -t ${CONDA_MAN1} netMHCpan.1 +mv -t ${CONDA_INFO} netMHCpan-4.1.readme +mv -t ${NET_MHC_PAN_4_1_LIB} Linux_x86_64 +mv -t ${NET_MHC_PAN_4_1_ETC} data +ln -s ${NET_MHC_PAN_4_1_ETC}/data ${NET_MHC_PAN_4_1_LIB}/data +cd test +netMHCpan -p test.pep -BA -xls -a HLA-A01:01,HLA-A02:01 -xlsfile my_NetMHCpan_out.xls +diff NetMHCpan_out.xls my_NetMHCpan_out.xls +cd ../.. 
+rm -r netMHCpan-4.1 + +# install netMHCIIpan version 4.0 +# requires tcsh to have been installed via conda dependencies +NET_MHC_TWO_PAN_4_0_LIB="${CONDA_LIB}/netMHCIIpan_4_0/" +mkdir -p ${NET_MHC_TWO_PAN_4_0_LIB} +NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/" +mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC} +tar xzf ${NET_MHC_PAN_TWO_4_0_TARBALL} +cd netMHCIIpan-4.0 +wget https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz +tar xzf data.tar.gz +rm data.tar.gz +grep "${TCSH_ROOT}" netMHCIIpan +sed -i "s%${TCSH_ROOT}%${TCSH_PATH}%" netMHCIIpan +grep -P "setenv\s+NMHOME" netMHCIIpan +sed -r -i "s%^setenv\s+NMHOME+.*$%setenv NMHOME ${NET_MHC_TWO_PAN_4_0_LIB}%" netMHCIIpan +mv -t ${CONDA_BIN} netMHCIIpan +mv -t ${CONDA_MAN1} netMHCIIpan.1 +mv -t ${CONDA_INFO} netMHCIIpan-4.0.readme +mv -t ${NET_MHC_TWO_PAN_4_0_LIB} Linux_x86_64 NetMHCIIpan-4.0.pl +mv -t ${NET_MHC_TWO_PAN_4_0_ETC} data +ln -s ${NET_MHC_TWO_PAN_4_0_ETC}/data ${NET_MHC_TWO_PAN_4_0_LIB}/data +cd test +netMHCIIpan -f example.fsa -a DRB1_0101 > example.fsa.myout +diff example.fsa.out example.fsa.myout +netMHCIIpan -f example.pep -inptype 1 -a DRB1_0101 > example.pep.myout +diff example.pep.out example.pep.myout +netMHCIIpan -f example.fsa -a H-2-IAb -s -u > example.fsa.sorted.myout +diff example.fsa.sorted.out example.fsa.sorted.myout +netMHCIIpan -f example.fsa -hlaseq DRB10101.fsa > example.fsa_hlaseq.myout +diff example.fsa_hlaseq.out example.fsa_hlaseq.myout +netMHCIIpan -f example.fsa -hlaseqA alpha.dat -hlaseq beta.dat > example.fsa_hlaseq_A+B.myout +diff example.fsa_hlaseq_A+B.out example.fsa_hlaseq_A+B.myout +cd ../.. 
+rm -r netMHCIIpan-4.0 diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml new file mode 100644 index 00000000..c4cb4a9b --- /dev/null +++ b/workflow/envs/neo_fox_deps.yaml @@ -0,0 +1,30 @@ +channels: + - conda-forge + - bioconda +dependencies: + # https://neofox.readthedocs.io/en/latest/02_installation.html#step-by-step-guide-without-docker + - r-base =3.6 + - python >=3.7, <=3.8 + # implicit unmentioned dependency of MixMHCpred and PRIME + - perl + # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp + - blast =2.10 + # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64 + - cxx-compiler + # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts + - tcsh =6.24 + # just make sure this is available for the post-deploy.sh script + - sed =4.8 + # R packages mentioned at the end of this section: + # https://neofox.readthedocs.io/en/latest/02_installation.html#configuration-of-the-reference-folder + - r-lattice + - r-ggplot2 + - r-caret + - r-peptides + - r-doparallel + - r-gbm + - bioconductor-biostrings + # https://neofox.readthedocs.io/en/latest/02_installation.html#install-neofox + - pip + - pip: + - neofox==0.6.4 From 0d1058cc4dd9b180de3ca6a492ceb8e15dc25ddf Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 26 Aug 2022 18:41:55 +0000 Subject: [PATCH 156/191] initial NeoFox config and run rules, rules for exact input missing --- .test/config/config.yaml | 7 +++ config/config.yaml | 7 +++ workflow/Snakefile | 1 + workflow/rules/annotate_neoantigens.smk | 74 +++++++++++++++++++++++++ workflow/rules/common.smk | 1 + 5 files changed, 90 insertions(+) create mode 100644 workflow/rules/annotate_neoantigens.smk diff --git a/.test/config/config.yaml b/.test/config/config.yaml index bcd1a88c..80c0e438 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -45,3 +45,10 @@ params: # the conda-provided tcsh installation, it needs to read 
(without quotes): # "#!/usr/bin/env tcsh" location: "../netMHCIIpan-4.1" + neo_fox: + activate: false + # This should be at least as long as the desired net_mhc_two_pan peptide length + peptide_len: 15 + # update the version number to get a newer release of the reference set of HLA Alleles + hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt" + extra: "" diff --git a/config/config.yaml b/config/config.yaml index 10c46191..083fc651 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -46,4 +46,11 @@ params: # the conda-provided tcsh installation, it needs to read (without quotes): # "#!/usr/bin/env tcsh" location: "../netMHCIIpan-4.1" + neo_fox: + activate: false + # This should be at least as long as the desired net_mhc_two_pan peptide length + peptide_len: 15 + # update the version number to get a newer release of the reference set of HLA Alleles + hla_alleles: "https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/allelelist/Allelelist.3480.txt" + extra: "" diff --git a/workflow/Snakefile b/workflow/Snakefile index dee9f9ff..ee333ba2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -45,6 +45,7 @@ include: "rules/ref.smk" include: "rules/microphaser.smk" include: "rules/hla_typing.smk" include: "rules/mhc_binding.smk" +include: "rules/annotate_neoantigens.smk" rule all: diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk new file mode 100644 index 00000000..b610c948 --- /dev/null +++ b/workflow/rules/annotate_neoantigens.smk @@ -0,0 +1,74 @@ +rule prepare_neo_fox_config_and_resources: + output: + config="resources/neo_fox/neo_fox_config.txt", + references=directory("resources/neo_fox/references/"), + conda: + "../envs/neo_fox_deps.yaml" + params: + hla_alleles=config["params"]["neofox"]["hla_alleles"], + shell: + """ + # environment variables necessary for neofox-configure + + ## pre-installed via conda + export NEOFOX_MAKEBLASTDB=makeblastdb + echo 
'NEOFOX_MAKEBLASTDB=makeblastdb' > {output.config} + export NEOFOX_RSCRIPT=Rscript + echo 'NEOFOX_RSCRIPT=Rscript' >> {output.config} + + ## pre-installed into conda environment via post-deploy script + export NEOFOX_NETMHCPAN=netMHCpan + echo 'NEOFOX_NETMHCPAN=netMHCpan' >> {output.config} + export NEOFOX_NETMHC2PAN=netMHCIIpan + echo 'NEOFOX_NETMHC2PAN=netMHCIIpan' >> {output.config} + + ## specification of hla_allele link via config.yaml + export NEOFOX_HLA_DATABASE={params.hla_alleles} + + neofox-configure --reference-folder {output.references} + echo 'NEOFOX_REFERENCE_FOLDER={output.references}' >> {output.config} + + # further environment variables needed for the config file + + ## pre-installed via conda + echo 'NEOFOX_BLASTP=blastp' >> {output.config} + + ## pre-installed into conda environment via post-deploy script + echo 'NEOFOX_MIXMHCPRED=MixMHCpred' >> {output.config} + echo 'NEOFOX_MIXMHC2PRED=MixMHC2pred_unix' >> {output.config} + echo 'NEOFOX_PRIME=PRIME' >> {output.config} + """ + + +rule neo_fox: + input: + config="resources/neo_fox/neo_fox_config.txt", + references=directory("resources/neo_fox/references/"), + candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.neo_fox.tsv", + patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", + output: + tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv" + json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json" + meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json" + threads: 8 + conda: + "../envs/neo_fox_deps.yaml" + params: + folder=lambda wc, output: path.dirname(output.annotated), + prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0], + organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported" + shell: + "(neofox " + " --num_cpus {threads} " 
+ " --config {input.config} " + " --candidate-file {input.candidates} " + " --patient-data {input.patient_annotation} " + " --with-table " + " --with-json " + " --organism {params.organism} " + " --output-folder {params.folder} " + " --output-prefix {params.prefix} ; " + " mv {params_folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; " + " mv {params_folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; " + " mv {params_folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; " + ") 2> {log} " diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0fd753f0..68725904 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -1,6 +1,7 @@ import glob import pandas as pd +from os import path from snakemake.remote import FTP from snakemake.utils import validate From 3d8bbe126cfc6306dcf104cc12ccebffdac02537 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 26 Aug 2022 18:48:11 +0000 Subject: [PATCH 157/191] fix typos --- workflow/rules/annotate_neoantigens.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index b610c948..5c4a4029 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -5,7 +5,7 @@ rule prepare_neo_fox_config_and_resources: conda: "../envs/neo_fox_deps.yaml" params: - hla_alleles=config["params"]["neofox"]["hla_alleles"], + hla_alleles=config["params"]["neo_fox"]["hla_alleles"], shell: """ # environment variables necessary for neofox-configure @@ -47,16 +47,16 @@ rule neo_fox: candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.neo_fox.tsv", patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", output: - tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv" - 
json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json" - meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json" + tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", + json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json", + meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json", threads: 8 conda: "../envs/neo_fox_deps.yaml" params: folder=lambda wc, output: path.dirname(output.annotated), prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0], - organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported" + organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported", shell: "(neofox " " --num_cpus {threads} " From 2f8abd4f663711dc2ce881602249e78d927361cc Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 30 Aug 2022 12:09:55 +0000 Subject: [PATCH 158/191] update microphaser to 0.7.0 for filter output adapted for NeoFox --- workflow/envs/microphaser.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/microphaser.yaml b/workflow/envs/microphaser.yaml index d781c04c..90698479 100644 --- a/workflow/envs/microphaser.yaml +++ b/workflow/envs/microphaser.yaml @@ -2,4 +2,4 @@ channels: - bioconda - conda-forge dependencies: - - microphaser =0.6 + - microphaser =0.7 From 0affc660dd9f9c07b2c83cfd94ca8965ee7fb4a0 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 30 Aug 2022 12:10:51 +0000 Subject: [PATCH 159/191] consistently use peptide_length wildcard and contigs/ folder throughout --- workflow/rules/mhc_binding.smk | 20 ++++++---- workflow/rules/microphaser.smk | 68 ++++++++++++++-------------------- 2 files changed, 41 insertions(+), 47 deletions(-) diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk 
index db077370..0a6079ca 100644 --- a/workflow/rules/mhc_binding.smk +++ b/workflow/rules/mhc_binding.smk @@ -1,11 +1,14 @@ rule net_mhc_pan: input: - peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_pan.{contig}.{peptide_type}.fa", + peptides=expand( + "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa", + peptide_length=config["params"]["net_mhc_pan"]["peptide_len"], + ), alleles=get_alleles_MHCI, output: - "results/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", + "results/net_mhc_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", log: - "logs/net_mhc_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + "logs/net_mhc_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", conda: "../envs/tcsh.yaml" params: @@ -28,12 +31,15 @@ rule net_mhc_pan: rule net_mhc_two_pan: input: - peptides="results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.net_mhc_two_pan.{contig}.{peptide_type}.fa", + peptides=expand( + "results/microphaser/fasta/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.{{contig}}.{{peptide_type}}.fa", + peptide_length=config["params"]["net_mhc_two_pan"]["peptide_len"], + ), alleles=get_alleles_MHCII, output: - "results/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", + "results/net_mhc_two_pan/contigs/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.tsv", log: - "logs/net_mhc_two_pan/{group}/{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", + "logs/net_mhc_two_pan/{group}.{tumor_alias}.merged_tumor_normal.{contig}.{peptide_type}.log", conda: "../envs/tcsh.yaml" params: @@ -57,7 +63,7 @@ rule net_mhc_two_pan: rule tidy_mhc_out: input: expand( - 
"results/{{mhc}}/{{group}}/{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv", + "results/{{mhc}}/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.{contig}.{{peptide_type}}.tsv", contig=contigs, ), output: diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 4e6a3407..a0bfc6c6 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -69,19 +69,15 @@ rule microphaser_tumor: track="resources/annotation/{contig}.gtf", ref="resources/genome.fasta", output: - mt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.neo.fa", - wt_fasta="results/microphaser/fasta/{group}/{tumor_alias}.merged_tumor_normal.{contig}.normal.fa", - tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv", + mt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa", + wt_fasta="results/microphaser/fasta/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa", + tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv", log: - "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.{contig}.log", + "logs/microphaser_tumor/{group}/{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log", conda: "../envs/microphaser.yaml" params: - window_length=lambda w: max( - config["params"]["net_mhc_pan"]["peptide_len"], - config["params"]["net_mhc_two_pan"]["peptide_len"], - ) - * 3, + window_length=lambda wc: int(wc.peptide_length) * 3 shell: "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " "< {input.track} > {output.mt_fasta} 2> {log}" @@ -96,21 +92,17 @@ rule microphaser_normal: ref="resources/genome.fasta", output: wt_fasta=( - 
"results/microphaser/fasta/{group}/{normal_alias}.{normal_set}.{contig}.fa" + "results/microphaser/fasta/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.fa" ), wt_tsv=( - "results/microphaser/info/{group}/{normal_alias}.{normal_set}.{contig}.tsv" + "results/microphaser/info/contigs/{group}.{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.tsv" ), log: - "logs/microphaser_normal/{group}/{normal_alias}.{normal_set}-{contig}.log", + "logs/microphaser_normal/contigs/{group}/{normal_alias}.{normal_set}.pep_len_{peptide_length}.{contig}.log", conda: "../envs/microphaser.yaml" params: - window_length=lambda w: max( - config["params"]["net_mhc_pan"]["peptide_len"], - config["params"]["net_mhc_two_pan"]["peptide_len"], - ) - * 3, + window_length=lambda wc: int(wc.peptide_length) * 3 shell: "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} " "< {input.track} > {output.wt_fasta} 2> {log}" @@ -119,69 +111,65 @@ rule microphaser_normal: rule concat_normal_proteome: input: expand( - "results/microphaser/fasta/{{group}}/normal.{{normal_set}}.{contig}.fa", + "results/microphaser/fasta/contigs/{{group}}.normal.{{normal_set}}.pep_len_{{peptide_length}}.{contig}.fa", contig=contigs, ), output: - "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa", + "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa", log: - "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.log", + "logs/microphaser_concat_normal_proteome/{group}.{normal_set}.pep_len_{peptide_length}.log", shell: "cat {input} > {output} 2> {log}" rule build_normal_proteome_db: input: - "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.fa", + "results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.fa", output: - bin="results/microphaser/bin/{group}.{normal_set}.{mhc}.normal_proteome.bin", - 
fasta="results/microphaser/fasta/{group}.{normal_set}.{mhc}.normal_proteome.peptides.fasta", + bin="results/microphaser/bin/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.bin", + fasta="results/microphaser/fasta/{group}.{normal_set}.normal_proteome.pep_len_{peptide_length}.peptides.fasta", log: - "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}-{mhc}.log", + "logs/microphaser_build_normal_proteome_db/{group}.{normal_set}.pep_len_{peptide_length}.log", conda: "../envs/microphaser.yaml" - params: - length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"], shell: - "( microphaser build_reference -r {input} -o {output.bin} -l {params.length} > {output.fasta} ) 2> {log}" + "( microphaser build_reference -r {input} -o {output.bin} -l {wildcards.peptide_length} > {output.fasta} ) 2> {log}" rule microphaser_filter: input: - tsv="results/microphaser/info/{group}/{tumor_alias}.merged_tumor_normal.{contig}.tsv", + tsv="results/microphaser/info/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv", proteome=expand( - "results/microphaser/bin/{{group}}.{normal_set}.{{mhc}}.normal_proteome.bin", + "results/microphaser/bin/{{group}}.{normal_set}.normal_proteome.pep_len_{{peptide_length}}.bin", normal_set=config["params"]["microphaser"]["events"]["normal"], ), output: mt_fasta=( - "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.neo.fa" + "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.neo.fa" ), wt_fasta=( - "results/microphaser/fasta/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.normal.fa" + "results/microphaser/fasta/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.normal.fa" ), - tsv="results/microphaser/info/filtered/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.tsv", - 
removed="results/microphaser/info/removed/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.removed.tsv", + tsv="results/microphaser/info/filtered/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.tsv", + removed="results/microphaser/info/removed/contigs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.removed.tsv", log: - "logs/microphaser_filter/{group}/{tumor_alias}.merged_tumor_normal.{mhc}.{contig}.log", + "logs/microphaser_filter/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.{contig}.log", conda: "../envs/microphaser.yaml" - params: - length=lambda wildcards: config["params"][wildcards.mhc]["peptide_len"], shell: - "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}" + "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {wildcards.peptide_length} > {output.mt_fasta} 2>{log}" rule concat_tsvs: input: expand( - "results/microphaser/info/filtered/{{group}}/{{tumor_alias}}.merged_tumor_normal.{{mhc}}.{contig}.tsv", + "results/microphaser/info/filtered/contigs/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{{peptide_length}}.{contig}.tsv", contig=contigs, ), output: - "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.tsv", + "results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", log: - "logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", + "logs/microphaser_concat_tsvs/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log", conda: "../envs/xsv.yaml" shell: From 22ce178a7e0de1091ded61851e325b4723d0fdd5 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 30 Aug 2022 12:11:19 +0000 Subject: [PATCH 160/191] initial infrastructure for parsing microphaes output into NeoFox 
input --- workflow/envs/polars.yaml | 5 +++++ workflow/rules/annotate_neoantigens.smk | 17 ++++++++++++++++- .../adjust_microphaser_output_for_neo_fox.py | 7 +++++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 workflow/envs/polars.yaml create mode 100644 workflow/scripts/adjust_microphaser_output_for_neo_fox.py diff --git a/workflow/envs/polars.yaml b/workflow/envs/polars.yaml new file mode 100644 index 00000000..c5a678f4 --- /dev/null +++ b/workflow/envs/polars.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - polars =0.14 \ No newline at end of file diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index 5c4a4029..5e071228 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -40,11 +40,26 @@ rule prepare_neo_fox_config_and_resources: """ +rule adjust_microphaser_output_for_neo_fox: + input: + candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + output: + candidates="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + threads: 1 + conda: + "../envs/polars.yaml" + script: + "../scripts/adjust_microphaser_output_for_neo_fox.py" + + rule neo_fox: input: config="resources/neo_fox/neo_fox_config.txt", references=directory("resources/neo_fox/references/"), - candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.neo_fox.tsv", + candidates=expand( + "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + peptide_length=config["params"]["neo_fox"]["peptide_len"], + ), patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", output: tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py 
b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py new file mode 100644 index 00000000..cdccddcb --- /dev/null +++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py @@ -0,0 +1,7 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import polars as pl + +candidates = pl.read_tsv(snakemake.input.candidates, sep="\t") \ No newline at end of file From a9077b7226af8ea4452fbbdea750426fb6893bee Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:36:14 +0000 Subject: [PATCH 161/191] fix NET_MHC_TWO_PAN_4_0_TARBALL environment variable spelling --- workflow/envs/neo_fox_deps.post-deploy.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh index 939b3c3a..595fa6a0 100755 --- a/workflow/envs/neo_fox_deps.post-deploy.sh +++ b/workflow/envs/neo_fox_deps.post-deploy.sh @@ -114,7 +114,7 @@ NET_MHC_TWO_PAN_4_0_LIB="${CONDA_LIB}/netMHCIIpan_4_0/" mkdir -p ${NET_MHC_TWO_PAN_4_0_LIB} NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/" mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC} -tar xzf ${NET_MHC_PAN_TWO_4_0_TARBALL} +tar xzf ${NET_MHC_TWO_PAN_4_0_TARBALL} cd netMHCIIpan-4.0 wget https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz tar xzf data.tar.gz From 5e2c180181eafb4ace73c597f1e07716338dd96d Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:37:33 +0000 Subject: [PATCH 162/191] further unpin NeoFox blast dep from `2.10` to `2`, to avoid unsolvable situations in some settings --- workflow/envs/neo_fox_deps.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml index c4cb4a9b..780af1d6 100644 --- a/workflow/envs/neo_fox_deps.yaml +++ b/workflow/envs/neo_fox_deps.yaml @@ -8,7 +8,7 @@ dependencies: # implicit unmentioned dependency of MixMHCpred and PRIME - perl # 
https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp - - blast =2.10 + - blast =2 # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64 - cxx-compiler # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts From 85b4ded9271f9bf690cff95f6b0a042fc0758522 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:39:16 +0000 Subject: [PATCH 163/191] use shebang in post-deploy script, to ensure using bash (see https://github.com/snakemake/snakemake/pull/1841 for details of respective snakemake change) --- workflow/envs/neo_fox_deps.post-deploy.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh index 595fa6a0..77cd586c 100755 --- a/workflow/envs/neo_fox_deps.post-deploy.sh +++ b/workflow/envs/neo_fox_deps.post-deploy.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash set -euo pipefail # set all the necessary conda paths and From 3483eb9ef14ccebae6a6212ddb9df427f51e9cdd Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:40:01 +0000 Subject: [PATCH 164/191] silence wget calls (`-q`) in neo_fox_deps.post-deploy.sh --- workflow/envs/neo_fox_deps.post-deploy.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh index 77cd586c..c17a3400 100755 --- a/workflow/envs/neo_fox_deps.post-deploy.sh +++ b/workflow/envs/neo_fox_deps.post-deploy.sh @@ -17,7 +17,7 @@ mkdir -p $CONDA_ETC # https://github.com/GfellerLab/MixMHCpred/blob/v2.1/README MIX_MHC_PRED_VERSION="2.1" MIX_MHC_PRED_LIB_PATH="$CONDA_PREFIX/lib/mix_mhc_pred/" -wget https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz +wget -q https://github.com/GfellerLab/MixMHCpred/archive/refs/tags/v${MIX_MHC_PRED_VERSION}.tar.gz tar xzf v${MIX_MHC_PRED_VERSION}.tar.gz cd 
MixMHCpred-${MIX_MHC_PRED_VERSION} g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x @@ -42,7 +42,7 @@ rm -r MixMHCpred-${MIX_MHC_PRED_VERSION} # https://github.com/GfellerLab/MixMHC2pred/blob/v1.2/README.md MIX_MHC_TWO_PRED_VERSION="1.2" MIX_MHC_TWO_PRED_LIB_PATH="${CONDA_LIB}/mix_mhc_two_pred/" -wget https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz +wget -q https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_TWO_PRED_VERSION}.tar.gz tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION} mv -t ${CONDA_BIN} MixMHC2pred MixMHC2pred_unix @@ -59,7 +59,7 @@ rm -r MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION} # https://github.com/GfellerLab/PRIME/blob/v1.0/README PRIME_VERSION="1.0" PRIME_LIB_PATH="${CONDA_LIB}/prime/" -wget https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz +wget -q https://github.com/GfellerLab/PRIME/archive/refs/tags/v${PRIME_VERSION}.tar.gz tar xzf v${PRIME_VERSION}.tar.gz cd PRIME-${PRIME_VERSION} PRIME_PLACEHOLDER="/app/PRIME/lib" @@ -90,7 +90,7 @@ NET_MHC_PAN_4_1_ETC="${CONDA_ETC}/netMHCpan_4_1/" mkdir -p ${NET_MHC_PAN_4_1_ETC} tar xzf ${NET_MHC_PAN_4_1_TARBALL} cd netMHCpan-4.1 -wget https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz +wget -q https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/data.tar.gz tar xzf data.tar.gz rm data.tar.gz grep "${TCSH_ROOT}" netMHCpan @@ -117,7 +117,7 @@ NET_MHC_TWO_PAN_4_0_ETC="${CONDA_ETC}/netMHCIIpan_4_0/" mkdir -p ${NET_MHC_TWO_PAN_4_0_ETC} tar xzf ${NET_MHC_TWO_PAN_4_0_TARBALL} cd netMHCIIpan-4.0 -wget https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz +wget -q https://services.healthtech.dtu.dk/services/NetMHCIIpan-4.0/data.tar.gz tar xzf data.tar.gz rm data.tar.gz grep "${TCSH_ROOT}" netMHCIIpan From 7f2300de6caa30b53cc3a5d4c9d5b6c39bb4748c Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:41:19 +0000 Subject: [PATCH 165/191] 
set hardcoded full paths in NeoFox config file, as NeoFox tests for file existence of the binary, not binary presence in path --- workflow/rules/annotate_neoantigens.smk | 29 +++++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index 5e071228..5206b93d 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -9,18 +9,23 @@ rule prepare_neo_fox_config_and_resources: shell: """ # environment variables necessary for neofox-configure + # NOTE: we have to provide all binaries with hard-coded + # paths, because NeoFox checks that the file at the path + # exists (and not simply that a binary exists): + # https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L90 + CONDA_BIN=$CONDA_PREFIX/bin ## pre-installed via conda - export NEOFOX_MAKEBLASTDB=makeblastdb - echo 'NEOFOX_MAKEBLASTDB=makeblastdb' > {output.config} - export NEOFOX_RSCRIPT=Rscript - echo 'NEOFOX_RSCRIPT=Rscript' >> {output.config} + export NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb + echo 'NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb' > {output.config} + export NEOFOX_RSCRIPT=$CONDA_BIN/Rscript + echo 'NEOFOX_RSCRIPT=$CONDA_BIN/Rscript' >> {output.config} ## pre-installed into conda environment via post-deploy script - export NEOFOX_NETMHCPAN=netMHCpan - echo 'NEOFOX_NETMHCPAN=netMHCpan' >> {output.config} - export NEOFOX_NETMHC2PAN=netMHCIIpan - echo 'NEOFOX_NETMHC2PAN=netMHCIIpan' >> {output.config} + export NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan + echo 'NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan' >> {output.config} + export NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan + echo 'NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan' >> {output.config} ## specification of hla_allele link via config.yaml export NEOFOX_HLA_DATABASE={params.hla_alleles} @@ -31,12 +36,12 @@ rule 
prepare_neo_fox_config_and_resources: # further environment variables needed for the config file ## pre-installed via conda - echo 'NEOFOX_BLASTP=blastp' >> {output.config} + echo 'NEOFOX_BLASTP=$CONDA_BIN/blastp' >> {output.config} ## pre-installed into conda environment via post-deploy script - echo 'NEOFOX_MIXMHCPRED=MixMHCpred' >> {output.config} - echo 'NEOFOX_MIXMHC2PRED=MixMHC2pred_unix' >> {output.config} - echo 'NEOFOX_PRIME=PRIME' >> {output.config} + echo 'NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred' >> {output.config} + echo 'NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix' >> {output.config} + echo 'NEOFOX_PRIME=$CONDA_BIN/PRIME' >> {output.config} """ From 304610f5333ad03128c6635fc6b7e5de278eacf9 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:42:12 +0000 Subject: [PATCH 166/191] adapt naming of stuff in `rule adjust_microphaser_output_for_neo_fox` --- workflow/rules/annotate_neoantigens.smk | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index 5206b93d..17a07d65 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -47,9 +47,11 @@ rule prepare_neo_fox_config_and_resources: rule adjust_microphaser_output_for_neo_fox: input: - candidates="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + microphaser="results/microphaser/info/filtered/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", output: - candidates="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + neo_fox="results/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.tsv", + log: + "logs/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log", threads: 1 conda: "../envs/polars.yaml" From 6aeb681a2260518c82ed640a87c0e9b5efc8278a Mon 
Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:43:24 +0000 Subject: [PATCH 167/191] rely only on neo_fox_config.txt to ensure `rule prepare_neo_fox_config_and_resources` is run, add logs to neo_fox rules --- workflow/rules/annotate_neoantigens.smk | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index 17a07d65..bb3e2c02 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -1,7 +1,14 @@ rule prepare_neo_fox_config_and_resources: output: config="resources/neo_fox/neo_fox_config.txt", + # we cannot put the exact files generated into the + # output, as snakemake will generate the respective + # subdirectories and NeoFox has default exist_ok=False + # set for os.makedirs: + # https://github.com/TRON-Bioinformatics/neofox/blob/fb6cdf9f10e77c409d0fa44657ef520eedca6994/neofox/references/installer.py#L221 references=directory("resources/neo_fox/references/"), + log: + "logs/neo_fox/neo_fox_config.log", conda: "../envs/neo_fox_deps.yaml" params: @@ -62,7 +69,6 @@ rule adjust_microphaser_output_for_neo_fox: rule neo_fox: input: config="resources/neo_fox/neo_fox_config.txt", - references=directory("resources/neo_fox/references/"), candidates=expand( "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv", peptide_length=config["params"]["neo_fox"]["peptide_len"], @@ -72,6 +78,8 @@ rule neo_fox: tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json", meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json", + log: + "logs/neo_fox/annotated/{group}.{tumor_alias}.log", threads: 8 conda: "../envs/neo_fox_deps.yaml" From dfd5760a87bc660a02ffcdc76dacdb913179d12b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 
15:56:45 +0000 Subject: [PATCH 168/191] initial version of adjust_microphaser_output_for_neo_fox.py --- .../adjust_microphaser_output_for_neo_fox.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py index cdccddcb..879994da 100644 --- a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py +++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py @@ -4,4 +4,17 @@ import polars as pl -candidates = pl.read_tsv(snakemake.input.candidates, sep="\t") \ No newline at end of file +columns_mapping = { + "gene_name": "gene", + "normal_peptide": "mutation.wildTypeXmer", + "tumor_peptide": "mutation.mutatedXmer", + "freq": "dnaVariantAlleleFrequency", +} + +candidates = ( + pl.read_tsv(snakemake.input.microphaser, sep="\t", quote="") + .rename(columns_mapping) + .with_column(pl.lit(snakemake.wildcards.group).alias("patientIdentifier")) +) + +candidates.write_csv(snakemake.output.neo_fox, sep="\t", quote="") From c78f5dd3d45e60921b0059955e3b011f821c6120 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 1 Sep 2022 15:59:01 +0000 Subject: [PATCH 169/191] Revert "further unpin NeoFox blast dep from `2.10` to `2`, to avoid unsolvable situations in some settings" This reverts commit 5e2c180181eafb4ace73c597f1e07716338dd96d. 
--- workflow/envs/neo_fox_deps.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml index 780af1d6..c4cb4a9b 100644 --- a/workflow/envs/neo_fox_deps.yaml +++ b/workflow/envs/neo_fox_deps.yaml @@ -8,7 +8,7 @@ dependencies: # implicit unmentioned dependency of MixMHCpred and PRIME - perl # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp - - blast =2 + - blast =2.10 # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64 - cxx-compiler # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts From e07d5dc9122bed38768724b7aefc67e00318b8b5 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 2 Sep 2022 08:11:44 +0000 Subject: [PATCH 170/191] update wrapper and tool versions --- workflow/envs/bcftools.yaml | 2 +- workflow/envs/rbt.yaml | 4 ++-- workflow/envs/varlociraptor.yaml | 2 +- workflow/rules/microphaser.smk | 4 ++-- workflow/rules/phylogeny.smk | 2 +- workflow/rules/ref.smk | 12 ++++++------ 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml index 70e39da1..35bf7643 100644 --- a/workflow/envs/bcftools.yaml +++ b/workflow/envs/bcftools.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - bcftools =1.10 + - bcftools =1.14 diff --git a/workflow/envs/rbt.yaml b/workflow/envs/rbt.yaml index 6c8be1fb..94ff5bae 100644 --- a/workflow/envs/rbt.yaml +++ b/workflow/envs/rbt.yaml @@ -1,6 +1,6 @@ channels: - - bioconda - conda-forge + - bioconda dependencies: - rust-bio-tools =0.19 - - bcftools =1.10 + - bcftools =1.14 diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml index a76d765a..270dc831 100644 --- a/workflow/envs/varlociraptor.yaml +++ b/workflow/envs/varlociraptor.yaml @@ -3,4 +3,4 @@ channels: - bioconda dependencies: - varlociraptor =2.3.0 - - bcftools =1.10 + - bcftools 
=1.14 diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index a0bfc6c6..b0570b0b 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -9,7 +9,7 @@ rule norm_bcf: params: lambda w, input: "-f {} -O b -m-".format(input.genome), # optional parameters for bcftools norm (except -o) wrapper: - "0.65.0/bio/bcftools/norm" + "v1.12.0/bio/bcftools/norm" rule add_somatic_flag: @@ -58,7 +58,7 @@ rule merge_tumor_normal: params: extra="-O b -a", wrapper: - "0.64.0/bio/bcftools/concat" + "v1.12.0/bio/bcftools/concat" rule microphaser_tumor: diff --git a/workflow/rules/phylogeny.smk b/workflow/rules/phylogeny.smk index 63d2ad4a..d7279042 100644 --- a/workflow/rules/phylogeny.smk +++ b/workflow/rules/phylogeny.smk @@ -15,7 +15,7 @@ rule merge_snvs: params: "--use-header final-calls/sampleheader.txt --force-samples", wrapper: - "0.36.0/bio/bcftools/merge" + "v1.12.0/bio/bcftools/merge" rule query: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index f6be3bb0..80e62380 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -10,7 +10,7 @@ rule get_genome: release=config["ref"]["release"], cache: True wrapper: - "0.45.1/bio/reference/ensembl-sequence" + "v1.12.0/bio/reference/ensembl-sequence" rule get_cdna: @@ -25,7 +25,7 @@ rule get_cdna: release=config["ref"]["release"], cache: True wrapper: - "0.45.1/bio/reference/ensembl-sequence" + "v1.12.0/bio/reference/ensembl-sequence" rule get_annotation: @@ -41,7 +41,7 @@ rule get_annotation: log: "logs/get-annotation.log", wrapper: - "0.45.1/bio/reference/ensembl-annotation" + "v1.12.0/bio/reference/ensembl-annotation" # TODO: remove this rule, once microphaser is fixed to make gene_name optional @@ -76,7 +76,7 @@ rule genome_faidx: "logs/genome-faidx.log", cache: True wrapper: - "0.45.1/bio/samtools/faidx" + "v1.12.0/bio/samtools/faidx" rule create_somatic_flag_header_line: @@ -114,7 +114,7 @@ rule bgzip_genome_somatic_flag_bed: log: 
"logs/bgzip/genome.somatic_flag.log", wrapper: - "v1.7.0/bio/bgzip" + "v1.12.0/bio/bgzip" rule tabix_genome_somatic_flag_bed: @@ -139,7 +139,7 @@ rule genome_dict: "logs/picard/create-dict.log", cache: True wrapper: - "0.45.1/bio/picard/createsequencedictionary" + "v1.12.0/bio/picard/createsequencedictionary" rule download_hla_la_graph: From 6beada8b8301cd4cb4445d8d11e73ca74461975b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 2 Sep 2022 19:14:27 +0000 Subject: [PATCH 171/191] get polars to work on older cpu architectures: https://github.com/pola-rs/polars/blob/622c92470f81cff2626b2c130c35d38bf4c66be9/README.md?plain=1#L213-L214 --- workflow/envs/polars.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/envs/polars.yaml b/workflow/envs/polars.yaml index c5a678f4..c26d1af2 100644 --- a/workflow/envs/polars.yaml +++ b/workflow/envs/polars.yaml @@ -2,4 +2,6 @@ channels: - conda-forge - bioconda dependencies: - - polars =0.14 \ No newline at end of file + - pip + - pip: + - polars-lts-cpu==0.14.8 \ No newline at end of file From 2453302ee44409e19166f3fd206a7f3dbac7b5da Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 08:50:35 +0000 Subject: [PATCH 172/191] add in groups.tsv for group_annotation --- .test/config/groups.tsv | 2 ++ config/config.yaml | 1 + config/groups.tsv | 3 +++ workflow/rules/common.smk | 13 +++++++++++++ 4 files changed, 19 insertions(+) create mode 100644 .test/config/groups.tsv create mode 100644 config/groups.tsv diff --git a/.test/config/groups.tsv b/.test/config/groups.tsv new file mode 100644 index 00000000..c0a77bbd --- /dev/null +++ b/.test/config/groups.tsv @@ -0,0 +1,2 @@ +group tumorType +A LUSC diff --git a/config/config.yaml b/config/config.yaml index 083fc651..4d447645 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,5 +1,6 @@ samples: "config/samples.tsv" units: "config/units.tsv" +groups: "config/groups.tsv" neoantigen_prediction: diff --git a/config/groups.tsv 
b/config/groups.tsv new file mode 100644 index 00000000..11353453 --- /dev/null +++ b/config/groups.tsv @@ -0,0 +1,3 @@ +group tumorType +A LUSC +B LUAD diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 68725904..d4d8dafb 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -48,6 +48,19 @@ units = ( ) validate(units, schema="../schemas/units.schema.yaml") +groups = samples["group"].unique() + +if "groups" in config: + group_annotation = ( + pd.read_csv(config["groups"], sep="\t", dtype={"group": str}) + .set_index("group") + .sort_index() + ) + group_annotation = group_annotation.loc[groups] +else: + group_annotation = pd.DataFrame({"group": groups}).set_index("group") + + contigs = [c for c in range(1, 23)] contigs.extend(["X", "Y"]) From cd1688c4b400e10a1b755fbc8bd6472a652215fb Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 08:51:27 +0000 Subject: [PATCH 173/191] replace polars with pandas, because polars doesn't play well with old CPUs (and thus hampers workflow portability) --- workflow/envs/pandas.yaml | 4 ++++ workflow/envs/polars.yaml | 7 ------- workflow/rules/annotate_neoantigens.smk | 2 +- .../scripts/adjust_microphaser_output_for_neo_fox.py | 10 +++++----- 4 files changed, 10 insertions(+), 13 deletions(-) create mode 100644 workflow/envs/pandas.yaml delete mode 100644 workflow/envs/polars.yaml diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml new file mode 100644 index 00000000..5ced20fa --- /dev/null +++ b/workflow/envs/pandas.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - pandas=1.4 \ No newline at end of file diff --git a/workflow/envs/polars.yaml b/workflow/envs/polars.yaml deleted file mode 100644 index c26d1af2..00000000 --- a/workflow/envs/polars.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - pip - - pip: - - polars-lts-cpu==0.14.8 \ No newline at end of file diff --git 
a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index bb3e2c02..d1c09c39 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -61,7 +61,7 @@ rule adjust_microphaser_output_for_neo_fox: "logs/neo_fox/candidates/{group}.{tumor_alias}.merged_tumor_normal.pep_len_{peptide_length}.log", threads: 1 conda: - "../envs/polars.yaml" + "../envs/pandas.yaml" script: "../scripts/adjust_microphaser_output_for_neo_fox.py" diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py index 879994da..0d5581bb 100644 --- a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py +++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py @@ -2,7 +2,7 @@ sys.stderr = open(snakemake.log[0], "w") -import polars as pl +import pandas as pd columns_mapping = { "gene_name": "gene", @@ -12,9 +12,9 @@ } candidates = ( - pl.read_tsv(snakemake.input.microphaser, sep="\t", quote="") - .rename(columns_mapping) - .with_column(pl.lit(snakemake.wildcards.group).alias("patientIdentifier")) + pd.read_csv(snakemake.input.microphaser, sep="\t", quoting=3) + .rename(columns=columns_mapping) + .assign(patientIdentifier=snakemake.wildcards.group) ) -candidates.write_csv(snakemake.output.neo_fox, sep="\t", quote="") +candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3) From f6e21a09dc95cdda89094269ba92dd6495dfb7cc Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 08:52:40 +0000 Subject: [PATCH 174/191] create NeoFox group / patient sheet --- workflow/rules/annotate_neoantigens.smk | 19 +++++- .../scripts/create_neo_fox_group_sheet.py | 60 +++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 workflow/scripts/create_neo_fox_group_sheet.py diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index d1c09c39..d844b4b2 100644 --- 
a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -66,6 +66,21 @@ rule adjust_microphaser_output_for_neo_fox: "../scripts/adjust_microphaser_output_for_neo_fox.py" +rule create_neo_fox_group_sheet: + input: + hla_la_bestguess="results/hla_la/output/{group}_{tumor_alias}/hla/R1_bestguess_G.txt", + output: + group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv" + log: + "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log" + conda: + "../envs/pandas.yaml" + params: + group=lambda wc: group_annotation.loc[wc.group] + script: + "../scripts/create_neo_fox_group_sheet.py" + + rule neo_fox: input: config="resources/neo_fox/neo_fox_config.txt", @@ -73,7 +88,7 @@ rule neo_fox: "results/neo_fox/candidates/{{group}}.{{tumor_alias}}.merged_tumor_normal.pep_len_{peptide_length}.tsv", peptide_length=config["params"]["neo_fox"]["peptide_len"], ), - patient_annotation="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", + group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", output: tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json", @@ -92,7 +107,7 @@ rule neo_fox: " --num_cpus {threads} " " --config {input.config} " " --candidate-file {input.candidates} " - " --patient-data {input.patient_annotation} " + " --patient-data {input.group_sheet} " " --with-table " " --with-json " " --organism {params.organism} " diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py new file mode 100644 index 00000000..252b6fc4 --- /dev/null +++ b/workflow/scripts/create_neo_fox_group_sheet.py @@ -0,0 +1,60 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import pandas as pd + +HLA_SUFFIXES_REGEX = r"[NLSCAQ]?" 
+ +# allowed loci according to NeoFox input data documentation: +# https://neofox.readthedocs.io/en/latest/03_01_input_data.html#file-with-patient-data +# * mhcIAlleles: comma separated MHC I alleles of the patient for HLA-A, HLA-B and +# HLA-C. If homozygous, the allele should be added twice. +# * mhcIIAlleles: comma separated MHC II alleles of the patient for HLA-DRB1, HLA-DQA1, +# HLA-DQB1, HLA-DPA1 and HLA-DPB1. If homozygous, the allele should be added twice. +ALLOWED_LOCI = { + "A", + "B", + "C", + "DRB1", + "DQA1", + "DQB1", + "DPA1", + "DPB1", +} + +mhc_alleles = pd.read_csv( + snakemake.input.hla_la_bestguess, + sep="\t", + ) +# the Allele column can contain multiple ";"-separated entries for the +# same locus +mhc_alleles.loc[:, "Allele"] = mhc_alleles["Allele"].str.split(pat=";") +mhc_alleles = mhc_alleles.explode(["Allele"]) +mhc_alleles = mhc_alleles[mhc_alleles["Locus"].isin(ALLOWED_LOCI)] +mhc_alleles.loc[:, "Allele"] = ( + mhc_alleles["Allele"] + .str + .replace( + r"([A-Z]+\d?)\*(\d+):(\d+)(:\d+)*G?(" + HLA_SUFFIXES_REGEX + r")", + r"HLA-\1*\2:\3\5", + regex=True, + ) +) +# the multiple ";"-separated entries from above can be identical after reducing +# to the allele group (1st number) and specific HLA protein (2nd number) +mhc_alleles = mhc_alleles.drop_duplicates(subset=["Chromosome", "Allele"]) + +mhc_one_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() == 1, "Allele"] ) +mhc_two_alleles = ",".join( mhc_alleles.loc[ mhc_alleles["Locus"].str.len() > 1, "Allele"] ) + +patient_info = pd.DataFrame( + data={ + "identifier": [ snakemake.params.group.name ], + "tumorType": [ snakemake.params.group["tumorType"] ], + "mhcIAlleles": [ mhc_one_alleles ], + "mhcIIAlleles": [ mhc_two_alleles ], + } +) + +patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3) From 1895d0f74af9853b4a9fa55fcc81fde3d5ce6765 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 08:54:38 +0000 Subject: [PATCH 175/191] snakefmt --- 
workflow/Snakefile | 3 +++ workflow/rules/annotate_neoantigens.smk | 16 ++++++++++------ workflow/rules/microphaser.smk | 4 ++-- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index ee333ba2..6f3c2caf 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -11,8 +11,10 @@ configfile: "config/config.yaml" scattergather: calling=24, + ##### required envvars ##### + envvars: # For NeoFox installation: # The tarballs for both netMHCpan and netMHCIIpan @@ -26,6 +28,7 @@ envvars: # https://services.healthtech.dtu.dk/service.php?NetMHCIIpan-4.0 "NET_MHC_TWO_PAN_4_0_TARBALL", + ##### setup report ##### diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index d844b4b2..406ca0d7 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -44,7 +44,7 @@ rule prepare_neo_fox_config_and_resources: ## pre-installed via conda echo 'NEOFOX_BLASTP=$CONDA_BIN/blastp' >> {output.config} - + ## pre-installed into conda environment via post-deploy script echo 'NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred' >> {output.config} echo 'NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix' >> {output.config} @@ -70,16 +70,16 @@ rule create_neo_fox_group_sheet: input: hla_la_bestguess="results/hla_la/output/{group}_{tumor_alias}/hla/R1_bestguess_G.txt", output: - group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv" + group_sheet="results/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.tsv", log: - "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log" + "logs/neo_fox/patient_data/{group}.{tumor_alias}.hla_alleles.tumor_type.log", conda: "../envs/pandas.yaml" params: - group=lambda wc: group_annotation.loc[wc.group] + group=lambda wc: group_annotation.loc[wc.group], script: "../scripts/create_neo_fox_group_sheet.py" - + rule neo_fox: input: @@ -101,7 +101,11 @@ rule neo_fox: params: 
folder=lambda wc, output: path.dirname(output.annotated), prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0], - organism="human" if config["ref"]["species"]=="homo_sapiens" else "mouse" if config["ref"]["species"]=="mus_musculus" else "unsupported", + organism="human" + if config["ref"]["species"] == "homo_sapiens" + else "mouse" + if config["ref"]["species"] == "mus_musculus" + else "unsupported", shell: "(neofox " " --num_cpus {threads} " diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index b0570b0b..83061530 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -77,7 +77,7 @@ rule microphaser_tumor: conda: "../envs/microphaser.yaml" params: - window_length=lambda wc: int(wc.peptide_length) * 3 + window_length=lambda wc: int(wc.peptide_length) * 3, shell: "microphaser somatic {input.bam} --variants {input.bcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} " "< {input.track} > {output.mt_fasta} 2> {log}" @@ -102,7 +102,7 @@ rule microphaser_normal: conda: "../envs/microphaser.yaml" params: - window_length=lambda wc: int(wc.peptide_length) * 3 + window_length=lambda wc: int(wc.peptide_length) * 3, shell: "microphaser normal {input.bam} --variants {input.bcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} " "< {input.track} > {output.wt_fasta} 2> {log}" From 657e982a6a6685bbc42e0ec4c6eba66112c233cc Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 09:14:16 +0000 Subject: [PATCH 176/191] request NeoFox output instead of original netMHC outputs --- workflow/rules/common.smk | 43 +++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d4d8dafb..a604bc55 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -112,30 +112,37 @@ def get_final_output(): "alias", ] if 
config["neoantigen_prediction"]["activate"]: - sequencing_types = pd.unique( - units.loc[units["sample_name"].isin(smps), "sequencing_type"] - ) final_output.extend( expand( - "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv", + "results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", group=group, tumor_alias=tumor_aliases, - mhc=list( - filter( - None, - [ - "net_mhc_pan" - if is_activated("params/net_mhc_pan") - else None, - "net_mhc_two_pan" - if is_activated("params/net_mhc_two_pan") - else None, - ], - ) - ), - seqtype=sequencing_types, ) ) + #sequencing_types = pd.unique( + # units.loc[units["sample_name"].isin(smps), "sequencing_type"] + #) + #final_output.extend( + # expand( + # "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv", + # group=group, + # tumor_alias=tumor_aliases, + # mhc=list( + # filter( + # None, + # [ + # "net_mhc_pan" + # if is_activated("params/net_mhc_pan") + # else None, + # "net_mhc_two_pan" + # if is_activated("params/net_mhc_two_pan") + # else None, + # ], + # ) + # ), + # seqtype=sequencing_types, + # ) + #) else: final_output = expand( [ From 539da379e3e5a4c5e32e1bb6c1a1fc7c725456a9 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 09:14:27 +0000 Subject: [PATCH 177/191] small fixes --- workflow/rules/annotate_neoantigens.smk | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index 406ca0d7..1d805267 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -99,8 +99,8 @@ rule neo_fox: conda: "../envs/neo_fox_deps.yaml" params: - folder=lambda wc, output: path.dirname(output.annotated), - prefix=lambda wc, output: path.plitext(path.basename(output.annotated))[0], + folder=lambda wc, output: path.dirname(output.tsv), + prefix=lambda wc, output: 
path.splitext(path.basename(output.tsv))[0], organism="human" if config["ref"]["species"] == "homo_sapiens" else "mouse" @@ -117,7 +117,7 @@ rule neo_fox: " --organism {params.organism} " " --output-folder {params.folder} " " --output-prefix {params.prefix} ; " - " mv {params_folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; " - " mv {params_folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; " - " mv {params_folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; " + " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; " + " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; " + " mv {params.folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; " ") 2> {log} " From 21a62fca00212e70ae8c3d2462e6391f164aec92 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 09:35:19 +0000 Subject: [PATCH 178/191] snakefmt --- workflow/rules/common.smk | 48 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index a604bc55..37a72984 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -119,30 +119,30 @@ def get_final_output(): tumor_alias=tumor_aliases, ) ) - #sequencing_types = pd.unique( - # units.loc[units["sample_name"].isin(smps), "sequencing_type"] - #) - #final_output.extend( - # expand( - # "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv", - # group=group, - # tumor_alias=tumor_aliases, - # mhc=list( - # filter( - # None, - # [ - # "net_mhc_pan" - # if is_activated("params/net_mhc_pan") - # else None, - # "net_mhc_two_pan" - # if is_activated("params/net_mhc_two_pan") - # else None, - # ], - # ) - # ), - # seqtype=sequencing_types, - # ) - #) + # sequencing_types = pd.unique( + # units.loc[units["sample_name"].isin(smps), "sequencing_type"] + # ) + # 
final_output.extend( + # expand( + # "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.{seqtype}.tsv", + # group=group, + # tumor_alias=tumor_aliases, + # mhc=list( + # filter( + # None, + # [ + # "net_mhc_pan" + # if is_activated("params/net_mhc_pan") + # else None, + # "net_mhc_two_pan" + # if is_activated("params/net_mhc_two_pan") + # else None, + # ], + # ) + # ), + # seqtype=sequencing_types, + # ) + # ) else: final_output = expand( [ From 63f618d1bf82931f97982ad0a4a7516e49c202e6 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Tue, 6 Sep 2022 12:25:16 +0000 Subject: [PATCH 179/191] unpin blast from `2.10` to `2`, so it installs on our server --- workflow/envs/neo_fox_deps.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/neo_fox_deps.yaml b/workflow/envs/neo_fox_deps.yaml index c4cb4a9b..780af1d6 100644 --- a/workflow/envs/neo_fox_deps.yaml +++ b/workflow/envs/neo_fox_deps.yaml @@ -8,7 +8,7 @@ dependencies: # implicit unmentioned dependency of MixMHCpred and PRIME - perl # https://neofox.readthedocs.io/en/latest/02_installation.html#install-blastp - - blast =2.10 + - blast =2 # https://github.com/GfellerLab/MixMHCpred/blob/75374a7a0de214278c1cda00bb9dee4b2f475ec3/README#L64 - cxx-compiler # needed for netMHCpan and netMHCIIpan, as their executables are tcsh-scripts From 50591114fe92c0ee9abf7cb088c48a26a86fa125 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 7 Sep 2022 09:13:50 +0000 Subject: [PATCH 180/191] fix quoting in neo_fox prep rule, so that bash variables get expanded upon writing into config file --- workflow/rules/annotate_neoantigens.smk | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index 1d805267..64557ffc 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -24,31 +24,31 @@ rule prepare_neo_fox_config_and_resources: 
## pre-installed via conda export NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb - echo 'NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb' > {output.config} + echo "NEOFOX_MAKEBLASTDB=$CONDA_BIN/makeblastdb" > {output.config} export NEOFOX_RSCRIPT=$CONDA_BIN/Rscript - echo 'NEOFOX_RSCRIPT=$CONDA_BIN/Rscript' >> {output.config} + echo "NEOFOX_RSCRIPT=$CONDA_BIN/Rscript" >> {output.config} ## pre-installed into conda environment via post-deploy script export NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan - echo 'NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan' >> {output.config} + echo "NEOFOX_NETMHCPAN=$CONDA_BIN/netMHCpan" >> {output.config} export NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan - echo 'NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan' >> {output.config} + echo "NEOFOX_NETMHC2PAN=$CONDA_BIN/netMHCIIpan" >> {output.config} ## specification of hla_allele link via config.yaml export NEOFOX_HLA_DATABASE={params.hla_alleles} neofox-configure --reference-folder {output.references} - echo 'NEOFOX_REFERENCE_FOLDER={output.references}' >> {output.config} + echo "NEOFOX_REFERENCE_FOLDER={output.references}" >> {output.config} # further environment variables needed for the config file ## pre-installed via conda - echo 'NEOFOX_BLASTP=$CONDA_BIN/blastp' >> {output.config} + echo "NEOFOX_BLASTP=$CONDA_BIN/blastp" >> {output.config} ## pre-installed into conda environment via post-deploy script - echo 'NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred' >> {output.config} - echo 'NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix' >> {output.config} - echo 'NEOFOX_PRIME=$CONDA_BIN/PRIME' >> {output.config} + echo "NEOFOX_MIXMHCPRED=$CONDA_BIN/MixMHCpred" >> {output.config} + echo "NEOFOX_MIXMHC2PRED=$CONDA_BIN/MixMHC2pred_unix" >> {output.config} + echo "NEOFOX_PRIME=$CONDA_BIN/PRIME" >> {output.config} """ @@ -108,7 +108,7 @@ rule neo_fox: else "unsupported", shell: "(neofox " - " --num_cpus {threads} " + " --num-cpus {threads} " " --config {input.config} " " --candidate-file {input.candidates} " " --patient-data 
{input.group_sheet} " From 70ce592f8c9343c949621f29ec91f05a493db8ed Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 7 Sep 2022 12:06:32 +0000 Subject: [PATCH 181/191] move allele lists of MixMHC tools and PRIME where NeoFox expects them (hard-coded in the tool) --- workflow/envs/neo_fox_deps.post-deploy.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/workflow/envs/neo_fox_deps.post-deploy.sh b/workflow/envs/neo_fox_deps.post-deploy.sh index c17a3400..c344af3e 100755 --- a/workflow/envs/neo_fox_deps.post-deploy.sh +++ b/workflow/envs/neo_fox_deps.post-deploy.sh @@ -4,6 +4,9 @@ set -euo pipefail # set all the necessary conda paths and # ensure they exist CONDA_BIN="${CONDA_PREFIX}/bin/" +# this is needed for the allele list files of MixMHCpred, MixMHC2pred +# and PRIME, where NeoFox expects some of the files in that directory +mkdir -p ${CONDA_BIN}/lib/ CONDA_MAN1="${CONDA_PREFIX}/share/man/man1/" mkdir -p $CONDA_MAN1 CONDA_INFO="${CONDA_PREFIX}/share/info/" @@ -26,8 +29,11 @@ g++ -O3 lib/MixMHCpred.cc -o lib/MixMHCpred.x MMP_PLACEHOLDER="YOUR PATH TO MixMHCpred/lib FOLDER" grep "${MMP_PLACEHOLDER}" MixMHCpred sed -i "s%${MMP_PLACEHOLDER}%${MIX_MHC_PRED_LIB_PATH}%" MixMHCpred -mv lib $MIX_MHC_PRED_LIB_PATH mv MixMHCpred ${CONDA_BIN} +# The allele_list.txt file needs to be in a `lib/` subdirectory of the `bin/` dir, this is hard-coded here: +# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L123 +cp lib/allele_list.txt ${CONDA_BIN}/lib/ +mv lib $MIX_MHC_PRED_LIB_PATH # TODO: when updating to v2.2, change this line to: # mv MixMHCpred_license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf mv license.pdf ${CONDA_INFO}/MixMHCpred_license.pdf @@ -46,6 +52,9 @@ wget -q https://github.com/GfellerLab/MixMHC2pred/archive/refs/tags/v${MIX_MHC_T tar xzf v${MIX_MHC_TWO_PRED_VERSION}.tar.gz cd MixMHC2pred-${MIX_MHC_TWO_PRED_VERSION} mv -t ${CONDA_BIN} MixMHC2pred
MixMHC2pred_unix +# The Alleles_list.txt file needs to be in the same directory as the binaries, this is hard-coded here: +# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L114 +mv -t ${CONDA_BIN} Alleles_list.txt mv rpep ${CONDA_ETC} ln -s ${CONDA_ETC}/rpep ${CONDA_BIN}/rpep mv LICENSE ${CONDA_INFO}/MixMHC2pred_unix_LICENSE @@ -65,8 +74,11 @@ cd PRIME-${PRIME_VERSION} PRIME_PLACEHOLDER="/app/PRIME/lib" grep "${PRIME_PLACEHOLDER}" PRIME sed -i "s%${PRIME_PLACEHOLDER}%${PRIME_LIB_PATH}%" PRIME -mv lib $PRIME_LIB_PATH mv PRIME ${CONDA_BIN} +# The alleles.txt file needs to be in a `lib/` subdirectory of the `bin/` dir, this is hard-coded here: +# https://github.com/TRON-Bioinformatics/neofox/blob/629443b637fc41b1ab81f4f770e7a8a1c976d3f2/neofox/references/references.py#L132 +cp lib/alleles.txt ${CONDA_BIN}/lib/ +mv lib $PRIME_LIB_PATH mv PRIME_license.pdf ${CONDA_INFO} PRIME -i test/test.txt -o test/out.txt -a A0201,A0101 diff <(sed '4d' test/out.txt) <(sed '4d' test/out_compare.txt) From a70f2aea95109cdd85302b3aa389bb896f47e725 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Wed, 7 Sep 2022 12:07:02 +0000 Subject: [PATCH 182/191] _neoantigen_features.json does not seem to exist --- workflow/rules/annotate_neoantigens.smk | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index 64557ffc..d4ce35fa 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -92,7 +92,6 @@ rule neo_fox: output: tsv="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", json="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.json", - meta_json="results/neo_fox/annotated/{group}.{tumor_alias}.meta_annotations.json", log: "logs/neo_fox/annotated/{group}.{tumor_alias}.log", threads: 8 @@ -119,5 +118,4 @@ rule neo_fox: " --output-prefix
{params.prefix} ; " " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; " " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; " - " mv {params.folder}/{params.prefix}_neoantigen_features.json {output.meta_json}; " ") 2> {log} " From 49edf28e2949fa651f42110138cc8e23cbb3978a Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 8 Sep 2022 08:57:50 +0000 Subject: [PATCH 183/191] remove implicit NeoFox logs, as they are incomplete and accumulate across runs --- workflow/rules/annotate_neoantigens.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/rules/annotate_neoantigens.smk b/workflow/rules/annotate_neoantigens.smk index d4ce35fa..70e1a492 100644 --- a/workflow/rules/annotate_neoantigens.smk +++ b/workflow/rules/annotate_neoantigens.smk @@ -118,4 +118,7 @@ rule neo_fox: " --output-prefix {params.prefix} ; " " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.tsv {output.tsv}; " " mv {params.folder}/{params.prefix}_neoantigen_candidates_annotated.json {output.json}; " + # this implicitly created log does not seem to contain all of stderr, + # so we rather do our own capturing below + " rm {params.folder}/{params.prefix}.log; " ") 2> {log} " From de676ae5d6ac8bfe2027fa327882b73de56008b5 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 8 Sep 2022 09:45:16 +0000 Subject: [PATCH 184/191] remove indexes from pandas.to_csv output --- workflow/scripts/adjust_microphaser_output_for_neo_fox.py | 2 +- workflow/scripts/create_neo_fox_group_sheet.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py index 0d5581bb..4806d56e 100644 --- a/workflow/scripts/adjust_microphaser_output_for_neo_fox.py +++ b/workflow/scripts/adjust_microphaser_output_for_neo_fox.py @@ -17,4 +17,4 @@ .assign(patientIdentifier=snakemake.wildcards.group) ) 
-candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3) +candidates.to_csv(snakemake.output.neo_fox, sep="\t", quoting=3, index=False) diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py index 252b6fc4..b989ebf6 100644 --- a/workflow/scripts/create_neo_fox_group_sheet.py +++ b/workflow/scripts/create_neo_fox_group_sheet.py @@ -57,4 +57,4 @@ } ) -patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3) +patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3, index=False) From 594fe3a0382a5249be0eb1f6b08e74738b491a59 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 8 Sep 2022 09:46:47 +0000 Subject: [PATCH 185/191] remove extra possible MHC alleles, because NeoFox will only parse one per chromosome --- workflow/scripts/create_neo_fox_group_sheet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py index b989ebf6..1ee7193a 100644 --- a/workflow/scripts/create_neo_fox_group_sheet.py +++ b/workflow/scripts/create_neo_fox_group_sheet.py @@ -28,9 +28,11 @@ sep="\t", ) # the Allele column can contain multiple ";"-separated entries for the -# same locus +# same locus -- NeoFox does a hard assertion that only two alleles per +# gene exist, so we chose to only ever keep the first of such multiple +# possibilities mhc_alleles.loc[:, "Allele"] = mhc_alleles["Allele"].str.split(pat=";") -mhc_alleles = mhc_alleles.explode(["Allele"]) +mhc_alleles = mhc_alleles.explode(["Allele"]).drop_duplicates(subset=["Locus", "Chromosome"]) mhc_alleles = mhc_alleles[mhc_alleles["Locus"].isin(ALLOWED_LOCI)] mhc_alleles.loc[:, "Allele"] = ( mhc_alleles["Allele"] From e867bf8a785d17decd6da485f5b53a86c2c8d9d6 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Thu, 8 Sep 2022 09:47:22 +0000 Subject: [PATCH 186/191] remove tumorType column from group_sheet if empty, as NeoFox cannot 
handle empty entries, here --- workflow/scripts/create_neo_fox_group_sheet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/create_neo_fox_group_sheet.py b/workflow/scripts/create_neo_fox_group_sheet.py index 1ee7193a..4242d684 100644 --- a/workflow/scripts/create_neo_fox_group_sheet.py +++ b/workflow/scripts/create_neo_fox_group_sheet.py @@ -57,6 +57,8 @@ "mhcIAlleles": [ mhc_one_alleles ], "mhcIIAlleles": [ mhc_two_alleles ], } -) +# This is required for cases where no tumorType is available, as NeoFox does not +# seem to be able to handle empty entries, here -- so we remove the whole column +).dropna(axis="columns") patient_info.to_csv(snakemake.output.group_sheet, sep="\t", quoting=3, index=False) From 68e0b963e17ac35cfb8a7271d63f9444b11fdb21 Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 23 Sep 2022 10:05:04 +0000 Subject: [PATCH 187/191] add datavzrd report for NeoFox output --- workflow/Snakefile | 1 + workflow/envs/datavzrd.yaml | 4 + workflow/report/neopeptides.rst | 9 ++ ...neo_fox-neoantigens-template.datavzrd.yaml | 16 +++ workflow/rules/common.smk | 3 +- workflow/rules/datavzrd.smk | 48 +++++++ workflow/rules/mhc_binding.smk | 4 +- workflow/scripts/prepare_neoprint.py | 118 ++++++++++++++++++ 8 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 workflow/envs/datavzrd.yaml create mode 100644 workflow/report/neopeptides.rst create mode 100644 workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml create mode 100644 workflow/rules/datavzrd.smk create mode 100644 workflow/scripts/prepare_neoprint.py diff --git a/workflow/Snakefile b/workflow/Snakefile index 6f3c2caf..7282e12f 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -49,6 +49,7 @@ include: "rules/microphaser.smk" include: "rules/hla_typing.smk" include: "rules/mhc_binding.smk" include: "rules/annotate_neoantigens.smk" +include: "rules/datavzrd.smk" rule all: diff --git a/workflow/envs/datavzrd.yaml 
b/workflow/envs/datavzrd.yaml new file mode 100644 index 00000000..e3f74967 --- /dev/null +++ b/workflow/envs/datavzrd.yaml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - datavzrd =2.1 diff --git a/workflow/report/neopeptides.rst b/workflow/report/neopeptides.rst new file mode 100644 index 00000000..1e6d6625 --- /dev/null +++ b/workflow/report/neopeptides.rst @@ -0,0 +1,9 @@ +Neopeptides and corresponding normal peptides as phased and determined by +microphaser, with various annotation scores gathered and provided by NeoFox, +using the HLA alleles determined by HLA-LA. + +=================== +Column descriptions +=================== + +TODO: transform spreadsheet into RST table diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml new file mode 100644 index 00000000..2f3e1773 --- /dev/null +++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml @@ -0,0 +1,16 @@ +name: ?f"Neopeptide candidates for {wildcards.group}, tumor sample {wildcards.tumor_alias}" + +default-view: "overview" + +datasets: + neoprint: + path: ?input.neopeptides + separator: "\t" + +views: + overview: + desc: ?f"Neopeptide candidates for {wildcards.group} tumor sample {wildcards.tumor_alias}, with annotations gathered and provided by NeoFox." 
+ dataset: neoprint + render-table: + gene: + link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value} diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 37a72984..9a71c4d0 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -114,9 +114,10 @@ def get_final_output(): if config["neoantigen_prediction"]["activate"]: final_output.extend( expand( - "results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", + "results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}", group=group, tumor_alias=tumor_aliases, + mhc=["I", "II"], ) ) # sequencing_types = pd.unique( diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk new file mode 100644 index 00000000..799616cc --- /dev/null +++ b/workflow/rules/datavzrd.smk @@ -0,0 +1,48 @@ +rule prepare_neoprint: + input: + neopeptides="results/neo_fox/annotated/{group}.{tumor_alias}.annotated_neoantigens.tsv", + output: + mhc_one="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.I.sorted.tsv", + mhc_two="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.II.sorted.tsv", + log: + "logs/prepare_neoprint/{group}.{tumor_alias}.log", + params: + purity = lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze() + conda: + "../envs/pandas.yaml" + script: + "../scripts/prepare_neoprint.py" + + +rule render_datavzrd_neoprint_config: + input: + template=workflow.source_path( + "../resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml" + ), + neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv", + output: + "resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml", + log: + "logs/datavzrd_render_neoprint/{group}.{tumor_alias}.{mhc}.log", + template_engine: + "yte" + + +rule datavzrd_neoprint: + input: + 
neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv", + config="resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml", + output: + report( + directory("results/datavzrd/neoprint/{group}.{tumor_alias}.{mhc}"), + htmlindex="index.html", + caption="../report/neopeptides.rst", + category="Neopeptides", + labels=lambda wc: {"group": wc.group, "sample_type": wc.tumor_alias, "MHC": wc.mhc}, + ), + conda: + "../envs/datavzrd.yaml" + log: + "logs/datavzrd_neoprint/{group}.{tumor_alias}.{mhc}.log", + shell: + "datavzrd {input.config} --output {output} &> {log}" diff --git a/workflow/rules/mhc_binding.smk b/workflow/rules/mhc_binding.smk index 0a6079ca..535015fc 100644 --- a/workflow/rules/mhc_binding.smk +++ b/workflow/rules/mhc_binding.smk @@ -83,7 +83,7 @@ rule merge_neoantigen_info: report( "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.DNA.tsv", caption="../report/neoantigens.dna.rst", - category="Neoantigens", + category="Neopeptides", ), log: "logs/mhc_csv_table/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.log", @@ -99,7 +99,7 @@ rule add_rna_info: report( "results/neoantigens/{group}.{tumor_alias}.merged_tumor_normal.{mhc}.RNA.tsv", caption="../report/neoantigens.rna.rst", - category="Neoantigens", + category="Neopeptides", ), params: abundance=lambda wc, input: "{}/abundance.tsv".format(input.counts), diff --git a/workflow/scripts/prepare_neoprint.py b/workflow/scripts/prepare_neoprint.py new file mode 100644 index 00000000..d9a01928 --- /dev/null +++ b/workflow/scripts/prepare_neoprint.py @@ -0,0 +1,118 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import pandas as pd +from typing import Tuple + +def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: + """ + Highlight the difference between mutated neopeptide and normal peptide + """ + if normal_p == "nan" or normal_p == "NA" or normal_p == "": + return (tumor_p, normal_p) + assert 
len(tumor_p) == len( + normal_p + ), f"Tumor peptide '{tumor_p}' and normal peptide '{normal_p}' have different lengths." + diff_pos = [i for i in range(len(tumor_p)) if tumor_p[i] != normal_p[i]] + tp_changed = tumor_p + np_changed = normal_p + for p in diff_pos: + tp_changed = tp_changed[:p] + tp_changed[p].lower() + tp_changed[p + 1 :] + np_changed = np_changed[:p] + np_changed[p].lower() + np_changed[p + 1 :] + return (tp_changed, np_changed) + +all_neopeptides = pd.read_csv(snakemake.input.neopeptides, sep="\t") + +# If we leave in any dots, datavzrd will interpret this as attributes +all_neopeptides.columns = all_neopeptides.columns.str.replace(".", "_", regex=False) + +# Aggregate multiple identical entries that differ only in 'id' and 'transcript' +# into one, taking the first 'id' and collecting all 'transcript's into a '|'-separated +# list. +# TODO: Remove this redundancy from microphaser output before passing it along to other +# tools. +cols = [c for c in all_neopeptides.columns if c not in ["id", "transcript"]] +aggregation_functions = { + "id": lambda i: list(i)[0], + "transcript": lambda t: "|".join(list(t)), +} +all_neopeptides = ( + all_neopeptides.groupby(cols, dropna=False) + .agg(aggregation_functions) + .reset_index() + .explode("id") +) + +# highlight mutations in the original Xmers +all_neopeptides[["mutation_mutatedXmer", "mutation_wildTypeXmer"]] = ( + pd.DataFrame( + all_neopeptides + .fillna({"mutation_wildTypeXmer": ""}) + .apply(lambda row: highlight_peptides_diff(row["mutation_mutatedXmer"], row["mutation_wildTypeXmer"]), axis="columns") + .tolist() + ) +) + +# create new purity-adjusted DNA variant allele frequency +all_neopeptides["purity_adjusted_DNA_VAF"] = all_neopeptides["dnaVariantAlleleFrequency"] / snakemake.params.purity + +# round all floats to the specified decimals +all_neopeptides = all_neopeptides.round(decimals=5) + +# define important columns to move to the left of the table + +important_cols_general = [ + "gene", + 
"mutation_mutatedXmer", + "mutation_wildTypeXmer", + "purity_adjusted_DNA_VAF", + "imputedGeneExpression", + ] + +important_cols_one = [ + "PRIME_best_rank", + "PRIME_best_score", + "PRIME_best_peptide", + "PRIME_best_allele", + "Best_rank_MHCI_9mer_score", + "Best_rank_MHCI_9mer_score_WT", + "Best_rank_MHCI_9mer_epitope", + "Best_rank_MHCI_9mer_epitope_WT", + "Best_rank_MHCI_9mer_allele", + "Best_rank_MHCI_9mer_allele_WT", + ] + +important_cols_two = [ + "MixMHC2pred_best_rank", + "MixMHC2pred_best_peptide", + "MixMHC2pred_best_allele", + "Best_rank_MHCII_score", + "Best_rank_MHCII_score_WT", + "Best_rank_MHCII_score_epitope", + "Best_rank_MHCII_score_epitope_WT", + "Best_rank_MHCII_score_allele", + "Best_rank_MHCII_score_allele_WT", + ] + +important_cols = important_cols_general + important_cols_one + important_cols_two + + +mhc_one = ( + all_neopeptides[ important_cols + [ col for col in all_neopeptides.columns if col not in important_cols ] ] + .sort_values(by = ["PRIME_best_rank", "MixMHC2pred_best_rank"]) + .groupby("PRIME_best_rank") + .head(n=1) +) + +mhc_one.to_csv(snakemake.output.mhc_one, sep="\t", index=False) + +# move important columns to the left of the table +mhc_two = ( + all_neopeptides[ important_cols_general + important_cols_two + important_cols_one + [ col for col in all_neopeptides.columns if col not in important_cols ] ] + .sort_values(by = ["MixMHC2pred_best_rank", "PRIME_best_rank"]) + .groupby("MixMHC2pred_best_rank") + .head(n=1) +) + +mhc_two.to_csv(snakemake.output.mhc_two, sep="\t", index=False) From 67ed16ae07b79c11573bcb22befd85dcfa324985 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Tue, 27 Sep 2022 16:47:09 +0000 Subject: [PATCH 188/191] fix: adapt to wrapper changes --- workflow/rules/microphaser.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 83061530..4c0da1c9 100644 --- a/workflow/rules/microphaser.smk +++ 
b/workflow/rules/microphaser.smk @@ -1,13 +1,13 @@ rule norm_bcf: input: "results/final-calls/{group}.{set}.bcf", - genome="resources/genome.fasta", + ref="resources/genome.fasta", output: "results/final-calls/{group}.{set}.norm.bcf", log: "logs/bcftools/norm/{group}.{set}.log", params: - lambda w, input: "-f {} -O b -m-".format(input.genome), # optional parameters for bcftools norm (except -o) + extra="-m-" wrapper: "v1.12.0/bio/bcftools/norm" @@ -58,7 +58,7 @@ rule merge_tumor_normal: params: extra="-O b -a", wrapper: - "v1.12.0/bio/bcftools/concat" + "v1.14.1/bio/bcftools/concat" rule microphaser_tumor: From 5ffe288d60ed6ce049888e5dbbfa0ec35193b535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Wed, 28 Sep 2022 13:23:15 +0000 Subject: [PATCH 189/191] polish datavzrd report --- workflow/envs/datavzrd.yaml | 2 +- ...neo_fox-neoantigens-template.datavzrd.yaml | 325 +++++++++++++++++- workflow/rules/common.smk | 39 +++ workflow/rules/datavzrd.smk | 5 +- workflow/rules/microphaser.smk | 2 +- workflow/scripts/prepare_neoprint.py | 40 +-- 6 files changed, 373 insertions(+), 40 deletions(-) diff --git a/workflow/envs/datavzrd.yaml b/workflow/envs/datavzrd.yaml index e3f74967..859add75 100644 --- a/workflow/envs/datavzrd.yaml +++ b/workflow/envs/datavzrd.yaml @@ -1,4 +1,4 @@ channels: - conda-forge dependencies: - - datavzrd =2.1 + - datavzrd =2.2 diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml index 2f3e1773..40398001 100644 --- a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml +++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml @@ -1,3 +1,313 @@ +__definitions__: + - import pandas as pd + - from itertools import chain + - from copy import deepcopy +__variables__: + important_cols: ?set(params.neofox_important_cols["general"]) | set(params.neofox_important_cols[wildcards.mhc]) + cols: 
?set(pd.read_csv(input.neopeptides, sep="\t").columns.values) + coldefs: + gene: + link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value} + mutation_mutatedXmer: + custom: | + function(value, row) { + return value.split("").map(function(a) { + if (a === a.toLowerCase()) { + return `${a}` + } else { + return a + } + }).join(""); + } + purity_adjusted_DNA_VAF: + plot: + ticks: + scale: linear + imputedGeneExpression: + plot: + ticks: + scale: linear + PRIME_best_rank: + plot: + heatmap: + scale: linear + domain: + - 0.0 + - 100.0 + range: + - "#EC0000" + - white + PRIME_best_score: + plot: + ticks: + scale: linear + Best_rank_MHCI_9mer_score: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCI_9mer_score_WT + Best_rank_MHCI_9mer_score_WT: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCI_9mer_score + MixMHC2pred_best_rank: + plot: + heatmap: + scale: linear + domain: + - 0.0 + - 100.0 + range: + - "#EC0000" + - "#ffffff" + Best_rank_MHCII_score: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCII_score_WT + Best_rank_MHCII_score_WT: + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_rank_MHCII_score + Recognition_Potential_MHCI_9mer: + plot: + ticks: + scale: linear + Improved_Binder_MHCI: + plot: + heatmap: + scale: ordinal + domain: + - 0 + - 1 + range: + - "#ffffff" + - "#2CA02C" + Selfsimilarity_MHCI_conserved_binder: + plot: + ticks: + scale: linear + mutation_position: + display-mode: detail + dnaVariantAlleleFrequency: + display-mode: detail + rnaVariantAlleleFrequency: + display-mode: hidden + rnaExpression: + display-mode: hidden + ADN_MHCI: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + ADN_MHCII: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + Amplitude_MHCII_rank: + display-mode: detail + plot: + heatmap: + scale: linear + domain: + - 0.0 + - 100.0 + range: + - "#EC0000" + - 
"#ffffff" + Amplitude_MHCI_affinity: + display-mode: detail + plot: + ticks: + scale: linear + Amplitude_MHCI_affinity_9mer: + display-mode: detail + plot: + ticks: + scale: linear + MixMHC2pred_best_allele: + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_rank_MHCII_score_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_rank_MHCII_score_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_rank_MHCII_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_affinity_MHCII_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_rank_MHCII_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_affinity_MHCII_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - MixMHC2pred_best_allele + - Best_rank_MHCII_score_allele + - Best_rank_MHCII_score_allele_WT + - Best_rank_MHCII_allele_WT + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + Best_affinity_MHCII_score: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCII_score_WT + Best_affinity_MHCII_score_WT: + display-mode: detail + plot: + ticks: + scale: linear + 
aux-domain-columns: + - Best_affinity_MHCII_score + PRIME_best_allele: + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_rank_MHCI_9mer_allele + - Best_rank_MHCI_9mer_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_rank_MHCI_9mer_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_rank_MHCI_9mer_allele + - Best_rank_MHCI_9mer_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_rank_MHCI_9mer_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_rank_MHCI_9mer_allele + - Best_rank_MHCI_9mer_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_affinity_MHCI_9mer_score: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_9mer_score_WT + Best_affinity_MHCI_9mer_score_WT: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_9mer_score + Best_affinity_MHCI_allele: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + - Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_affinity_MHCI_allele_WT: + display-mode: detail + plot: + heatmap: + scale: ordinal + color-scheme: category20 + aux-domain-columns: + - PRIME_best_allele + - Best_affinity_MHCII_allele + - Best_affinity_MHCII_allele_WT + - 
Best_affinity_MHCI_9mer_allele + - Best_affinity_MHCI_9mer_allele_WT + - Best_affinity_MHCI_allele + - Best_affinity_MHCI_allele_WT + Best_affinity_MHCI_score: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_score_WT + Best_affinity_MHCI_score_WT: + display-mode: detail + plot: + ticks: + scale: linear + aux-domain-columns: + - Best_affinity_MHCI_score + name: ?f"Neopeptide candidates for {wildcards.group}, tumor sample {wildcards.tumor_alias}" default-view: "overview" @@ -11,6 +321,17 @@ views: overview: desc: ?f"Neopeptide candidates for {wildcards.group} tumor sample {wildcards.tumor_alias}, with annotations gathered and provided by NeoFox." dataset: neoprint + render-table: - gene: - link-to-url: https://www.ensembl.org/Homo_sapiens/Gene/Summary?g={value} + columns: + ?for col, coldef in coldefs.items(): + __definitions__: + - | + coldef_ = deepcopy(coldef) + if col not in important_cols: + coldef_["display-mode"] = "detail" + ?col: ?coldef_ + ?for col in cols: + ?if col not in coldefs and col not in important_cols: + ?col: + display-mode: detail \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9a71c4d0..23a4b97e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -219,3 +219,42 @@ def get_alleles_MHCII(wildcards): group=wildcards.group, alias=alias, ) + + +##### Other stuff #### + +neofox_important_cols = { + "general": [ + "gene", + "mutation_mutatedXmer", + "mutation_wildTypeXmer", + "purity_adjusted_DNA_VAF", + "imputedGeneExpression", + ], + "I": [ + "PRIME_best_rank", + "PRIME_best_score", + "PRIME_best_peptide", + "PRIME_best_allele", + "Recognition_Potential_MHCI_9mer", + "Improved_Binder_MHCI", + "Selfsimilarity_MHCI_conserved_binder", + "Best_rank_MHCI_9mer_score", + "Best_rank_MHCI_9mer_score_WT", + "Best_rank_MHCI_9mer_epitope", + "Best_rank_MHCI_9mer_epitope_WT", + "Best_rank_MHCI_9mer_allele", + 
"Best_rank_MHCI_9mer_allele_WT", + ], + "II": [ + "MixMHC2pred_best_rank", + "MixMHC2pred_best_peptide", + "MixMHC2pred_best_allele", + "Best_rank_MHCII_score", + "Best_rank_MHCII_score_WT", + "Best_rank_MHCII_score_epitope", + "Best_rank_MHCII_score_epitope_WT", + "Best_rank_MHCII_score_allele", + "Best_rank_MHCII_score_allele_WT", + ] +} \ No newline at end of file diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index 799616cc..d09c9cd7 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -7,7 +7,8 @@ rule prepare_neoprint: log: "logs/prepare_neoprint/{group}.{tumor_alias}.log", params: - purity = lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze() + purity = lambda wc: samples.loc[(samples["group"] == wc.group) & (samples["alias"] == wc.tumor_alias), "purity"].squeeze(), + neofox_important_cols=neofox_important_cols, conda: "../envs/pandas.yaml" script: @@ -22,6 +23,8 @@ rule render_datavzrd_neoprint_config: neopeptides="results/tables/neoprint/{group}.{tumor_alias}.annotated_neopeptides.{mhc}.sorted.tsv", output: "resources/datavzrd/{group}.{tumor_alias}.datavzrd_neoprint.{mhc}.yaml", + params: + neofox_important_cols=neofox_important_cols, log: "logs/datavzrd_render_neoprint/{group}.{tumor_alias}.{mhc}.log", template_engine: diff --git a/workflow/rules/microphaser.smk b/workflow/rules/microphaser.smk index 4c0da1c9..a52d72ef 100644 --- a/workflow/rules/microphaser.smk +++ b/workflow/rules/microphaser.smk @@ -56,7 +56,7 @@ rule merge_tumor_normal: log: "logs/bcftools/concat-tumor-normal/{group}.merged_tumor_normal.log", params: - extra="-O b -a", + extra="-a", wrapper: "v1.14.1/bio/bcftools/concat" diff --git a/workflow/scripts/prepare_neoprint.py b/workflow/scripts/prepare_neoprint.py index d9a01928..0a535d14 100644 --- a/workflow/scripts/prepare_neoprint.py +++ b/workflow/scripts/prepare_neoprint.py @@ -1,4 +1,5 @@ import sys +import json sys.stderr = 
open(snakemake.log[0], "w") @@ -62,40 +63,9 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: # define important columns to move to the left of the table -important_cols_general = [ - "gene", - "mutation_mutatedXmer", - "mutation_wildTypeXmer", - "purity_adjusted_DNA_VAF", - "imputedGeneExpression", - ] - -important_cols_one = [ - "PRIME_best_rank", - "PRIME_best_score", - "PRIME_best_peptide", - "PRIME_best_allele", - "Best_rank_MHCI_9mer_score", - "Best_rank_MHCI_9mer_score_WT", - "Best_rank_MHCI_9mer_epitope", - "Best_rank_MHCI_9mer_epitope_WT", - "Best_rank_MHCI_9mer_allele", - "Best_rank_MHCI_9mer_allele_WT", - ] - -important_cols_two = [ - "MixMHC2pred_best_rank", - "MixMHC2pred_best_peptide", - "MixMHC2pred_best_allele", - "Best_rank_MHCII_score", - "Best_rank_MHCII_score_WT", - "Best_rank_MHCII_score_epitope", - "Best_rank_MHCII_score_epitope_WT", - "Best_rank_MHCII_score_allele", - "Best_rank_MHCII_score_allele_WT", - ] - -important_cols = important_cols_general + important_cols_one + important_cols_two +neofox_important_cols = snakemake.params.neofox_important_cols + +important_cols = neofox_important_cols["general"] + neofox_important_cols["I"] + neofox_important_cols["II"] mhc_one = ( @@ -109,7 +79,7 @@ def highlight_peptides_diff(tumor_p: str, normal_p: str) -> Tuple[str, str]: # move important columns to the left of the table mhc_two = ( - all_neopeptides[ important_cols_general + important_cols_two + important_cols_one + [ col for col in all_neopeptides.columns if col not in important_cols ] ] + all_neopeptides[ neofox_important_cols["general"] + neofox_important_cols["II"] + neofox_important_cols["I"] + [ col for col in all_neopeptides.columns if col not in important_cols ] ] .sort_values(by = ["MixMHC2pred_best_rank", "PRIME_best_rank"]) .groupby("MixMHC2pred_best_rank") .head(n=1) From 9257537e11c9d66893d643f66c85ad7c1926ffc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Wed, 28 Sep 2022 
13:58:57 +0000 Subject: [PATCH 190/191] polish template --- .../neo_fox-neoantigens-template.datavzrd.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml index 40398001..270c77a7 100644 --- a/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml +++ b/workflow/resources/datavzrd/neo_fox-neoantigens-template.datavzrd.yaml @@ -30,7 +30,7 @@ __variables__: PRIME_best_rank: plot: heatmap: - scale: linear + scale: symlog domain: - 0.0 - 100.0 @@ -319,7 +319,14 @@ datasets: views: overview: - desc: ?f"Neopeptide candidates for {wildcards.group} tumor sample {wildcards.tumor_alias}, with annotations gathered and provided by NeoFox." + desc: | + Neopeptide candidates with annotations gathered and provided by NeoFox. + + ### Column descriptions + * **Recognition_Potential_MHCI_9mer:** The recognition potential of a neoantigen is the likelihood that it is effectively recognized by the TCR repertoire (definition: Amplitude_MHCI_affinity_9mer x Pathogensimiliarity_MHCI_affinity_9mer) \[[runup to equation (5) in Luksza et al. 2017](https://doi.org/10.1038/nature24473)\]. + * **Selfsimilarity_MHCI_conserved_binder:** Score for k-mer based similarity between Best_rank_MHCI_score_epitope and Best_affinity_MHCI_epitope_WT. For conserved binders only (i.e. NOT improved binders), where this score is considered indicative of immunogenicity \[[page 3 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\]. 
+ * **Improved_Binder_MHCI:** Cutoff of 1.2 on the ratio between normal and mutated peptide rank scores to designate a peptide as an improved binder (as opposed to a conserved binder) (definition: (Best_rank_MHCI_score_WT / Best_rank_MHCI_score ) > 1.2) \[[page 4 of Bjerregaard et al., 2017, Front Immunol.](https://doi.org/10.3389/fimmu.2017.01566)\] + dataset: neoprint render-table: From d40cffe13676441fc29d8e93db2e52a5ce85138b Mon Sep 17 00:00:00 2001 From: dlaehnemann Date: Fri, 18 Nov 2022 15:58:26 +0000 Subject: [PATCH 191/191] update to Ensembl release 108 (GTFs in 105 and 106 had genes partly unsorted) --- .test/config/config.yaml | 2 +- config/config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 80c0e438..7eb854e1 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -13,7 +13,7 @@ ref: # Ensembl species name species: homo_sapiens # Ensembl release - release: 100 + release: 108 # Genome build build: GRCh38 diff --git a/config/config.yaml b/config/config.yaml index 4d447645..e00ae4d9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -14,7 +14,7 @@ ref: # Ensembl species name species: homo_sapiens # Ensembl release - release: 100 + release: 108 # Genome build build: GRCh38