Skip to content

Commit

Permalink
Merge pull request #14 from TRON-Bioinformatics/vafator2decifer
Browse files Browse the repository at this point in the history
Vafator2decifer
  • Loading branch information
priesgo authored Nov 10, 2021
2 parents 3b46385 + a505fb2 commit feca54d
Show file tree
Hide file tree
Showing 20 changed files with 521 additions and 299 deletions.
39 changes: 27 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,35 @@ Run as follows:
```
vafator --input-vcf /path/yo/your.vcf \
--output-vcf /path/yo/your_vafator.vcf \
--normal-bams /path/to/your_normal.bam \
--tumor-bams /path/to/your_primary_tumor.bam,/path/to/your_metastasis_tumor.bam
--bam normal /path/to/your_normal.bam \
--bam primary /path/to/your_primary_tumor.bam \
--bam metastasis /path/to/your_metastasis_tumor.bam
```

Both tumor and normal BAMs are optional, it can annotate only with the tumor BAMs or with the normal BAMs.
This will add annotations for each of the three samples `normal`, `primary` and `metastasis`: `normal_ac`,
`normal_dp`, `normal_af`, `primary_ac`, `primary_dp`, `primary_af`,
`metastasis_ac`, `metastasis_dp` and `metastasis_af`.

If more than one BAM is provided for either the tumor or the normal then the annotations are calculated across all BAMs
and for also each of them separately (eg: `tumor_af` provides the allele frequency across all tumor BAMs, `tumor_af_1`
and `tumor_af_2` provide the allele frequency on the first and second BAM respectively).
If more than one BAM is provided for any sample then the annotations are calculated across all BAMs
and for also each of them separately (eg: `primary_af` provides the allele frequency across all primary tumor BAMs,
`primary_af_1` and `primary_af_2` provide the allele frequency on the first and second BAM respectively).

Also, a `--prefix` parameter can be used to run VAFator multiple times over the same VCF without overwriting its
annotations. For instance, first `vafator [...] --prefix primary --tumor-bams /path/to/your_primary_tumor.bam` and
then `vafator [...] --prefix metastasis --tumor-bams /path/to/your_metastasis_tumor.bam`.
```
vafator --input-vcf /path/yo/your.vcf \
--output-vcf /path/yo/your_vafator.vcf \
--bam primary /path/to/your_primary_tumor_1.bam \
--bam primary /path/to/your_primary_tumor_2.bam
```

Alternatively, you can use `--normal-bams` and/or `--tumor-bams` and the sample names will be predefined to `normal`
and `tumor`respectively.

```
vafator --input-vcf /path/yo/your.vcf \
--output-vcf /path/yo/your_vafator.vcf \
--normal-bams /path/to/your_normal.bam \
--tumor-bams /path/to/your_tumor_1.bam,/path/to/your_tumor_2.bam
```

Use the parameters `--mapping-quality` and `--base-call-quality` to define the minimum quality values for each read.
All reads with quality values velow these thresholds will be filtered out.
Expand Down Expand Up @@ -77,7 +93,7 @@ VAFator is available as a Nextflow pipeline for convenience.

Run as follows:
```
nextflow run tron-bioinformatics/vafator -r 1.0.0 -profile conda --input_files /path/to/your.tsv
nextflow run tron-bioinformatics/vafator -r 1.1.0 -profile conda --input_files /path/to/your.tsv
```

where `--input_files` expects four tab-separated columns **without a header**:
Expand All @@ -94,8 +110,7 @@ Optional parameters:
variant at the same position is kept)
* `--base_call_quality`: minimum base call quality, reads with values below will be filtered out (default: 30)
* `--mapping_quality`: minimum mapping quality, reads with values below will be filtered out (default: 1)
* `--prefix`: when provided the annotations are preceded by this prefix (e.g.: <PREFIX>_tumor_ac, <PREFIX>_tumor_af, etc),
otherwise the annotations are named as tumor_af, normal_af, tumor_ac, normal_ac, tumor_dp and normal_d


## Support for indels

Expand Down
1 change: 0 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ params.output = "output"
params.mapping_quality = false
params.base_call_quality = false
params.skip_multiallelic_filter = false
params.prefix = false

if (params.help) {
log.info params.help_message
Expand Down
4 changes: 1 addition & 3 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
cleanup = true
conda.createTimeout = '1 h'

VERSION = '1.0.0'
VERSION = '1.1.0'

manifest {
name = 'TRON-Bioinformatics/vafator'
Expand Down Expand Up @@ -62,8 +62,6 @@ Optional input:
* skip_multiallelic_filter: skip the filtering of multiallelics by frequency in the tumor (only highest frequency variant at the same position is kept)
* base_call_quality: minimum base call quality, reads with values below will be filtered out (default: 30)
* mapping_quality: minimum mapping quality, reads with values below will be filtered out (default: 1)
* prefix: when provided the annotations are preceded by this prefix (e.g.: <PREFIX>_tumor_ac, <PREFIX>_tumor_af, etc),
otherwise the annotations are named as tumor_af, normal_af, tumor_ac, normal_ac, tumor_dp and normal_d
Output:
* Annotated VCF file
Expand Down
8 changes: 3 additions & 5 deletions nf_modules/vafator.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ params.output = ""
params.mapping_quality = false
params.base_call_quality = false
params.skip_multiallelic_filter = false
params.prefix = false
params.enable_conda = false


Expand All @@ -14,7 +13,7 @@ process VAFATOR {
tag "${name}"
publishDir "${params.output}/${name}", mode: "copy"

conda (params.enable_conda ? "bioconda::vafator=1.0.0" : null)
conda (params.enable_conda ? "bioconda::vafator=1.1.0" : null)

input:
tuple val(name), file(vcf), val(normal_bams), val(tumor_bams)
Expand All @@ -27,11 +26,10 @@ process VAFATOR {
tumor_bams_param = tumor_bams?.trim() ? "--tumor-bams " + tumor_bams.split(",").join(" ") : ""
mq_param = params.mapping_quality ? "--mapping-quality " + params.mapping_quality : ""
bq_param = params.base_call_quality ? "--base-call-quality " + params.base_call_quality : ""
prefix_param = params.prefix ? "--prefix " + params.prefix : ""
"""
vafator \
--input-vcf ${vcf} \
--output-vcf ${vcf.baseName}.vaf.vcf ${normal_bams_param} ${tumor_bams_param} ${mq_param} ${bq_param} ${prefix_param}
--output-vcf ${vcf.baseName}.vaf.vcf ${normal_bams_param} ${tumor_bams_param} ${mq_param} ${bq_param}
"""
}

Expand All @@ -42,7 +40,7 @@ process MULTIALLELIC_FILTER {
tag "${name}"
publishDir "${params.output}/${name}", mode: "copy"

conda (params.enable_conda ? "bioconda::vafator=1.0.0" : null)
conda (params.enable_conda ? "bioconda::vafator=1.1.0" : null)

input:
tuple val(name), file(vcf)
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pandas==1.3.3
pysam==0.17.0
cyvcf2==0.30.11
logzero==1.7.0
logzero==1.7.0
pybedtools==0.8.2
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
entry_points={
'console_scripts': [
'vafator=vafator.command_line:annotator',
'multiallelics-filter=vafator.command_line:multiallelics_filter'
'multiallelics-filter=vafator.command_line:multiallelics_filter',
'vafator2decifer=vafator.command_line:vafator2decifer'
],
},
author="TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz"
Expand Down
2 changes: 1 addition & 1 deletion vafator/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION='1.0.0'
VERSION='1.1.0'
188 changes: 67 additions & 121 deletions vafator/annotator.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
from typing import List
import pysam
from cyvcf2 import VCF, Writer, Variant
import os
from pysam import AlignmentFile
import vafator
import datetime
import json
from vafator.pileups import get_insertion_metrics, get_deletion_metrics, get_snv_metrics, get_variant_pileup, \
build_variant
from vafator.pileups import get_variant_pileup, build_variant, get_metrics


class Annotator(object):
Expand All @@ -19,133 +16,93 @@ class Annotator(object):
"timestamp": datetime.datetime.now().timestamp(),
}

def __init__(self, input_vcf: str, output_vcf: str, normal_bams=[], tumor_bams=[],
mapping_qual_thr=0, base_call_qual_thr=29, prefix=None):
def __init__(self, input_vcf: str, output_vcf: str, input_bams: dict, mapping_qual_thr=0, base_call_qual_thr=29):

self.mapping_quality_threshold = mapping_qual_thr
self.base_call_quality_threshold = base_call_qual_thr
self.vcf = VCF(input_vcf)
# sets a line in the header with the command used to annotate the file
self.vafator_header["input_vcf"] = input_vcf
self.vafator_header["output_vcf"] = output_vcf
self.vafator_header["normal_bams"] = normal_bams
self.vafator_header["tumor_bams"] = tumor_bams
self.vafator_header["bams"] = ";".join(["{}:{}".format(s, ",".join(b)) for s, b in input_bams.items()])
self.vafator_header["mapping_quality_threshold"] = mapping_qual_thr
self.vafator_header["base_call_quality_threshold"] = base_call_qual_thr
self.vafator_header["prefix"] = prefix
self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header)))
# adds to the header all the names of the annotations
self.tumor_prefix = "{prefix}_tumor".format(prefix=prefix) if prefix is not None else "tumor"
self.normal_prefix = "{prefix}_normal".format(prefix=prefix) if prefix is not None else "normal"
for a in Annotator._get_headers(self.tumor_prefix, tumor_bams) + \
Annotator._get_headers(self.normal_prefix, normal_bams):
for a in Annotator._get_headers(input_bams):
self.vcf.add_info_to_header(a)
self.vcf_writer = Writer(output_vcf, self.vcf)
self.tumor_bams = [pysam.AlignmentFile(b, "rb") for b in tumor_bams]
self.normal_bams = [pysam.AlignmentFile(b, "rb") for b in normal_bams]
self.bam_readers = {s : [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}

@staticmethod
def _get_headers(prefix, bams):
def _get_headers(input_bams: dict):
headers = []
if len(bams) > 0:
headers = [
{'ID': "{}_af".format(prefix),
'Description': "Allele frequency for the alternate alleles in the {} samples".format(prefix),
'Type': 'Float', 'Number': 'A'},
{'ID': "{}_dp".format(prefix),
'Description': "Total depth of coverage in the {} samples (independent of alleles)".format(prefix),
'Type': 'Float', 'Number': 'A'},
{'ID': "{}_ac".format(prefix),
'Description': "Allele count for the alternate alleles in the {} samples".format(prefix),
'Type': 'Integer', 'Number': 'A'}
]
if len(bams) > 1:
for i, b in enumerate(bams, start=1):
n = os.path.basename(b).split(".")[0]
headers = headers + [
{'ID': "{}_af_{}".format(prefix, i),
'Description': "Allele frequency for the alternate alleles in the {} sample {}".format(prefix, n),
'Type': 'Float', 'Number': 'A'},
{'ID': "{}_dp_{}".format(prefix, i),
'Description': "Depth of coverage in the {} sample {} (independent of alleles)".format(prefix, n),
'Type': 'Float', 'Number': 'A'},
{'ID': "{}_ac_{}".format(prefix, i),
'Description': "Allele count for the alternate alleles in the {} sample {}".format(prefix, n),
'Type': 'Integer', 'Number': 'A'}
]

for s, bams in input_bams.items():
headers.append({
'ID': "{}_af".format(s),
'Description': "Allele frequency for the alternate alleles in the {} sample/s".format(s),
'Type': 'Float',
'Number': 'A'
})
headers.append({
'ID': "{}_dp".format(s),
'Description': "Total depth of coverage in the {} sample/s (independent of alleles)".format(s),
'Type': 'Float',
'Number': '1'
})
headers.append({
'ID': "{}_ac".format(s),
'Description': "Allele count for the alternate alleles in the {} sample/s".format(s),
'Type': 'Integer',
'Number': 'A'
})

if len(bams) > 1:
for i, bam in enumerate(bams, start=1):
n = os.path.basename(bam).split(".")[0]
headers = headers + [
{'ID': "{}_af_{}".format(s, i),
'Description': "Allele frequency for the alternate alleles in the {} sample {}".format(s, n),
'Type': 'Float', 'Number': 'A'},
{'ID': "{}_dp_{}".format(s, i),
'Description': "Depth of coverage in the {} sample {} (independent of alleles)".format(s, n),
'Type': 'Float', 'Number': '1'},
{'ID': "{}_ac_{}".format(s, i),
'Description': "Allele count for the alternate alleles in the {} sample {}".format(s, n),
'Type': 'Integer', 'Number': 'A'}
]
return headers

def _write_batch(self, batch):
for v in batch:
self.vcf_writer.write_record(v)

def _add_snv_stats(self, bams: List[AlignmentFile], variant: Variant, prefix: str):
def _add_stats(self, variant: Variant):

global_dp = 0
global_ac = {}
vafator_variant = build_variant(variant)
for i, b in enumerate(bams):
pileups = get_variant_pileup(variant=vafator_variant, bam=b,
min_base_quality=self.base_call_quality_threshold,
min_mapping_quality=self.mapping_quality_threshold)
ac, dp = get_snv_metrics(variant=vafator_variant, pileups=pileups)
if len(bams) > 1:
variant.INFO["{}_af_{}".format(prefix, i + 1)] = ",".join(
[str(self._calculate_af(ac[alt], dp)) for alt in variant.ALT])
variant.INFO["{}_ac_{}".format(prefix, i + 1)] = ",".join([str(ac[alt]) for alt in variant.ALT])
variant.INFO["{}_dp_{}".format(prefix, i + 1)] = dp
for alt in variant.ALT:
global_ac[alt] = global_ac.get(alt, 0) + ac[alt]
global_dp += dp

variant.INFO["{}_af".format(prefix)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT])
variant.INFO["{}_ac".format(prefix)] = ",".join([str(global_ac[alt]) for alt in variant.ALT])
variant.INFO["{}_dp".format(prefix)] = global_dp

def _add_insertion_stats(self, bams: List[AlignmentFile], variant: Variant, prefix: str):

global_dp = 0
global_ac = 0
vafator_variant = build_variant(variant)
for i, b in enumerate(bams):
pileups = get_variant_pileup(variant=vafator_variant, bam=b,
min_base_quality=self.base_call_quality_threshold,
min_mapping_quality=self.mapping_quality_threshold)
ac, dp = get_insertion_metrics(variant=vafator_variant, pileups=pileups)
af = self._calculate_af(ac, dp)
if len(bams) > 1:
variant.INFO["{}_af_{}".format(prefix, i+1)] = af
variant.INFO["{}_ac_{}".format(prefix, i+1)] = ac
variant.INFO["{}_dp_{}".format(prefix, i+1)] = dp
global_ac += ac
global_dp += dp

global_af = self._calculate_af(global_ac, global_dp)
variant.INFO["{}_af".format(prefix)] = global_af
variant.INFO["{}_ac".format(prefix)] = global_ac
variant.INFO["{}_dp".format(prefix)] = global_dp

def _add_deletion_stats(self, bams: List[AlignmentFile], variant: Variant, prefix: str):

global_dp = 0
global_ac = 0
vafator_variant = build_variant(variant)
for i, b in enumerate(bams):
pileups = get_variant_pileup(variant=vafator_variant, bam=b,
min_base_quality=self.base_call_quality_threshold,
min_mapping_quality=self.mapping_quality_threshold)
ac, dp = get_deletion_metrics(variant=vafator_variant, pileups=pileups)
af = self._calculate_af(ac, dp)
if len(bams) > 1:
variant.INFO["{}_af_{}".format(prefix, i+1)] = af
variant.INFO["{}_ac_{}".format(prefix, i+1)] = ac
variant.INFO["{}_dp_{}".format(prefix, i+1)] = dp
global_ac += ac
global_dp += dp
global_af = self._calculate_af(global_ac, global_dp)
variant.INFO["{}_af".format(prefix)] = global_af
variant.INFO["{}_ac".format(prefix)] = global_ac
variant.INFO["{}_dp".format(prefix)] = global_dp
for sample, bams in self.bam_readers.items():
for i, bam in enumerate(bams):
pileups = get_variant_pileup(
variant=vafator_variant, bam=bam,
min_base_quality=self.base_call_quality_threshold,
min_mapping_quality=self.mapping_quality_threshold)
coverage_metrics = get_metrics(variant=vafator_variant, pileups=pileups)
if coverage_metrics is not None:
if len(bams) > 1:
variant.INFO["{}_af_{}".format(sample, i + 1)] = ",".join(
[str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) for alt in variant.ALT])
variant.INFO["{}_ac_{}".format(sample, i + 1)] = ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT])
variant.INFO["{}_dp_{}".format(sample, i + 1)] = coverage_metrics.dp
for alt in variant.ALT:
global_ac[alt] = global_ac.get(alt, 0) + coverage_metrics.ac[alt]
global_dp += coverage_metrics.dp

variant.INFO["{}_af".format(sample)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT])
variant.INFO["{}_ac".format(sample)] = ",".join([str(global_ac[alt]) for alt in variant.ALT])
variant.INFO["{}_dp".format(sample)] = global_dp

def _calculate_af(self, ac, dp):
return float(ac) / dp if dp > 0 else 0.0
Expand All @@ -155,20 +112,8 @@ def run(self):
variant: Variant
for variant in self.vcf:
# gets the counts of all bases across all BAMs
if self.tumor_bams:
if variant.is_snp:
self._add_snv_stats(self.tumor_bams, variant, self.tumor_prefix)
elif variant.is_indel and not variant.is_deletion:
self._add_insertion_stats(self.tumor_bams, variant, self.tumor_prefix)
elif variant.is_indel and variant.is_deletion:
self._add_deletion_stats(self.tumor_bams, variant, self.tumor_prefix)
if self.normal_bams:
if variant.is_snp:
self._add_snv_stats(self.normal_bams, variant, self.normal_prefix)
elif variant.is_indel and not variant.is_deletion:
self._add_insertion_stats(self.normal_bams, variant, self.normal_prefix)
elif variant.is_indel and variant.is_deletion:
self._add_deletion_stats(self.normal_bams, variant, self.normal_prefix)
self._add_stats(variant)

batch.append(variant)
if len(batch) >= 1000:
self._write_batch(batch)
Expand All @@ -178,5 +123,6 @@ def run(self):

self.vcf_writer.close()
self.vcf.close()
for b in self.tumor_bams + self.normal_bams:
b.close()
for _, bams in self.bam_readers.items():
for bam in bams:
bam.close()
Loading

0 comments on commit feca54d

Please sign in to comment.