Merge pull request #14 from TRON-Bioinformatics/vafator2decifer

Vafator2decifer
TRON-Bioinformatics · Nov 10, 2021 · feca54d · feca54d
2 parents 3b46385 + a505fb2
commit feca54d
Show file tree

Hide file tree

Showing 20 changed files with 521 additions and 299 deletions.
diff --git a/README.md b/README.md
@@ -36,19 +36,35 @@ Run as follows:
 ```
 vafator --input-vcf /path/yo/your.vcf \
 --output-vcf /path/yo/your_vafator.vcf \ 
---normal-bams /path/to/your_normal.bam \
---tumor-bams /path/to/your_primary_tumor.bam,/path/to/your_metastasis_tumor.bam
+--bam normal /path/to/your_normal.bam \
+--bam primary /path/to/your_primary_tumor.bam \
+--bam metastasis /path/to/your_metastasis_tumor.bam
 ```
 
-Both tumor and normal BAMs are optional, it can annotate only with the tumor BAMs or with the normal BAMs.
+This will add annotations for each of the three samples `normal`, `primary` and `metastasis`: `normal_ac`, 
+`normal_dp`, `normal_af`, `primary_ac`, `primary_dp`, `primary_af`, 
+`metastasis_ac`, `metastasis_dp` and `metastasis_af`. 
 
-If more than one BAM is provided for either the tumor or the normal then the annotations are calculated across all BAMs 
-and for also each of them separately (eg: `tumor_af` provides the allele frequency across all tumor BAMs, `tumor_af_1` 
-and `tumor_af_2` provide the allele frequency on the first and second BAM respectively).
+If more than one BAM is provided for any sample then the annotations are calculated across all BAMs 
+and for also each of them separately (eg: `primary_af` provides the allele frequency across all primary tumor BAMs, 
+`primary_af_1` and `primary_af_2` provide the allele frequency on the first and second BAM respectively).
 
-Also, a `--prefix` parameter can be used to run VAFator multiple times over the same VCF without overwriting its
-annotations. For instance, first `vafator [...] --prefix primary --tumor-bams /path/to/your_primary_tumor.bam` and
-then `vafator [...] --prefix metastasis --tumor-bams /path/to/your_metastasis_tumor.bam`.
+```
+vafator --input-vcf /path/yo/your.vcf \
+--output-vcf /path/yo/your_vafator.vcf \ 
+--bam primary /path/to/your_primary_tumor_1.bam \
+--bam primary /path/to/your_primary_tumor_2.bam
+```
+
+Alternatively, you can use `--normal-bams` and/or `--tumor-bams` and the sample names will be predefined to `normal` 
+and `tumor`respectively.
+
+```
+vafator --input-vcf /path/yo/your.vcf \
+--output-vcf /path/yo/your_vafator.vcf \ 
+--normal-bams /path/to/your_normal.bam \
+--tumor-bams /path/to/your_tumor_1.bam,/path/to/your_tumor_2.bam
+```
 
 Use the parameters `--mapping-quality` and `--base-call-quality` to define the minimum quality values for each read.
 All reads with quality values velow these thresholds will be filtered out.
@@ -77,7 +93,7 @@ VAFator is available as a Nextflow pipeline for convenience.
 
 Run as follows:
 ```
-nextflow run tron-bioinformatics/vafator -r 1.0.0 -profile conda --input_files /path/to/your.tsv
+nextflow run tron-bioinformatics/vafator -r 1.1.0 -profile conda --input_files /path/to/your.tsv
 ```
 
 where `--input_files` expects four tab-separated columns **without a header**:
@@ -94,8 +110,7 @@ Optional parameters:
   variant at the same position is kept)
 * `--base_call_quality`: minimum base call quality, reads with values below will be filtered out (default: 30)
 * `--mapping_quality`: minimum mapping quality, reads with values below will be filtered out (default: 1)
-* `--prefix`: when provided the annotations are preceded by this prefix (e.g.: <PREFIX>_tumor_ac, <PREFIX>_tumor_af, etc), 
-  otherwise the annotations are named as tumor_af, normal_af, tumor_ac, normal_ac, tumor_dp and normal_d
+
 
 ## Support for indels
 

diff --git a/main.nf b/main.nf
@@ -12,7 +12,6 @@ params.output = "output"
 params.mapping_quality = false
 params.base_call_quality = false
 params.skip_multiallelic_filter = false
-params.prefix = false
 
 if (params.help) {
     log.info params.help_message

diff --git a/nextflow.config b/nextflow.config
@@ -27,7 +27,7 @@ process.shell = ['/bin/bash', '-euo', 'pipefail']
 cleanup = true
 conda.createTimeout = '1 h'
 
-VERSION = '1.0.0'
+VERSION = '1.1.0'
 
 manifest {
   name = 'TRON-Bioinformatics/vafator'
@@ -62,8 +62,6 @@ Optional input:
     * skip_multiallelic_filter: skip the filtering of multiallelics by frequency in the tumor (only highest frequency variant at the same position is kept)
     * base_call_quality: minimum base call quality, reads with values below will be filtered out (default: 30)
     * mapping_quality: minimum mapping quality, reads with values below will be filtered out (default: 1)
-    * prefix: when provided the annotations are preceded by this prefix (e.g.: <PREFIX>_tumor_ac, <PREFIX>_tumor_af, etc),
-    otherwise the annotations are named as tumor_af, normal_af, tumor_ac, normal_ac, tumor_dp and normal_d
 
 Output:
     * Annotated VCF file

diff --git a/nf_modules/vafator.nf b/nf_modules/vafator.nf
@@ -4,7 +4,6 @@ params.output = ""
 params.mapping_quality = false
 params.base_call_quality = false
 params.skip_multiallelic_filter = false
-params.prefix = false
 params.enable_conda = false
 
 
@@ -14,7 +13,7 @@ process VAFATOR {
     tag "${name}"
     publishDir "${params.output}/${name}", mode: "copy"
 
-    conda (params.enable_conda ? "bioconda::vafator=1.0.0" : null)
+    conda (params.enable_conda ? "bioconda::vafator=1.1.0" : null)
 
     input:
     tuple val(name), file(vcf), val(normal_bams), val(tumor_bams)
@@ -27,11 +26,10 @@ process VAFATOR {
     tumor_bams_param = tumor_bams?.trim() ? "--tumor-bams " + tumor_bams.split(",").join(" ") : ""
     mq_param = params.mapping_quality ? "--mapping-quality " + params.mapping_quality : ""
     bq_param = params.base_call_quality ? "--base-call-quality " + params.base_call_quality : ""
-    prefix_param = params.prefix ? "--prefix " + params.prefix : ""
     """
     vafator \
     --input-vcf ${vcf} \
-    --output-vcf ${vcf.baseName}.vaf.vcf ${normal_bams_param} ${tumor_bams_param} ${mq_param} ${bq_param} ${prefix_param}
+    --output-vcf ${vcf.baseName}.vaf.vcf ${normal_bams_param} ${tumor_bams_param} ${mq_param} ${bq_param}
     """
 }
 
@@ -42,7 +40,7 @@ process MULTIALLELIC_FILTER {
     tag "${name}"
     publishDir "${params.output}/${name}", mode: "copy"
 
-    conda (params.enable_conda ? "bioconda::vafator=1.0.0" : null)
+    conda (params.enable_conda ? "bioconda::vafator=1.1.0" : null)
 
     input:
     tuple val(name), file(vcf)

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 pandas==1.3.3
 pysam==0.17.0
 cyvcf2==0.30.11
-logzero==1.7.0
+logzero==1.7.0
+pybedtools==0.8.2
diff --git a/setup.py b/setup.py
@@ -20,7 +20,8 @@
     entry_points={
         'console_scripts': [
             'vafator=vafator.command_line:annotator',
-            'multiallelics-filter=vafator.command_line:multiallelics_filter'
+            'multiallelics-filter=vafator.command_line:multiallelics_filter',
+            'vafator2decifer=vafator.command_line:vafator2decifer'
         ],
     },
     author="TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz"

diff --git a/vafator/__init__.py b/vafator/__init__.py
@@ -1 +1 @@
-VERSION='1.0.0'
+VERSION='1.1.0'
diff --git a/vafator/annotator.py b/vafator/annotator.py
@@ -1,13 +1,10 @@
-from typing import List
 import pysam
 from cyvcf2 import VCF, Writer, Variant
 import os
-from pysam import AlignmentFile
 import vafator
 import datetime
 import json
-from vafator.pileups import get_insertion_metrics, get_deletion_metrics, get_snv_metrics, get_variant_pileup, \
-    build_variant
+from vafator.pileups import get_variant_pileup, build_variant, get_metrics
 
 
 class Annotator(object):
@@ -19,133 +16,93 @@ class Annotator(object):
         "timestamp": datetime.datetime.now().timestamp(),
     }
 
-    def __init__(self, input_vcf: str, output_vcf: str, normal_bams=[], tumor_bams=[],
-                 mapping_qual_thr=0, base_call_qual_thr=29, prefix=None):
+    def __init__(self, input_vcf: str, output_vcf: str, input_bams: dict, mapping_qual_thr=0, base_call_qual_thr=29):
 
         self.mapping_quality_threshold = mapping_qual_thr
         self.base_call_quality_threshold = base_call_qual_thr
         self.vcf = VCF(input_vcf)
         # sets a line in the header with the command used to annotate the file
         self.vafator_header["input_vcf"] = input_vcf
         self.vafator_header["output_vcf"] = output_vcf
-        self.vafator_header["normal_bams"] = normal_bams
-        self.vafator_header["tumor_bams"] = tumor_bams
+        self.vafator_header["bams"] = ";".join(["{}:{}".format(s, ",".join(b)) for s, b in input_bams.items()])
         self.vafator_header["mapping_quality_threshold"] = mapping_qual_thr
         self.vafator_header["base_call_quality_threshold"] = base_call_qual_thr
-        self.vafator_header["prefix"] = prefix
         self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header)))
         # adds to the header all the names of the annotations
-        self.tumor_prefix = "{prefix}_tumor".format(prefix=prefix) if prefix is not None else "tumor"
-        self.normal_prefix = "{prefix}_normal".format(prefix=prefix) if prefix is not None else "normal"
-        for a in Annotator._get_headers(self.tumor_prefix, tumor_bams) + \
-                 Annotator._get_headers(self.normal_prefix, normal_bams):
+        for a in Annotator._get_headers(input_bams):
             self.vcf.add_info_to_header(a)
         self.vcf_writer = Writer(output_vcf, self.vcf)
-        self.tumor_bams = [pysam.AlignmentFile(b, "rb") for b in tumor_bams]
-        self.normal_bams = [pysam.AlignmentFile(b, "rb") for b in normal_bams]
+        self.bam_readers = {s : [pysam.AlignmentFile(b, "rb") for b in bams] for s, bams in input_bams.items()}
 
     @staticmethod
-    def _get_headers(prefix, bams):
+    def _get_headers(input_bams: dict):
         headers = []
-        if len(bams) > 0:
-            headers = [
-                {'ID': "{}_af".format(prefix),
-                 'Description': "Allele frequency for the alternate alleles in the {} samples".format(prefix),
-                 'Type': 'Float', 'Number': 'A'},
-                {'ID': "{}_dp".format(prefix),
-                 'Description': "Total depth of coverage in the {} samples (independent of alleles)".format(prefix),
-                 'Type': 'Float', 'Number': 'A'},
-                {'ID': "{}_ac".format(prefix),
-                 'Description': "Allele count for the alternate alleles in the {} samples".format(prefix),
-                 'Type': 'Integer', 'Number': 'A'}
-            ]
-        if len(bams) > 1:
-            for i, b in enumerate(bams, start=1):
-                n = os.path.basename(b).split(".")[0]
-                headers = headers + [
-                    {'ID': "{}_af_{}".format(prefix, i),
-                     'Description': "Allele frequency for the alternate alleles in the {} sample {}".format(prefix, n),
-                     'Type': 'Float', 'Number': 'A'},
-                    {'ID': "{}_dp_{}".format(prefix, i),
-                     'Description': "Depth of coverage in the {} sample {} (independent of alleles)".format(prefix, n),
-                     'Type': 'Float', 'Number': 'A'},
-                    {'ID': "{}_ac_{}".format(prefix, i),
-                     'Description': "Allele count for the alternate alleles in the {} sample {}".format(prefix, n),
-                     'Type': 'Integer', 'Number': 'A'}
-                ]
+
+        for s, bams in input_bams.items():
+            headers.append({
+                'ID': "{}_af".format(s),
+                'Description': "Allele frequency for the alternate alleles in the {} sample/s".format(s),
+                'Type': 'Float',
+                'Number': 'A'
+            })
+            headers.append({
+                'ID': "{}_dp".format(s),
+                'Description': "Total depth of coverage in the {} sample/s (independent of alleles)".format(s),
+                'Type': 'Float',
+                'Number': '1'
+            })
+            headers.append({
+                'ID': "{}_ac".format(s),
+                'Description': "Allele count for the alternate alleles in the {} sample/s".format(s),
+                'Type': 'Integer',
+                'Number': 'A'
+            })
+
+            if len(bams) > 1:
+                for i, bam in enumerate(bams, start=1):
+                    n = os.path.basename(bam).split(".")[0]
+                    headers = headers + [
+                        {'ID': "{}_af_{}".format(s, i),
+                         'Description': "Allele frequency for the alternate alleles in the {} sample {}".format(s, n),
+                         'Type': 'Float', 'Number': 'A'},
+                        {'ID': "{}_dp_{}".format(s, i),
+                         'Description': "Depth of coverage in the {} sample {} (independent of alleles)".format(s, n),
+                         'Type': 'Float', 'Number': '1'},
+                        {'ID': "{}_ac_{}".format(s, i),
+                         'Description': "Allele count for the alternate alleles in the {} sample {}".format(s, n),
+                         'Type': 'Integer', 'Number': 'A'}
+                    ]
         return headers
 
     def _write_batch(self, batch):
         for v in batch:
             self.vcf_writer.write_record(v)
 
-    def _add_snv_stats(self, bams: List[AlignmentFile], variant: Variant, prefix: str):
+    def _add_stats(self, variant: Variant):
 
         global_dp = 0
         global_ac = {}
         vafator_variant = build_variant(variant)
-        for i, b in enumerate(bams):
-            pileups = get_variant_pileup(variant=vafator_variant, bam=b,
-                                         min_base_quality=self.base_call_quality_threshold,
-                                         min_mapping_quality=self.mapping_quality_threshold)
-            ac, dp = get_snv_metrics(variant=vafator_variant, pileups=pileups)
-            if len(bams) > 1:
-                variant.INFO["{}_af_{}".format(prefix, i + 1)] = ",".join(
-                    [str(self._calculate_af(ac[alt], dp)) for alt in variant.ALT])
-                variant.INFO["{}_ac_{}".format(prefix, i + 1)] = ",".join([str(ac[alt]) for alt in variant.ALT])
-                variant.INFO["{}_dp_{}".format(prefix, i + 1)] = dp
-            for alt in variant.ALT:
-                global_ac[alt] = global_ac.get(alt, 0) + ac[alt]
-            global_dp += dp
-
-        variant.INFO["{}_af".format(prefix)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT])
-        variant.INFO["{}_ac".format(prefix)] = ",".join([str(global_ac[alt]) for alt in variant.ALT])
-        variant.INFO["{}_dp".format(prefix)] = global_dp
-
-    def _add_insertion_stats(self, bams: List[AlignmentFile], variant: Variant, prefix: str):
-
-        global_dp = 0
-        global_ac = 0
-        vafator_variant = build_variant(variant)
-        for i, b in enumerate(bams):
-            pileups = get_variant_pileup(variant=vafator_variant, bam=b,
-                                         min_base_quality=self.base_call_quality_threshold,
-                                         min_mapping_quality=self.mapping_quality_threshold)
-            ac, dp = get_insertion_metrics(variant=vafator_variant, pileups=pileups)
-            af = self._calculate_af(ac, dp)
-            if len(bams) > 1:
-                variant.INFO["{}_af_{}".format(prefix, i+1)] = af
-                variant.INFO["{}_ac_{}".format(prefix, i+1)] = ac
-                variant.INFO["{}_dp_{}".format(prefix, i+1)] = dp
-            global_ac += ac
-            global_dp += dp
-
-        global_af = self._calculate_af(global_ac, global_dp)
-        variant.INFO["{}_af".format(prefix)] = global_af
-        variant.INFO["{}_ac".format(prefix)] = global_ac
-        variant.INFO["{}_dp".format(prefix)] = global_dp
-
-    def _add_deletion_stats(self, bams: List[AlignmentFile], variant: Variant, prefix: str):
-
-        global_dp = 0
-        global_ac = 0
-        vafator_variant = build_variant(variant)
-        for i, b in enumerate(bams):
-            pileups = get_variant_pileup(variant=vafator_variant, bam=b,
-                                         min_base_quality=self.base_call_quality_threshold,
-                                         min_mapping_quality=self.mapping_quality_threshold)
-            ac, dp = get_deletion_metrics(variant=vafator_variant, pileups=pileups)
-            af = self._calculate_af(ac, dp)
-            if len(bams) > 1:
-                variant.INFO["{}_af_{}".format(prefix, i+1)] = af
-                variant.INFO["{}_ac_{}".format(prefix, i+1)] = ac
-                variant.INFO["{}_dp_{}".format(prefix, i+1)] = dp
-            global_ac += ac
-            global_dp += dp
-        global_af = self._calculate_af(global_ac, global_dp)
-        variant.INFO["{}_af".format(prefix)] = global_af
-        variant.INFO["{}_ac".format(prefix)] = global_ac
-        variant.INFO["{}_dp".format(prefix)] = global_dp
+        for sample, bams in self.bam_readers.items():
+            for i, bam in enumerate(bams):
+                pileups = get_variant_pileup(
+                    variant=vafator_variant, bam=bam,
+                    min_base_quality=self.base_call_quality_threshold,
+                    min_mapping_quality=self.mapping_quality_threshold)
+                coverage_metrics = get_metrics(variant=vafator_variant, pileups=pileups)
+                if coverage_metrics is not None:
+                    if len(bams) > 1:
+                        variant.INFO["{}_af_{}".format(sample, i + 1)] = ",".join(
+                            [str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) for alt in variant.ALT])
+                        variant.INFO["{}_ac_{}".format(sample, i + 1)] = ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT])
+                        variant.INFO["{}_dp_{}".format(sample, i + 1)] = coverage_metrics.dp
+                    for alt in variant.ALT:
+                        global_ac[alt] = global_ac.get(alt, 0) + coverage_metrics.ac[alt]
+                    global_dp += coverage_metrics.dp
+
+            variant.INFO["{}_af".format(sample)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT])
+            variant.INFO["{}_ac".format(sample)] = ",".join([str(global_ac[alt]) for alt in variant.ALT])
+            variant.INFO["{}_dp".format(sample)] = global_dp
 
     def _calculate_af(self, ac, dp):
         return float(ac) / dp if dp > 0 else 0.0
@@ -155,20 +112,8 @@ def run(self):
         variant: Variant
         for variant in self.vcf:
             # gets the counts of all bases across all BAMs
-            if self.tumor_bams:
-                if variant.is_snp:
-                    self._add_snv_stats(self.tumor_bams, variant, self.tumor_prefix)
-                elif variant.is_indel and not variant.is_deletion:
-                    self._add_insertion_stats(self.tumor_bams, variant, self.tumor_prefix)
-                elif variant.is_indel and variant.is_deletion:
-                    self._add_deletion_stats(self.tumor_bams, variant, self.tumor_prefix)
-            if self.normal_bams:
-                if variant.is_snp:
-                    self._add_snv_stats(self.normal_bams, variant, self.normal_prefix)
-                elif variant.is_indel and not variant.is_deletion:
-                    self._add_insertion_stats(self.normal_bams, variant, self.normal_prefix)
-                elif variant.is_indel and variant.is_deletion:
-                    self._add_deletion_stats(self.normal_bams, variant, self.normal_prefix)
+            self._add_stats(variant)
+
             batch.append(variant)
             if len(batch) >= 1000:
                 self._write_batch(batch)
@@ -178,5 +123,6 @@ def run(self):
 
         self.vcf_writer.close()
         self.vcf.close()
-        for b in self.tumor_bams + self.normal_bams:
-            b.close()
+        for _, bams in self.bam_readers.items():
+            for bam in bams:
+                bam.close()