diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml old mode 100644 new mode 100755 index 35f503f..bd8f232 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -7,27 +7,28 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9, 3.10 ] + python-version: [ '3.7', '3.8', '3.9', '3.10' ] steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: conda-incubator/setup-miniconda@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 with: - auto-update-conda: true - channels: defaults,conda-forge,bioconda + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | sudo apt-get update - sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev bedtools + sudo apt-get --assume-yes install build-essential libcurl4-openssl-dev libz-dev liblzma-dev python -m pip install --upgrade pip pip install setuptools wheel - # this is needed by cyvcf2 and pysam - conda install --yes htslib=1.14 + # this is needed by pybedtools + sudo apt-get --assume-yes install bedtools + # install python requirements + pip install -r requirements.txt - name: Install vafator run: | python setup.py bdist_wheel - pip install dist/* + pip install dist/vafator-*.whl - name: Run integration tests run: | make clean integration_tests \ No newline at end of file diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 855d23c..473fa02 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -7,16 +7,22 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: [ 3.7, 3.8, 3.9, 3.10 ] + python-version: [ '3.7', '3.8', '3.9', '3.10' ] steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel + pip install setuptools wheel coverage pip install virtualenv tox==3.26.0 tox-wheel==1.0.0 tox-gh-actions - name: Build, install and run unit tests run: | - tox \ No newline at end of file + pip install -r requirements.txt + coverage run -m unittest discover vafator.tests + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v3 \ No newline at end of file diff --git a/README.md b/README.md index 9cde21b..63efa1d 100755 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![PyPI version](https://badge.fury.io/py/vafator.svg)](https://badge.fury.io/py/vafator) [![Anaconda-Server Badge](https://anaconda.org/bioconda/vafator/badges/version.svg)](https://anaconda.org/bioconda/vafator) [![Run unit tests](https://github.com/TRON-Bioinformatics/vafator/actions/workflows/unit_tests.yml/badge.svg?branch=master)](https://github.com/TRON-Bioinformatics/vafator/actions/workflows/unit_tests.yml) +[![codecov](https://codecov.io/gh/TRON-Bioinformatics/vafator/branch/master/graph/badge.svg?token=QLK84NI44G)](https://codecov.io/gh/TRON-Bioinformatics/vafator) [![License](https://img.shields.io/badge/license-MIT-green)](https://opensource.org/licenses/MIT) @@ -20,25 +21,28 @@ somatic variant calling, tumor evolution with multiple samples at different time |-----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------|-----------------| | Allele frequency (AF) | Ratio of reads supporting the alternate allele | {sample}_af | float | A | | Allele count (AC) | Count of reads supporting the alternate allele | {sample}_ac | int | A | +| Count ambiguous bases (N) | Count of ambiguous bases (N + all IUPAC ambiguity codes) overlapping the variant | {sample}_n | int | 1 | | Depth of coverage (DP) | Count of reads covering the position of the variant | {sample}_dp | int | 1 | | Expected allele frequency | Expected allele frequency assuming a multiplicity of the mutation m=1 (the number of DNA copies bearing a mutation) considering the given purity and ploidy/copy numbers | {sample}_eaf | float | 1 | | Probability undetected | Probability that a given somatic mutation is undetected given the total coverage, supporting reads and expected allele frequency | {sample}_pu | float | A | | Power | Power to detect a somatic mutation given the total coverage and expected allele frequency (Carter, 2012) | {sample}_pw | float | 1 | | k | Minimum number of supporting reads such that the probability of observing k or more non-reference reads due to sequencing error is less than the defined false positive rate (FPR) (Carter, 2012) | {sample}_k | float | 1 | | Mapping quality median | Median mapping quality of reads supporting each of the reference and the alternate alleles | {sample}_mq | float | R | -| Mapping quality rank sum test | Rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rsmq | float | A | -| Mapping quality rank sum test p-value | Significance of the rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsmq_pv | float | A | +| Mapping quality rank sum test (§§) | Rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions. | {sample}_rsmq | float | A | +| Mapping quality rank sum test p-value (§§) | Significance of the rank sum test comparing the MQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsmq_pv | float | A | | Base call quality median | Median base call quality of reads supporting each of the reference and the alternate alleles (not available for deletions) | {sample}_bq | float | R | -| Base call quality rank sum test | Rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rsbq | float | A | -| Base call quality rank sum test p-value | Significance of the rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsbq_pv | float | A | +| Base call quality rank sum test (§§) | Rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rsbq | float | A | +| Base call quality rank sum test p-value (§§) | Significance of the rank sum test comparing the BQ distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rsbq_pv | float | A | | Position median | Median position within reads supporting each of the reference and the alternate alleles (in indels this is the start position) | {sample}_pos | float | R | -| Position rank sum test | Rank sum test comparing the position distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rspos | float | A | -| Position rank sum test p-value | Significance of the rank sum test comparing the position distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rspos_pv | float | A | - +| Position rank sum test (§§) | Rank sum test comparing the position distribution between reads supporting the reference and the alternate. Values closer to zero indicate similar distributions | {sample}_rspos | float | A | +| Position rank sum test p-value (§§) | Significance of the rank sum test comparing the position distribution between reads supporting the reference and the alternate. The null hypothesis is that there is no difference in the distributions | {sample}_rspos_pv | float | A | § cardinality is defined as in the VCF format specification: `A` refers to one value per alternate allele, `R` refers to one value per possible allele (including the reference), `1` refers to one value. +§§ rank sum tests require at least one read supporting the reference and one read supporting the alternate + + VAFator uses cyvcf2 (Pederson, 2017) to read/write VCF files and pysam (https://github.com/pysam-developers/pysam) to read BAM files. Both libraries are cython wrappers around HTSlib (Bonfield, 2021). @@ -64,9 +68,26 @@ vafator --input-vcf /path/yo/your.vcf \ ``` This will add annotations for each of the three samples `normal`, `primary` and `metastasis`: `normal_ac`, -`normal_dp`, `normal_af`, `normal_pw`, `primary_ac`, `primary_dp`, etc. +`normal_dp`, `normal_af`, `normal_pw`, `primary_ac`, `primary_dp`, etc. + +### Support for indels + +VAFator provides equivalent annotations for indels. Depth of coverage and allele frequency are calculated on the +position immediately before the indel. Only insertions and deletions as recorded in the CIGAR matching the respective +coordinates and sequence from the VCF file are taken into account. Any read supporting a similar but not identical indel +is not counted. + +**NOTE**: multiallelic mutations are not supported for indels, the indel in the multiallelic position will be +annotated with null values. This problem can be circumvented by using the Nextflow normalization pipeline described above. + +### Support for MNVs + +Not supported at the moment when not decomposed. + +If running the nextflow pipeline indicated above, MNVs and complex variants are by default decomposed and hence +correctly annotated by VAFator. -## Support for technical replicates +### Support for technical replicates If more than one BAM for the same sample is provided then the annotations are calculated across all BAMs and for also each of them separately (eg: `primary_af` provides the allele frequency across all primary tumor BAMs, @@ -169,6 +190,17 @@ The statistic value will be close to zero for similar distributions and further The significance value corresponds to the null hypothesis of similar distributions. No multiple test correction is applied over this p-value. +### Ambiguous bases + +Some reads may contain ambiguous bases with high base call quality scores. +The count of all reads passing the quality thresholds that contain an +ambiguous base overlapping the mutation are annotated. +All IUPAC ambiguity codes are taken into account. + +Also, these reads supporting ambiguous bases are not taken into account in the depth of coverage (DP) +as they may dilute the VAF values. In order to include those into the depth of coverage use the flag +`--include-ambiguous-bases`. Only SNVs are supported. + ## Understanding the output The output is a VCF with the some new annotations in the INFO field for the provided sample names. @@ -208,23 +240,6 @@ nextflow run tron-bioinformatics/tronflow-vcf-postprocessing -r 2.2.0 -profile c See https://github.com/TRON-Bioinformatics/tronflow-vcf-postprocessing for more details -## Support for indels - -VAFator provides equivalent annotations for indels. Depth of coverage and allele frequency are calculated on the -position immediately before the indel. Only insertions and deletions as recorded in the CIGAR matching the respective -coordinates and sequence from the VCF file are taken into account. Any read supporting a similar but not identical indel -is not counted. - -**NOTE**: multiallelic mutations are not supported for indels, the indel in the multiallelic position will be -annotated with null values. This problem can be circumvented by using the Nextflow normalization pipeline described above. - -## Support for MNVs - -Not supported at the moment when not decomposed. - -If running the nextflow pipeline indicated above, MNVs and complex variants are by default decomposed and hence -correctly annotated by VAFator. - ## Bibliography - Pedersen, B. S., & Quinlan, A. R. (2017). cyvcf2: fast, flexible variant analysis with Python. Bioinformatics, 33(12), 1867–1869. https://doi.org/10.1093/BIOINFORMATICS/BTX057 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 index 89d2b56..aa18723 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ pysam~=0.19.1 cyvcf2~=0.30.14 logzero~=1.7.0 pybedtools~=0.9.0 -scipy~=1.8.1 \ No newline at end of file +scipy>=1.0.0,<2.0.0 \ No newline at end of file diff --git a/tox.ini b/tox.ini deleted file mode 100644 index b14bfb1..0000000 --- a/tox.ini +++ /dev/null @@ -1,16 +0,0 @@ -[tox] -envlist = {py37, py38, py39, py310} - -[gh-actions] -python = - 3.7: py37 - 3.8: py38 - 3.9: py39 - 3.10: py310 - -[testenv] -wheel = true -passenv = * -commands= - pip install -r requirements.txt - python -m unittest discover vafator.tests \ No newline at end of file diff --git a/vafator/__init__.py b/vafator/__init__.py index 51abfa5..d7b3f80 100755 --- a/vafator/__init__.py +++ b/vafator/__init__.py @@ -1 +1,4 @@ -VERSION='2.1.0' \ No newline at end of file +VERSION='2.2.0' + + +AMBIGUOUS_BASES = ['N', 'M', 'R', 'W', 'S', 'Y', 'K', 'V', 'H', 'D', 'B'] diff --git a/vafator/annotator.py b/vafator/annotator.py index 533caaf..00f8293 100755 --- a/vafator/annotator.py +++ b/vafator/annotator.py @@ -10,6 +10,7 @@ import asyncio import time +from vafator.ploidies import DEFAULT_PLOIDY from vafator.rank_sum_test import calculate_rank_sum_test, get_rank_sum_tests from vafator.power import PowerCalculator, DEFAULT_ERROR_RATE, DEFAULT_FPR from vafator.pileups import get_variant_pileup, get_metrics @@ -42,13 +43,15 @@ def __init__(self, input_vcf: str, output_vcf: str, tumor_ploidies: dict = {}, normal_ploidy=2, fpr=DEFAULT_FPR, - error_rate=DEFAULT_ERROR_RATE): + error_rate=DEFAULT_ERROR_RATE, + include_ambiguous_bases=False): self.mapping_quality_threshold = mapping_qual_thr self.base_call_quality_threshold = base_call_qual_thr self.purities = purities self.tumor_ploidies = tumor_ploidies self.normal_ploidy = normal_ploidy + self.include_ambiguous_bases = include_ambiguous_bases self.power = PowerCalculator( normal_ploidy=normal_ploidy, tumor_ploidies=tumor_ploidies, purities=purities, error_rate=error_rate, fpr=fpr) @@ -63,7 +66,10 @@ def __init__(self, input_vcf: str, output_vcf: str, self.vafator_header["base_call_quality_threshold"] = base_call_qual_thr self.vafator_header["purities"] = ";".join(["{}:{}".format(s, p) for s, p in purities.items()]) self.vafator_header["normal_ploidy"] = normal_ploidy - self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p) for s, p in tumor_ploidies.items()]) + self.vafator_header["tumor_ploidy"] = ";".join(["{}:{}".format(s, p.report_value) + for s, p in tumor_ploidies.items()]) \ + if tumor_ploidies else DEFAULT_PLOIDY + self.vafator_header["include_ambiguous_bases"] = self.include_ambiguous_bases self.vcf.add_to_header("##vafator_command_line={}".format(json.dumps(self.vafator_header))) # adds to the header all the names of the annotations for a in Annotator._get_headers(input_bams): @@ -94,6 +100,13 @@ def _get_headers(input_bams: dict): 'Type': 'Integer', 'Number': 'A' }) + headers.append({ + 'ID': "{}_n".format(s), + 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " + "in the {} sample/s".format(s), + 'Type': 'Integer', + 'Number': '1' + }) headers.append({ 'ID': "{}_pu".format(s), 'Description': "Probability of an undetected mutation given the observed supporting reads (AC), " @@ -208,6 +221,10 @@ def _get_headers(input_bams: dict): {'ID': "{}_ac_{}".format(s, i), 'Description': "Allele count for the alternate alleles in the {} sample {}".format(s, n), 'Type': 'Integer', 'Number': 'A'}, + {'ID': "{}_n_{}".format(s, i), + 'Description': "Allele count for ambiguous bases (any IUPAC ambiguity code is counted) " + "in the {} sample {}".format(s, n), + 'Type': 'Integer', 'Number': '1'}, {'ID': "{}_pu_{}".format(s, i), 'Description': "Probability of an undetected mutation given the observed supporting " "reads (AC), the observed total coverage (DP) and the provided tumor " @@ -286,12 +303,17 @@ def _add_stats(self, variant: Variant): variant=variant, bam=bam, min_base_quality=self.base_call_quality_threshold, min_mapping_quality=self.mapping_quality_threshold) - coverage_metrics = get_metrics(variant=variant, pileups=pileups) + coverage_metrics = get_metrics(variant=variant, pileups=pileups, + include_ambiguous_bases=self.include_ambiguous_bases) if coverage_metrics is not None: if len(bams) > 1: - variant.INFO["{}_af_{}".format(sample, i + 1)] = ",".join( - [str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) for alt in variant.ALT]) - variant.INFO["{}_ac_{}".format(sample, i + 1)] = ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT]) + variant.INFO["{}_af_{}".format(sample, i + 1)] = \ + ",".join([str(self._calculate_af(coverage_metrics.ac[alt], coverage_metrics.dp)) + for alt in variant.ALT]) + variant.INFO["{}_ac_{}".format(sample, i + 1)] = \ + ",".join([str(coverage_metrics.ac[alt]) for alt in variant.ALT]) + variant.INFO["{}_n_{}".format(sample, i + 1)] = \ + str(sum([coverage_metrics.ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) variant.INFO["{}_dp_{}".format(sample, i + 1)] = coverage_metrics.dp variant.INFO["{}_pu_{}".format(sample, i + 1)] = ",".join( [str(self.power.calculate_power( @@ -319,12 +341,12 @@ def _add_stats(self, variant: Variant): pvalues, stats = get_rank_sum_tests(coverage_metrics.all_bqs, variant) if stats: variant.INFO["{}_rsbq_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rsbq_pv_{}".format(sample, i + 1)] = ",".join(stats) + variant.INFO["{}_rsbq_pv_{}".format(sample, i + 1)] = ",".join(pvalues) pvalues, stats = get_rank_sum_tests(coverage_metrics.all_positions, variant) if stats: variant.INFO["{}_rspos_{}".format(sample, i + 1)] = ",".join(stats) - variant.INFO["{}_rspos_pv_{}".format(sample, i + 1)] = ",".join(stats) + variant.INFO["{}_rspos_pv_{}".format(sample, i + 1)] = ",".join(pvalues) global_ac.update(coverage_metrics.ac) global_bq.update(coverage_metrics.bqs) @@ -337,6 +359,7 @@ def _add_stats(self, variant: Variant): variant.INFO["{}_af".format(sample)] = ",".join([str(self._calculate_af(global_ac[alt], global_dp)) for alt in variant.ALT]) variant.INFO["{}_ac".format(sample)] = ",".join([str(global_ac[alt]) for alt in variant.ALT]) + variant.INFO["{}_n".format(sample)] = str(sum([global_ac.get(n, 0) for n in vafator.AMBIGUOUS_BASES])) variant.INFO["{}_dp".format(sample)] = global_dp variant.INFO["{}_eaf".format(sample)] = str(self.power.calculate_expected_vaf( sample=sample, variant=variant)) @@ -354,6 +377,8 @@ def _add_stats(self, variant: Variant): variant.INFO["{}_pos".format(sample)] = ",".join( [str(global_pos[variant.REF])] + [str(global_pos[alt]) for alt in variant.ALT]) + # for these rank sum tests it is required at least one value for the alternate and one value for the + # reference otherwise it cannot be calculated pvalues, stats = get_rank_sum_tests(global_all_mqs, variant) if stats: variant.INFO["{}_rsmq".format(sample)] = ",".join(stats) diff --git a/vafator/command_line.py b/vafator/command_line.py index edcdec3..06cdd85 100755 --- a/vafator/command_line.py +++ b/vafator/command_line.py @@ -48,6 +48,8 @@ def annotator(): help="False Positive Rate (FPR) to use in the power calculation") parser.add_argument("--error-rate", dest="error_rate", required=False, default=DEFAULT_ERROR_RATE, type=float, help="Error rate to use in the power calculation") + parser.add_argument("--include-ambiguous-bases", dest="include_ambiguous_bases", action='store_true', + help="Flag indicating to include ambiguous bases from the DP calculation") args = parser.parse_args() @@ -96,7 +98,8 @@ def annotator(): tumor_ploidies=tumor_ploidies, normal_ploidy=int(args.normal_ploidy), fpr=args.fpr, - error_rate=args.error_rate + error_rate=args.error_rate, + include_ambiguous_bases=args.include_ambiguous_bases ) annotator.run() except Exception as e: diff --git a/vafator/pileups.py b/vafator/pileups.py old mode 100644 new mode 100755 index 5eac256..d285bf4 --- a/vafator/pileups.py +++ b/vafator/pileups.py @@ -3,6 +3,8 @@ from typing import Union from cyvcf2 import Variant from pysam.libcalignmentfile import IteratorColumnRegion, AlignmentFile + +from vafator import AMBIGUOUS_BASES from vafator.tests.utils import VafatorVariant import numpy as np @@ -33,19 +35,27 @@ def get_variant_pileup( @dataclass class CoverageMetrics: + # number supporting reads of each base, including the reference ac: dict + # total depth of coverage dp: int + # median base call quality of each base, including the reference bqs: dict = None + # median mapping quality of each alternate base, including the reference mqs: dict = None + # median position within the read of each alternate base, including the reference positions: dict = None + # base call quality distribution of each base, including the reference all_bqs: dict = None + # mapping quality distribution of each base, including the reference all_mqs: dict = None + # position within the read distribution of each base, including the reference all_positions: dict = None -def get_metrics(variant: Variant, pileups: IteratorColumnRegion) -> CoverageMetrics: +def get_metrics(variant: Variant, pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: if is_snp(variant): - return get_snv_metrics(pileups) + return get_snv_metrics(pileups, include_ambiguous_bases) elif is_insertion(variant): return get_insertion_metrics(variant, pileups) elif is_deletion(variant): @@ -158,7 +168,7 @@ def get_deletion_metrics(variant: Variant, pileups: IteratorColumnRegion) -> Cov ) -def get_snv_metrics(pileups: IteratorColumnRegion) -> CoverageMetrics: +def get_snv_metrics(pileups: IteratorColumnRegion, include_ambiguous_bases=False) -> CoverageMetrics: try: pileup = next(pileups) bases = [s.upper() for s in pileup.get_query_sequences()] @@ -170,7 +180,10 @@ def get_snv_metrics(pileups: IteratorColumnRegion) -> CoverageMetrics: all_mqs = aggregate_list_per_base(bases, pileup.get_mapping_qualities()) all_positions = aggregate_list_per_base(bases, pileup.get_query_positions()) - dp = len(bases) + if include_ambiguous_bases: + dp = len(bases) + else: + dp = len([b for b in bases if b not in AMBIGUOUS_BASES]) ac = Counter(bases) except StopIteration: # no reads diff --git a/vafator/ploidies.py b/vafator/ploidies.py old mode 100644 new mode 100755 index b114d85..b9f9099 --- a/vafator/ploidies.py +++ b/vafator/ploidies.py @@ -15,6 +15,7 @@ def __init__(self, local_copy_numbers: str = None, genome_wide_ploidy: float = D if local_copy_numbers is not None and not os.path.exists(local_copy_numbers): raise ValueError('The provided tumor ploidy is neither a copy number value or a BED file with copy ' 'numbers') + self.report_value = local_copy_numbers if local_copy_numbers else genome_wide_ploidy self.bed = pd.read_csv(local_copy_numbers, sep='\t', names=['chromosome', 'start', 'end', 'copy_number']) \ if local_copy_numbers is not None else None self.ploidy = genome_wide_ploidy diff --git a/vafator/tests/test_annotator.py b/vafator/tests/test_annotator.py index 5a3daf6..7bcb716 100755 --- a/vafator/tests/test_annotator.py +++ b/vafator/tests/test_annotator.py @@ -1,4 +1,6 @@ import os + +import numpy as np import pkg_resources from unittest import TestCase from cyvcf2 import VCF @@ -10,6 +12,18 @@ from logzero import logger +EXPECTED_ANNOTATIONS = [ + 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'eaf', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', + 'rspos_pv' +] + +# replicates do not have EAF annotation +EXPECTED_ANNOTATIONS_REPLICATES = [ + 'af', 'dp', 'ac', 'n', 'pu', 'pw', 'k', 'bq', 'mq', 'pos', 'rsmq', 'rsmq_pv', 'rsbq', 'rsbq_pv', 'rspos', + 'rspos_pv' +] + + class TestAnnotator(TestCase): def test_annotator(self): @@ -27,12 +41,9 @@ def test_annotator(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("tumor_af" in info_annotations) - self.assertTrue("normal_af" in info_annotations) - self.assertTrue("tumor_ac" in info_annotations) - self.assertTrue("normal_ac" in info_annotations) - self.assertTrue("tumor_dp" in info_annotations) - self.assertTrue("normal_dp" in info_annotations) + for a in EXPECTED_ANNOTATIONS: + self.assertTrue("tumor_{}".format(a) in info_annotations) + self.assertTrue("normal_{}".format(a) in info_annotations) def test_annotator_with_multiple_bams(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") @@ -49,18 +60,11 @@ def test_annotator_with_multiple_bams(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("tumor_af_1" in info_annotations) - self.assertTrue("normal_af_1" in info_annotations) - self.assertTrue("tumor_ac_1" in info_annotations) - self.assertTrue("normal_ac_1" in info_annotations) - self.assertTrue("tumor_dp_1" in info_annotations) - self.assertTrue("normal_dp_1" in info_annotations) - self.assertTrue("tumor_af_2" in info_annotations) - self.assertTrue("normal_af_2" in info_annotations) - self.assertTrue("tumor_ac_2" in info_annotations) - self.assertTrue("normal_ac_2" in info_annotations) - self.assertTrue("tumor_dp_2" in info_annotations) - self.assertTrue("normal_dp_2" in info_annotations) + for a in EXPECTED_ANNOTATIONS_REPLICATES: + self.assertTrue("tumor_{}_1".format(a) in info_annotations, + "Missing annotation tumor_{}_1".format(a)) + self.assertTrue("normal_{}_1".format(a) in info_annotations, + "Missing annotation normal_{}_1".format(a)) def test_annotator_with_prefix(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") @@ -78,18 +82,11 @@ def test_annotator_with_prefix(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("RNA_tumor_af_1" in info_annotations) - self.assertTrue("RNA_normal_af_1" in info_annotations) - self.assertTrue("RNA_tumor_ac_1" in info_annotations) - self.assertTrue("RNA_normal_ac_1" in info_annotations) - self.assertTrue("RNA_tumor_dp_1" in info_annotations) - self.assertTrue("RNA_normal_dp_1" in info_annotations) - self.assertTrue("RNA_tumor_af_2" in info_annotations) - self.assertTrue("RNA_normal_af_2" in info_annotations) - self.assertTrue("RNA_tumor_ac_2" in info_annotations) - self.assertTrue("RNA_normal_ac_2" in info_annotations) - self.assertTrue("RNA_tumor_dp_2" in info_annotations) - self.assertTrue("RNA_normal_dp_2" in info_annotations) + for a in EXPECTED_ANNOTATIONS_REPLICATES: + self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a)) + self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a)) def test_annotator_with_mnvs(self): input_file = pkg_resources.resource_filename(__name__, "resources/test_tumor_normal.vcf") @@ -107,18 +104,11 @@ def test_annotator_with_mnvs(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("RNA_tumor_af_1" in info_annotations) - self.assertTrue("RNA_normal_af_1" in info_annotations) - self.assertTrue("RNA_tumor_ac_1" in info_annotations) - self.assertTrue("RNA_normal_ac_1" in info_annotations) - self.assertTrue("RNA_tumor_dp_1" in info_annotations) - self.assertTrue("RNA_normal_dp_1" in info_annotations) - self.assertTrue("RNA_tumor_af_2" in info_annotations) - self.assertTrue("RNA_normal_af_2" in info_annotations) - self.assertTrue("RNA_tumor_ac_2" in info_annotations) - self.assertTrue("RNA_normal_ac_2" in info_annotations) - self.assertTrue("RNA_tumor_dp_2" in info_annotations) - self.assertTrue("RNA_normal_dp_2" in info_annotations) + for a in EXPECTED_ANNOTATIONS_REPLICATES: + self.assertTrue("RNA_tumor_{}_1".format(a) in info_annotations, + "Missing annotation RNA_tumor_{}_1".format(a)) + self.assertTrue("RNA_normal_{}_1".format(a) in info_annotations, + "Missing annotation RNA_normal_{}_1".format(a)) def _get_info_at(self, input_file, chromosome, position, annotation): vcf = VCF(input_file) @@ -144,7 +134,8 @@ def test_nist(self): duration = time.time() - start logger.info("Duration {} seconds".format(round(duration, 3))) - self.assertTrue(os.path.exists(output_vcf)) + self._assert_vafator_vcf(output_vcf, sample_name='normal') + n_variants_input = test_utils._get_count_variants(input_file) n_variants_output = test_utils._get_count_variants(output_vcf) self.assertTrue(n_variants_input == n_variants_output) @@ -218,6 +209,23 @@ def test_nist(self): self.assertEqual(variant.INFO['normal_pos'][0], 95.0) self.assertEqual(variant.INFO['normal_pos'][1], 56.0) + def test_nist_with_replicates(self): + input_file = pkg_resources.resource_filename( + __name__, "resources/project.NIST.hc.snps.indels.chr1_1000000_2000000.vcf") + output_vcf = pkg_resources.resource_filename( + __name__, "resources/results/project.NIST.hc.snps.indels.chr1_1000000_2000000.vaf_replicates.vcf") + bam_file = pkg_resources.resource_filename( + __name__, + "resources/project.NIST_NIST7035_H7AP8ADXX_TAAGGCGA_1_NA12878.bwa.markDuplicates.chr1_1000000_2000000.bam") + start = time.time() + annotator = Annotator(input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam_file]}) + annotator.run() + duration = time.time() - start + logger.info("Duration {} seconds".format(round(duration, 3))) + + self._assert_vafator_vcf(output_vcf, sample_name='normal') + self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=1) + self._assert_vafator_vcf(output_vcf, sample_name='normal', replicate=2) def test_annotator_bams_order(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") @@ -232,12 +240,24 @@ def test_annotator_bams_order(self): self.assertTrue(os.path.exists(output_vcf)) self.assertTrue(os.path.exists(output_vcf_2)) + self._assert_vafator_vcf(output_vcf, sample_name='normal') + self._assert_vafator_vcf(output_vcf, sample_name='tumor') + self._assert_vafator_vcf(output_vcf_2, sample_name='normal') + self._assert_vafator_vcf(output_vcf_2, sample_name='tumor') + vcf = VCF(output_vcf) vcf_2 = VCF(output_vcf_2) for v, v2 in zip(vcf, vcf_2): - self.assertEqual(v.INFO["normal_dp"], v2.INFO["normal_dp"]) - self.assertEqual(v.INFO["tumor_dp"], v2.INFO["tumor_dp"]) + for a in EXPECTED_ANNOTATIONS: + self.assertEqual( + v.INFO.get("normal_{}".format(a), ""), + v2.INFO.get("normal_{}".format(a), ""), + "Variant {}:{}:{}>{} is missing annotation normal_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) + self.assertEqual( + v.INFO.get("tumor_{}".format(a), ""), + v2.INFO.get("tumor_{}".format(a), ""), + "Variant {}:{}:{}>{} is missing annotation tumor_{}".format(v.CHROM, v.POS, v.REF, v.ALT[0], a)) def test_annotator_with_purities(self): input_file = pkg_resources.resource_filename(__name__, "resources/test1.vcf") @@ -256,25 +276,52 @@ def test_annotator_with_purities(self): self.assertTrue(n_variants_input == n_variants_output) info_annotations = test_utils._get_info_fields(output_vcf) - self.assertTrue("tumor_af" in info_annotations) - self.assertTrue("normal_af" in info_annotations) - self.assertTrue("tumor_ac" in info_annotations) - self.assertTrue("normal_ac" in info_annotations) - self.assertTrue("tumor_dp" in info_annotations) - self.assertTrue("normal_dp" in info_annotations) - self.assertTrue("tumor_pu" in info_annotations) - self.assertTrue("normal_pu" in info_annotations) - self.assertTrue("normal_eaf" in info_annotations) - self.assertTrue("tumor_eaf" in info_annotations) - self.assertTrue("tumor_pw" in info_annotations) - self.assertTrue("normal_pw" in info_annotations) - self.assertTrue("tumor_bq" in info_annotations) - self.assertTrue("normal_bq" in info_annotations) - self.assertTrue("tumor_mq" in info_annotations) - self.assertTrue("normal_mq" in info_annotations) + for a in EXPECTED_ANNOTATIONS: + self.assertTrue("tumor_{}".format(a) in info_annotations) + self.assertTrue("normal_{}".format(a) in info_annotations) annotator = Annotator( input_vcf=input_file, output_vcf=output_vcf, input_bams={"normal": [bam1], "tumor": [bam2]}, purities={"tumor": 0.2}, tumor_ploidies={"tumor": PloidyManager(genome_wide_ploidy=1.5)} ) annotator.run() + + def _assert_vafator_vcf(self, vcf_filename, sample_name, replicate=None): + self.assertTrue(os.path.exists(vcf_filename)) + vcf = VCF(vcf_filename) + for v in vcf: + # p-values or VAFs + self._assert_probability(v.INFO.get(self._get_annotation_name('rsbq_pv', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('rsmq_pv', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('rspos_pv', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('eaf', sample_name, replicate=replicate), 0)) + self._assert_probability(v.INFO.get(self._get_annotation_name('af', sample_name, replicate=replicate), 0)) + + # positive integer annotations + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('ac', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('dp', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('mq', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('bq', sample_name, replicate=replicate), 0)) + self._assert_positive_integer(v.INFO.get(self._get_annotation_name('pos', sample_name, replicate=replicate), 0)) + vcf.close() + + @staticmethod + def _get_annotation_name(annotation_name, sample_name, replicate=None): + annotation_name = "{}_{}".format(sample_name, annotation_name) + if replicate: + annotation_name = "{}_{}".format(annotation_name, replicate) + return annotation_name + + def _assert_probability(self, annotation): + if isinstance(annotation, list) or isinstance(annotation, tuple): + for a in annotation: + self.assertTrue(0.0 <= float(a) <= 1.0, "Expected probability has a value of {}".format(a)) + else: + self.assertTrue(0.0 <= float(annotation) <= 1.0, "Expected probability has a value of {}".format(annotation)) + + def _assert_positive_integer(self, annotation): + if isinstance(annotation, list) or isinstance(annotation, tuple): + for a in annotation: + self.assertTrue(np.isnan(a) or 0.0 <= a, "Expected positive integer has a value of {}".format(a)) + else: + self.assertTrue(np.isnan(annotation) or 0.0 <= annotation, "Expected positive integer has a value of {}".format(annotation))