Skip to content

Commit

Permalink
Merge pull request #57 from cmclean/r0.5
Browse files Browse the repository at this point in the history
DV 0.5.2 bugfix release: fix gVCF reference base issue
  • Loading branch information
pichuan authored Mar 8, 2018
2 parents 38c17ed + 17fa9c4 commit b3a5bce
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 60 deletions.
1 change: 1 addition & 0 deletions deepvariant/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ py_library(
"//deepvariant/core:genomics_math",
"//deepvariant/core:io_utils",
"//deepvariant/core:proto_utils",
"//deepvariant/core:ranges",
"//deepvariant/core:variantutils",
"//deepvariant/core/genomics:struct_py_pb2",
"//deepvariant/core/genomics:variants_py_pb2",
Expand Down
12 changes: 8 additions & 4 deletions deepvariant/core/genomics_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,15 @@
SAM_EXTENSIONS = frozenset(['.sam', '.bam'])


def make_ref_reader(reference_filename):
def make_ref_reader(reference_filename, cache_size=None):
"""Creates an indexed GenomeReference for reference_filename."""
return reference_fai.GenomeReferenceFai.from_file(
reference_filename.encode('utf8'),
reference_filename.encode('utf8') + '.fai')
fasta_path = reference_filename.encode('utf8')
fai_path = fasta_path + '.fai'
if cache_size is None:
return reference_fai.GenomeReferenceFai.from_file(fasta_path, fai_path)
else:
return reference_fai.GenomeReferenceFai.from_file(fasta_path, fai_path,
cache_size)


def make_sam_reader(reads_source,
Expand Down
8 changes: 7 additions & 1 deletion deepvariant/core/genomics_io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,17 @@
class RefReaderTests(parameterized.TestCase):

@parameterized.parameters('test.fasta', 'test.fasta.gz')
def test_make_ref_reader(self, fasta_filename):
def test_make_ref_reader_default(self, fasta_filename):
fasta_path = test_utils.genomics_core_testdata(fasta_filename)
with genomics_io.make_ref_reader(fasta_path) as reader:
self.assertEqual(reader.bases(ranges.make_range('chrM', 1, 6)), 'ATCAC')

@parameterized.parameters('test.fasta', 'test.fasta.gz')
def test_make_ref_reader_cache_specified(self, fasta_filename):
fasta_path = test_utils.genomics_core_testdata(fasta_filename)
with genomics_io.make_ref_reader(fasta_path, cache_size=10) as reader:
self.assertEqual(reader.bases(ranges.make_range('chrM', 1, 5)), 'ATCA')


class SamReaderTests(parameterized.TestCase):
"""Test the iteration functionality provided by io.SamReader."""
Expand Down
5 changes: 4 additions & 1 deletion deepvariant/core/python/reference_fai.clif
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ from "deepvariant/core/reference_fai.h":
contigs: list<ContigInfo> = property(`Contigs`)

@classmethod
def `FromFile` as from_file(cls, fasta_path: str, fai_path: str)
def `FromFile` as from_file(cls,
fasta_path: str,
fai_path: str,
cache_size_bases: int = default)
-> StatusOr<GenomeReferenceFai>

@__enter__
Expand Down
47 changes: 36 additions & 11 deletions deepvariant/postprocess_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from __future__ import print_function

import collections
import copy
import itertools
import tempfile

Expand All @@ -52,6 +53,7 @@
from deepvariant.core import genomics_math
from deepvariant.core import io_utils
from deepvariant.core import proto_utils
from deepvariant.core import ranges
from deepvariant.core import variantutils
from deepvariant.core.protos import core_pb2
from deepvariant.protos import deepvariant_pb2
Expand Down Expand Up @@ -116,6 +118,9 @@
# The genotype likelihood of the gVCF alternate allele for variant calls.
_GVCF_ALT_ALLELE_GL = -99

# FASTA cache size. Span 300 Mb so that we query each chromosome at most once.
_FASTA_CACHE_SIZE = 300000000


def _extract_single_sample_name(record):
"""Returns the name of the single sample within the CallVariantsOutput file.
Expand Down Expand Up @@ -713,12 +718,29 @@ def lessthanfn(variant1, variant2):
return lessthanfn


def _create_record_from_template(template, start, end):
"""Returns a copy of the template variant with the new start and end."""
retval = variants_pb2.Variant()
retval.CopyFrom(template)
def _create_record_from_template(template, start, end, fasta_reader):
"""Returns a copy of the template variant with the new start and end.
Updates to the start position cause a different reference base to be set.
Args:
template: third_party.nucleus.protos.Variant. The template variant whose
non-location and reference base information to use.
start: int. The desired new start location.
end: int. The desired new end location.
fasta_reader: GenomeReferenceFai object. The reader used to determine the
correct start base to use for the updated variant.
Returns:
An updated third_party.nucleus.protos.Variant with the proper start, end,
and reference base set and all other fields inherited from the template.
"""
retval = copy.deepcopy(template)
retval.start = start
retval.end = end
if start != template.start:
retval.reference_bases = fasta_reader.bases(
ranges.make_range(retval.reference_name, start, start + 1))
return retval


Expand Down Expand Up @@ -751,7 +773,7 @@ def _transform_to_gvcf_record(variant):


def merge_variants_and_nonvariants(variant_iterable, nonvariant_iterable,
lessthan):
lessthan, fasta_reader):
"""Yields records consisting of the merging of variant and non-variant sites.
The merging strategy used for single-sample records is to emit variants
Expand All @@ -768,6 +790,8 @@ def merge_variants_and_nonvariants(variant_iterable, nonvariant_iterable,
lessthan: Callable. A function that takes two Variant protos as input and
returns True iff the first argument is located "before" the second and
the variants do not overlap.
fasta_reader: GenomeReferenceFai object. The reference genome reader used to
ensure gVCF records have the correct reference base.
Yields:
Variant protos representing both variant and non-variant sites in the sorted
Expand Down Expand Up @@ -799,12 +823,12 @@ def next_or_none(iterable):
if nonvariant.start < variant.start:
# Emit a non-variant region up to the start of the variant.
yield _create_record_from_template(nonvariant, nonvariant.start,
variant.start)
variant.start, fasta_reader)
if nonvariant.end > variant.end:
# There is an overhang of the non-variant site after the variant is
# finished, so update the non-variant to point to that.
nonvariant = _create_record_from_template(nonvariant, variant.end,
nonvariant.end)
nonvariant.end, fasta_reader)
else:
# This non-variant site is subsumed by a Variant. Ignore it.
nonvariant = next_or_none(nonvariant_iterable)
Expand All @@ -828,8 +852,9 @@ def main(argv=()):

logging_level.set_from_flag()

with genomics_io.make_ref_reader(FLAGS.ref) as reader:
contigs = reader.contigs
fasta_reader = genomics_io.make_ref_reader(FLAGS.ref,
cache_size=_FASTA_CACHE_SIZE)
contigs = fasta_reader.contigs
paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile)
with tempfile.NamedTemporaryFile() as temp:
postprocess_variants_lib.process_single_sites_tfrecords(
Expand Down Expand Up @@ -865,12 +890,12 @@ def main(argv=()):
with genomics_io.make_vcf_reader(
FLAGS.outfile, use_index=False,
include_likelihoods=True) as variant_reader:
lessthanfn = _get_contig_based_lessthan(variant_reader.contigs)
lessthanfn = _get_contig_based_lessthan(contigs)
gvcf_variants = (
_transform_to_gvcf_record(variant)
for variant in variant_reader.iterate())
merged_variants = merge_variants_and_nonvariants(
gvcf_variants, nonvariant_generator, lessthanfn)
gvcf_variants, nonvariant_generator, lessthanfn, fasta_reader)
write_variants_to_vcf(
contigs=contigs,
variant_generator=merged_variants,
Expand Down
96 changes: 58 additions & 38 deletions deepvariant/postprocess_variants_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@
]


class DummyReferenceReader(object):

def __init__(self):
self._bases = {
'1': 'AACCGGTTACGTTCGATTTTAAAACCCCGGGG',
'2': 'GCAGTGACGTAGCGATGACGTAGACGCTTACG'
}

def bases(self, region):
chrom = region.reference_name
if chrom not in self._bases or region.end > len(self._bases[chrom]):
raise ValueError('Invalid region for dummy reader: {}'.format(region))
return self._bases[chrom][region.start:region.end]


def setUpModule():
test_utils.init()

Expand Down Expand Up @@ -145,13 +160,14 @@ def _simple_variant(ref_name, start, ref_base):
alleles=[ref_base, 'A' if ref_base != 'A' else 'C'])


def _create_nonvariant(ref_name, start, end):
def _create_nonvariant(ref_name, start, end, ref_base):
"""Creates a non-variant Variant record for testing.
Args:
ref_name: str. Reference name for this variant.
start: int. start position on the contig [0-based, half open).
end: int. end position on the contig [0-based, half open).
ref_base: str. reference base at the start position.
Returns:
A non-variant Variant record created with the specified arguments.
Expand All @@ -160,7 +176,7 @@ def _create_nonvariant(ref_name, start, end):
chrom=ref_name,
start=start,
end=end,
alleles=['A', variantutils.GVCF_ALT_ALLELE])
alleles=[ref_base, variantutils.GVCF_ALT_ALLELE])


def make_golden_dataset(compressed_inputs=False):
Expand Down Expand Up @@ -934,21 +950,23 @@ class MergeVcfAndGvcfTest(parameterized.TestCase):
(('1', 6, 9), ('1', 3, 4), False),
)
def test_lessthan_comparison(self, v1, v2, expected):
variant1 = _create_nonvariant(*v1)
variant2 = _create_nonvariant(*v2)
variant1 = _create_nonvariant(*v1, ref_base='A')
variant2 = _create_nonvariant(*v2, ref_base='C')
lessthan = postprocess_variants._get_contig_based_lessthan(_CONTIGS)
self.assertEqual(lessthan(variant1, variant2), expected)
self.assertEqual(lessthan(variant1, None), True)
self.assertEqual(lessthan(None, variant1), False)

@parameterized.parameters(
(_create_nonvariant('1', 10, 15), 10, 12, _create_nonvariant('1', 10,
12)),
(_create_nonvariant('2', 1, 9), 3, 4, _create_nonvariant('2', 3, 4)),
(_create_nonvariant('1', 10, 15, 'G'), 10, 12,
_create_nonvariant('1', 10, 12, 'G')),
(_create_nonvariant('2', 1, 9, 'C'), 3, 4,
_create_nonvariant('2', 3, 4, 'G')),
)
def test_create_record_from_template(self, template, start, end, expected):
reader = DummyReferenceReader()
actual = postprocess_variants._create_record_from_template(
template, start, end)
template, start, end, reader)
self.assertEqual(actual, expected)

@parameterized.parameters(
Expand Down Expand Up @@ -1035,52 +1053,54 @@ def test_transform_to_gvcf_no_allele_addition(self, alts, gls, vaf):
([], [], []),
# Non-overlapping records.
([('1', 1, 'A')], [], [_simple_variant('1', 1, 'A')]),
([('1', 3, 'A'), ('1', 7, 'C'),
('2', 6, 'G')], [('2', 3, 6), ('2', 7, 9)], [
_simple_variant('1', 3, 'A'),
_simple_variant('1', 7, 'C'),
_create_nonvariant('2', 3, 6),
_simple_variant('2', 6, 'G'),
_create_nonvariant('2', 7, 9)
([('1', 3, 'C'), ('1', 7, 'T'),
('2', 6, 'A')], [('2', 3, 6, 'G'), ('2', 7, 9, 'C')], [
_simple_variant('1', 3, 'C'),
_simple_variant('1', 7, 'T'),
_create_nonvariant('2', 3, 6, 'G'),
_simple_variant('2', 6, 'A'),
_create_nonvariant('2', 7, 9, 'C')
]),
# Non-variant record overlaps a variant from the left.
([('1', 5, 'CACGTG')], [('1', 2, 8)],
[_create_nonvariant('1', 2, 5),
_simple_variant('1', 5, 'CACGTG')]),
([('1', 5, 'GTTACG')], [('1', 2, 8, 'C')],
[_create_nonvariant('1', 2, 5, 'C'),
_simple_variant('1', 5, 'GTTACG')]),
# Non-variant record overlaps a variant from the right.
([('1', 5, 'CACGTG')], [('1', 8, 15)],
[_simple_variant('1', 5, 'CACGTG'),
_create_nonvariant('1', 11, 15)]),
([('1', 5, 'GTTACG')], [('1', 8, 15, 'A')], [
_simple_variant('1', 5, 'GTTACG'),
_create_nonvariant('1', 11, 15, 'T')
]),
# Non-variant record is subsumed by a variant.
([('1', 5, 'CACGTG')], [('1', 5, 11)],
[_simple_variant('1', 5, 'CACGTG')]),
([('1', 5, 'GTTACG')], [('1', 5, 11, 'G')],
[_simple_variant('1', 5, 'GTTACG')]),
# Non-variant record subsumes a variant.
([('1', 5, 'CACGTG')], [('1', 4, 12)], [
_create_nonvariant('1', 4, 5),
_simple_variant('1', 5, 'CACGTG'),
_create_nonvariant('1', 11, 12)
([('1', 5, 'GTTACG')], [('1', 4, 12, 'G')], [
_create_nonvariant('1', 4, 5, 'G'),
_simple_variant('1', 5, 'GTTACG'),
_create_nonvariant('1', 11, 12, 'T')
]),
# Non-variant record subsumes multiple overlapping variants.
([('1', 3, 'AAAAAAA'), ('1', 5, 'A')], [('1', 1, 15)], [
_create_nonvariant('1', 1, 3),
_simple_variant('1', 3, 'AAAAAAA'),
_simple_variant('1', 5, 'A'),
_create_nonvariant('1', 10, 15)
([('1', 3, 'CGGTTAC'), ('1', 5, 'G')], [('1', 1, 15, 'A')], [
_create_nonvariant('1', 1, 3, 'A'),
_simple_variant('1', 3, 'CGGTTAC'),
_simple_variant('1', 5, 'G'),
_create_nonvariant('1', 10, 15, 'G')
]),
([('1', 3, 'AAAAAAA'), ('1', 5, 'AAAAAAA')], [('1', 1, 15)], [
_create_nonvariant('1', 1, 3),
_simple_variant('1', 3, 'AAAAAAA'),
_simple_variant('1', 5, 'AAAAAAA'),
_create_nonvariant('1', 12, 15)
([('1', 3, 'CGGTTAC'), ('1', 5, 'GTTACGT')], [('1', 1, 15, 'A')], [
_create_nonvariant('1', 1, 3, 'A'),
_simple_variant('1', 3, 'CGGTTAC'),
_simple_variant('1', 5, 'GTTACGT'),
_create_nonvariant('1', 12, 15, 'T')
]),
)
def test_merge_variants_and_nonvariants(self, variants, nonvariants,
expected):
viter = (_simple_variant(*v) for v in variants)
nonviter = (_create_nonvariant(*nv) for nv in nonvariants)
lessthan = postprocess_variants._get_contig_based_lessthan(_CONTIGS)
reader = DummyReferenceReader()
actual = postprocess_variants.merge_variants_and_nonvariants(
viter, nonviter, lessthan)
viter, nonviter, lessthan, reader)
self.assertEqual(list(actual), expected)


Expand Down
2 changes: 1 addition & 1 deletion deepvariant/testdata/golden.postprocess_gvcf_output.g.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ chr20 10008936 . C <*> 0 . END=10008947 GT:GQ:MIN_DP:PL 0/0:50:30:0,90,899
chr20 10008948 . TA T,<*> 36.8 PASS . GT:GQ:DP:AD:VAF:PL 1/1:24:39:0,39,0:1,0:36,24,0,990,990,990
chr20 10008950 . A <*> 0 . END=10008951 GT:GQ:MIN_DP:PL 0/0:50:35:0,105,1049
chr20 10008952 . CACACACACACA C,CCACACACACA,<*> 31 PASS . GT:GQ:DP:AD:VAF:PL 1/2:11:41:1,14,25,0:0.341463,0.609756,0:30,20,44,20,0,11,990,990,990,990
chr20 10008964 . A <*> 0 . END=10008999 GT:GQ:MIN_DP:PL 0/0:50:23:0,69,689
chr20 10008964 . C <*> 0 . END=10008999 GT:GQ:MIN_DP:PL 0/0:50:23:0,69,689
chr20 10009000 . T <*> 0 . END=10009226 GT:GQ:MIN_DP:PL 0/0:50:45:0,144,1439
chr20 10009227 . A G,<*> 43.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:41:66:34,32,0:0.484848,0:43,0,45,990,990,990
chr20 10009228 . G <*> 0 . END=10009245 GT:GQ:MIN_DP:PL 0/0:50:67:0,201,2009
Expand Down
4 changes: 2 additions & 2 deletions docs/deepvariant-case-study.md
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,8 @@ Step | wall time
`make_examples` | 5h 37m 42s
`call_variants` | 11h 0m 29s
`postprocess_variants` (no gVCF) | 21m 54s
`postprocess_variants` (with gVCF) | 58m 24s
total time (single machine) | 17h - 17h 36m
`postprocess_variants` (with gVCF) | 59m 13s
total time (single machine) | 17h - 17h 37m

## Variant call quality

Expand Down
4 changes: 2 additions & 2 deletions docs/deepvariant-exome-case-study.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ Step | wall time
`make_examples` | 69m 22s
`call_variants` | 6m 32s
`postprocess_variants` (no gVCF) | 0m 13s
`postprocess_variants` (with gVCF) | 0m 41s
total time (single machine) | ~ 1h 16m
`postprocess_variants` (with gVCF) | 1m 29s
total time (single machine) | ~ 1h 17m

## Variant call quality

Expand Down

0 comments on commit b3a5bce

Please sign in to comment.