Skip to content

Commit

Permalink
Handle cDNA alignment gaps
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Oct 1, 2021
1 parent 957d25c commit b319e67
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 7 deletions.
43 changes: 41 additions & 2 deletions pyhgvs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,40 @@ def get_genomic_sequence(genome, chrom, start, end):
return str(genome[str(chrom)][start - 1:end]).upper()


def _get_exon_misalignment_offset(cdna_match_gap: str, exon_pos: int):
""" cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length
@see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute
codes operation
M match
I insert a gap into the reference sequence
D insert a gap into the target (delete from reference)
If you want the whole exon, then pass the end
"""

exon_i = 0
misalignment_offset = 0
for gap_op in cdna_match_gap.split():
code = gap_op[0]
length = int(gap_op[1:])
if code == "M":
exon_i += length
elif code == "I":
if exon_pos <= exon_i + length:
raise ValueError("Coordinate inside insertion (%s) - no mapping possible!" % gap_op)
misalignment_offset += length
elif code == "D":
if exon_pos <= exon_i + length:
raise ValueError("Coordinate inside deletion (%s) - no mapping possible!" % gap_op)
misalignment_offset -= length
else:
raise ValueError(f"Unknown code in cDNA GAP: {gap_op}")

if exon_i >= exon_pos:
break
return misalignment_offset


def cdna_to_genomic_coord(transcript, coord):
"""Convert a HGVS cDNA coordinate to a genomic coordinate."""
transcript_strand = transcript.tx_position.is_forward_strand
Expand Down Expand Up @@ -549,10 +583,15 @@ def cdna_to_genomic_coord(transcript, coord):
# Walk along transcript until we find an exon that contains pos.
cdna_start = 1
cdna_end = 1
misalignment_offset = 0
for exon in exons:
exon_start = exon.tx_position.chrom_start + 1
exon_end = exon.tx_position.chrom_stop
cdna_end = cdna_start + (exon_end - exon_start)
if exon.cdna_match:
# Doesn't matter if it goes off the end, will just count entire exon
exon_pos = pos - cdna_start + 1
misalignment_offset -= _get_exon_misalignment_offset(exon.cdna_match, exon_pos)
if cdna_start <= pos <= cdna_end:
break
cdna_start = cdna_end + 1
Expand All @@ -566,10 +605,10 @@ def cdna_to_genomic_coord(transcript, coord):
# Compute genomic coordinate using offset.
if transcript_strand:
# Plus strand.
return exon_start + (pos - cdna_start) + coord.offset
return exon_start + (pos - cdna_start) + coord.offset + misalignment_offset
else:
# Minus strand.
return exon_end - (pos - cdna_start) - coord.offset
return exon_end - (pos - cdna_start) - coord.offset + misalignment_offset


def genomic_to_cdna_coord(transcript, genomic_coord):
Expand Down
8 changes: 5 additions & 3 deletions pyhgvs/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,15 @@ class Transcript(object):
"""

def __init__(self, name, version, gene, tx_position, cds_position,
is_default=False, exons=None):
is_default=False, exons=None, cdna_match=None):
self.name = name
self.version = version
self.gene = Gene(gene)
self.tx_position = tx_position
self.cds_position = cds_position
self.is_default = is_default
self.exons = exons if exons else []
self.exons = exons or []
self.cdna_match = cdna_match or []

@property
def full_name(self):
Expand Down Expand Up @@ -102,10 +103,11 @@ def distance(self, offset):


class Exon(object):
def __init__(self, transcript, tx_position, exon_number):
def __init__(self, transcript, tx_position, exon_number, cdna_match=None):
self.transcript = transcript
self.tx_position = tx_position
self.exon_number = exon_number
self.cdna_match = cdna_match

@property
def get_exon_name(self):
Expand Down
7 changes: 5 additions & 2 deletions pyhgvs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,18 +87,21 @@ def make_transcript(transcript_json):
transcript_json['strand'] == '+'))

exons = transcript_json['exons']
cdna_matches = transcript_json.get('cdna_match', [None] * len(exons))
if not transcript.tx_position.is_forward_strand:
exons = reversed(exons)
cdna_matches = reversed(cdna_matches)

for exon_number, (exon_start, exon_end) in enumerate(exons, 1):
for exon_number, ((exon_start, exon_end), cdna_match) in enumerate(zip(exons, cdna_matches), 1):
transcript.exons.append(
Exon(transcript=transcript,
tx_position=Position(
transcript_json['chrom'],
exon_start,
exon_end,
transcript_json['strand'] == '+'),
exon_number=exon_number))
exon_number=exon_number,
cdna_match=cdna_match))

return transcript

Expand Down

0 comments on commit b319e67

Please sign in to comment.