From b319e671bb82710f33b262d0d80e2f1e3ff0cdf2 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Sat, 2 Oct 2021 00:04:02 +0930
Subject: [PATCH] Handle cDNA alignment gaps

---
 pyhgvs/__init__.py | 43 +++++++++++++++++++++++++++++++++++++++++--
 pyhgvs/models.py   |  8 +++++---
 pyhgvs/utils.py    |  7 +++++--
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index 977ae39..09b7968 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -518,6 +518,40 @@ def get_genomic_sequence(genome, chrom, start, end):
         return str(genome[str(chrom)][start - 1:end]).upper()
 
 
+def _get_exon_misalignment_offset(cdna_match_gap: str, exon_pos: int):
+    """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length
+        @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute
+        codes operation
+        M 	match
+        I 	insert a gap into the reference sequence
+        D 	insert a gap into the target (delete from reference)
+
+        If you want the whole exon, then pass the end
+    """
+
+    exon_i = 0
+    misalignment_offset = 0
+    for gap_op in cdna_match_gap.split():
+        code = gap_op[0]
+        length = int(gap_op[1:])
+        if code == "M":
+            exon_i += length
+        elif code == "I":
+            if exon_pos <= exon_i + length:
+                raise ValueError("Coordinate inside insertion (%s) - no mapping possible!" % gap_op)
+            misalignment_offset += length
+        elif code == "D":
+            if exon_pos <= exon_i + length:
+                raise ValueError("Coordinate inside deletion (%s) - no mapping possible!" % gap_op)
+            misalignment_offset -= length
+        else:
+            raise ValueError(f"Unknown code in cDNA GAP: {gap_op}")
+
+        if exon_i >= exon_pos:
+            break
+    return misalignment_offset
+
+
 def cdna_to_genomic_coord(transcript, coord):
     """Convert a HGVS cDNA coordinate to a genomic coordinate."""
     transcript_strand = transcript.tx_position.is_forward_strand
@@ -549,10 +583,15 @@ def cdna_to_genomic_coord(transcript, coord):
     # Walk along transcript until we find an exon that contains pos.
     cdna_start = 1
     cdna_end = 1
+    misalignment_offset = 0
     for exon in exons:
         exon_start = exon.tx_position.chrom_start + 1
         exon_end = exon.tx_position.chrom_stop
         cdna_end = cdna_start + (exon_end - exon_start)
+        if exon.cdna_match:
+            # Doesn't matter if it goes off the end, will just count entire exon
+            exon_pos = pos - cdna_start + 1
+            misalignment_offset -= _get_exon_misalignment_offset(exon.cdna_match, exon_pos)
         if cdna_start <= pos <= cdna_end:
             break
         cdna_start = cdna_end + 1
@@ -566,10 +605,10 @@ def cdna_to_genomic_coord(transcript, coord):
     # Compute genomic coordinate using offset.
     if transcript_strand:
         # Plus strand.
-        return exon_start + (pos - cdna_start) + coord.offset
+        return exon_start + (pos - cdna_start) + coord.offset + misalignment_offset
     else:
         # Minus strand.
-        return exon_end - (pos - cdna_start) - coord.offset
+        return exon_end - (pos - cdna_start) - coord.offset + misalignment_offset
 
 
 def genomic_to_cdna_coord(transcript, genomic_coord):
diff --git a/pyhgvs/models.py b/pyhgvs/models.py
index 656e67b..4dbb182 100644
--- a/pyhgvs/models.py
+++ b/pyhgvs/models.py
@@ -33,14 +33,15 @@ class Transcript(object):
     """
 
     def __init__(self, name, version, gene, tx_position, cds_position,
-                 is_default=False, exons=None):
+                 is_default=False, exons=None, cdna_match=None):
         self.name = name
         self.version = version
         self.gene = Gene(gene)
         self.tx_position = tx_position
         self.cds_position = cds_position
         self.is_default = is_default
-        self.exons = exons if exons else []
+        self.exons = exons or []
+        self.cdna_match = cdna_match or []
 
     @property
     def full_name(self):
@@ -102,10 +103,11 @@ def distance(self, offset):
 
 
 class Exon(object):
-    def __init__(self, transcript, tx_position, exon_number):
+    def __init__(self, transcript, tx_position, exon_number, cdna_match=None):
         self.transcript = transcript
         self.tx_position = tx_position
         self.exon_number = exon_number
+        self.cdna_match = cdna_match
 
     @property
     def get_exon_name(self):
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index 55b20f4..ca7418f 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -87,10 +87,12 @@ def make_transcript(transcript_json):
             transcript_json['strand'] == '+'))
 
     exons = transcript_json['exons']
+    cdna_matches = transcript_json.get('cdna_match', [None] * len(exons))
     if not transcript.tx_position.is_forward_strand:
         exons = reversed(exons)
+        cdna_matches = reversed(cdna_matches)
 
-    for exon_number, (exon_start, exon_end) in enumerate(exons, 1):
+    for exon_number, ((exon_start, exon_end), cdna_match) in enumerate(zip(exons, cdna_matches), 1):
         transcript.exons.append(
             Exon(transcript=transcript,
                  tx_position=Position(
@@ -98,7 +100,8 @@ def make_transcript(transcript_json):
                      exon_start,
                      exon_end,
                      transcript_json['strand'] == '+'),
-                 exon_number=exon_number))
+                 exon_number=exon_number,
+                 cdna_match=cdna_match))
 
     return transcript