From e65db8330b783bc28e108c8df7c5f6cf66be008b Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Mon, 16 Mar 2020 16:45:01 +1030
Subject: [PATCH 01/47] Patch on top of latest counsyl/hgvs - indel resolution.
 Load genePred files

---
 pyhgvs/__init__.py                            | 100 +++++++++++-------
 pyhgvs/models.py                              |   2 +-
 pyhgvs/tests/data/test_name_to_variant.genome |  11 ++
 pyhgvs/tests/data/test_variant_to_name.genome |   6 ++
 pyhgvs/tests/genome.py                        |   6 +-
 pyhgvs/utils.py                               |  63 +++++------
 pyhgvs/variants.py                            |  20 ++--
 7 files changed, 125 insertions(+), 83 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index b4df00c..9b28e8f 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -655,7 +655,7 @@ def genomic_to_cdna_coord(transcript, genomic_coord):
 
 def get_allele(hgvs, genome, transcript=None):
     """Get an allele from a HGVSName, a genome, and a transcript."""
-    chrom, start, end = hgvs.get_coords(transcript)
+    chrom, start, end = hgvs.get_ref_coords(transcript)
     _, alt = hgvs.get_ref_alt(
         transcript.tx_position.is_forward_strand if transcript else True)
     ref = get_genomic_sequence(genome, chrom, start, end)
@@ -673,16 +673,31 @@ def get_vcf_allele(hgvs, genome, transcript=None):
     ref = get_genomic_sequence(genome, chrom, start, end)
 
     if hgvs.mutation_type in _indel_mutation_types:
+        if hgvs.mutation_type == 'dup':
+            # Sometimes we need to retrieve alt from reference, eg:
+            # No alt supplied: NM_000492.3:c.1155_1156dup
+            # Number used:     NM_004119.2(FLT3):c.1794_1811dup18
+            # We *know* what the sequence is for "dup18", but not for "ins18"
+            if not hgvs.alt_allele or re.match("^N+$", hgvs.alt_allele):
+                alt = get_alt_from_sequence(hgvs, genome, transcript)
+
         # Left-pad alternate allele.
         alt = ref[0] + alt
     return chrom, start, end, ref, alt
 
 
+def get_alt_from_sequence(hgvs, genome, transcript):
+    """ returns ready for VCF """
+
+    chrom, start, end = hgvs.get_raw_coords(transcript)
+    return get_genomic_sequence(genome, chrom, start, end)
+
+
 def matches_ref_allele(hgvs, genome, transcript=None):
     """Return True if reference allele matches genomic sequence."""
-    ref, alt = hgvs.get_ref_alt(
+    ref, _ = hgvs.get_ref_alt(
         transcript.tx_position.is_forward_strand if transcript else True)
-    chrom, start, end = hgvs.get_coords(transcript)
+    chrom, start, end = hgvs.get_ref_coords(transcript)
     genome_ref = get_genomic_sequence(genome, chrom, start, end)
     return genome_ref == ref
 
@@ -1146,8 +1161,8 @@ def format_genome(self):
         """
         return self.format_coords() + self.format_dna_allele()
 
-    def get_coords(self, transcript=None):
-        """Return genomic coordinates of reference allele."""
+    def get_raw_coords(self, transcript=None):
+        """ return genomic coordinates """
         if self.kind == 'c':
             chrom = transcript.tx_position.chrom
             start = cdna_to_genomic_coord(transcript, self.cdna_start)
@@ -1158,27 +1173,14 @@ def get_coords(self, transcript=None):
                     raise AssertionError(
                         "cdna_start cannot be greater than cdna_end")
                 start, end = end, start
-            else:
-                if start > end:
-                    raise AssertionError(
-                        "cdna_start cannot be greater than cdna_end")
-
-            if self.mutation_type == "ins":
-                # Inserts have empty interval.
-                if start < end:
-                    start += 1
-                    end -= 1
-                else:
-                    end = start - 1
-
-            elif self.mutation_type == "dup":
-                end = start - 1
 
+            if start > end:
+                raise AssertionError(
+                    "cdna_start cannot be greater than cdna_end")
         elif self.kind == 'g':
             chrom = self.chrom
             start = self.start
             end = self.end
-
         else:
             raise NotImplementedError(
                 'Coordinates are not available for this kind of HGVS name "%s"'
@@ -1186,9 +1188,26 @@ def get_coords(self, transcript=None):
 
         return chrom, start, end
 
+    def get_ref_coords(self, transcript=None):
+        """Return genomic coordinates of reference allele."""
+        
+        chrom, start, end = self.get_raw_coords(transcript)
+        
+        if self.mutation_type == "ins":
+            # Inserts have empty interval.
+            if start < end:
+                start += 1
+                end -= 1
+            else:
+                end = start - 1
+
+        elif self.mutation_type == "dup":
+            end = start - 1
+        return chrom, start, end
+
     def get_vcf_coords(self, transcript=None):
         """Return genomic coordinates of reference allele in VCF-style."""
-        chrom, start, end = self.get_coords(transcript)
+        chrom, start, end = self.get_ref_coords(transcript)
 
         # Inserts and deletes require left-padding by 1 base
         if self.mutation_type in ("=", ">"):
@@ -1344,7 +1363,8 @@ def hgvs_normalize_variant(chrom, offset, ref, alt, genome, transcript=None):
 
 def parse_hgvs_name(hgvs_name, genome, transcript=None,
                     get_transcript=lambda name: None,
-                    flank_length=30, normalize=True, lazy=False):
+                    flank_length=30, normalize=True, lazy=False,
+                    indels_start_with_same_base=True):
     """
     Parse an HGVS name into (chrom, start, end, ref, alt)
 
@@ -1353,15 +1373,16 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None,
     transcript: Transcript corresponding to HGVS name.
     normalize: If True, normalize allele according to VCF standard.
     lazy: If True, discard version information from incoming transcript/gene.
+    indels_start_with_same_base: When normalizing, don't strip common prefix from indels
     """
     hgvs = HGVSName(hgvs_name)
 
     # Determine transcript.
     if hgvs.kind == 'c' and not transcript:
         if '.' in hgvs.transcript and lazy:
-            hgvs.transcript, version = hgvs.transcript.split('.')
+            hgvs.transcript, _ = hgvs.transcript.split('.')
         elif '.' in hgvs.gene and lazy:
-            hgvs.gene, version = hgvs.gene.split('.')
+            hgvs.gene, _ = hgvs.gene.split('.')
         if get_transcript:
             if hgvs.transcript:
                 transcript = get_transcript(hgvs.transcript)
@@ -1377,11 +1398,12 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None,
                               transcript.tx_position.chrom_stop,
                               hgvs.transcript)
 
-    chrom, start, end, ref, alt = get_vcf_allele(hgvs, genome, transcript)
+    chrom, start, _, ref, alt = get_vcf_allele(hgvs, genome, transcript)
     if normalize:
-        chrom, start, ref, [alt] = normalize_variant(
-            chrom, start, ref, [alt], genome,
-            flank_length=flank_length).variant
+        nv = normalize_variant(chrom, start, ref, [alt], genome,
+                               flank_length=flank_length,
+                               indels_start_with_same_base=indels_start_with_same_base)
+        chrom, start, ref, [alt] = nv.variant
     return (chrom, start, ref, alt)
 
 
@@ -1408,11 +1430,19 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript,
     hgvs = HGVSName()
 
     # Populate coordinates.
+    if mutation_type == 'ins':
+        # Insert uses coordinates around the insert point.
+        offset_start = offset - 1
+        offset_end = offset
+    else:
+        offset_start = offset
+        offset_end = offset + len(ref) - 1
+
     if not transcript:
         # Use genomic coordinate when no transcript is available.
         hgvs.kind = 'g'
-        hgvs.start = offset
-        hgvs.end = offset + len(ref) - 1
+        hgvs.start = offset_start
+        hgvs.end = offset_end
     else:
         # Use cDNA coordinates.
         hgvs.kind = 'c'
@@ -1425,14 +1455,6 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript,
             hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset)
             hgvs.cdna_end = hgvs.cdna_start
         else:
-            # Use a range of coordinates.
-            if mutation_type == 'ins':
-                # Insert uses coordinates around the insert point.
-                offset_start = offset - 1
-                offset_end = offset
-            else:
-                offset_start = offset
-                offset_end = offset + len(ref) - 1
             if transcript.strand == '-':
                 offset_start, offset_end = offset_end, offset_start
             hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset_start)
diff --git a/pyhgvs/models.py b/pyhgvs/models.py
index b26b8c4..656e67b 100644
--- a/pyhgvs/models.py
+++ b/pyhgvs/models.py
@@ -27,7 +27,7 @@ def __init__(self, name):
 
 
 class Transcript(object):
-    """RefGene Transcripts for hg19
+    """
 
     A gene may have multiple transcripts with different combinations of exons.
     """
diff --git a/pyhgvs/tests/data/test_name_to_variant.genome b/pyhgvs/tests/data/test_name_to_variant.genome
index 90eb59e..a36abfb 100644
--- a/pyhgvs/tests/data/test_name_to_variant.genome
+++ b/pyhgvs/tests/data/test_name_to_variant.genome
@@ -61,6 +61,7 @@ chr1	76205650	76205691	TTTATATATTCAAGGCTTATTGTGTAACAGAACCTGGAGCA
 chr1	76205639	76205669	TTTCTCTTGTTTTTATATATTCAAGGCTTA
 chr1	76205670	76205700	TGTGTAACAGAACCTGGAGCAGGCTCTGAT
 chr1	76227048	76227049	C
+chr1	76227049	76227050	T
 chr1	76227029	76227070	TAATGAGGGATGCCAAAATCTATCAGGTAAGGTTAAAGATG
 chr1	76227019	76227049	GTAGAAAAACTAATGAGGGATGCCAAAATC
 chr1	76227049	76227079	TATCAGGTAAGGTTAAAGATGATTTTTTTG
@@ -84,7 +85,13 @@ chr7	117176661	117176664	TAT
 chr7	117176642	117176684	CCTCAGAAATGATTGAAAATATCCAATCTGTTAAGGCATACT
 chr7	117176630	117176660	GACTTGTGATTACCTCAGAAATGATTGAAA
 chr7	117176662	117176692	ATCCAATCTGTTAAGGCATACTGCTGGGAA
+chr7	117182084	117182126	CAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAA
+chr7	117182103	117182104	A
+chr7	117182103	117182106	AAT
+chr7	117182106	117182109	ATA
 chr7	117182106	117182107	A
+chr7	117182107	117182109	TA
+chr7	117182077	117182107	TACAAAAGCAAGAATATAAGACATTGGAATA
 chr7	117182087	117182129	GAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTA
 chr7	117182074	117182104	CTTACAAAAGCAAGAATATAAGACATTGGA
 chr7	117182104	117182134	ATATAACTTAACGACTACAGAAGTAGTGAT
@@ -98,7 +105,11 @@ chr17	41245340	41245341	T
 chr17	41245340	41245341	T
 chr7	117307164	117307165	A
 chr11	17496507	17496508	T
+chr17	7125998	7126028	AAGAAGATGGGCATCAAGGCTTCAAACACA
+chr17	7126009	7126051	CATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGT
 chr17	7126027	7126037	AGCAGAGGTG
+chr17	7126028	7126029	A
+chr17	7126028	7126058	GCAGAGGTGTTCTTTGATGGAGTACGGGTG
 chr1	76216233	76216235	AA
 chr1	76199213	76199220	AGGTCTT
 chr1	76199194	76199240	AACATTTTGATACTGTAGGAGGTCTTGGACTTGGAACTTTTGATGC
diff --git a/pyhgvs/tests/data/test_variant_to_name.genome b/pyhgvs/tests/data/test_variant_to_name.genome
index f310c9c..1f0b257 100644
--- a/pyhgvs/tests/data/test_variant_to_name.genome
+++ b/pyhgvs/tests/data/test_variant_to_name.genome
@@ -66,3 +66,9 @@ chr7	117188682	117188712	tTTTTTTAACAGGGATTTGGGGAATTATTT
 chr7	117188582	117188783	CCATGTGCTTTTCAAACTAATTGTACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGAtgtgtgtgtgtgtgtgtgtgtgtTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGTAATTT
 chr7	117188687	117188689	TT
 chr7	117188689	117188691	AA
+chr17	7125928	7126129	TCCCCCAGTGACAACCTGTTGAACACACCTCTGCTTTCCCACACTGCCCTGACACAGTGGGCCCCCTGAGAAGAAGATGGGCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGTACGGGTGCCATCGGAGAACGTGCTGGGTGAGGTTGGGAGTGGCTTCAAGGTTGCCATGCACATCCTCAACAATGGAAG
+chr17	7126008	7126050	GCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAG
+chr17	7126027	7126029	AG
+chr17	7126028	7126058	GCAGAGGTGTTCTTTGATGGAGTACGGGTG
+chr17	7126029	7126031	CA
+chr17	7125998	7126028	AAGAAGATGGGCATCAAGGCTTCAAACACA
\ No newline at end of file
diff --git a/pyhgvs/tests/genome.py b/pyhgvs/tests/genome.py
index 5c53da6..0887dfd 100644
--- a/pyhgvs/tests/genome.py
+++ b/pyhgvs/tests/genome.py
@@ -77,9 +77,11 @@ def __init__(self, lookup=None, filename=None, db_filename=None,
             if SequenceFileDB is None:
                 raise ValueError('pygr is not available.')
             self._genome = SequenceFileDB(db_filename)
+            self._source_filename = db_filename
         elif filename:
             # Read genome sequence from lookup table.
             self.read(filename)
+            self._source_filename = filename
 
     def __contains__(self, chrom):
         """Return True if genome contains chromosome."""
@@ -113,8 +115,8 @@ def get_seq(self, chrom, start, end):
                         None, end - start))
                 else:
                     raise MockGenomeError(
-                        'Sequence not in test data: %s:%d-%d' %
-                        (chrom, start, end))
+                        'Sequence not in test data: %s:%d-%d source: %s' %
+                        (chrom, start, end, self._source_filename))
 
     def read(self, filename):
         """Read a sequence lookup table from a file.
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index 83e239e..55b20f4 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -11,57 +11,52 @@
 
 
 def read_refgene(infile):
-    """
-    Iterate through a refGene file.
+    """ refGene = genePred with extra column at front (and ignored ones after) """
+    return read_genepred(infile, skip_first_column=True)
+
 
+def read_genepred(infile, skip_first_column=False):
+    """
     GenePred extension format:
     http://genome.ucsc.edu/FAQ/FAQformat.html#GenePredExt
 
     Column definitions:
-    0. uint undocumented id
-    1. string name;             "Name of gene (usually transcript_id from GTF)"
-    2. string chrom;                "Chromosome name"
-    3. char[1] strand;              "+ or - for strand"
-    4. uint txStart;                "Transcription start position"
-    5. uint txEnd;                  "Transcription end position"
-    6. uint cdsStart;               "Coding region start"
-    7. uint cdsEnd;                 "Coding region end"
-    8. uint exonCount;              "Number of exons"
-    9. uint[exonCount] exonStarts;  "Exon start positions"
-    10. uint[exonCount] exonEnds;   "Exon end positions"
-    11. uint id;                    "Unique identifier"
-    12. string name2;               "Alternate name (e.g. gene_id from GTF)"
-    13. string cdsStartStat;        "enum('none','unk','incmpl','cmpl')"
-    14. string cdsEndStat;          "enum('none','unk','incmpl','cmpl')"
-    15. lstring exonFrames;         "Exon frame offsets {0,1,2}"
+    0. string name;                 "Name of gene (usually transcript_id from GTF)"
+    1. string chrom;                "Chromosome name"
+    2. char[1] strand;              "+ or - for strand"
+    3. uint txStart;                "Transcription start position"
+    4. uint txEnd;                  "Transcription end position"
+    5. uint cdsStart;               "Coding region start"
+    6. uint cdsEnd;                 "Coding region end"
+    7. uint exonCount;              "Number of exons"
+    8. uint[exonCount] exonStarts;  "Exon start positions"
+    9. uint[exonCount] exonEnds;    "Exon end positions"
+    10. uint id;                    "Unique identifier"
+    11. string name2;               "Alternate name (e.g. gene_id from GTF)"
     """
     for line in infile:
         # Skip comments.
         if line.startswith('#'):
             continue
         row = line.rstrip('\n').split('\t')
-        if len(row) != 16:
-            raise ValueError(
-                'File has incorrect number of columns '
-                'in at least one line.')
+        if skip_first_column:
+            row = row[1:]
 
         # Skip trailing ,
-        exon_starts = list(map(int, row[9].split(',')[:-1]))
-        exon_ends = list(map(int, row[10].split(',')[:-1]))
-        exon_frames = list(map(int, row[15].split(',')[:-1]))
+        exon_starts = list(map(int, row[8].split(',')[:-1]))
+        exon_ends = list(map(int, row[9].split(',')[:-1]))
         exons = list(zip(exon_starts, exon_ends))
 
         yield {
-            'chrom': row[2],
-            'start': int(row[4]),
-            'end': int(row[5]),
-            'id': row[1],
-            'strand': row[3],
-            'cds_start': int(row[6]),
-            'cds_end': int(row[7]),
-            'gene_name': row[12],
+            'chrom': row[1],
+            'start': int(row[3]),
+            'end': int(row[4]),
+            'id': row[0],
+            'strand': row[2],
+            'cds_start': int(row[5]),
+            'cds_end': int(row[6]),
+            'gene_name': row[11],
             'exons': exons,
-            'exon_frames': exon_frames
         }
 
 
diff --git a/pyhgvs/variants.py b/pyhgvs/variants.py
index fca0cdc..36c7383 100644
--- a/pyhgvs/variants.py
+++ b/pyhgvs/variants.py
@@ -115,7 +115,7 @@ def justify_genomic_indel(genome, chrom, start, end, indel, justify,
 
 
 def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome,
-                      justify='left', flank_length=30):
+                      justify='left', flank_length=30, indels_start_with_same_base=True):
     """
     Normalize variant according to the GATK/VCF standard.
 
@@ -133,7 +133,7 @@ def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome,
         chrom_stop=end,
         is_forward_strand=True)
     return NormalizedVariant(position, ref_sequence, alt_sequences,
-                             genome=genome, justify=justify)
+                             genome=genome, justify=justify, indels_start_with_same_base=indels_start_with_same_base)
 
 
 class NormalizedVariant(object):
@@ -142,7 +142,8 @@ class NormalizedVariant(object):
     """
 
     def __init__(self, position, ref_allele, alt_alleles,
-                 seq_5p='', seq_3p='', genome=None, justify='left'):
+                 seq_5p='', seq_3p='', genome=None, justify='left',
+                 indels_start_with_same_base=True):
         """
         position: a 0-index genomic Position.
         ref_allele: the reference allele sequence.
@@ -150,6 +151,9 @@ def __init__(self, position, ref_allele, alt_alleles,
         seq_5p: 5 prime flanking sequence of variant.
         seq_3p: 3 prime flanking sequence of variant.
         genome: a pygr compatible genome object (optional).
+        
+        indels_start_with_same_base: DML - I have no idea why this is required
+                                           but am keeping for backwards compat
         """
         self.position = position
         self.alleles = [ref_allele] + list(alt_alleles)
@@ -157,6 +161,7 @@ def __init__(self, position, ref_allele, alt_alleles,
         self.seq_3p = seq_3p
         self.genome = genome
         self.log = []
+        self.indels_start_with_same_base = indels_start_with_same_base
 
         self._on_forward_strand()
         self._trim_common_prefix()
@@ -275,7 +280,7 @@ def _1bp_pad(self):
         # Pad sequences with one 5-prime base before the mutation event.
         empty_seq = any(not allele for allele in self.alleles)
         uniq_starts = set(allele[0] for allele in self.alleles if allele)
-        if empty_seq or len(uniq_starts) > 1:
+        if empty_seq or (self.indels_start_with_same_base and len(uniq_starts) > 1):
             # Fetch more 5p flanking sequence if needed.
             if self.genome and self.seq_5p == '':
                 start = self.position.chrom_start
@@ -299,9 +304,10 @@ def _1bp_pad(self):
                 self.seq_3p = self.seq_3p[1:]
                 self.position.chrom_stop += 1
 
-        if len(set(a[0] for a in self.alleles)) != 1:
-            raise AssertionError(
-                "All INDEL alleles should start with same base.")
+        if self.indels_start_with_same_base:
+            if len(set(a[0] for a in self.alleles)) != 1:
+                raise AssertionError(
+                    "All INDEL alleles should start with same base.")
 
     def _set_1based_position(self):
         """

From e2f7dfb718c79eb00a687634e9b64f6bf2c8d792 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Mon, 16 Mar 2020 17:18:14 +1030
Subject: [PATCH 02/47] Unit tests for previously failing HGVS names

---
 pyhgvs/tests/test_hgvs_names.py | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py
index be294e0..58cf351 100644
--- a/pyhgvs/tests/test_hgvs_names.py
+++ b/pyhgvs/tests/test_hgvs_names.py
@@ -123,13 +123,22 @@ def test_variant_to_name():
     for (expected_hgvs_name, variant,
          name_canonical, var_canonical) in _name_variants:
         if name_canonical:
-            transcript_name = HGVSName(expected_hgvs_name).transcript
-            transcript = get_transcript(transcript_name)
-            assert transcript, transcript_name
+            hgvs = HGVSName(expected_hgvs_name)
+            transcript_name = hgvs.transcript
+            transcript = None
+            if transcript_name:
+                transcript = get_transcript(transcript_name)
+                assert transcript, transcript_name
             chrom, offset, ref, alt = variant
             hgvs_name = format_hgvs_name(
                 chrom, offset, ref, alt, genome, transcript,
                 use_gene=False)
+
+            if hgvs.kind == 'g':
+                # Strip off g.HGVS prefix
+                i = expected_hgvs_name.find(":g.")
+                expected_hgvs_name = expected_hgvs_name[i+1:]
+
             nose.tools.assert_equal(
                 hgvs_name, expected_hgvs_name,
                 repr([hgvs_name, expected_hgvs_name, variant]))
@@ -723,7 +732,11 @@ def test_invalid_coordinates():
     ('NM_000016.4:c.387+1delG', ('chr1', 76199309, 'TG', 'T'), True, True),
     ('NM_000016.4:c.475delT', ('chr1', 76205669, 'AT', 'A'), True, True),
     ('NM_000016.4:c.1189dupT', ('chr1', 76227049, 'C', 'CT'), True, True),
+    # Same as above but without optional trailing base
+    ('NM_000016.4:c.1189dup', ('chr1', 76227049, 'C', 'CT'), False, True),
     ('NM_000016.4:c.1191delT', ('chr1', 76227051, 'AT', 'A'), True, True),
+    # Same as above but without optional trailing base
+    ('NM_000016.4:c.1191del', ('chr1', 76227051, 'AT', 'A'), False, True),
     ('NM_000016.4:c.306_307insG', ('chr1', 76199232, 'T', 'TG'), True, True),
 
     # Alignment tests for HGVS 3' and VCF left-alignment.
@@ -734,6 +747,9 @@ def test_invalid_coordinates():
      True, True),
     ('NM_000492.3:c.1155_1156dupTA', ('chr7', 117182104, 'A', 'AAT'),
      True, True),
+    # Same as above but without optional trailing base
+    ('NM_000492.3:c.1155_1156dup', ('chr7', 117182104, 'A', 'AAT'),
+     False, True),
     ('NM_000492.3:c.3889dupT', ('chr7', 117292905, 'A', 'AT'), True, True),
 
     # Transcript prefix.
@@ -750,6 +766,9 @@ def test_invalid_coordinates():
     ('chr11:g.17496508T>C', ('chr11', 17496508, 'T', 'C'), False, True),
 
     # Genomic indels.
+    ('chr17:g.7126029_7126030insTG',
+     ('chr17', 7126028, 'A', 'AGT'), True, True),
+
     ('chr17:g.7126029_7126037delGCAGAGGTGinsTCAAAGCAC',
      ('chr17', 7126028, 'AGCAGAGGTG', 'ATCAAAGCAC'), False, True),
 
@@ -757,6 +776,13 @@ def test_invalid_coordinates():
     ('chr1:g.76216235delAinsGC', ('chr1', 76216234, 'AA', 'AGC'),
      False, True),
 
+    # Genomic dup
+    ('chr7:g.117182108_117182109dup', ('chr7', 117182104, 'A', 'AAT'),
+     False, True),
+
+    ('chr7:g.117182105_117182106dupAT', ('chr7', 117182104, 'A', 'AAT'),
+     False, True),
+
     # Genomic delete region.
     ('chr1:g.76199215_76199220delGGTCTT',
      ('chr1', 76199214, 'AGGTCTT', 'A'), False, True),

From 9ffdc7fb9692054957e536243ec1734b2fab5217 Mon Sep 17 00:00:00 2001
From: davmlaw <davmlaw@gmail.com>
Date: Wed, 22 Apr 2020 15:50:26 +0930
Subject: [PATCH 03/47] Populate alt for values like NC_000001.11:g.169549811=

---
 pyhgvs/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index 9b28e8f..df6e108 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -672,9 +672,13 @@ def get_vcf_allele(hgvs, genome, transcript=None):
         transcript.tx_position.is_forward_strand if transcript else True)
     ref = get_genomic_sequence(genome, chrom, start, end)
 
+    # Sometimes we need to retrieve alt from reference
+    # Eg NC_000001.11:g.169549811=
+    if hgvs.mutation_type == "=" and alt == 'N':
+        alt = ref
+
     if hgvs.mutation_type in _indel_mutation_types:
         if hgvs.mutation_type == 'dup':
-            # Sometimes we need to retrieve alt from reference, eg:
             # No alt supplied: NM_000492.3:c.1155_1156dup
             # Number used:     NM_004119.2(FLT3):c.1794_1811dup18
             # We *know* what the sequence is for "dup18", but not for "ins18"

From 00a22054a4060a5dfb3e05376f9fbf0338352eb2 Mon Sep 17 00:00:00 2001
From: davmlaw <davmlaw@gmail.com>
Date: Wed, 29 Apr 2020 20:29:50 +0930
Subject: [PATCH 04/47] Support mito (ie "m.")

---
 pyhgvs/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index df6e108..9744b84 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -853,7 +853,7 @@ def parse_allele(self, allele):
             self.parse_cdna(details)
         elif kind == "p":
             self.parse_protein(details)
-        elif kind == "g":
+        elif kind in ["g", "m"]:
             self.parse_genome(details)
         else:
             raise NotImplementedError("unknown kind: %s" % allele)
@@ -1019,8 +1019,8 @@ def format(self, use_prefix=True, use_gene=True, use_counsyl=False):
             allele = 'c.' + self.format_cdna()
         elif self.kind == 'p':
             allele = 'p.' + self.format_protein()
-        elif self.kind == 'g':
-            allele = 'g.' + self.format_genome()
+        elif self.kind in ['g', 'm']:
+            allele = self.kind + '.' + self.format_genome()
         else:
             raise NotImplementedError("not implemented: '%s'" % self.kind)
 
@@ -1040,7 +1040,7 @@ def format_prefix(self, use_gene=True):
           NM_007294.3(BRCA1):c.2207A>C
         """
 
-        if self.kind == 'g':
+        if self.kind in ['g', 'm']:
             if self.chrom:
                 return self.chrom
 
@@ -1181,7 +1181,7 @@ def get_raw_coords(self, transcript=None):
             if start > end:
                 raise AssertionError(
                     "cdna_start cannot be greater than cdna_end")
-        elif self.kind == 'g':
+        elif self.kind in ['g', 'm']:
             chrom = self.chrom
             start = self.start
             end = self.end

From 2b4ac8bb64b61410535e79858af4c67582fd9b95 Mon Sep 17 00:00:00 2001
From: davmlaw <davmlaw@gmail.com>
Date: Wed, 29 Apr 2020 20:33:37 +0930
Subject: [PATCH 05/47] Revert "Support mito (ie "m.")"

This reverts commit 00a22054a4060a5dfb3e05376f9fbf0338352eb2.
---
 pyhgvs/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index 9744b84..df6e108 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -853,7 +853,7 @@ def parse_allele(self, allele):
             self.parse_cdna(details)
         elif kind == "p":
             self.parse_protein(details)
-        elif kind in ["g", "m"]:
+        elif kind == "g":
             self.parse_genome(details)
         else:
             raise NotImplementedError("unknown kind: %s" % allele)
@@ -1019,8 +1019,8 @@ def format(self, use_prefix=True, use_gene=True, use_counsyl=False):
             allele = 'c.' + self.format_cdna()
         elif self.kind == 'p':
             allele = 'p.' + self.format_protein()
-        elif self.kind in ['g', 'm']:
-            allele = self.kind + '.' + self.format_genome()
+        elif self.kind == 'g':
+            allele = 'g.' + self.format_genome()
         else:
             raise NotImplementedError("not implemented: '%s'" % self.kind)
 
@@ -1040,7 +1040,7 @@ def format_prefix(self, use_gene=True):
           NM_007294.3(BRCA1):c.2207A>C
         """
 
-        if self.kind in ['g', 'm']:
+        if self.kind == 'g':
             if self.chrom:
                 return self.chrom
 
@@ -1181,7 +1181,7 @@ def get_raw_coords(self, transcript=None):
             if start > end:
                 raise AssertionError(
                     "cdna_start cannot be greater than cdna_end")
-        elif self.kind in ['g', 'm']:
+        elif self.kind == 'g':
             chrom = self.chrom
             start = self.start
             end = self.end

From a6e238e4c94023867fed6ce2dfa9991bbcc2ef43 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 12 May 2021 13:03:25 +0930
Subject: [PATCH 06/47] Support n. (non-coding) ie
 https://github.com/counsyl/hgvs/issues/7

---
 pyhgvs/__init__.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index df6e108..fe9aab6 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -849,7 +849,7 @@ def parse_allele(self, allele):
         self.kind = kind
         self.mutation_type = None
 
-        if kind == "c":
+        if kind in ("c", 'n'):
             self.parse_cdna(details)
         elif kind == "p":
             self.parse_protein(details)
@@ -1017,6 +1017,8 @@ def format(self, use_prefix=True, use_gene=True, use_counsyl=False):
 
         if self.kind == 'c':
             allele = 'c.' + self.format_cdna()
+        elif self.kind == 'n':
+            allele = 'n.' + self.format_cdna()
         elif self.kind == 'p':
             allele = 'p.' + self.format_protein()
         elif self.kind == 'g':
@@ -1167,7 +1169,7 @@ def format_genome(self):
 
     def get_raw_coords(self, transcript=None):
         """ return genomic coordinates """
-        if self.kind == 'c':
+        if self.kind in ('c', 'n'):
             chrom = transcript.tx_position.chrom
             start = cdna_to_genomic_coord(transcript, self.cdna_start)
             end = cdna_to_genomic_coord(transcript, self.cdna_end)
@@ -1382,7 +1384,7 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None,
     hgvs = HGVSName(hgvs_name)
 
     # Determine transcript.
-    if hgvs.kind == 'c' and not transcript:
+    if hgvs.kind in ('c', 'n') and not transcript:
         if '.' in hgvs.transcript and lazy:
             hgvs.transcript, _ = hgvs.transcript.split('.')
         elif '.' in hgvs.gene and lazy:
@@ -1449,7 +1451,11 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript,
         hgvs.end = offset_end
     else:
         # Use cDNA coordinates.
-        hgvs.kind = 'c'
+        if transcript.is_coding:
+            hgvs.kind = 'c'
+        else:
+            hgvs.kind = 'n'
+
         is_single_base_indel = (
             (mutation_type == 'ins' and len(alt) == 1) or
             (mutation_type in ('del', 'delins', 'dup') and len(ref) == 1))

From 957d25c5dfc6d5526ef993f56e869b3461ca156f Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Tue, 7 Sep 2021 10:52:41 +0930
Subject: [PATCH 07/47] Make sure non-coding doesn't have 5' or 3' UTR offsets

---
 pyhgvs/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index fe9aab6..977ae39 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -851,6 +851,11 @@ def parse_allele(self, allele):
 
         if kind in ("c", 'n'):
             self.parse_cdna(details)
+            if kind == 'n':  # Ensure no 3'UTR or 5'UTR coords in non-coding
+                if self.cdna_start.coord < 0:
+                    raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain negative (5'UTR) coordinates")
+                if self.cdna_start.landmark == 'cdna_stop' or self.cdna_end and self.cdna_end.landmark == 'cdna_stop':
+                    raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain '*' (3'UTR) coordinates")
         elif kind == "p":
             self.parse_protein(details)
         elif kind == "g":

From 1a8b67fd7f374c9e845c246655ab5c61be7a5550 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Tue, 12 Oct 2021 11:18:01 +1030
Subject: [PATCH 08/47] Refactor - move functions into methods

---
 pyhgvs/__init__.py                   | 1179 +-------------------------
 pyhgvs/models.py                     |  152 ----
 pyhgvs/models/__init__.py            |    0
 pyhgvs/models/cdna.py                |  123 +++
 pyhgvs/models/genome.py              |   47 +
 pyhgvs/models/hgvs_name.py           |  801 +++++++++++++++++
 pyhgvs/models/transcript.py          |  335 ++++++++
 pyhgvs/{ => models}/variants.py      |   21 +-
 pyhgvs/tests/genome.py               |    2 +-
 pyhgvs/tests/test_hgvs_names.py      |   12 +-
 pyhgvs/tests/test_hgvs_names_long.py |    2 +-
 pyhgvs/tests/test_variants.py        |    2 +-
 pyhgvs/utils.py                      |    5 +-
 requirements-dev.txt                 |    1 +
 setup.py                             |    1 +
 15 files changed, 1340 insertions(+), 1343 deletions(-)
 delete mode 100644 pyhgvs/models.py
 create mode 100644 pyhgvs/models/__init__.py
 create mode 100644 pyhgvs/models/cdna.py
 create mode 100644 pyhgvs/models/genome.py
 create mode 100644 pyhgvs/models/hgvs_name.py
 create mode 100644 pyhgvs/models/transcript.py
 rename pyhgvs/{ => models}/variants.py (96%)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index 977ae39..b3004b2 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -7,503 +7,16 @@
 Definition of which transcript to use coding variants:
 ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene
 
-
-HGVS language currently implemented.
-
-HGVS = ALLELE
-     | PREFIX_NAME : ALLELE
-
-PREFIX_NAME = TRANSCRIPT
-            | TRANSCRIPT '(' GENE ')'
-
-TRANSCRIPT = TRANSCRIPT_NAME
-           | TRANSCRIPT_NAME '.' TRANSCRIPT_VERSION
-
-TRANSCRIPT_VERSION = NUMBER
-
-ALLELE = 'c.' CDNA_ALLELE    # cDNA
-       | 'g.' GENOMIC_ALLELE # genomic
-       | 'm.' MIT_ALLELE     # mitochondrial sequence
-       | 'n.' NC_ALLELE      # non-coding RNA reference sequence
-       | 'r.' RNA_ALLELE     # RNA sequence (like r.76a>u)
-       | 'p.' PROTEIN_ALLELE # protein sequence (like  p.Lys76Asn)
-
-NC_ALLELE =
-RNA_ALLELE =
-CDNA_ALLELE = CDNA_COORD SINGLE_BASE_CHANGE
-            | CDNA_COORD_RANGE MULTI_BASE_CHANGE
-
-GENOMIC_ALLELE =
-MIT_ALLELE = COORD SINGLE_BASE_CHANGE
-           | COORD_RANGE MULTI_BASE_CHANGE
-
-SINGLE_BASE_CHANGE = CDNA_ALLELE = CDNA_COORD BASE '='        # no change
-                   | CDNA_COORD BASE '>' BASE                 # substitution
-                   | CDNA_COORD 'ins' BASE                    # 1bp insertion
-                   | CDNA_COORD 'del' BASE                    # 1bp deletion
-                   | CDNA_COORD 'dup' BASE                    # 1bp duplication
-                   | CDNA_COORD 'ins'                         # 1bp insertion
-                   | CDNA_COORD 'del'                         # 1bp deletion
-                   | CDNA_COORD 'dup'                         # 1bp duplication
-                   | CDNA_COORD 'del' BASE 'ins' BASE         # 1bp indel
-                   | CDNA_COORD 'delins' BASE                 # 1bp indel
-
-MULTI_BASE_CHANGE = COORD_RANGE 'del' BASES             # deletion
-                  | COORD_RANGE 'ins' BASES             # insertion
-                  | COORD_RANGE 'dup' BASES             # duplication
-                  | COORD_RANGE 'del'                   # deletion
-                  | COORD_RANGE 'dup'                   # duplication
-                  | COORD_RANGE 'del' BASES 'ins' BASES # indel
-                  | COORD_RANGE 'delins' BASES          # indel
-
-
-AMINO1 = [GAVLIMFWPSTCYNQDEKRH]
-
-AMINO3 = 'Gly' | 'Ala' | 'Val' | 'Leu' | 'Ile' | 'Met' | 'Phe' | 'Trp' | 'Pro'
-       | 'Ser' | 'Thr' | 'Cys' | 'Tyr' | 'Asn' | 'Gln' | 'Asp' | 'Glu' | 'Lys'
-       | 'Arg' | 'His'
-
-PROTEIN_ALLELE = AMINO3 COORD '='               # no peptide change
-               | AMINO1 COORD '='               # no peptide change
-               | AMINO3 COORD AMINO3 PEP_EXTRA  # peptide change
-               | AMINO1 COORD AMINO1 PEP_EXTRA  # peptide change
-               | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA        # indel
-               | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA        # indel
-               | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA AMINO3 # indel
-               | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA AMINO1 # indel
-
-# A genomic range:
-COORD_RANGE = COORD '_' COORD
-
-# A cDNA range:
-CDNA_COORD_RANGE = CDNA_COORD '_' CDNA_COORD
-
-# A cDNA coordinate:
-CDNA_COORD = COORD_PREFIX COORD
-           | COORD_PREFIX COORD OFFSET_PREFIX OFFSET
-COORD_PREFIX = '' | '-' | '*'
-COORD = NUMBER
-OFFSET_PREFIX = '-' | '+'
-OFFSET = NUMBER
-
-# Primatives:
-NUMBER = \d+
-BASE = [ACGT]
-BASES = BASE+
-
 """
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
 import re
 
-from .variants import justify_indel
-from .variants import normalize_variant
-from .variants import revcomp
-
-
-CHROM_PREFIX = 'chr'
-CDNA_START_CODON = 'cdna_start'
-CDNA_STOP_CODON = 'cdna_stop'
-
-
-class HGVSRegex(object):
-    """
-    All regular expression for HGVS names.
-    """
-
-    # DNA syntax
-    # http://www.hgvs.org/mutnomen/standards.html#nucleotide
-    BASE = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]|\d+"
-    BASES = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]+|\d+"
-    DNA_REF = "(?P<ref>" + BASES + ")"
-    DNA_ALT = "(?P<alt>" + BASES + ")"
-
-    # Mutation types
-    EQUAL = "(?P<mutation_type>=)"
-    SUB = "(?P<mutation_type>>)"
-    INS = "(?P<mutation_type>ins)"
-    DEL = "(?P<mutation_type>del)"
-    DUP = "(?P<mutation_type>dup)"
-
-    # Simple coordinate syntax
-    COORD_START = "(?P<start>\d+)"
-    COORD_END = "(?P<end>\d+)"
-    COORD_RANGE = COORD_START + "_" + COORD_END
-
-    # cDNA coordinate syntax
-    CDNA_COORD = ("(?P<coord_prefix>|-|\*)(?P<coord>\d+)"
-                  "((?P<offset_prefix>-|\+)(?P<offset>\d+))?")
-    CDNA_START = ("(?P<start>(?P<start_coord_prefix>|-|\*)(?P<start_coord>\d+)"
-                  "((?P<start_offset_prefix>-|\+)(?P<start_offset>\d+))?)")
-    CDNA_END = (r"(?P<end>(?P<end_coord_prefix>|-|\*)(?P<end_coord>\d+)"
-                "((?P<end_offset_prefix>-|\+)(?P<end_offset>\d+))?)")
-    CDNA_RANGE = CDNA_START + "_" + CDNA_END
-
-    # cDNA allele syntax
-    CDNA_ALLELE = [
-        # No change
-        CDNA_START + DNA_REF + EQUAL,
-
-        # Substitution
-        CDNA_START + DNA_REF + SUB + DNA_ALT,
-
-        # 1bp insertion, deletion, duplication
-        CDNA_START + INS + DNA_ALT,
-        CDNA_START + DEL + DNA_REF,
-        CDNA_START + DUP + DNA_REF,
-        CDNA_START + DEL,
-        CDNA_START + DUP,
-
-        # Insertion, deletion, duplication
-        CDNA_RANGE + INS + DNA_ALT,
-        CDNA_RANGE + DEL + DNA_REF,
-        CDNA_RANGE + DUP + DNA_REF,
-        CDNA_RANGE + DEL,
-        CDNA_RANGE + DUP,
-
-        # Indels
-        "(?P<delins>" + CDNA_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
-        "(?P<delins>" + CDNA_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
-        "(?P<delins>" + CDNA_START + 'delins' + DNA_ALT + ")",
-        "(?P<delins>" + CDNA_RANGE + 'delins' + DNA_ALT + ")",
-    ]
-
-    CDNA_ALLELE_REGEXES = [re.compile("^" + regex + "$")
-                           for regex in CDNA_ALLELE]
-
-    # Peptide syntax
-    PEP = "([A-Z]([a-z]{2}))+"
-    PEP_REF = "(?P<ref>" + PEP + ")"
-    PEP_REF2 = "(?P<ref2>" + PEP + ")"
-    PEP_ALT = "(?P<alt>" + PEP + ")"
-
-    PEP_EXTRA = "(?P<extra>(|=|\?)(|fs))"
-
-    # Peptide allele syntax
-    PEP_ALLELE = [
-        # No peptide change
-        # Example: Glu1161=
-        PEP_REF + COORD_START + PEP_EXTRA,
-
-        # Peptide change
-        # Example: Glu1161Ser
-        PEP_REF + COORD_START + PEP_ALT + PEP_EXTRA,
-
-        # Peptide indel
-        # Example: Glu1161_Ser1164?fs
-        "(?P<delins>" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END +
-        PEP_EXTRA + ")",
-        "(?P<delins>" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END +
-        PEP_ALT + PEP_EXTRA + ")",
-    ]
-
-    PEP_ALLELE_REGEXES = [re.compile("^" + regex + "$")
-                          for regex in PEP_ALLELE]
-
-    # Genomic allele syntax
-    GENOMIC_ALLELE = [
-        # No change
-        COORD_START + DNA_REF + EQUAL,
-
-        # Substitution
-        COORD_START + DNA_REF + SUB + DNA_ALT,
-
-        # 1bp insertion, deletion, duplication
-        COORD_START + INS + DNA_ALT,
-        COORD_START + DEL + DNA_REF,
-        COORD_START + DUP + DNA_REF,
-        COORD_START + DEL,
-        COORD_START + DUP,
-
-        # Insertion, deletion, duplication
-        COORD_RANGE + INS + DNA_ALT,
-        COORD_RANGE + DEL + DNA_REF,
-        COORD_RANGE + DUP + DNA_REF,
-        COORD_RANGE + DEL,
-        COORD_RANGE + DUP,
-
-        # Indels
-        "(?P<delins>" + COORD_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
-        "(?P<delins>" + COORD_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
-        "(?P<delins>" + COORD_START + 'delins' + DNA_ALT + ")",
-        "(?P<delins>" + COORD_RANGE + 'delins' + DNA_ALT + ")",
-    ]
-
-    GENOMIC_ALLELE_REGEXES = [re.compile("^" + regex + "$")
-                              for regex in GENOMIC_ALLELE]
-
-
-class ChromosomeSubset(object):
-    """
-    Allow direct access to a subset of the chromosome.
-    """
-
-    def __init__(self, name, genome=None):
-        self.name = name
-        self.genome = genome
-
-    def __getitem__(self, key):
-        """Return sequence from region [start, end)
-
-        Coordinates are 0-based, end-exclusive."""
-        if isinstance(key, slice):
-            start, end = (key.start, key.stop)
-            start -= self.genome.start
-            end -= self.genome.start
-            return self.genome.genome[self.genome.seqid][start:end]
-        else:
-            raise TypeError('Expected a slice object but '
-                            'received a {0}.'.format(type(key)))
-
-    def __repr__(self):
-        return 'ChromosomeSubset("%s")' % self.name
-
-
-class GenomeSubset(object):
-    """
-    Allow the direct access of a subset of the genome.
-    """
-
-    def __init__(self, genome, chrom, start, end, seqid):
-        self.genome = genome
-        self.chrom = chrom
-        self.start = start
-        self.end = end
-        self.seqid = seqid
-        self._chroms = {}
-
-    def __getitem__(self, chrom):
-        """Return a chromosome by its name."""
-        if chrom in self._chroms:
-            return self._chroms[chrom]
-        else:
-            chromosome = ChromosomeSubset(chrom, self)
-            self._chroms[chrom] = chromosome
-            return chromosome
-
-
-class CDNACoord(object):
-    """
-    A HGVS cDNA-based coordinate.
-
-    A cDNA coordinate can take one of these forms:
-
-    N = nucleotide N in protein coding sequence (e.g. 11A>G)
-
-    -N = nucleotide N 5' of the ATG translation initiation codon (e.g. -4A>G)
-         NOTE: so located in the 5'UTR or 5' of the transcription initiation
-         site (upstream of the gene, incl. promoter)
-
-    *N = nucleotide N 3' of the translation stop codon (e.g. *6A>G)
-         NOTE: so located in the 3'UTR or 3' of the polyA-addition site
-         (including downstream of the gene)
-
-    N+M = nucleotide M in the intron after (3' of) position N in the coding DNA
-          reference sequence (e.g. 30+4A>G)
-
-    N-M = nucleotide M in the intron before (5' of) position N in the coding
-          DNA reference sequence (e.g. 301-2A>G)
-
-    -N+M / -N-M = nucleotide in an intron in the 5'UTR (e.g. -45+4A>G)
-
-    *N+M / *N-M = nucleotide in an intron in the 3'UTR (e.g. *212-2A>G)
-    """
-
-    def __init__(self, coord=0, offset=0, landmark=CDNA_START_CODON,
-                 string=''):
-        """
-        coord: main coordinate along cDNA on the same strand as the transcript
-
-        offset: an additional genomic offset from the main coordinate.  This
-                allows referencing non-coding (e.g. intronic) positions.
-                Offset is also interpreted on the coding strand.
-
-        landmark: ('cdna_start', 'cdna_stop') indicating that 'coord'
-                  is relative to one of these landmarks.
-
-        string: a coordinate from an HGVS name.  If given coord, offset, and
-                landmark should not be specified.
-        """
-
-        if string:
-            if coord != 0 or offset != 0 or landmark != CDNA_START_CODON:
-                raise ValueError("coord, offset, and landmark should not "
-                                 "be given with string argument")
-
-            self.parse(string)
-        else:
-            self.coord = coord
-            self.offset = offset
-            self.landmark = landmark
-
-    def parse(self, coord_text):
-        """
-        Parse a HGVS formatted cDNA coordinate.
-        """
-
-        match = re.match(r"(|-|\*)(\d+)((-|\+)(\d+))?", coord_text)
-        if not match:
-            raise ValueError("unknown coordinate format '%s'" % coord_text)
-        coord_prefix, coord, _, offset_prefix, offset = match.groups()
-
-        self.coord = int(coord)
-        self.offset = int(offset) if offset else 0
-
-        if offset_prefix == '-':
-            self.offset *= -1
-        elif offset_prefix == '+' or offset is None:
-            pass
-        else:
-            raise ValueError("unknown offset_prefix '%s'" % offset_prefix)
-
-        if coord_prefix == '':
-            self.landmark = CDNA_START_CODON
-        elif coord_prefix == "-":
-            self.coord *= -1
-            self.landmark = CDNA_START_CODON
-        elif coord_prefix == '*':
-            self.landmark = CDNA_STOP_CODON
-        else:
-            raise ValueError("unknown coord_prefix '%s'" % coord_prefix)
-        return self
-
-    def __str__(self):
-        """
-        Return a formatted cDNA coordinate
-        """
-        if self.landmark == CDNA_STOP_CODON:
-            coord_prefix = '*'
-        else:
-            coord_prefix = ''
-
-        if self.offset < 0:
-            offset = '%d' % self.offset
-        elif self.offset > 0:
-            offset = '+%d' % self.offset
-        else:
-            offset = ''
-
-        return '%s%d%s' % (coord_prefix, self.coord, offset)
-
-    def __eq__(self, other):
-        """Equality operator."""
-        return ((self.coord, self.offset, self.landmark) ==
-                (other.coord, other.offset, other.landmark))
-
-    def __repr__(self):
-        """
-        Returns a string representation of a cDNA coordinate.
-        """
-        if self.landmark != CDNA_START_CODON:
-            return "CDNACoord(%d, %d, '%s')" % (
-                self.coord, self.offset, self.landmark)
-        else:
-            return "CDNACoord(%d, %d)" % (self.coord, self.offset)
-
-
-# The RefSeq standard for naming contigs/transcripts/proteins:
-# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly  # nopep8
-REFSEQ_PREFIXES = [
-    ('AC_', 'genomic',
-     'Complete genomic molecule, usually alternate assembly'),
-    ('NC_', 'genomic',
-     'Complete genomic molecule, usually reference assembly'),
-    ('NG_', 'genomic', 'Incomplete genomic region'),
-    ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),
-    ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),
-    ('NS_', 'genomic', 'Environmental sequence'),
-    ('NZ_', 'genomic', 'Unfinished WGS'),
-    ('NM_', 'mRNA', ''),
-    ('NR_', 'RNA', ''),
-    ('XM_', 'mRNA', 'Predicted model'),
-    ('XR_', 'RNA', 'Predicted model'),
-    ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),
-    ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),
-    ('YP_', 'Protein', ''),
-    ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),
-    ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),
-]
-
-REFSEQ_PREFIX_LOOKUP = dict(
-    (prefix, (kind, description))
-    for prefix, kind, description in REFSEQ_PREFIXES
-)
-
-
-def get_refseq_type(name):
-    """
-    Return the RefSeq type for a refseq name.
-    """
-    prefix = name[:3]
-    return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0]
-
-
-def get_exons(transcript):
-    """Yield exons in coding order."""
-    transcript_strand = transcript.tx_position.is_forward_strand
-    if hasattr(transcript.exons, 'select_related'):
-        exons = list(transcript.exons.select_related('tx_position'))
-    else:
-        exons = list(transcript.exons)
-    exons.sort(key=lambda exon: exon.tx_position.chrom_start)
-    if not transcript_strand:
-        exons.reverse()
-    return exons
-
-
-def get_coding_exons(transcript):
-    """Yield non-empty coding exonic regions in coding order."""
-    for exon in get_exons(transcript):
-        region = exon.get_as_interval(coding_only=True)
-        if region:
-            yield region
-
-
-def get_utr5p_size(transcript):
-    """Return the size of the 5prime UTR of a transcript."""
-
-    transcript_strand = transcript.tx_position.is_forward_strand
-    exons = get_exons(transcript)
-
-    # Find the exon containing the start codon.
-    start_codon = (transcript.cds_position.chrom_start if transcript_strand
-                   else transcript.cds_position.chrom_stop - 1)
-    cdna_len = 0
-    for exon in exons:
-        exon_start = exon.tx_position.chrom_start
-        exon_end = exon.tx_position.chrom_stop
-        if exon_start <= start_codon < exon_end:
-            break
-        cdna_len += exon_end - exon_start
-    else:
-        raise ValueError("transcript contains no exons")
-
-    if transcript_strand:
-        return cdna_len + (start_codon - exon_start)
-    else:
-        return cdna_len + (exon_end - start_codon - 1)
-
-
-def find_stop_codon(exons, cds_position):
-    """Return the position along the cDNA of the base after the stop codon."""
-    if cds_position.is_forward_strand:
-        stop_pos = cds_position.chrom_stop
-    else:
-        stop_pos = cds_position.chrom_start
-    cdna_pos = 0
-    for exon in exons:
-        exon_start = exon.tx_position.chrom_start
-        exon_stop = exon.tx_position.chrom_stop
-
-        if exon_start <= stop_pos <= exon_stop:
-            if cds_position.is_forward_strand:
-                return cdna_pos + stop_pos - exon_start
-            else:
-                return cdna_pos + exon_stop - stop_pos
-        else:
-            cdna_pos += exon_stop - exon_start
-    raise ValueError('Stop codon is not in any of the exons')
+from .models.cdna import CDNACoord
+from .models.genome import GenomeSubset
+from .models.hgvs_name import HGVSName
+from .models.variants import justify_indel, normalize_variant, revcomp
 
 
 def get_genomic_sequence(genome, chrom, start, end):
@@ -518,141 +31,6 @@ def get_genomic_sequence(genome, chrom, start, end):
         return str(genome[str(chrom)][start - 1:end]).upper()
 
 
-def cdna_to_genomic_coord(transcript, coord):
-    """Convert a HGVS cDNA coordinate to a genomic coordinate."""
-    transcript_strand = transcript.tx_position.is_forward_strand
-    exons = get_exons(transcript)
-    utr5p = (get_utr5p_size(transcript)
-             if transcript.is_coding else 0)
-
-    # compute starting position along spliced transcript.
-    if coord.landmark == CDNA_START_CODON:
-        if coord.coord > 0:
-            pos = utr5p + coord.coord
-        else:
-            pos = utr5p + coord.coord + 1
-    elif coord.landmark == CDNA_STOP_CODON:
-        if coord.coord < 0:
-            raise ValueError('CDNACoord cannot have a negative coord and '
-                             'landmark CDNA_STOP_CODON')
-        pos = find_stop_codon(exons, transcript.cds_position) + coord.coord
-    else:
-        raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark)
-
-    # 5' flanking sequence.
-    if pos < 1:
-        if transcript_strand:
-            return transcript.tx_position.chrom_start + pos
-        else:
-            return transcript.tx_position.chrom_stop - pos + 1
-
-    # Walk along transcript until we find an exon that contains pos.
-    cdna_start = 1
-    cdna_end = 1
-    for exon in exons:
-        exon_start = exon.tx_position.chrom_start + 1
-        exon_end = exon.tx_position.chrom_stop
-        cdna_end = cdna_start + (exon_end - exon_start)
-        if cdna_start <= pos <= cdna_end:
-            break
-        cdna_start = cdna_end + 1
-    else:
-        # 3' flanking sequence
-        if transcript_strand:
-            return transcript.cds_position.chrom_stop + coord.coord
-        else:
-            return transcript.cds_position.chrom_start + 1 - coord.coord
-
-    # Compute genomic coordinate using offset.
-    if transcript_strand:
-        # Plus strand.
-        return exon_start + (pos - cdna_start) + coord.offset
-    else:
-        # Minus strand.
-        return exon_end - (pos - cdna_start) - coord.offset
-
-
-def genomic_to_cdna_coord(transcript, genomic_coord):
-    """Convert a genomic coordinate to a cDNA coordinate and offset.
-    """
-    exons = [exon.get_as_interval()
-             for exon in get_exons(transcript)]
-
-    if len(exons) == 0:
-        return None
-
-    strand = transcript.strand
-
-    if strand == "+":
-        exons.sort(key=lambda exon: exon.chrom_start)
-    else:
-        exons.sort(key=lambda exon: -exon.chrom_end)
-
-    distances = [exon.distance(genomic_coord)
-                 for exon in exons]
-    min_distance_to_exon = min(map(abs, distances))
-
-    coding_offset = 0
-    for exon in exons:
-        exon_length = exon.chrom_end - exon.chrom_start
-        distance = exon.distance(genomic_coord)
-        if abs(distance) == min_distance_to_exon:
-            if strand == "+":
-                exon_start_cds_offset = coding_offset + 1
-                exon_end_cds_offset = coding_offset + exon_length
-            else:
-                exon_start_cds_offset = coding_offset + exon_length
-                exon_end_cds_offset = coding_offset + 1
-            # This is the exon we want to annotate against.
-            if distance == 0:
-                # Inside the exon.
-                if strand == "+":
-                    coord = (exon_start_cds_offset +
-                             (genomic_coord -
-                              (exon.chrom_start + 1)))
-                else:
-                    coord = (exon_end_cds_offset +
-                             (exon.chrom_end -
-                              genomic_coord))
-                cdna_coord = CDNACoord(coord, 0)
-            else:
-                # Outside the exon.
-                if distance > 0:
-                    nearest_exonic = exon_start_cds_offset
-                else:
-                    nearest_exonic = exon_end_cds_offset
-                if strand == "+":
-                    distance *= -1
-
-                # If outside transcript, don't use offset.
-                if (genomic_coord < transcript.tx_position.chrom_start + 1 or
-                        genomic_coord > transcript.tx_position.chrom_stop):
-                    nearest_exonic += distance
-                    distance = 0
-                cdna_coord = CDNACoord(nearest_exonic, distance)
-            break
-        coding_offset += exon_length
-
-    # Adjust coordinates for coding transcript.
-    if transcript.is_coding:
-        # Detect if position before start codon.
-        utr5p = get_utr5p_size(transcript) if transcript.is_coding else 0
-        cdna_coord.coord -= utr5p
-        if cdna_coord.coord <= 0:
-            cdna_coord.coord -= 1
-        else:
-            # Detect if position is after stop_codon.
-            exons = get_exons(transcript)
-            stop_codon = find_stop_codon(exons, transcript.cds_position)
-            stop_codon -= utr5p
-            if (cdna_coord.coord > stop_codon or
-                    cdna_coord.coord == stop_codon and cdna_coord.offset > 0):
-                cdna_coord.coord -= stop_codon
-                cdna_coord.landmark = CDNA_STOP_CODON
-
-    return cdna_coord
-
-
 def get_allele(hgvs, genome, transcript=None):
     """Get an allele from a HGVSName, a genome, and a transcript."""
     chrom, start, end = hgvs.get_ref_coords(transcript)
@@ -706,549 +84,6 @@ def matches_ref_allele(hgvs, genome, transcript=None):
     return genome_ref == ref
 
 
-class InvalidHGVSName(ValueError):
-    def __init__(self, name='', part='name', reason=''):
-        if name:
-            message = 'Invalid HGVS %s "%s"' % (part, name)
-        else:
-            message = 'Invalid HGVS %s' % part
-        if reason:
-            message += ': ' + reason
-        super(InvalidHGVSName, self).__init__(message)
-
-        self.name = name
-        self.part = part
-        self.reason = reason
-
-
-class HGVSName(object):
-    """
-    Represents a HGVS variant name.
-    """
-
-    def __init__(self, name='', prefix='', chrom='', transcript='', gene='',
-                 kind='', mutation_type=None, start=0, end=0, ref_allele='',
-                 ref2_allele='', alt_allele='',
-                 cdna_start=None, cdna_end=None, pep_extra=''):
-
-        # Full HGVS name.
-        self.name = name
-
-        # Name parts.
-        self.prefix = prefix
-        self.chrom = chrom
-        self.transcript = transcript
-        self.gene = gene
-        self.kind = kind
-        self.mutation_type = mutation_type
-        self.start = start
-        self.end = end
-        self.ref_allele = ref_allele    # reference allele
-        self.ref2_allele = ref2_allele  # reference allele at end of pep indel
-        self.alt_allele = alt_allele    # alternate allele
-
-        # cDNA-specific fields
-        self.cdna_start = cdna_start if cdna_start else CDNACoord()
-        self.cdna_end = cdna_end if cdna_end else CDNACoord()
-
-        # Protein-specific fields
-        self.pep_extra = pep_extra
-
-        if name:
-            self.parse(name)
-
-    def parse(self, name):
-        """Parse a HGVS name."""
-        # Does HGVS name have transcript/gene prefix?
-        if ':' in name:
-            prefix, allele = name.split(':', 1)
-        else:
-            prefix = ''
-            allele = name
-
-        self.name = name
-
-        # Parse prefix and allele.
-        self.parse_allele(allele)
-        self.parse_prefix(prefix, self.kind)
-        self._validate()
-
-    def parse_prefix(self, prefix, kind):
-        """
-        Parse a HGVS prefix (gene/transcript/chromosome).
-
-        Some examples of full hgvs names with transcript include:
-          NM_007294.3:c.2207A>C
-          NM_007294.3(BRCA1):c.2207A>C
-          BRCA1{NM_007294.3}:c.2207A>C
-        """
-
-        self.prefix = prefix
-
-        # No prefix.
-        if prefix == '':
-            self.chrom = ''
-            self.transcript = ''
-            self.gene = ''
-            return
-
-        # Transcript and gene given with parens.
-        # example: NM_007294.3(BRCA1):c.2207A>C
-        match = re.match("^(?P<transcript>[^(]+)\((?P<gene>[^)]+)\)$", prefix)
-        if match:
-            self.transcript = match.group('transcript')
-            self.gene = match.group('gene')
-            return
-
-        # Transcript and gene given with braces.
-        # example: BRCA1{NM_007294.3}:c.2207A>C
-        match = re.match("^(?P<gene>[^{]+)\{(?P<transcript>[^}]+)\}$", prefix)
-        if match:
-            self.transcript = match.group('transcript')
-            self.gene = match.group('gene')
-            return
-
-        # Determine using Ensembl type.
-        if prefix.startswith('ENST'):
-            self.transcript = prefix
-            return
-
-        # Determine using refseq type.
-        refseq_type = get_refseq_type(prefix)
-        if refseq_type in ('mRNA', 'RNA'):
-            self.transcript = prefix
-            return
-
-        # Determine using refseq type.
-        if prefix.startswith(CHROM_PREFIX) or refseq_type == 'genomic':
-            self.chrom = prefix
-            return
-
-        # Assume gene name.
-        self.gene = prefix
-
-    def parse_allele(self, allele):
-        """
-        Parse a HGVS allele description.
-
-        Some examples include:
-          cDNA substitution: c.101A>C,
-          cDNA indel: c.3428delCinsTA, c.1000_1003delATG, c.1000_1001insATG
-          No protein change: p.Glu1161=
-          Protein change: p.Glu1161Ser
-          Protein frameshift: p.Glu1161_Ser1164?fs
-          Genomic substitution: g.1000100A>T
-          Genomic indel: g.1000100_1000102delATG
-        """
-        if '.' not in allele:
-            raise InvalidHGVSName(allele, 'allele',
-                                  'expected kind "c.", "p.", "g.", etc')
-
-        # Determine HGVS name kind.
-        kind, details = allele.split('.', 1)
-        self.kind = kind
-        self.mutation_type = None
-
-        if kind in ("c", 'n'):
-            self.parse_cdna(details)
-            if kind == 'n':  # Ensure no 3'UTR or 5'UTR coords in non-coding
-                if self.cdna_start.coord < 0:
-                    raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain negative (5'UTR) coordinates")
-                if self.cdna_start.landmark == 'cdna_stop' or self.cdna_end and self.cdna_end.landmark == 'cdna_stop':
-                    raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain '*' (3'UTR) coordinates")
-        elif kind == "p":
-            self.parse_protein(details)
-        elif kind == "g":
-            self.parse_genome(details)
-        else:
-            raise NotImplementedError("unknown kind: %s" % allele)
-
-    def parse_cdna(self, details):
-        """
-        Parse a HGVS cDNA name.
-
-        Some examples include:
-          Substitution: 101A>C,
-          Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG
-        """
-        for regex in HGVSRegex.CDNA_ALLELE_REGEXES:
-            match = re.match(regex, details)
-            if match:
-                groups = match.groupdict()
-
-                # Parse mutation type.
-                if groups.get('delins'):
-                    self.mutation_type = 'delins'
-                else:
-                    self.mutation_type = groups['mutation_type']
-
-                # Parse coordinates.
-                self.cdna_start = CDNACoord(string=groups.get('start'))
-                if groups.get('end'):
-                    self.cdna_end = CDNACoord(string=groups.get('end'))
-                else:
-                    self.cdna_end = CDNACoord(string=groups.get('start'))
-
-                # Parse alleles.
-                self.ref_allele = groups.get('ref', '')
-                self.alt_allele = groups.get('alt', '')
-
-                # Convert numerical allelles.
-                if self.ref_allele.isdigit():
-                    self.ref_allele = "N" * int(self.ref_allele)
-                if self.alt_allele.isdigit():
-                    self.alt_allele = "N" * int(self.alt_allele)
-
-                # Convert duplication alleles.
-                if self.mutation_type == "dup":
-                    self.alt_allele = self.ref_allele * 2
-
-                # Convert no match alleles.
-                if self.mutation_type == "=":
-                    self.alt_allele = self.ref_allele
-                return
-
-        raise InvalidHGVSName(details, 'cDNA allele')
-
-    def parse_protein(self, details):
-        """
-        Parse a HGVS protein name.
-
-        Some examples include:
-          No change: Glu1161=
-          Change: Glu1161Ser
-          Frameshift: Glu1161_Ser1164?fs
-        """
-        for regex in HGVSRegex.PEP_ALLELE_REGEXES:
-            match = re.match(regex, details)
-            if match:
-                groups = match.groupdict()
-
-                # Parse mutation type.
-                if groups.get('delins'):
-                    self.mutation_type = 'delins'
-                else:
-                    self.mutation_type = '>'
-
-                # Parse coordinates.
-                self.start = int(groups.get('start'))
-                if groups.get('end'):
-                    self.end = int(groups.get('end'))
-                else:
-                    self.end = self.start
-
-                # Parse alleles.
-                self.ref_allele = groups.get('ref', '')
-                if groups.get('ref2'):
-                    self.ref2_allele = groups.get('ref2')
-                    self.alt_allele = groups.get('alt', '')
-                else:
-                    # If alt is not given, assume matching with ref
-                    self.ref2_allele = self.ref_allele
-                    self.alt_allele = groups.get(
-                        'alt', self.ref_allele)
-
-                self.pep_extra = groups.get('extra')
-                return
-
-        raise InvalidHGVSName(details, 'protein allele')
-
-    def parse_genome(self, details):
-        """
-        Parse a HGVS genomic name.
-
-        Som examples include:
-          Substitution: 1000100A>T
-          Indel: 1000100_1000102delATG
-        """
-
-        for regex in HGVSRegex.GENOMIC_ALLELE_REGEXES:
-            match = re.match(regex, details)
-            if match:
-                groups = match.groupdict()
-
-                # Parse mutation type.
-                if groups.get('delins'):
-                    self.mutation_type = 'delins'
-                else:
-                    self.mutation_type = groups['mutation_type']
-
-                # Parse coordinates.
-                self.start = int(groups.get('start'))
-                if groups.get('end'):
-                    self.end = int(groups.get('end'))
-                else:
-                    self.end = self.start
-
-                # Parse alleles.
-                self.ref_allele = groups.get('ref', '')
-                self.alt_allele = groups.get('alt', '')
-
-                # Convert numerical allelles.
-                if self.ref_allele.isdigit():
-                    self.ref_allele = "N" * int(self.ref_allele)
-                if self.alt_allele.isdigit():
-                    self.alt_allele = "N" * int(self.alt_allele)
-
-                # Convert duplication alleles.
-                if self.mutation_type == "dup":
-                    self.alt_allele = self.ref_allele * 2
-
-                # Convert no match alleles.
-                if self.mutation_type == "=":
-                    self.alt_allele = self.ref_allele
-                return
-
-        raise InvalidHGVSName(details, 'genomic allele')
-
-    def _validate(self):
-        """
-        Check for internal inconsistencies in representation
-        """
-        if self.start > self.end:
-            raise InvalidHGVSName(reason="Coordinates are nonincreasing")
-
-    def __repr__(self):
-        try:
-            return "HGVSName('%s')" % self.format()
-        except NotImplementedError:
-            return "HGVSName('%s')" % self.name
-
-    def __unicode__(self):
-        return self.format()
-
-    def format(self, use_prefix=True, use_gene=True, use_counsyl=False):
-        """Generate a HGVS name as a string."""
-
-        if self.kind == 'c':
-            allele = 'c.' + self.format_cdna()
-        elif self.kind == 'n':
-            allele = 'n.' + self.format_cdna()
-        elif self.kind == 'p':
-            allele = 'p.' + self.format_protein()
-        elif self.kind == 'g':
-            allele = 'g.' + self.format_genome()
-        else:
-            raise NotImplementedError("not implemented: '%s'" % self.kind)
-
-        prefix = self.format_prefix(use_gene=use_gene) if use_prefix else ''
-
-        if prefix:
-            return prefix + ':' + allele
-        else:
-            return allele
-
-    def format_prefix(self, use_gene=True):
-        """
-        Generate HGVS trancript/gene prefix.
-
-        Some examples of full hgvs names with transcript include:
-          NM_007294.3:c.2207A>C
-          NM_007294.3(BRCA1):c.2207A>C
-        """
-
-        if self.kind == 'g':
-            if self.chrom:
-                return self.chrom
-
-        if self.transcript:
-            if use_gene and self.gene:
-                return '%s(%s)' % (self.transcript, self.gene)
-            else:
-                return self.transcript
-        else:
-            if use_gene:
-                return self.gene
-            else:
-                return ''
-
-    def format_cdna_coords(self):
-        """
-        Generate HGVS cDNA coordinates string.
-        """
-        # Format coordinates.
-        if self.cdna_start == self.cdna_end:
-            return str(self.cdna_start)
-        else:
-            return "%s_%s" % (self.cdna_start, self.cdna_end)
-
-    def format_dna_allele(self):
-        """
-        Generate HGVS DNA allele.
-        """
-        if self.mutation_type == '=':
-            # No change.
-            # example: 101A=
-            return self.ref_allele + '='
-
-        if self.mutation_type == '>':
-            # SNP.
-            # example: 101A>C
-            return self.ref_allele + '>' + self.alt_allele
-
-        elif self.mutation_type == 'delins':
-            # Indel.
-            # example: 112_117delAGGTCAinsTG, 112_117delinsTG
-            return 'del' + self.ref_allele + 'ins' + self.alt_allele
-
-        elif self.mutation_type in ('del', 'dup'):
-            # Delete, duplication.
-            # example: 1000_1003delATG, 1000_1003dupATG
-            return self.mutation_type + self.ref_allele
-
-        elif self.mutation_type == 'ins':
-            # Insert.
-            # example: 1000_1001insATG
-            return self.mutation_type + self.alt_allele
-
-        elif self.mutation_type == 'inv':
-            return self.mutation_type
-
-        else:
-            raise AssertionError(
-                "unknown mutation type: '%s'" % self.mutation_type)
-
-    def format_cdna(self):
-        """
-        Generate HGVS cDNA allele.
-
-        Some examples include:
-          Substitution: 101A>C,
-          Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG
-        """
-        return self.format_cdna_coords() + self.format_dna_allele()
-
-    def format_protein(self):
-        """
-        Generate HGVS protein name.
-
-        Some examples include:
-          No change: Glu1161=
-          Change: Glu1161Ser
-          Frameshift: Glu1161_Ser1164?fs
-        """
-        if (self.start == self.end and
-                self.ref_allele == self.ref2_allele == self.alt_allele):
-            # Match.
-            # Example: Glu1161=
-            pep_extra = self.pep_extra if self.pep_extra else '='
-            return self.ref_allele + str(self.start) + pep_extra
-
-        elif (self.start == self.end and
-                self.ref_allele == self.ref2_allele and
-                self.ref_allele != self.alt_allele):
-            # Change.
-            # Example: Glu1161Ser
-            return (self.ref_allele + str(self.start) +
-                    self.alt_allele + self.pep_extra)
-
-        elif self.start != self.end:
-            # Range change.
-            # Example: Glu1161_Ser1164?fs
-            return (self.ref_allele + str(self.start) + '_' +
-                    self.ref2_allele + str(self.end) +
-                    self.pep_extra)
-
-        else:
-            raise NotImplementedError('protein name formatting.')
-
-    def format_coords(self):
-        """
-        Generate HGVS cDNA coordinates string.
-        """
-        # Format coordinates.
-        if self.start == self.end:
-            return str(self.start)
-        else:
-            return "%s_%s" % (self.start, self.end)
-
-    def format_genome(self):
-        """
-        Generate HGVS genomic allele.
-
-        Som examples include:
-          Substitution: 1000100A>T
-          Indel: 1000100_1000102delATG
-        """
-        return self.format_coords() + self.format_dna_allele()
-
-    def get_raw_coords(self, transcript=None):
-        """ return genomic coordinates """
-        if self.kind in ('c', 'n'):
-            chrom = transcript.tx_position.chrom
-            start = cdna_to_genomic_coord(transcript, self.cdna_start)
-            end = cdna_to_genomic_coord(transcript, self.cdna_end)
-
-            if not transcript.tx_position.is_forward_strand:
-                if end > start:
-                    raise AssertionError(
-                        "cdna_start cannot be greater than cdna_end")
-                start, end = end, start
-
-            if start > end:
-                raise AssertionError(
-                    "cdna_start cannot be greater than cdna_end")
-        elif self.kind == 'g':
-            chrom = self.chrom
-            start = self.start
-            end = self.end
-        else:
-            raise NotImplementedError(
-                'Coordinates are not available for this kind of HGVS name "%s"'
-                % self.kind)
-
-        return chrom, start, end
-
-    def get_ref_coords(self, transcript=None):
-        """Return genomic coordinates of reference allele."""
-        
-        chrom, start, end = self.get_raw_coords(transcript)
-        
-        if self.mutation_type == "ins":
-            # Inserts have empty interval.
-            if start < end:
-                start += 1
-                end -= 1
-            else:
-                end = start - 1
-
-        elif self.mutation_type == "dup":
-            end = start - 1
-        return chrom, start, end
-
-    def get_vcf_coords(self, transcript=None):
-        """Return genomic coordinates of reference allele in VCF-style."""
-        chrom, start, end = self.get_ref_coords(transcript)
-
-        # Inserts and deletes require left-padding by 1 base
-        if self.mutation_type in ("=", ">"):
-            pass
-        elif self.mutation_type in ("del", "ins", "dup", "delins"):
-            # Indels have left-padding.
-            start -= 1
-        else:
-            raise NotImplementedError("Unknown mutation_type '%s'" %
-                                      self.mutation_type)
-        return chrom, start, end
-
-    def get_ref_alt(self, is_forward_strand=True):
-        """Return reference and alternate alleles."""
-        if self.kind == 'p':
-            raise NotImplementedError(
-                'get_ref_alt is not implemented for protein HGVS names')
-        alleles = [self.ref_allele, self.alt_allele]
-
-        # Represent duplications are inserts.
-        if self.mutation_type == "dup":
-            alleles[0] = ""
-            alleles[1] = alleles[1][:len(alleles[1]) // 2]
-
-        if is_forward_strand:
-            return alleles
-        else:
-            return tuple(map(revcomp, alleles))
-
-
 def hgvs_justify_dup(chrom, offset, ref, alt, genome):
     """
     Determines if allele is a duplication and justifies.
@@ -1467,13 +302,13 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript,
 
         if mutation_type == '>' or (use_counsyl and is_single_base_indel):
             # Use a single coordinate.
-            hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset)
+            hgvs.cdna_start = transcript.genomic_to_cdna_coord(offset)
             hgvs.cdna_end = hgvs.cdna_start
         else:
             if transcript.strand == '-':
                 offset_start, offset_end = offset_end, offset_start
-            hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset_start)
-            hgvs.cdna_end = genomic_to_cdna_coord(transcript, offset_end)
+            hgvs.cdna_start = transcript.genomic_to_cdna_coord(offset_start)
+            hgvs.cdna_end = transcript.genomic_to_cdna_coord(offset_end)
 
     # Populate prefix.
     if transcript:
diff --git a/pyhgvs/models.py b/pyhgvs/models.py
deleted file mode 100644
index 656e67b..0000000
--- a/pyhgvs/models.py
+++ /dev/null
@@ -1,152 +0,0 @@
-"""
-Models for representing genomic elements.
-"""
-
-from __future__ import unicode_literals
-
-from collections import namedtuple
-
-
-class Position(object):
-    """A position in the genome."""
-
-    def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand):
-        self.chrom = chrom
-        self.chrom_start = chrom_start
-        self.chrom_stop = chrom_stop
-        self.is_forward_strand = is_forward_strand
-
-    def __repr__(self):
-        return "<Position %s[%d:%d]>" % (
-            self.chrom, self.chrom_start, self.chrom_stop)
-
-
-class Gene(object):
-    def __init__(self, name):
-        self.name = name
-
-
-class Transcript(object):
-    """
-
-    A gene may have multiple transcripts with different combinations of exons.
-    """
-
-    def __init__(self, name, version, gene, tx_position, cds_position,
-                 is_default=False, exons=None):
-        self.name = name
-        self.version = version
-        self.gene = Gene(gene)
-        self.tx_position = tx_position
-        self.cds_position = cds_position
-        self.is_default = is_default
-        self.exons = exons if exons else []
-
-    @property
-    def full_name(self):
-        if self.version is not None:
-            return '%s.%d' % (self.name, self.version)
-        else:
-            return self.name
-
-    @property
-    def is_coding(self):
-        # Coding transcripts have CDS with non-zero length.
-        return (self.cds_position.chrom_stop -
-                self.cds_position.chrom_start > 0)
-
-    @property
-    def strand(self):
-        return ('+' if self.tx_position.is_forward_strand else '-')
-
-    @property
-    def coding_exons(self):
-        return [exon.get_as_interval(coding_only=True)
-                for exon in self.exons]
-
-
-BED6Interval_base = namedtuple(
-    "BED6Interval_base", (
-        "chrom",
-        "chrom_start",
-        "chrom_end",
-        "name",
-        "score",
-        "strand"))
-
-
-class BED6Interval(BED6Interval_base):
-    def distance(self, offset):
-        """Return the distance to the interval.
-
-        if offset is inside the exon, distance is zero.
-        otherwise, distance is the distance to the nearest edge.
-
-        distance is positive if the exon comes after the offset.
-        distance is negative if the exon comes before the offset.
-        """
-
-        start = self.chrom_start + 1
-        end = self.chrom_end
-
-        if start <= offset <= end:
-            return 0
-
-        start_distance = start - offset
-        end_distance = offset - end
-
-        if abs(start_distance) < abs(end_distance):
-            return start_distance
-        else:
-            return -end_distance
-
-
-class Exon(object):
-    def __init__(self, transcript, tx_position, exon_number):
-        self.transcript = transcript
-        self.tx_position = tx_position
-        self.exon_number = exon_number
-
-    @property
-    def get_exon_name(self):
-        return "%s.%d" % (self.transcript.name, self.exon_number)
-
-    def get_as_interval(self, coding_only=False):
-        """Returns the coding region for this exon as a BED6Interval.
-
-        This function returns a BED6Interval objects containing  position
-        information for this exon. This may be used as input for
-        pybedtools.create_interval_from_list() after casting chrom_start
-        and chrom_end as strings.
-
-        coding_only: only include exons in the coding region
-
-        """
-
-        exon_start = self.tx_position.chrom_start
-        exon_stop = self.tx_position.chrom_stop
-
-        # Get only exon coding region if requested
-        if coding_only:
-            if (exon_stop <= self.transcript.cds_position.chrom_start or
-                    exon_start >= self.transcript.cds_position.chrom_stop):
-                return None
-            exon_start = max(exon_start,
-                             self.transcript.cds_position.chrom_start)
-            exon_stop = min(
-                max(exon_stop, self.transcript.cds_position.chrom_start),
-                self.transcript.cds_position.chrom_stop)
-
-        return BED6Interval(
-            self.tx_position.chrom,
-            exon_start,
-            exon_stop,
-            self.get_exon_name,
-            '.',
-            self.strand,
-        )
-
-    @property
-    def strand(self):
-        strand = '+' if self.tx_position.is_forward_strand else '-'
-        return strand
diff --git a/pyhgvs/models/__init__.py b/pyhgvs/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pyhgvs/models/cdna.py b/pyhgvs/models/cdna.py
new file mode 100644
index 0000000..bc1280f
--- /dev/null
+++ b/pyhgvs/models/cdna.py
@@ -0,0 +1,123 @@
+import re
+
+CDNA_START_CODON = 'cdna_start'
+CDNA_STOP_CODON = 'cdna_stop'
+
+
+class CDNACoord(object):
+    """
+    A HGVS cDNA-based coordinate.
+
+    A cDNA coordinate can take one of these forms:
+
+    N = nucleotide N in protein coding sequence (e.g. 11A>G)
+
+    -N = nucleotide N 5' of the ATG translation initiation codon (e.g. -4A>G)
+         NOTE: so located in the 5'UTR or 5' of the transcription initiation
+         site (upstream of the gene, incl. promoter)
+
+    *N = nucleotide N 3' of the translation stop codon (e.g. *6A>G)
+         NOTE: so located in the 3'UTR or 3' of the polyA-addition site
+         (including downstream of the gene)
+
+    N+M = nucleotide M in the intron after (3' of) position N in the coding DNA
+          reference sequence (e.g. 30+4A>G)
+
+    N-M = nucleotide M in the intron before (5' of) position N in the coding
+          DNA reference sequence (e.g. 301-2A>G)
+
+    -N+M / -N-M = nucleotide in an intron in the 5'UTR (e.g. -45+4A>G)
+
+    *N+M / *N-M = nucleotide in an intron in the 3'UTR (e.g. *212-2A>G)
+    """
+
+    def __init__(self, coord=0, offset=0, landmark=CDNA_START_CODON,
+                 string=''):
+        """
+        coord: main coordinate along cDNA on the same strand as the transcript
+
+        offset: an additional genomic offset from the main coordinate.  This
+                allows referencing non-coding (e.g. intronic) positions.
+                Offset is also interpreted on the coding strand.
+
+        landmark: ('cdna_start', 'cdna_stop') indicating that 'coord'
+                  is relative to one of these landmarks.
+
+        string: a coordinate from an HGVS name.  If given coord, offset, and
+                landmark should not be specified.
+        """
+
+        if string:
+            if coord != 0 or offset != 0 or landmark != CDNA_START_CODON:
+                raise ValueError("coord, offset, and landmark should not "
+                                 "be given with string argument")
+
+            self.parse(string)
+        else:
+            self.coord = coord
+            self.offset = offset
+            self.landmark = landmark
+
+    def parse(self, coord_text):
+        """
+        Parse a HGVS formatted cDNA coordinate.
+        """
+
+        match = re.match(r"(|-|\*)(\d+)((-|\+)(\d+))?", coord_text)
+        if not match:
+            raise ValueError("unknown coordinate format '%s'" % coord_text)
+        coord_prefix, coord, _, offset_prefix, offset = match.groups()
+
+        self.coord = int(coord)
+        self.offset = int(offset) if offset else 0
+
+        if offset_prefix == '-':
+            self.offset *= -1
+        elif offset_prefix == '+' or offset is None:
+            pass
+        else:
+            raise ValueError("unknown offset_prefix '%s'" % offset_prefix)
+
+        if coord_prefix == '':
+            self.landmark = CDNA_START_CODON
+        elif coord_prefix == "-":
+            self.coord *= -1
+            self.landmark = CDNA_START_CODON
+        elif coord_prefix == '*':
+            self.landmark = CDNA_STOP_CODON
+        else:
+            raise ValueError("unknown coord_prefix '%s'" % coord_prefix)
+        return self
+
+    def __str__(self):
+        """
+        Return a formatted cDNA coordinate
+        """
+        if self.landmark == CDNA_STOP_CODON:
+            coord_prefix = '*'
+        else:
+            coord_prefix = ''
+
+        if self.offset < 0:
+            offset = '%d' % self.offset
+        elif self.offset > 0:
+            offset = '+%d' % self.offset
+        else:
+            offset = ''
+
+        return '%s%d%s' % (coord_prefix, self.coord, offset)
+
+    def __eq__(self, other):
+        """Equality operator."""
+        return ((self.coord, self.offset, self.landmark) ==
+                (other.coord, other.offset, other.landmark))
+
+    def __repr__(self):
+        """
+        Returns a string representation of a cDNA coordinate.
+        """
+        if self.landmark != CDNA_START_CODON:
+            return "CDNACoord(%d, %d, '%s')" % (
+                self.coord, self.offset, self.landmark)
+        else:
+            return "CDNACoord(%d, %d)" % (self.coord, self.offset)
diff --git a/pyhgvs/models/genome.py b/pyhgvs/models/genome.py
new file mode 100644
index 0000000..4cfc87d
--- /dev/null
+++ b/pyhgvs/models/genome.py
@@ -0,0 +1,47 @@
+class ChromosomeSubset(object):
+    """
+    Allow direct access to a subset of the chromosome.
+    """
+
+    def __init__(self, name, genome=None):
+        self.name = name
+        self.genome = genome
+
+    def __getitem__(self, key):
+        """Return sequence from region [start, end)
+
+        Coordinates are 0-based, end-exclusive."""
+        if isinstance(key, slice):
+            start, end = (key.start, key.stop)
+            start -= self.genome.start
+            end -= self.genome.start
+            return self.genome.genome[self.genome.seqid][start:end]
+        else:
+            raise TypeError('Expected a slice object but '
+                            'received a {0}.'.format(type(key)))
+
+    def __repr__(self):
+        return 'ChromosomeSubset("%s")' % self.name
+
+
+class GenomeSubset(object):
+    """
+    Allow the direct access of a subset of the genome.
+    """
+
+    def __init__(self, genome, chrom, start, end, seqid):
+        self.genome = genome
+        self.chrom = chrom
+        self.start = start
+        self.end = end
+        self.seqid = seqid
+        self._chroms = {}
+
+    def __getitem__(self, chrom):
+        """Return a chromosome by its name."""
+        if chrom in self._chroms:
+            return self._chroms[chrom]
+        else:
+            chromosome = ChromosomeSubset(chrom, self)
+            self._chroms[chrom] = chromosome
+            return chromosome
\ No newline at end of file
diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
new file mode 100644
index 0000000..efe9471
--- /dev/null
+++ b/pyhgvs/models/hgvs_name.py
@@ -0,0 +1,801 @@
+"""
+HGVS language currently implemented.
+
+HGVS = ALLELE
+     | PREFIX_NAME : ALLELE
+
+PREFIX_NAME = TRANSCRIPT
+            | TRANSCRIPT '(' GENE ')'
+
+TRANSCRIPT = TRANSCRIPT_NAME
+           | TRANSCRIPT_NAME '.' TRANSCRIPT_VERSION
+
+TRANSCRIPT_VERSION = NUMBER
+
+ALLELE = 'c.' CDNA_ALLELE    # cDNA
+       | 'g.' GENOMIC_ALLELE # genomic
+       | 'm.' MIT_ALLELE     # mitochondrial sequence
+       | 'n.' NC_ALLELE      # non-coding RNA reference sequence
+       | 'r.' RNA_ALLELE     # RNA sequence (like r.76a>u)
+       | 'p.' PROTEIN_ALLELE # protein sequence (like  p.Lys76Asn)
+
+NC_ALLELE =
+RNA_ALLELE =
+CDNA_ALLELE = CDNA_COORD SINGLE_BASE_CHANGE
+            | CDNA_COORD_RANGE MULTI_BASE_CHANGE
+
+GENOMIC_ALLELE =
+MIT_ALLELE = COORD SINGLE_BASE_CHANGE
+           | COORD_RANGE MULTI_BASE_CHANGE
+
+SINGLE_BASE_CHANGE = CDNA_ALLELE = CDNA_COORD BASE '='        # no change
+                   | CDNA_COORD BASE '>' BASE                 # substitution
+                   | CDNA_COORD 'ins' BASE                    # 1bp insertion
+                   | CDNA_COORD 'del' BASE                    # 1bp deletion
+                   | CDNA_COORD 'dup' BASE                    # 1bp duplication
+                   | CDNA_COORD 'ins'                         # 1bp insertion
+                   | CDNA_COORD 'del'                         # 1bp deletion
+                   | CDNA_COORD 'dup'                         # 1bp duplication
+                   | CDNA_COORD 'del' BASE 'ins' BASE         # 1bp indel
+                   | CDNA_COORD 'delins' BASE                 # 1bp indel
+
+MULTI_BASE_CHANGE = COORD_RANGE 'del' BASES             # deletion
+                  | COORD_RANGE 'ins' BASES             # insertion
+                  | COORD_RANGE 'dup' BASES             # duplication
+                  | COORD_RANGE 'del'                   # deletion
+                  | COORD_RANGE 'dup'                   # duplication
+                  | COORD_RANGE 'del' BASES 'ins' BASES # indel
+                  | COORD_RANGE 'delins' BASES          # indel
+
+
+AMINO1 = [GAVLIMFWPSTCYNQDEKRH]
+
+AMINO3 = 'Gly' | 'Ala' | 'Val' | 'Leu' | 'Ile' | 'Met' | 'Phe' | 'Trp' | 'Pro'
+       | 'Ser' | 'Thr' | 'Cys' | 'Tyr' | 'Asn' | 'Gln' | 'Asp' | 'Glu' | 'Lys'
+       | 'Arg' | 'His'
+
+PROTEIN_ALLELE = AMINO3 COORD '='               # no peptide change
+               | AMINO1 COORD '='               # no peptide change
+               | AMINO3 COORD AMINO3 PEP_EXTRA  # peptide change
+               | AMINO1 COORD AMINO1 PEP_EXTRA  # peptide change
+               | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA        # indel
+               | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA        # indel
+               | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA AMINO3 # indel
+               | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA AMINO1 # indel
+
+# A genomic range:
+COORD_RANGE = COORD '_' COORD
+
+# A cDNA range:
+CDNA_COORD_RANGE = CDNA_COORD '_' CDNA_COORD
+
+# A cDNA coordinate:
+CDNA_COORD = COORD_PREFIX COORD
+           | COORD_PREFIX COORD OFFSET_PREFIX OFFSET
+COORD_PREFIX = '' | '-' | '*'
+COORD = NUMBER
+OFFSET_PREFIX = '-' | '+'
+OFFSET = NUMBER
+
+# Primatives:
+NUMBER = \d+
+BASE = [ACGT]
+BASES = BASE+
+
+"""
+
+import re
+
+from .cdna import CDNACoord
+from .variants import revcomp
+
+CHROM_PREFIX = 'chr'
+
+
+class HGVSRegex(object):
+    """
+    All regular expression for HGVS names.
+    """
+
+    # DNA syntax
+    # http://www.hgvs.org/mutnomen/standards.html#nucleotide
+    BASE = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]|\d+"
+    BASES = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]+|\d+"
+    DNA_REF = "(?P<ref>" + BASES + ")"
+    DNA_ALT = "(?P<alt>" + BASES + ")"
+
+    # Mutation types
+    EQUAL = "(?P<mutation_type>=)"
+    SUB = "(?P<mutation_type>>)"
+    INS = "(?P<mutation_type>ins)"
+    DEL = "(?P<mutation_type>del)"
+    DUP = "(?P<mutation_type>dup)"
+
+    # Simple coordinate syntax
+    COORD_START = "(?P<start>\d+)"
+    COORD_END = "(?P<end>\d+)"
+    COORD_RANGE = COORD_START + "_" + COORD_END
+
+    # cDNA coordinate syntax
+    CDNA_COORD = ("(?P<coord_prefix>|-|\*)(?P<coord>\d+)"
+                  "((?P<offset_prefix>-|\+)(?P<offset>\d+))?")
+    CDNA_START = ("(?P<start>(?P<start_coord_prefix>|-|\*)(?P<start_coord>\d+)"
+                  "((?P<start_offset_prefix>-|\+)(?P<start_offset>\d+))?)")
+    CDNA_END = (r"(?P<end>(?P<end_coord_prefix>|-|\*)(?P<end_coord>\d+)"
+                "((?P<end_offset_prefix>-|\+)(?P<end_offset>\d+))?)")
+    CDNA_RANGE = CDNA_START + "_" + CDNA_END
+
+    # cDNA allele syntax
+    CDNA_ALLELE = [
+        # No change
+        CDNA_START + DNA_REF + EQUAL,
+
+        # Substitution
+        CDNA_START + DNA_REF + SUB + DNA_ALT,
+
+        # 1bp insertion, deletion, duplication
+        CDNA_START + INS + DNA_ALT,
+        CDNA_START + DEL + DNA_REF,
+        CDNA_START + DUP + DNA_REF,
+        CDNA_START + DEL,
+        CDNA_START + DUP,
+
+        # Insertion, deletion, duplication
+        CDNA_RANGE + INS + DNA_ALT,
+        CDNA_RANGE + DEL + DNA_REF,
+        CDNA_RANGE + DUP + DNA_REF,
+        CDNA_RANGE + DEL,
+        CDNA_RANGE + DUP,
+
+        # Indels
+        "(?P<delins>" + CDNA_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
+        "(?P<delins>" + CDNA_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
+        "(?P<delins>" + CDNA_START + 'delins' + DNA_ALT + ")",
+        "(?P<delins>" + CDNA_RANGE + 'delins' + DNA_ALT + ")",
+    ]
+
+    CDNA_ALLELE_REGEXES = [re.compile("^" + regex + "$")
+                           for regex in CDNA_ALLELE]
+
+    # Peptide syntax
+    PEP = "([A-Z]([a-z]{2}))+"
+    PEP_REF = "(?P<ref>" + PEP + ")"
+    PEP_REF2 = "(?P<ref2>" + PEP + ")"
+    PEP_ALT = "(?P<alt>" + PEP + ")"
+
+    PEP_EXTRA = "(?P<extra>(|=|\?)(|fs))"
+
+    # Peptide allele syntax
+    PEP_ALLELE = [
+        # No peptide change
+        # Example: Glu1161=
+        PEP_REF + COORD_START + PEP_EXTRA,
+
+        # Peptide change
+        # Example: Glu1161Ser
+        PEP_REF + COORD_START + PEP_ALT + PEP_EXTRA,
+
+        # Peptide indel
+        # Example: Glu1161_Ser1164?fs
+        "(?P<delins>" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END +
+        PEP_EXTRA + ")",
+        "(?P<delins>" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END +
+        PEP_ALT + PEP_EXTRA + ")",
+    ]
+
+    PEP_ALLELE_REGEXES = [re.compile("^" + regex + "$")
+                          for regex in PEP_ALLELE]
+
+    # Genomic allele syntax
+    GENOMIC_ALLELE = [
+        # No change
+        COORD_START + DNA_REF + EQUAL,
+
+        # Substitution
+        COORD_START + DNA_REF + SUB + DNA_ALT,
+
+        # 1bp insertion, deletion, duplication
+        COORD_START + INS + DNA_ALT,
+        COORD_START + DEL + DNA_REF,
+        COORD_START + DUP + DNA_REF,
+        COORD_START + DEL,
+        COORD_START + DUP,
+
+        # Insertion, deletion, duplication
+        COORD_RANGE + INS + DNA_ALT,
+        COORD_RANGE + DEL + DNA_REF,
+        COORD_RANGE + DUP + DNA_REF,
+        COORD_RANGE + DEL,
+        COORD_RANGE + DUP,
+
+        # Indels
+        "(?P<delins>" + COORD_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
+        "(?P<delins>" + COORD_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")",
+        "(?P<delins>" + COORD_START + 'delins' + DNA_ALT + ")",
+        "(?P<delins>" + COORD_RANGE + 'delins' + DNA_ALT + ")",
+    ]
+
+    GENOMIC_ALLELE_REGEXES = [re.compile("^" + regex + "$")
+                              for regex in GENOMIC_ALLELE]
+
+
+# The RefSeq standard for naming contigs/transcripts/proteins:
+# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly  # nopep8
+REFSEQ_PREFIXES = [
+    ('AC_', 'genomic',
+     'Complete genomic molecule, usually alternate assembly'),
+    ('NC_', 'genomic',
+     'Complete genomic molecule, usually reference assembly'),
+    ('NG_', 'genomic', 'Incomplete genomic region'),
+    ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),
+    ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),
+    ('NS_', 'genomic', 'Environmental sequence'),
+    ('NZ_', 'genomic', 'Unfinished WGS'),
+    ('NM_', 'mRNA', ''),
+    ('NR_', 'RNA', ''),
+    ('XM_', 'mRNA', 'Predicted model'),
+    ('XR_', 'RNA', 'Predicted model'),
+    ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),
+    ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),
+    ('YP_', 'Protein', ''),
+    ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),
+    ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),
+]
+
+
+REFSEQ_PREFIX_LOOKUP = dict(
+    (prefix, (kind, description))
+    for prefix, kind, description in REFSEQ_PREFIXES
+)
+
+
+def get_refseq_type(name):
+    """
+    Return the RefSeq type for a refseq name.
+    """
+    prefix = name[:3]
+    return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0]
+
+class InvalidHGVSName(ValueError):
+    def __init__(self, name='', part='name', reason=''):
+        if name:
+            message = 'Invalid HGVS %s "%s"' % (part, name)
+        else:
+            message = 'Invalid HGVS %s' % part
+        if reason:
+            message += ': ' + reason
+        super(InvalidHGVSName, self).__init__(message)
+
+        self.name = name
+        self.part = part
+        self.reason = reason
+
+
+class HGVSName(object):
+    """
+    Represents a HGVS variant name.
+    """
+
+    def __init__(self, name='', prefix='', chrom='', transcript='', gene='',
+                 kind='', mutation_type=None, start=0, end=0, ref_allele='',
+                 ref2_allele='', alt_allele='',
+                 cdna_start=None, cdna_end=None, pep_extra=''):
+
+        # Full HGVS name.
+        self.name = name
+
+        # Name parts.
+        self.prefix = prefix
+        self.chrom = chrom
+        self.transcript = transcript
+        self.gene = gene
+        self.kind = kind
+        self.mutation_type = mutation_type
+        self.start = start
+        self.end = end
+        self.ref_allele = ref_allele  # reference allele
+        self.ref2_allele = ref2_allele  # reference allele at end of pep indel
+        self.alt_allele = alt_allele  # alternate allele
+
+        # cDNA-specific fields
+        self.cdna_start = cdna_start if cdna_start else CDNACoord()
+        self.cdna_end = cdna_end if cdna_end else CDNACoord()
+
+        # Protein-specific fields
+        self.pep_extra = pep_extra
+
+        if name:
+            self.parse(name)
+
+    def parse(self, name):
+        """Parse a HGVS name."""
+        # Does HGVS name have transcript/gene prefix?
+        if ':' in name:
+            prefix, allele = name.split(':', 1)
+        else:
+            prefix = ''
+            allele = name
+
+        self.name = name
+
+        # Parse prefix and allele.
+        self.parse_allele(allele)
+        self.parse_prefix(prefix, self.kind)
+        self._validate()
+
+    def parse_prefix(self, prefix, kind):
+        """
+        Parse a HGVS prefix (gene/transcript/chromosome).
+
+        Some examples of full hgvs names with transcript include:
+          NM_007294.3:c.2207A>C
+          NM_007294.3(BRCA1):c.2207A>C
+          BRCA1{NM_007294.3}:c.2207A>C
+        """
+
+        self.prefix = prefix
+
+        # No prefix.
+        if prefix == '':
+            self.chrom = ''
+            self.transcript = ''
+            self.gene = ''
+            return
+
+        # Transcript and gene given with parens.
+        # example: NM_007294.3(BRCA1):c.2207A>C
+        match = re.match("^(?P<transcript>[^(]+)\((?P<gene>[^)]+)\)$", prefix)
+        if match:
+            self.transcript = match.group('transcript')
+            self.gene = match.group('gene')
+            return
+
+        # Transcript and gene given with braces.
+        # example: BRCA1{NM_007294.3}:c.2207A>C
+        match = re.match("^(?P<gene>[^{]+)\{(?P<transcript>[^}]+)\}$", prefix)
+        if match:
+            self.transcript = match.group('transcript')
+            self.gene = match.group('gene')
+            return
+
+        # Determine using Ensembl type.
+        if prefix.startswith('ENST'):
+            self.transcript = prefix
+            return
+
+        # Determine using refseq type.
+        refseq_type = get_refseq_type(prefix)
+        if refseq_type in ('mRNA', 'RNA'):
+            self.transcript = prefix
+            return
+
+        # Determine using refseq type.
+        if prefix.startswith(CHROM_PREFIX) or refseq_type == 'genomic':
+            self.chrom = prefix
+            return
+
+        # Assume gene name.
+        self.gene = prefix
+
+    def parse_allele(self, allele):
+        """
+        Parse a HGVS allele description.
+
+        Some examples include:
+          cDNA substitution: c.101A>C,
+          cDNA indel: c.3428delCinsTA, c.1000_1003delATG, c.1000_1001insATG
+          No protein change: p.Glu1161=
+          Protein change: p.Glu1161Ser
+          Protein frameshift: p.Glu1161_Ser1164?fs
+          Genomic substitution: g.1000100A>T
+          Genomic indel: g.1000100_1000102delATG
+        """
+        if '.' not in allele:
+            raise InvalidHGVSName(allele, 'allele',
+                                  'expected kind "c.", "p.", "g.", etc')
+
+        # Determine HGVS name kind.
+        kind, details = allele.split('.', 1)
+        self.kind = kind
+        self.mutation_type = None
+
+        if kind in ("c", 'n'):
+            self.parse_cdna(details)
+            if kind == 'n':  # Ensure no 3'UTR or 5'UTR coords in non-coding
+                if self.cdna_start.coord < 0:
+                    raise InvalidHGVSName(allele, 'allele',
+                                          "Non-coding transcript cannot contain negative (5'UTR) coordinates")
+                if self.cdna_start.landmark == 'cdna_stop' or self.cdna_end and self.cdna_end.landmark == 'cdna_stop':
+                    raise InvalidHGVSName(allele, 'allele',
+                                          "Non-coding transcript cannot contain '*' (3'UTR) coordinates")
+        elif kind == "p":
+            self.parse_protein(details)
+        elif kind == "g":
+            self.parse_genome(details)
+        else:
+            raise NotImplementedError("unknown kind: %s" % allele)
+
+    def parse_cdna(self, details):
+        """
+        Parse a HGVS cDNA name.
+
+        Some examples include:
+          Substitution: 101A>C,
+          Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG
+        """
+        for regex in HGVSRegex.CDNA_ALLELE_REGEXES:
+            match = re.match(regex, details)
+            if match:
+                groups = match.groupdict()
+
+                # Parse mutation type.
+                if groups.get('delins'):
+                    self.mutation_type = 'delins'
+                else:
+                    self.mutation_type = groups['mutation_type']
+
+                # Parse coordinates.
+                self.cdna_start = CDNACoord(string=groups.get('start'))
+                if groups.get('end'):
+                    self.cdna_end = CDNACoord(string=groups.get('end'))
+                else:
+                    self.cdna_end = CDNACoord(string=groups.get('start'))
+
+                # Parse alleles.
+                self.ref_allele = groups.get('ref', '')
+                self.alt_allele = groups.get('alt', '')
+
+                # Convert numerical allelles.
+                if self.ref_allele.isdigit():
+                    self.ref_allele = "N" * int(self.ref_allele)
+                if self.alt_allele.isdigit():
+                    self.alt_allele = "N" * int(self.alt_allele)
+
+                # Convert duplication alleles.
+                if self.mutation_type == "dup":
+                    self.alt_allele = self.ref_allele * 2
+
+                # Convert no match alleles.
+                if self.mutation_type == "=":
+                    self.alt_allele = self.ref_allele
+                return
+
+        raise InvalidHGVSName(details, 'cDNA allele')
+
+    def parse_protein(self, details):
+        """
+        Parse a HGVS protein name.
+
+        Some examples include:
+          No change: Glu1161=
+          Change: Glu1161Ser
+          Frameshift: Glu1161_Ser1164?fs
+        """
+        for regex in HGVSRegex.PEP_ALLELE_REGEXES:
+            match = re.match(regex, details)
+            if match:
+                groups = match.groupdict()
+
+                # Parse mutation type.
+                if groups.get('delins'):
+                    self.mutation_type = 'delins'
+                else:
+                    self.mutation_type = '>'
+
+                # Parse coordinates.
+                self.start = int(groups.get('start'))
+                if groups.get('end'):
+                    self.end = int(groups.get('end'))
+                else:
+                    self.end = self.start
+
+                # Parse alleles.
+                self.ref_allele = groups.get('ref', '')
+                if groups.get('ref2'):
+                    self.ref2_allele = groups.get('ref2')
+                    self.alt_allele = groups.get('alt', '')
+                else:
+                    # If alt is not given, assume matching with ref
+                    self.ref2_allele = self.ref_allele
+                    self.alt_allele = groups.get(
+                        'alt', self.ref_allele)
+
+                self.pep_extra = groups.get('extra')
+                return
+
+        raise InvalidHGVSName(details, 'protein allele')
+
+    def parse_genome(self, details):
+        """
+        Parse a HGVS genomic name.
+
+        Som examples include:
+          Substitution: 1000100A>T
+          Indel: 1000100_1000102delATG
+        """
+
+        for regex in HGVSRegex.GENOMIC_ALLELE_REGEXES:
+            match = re.match(regex, details)
+            if match:
+                groups = match.groupdict()
+
+                # Parse mutation type.
+                if groups.get('delins'):
+                    self.mutation_type = 'delins'
+                else:
+                    self.mutation_type = groups['mutation_type']
+
+                # Parse coordinates.
+                self.start = int(groups.get('start'))
+                if groups.get('end'):
+                    self.end = int(groups.get('end'))
+                else:
+                    self.end = self.start
+
+                # Parse alleles.
+                self.ref_allele = groups.get('ref', '')
+                self.alt_allele = groups.get('alt', '')
+
+                # Convert numerical allelles.
+                if self.ref_allele.isdigit():
+                    self.ref_allele = "N" * int(self.ref_allele)
+                if self.alt_allele.isdigit():
+                    self.alt_allele = "N" * int(self.alt_allele)
+
+                # Convert duplication alleles.
+                if self.mutation_type == "dup":
+                    self.alt_allele = self.ref_allele * 2
+
+                # Convert no match alleles.
+                if self.mutation_type == "=":
+                    self.alt_allele = self.ref_allele
+                return
+
+        raise InvalidHGVSName(details, 'genomic allele')
+
+    def _validate(self):
+        """
+        Check for internal inconsistencies in representation
+        """
+        if self.start > self.end:
+            raise InvalidHGVSName(reason="Coordinates are nonincreasing")
+
+    def __repr__(self):
+        try:
+            return "HGVSName('%s')" % self.format()
+        except NotImplementedError:
+            return "HGVSName('%s')" % self.name
+
+    def __unicode__(self):
+        return self.format()
+
+    def format(self, use_prefix=True, use_gene=True, use_counsyl=False):
+        """Generate a HGVS name as a string."""
+
+        if self.kind == 'c':
+            allele = 'c.' + self.format_cdna()
+        elif self.kind == 'n':
+            allele = 'n.' + self.format_cdna()
+        elif self.kind == 'p':
+            allele = 'p.' + self.format_protein()
+        elif self.kind == 'g':
+            allele = 'g.' + self.format_genome()
+        else:
+            raise NotImplementedError("not implemented: '%s'" % self.kind)
+
+        prefix = self.format_prefix(use_gene=use_gene) if use_prefix else ''
+
+        if prefix:
+            return prefix + ':' + allele
+        else:
+            return allele
+
+    def format_prefix(self, use_gene=True):
+        """
+        Generate HGVS trancript/gene prefix.
+
+        Some examples of full hgvs names with transcript include:
+          NM_007294.3:c.2207A>C
+          NM_007294.3(BRCA1):c.2207A>C
+        """
+
+        if self.kind == 'g':
+            if self.chrom:
+                return self.chrom
+
+        if self.transcript:
+            if use_gene and self.gene:
+                return '%s(%s)' % (self.transcript, self.gene)
+            else:
+                return self.transcript
+        else:
+            if use_gene:
+                return self.gene
+            else:
+                return ''
+
+    def format_cdna_coords(self):
+        """
+        Generate HGVS cDNA coordinates string.
+        """
+        # Format coordinates.
+        if self.cdna_start == self.cdna_end:
+            return str(self.cdna_start)
+        else:
+            return "%s_%s" % (self.cdna_start, self.cdna_end)
+
+    def format_dna_allele(self):
+        """
+        Generate HGVS DNA allele.
+        """
+        if self.mutation_type == '=':
+            # No change.
+            # example: 101A=
+            return self.ref_allele + '='
+
+        if self.mutation_type == '>':
+            # SNP.
+            # example: 101A>C
+            return self.ref_allele + '>' + self.alt_allele
+
+        elif self.mutation_type == 'delins':
+            # Indel.
+            # example: 112_117delAGGTCAinsTG, 112_117delinsTG
+            return 'del' + self.ref_allele + 'ins' + self.alt_allele
+
+        elif self.mutation_type in ('del', 'dup'):
+            # Delete, duplication.
+            # example: 1000_1003delATG, 1000_1003dupATG
+            return self.mutation_type + self.ref_allele
+
+        elif self.mutation_type == 'ins':
+            # Insert.
+            # example: 1000_1001insATG
+            return self.mutation_type + self.alt_allele
+
+        elif self.mutation_type == 'inv':
+            return self.mutation_type
+
+        else:
+            raise AssertionError(
+                "unknown mutation type: '%s'" % self.mutation_type)
+
+    def format_cdna(self):
+        """
+        Generate HGVS cDNA allele.
+
+        Some examples include:
+          Substitution: 101A>C,
+          Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG
+        """
+        return self.format_cdna_coords() + self.format_dna_allele()
+
+    def format_protein(self):
+        """
+        Generate HGVS protein name.
+
+        Some examples include:
+          No change: Glu1161=
+          Change: Glu1161Ser
+          Frameshift: Glu1161_Ser1164?fs
+        """
+        if (self.start == self.end and
+                self.ref_allele == self.ref2_allele == self.alt_allele):
+            # Match.
+            # Example: Glu1161=
+            pep_extra = self.pep_extra if self.pep_extra else '='
+            return self.ref_allele + str(self.start) + pep_extra
+
+        elif (self.start == self.end and
+              self.ref_allele == self.ref2_allele and
+              self.ref_allele != self.alt_allele):
+            # Change.
+            # Example: Glu1161Ser
+            return (self.ref_allele + str(self.start) +
+                    self.alt_allele + self.pep_extra)
+
+        elif self.start != self.end:
+            # Range change.
+            # Example: Glu1161_Ser1164?fs
+            return (self.ref_allele + str(self.start) + '_' +
+                    self.ref2_allele + str(self.end) +
+                    self.pep_extra)
+
+        else:
+            raise NotImplementedError('protein name formatting.')
+
+    def format_coords(self):
+        """
+        Generate HGVS cDNA coordinates string.
+        """
+        # Format coordinates.
+        if self.start == self.end:
+            return str(self.start)
+        else:
+            return "%s_%s" % (self.start, self.end)
+
+    def format_genome(self):
+        """
+        Generate HGVS genomic allele.
+
+        Som examples include:
+          Substitution: 1000100A>T
+          Indel: 1000100_1000102delATG
+        """
+        return self.format_coords() + self.format_dna_allele()
+
+    def get_raw_coords(self, transcript=None):
+        """ return genomic coordinates """
+        if self.kind in ('c', 'n'):
+            chrom = transcript.tx_position.chrom
+            start = transcript.cdna_to_genomic_coord(self.cdna_start)
+            end = transcript.cdna_to_genomic_coord(self.cdna_end)
+
+            if not transcript.tx_position.is_forward_strand:
+                if end > start:
+                    raise AssertionError(
+                        "cdna_start cannot be greater than cdna_end")
+                start, end = end, start
+
+            if start > end:
+                raise AssertionError(
+                    "cdna_start cannot be greater than cdna_end")
+        elif self.kind == 'g':
+            chrom = self.chrom
+            start = self.start
+            end = self.end
+        else:
+            raise NotImplementedError(
+                'Coordinates are not available for this kind of HGVS name "%s"'
+                % self.kind)
+
+        return chrom, start, end
+
+    def get_ref_coords(self, transcript=None):
+        """Return genomic coordinates of reference allele."""
+
+        chrom, start, end = self.get_raw_coords(transcript)
+
+        if self.mutation_type == "ins":
+            # Inserts have empty interval.
+            if start < end:
+                start += 1
+                end -= 1
+            else:
+                end = start - 1
+
+        elif self.mutation_type == "dup":
+            end = start - 1
+        return chrom, start, end
+
+    def get_vcf_coords(self, transcript=None):
+        """Return genomic coordinates of reference allele in VCF-style."""
+        chrom, start, end = self.get_ref_coords(transcript)
+
+        # Inserts and deletes require left-padding by 1 base
+        if self.mutation_type in ("=", ">"):
+            pass
+        elif self.mutation_type in ("del", "ins", "dup", "delins"):
+            # Indels have left-padding.
+            start -= 1
+        else:
+            raise NotImplementedError("Unknown mutation_type '%s'" %
+                                      self.mutation_type)
+        return chrom, start, end
+
+    def get_ref_alt(self, is_forward_strand=True):
+        """Return reference and alternate alleles."""
+        if self.kind == 'p':
+            raise NotImplementedError(
+                'get_ref_alt is not implemented for protein HGVS names')
+        alleles = [self.ref_allele, self.alt_allele]
+
+        # Represent duplications are inserts.
+        if self.mutation_type == "dup":
+            alleles[0] = ""
+            alleles[1] = alleles[1][:len(alleles[1]) // 2]
+
+        if is_forward_strand:
+            return alleles
+        else:
+            return tuple(map(revcomp, alleles))
diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
new file mode 100644
index 0000000..e707375
--- /dev/null
+++ b/pyhgvs/models/transcript.py
@@ -0,0 +1,335 @@
+"""
+Models for representing genomic elements.
+"""
+
+from __future__ import unicode_literals
+
+from collections import namedtuple
+
+from lazy import lazy
+
+from pyhgvs.models.cdna import CDNA_START_CODON, CDNA_STOP_CODON, CDNACoord
+
+
+class Gene(object):
+    def __init__(self, name):
+        self.name = name
+
+
+class Transcript(object):
+    """
+
+    A gene may have multiple transcripts with different combinations of exons.
+    """
+
+    def __init__(self, name, version, gene, tx_position, cds_position,
+                 is_default=False, exons=None):
+        self.name = name
+        self.version = version
+        self.gene = Gene(gene)
+        self.tx_position = tx_position
+        self.cds_position = cds_position
+        self.is_default = is_default
+        self.exons = exons if exons else []
+
+    @property
+    def full_name(self):
+        if self.version is not None:
+            return '%s.%d' % (self.name, self.version)
+        else:
+            return self.name
+
+    @property
+    def is_coding(self):
+        # Coding transcripts have CDS with non-zero length.
+        return (self.cds_position.chrom_stop -
+                self.cds_position.chrom_start > 0)
+
+    @property
+    def strand(self):
+        return ('+' if self.tx_position.is_forward_strand else '-')
+
+    @property
+    def coding_exons(self):
+        return [exon.get_as_interval(coding_only=True)
+                for exon in self.exons]
+
+    @lazy
+    def ordered_exons(self):
+        """Yield exons in coding order."""
+        transcript_strand = self.tx_position.is_forward_strand
+        if hasattr(self.exons, 'select_related'):
+            exons = list(self.exons.select_related('tx_position'))
+        else:
+            exons = list(self.exons)
+        exons.sort(key=lambda exon: exon.tx_position.chrom_start)
+        if not transcript_strand:
+            exons.reverse()
+        return exons
+
+    def get_coding_exons(self):
+        """Yield non-empty coding exonic regions in coding order."""
+        for exon in self.ordered_exons:
+            region = exon.get_as_interval(coding_only=True)
+            if region:
+                yield region
+
+    def get_utr5p_size(self):
+        """Return the size of the 5prime UTR of a transcript."""
+
+        transcript_strand = self.tx_position.is_forward_strand
+
+        # Find the exon containing the start codon.
+        start_codon = (self.cds_position.chrom_start if transcript_strand
+                       else self.cds_position.chrom_stop - 1)
+        cdna_len = 0
+        for exon in self.ordered_exons:
+            exon_start = exon.tx_position.chrom_start
+            exon_end = exon.tx_position.chrom_stop
+            if exon_start <= start_codon < exon_end:
+                break
+            cdna_len += exon_end - exon_start
+        else:
+            raise ValueError("transcript contains no exons")
+
+        if transcript_strand:
+            return cdna_len + (start_codon - exon_start)
+        else:
+            return cdna_len + (exon_end - start_codon - 1)
+
+    def find_stop_codon(self, cds_position):
+        """Return the position along the cDNA of the base after the stop codon."""
+        if cds_position.is_forward_strand:
+            stop_pos = cds_position.chrom_stop
+        else:
+            stop_pos = cds_position.chrom_start
+        cdna_pos = 0
+        for exon in self.ordered_exons:
+            exon_start = exon.tx_position.chrom_start
+            exon_stop = exon.tx_position.chrom_stop
+
+            if exon_start <= stop_pos <= exon_stop:
+                if cds_position.is_forward_strand:
+                    return cdna_pos + stop_pos - exon_start
+                else:
+                    return cdna_pos + exon_stop - stop_pos
+            else:
+                cdna_pos += exon_stop - exon_start
+        raise ValueError('Stop codon is not in any of the exons')
+
+    def cdna_to_genomic_coord(self, coord):
+        """Convert a HGVS cDNA coordinate to a genomic coordinate."""
+        transcript_strand = self.tx_position.is_forward_strand
+        utr5p = (self.get_utr5p_size()
+                 if self.is_coding else 0)
+
+        # compute starting position along spliced transcript.
+        if coord.landmark == CDNA_START_CODON:
+            if coord.coord > 0:
+                pos = utr5p + coord.coord
+            else:
+                pos = utr5p + coord.coord + 1
+        elif coord.landmark == CDNA_STOP_CODON:
+            if coord.coord < 0:
+                raise ValueError('CDNACoord cannot have a negative coord and '
+                                 'landmark CDNA_STOP_CODON')
+            pos = self.find_stop_codon(self.cds_position) + coord.coord
+        else:
+            raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark)
+
+        # 5' flanking sequence.
+        if pos < 1:
+            if transcript_strand:
+                return self.tx_position.chrom_start + pos
+            else:
+                return self.tx_position.chrom_stop - pos + 1
+
+        # Walk along transcript until we find an exon that contains pos.
+        cdna_start = 1
+        cdna_end = 1
+        for exon in self.ordered_exons:
+            exon_start = exon.tx_position.chrom_start + 1
+            exon_end = exon.tx_position.chrom_stop
+            cdna_end = cdna_start + (exon_end - exon_start)
+            if cdna_start <= pos <= cdna_end:
+                break
+            cdna_start = cdna_end + 1
+        else:
+            # 3' flanking sequence
+            if transcript_strand:
+                return self.cds_position.chrom_stop + coord.coord
+            else:
+                return self.cds_position.chrom_start + 1 - coord.coord
+
+        # Compute genomic coordinate using offset.
+        if transcript_strand:
+            # Plus strand.
+            return exon_start + (pos - cdna_start) + coord.offset
+        else:
+            # Minus strand.
+            return exon_end - (pos - cdna_start) - coord.offset
+
+    def genomic_to_cdna_coord(self, genomic_coord):
+        """Convert a genomic coordinate to a cDNA coordinate and offset.
+        """
+        exons = [exon.get_as_interval()
+                 for exon in self.ordered_exons]
+
+        if len(exons) == 0:
+            return None
+
+        strand = self.strand
+
+        if strand == "+":
+            exons.sort(key=lambda exon: exon.chrom_start)
+        else:
+            exons.sort(key=lambda exon: -exon.chrom_end)
+
+        distances = [exon.distance(genomic_coord)
+                     for exon in exons]
+        min_distance_to_exon = min(map(abs, distances))
+
+        coding_offset = 0
+        for exon in exons:
+            exon_length = exon.chrom_end - exon.chrom_start
+            distance = exon.distance(genomic_coord)
+            if abs(distance) == min_distance_to_exon:
+                if strand == "+":
+                    exon_start_cds_offset = coding_offset + 1
+                    exon_end_cds_offset = coding_offset + exon_length
+                else:
+                    exon_start_cds_offset = coding_offset + exon_length
+                    exon_end_cds_offset = coding_offset + 1
+                # This is the exon we want to annotate against.
+                if distance == 0:
+                    # Inside the exon.
+                    if strand == "+":
+                        coord = (exon_start_cds_offset +
+                                 (genomic_coord -
+                                  (exon.chrom_start + 1)))
+                    else:
+                        coord = (exon_end_cds_offset +
+                                 (exon.chrom_end -
+                                  genomic_coord))
+                    cdna_coord = CDNACoord(coord, 0)
+                else:
+                    # Outside the exon.
+                    if distance > 0:
+                        nearest_exonic = exon_start_cds_offset
+                    else:
+                        nearest_exonic = exon_end_cds_offset
+                    if strand == "+":
+                        distance *= -1
+
+                    # If outside transcript, don't use offset.
+                    if (genomic_coord < self.tx_position.chrom_start + 1 or
+                            genomic_coord > self.tx_position.chrom_stop):
+                        nearest_exonic += distance
+                        distance = 0
+                    cdna_coord = CDNACoord(nearest_exonic, distance)
+                break
+            coding_offset += exon_length
+
+        # Adjust coordinates for coding transcript.
+        if self.is_coding:
+            # Detect if position before start codon.
+            utr5p = self.get_utr5p_size() if self.is_coding else 0
+            cdna_coord.coord -= utr5p
+            if cdna_coord.coord <= 0:
+                cdna_coord.coord -= 1
+            else:
+                # Detect if position is after stop_codon.
+                stop_codon = self.find_stop_codon(self.cds_position)
+                stop_codon -= utr5p
+                if (cdna_coord.coord > stop_codon or
+                        cdna_coord.coord == stop_codon and cdna_coord.offset > 0):
+                    cdna_coord.coord -= stop_codon
+                    cdna_coord.landmark = CDNA_STOP_CODON
+
+        return cdna_coord
+
+BED6Interval_base = namedtuple(
+    "BED6Interval_base", (
+        "chrom",
+        "chrom_start",
+        "chrom_end",
+        "name",
+        "score",
+        "strand"))
+
+
+class BED6Interval(BED6Interval_base):
+    def distance(self, offset):
+        """Return the distance to the interval.
+
+        if offset is inside the exon, distance is zero.
+        otherwise, distance is the distance to the nearest edge.
+
+        distance is positive if the exon comes after the offset.
+        distance is negative if the exon comes before the offset.
+        """
+
+        start = self.chrom_start + 1
+        end = self.chrom_end
+
+        if start <= offset <= end:
+            return 0
+
+        start_distance = start - offset
+        end_distance = offset - end
+
+        if abs(start_distance) < abs(end_distance):
+            return start_distance
+        else:
+            return -end_distance
+
+
+class Exon(object):
+    def __init__(self, transcript, tx_position, exon_number):
+        self.transcript = transcript
+        self.tx_position = tx_position
+        self.exon_number = exon_number
+
+    @property
+    def get_exon_name(self):
+        return "%s.%d" % (self.transcript.name, self.exon_number)
+
+    def get_as_interval(self, coding_only=False):
+        """Returns the coding region for this exon as a BED6Interval.
+
+        This function returns a BED6Interval objects containing  position
+        information for this exon. This may be used as input for
+        pybedtools.create_interval_from_list() after casting chrom_start
+        and chrom_end as strings.
+
+        coding_only: only include exons in the coding region
+
+        """
+
+        exon_start = self.tx_position.chrom_start
+        exon_stop = self.tx_position.chrom_stop
+
+        # Get only exon coding region if requested
+        if coding_only:
+            if (exon_stop <= self.transcript.cds_position.chrom_start or
+                    exon_start >= self.transcript.cds_position.chrom_stop):
+                return None
+            exon_start = max(exon_start,
+                             self.transcript.cds_position.chrom_start)
+            exon_stop = min(
+                max(exon_stop, self.transcript.cds_position.chrom_start),
+                self.transcript.cds_position.chrom_stop)
+
+        return BED6Interval(
+            self.tx_position.chrom,
+            exon_start,
+            exon_stop,
+            self.get_exon_name,
+            '.',
+            self.strand,
+        )
+
+    @property
+    def strand(self):
+        strand = '+' if self.tx_position.is_forward_strand else '-'
+        return strand
diff --git a/pyhgvs/variants.py b/pyhgvs/models/variants.py
similarity index 96%
rename from pyhgvs/variants.py
rename to pyhgvs/models/variants.py
index 36c7383..aced9fc 100644
--- a/pyhgvs/variants.py
+++ b/pyhgvs/models/variants.py
@@ -5,13 +5,24 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
-from .models import Position
-
-
 _COMP = dict(A='T', C='G', G='C', T='A', N='N',
              a='t', c='g', g='c', t='a', n='n')
 
 
+class Position(object):
+    """A position in the genome."""
+
+    def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand):
+        self.chrom = chrom
+        self.chrom_start = chrom_start
+        self.chrom_stop = chrom_stop
+        self.is_forward_strand = is_forward_strand
+
+    def __repr__(self):
+        return "<Position %s[%d:%d]>" % (
+            self.chrom, self.chrom_start, self.chrom_stop)
+
+
 def revcomp(seq):
     """Reverse complement."""
     return ''.join(_COMP[base] for base in reversed(seq))
@@ -151,7 +162,7 @@ def __init__(self, position, ref_allele, alt_alleles,
         seq_5p: 5 prime flanking sequence of variant.
         seq_3p: 3 prime flanking sequence of variant.
         genome: a pygr compatible genome object (optional).
-        
+
         indels_start_with_same_base: DML - I have no idea why this is required
                                            but am keeping for backwards compat
         """
@@ -208,7 +219,7 @@ def _trim_common_suffix(self):
         """
         minlength = min(map(len, self.alleles))
         common_suffix = 0
-        for i in range(1, minlength+1):
+        for i in range(1, minlength + 1):
             if len(set(allele[-i] for allele in self.alleles)) > 1:
                 # Not all alleles match at this site, so common suffix ends.
                 break
diff --git a/pyhgvs/tests/genome.py b/pyhgvs/tests/genome.py
index 0887dfd..ecc35ae 100644
--- a/pyhgvs/tests/genome.py
+++ b/pyhgvs/tests/genome.py
@@ -4,7 +4,7 @@
 import itertools
 import os
 
-from ..variants import revcomp
+from ..models.variants import revcomp
 
 
 try:
diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py
index 58cf351..48bbb99 100644
--- a/pyhgvs/tests/test_hgvs_names.py
+++ b/pyhgvs/tests/test_hgvs_names.py
@@ -9,13 +9,9 @@
 except ImportError:
     SequenceFileDB = None
 
-from .. import CDNACoord
-from .. import CDNA_STOP_CODON
-from .. import HGVSName
-from .. import InvalidHGVSName
-from .. import cdna_to_genomic_coord
+from ..models.cdna import CDNACoord, CDNA_STOP_CODON
+from ..models.hgvs_name import HGVSName, InvalidHGVSName
 from .. import format_hgvs_name
-from .. import genomic_to_cdna_coord
 from .. import parse_hgvs_name
 from ..utils import read_transcripts
 from .genome import MockGenomeTestFile
@@ -50,7 +46,7 @@ def test_genomic_to_cdna_coord():
     """
     for transcript_name, genomic_coord, cdna_coord_expected in _convert_coords:
         transcript = get_transcript(transcript_name)
-        cdna_coord = genomic_to_cdna_coord(transcript, genomic_coord[1])
+        cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord[1])
         nose.tools.assert_equal(
             cdna_coord, cdna_coord_expected,
             repr((cdna_coord, cdna_coord_expected,
@@ -63,7 +59,7 @@ def test_cdna_to_genomic_coord():
     """
     for transcript_name, genomic_coord_expected, cdna_coord in _convert_coords:
         transcript = get_transcript(transcript_name)
-        genomic_coord = cdna_to_genomic_coord(transcript, cdna_coord)
+        genomic_coord = transcript.cdna_to_genomic_coord(cdna_coord)
         nose.tools.assert_equal(
             genomic_coord, genomic_coord_expected[1],
             repr((genomic_coord, genomic_coord_expected[1],
diff --git a/pyhgvs/tests/test_hgvs_names_long.py b/pyhgvs/tests/test_hgvs_names_long.py
index 9a019a7..2bfdfec 100644
--- a/pyhgvs/tests/test_hgvs_names_long.py
+++ b/pyhgvs/tests/test_hgvs_names_long.py
@@ -2,7 +2,7 @@
 
 from .. import parse_hgvs_name
 from ..utils import read_transcripts
-from ..variants import normalize_variant
+from ..models.variants import normalize_variant
 from .genome import MockGenomeTestFile
 
 
diff --git a/pyhgvs/tests/test_variants.py b/pyhgvs/tests/test_variants.py
index de37a45..c4b9b35 100644
--- a/pyhgvs/tests/test_variants.py
+++ b/pyhgvs/tests/test_variants.py
@@ -3,7 +3,7 @@
 
 from unittest import TestCase
 
-from ..variants import normalize_variant
+from ..models.variants import normalize_variant
 from .genome import MockGenomeTestFile
 
 
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index 55b20f4..6d3ac3e 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -5,9 +5,8 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
-from .models import Exon
-from .models import Position
-from .models import Transcript
+from .models.variants import Position
+from .models.transcript import Exon, Transcript
 
 
 def read_refgene(infile):
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 1018450..5b2667d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,4 @@
 flake8==2.2.5
+lazy==1.4
 nose==1.3.7
 pyfaidx==0.5.0.1
diff --git a/setup.py b/setup.py
index 3f94c25..6ed2380 100644
--- a/setup.py
+++ b/setup.py
@@ -31,6 +31,7 @@ def main():
         install_requires=['pip>=1.2'],
         tests_require=[
           'flake8==2.2.5',
+          'lazy==1.4',
           'nose==1.3.7',
           'pyfaidx==0.5.0.1',
         ],

From 67ab67f263e34208f57e27a572c521b037e8a619 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Tue, 12 Oct 2021 16:16:37 +1030
Subject: [PATCH 09/47] Start of cDNA matching code changes. Put everything
 through 1 path as exons are just a special case of alignments with no gaps

---
 pyhgvs/models/transcript.py     | 113 +++++++++++++++++++++++---------
 pyhgvs/tests/test_cdna_match.py |  71 ++++++++++++++++++++
 pyhgvs/tests/test_hgvs_names.py |   2 +-
 pyhgvs/utils.py                 |  66 +++++++++++++++----
 4 files changed, 207 insertions(+), 45 deletions(-)
 create mode 100644 pyhgvs/tests/test_cdna_match.py

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index e707375..011da10 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -18,19 +18,22 @@ def __init__(self, name):
 
 class Transcript(object):
     """
-
     A gene may have multiple transcripts with different combinations of exons.
     """
 
     def __init__(self, name, version, gene, tx_position, cds_position,
-                 is_default=False, exons=None):
+                 is_default=False, cdna_match=None):
         self.name = name
         self.version = version
         self.gene = Gene(gene)
         self.tx_position = tx_position
         self.cds_position = cds_position
         self.is_default = is_default
-        self.exons = exons if exons else []
+        self.cdna_match = cdna_match or []
+
+    @lazy
+    def exons(self):
+        return self.cdna_match
 
     @property
     def full_name(self):
@@ -49,10 +52,15 @@ def is_coding(self):
     def strand(self):
         return ('+' if self.tx_position.is_forward_strand else '-')
 
-    @property
-    def coding_exons(self):
-        return [exon.get_as_interval(coding_only=True)
-                for exon in self.exons]
+    @lazy
+    def ordered_cdna_match(self):
+        """ Coding order """
+        transcript_strand = self.tx_position.is_forward_strand
+        cdna_match = list(self.cdna_match)
+        cdna_match.sort(key=lambda cm: cm.tx_position.chrom_start)
+        if not transcript_strand:
+            cdna_match.reverse()
+        return cdna_match
 
     @lazy
     def ordered_exons(self):
@@ -67,13 +75,6 @@ def ordered_exons(self):
             exons.reverse()
         return exons
 
-    def get_coding_exons(self):
-        """Yield non-empty coding exonic regions in coding order."""
-        for exon in self.ordered_exons:
-            region = exon.get_as_interval(coding_only=True)
-            if region:
-                yield region
-
     def get_utr5p_size(self):
         """Return the size of the 5prime UTR of a transcript."""
 
@@ -83,19 +84,22 @@ def get_utr5p_size(self):
         start_codon = (self.cds_position.chrom_start if transcript_strand
                        else self.cds_position.chrom_stop - 1)
         cdna_len = 0
-        for exon in self.ordered_exons:
-            exon_start = exon.tx_position.chrom_start
-            exon_end = exon.tx_position.chrom_stop
-            if exon_start <= start_codon < exon_end:
+        for cdna_match in self.ordered_cdna_match:
+            start = cdna_match.tx_position.chrom_start
+            end = cdna_match.tx_position.chrom_stop
+            if start <= start_codon < end:
+                # We're inside this match
+                if transcript_strand:
+                    position = start_codon - start
+                else:
+                    position = end - start_codon - 1
+                cdna_len += position + cdna_match.get_offset(position)
                 break
-            cdna_len += exon_end - exon_start
+            cdna_len += cdna_match.length
         else:
-            raise ValueError("transcript contains no exons")
+            raise ValueError("transcript contains no cdna_match")
 
-        if transcript_strand:
-            return cdna_len + (start_codon - exon_start)
-        else:
-            return cdna_len + (exon_end - start_codon - 1)
+        return cdna_len
 
     def find_stop_codon(self, cds_position):
         """Return the position along the cDNA of the base after the stop codon."""
@@ -248,6 +252,7 @@ def genomic_to_cdna_coord(self, genomic_coord):
 
         return cdna_coord
 
+
 BED6Interval_base = namedtuple(
     "BED6Interval_base", (
         "chrom",
@@ -284,15 +289,23 @@ def distance(self, offset):
             return -end_distance
 
 
-class Exon(object):
-    def __init__(self, transcript, tx_position, exon_number):
+class CDNA_Match(object):
+    """ An exon is a special case of a cDNA match which has 0 gaps """
+    def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number):
         self.transcript = transcript
         self.tx_position = tx_position
-        self.exon_number = exon_number
+        self.cdna_start = cdna_start
+        self.cdna_end = cdna_end
+        self.gap = gap
+        self.number = number
+
+    @property
+    def length(self):
+        return self.cdna_end - self.cdna_start + 1
 
     @property
-    def get_exon_name(self):
-        return "%s.%d" % (self.transcript.name, self.exon_number)
+    def name(self):
+        return "%s.%d" % (self.transcript.name, self.number)
 
     def get_as_interval(self, coding_only=False):
         """Returns the coding region for this exon as a BED6Interval.
@@ -324,7 +337,7 @@ def get_as_interval(self, coding_only=False):
             self.tx_position.chrom,
             exon_start,
             exon_stop,
-            self.get_exon_name,
+            self.name,
             '.',
             self.strand,
         )
@@ -333,3 +346,43 @@ def get_as_interval(self, coding_only=False):
     def strand(self):
         strand = '+' if self.tx_position.is_forward_strand else '-'
         return strand
+
+    def get_offset(self, position: int):
+        """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length
+            @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute
+            codes operation
+            M 	match
+            I 	insert a gap into the reference sequence
+            D 	insert a gap into the target (delete from reference)
+
+            If you want the whole exon, then pass the end
+        """
+
+        if not self.gap:
+            return 0
+
+        match_i = 0
+        offset = 0
+        for gap_op in self.gap.split():
+            code = gap_op[0]
+            length = int(gap_op[1:])
+            if code == "M":
+                match_i += length
+            elif code == "I":
+                if position <= match_i + length:
+                    raise ValueError("Coordinate inside insertion (%s) - no mapping possible!" % gap_op)
+                offset += length
+            elif code == "D":
+                if position <= match_i + length:
+                    raise ValueError("Coordinate inside deletion (%s) - no mapping possible!" % gap_op)
+                offset -= length
+            else:
+                raise ValueError(f"Unknown code in cDNA GAP: {gap_op}")
+
+            #if self.transcript.name == "NM_000016":
+            #    print(f"{position}: {code} {length} - {offset=}, {match_i=}")
+
+            if match_i >= position:
+                break
+
+        return offset
diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py
new file mode 100644
index 0000000..ed49b02
--- /dev/null
+++ b/pyhgvs/tests/test_cdna_match.py
@@ -0,0 +1,71 @@
+"""
+    RefSeq transcripts don't always match the genome reference, so can align with gaps.
+"""
+
+import nose
+from pyhgvs import HGVSName
+
+from pyhgvs.utils import make_transcript
+
+
+def get_transcript(accession):
+    transcript_json = _transcripts[accession]
+    return make_transcript(transcript_json)
+
+
+def test_cdna_to_genomic_coord():
+    HGVS_GRCh37_COORDS = [
+        # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert
+        ("NM_015120.4:c.74A>T", 73613070),
+#        ("NM_015120.4:c.75G>T", None),  # No consistent alignment
+#        ("NM_015120.4:c.78A>T", 73613071),
+#        ("NM_015120.4:c.79G>C", 73613072),
+    ]
+    transcript = get_transcript("NM_015120.4")
+    print(f"NM_015120.4 UTR = {transcript.get_utr5p_size()}")
+    for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS:
+        hgvs_name = HGVSName(hgvs_str)
+        if expected_genomic_coord:
+            genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start)
+            nose.tools.assert_equal(genomic_coord, expected_genomic_coord)
+        else:
+            pass
+            # nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
+
+
+def test_cdna_to_genomic_coord_negative_strand():
+    # TODO:
+    pass
+
+
+_transcripts = {
+    # GRCh37
+    "NM_015120.4": {
+        "id": "NM_015120.4",
+        "gene_name": "ALMS1",
+        "end": 73837046,
+        "chrom": "NC_000002.11",
+        "exons": [[73612885, 73613320], [73635749, 73635875], [73646250, 73646446], [73649984, 73650102],
+                  [73651557, 73652030], [73653580, 73653681], [73659325, 73659419], [73675089, 73681194],
+                  [73682288, 73682422], [73716760, 73718625], [73746901, 73747143], [73761950, 73762076],
+                  [73777393, 73777564], [73784346, 73784481], [73786098, 73786269], [73799388, 73800551],
+                  [73826527, 73826648], [73827804, 73828008], [73828321, 73828563], [73829311, 73829495],
+                  [73830367, 73830431], [73835601, 73835701], [73836694, 73837046]],
+        "start": 73612885,
+        "strand": "+",
+        "cds_end": 73836739,
+        "cds_start": 73612996,
+        "cdna_match": [[73612885, 73613320, 1, 438, "M185 I3 M250"], [73635749, 73635875, 439, 564, None],
+                       [73646250, 73646446, 565, 760, None], [73649984, 73650102, 761, 878, None],
+                       [73651557, 73652030, 879, 1351, None], [73653580, 73653681, 1352, 1452, None],
+                       [73659325, 73659419, 1453, 1546, None], [73675089, 73681194, 1547, 7654, "M141 I3 M5964"],
+                       [73682288, 73682422, 7655, 7788, None], [73716760, 73718625, 7789, 9653, None],
+                       [73746901, 73747143, 9654, 9895, None], [73761950, 73762076, 9896, 10021, None],
+                       [73777393, 73777564, 10022, 10192, None], [73784346, 73784481, 10193, 10327, None],
+                       [73786098, 73786269, 10328, 10498, None], [73799388, 73800551, 10499, 11661, None],
+                       [73826527, 73826648, 11662, 11782, None], [73827804, 73828008, 11783, 11986, None],
+                       [73828321, 73828563, 11987, 12228, None], [73829311, 73829495, 12229, 12412, None],
+                       [73830367, 73830431, 12413, 12476, None], [73835601, 73835701, 12477, 12576, None],
+                       [73836694, 73837046, 12577, 12928, None]]}
+
+}
diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py
index 48bbb99..8f06237 100644
--- a/pyhgvs/tests/test_hgvs_names.py
+++ b/pyhgvs/tests/test_hgvs_names.py
@@ -211,7 +211,7 @@ def test_invalid_coordinates():
 ]
 
 
-# Test examples of coverting coordinates.
+# Test examples of converting coordinates.
 _convert_coords = [
     # Positions near start codon.
     ('NM_000016.4', ('chr1', 76190473), CDNACoord(1)),
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index 6d3ac3e..f691a71 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -6,7 +6,7 @@
 from __future__ import unicode_literals
 
 from .models.variants import Position
-from .models.transcript import Exon, Transcript
+from .models.transcript import Transcript, CDNA_Match
 
 
 def read_refgene(infile):
@@ -85,23 +85,61 @@ def make_transcript(transcript_json):
             transcript_json['cds_end'],
             transcript_json['strand'] == '+'))
 
-    exons = transcript_json['exons']
-    if not transcript.tx_position.is_forward_strand:
-        exons = reversed(exons)
-
-    for exon_number, (exon_start, exon_end) in enumerate(exons, 1):
-        transcript.exons.append(
-            Exon(transcript=transcript,
-                 tx_position=Position(
-                     transcript_json['chrom'],
-                     exon_start,
-                     exon_end,
-                     transcript_json['strand'] == '+'),
-                 exon_number=exon_number))
+    cdna_match = transcript_json.get('cdna_match')
+    if cdna_match:
+        if not transcript.tx_position.is_forward_strand:
+            cdna_match = reversed(cdna_match)
+    else:
+        exons = transcript_json['exons']
+        if not transcript.tx_position.is_forward_strand:
+            exons = reversed(exons)
+        cdna_match = json_perfect_exons_to_cdna_match(exons)  # Only use single=True once ALL has been implemented
+
+    for number, (exon_start, exon_end, cdna_start, cdna_end, gap) in enumerate(cdna_match, 1):
+        transcript.cdna_match.append(CDNA_Match(transcript=transcript,
+                                                tx_position=Position(
+                                                    transcript_json['chrom'],
+                                                    exon_start,
+                                                    exon_end,
+                                                    transcript_json['strand'] == '+'),
+                                                cdna_start=cdna_start,
+                                                cdna_end=cdna_end,
+                                                gap=gap,
+                                                number=number))
 
     return transcript
 
 
+def json_perfect_exons_to_cdna_match(ordered_exons, single=False):
+    """ Perfectly matched exons are basically a no-gap case of cDNA match """
+    cdna_match = []
+    if single:
+        ordered_exons = list(ordered_exons)
+        start = ordered_exons[0][0]
+        end = ordered_exons[-1][1]
+        last_exon_end = None
+        gap_list = []
+        cdna_length = 0
+        for (exon_start, exon_end) in ordered_exons:
+            # end up looking like "M D M D (M=exon, D=intron length)"
+            if last_exon_end:
+                intron_length = abs(exon_start - last_exon_end)
+                gap_list.append("D%d" % intron_length)
+            exon_length = exon_end - exon_start
+            cdna_length += exon_length
+            gap_list.append("M%d" % exon_length)
+            last_exon_end = exon_end
+        cdna_match = [[start, end, 1, cdna_length, " ".join(gap_list)]]
+    else:
+        cdna_end = 0
+        for (exon_start, exon_end) in ordered_exons:
+            cdna_start = cdna_end + 1
+            exon_length = exon_end - exon_start
+            cdna_end = cdna_start + exon_length - 1
+            cdna_match.append([exon_start, exon_end, cdna_start, cdna_end, None])
+    return cdna_match
+
+
 def read_transcripts(refgene_file):
     """
     Read all transcripts in a RefGene file.

From 04ac8a7af6065a605b5989c0c1a2015d0cc8d69b Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Wed, 13 Oct 2021 12:02:11 +1030
Subject: [PATCH 10/47] Handle gaps

---
 pyhgvs/models/transcript.py     | 89 +++++++++++++++++----------------
 pyhgvs/tests/test_cdna_match.py | 47 +++++++++++++----
 pyhgvs/utils.py                 |  4 +-
 3 files changed, 86 insertions(+), 54 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 011da10..5bddbb9 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -108,18 +108,25 @@ def find_stop_codon(self, cds_position):
         else:
             stop_pos = cds_position.chrom_start
         cdna_pos = 0
-        for exon in self.ordered_exons:
-            exon_start = exon.tx_position.chrom_start
-            exon_stop = exon.tx_position.chrom_stop
+        for cdna_match in self.ordered_cdna_match:
+            start = cdna_match.tx_position.chrom_start
+            stop = cdna_match.tx_position.chrom_stop
 
-            if exon_start <= stop_pos <= exon_stop:
+            if start <= stop_pos <= stop:
+                # We're inside this match
                 if cds_position.is_forward_strand:
-                    return cdna_pos + stop_pos - exon_start
+                    position = stop_pos - start
                 else:
-                    return cdna_pos + exon_stop - stop_pos
+                    position = stop - stop_pos
+                # TODO: I don't think this is right as we need to go in reverse through match on '-' strand??
+                cdna_pos += position + cdna_match.get_offset(position)
+                break
             else:
-                cdna_pos += exon_stop - exon_start
-        raise ValueError('Stop codon is not in any of the exons')
+                cdna_pos += stop - start
+        else:
+            raise ValueError('Stop codon is not in any of the exons')
+
+        return cdna_pos
 
     def cdna_to_genomic_coord(self, coord):
         """Convert a HGVS cDNA coordinate to a genomic coordinate."""
@@ -130,49 +137,49 @@ def cdna_to_genomic_coord(self, coord):
         # compute starting position along spliced transcript.
         if coord.landmark == CDNA_START_CODON:
             if coord.coord > 0:
-                pos = utr5p + coord.coord
+                cdna_pos = utr5p + coord.coord
             else:
-                pos = utr5p + coord.coord + 1
+                cdna_pos = utr5p + coord.coord + 1
         elif coord.landmark == CDNA_STOP_CODON:
             if coord.coord < 0:
                 raise ValueError('CDNACoord cannot have a negative coord and '
                                  'landmark CDNA_STOP_CODON')
-            pos = self.find_stop_codon(self.cds_position) + coord.coord
+            cdna_pos = self.find_stop_codon(self.cds_position) + coord.coord
         else:
             raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark)
 
-        # 5' flanking sequence.
-        if pos < 1:
+        # 5' flanking sequence (no need to account for gaps)
+        if cdna_pos < 1:
             if transcript_strand:
-                return self.tx_position.chrom_start + pos
+                return self.tx_position.chrom_start + cdna_pos
             else:
-                return self.tx_position.chrom_stop - pos + 1
+                return self.tx_position.chrom_stop - cdna_pos + 1
 
-        # Walk along transcript until we find an exon that contains pos.
+        # Walk along transcript until we find an exon that contains cdna_pos.
         cdna_start = 1
-        cdna_end = 1
-        for exon in self.ordered_exons:
-            exon_start = exon.tx_position.chrom_start + 1
-            exon_end = exon.tx_position.chrom_stop
-            cdna_end = cdna_start + (exon_end - exon_start)
-            if cdna_start <= pos <= cdna_end:
-                break
+        for cdna_match in self.ordered_cdna_match:
+            cdna_end = cdna_start + cdna_match.length - 1
+            if cdna_start <= cdna_pos <= cdna_end:
+                match_pos = cdna_pos - cdna_start
+                # TODO: I don't think this is right as we need to go in reverse through match on '-' strand??
+                match_pos -= cdna_match.get_offset(match_pos)
+                # Compute genomic coordinate using offset.
+                if transcript_strand:
+                    # Plus strand.
+                    start = cdna_match.tx_position.chrom_start + 1
+                    return start + match_pos + coord.offset
+                else:
+                    # Minus strand.
+                    end = cdna_match.tx_position.chrom_stop
+                    return end - match_pos - coord.offset
             cdna_start = cdna_end + 1
         else:
-            # 3' flanking sequence
+            # 3' flanking sequence (no need to account for gaps)
             if transcript_strand:
                 return self.cds_position.chrom_stop + coord.coord
             else:
                 return self.cds_position.chrom_start + 1 - coord.coord
 
-        # Compute genomic coordinate using offset.
-        if transcript_strand:
-            # Plus strand.
-            return exon_start + (pos - cdna_start) + coord.offset
-        else:
-            # Minus strand.
-            return exon_end - (pos - cdna_start) - coord.offset
-
     def genomic_to_cdna_coord(self, genomic_coord):
         """Convert a genomic coordinate to a cDNA coordinate and offset.
         """
@@ -361,28 +368,26 @@ def get_offset(self, position: int):
         if not self.gap:
             return 0
 
-        match_i = 0
+        position_1_based = position + 1
+        cdna_match_index = 1
         offset = 0
         for gap_op in self.gap.split():
             code = gap_op[0]
             length = int(gap_op[1:])
             if code == "M":
-                match_i += length
+                cdna_match_index += length
             elif code == "I":
-                if position <= match_i + length:
-                    raise ValueError("Coordinate inside insertion (%s) - no mapping possible!" % gap_op)
+                if position_1_based < cdna_match_index + length:
+                    raise ValueError("Coordinate (%d) inside insertion (%s) - no mapping possible!" % (position_1_based, gap_op))
                 offset += length
             elif code == "D":
-                if position <= match_i + length:
-                    raise ValueError("Coordinate inside deletion (%s) - no mapping possible!" % gap_op)
+                if position < cdna_match_index + length:
+                    raise ValueError("Coordinate (%d) inside deletion (%s) - no mapping possible!" % (position_1_based, gap_op))
                 offset -= length
             else:
                 raise ValueError(f"Unknown code in cDNA GAP: {gap_op}")
 
-            #if self.transcript.name == "NM_000016":
-            #    print(f"{position}: {code} {length} - {offset=}, {match_i=}")
-
-            if match_i >= position:
+            if cdna_match_index > position_1_based:
                 break
 
         return offset
diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py
index ed49b02..ec316e8 100644
--- a/pyhgvs/tests/test_cdna_match.py
+++ b/pyhgvs/tests/test_cdna_match.py
@@ -17,25 +17,40 @@ def test_cdna_to_genomic_coord():
     HGVS_GRCh37_COORDS = [
         # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert
         ("NM_015120.4:c.74A>T", 73613070),
-#        ("NM_015120.4:c.75G>T", None),  # No consistent alignment
-#        ("NM_015120.4:c.78A>T", 73613071),
-#        ("NM_015120.4:c.79G>C", 73613072),
+        ("NM_015120.4:c.75G>T", None),  # No consistent alignment
+        ("NM_015120.4:c.76G>T", None),  # No consistent alignment
+        ("NM_015120.4:c.77A>T", None),  # No consistent alignment
+        ("NM_015120.4:c.78A>T", 73613071),
+        ("NM_015120.4:c.79G>C", 73613072),
     ]
     transcript = get_transcript("NM_015120.4")
-    print(f"NM_015120.4 UTR = {transcript.get_utr5p_size()}")
     for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS:
         hgvs_name = HGVSName(hgvs_str)
         if expected_genomic_coord:
             genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start)
             nose.tools.assert_equal(genomic_coord, expected_genomic_coord)
         else:
-            pass
-            # nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
+            nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
 
 
 def test_cdna_to_genomic_coord_negative_strand():
-    # TODO:
-    pass
+    HGVS_GRCh37_COORDS = [
+        # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert
+        ("NM_001135649.3:c.300C>T", 88751754),
+        ("NM_001135649.3:c.301A>C", 88751753),
+        ("NM_001135649.3:c.302C>T", 88751752),
+        ("NM_001135649.3:c.303C>T", None),  # N/A GRCh37 - 1 bp insert
+        ("NM_001135649.3:c.304T>C", 88751751),
+        ("NM_001135649.3:c.305T>C", 88751750),
+    ]
+    transcript = get_transcript("NM_001135649.3")
+    for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS:
+        hgvs_name = HGVSName(hgvs_str)
+        if expected_genomic_coord:
+            genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start)
+            nose.tools.assert_equal(genomic_coord, expected_genomic_coord)
+        else:
+            nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
 
 
 _transcripts = {
@@ -66,6 +81,18 @@ def test_cdna_to_genomic_coord_negative_strand():
                        [73826527, 73826648, 11662, 11782, None], [73827804, 73828008, 11783, 11986, None],
                        [73828321, 73828563, 11987, 12228, None], [73829311, 73829495, 12229, 12412, None],
                        [73830367, 73830431, 12413, 12476, None], [73835601, 73835701, 12477, 12576, None],
-                       [73836694, 73837046, 12577, 12928, None]]}
-
+                       [73836694, 73837046, 12577, 12928, None]]
+    },
+    "NM_001135649.3": {
+        "id": "NM_001135649.3",
+        "gene_name": "FOXI3",
+        "end": 88752211,
+        "chrom": "NC_000002.11",
+        "exons": [[88746305, 88748348], [88751414, 88751751], [88751753, 88752211]],
+        "start": 88746305,
+        "strand": "-",
+        "cds_end": 88752053,
+        "cds_start": 88747725,
+        "cdna_match": [[88746305, 88748348, 799, 2841, None], [88751414, 88752211, 1, 798, "M460 I1 M337"]]
+    }
 }
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index f691a71..601baa8 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -131,12 +131,12 @@ def json_perfect_exons_to_cdna_match(ordered_exons, single=False):
             last_exon_end = exon_end
         cdna_match = [[start, end, 1, cdna_length, " ".join(gap_list)]]
     else:
-        cdna_end = 0
+        cdna_start = 1
         for (exon_start, exon_end) in ordered_exons:
-            cdna_start = cdna_end + 1
             exon_length = exon_end - exon_start
             cdna_end = cdna_start + exon_length - 1
             cdna_match.append([exon_start, exon_end, cdna_start, cdna_end, None])
+            cdna_start = cdna_end + 1
     return cdna_match
 
 

From ae264c177f87481d375309ce7422b48b2a2b7be4 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Wed, 13 Oct 2021 17:03:47 +1030
Subject: [PATCH 11/47] Handle gaps - put back exons as we need them to work
 out flanking

---
 pyhgvs/models/transcript.py     | 36 +++++++++--------
 pyhgvs/tests/test_cdna_match.py | 68 ++++++++++++++++++++++-----------
 pyhgvs/utils.py                 | 26 ++++++++-----
 3 files changed, 82 insertions(+), 48 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 5bddbb9..82de220 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -19,22 +19,21 @@ def __init__(self, name):
 class Transcript(object):
     """
     A gene may have multiple transcripts with different combinations of exons.
+
+    We need both exons and cdna_match as need to know exact exon boundaries to work out flanking
     """
 
     def __init__(self, name, version, gene, tx_position, cds_position,
-                 is_default=False, cdna_match=None):
+                 is_default=False, exons=None, cdna_match=None):
         self.name = name
         self.version = version
         self.gene = Gene(gene)
         self.tx_position = tx_position
         self.cds_position = cds_position
         self.is_default = is_default
+        self.exons = exons or []
         self.cdna_match = cdna_match or []
 
-    @lazy
-    def exons(self):
-        return self.cdna_match
-
     @property
     def full_name(self):
         if self.version is not None:
@@ -244,7 +243,7 @@ def genomic_to_cdna_coord(self, genomic_coord):
         # Adjust coordinates for coding transcript.
         if self.is_coding:
             # Detect if position before start codon.
-            utr5p = self.get_utr5p_size() if self.is_coding else 0
+            utr5p = self.get_utr5p_size()
             cdna_coord.coord -= utr5p
             if cdna_coord.coord <= 0:
                 cdna_coord.coord -= 1
@@ -296,20 +295,13 @@ def distance(self, offset):
             return -end_distance
 
 
-class CDNA_Match(object):
-    """ An exon is a special case of a cDNA match which has 0 gaps """
-    def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number):
+class Exon(object):
+    """ We still need exons to work out the flanking boundaries """
+    def __init__(self, transcript, tx_position, number):
         self.transcript = transcript
         self.tx_position = tx_position
-        self.cdna_start = cdna_start
-        self.cdna_end = cdna_end
-        self.gap = gap
         self.number = number
 
-    @property
-    def length(self):
-        return self.cdna_end - self.cdna_start + 1
-
     @property
     def name(self):
         return "%s.%d" % (self.transcript.name, self.number)
@@ -354,6 +346,18 @@ def strand(self):
         strand = '+' if self.tx_position.is_forward_strand else '-'
         return strand
 
+
+class CDNA_Match(Exon):
+    def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number):
+        super(CDNA_Match, self).__init__(transcript, tx_position, number)
+        self.cdna_start = cdna_start
+        self.cdna_end = cdna_end
+        self.gap = gap
+
+    @property
+    def length(self):
+        return self.cdna_end - self.cdna_start + 1
+
     def get_offset(self, position: int):
         """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length
             @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute
diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py
index ec316e8..7e35ea5 100644
--- a/pyhgvs/tests/test_cdna_match.py
+++ b/pyhgvs/tests/test_cdna_match.py
@@ -7,24 +7,30 @@
 
 from pyhgvs.utils import make_transcript
 
+_HGVS_NM_015120_GRCh37_COORDS = [
+    # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert
+    ("NM_015120.4:c.74A>T", 73613070),
+    ("NM_015120.4:c.75G>T", None),  # No consistent alignment
+    ("NM_015120.4:c.76G>T", None),  # No consistent alignment
+    ("NM_015120.4:c.77A>T", None),  # No consistent alignment
+    ("NM_015120.4:c.78A>T", 73613071),
+    ("NM_015120.4:c.79G>C", 73613072),
+]
 
-def get_transcript(accession):
-    transcript_json = _transcripts[accession]
-    return make_transcript(transcript_json)
+_HGVS_NM_001135649_GRCh37_COORDS = [
+    # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert
+    ("NM_001135649.3:c.300C>T", 88751754),
+    ("NM_001135649.3:c.301A>C", 88751753),
+    ("NM_001135649.3:c.302C>T", 88751752),
+    ("NM_001135649.3:c.303C>T", None),  # N/A GRCh37 - 1 bp insert
+    ("NM_001135649.3:c.304T>C", 88751751),
+    ("NM_001135649.3:c.305T>C", 88751750),
+]
 
 
 def test_cdna_to_genomic_coord():
-    HGVS_GRCh37_COORDS = [
-        # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert
-        ("NM_015120.4:c.74A>T", 73613070),
-        ("NM_015120.4:c.75G>T", None),  # No consistent alignment
-        ("NM_015120.4:c.76G>T", None),  # No consistent alignment
-        ("NM_015120.4:c.77A>T", None),  # No consistent alignment
-        ("NM_015120.4:c.78A>T", 73613071),
-        ("NM_015120.4:c.79G>C", 73613072),
-    ]
     transcript = get_transcript("NM_015120.4")
-    for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS:
+    for hgvs_str, expected_genomic_coord in _HGVS_NM_015120_GRCh37_COORDS:
         hgvs_name = HGVSName(hgvs_str)
         if expected_genomic_coord:
             genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start)
@@ -33,18 +39,19 @@ def test_cdna_to_genomic_coord():
             nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
 
 
+@nose.SkipTest
+def test_genomic_to_cdna_coord():
+    transcript = get_transcript("NM_015120.4")
+    for hgvs_str, genomic_coord in _HGVS_NM_015120_GRCh37_COORDS:
+        hgvs_name = HGVSName(hgvs_str)
+        if genomic_coord:
+            cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord)
+            nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start)
+
+
 def test_cdna_to_genomic_coord_negative_strand():
-    HGVS_GRCh37_COORDS = [
-        # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert
-        ("NM_001135649.3:c.300C>T", 88751754),
-        ("NM_001135649.3:c.301A>C", 88751753),
-        ("NM_001135649.3:c.302C>T", 88751752),
-        ("NM_001135649.3:c.303C>T", None),  # N/A GRCh37 - 1 bp insert
-        ("NM_001135649.3:c.304T>C", 88751751),
-        ("NM_001135649.3:c.305T>C", 88751750),
-    ]
     transcript = get_transcript("NM_001135649.3")
-    for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS:
+    for hgvs_str, expected_genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS:
         hgvs_name = HGVSName(hgvs_str)
         if expected_genomic_coord:
             genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start)
@@ -53,6 +60,21 @@ def test_cdna_to_genomic_coord_negative_strand():
             nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
 
 
+@nose.SkipTest
+def test_genomic_to_cdna_coord_negative_strand():
+    transcript = get_transcript("NM_001135649.3")
+    for hgvs_str, genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS:
+        hgvs_name = HGVSName(hgvs_str)
+        if genomic_coord:
+            cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord)
+            nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start)
+
+
+def get_transcript(accession):
+    transcript_json = _transcripts[accession]
+    return make_transcript(transcript_json)
+
+
 _transcripts = {
     # GRCh37
     "NM_015120.4": {
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index 601baa8..4a4f298 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -6,7 +6,7 @@
 from __future__ import unicode_literals
 
 from .models.variants import Position
-from .models.transcript import Transcript, CDNA_Match
+from .models.transcript import Transcript, CDNA_Match, Exon
 
 
 def read_refgene(infile):
@@ -85,16 +85,24 @@ def make_transcript(transcript_json):
             transcript_json['cds_end'],
             transcript_json['strand'] == '+'))
 
-    cdna_match = transcript_json.get('cdna_match')
-    if cdna_match:
-        if not transcript.tx_position.is_forward_strand:
-            cdna_match = reversed(cdna_match)
-    else:
-        exons = transcript_json['exons']
-        if not transcript.tx_position.is_forward_strand:
-            exons = reversed(exons)
+    exons = transcript_json['exons']
+    cdna_match = transcript_json.get('cdna_match', [])
+    if not transcript.tx_position.is_forward_strand:
+        exons.reverse()
+        cdna_match.reverse()
+
+    if not cdna_match:
         cdna_match = json_perfect_exons_to_cdna_match(exons)  # Only use single=True once ALL has been implemented
 
+    for number, (exon_start, exon_end) in enumerate(exons, 1):
+        transcript.exons.append(Exon(transcript=transcript,
+                                     tx_position=Position(
+                                         transcript_json['chrom'],
+                                         exon_start,
+                                         exon_end,
+                                         transcript_json['strand'] == '+'),
+                                     number=number))
+
     for number, (exon_start, exon_end, cdna_start, cdna_end, gap) in enumerate(cdna_match, 1):
         transcript.cdna_match.append(CDNA_Match(transcript=transcript,
                                                 tx_position=Position(

From 5a4b48b2fccfea2c5fd9b64e12a6465009059730 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Thu, 14 Oct 2021 11:49:19 +1030
Subject: [PATCH 12/47] checkpoint - handle flanking

---
 pyhgvs/models/transcript.py | 78 +++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 82de220..089e79d 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -117,7 +117,6 @@ def find_stop_codon(self, cds_position):
                     position = stop_pos - start
                 else:
                     position = stop - stop_pos
-                # TODO: I don't think this is right as we need to go in reverse through match on '-' strand??
                 cdna_pos += position + cdna_match.get_offset(position)
                 break
             else:
@@ -160,7 +159,6 @@ def cdna_to_genomic_coord(self, coord):
             cdna_end = cdna_start + cdna_match.length - 1
             if cdna_start <= cdna_pos <= cdna_end:
                 match_pos = cdna_pos - cdna_start
-                # TODO: I don't think this is right as we need to go in reverse through match on '-' strand??
                 match_pos -= cdna_match.get_offset(match_pos)
                 # Compute genomic coordinate using offset.
                 if transcript_strand:
@@ -189,56 +187,40 @@ def genomic_to_cdna_coord(self, genomic_coord):
             return None
 
         strand = self.strand
-
-        if strand == "+":
-            exons.sort(key=lambda exon: exon.chrom_start)
-        else:
-            exons.sort(key=lambda exon: -exon.chrom_end)
-
         distances = [exon.distance(genomic_coord)
                      for exon in exons]
         min_distance_to_exon = min(map(abs, distances))
+        if min_distance_to_exon:
+            # We're outside of exon - so need to find closest point
+            for exon in exons:
+                distance = exon.distance(genomic_coord)
+                if abs(distance) == min_distance_to_exon:
+                    # print(f"{exon}.distance({genomic_coord}) = {distance}")
 
-        coding_offset = 0
-        for exon in exons:
-            exon_length = exon.chrom_end - exon.chrom_start
-            distance = exon.distance(genomic_coord)
-            if abs(distance) == min_distance_to_exon:
-                if strand == "+":
-                    exon_start_cds_offset = coding_offset + 1
-                    exon_end_cds_offset = coding_offset + exon_length
-                else:
-                    exon_start_cds_offset = coding_offset + exon_length
-                    exon_end_cds_offset = coding_offset + 1
-                # This is the exon we want to annotate against.
-                if distance == 0:
-                    # Inside the exon.
-                    if strand == "+":
-                        coord = (exon_start_cds_offset +
-                                 (genomic_coord -
-                                  (exon.chrom_start + 1)))
-                    else:
-                        coord = (exon_end_cds_offset +
-                                 (exon.chrom_end -
-                                  genomic_coord))
-                    cdna_coord = CDNACoord(coord, 0)
-                else:
                     # Outside the exon.
                     if distance > 0:
-                        nearest_exonic = exon_start_cds_offset
+                        genomic_nearest_exon = exon.chrom_start + 1
                     else:
-                        nearest_exonic = exon_end_cds_offset
+                        genomic_nearest_exon = exon.chrom_end
+
                     if strand == "+":
                         distance *= -1
 
+                    nearest_exon_coord = self._exon_genomic_to_cdna_coord(genomic_nearest_exon)
+
                     # If outside transcript, don't use offset.
                     if (genomic_coord < self.tx_position.chrom_start + 1 or
                             genomic_coord > self.tx_position.chrom_stop):
-                        nearest_exonic += distance
+                        print("OUTSIDE TRANSCRIPT - DON'T USE OFFSET")
+                        nearest_exon_coord += distance
                         distance = 0
-                    cdna_coord = CDNACoord(nearest_exonic, distance)
-                break
-            coding_offset += exon_length
+                    cdna_coord = CDNACoord(nearest_exon_coord, distance)
+                    break
+            else:
+                raise ValueError("Could not find closest exon!") # Should never happen
+        else:
+            coord = self._exon_genomic_to_cdna_coord(genomic_coord)
+            cdna_coord = CDNACoord(coord, 0)
 
         # Adjust coordinates for coding transcript.
         if self.is_coding:
@@ -253,11 +235,31 @@ def genomic_to_cdna_coord(self, genomic_coord):
                 stop_codon -= utr5p
                 if (cdna_coord.coord > stop_codon or
                         cdna_coord.coord == stop_codon and cdna_coord.offset > 0):
+                    print(f"ADJUSTING - 5' UTR size: {utr5p} Stop codon={stop_codon}")
                     cdna_coord.coord -= stop_codon
                     cdna_coord.landmark = CDNA_STOP_CODON
 
         return cdna_coord
 
+    def _exon_genomic_to_cdna_coord(self, genomic_coord):
+        coding_offset = 0
+        for cdna_match in self.ordered_cdna_match:
+            exon_length = cdna_match.tx_position.chrom_stop - cdna_match.tx_position.chrom_start
+            cds_start = coding_offset + 1
+            cds_end = coding_offset + exon_length
+
+            # Inside the exon.
+            if cdna_match.tx_position.chrom_start <= genomic_coord <= cdna_match.tx_position.chrom_stop:
+                #print(f"{genomic_coord} INSIDE EXON: {exon}")
+                if self.strand == "+":
+                    position = genomic_coord - (cdna_match.tx_position.chrom_start + 1)
+                else:
+                    position = cdna_match.tx_position.chrom_stop - genomic_coord
+                return cds_start + position  #  - cdna_match.get_offset(position)
+            coding_offset += exon_length
+        else:
+            raise ValueError(f"Couldn't find {genomic_coord=}!")
+
 
 BED6Interval_base = namedtuple(
     "BED6Interval_base", (

From 5667511f5f45f6eea4448d292fd878fa59131988 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 15 Oct 2021 10:02:23 +1030
Subject: [PATCH 13/47] Finish off gap handling code

---
 pyhgvs/models/transcript.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 089e79d..8cd2e10 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -129,11 +129,12 @@ def find_stop_codon(self, cds_position):
     def cdna_to_genomic_coord(self, coord):
         """Convert a HGVS cDNA coordinate to a genomic coordinate."""
         transcript_strand = self.tx_position.is_forward_strand
-        utr5p = (self.get_utr5p_size()
-                 if self.is_coding else 0)
 
         # compute starting position along spliced transcript.
         if coord.landmark == CDNA_START_CODON:
+            utr5p = (self.get_utr5p_size()
+                     if self.is_coding else 0)
+
             if coord.coord > 0:
                 cdna_pos = utr5p + coord.coord
             else:
@@ -178,8 +179,7 @@ def cdna_to_genomic_coord(self, coord):
                 return self.cds_position.chrom_start + 1 - coord.coord
 
     def genomic_to_cdna_coord(self, genomic_coord):
-        """Convert a genomic coordinate to a cDNA coordinate and offset.
-        """
+        """ Convert a genomic coordinate to a cDNA coordinate and offset. """
         exons = [exon.get_as_interval()
                  for exon in self.ordered_exons]
 
@@ -195,7 +195,6 @@ def genomic_to_cdna_coord(self, genomic_coord):
             for exon in exons:
                 distance = exon.distance(genomic_coord)
                 if abs(distance) == min_distance_to_exon:
-                    # print(f"{exon}.distance({genomic_coord}) = {distance}")
 
                     # Outside the exon.
                     if distance > 0:
@@ -211,7 +210,6 @@ def genomic_to_cdna_coord(self, genomic_coord):
                     # If outside transcript, don't use offset.
                     if (genomic_coord < self.tx_position.chrom_start + 1 or
                             genomic_coord > self.tx_position.chrom_stop):
-                        print("OUTSIDE TRANSCRIPT - DON'T USE OFFSET")
                         nearest_exon_coord += distance
                         distance = 0
                     cdna_coord = CDNACoord(nearest_exon_coord, distance)
@@ -235,7 +233,6 @@ def genomic_to_cdna_coord(self, genomic_coord):
                 stop_codon -= utr5p
                 if (cdna_coord.coord > stop_codon or
                         cdna_coord.coord == stop_codon and cdna_coord.offset > 0):
-                    print(f"ADJUSTING - 5' UTR size: {utr5p} Stop codon={stop_codon}")
                     cdna_coord.coord -= stop_codon
                     cdna_coord.landmark = CDNA_STOP_CODON
 
@@ -244,19 +241,16 @@ def genomic_to_cdna_coord(self, genomic_coord):
     def _exon_genomic_to_cdna_coord(self, genomic_coord):
         coding_offset = 0
         for cdna_match in self.ordered_cdna_match:
-            exon_length = cdna_match.tx_position.chrom_stop - cdna_match.tx_position.chrom_start
             cds_start = coding_offset + 1
-            cds_end = coding_offset + exon_length
 
             # Inside the exon.
             if cdna_match.tx_position.chrom_start <= genomic_coord <= cdna_match.tx_position.chrom_stop:
-                #print(f"{genomic_coord} INSIDE EXON: {exon}")
                 if self.strand == "+":
                     position = genomic_coord - (cdna_match.tx_position.chrom_start + 1)
                 else:
                     position = cdna_match.tx_position.chrom_stop - genomic_coord
-                return cds_start + position  #  - cdna_match.get_offset(position)
-            coding_offset += exon_length
+                return cds_start + position - cdna_match.get_offset(position)
+            coding_offset += cdna_match.length
         else:
             raise ValueError(f"Couldn't find {genomic_coord=}!")
 

From c1f2099e99f28ff42146a3d6340a0c90c7088895 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 15 Oct 2021 10:29:43 +1030
Subject: [PATCH 14/47] linting

---
 pyhgvs/models/hgvs_name.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
index efe9471..cc20682 100644
--- a/pyhgvs/models/hgvs_name.py
+++ b/pyhgvs/models/hgvs_name.py
@@ -256,6 +256,7 @@ def get_refseq_type(name):
     prefix = name[:3]
     return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0]
 
+
 class InvalidHGVSName(ValueError):
     def __init__(self, name='', part='name', reason=''):
         if name:
@@ -344,7 +345,7 @@ def parse_prefix(self, prefix, kind):
 
         # Transcript and gene given with parens.
         # example: NM_007294.3(BRCA1):c.2207A>C
-        match = re.match("^(?P<transcript>[^(]+)\((?P<gene>[^)]+)\)$", prefix)
+        match = re.match(r"^(?P<transcript>[^(]+)\((?P<gene>[^)]+)\)$", prefix)
         if match:
             self.transcript = match.group('transcript')
             self.gene = match.group('gene')
@@ -352,7 +353,7 @@ def parse_prefix(self, prefix, kind):
 
         # Transcript and gene given with braces.
         # example: BRCA1{NM_007294.3}:c.2207A>C
-        match = re.match("^(?P<gene>[^{]+)\{(?P<transcript>[^}]+)\}$", prefix)
+        match = re.match(r"^(?P<gene>[^{]+){(?P<transcript>[^}]+)}$", prefix)
         if match:
             self.transcript = match.group('transcript')
             self.gene = match.group('gene')

From dc6d638a761cf676e67cfdbbff2ab95fdbf63246 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 15 Oct 2021 10:31:19 +1030
Subject: [PATCH 15/47] Keep module interface the same

---
 pyhgvs/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index b3004b2..b092787 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -15,7 +15,7 @@
 
 from .models.cdna import CDNACoord
 from .models.genome import GenomeSubset
-from .models.hgvs_name import HGVSName
+from .models.hgvs_name import HGVSName, InvalidHGVSName
 from .models.variants import justify_indel, normalize_variant, revcomp
 
 

From 0d6ddd56b089a4daedcacddb11c96172007c0cc7 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 15 Oct 2021 10:38:33 +1030
Subject: [PATCH 16/47] Better errors, note about not using single=True

---
 pyhgvs/models/transcript.py | 2 +-
 pyhgvs/utils.py             | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 8cd2e10..d04f917 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -96,7 +96,7 @@ def get_utr5p_size(self):
                 break
             cdna_len += cdna_match.length
         else:
-            raise ValueError("transcript contains no cdna_match")
+            raise ValueError(f"Couldn't find start_codon in cdna_match: {self.ordered_cdna_match}")
 
         return cdna_len
 
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index 4a4f298..ced1e10 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -92,7 +92,7 @@ def make_transcript(transcript_json):
         cdna_match.reverse()
 
     if not cdna_match:
-        cdna_match = json_perfect_exons_to_cdna_match(exons)  # Only use single=True once ALL has been implemented
+        cdna_match = json_perfect_exons_to_cdna_match(exons)
 
     for number, (exon_start, exon_end) in enumerate(exons, 1):
         transcript.exons.append(Exon(transcript=transcript,
@@ -119,7 +119,9 @@ def make_transcript(transcript_json):
 
 
 def json_perfect_exons_to_cdna_match(ordered_exons, single=False):
-    """ Perfectly matched exons are basically a no-gap case of cDNA match """
+    """ Perfectly matched exons are basically a no-gap case of cDNA match
+        single - use a single cDNA match (deletions for introns) - this is currently broken do not use
+    """
     cdna_match = []
     if single:
         ordered_exons = list(ordered_exons)

From 79fa49b380b8e189fa95ff389c5a23ca6d2d4a29 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 15 Oct 2021 12:17:59 +1030
Subject: [PATCH 17/47] Keep module interface the same

---
 pyhgvs/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index b092787..70a9f37 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -13,7 +13,7 @@
 
 import re
 
-from .models.cdna import CDNACoord
+from .models.cdna import CDNACoord, CDNA_START_CODON, CDNA_STOP_CODON
 from .models.genome import GenomeSubset
 from .models.hgvs_name import HGVSName, InvalidHGVSName
 from .models.variants import justify_indel, normalize_variant, revcomp

From 2145f96c05d12ebf99ee10f6329ec8bcedf1551e Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 15 Oct 2021 16:58:00 +1030
Subject: [PATCH 18/47] Don't die in gaps when going the other way. Change
 testing data

---
 pyhgvs/models/transcript.py     | 11 ++++----
 pyhgvs/tests/test_cdna_match.py | 49 +++++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index d04f917..4207f00 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -215,7 +215,7 @@ def genomic_to_cdna_coord(self, genomic_coord):
                     cdna_coord = CDNACoord(nearest_exon_coord, distance)
                     break
             else:
-                raise ValueError("Could not find closest exon!") # Should never happen
+                raise ValueError("Could not find closest exon!")  # Should never happen
         else:
             coord = self._exon_genomic_to_cdna_coord(genomic_coord)
             cdna_coord = CDNACoord(coord, 0)
@@ -249,7 +249,8 @@ def _exon_genomic_to_cdna_coord(self, genomic_coord):
                     position = genomic_coord - (cdna_match.tx_position.chrom_start + 1)
                 else:
                     position = cdna_match.tx_position.chrom_stop - genomic_coord
-                return cds_start + position - cdna_match.get_offset(position)
+                offset = cdna_match.get_offset(position, validate=False)
+                return cds_start + position + offset
             coding_offset += cdna_match.length
         else:
             raise ValueError(f"Couldn't find {genomic_coord=}!")
@@ -354,7 +355,7 @@ def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number):
     def length(self):
         return self.cdna_end - self.cdna_start + 1
 
-    def get_offset(self, position: int):
+    def get_offset(self, position: int, validate=True):
         """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length
             @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute
             codes operation
@@ -377,11 +378,11 @@ def get_offset(self, position: int):
             if code == "M":
                 cdna_match_index += length
             elif code == "I":
-                if position_1_based < cdna_match_index + length:
+                if validate and position_1_based < cdna_match_index + length:
                     raise ValueError("Coordinate (%d) inside insertion (%s) - no mapping possible!" % (position_1_based, gap_op))
                 offset += length
             elif code == "D":
-                if position < cdna_match_index + length:
+                if validate and position < cdna_match_index + length:
                     raise ValueError("Coordinate (%d) inside deletion (%s) - no mapping possible!" % (position_1_based, gap_op))
                 offset -= length
             else:
diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py
index 7e35ea5..883b867 100644
--- a/pyhgvs/tests/test_cdna_match.py
+++ b/pyhgvs/tests/test_cdna_match.py
@@ -18,7 +18,7 @@
 ]
 
 _HGVS_NM_001135649_GRCh37_COORDS = [
-    # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert
+    # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.302 then 1 base insert
     ("NM_001135649.3:c.300C>T", 88751754),
     ("NM_001135649.3:c.301A>C", 88751753),
     ("NM_001135649.3:c.302C>T", 88751752),
@@ -27,6 +27,13 @@
     ("NM_001135649.3:c.305T>C", 88751750),
 ]
 
+_HGVS_NM_001012755_GRCh37_COORDS = [
+    # 133, 6207, "M1574 I1 M4500 - 5'UTR length is 163 so last match is transcript pos (1574+132) ie c.1543 then 1bp ins
+    ("NM_001012755.5:c.*619G>C", 103348398),  # NM_001012755.5:c.1543G>C
+    ("NM_001012755.5:c.*620G>C", None),       # NM_001012755.5:c.1544G>C - No GRCh37
+    ("NM_001012755.5:c.*621C>G", 103348397),  # NM_001012755.5:c.1545G>C
+]
+
 
 def test_cdna_to_genomic_coord():
     transcript = get_transcript("NM_015120.4")
@@ -39,7 +46,6 @@ def test_cdna_to_genomic_coord():
             nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
 
 
-@nose.SkipTest
 def test_genomic_to_cdna_coord():
     transcript = get_transcript("NM_015120.4")
     for hgvs_str, genomic_coord in _HGVS_NM_015120_GRCh37_COORDS:
@@ -60,7 +66,19 @@ def test_cdna_to_genomic_coord_negative_strand():
             nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
 
 
-@nose.SkipTest
+def test_cdna_to_genomic_coord_negative_strand2():
+    transcript = get_transcript("NM_001012755.5")
+    for hgvs_str, expected_genomic_coord in _HGVS_NM_001012755_GRCh37_COORDS:
+        hgvs_name = HGVSName(hgvs_str)
+        if expected_genomic_coord:
+            genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start)
+            nose.tools.assert_equal(genomic_coord, expected_genomic_coord)
+        else:
+            nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
+
+
+
+@nose.SkipTest  # Currently fails
 def test_genomic_to_cdna_coord_negative_strand():
     transcript = get_transcript("NM_001135649.3")
     for hgvs_str, genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS:
@@ -70,6 +88,17 @@ def test_genomic_to_cdna_coord_negative_strand():
             nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start)
 
 
+def test_genomic_to_cdna_coord_negative_strand2():
+    transcript = get_transcript("NM_001012755.5")
+
+    for hgvs_str, genomic_coord in _HGVS_NM_001012755_GRCh37_COORDS:
+        hgvs_name = HGVSName(hgvs_str)
+        if genomic_coord:
+            cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord)
+            nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start)
+
+
+
 def get_transcript(accession):
     transcript_json = _transcripts[accession]
     return make_transcript(transcript_json)
@@ -106,6 +135,8 @@ def get_transcript(accession):
                        [73836694, 73837046, 12577, 12928, None]]
     },
     "NM_001135649.3": {
+        # This is a strange case - there are 3 exons, but 2 cDNA match entries
+        # I am not sure if this is an error, ie where are the splice sites?
         "id": "NM_001135649.3",
         "gene_name": "FOXI3",
         "end": 88752211,
@@ -116,5 +147,17 @@ def get_transcript(accession):
         "cds_end": 88752053,
         "cds_start": 88747725,
         "cdna_match": [[88746305, 88748348, 799, 2841, None], [88751414, 88752211, 1, 798, "M460 I1 M337"]]
+    },
+    "NM_001012755.5": {
+        "id": "NM_001012755.5",
+        "gene_name": "SLC25A53",
+        "end": 103401690,
+        "chrom": "NC_000023.10",
+        "exons": [[103343897, 103349971], [103401558, 103401690]],
+        "start": 103343897,
+        "strand": "-",
+        "cds_end": 103349940,
+        "cds_start": 103349016,
+        "cdna_match": [[103343897, 103349971, 133, 6207, "M1574 I1 M4500"], [103401558, 103401690, 1, 132, None]]
     }
 }

From d58789b72ba585f2a451c6d391d73246c83513ae Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 15 Oct 2021 17:08:26 +1030
Subject: [PATCH 19/47] Always use cDNA as exons, to handle those strange cases
 like eg NM_001135649.3

---
 pyhgvs/models/transcript.py     | 15 +--------------
 pyhgvs/tests/test_cdna_match.py |  6 ++----
 pyhgvs/utils.py                 | 11 ++---------
 3 files changed, 5 insertions(+), 27 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 4207f00..cdee8a4 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -61,19 +61,6 @@ def ordered_cdna_match(self):
             cdna_match.reverse()
         return cdna_match
 
-    @lazy
-    def ordered_exons(self):
-        """Yield exons in coding order."""
-        transcript_strand = self.tx_position.is_forward_strand
-        if hasattr(self.exons, 'select_related'):
-            exons = list(self.exons.select_related('tx_position'))
-        else:
-            exons = list(self.exons)
-        exons.sort(key=lambda exon: exon.tx_position.chrom_start)
-        if not transcript_strand:
-            exons.reverse()
-        return exons
-
     def get_utr5p_size(self):
         """Return the size of the 5prime UTR of a transcript."""
 
@@ -181,7 +168,7 @@ def cdna_to_genomic_coord(self, coord):
     def genomic_to_cdna_coord(self, genomic_coord):
         """ Convert a genomic coordinate to a cDNA coordinate and offset. """
         exons = [exon.get_as_interval()
-                 for exon in self.ordered_exons]
+                 for exon in self.ordered_cdna_match]
 
         if len(exons) == 0:
             return None
diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py
index 883b867..bbaf17f 100644
--- a/pyhgvs/tests/test_cdna_match.py
+++ b/pyhgvs/tests/test_cdna_match.py
@@ -77,8 +77,6 @@ def test_cdna_to_genomic_coord_negative_strand2():
             nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start)
 
 
-
-@nose.SkipTest  # Currently fails
 def test_genomic_to_cdna_coord_negative_strand():
     transcript = get_transcript("NM_001135649.3")
     for hgvs_str, genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS:
@@ -98,7 +96,6 @@ def test_genomic_to_cdna_coord_negative_strand2():
             nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start)
 
 
-
 def get_transcript(accession):
     transcript_json = _transcripts[accession]
     return make_transcript(transcript_json)
@@ -136,7 +133,8 @@ def get_transcript(accession):
     },
     "NM_001135649.3": {
         # This is a strange case - there are 3 exons, but 2 cDNA match entries
-        # I am not sure if this is an error, ie where are the splice sites?
+        # I think the exons are in error, as eg where is the splice site?
+        # So - we'll just use the cDNA as exons
         "id": "NM_001135649.3",
         "gene_name": "FOXI3",
         "end": 88752211,
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index ced1e10..185f49f 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -91,18 +91,11 @@ def make_transcript(transcript_json):
         exons.reverse()
         cdna_match.reverse()
 
+    # We don't use exons, but run everything through cDNA match so there's just 1 path
+    # exons are treated as a perfect cDNA match
     if not cdna_match:
         cdna_match = json_perfect_exons_to_cdna_match(exons)
 
-    for number, (exon_start, exon_end) in enumerate(exons, 1):
-        transcript.exons.append(Exon(transcript=transcript,
-                                     tx_position=Position(
-                                         transcript_json['chrom'],
-                                         exon_start,
-                                         exon_end,
-                                         transcript_json['strand'] == '+'),
-                                     number=number))
-
     for number, (exon_start, exon_end, cdna_start, cdna_end, gap) in enumerate(cdna_match, 1):
         transcript.cdna_match.append(CDNA_Match(transcript=transcript,
                                                 tx_position=Position(

From 2dbd69ddf0df58eef6b03b8abddcefd7739ca72d Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Mon, 18 Oct 2021 13:43:21 +1030
Subject: [PATCH 20/47] Make sure to include models in package

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6ed2380..c468888 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def main():
         long_description=description,
         author='Matt Rasmussen',
         author_email='rasmus@counsyl.com',
-        packages=[str('pyhgvs'), str('pyhgvs.tests')],
+        packages=[str('pyhgvs'), str('pyhgvs.models'), str('pyhgvs.tests')],
         include_package_data=True,
         scripts=[],
         install_requires=['pip>=1.2'],

From 4271aa7680182dd57356e17d4a04f9b1df59bdfb Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 22 Oct 2021 14:03:28 +1030
Subject: [PATCH 21/47] Better error message

---
 pyhgvs/models/transcript.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index cdee8a4..89018c9 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -83,7 +83,13 @@ def get_utr5p_size(self):
                 break
             cdna_len += cdna_match.length
         else:
-            raise ValueError(f"Couldn't find start_codon in cdna_match: {self.ordered_cdna_match}")
+            cdna_matches = []
+            for cdna_match in self.ordered_cdna_match:
+                start = cdna_match.tx_position.chrom_start
+                end = cdna_match.tx_position.chrom_stop
+                cdna_matches.append(f"{start}-{end}")
+            cdna_match_summary = ", ".join(cdna_matches)
+            raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary))
 
         return cdna_len
 

From cc3de4d3db843182d1a190a8e74103b916592622 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Fri, 22 Oct 2021 16:24:54 +1030
Subject: [PATCH 22/47] Data generation scripts

---
 .../ensembl_gene_annotation_grch37.sh         |  28 ++++
 .../ensembl_gene_annotation_grch38.sh         |  37 +++++
 .../pyreference_to_pyhgvs_json.py             | 131 ++++++++++++++++++
 .../refseq_gene_annotation_grch37.sh          |  75 ++++++++++
 .../refseq_gene_annotation_grch38.sh          |  90 ++++++++++++
 5 files changed, 361 insertions(+)
 create mode 100755 generate_transcript_data/ensembl_gene_annotation_grch37.sh
 create mode 100755 generate_transcript_data/ensembl_gene_annotation_grch38.sh
 create mode 100644 generate_transcript_data/pyreference_to_pyhgvs_json.py
 create mode 100755 generate_transcript_data/refseq_gene_annotation_grch37.sh
 create mode 100755 generate_transcript_data/refseq_gene_annotation_grch38.sh

diff --git a/generate_transcript_data/ensembl_gene_annotation_grch37.sh b/generate_transcript_data/ensembl_gene_annotation_grch37.sh
new file mode 100755
index 0000000..97ee668
--- /dev/null
+++ b/generate_transcript_data/ensembl_gene_annotation_grch37.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them
+
+#82 is first GFF3 for GRCh37
+#83 has no data
+#84 is 82 again
+#86 is 85 again
+pyreference_args=()
+for release in 82 85 87; do
+  filename=Homo_sapiens.GRCh37.${release}.gff3.gz
+  url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename}
+  pyreference_file=${filename}.json.gz
+  if [[ ! -e ${filename} ]]; then
+    wget ${url}
+  fi
+  if [[ ! -e ${pyreference_file} ]]; then
+    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+  fi
+  pyreference_args+=(--pyreference-json ${pyreference_file})
+done
+
+merged_file="pyhgvs_transcripts_ensembl_grch37.json.gz"
+if [[ ! -e ${merged_file} ]]; then
+  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
+
+  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
+fi
diff --git a/generate_transcript_data/ensembl_gene_annotation_grch38.sh b/generate_transcript_data/ensembl_gene_annotation_grch38.sh
new file mode 100755
index 0000000..9f8c0a0
--- /dev/null
+++ b/generate_transcript_data/ensembl_gene_annotation_grch38.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Skip earlier GTFs as they don't have versions
+#for release in 76 77 78 79 80; do
+#  filename=Homo_sapiens.GRCh38.${release}.gtf.gz
+#  url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename}
+#  if [[ ! -e ${filename} ]]; then
+#    wget ${url}
+#  fi
+
+#  if [[ ! -e ${filename}.json.gz ]]; then
+#    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+#  fi
+#done
+
+#81 is first GFF3 for GRCh38
+pyreference_args=()
+for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do
+  filename=Homo_sapiens.GRCh38.${release}.gff3.gz
+  url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename}
+  pyreference_file=${filename}.json.gz
+
+  if [[ ! -e ${filename} ]]; then
+    wget ${url}
+  fi
+  if [[ ! -e ${filename}.json.gz ]]; then
+    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+  fi
+  pyreference_args+=(--pyreference-json ${pyreference_file})
+done
+
+merged_file="pyhgvs_transcripts_ensembl_grch38.json.gz"
+if [[ ! -e ${merged_file} ]]; then
+  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
+
+  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
+fi
diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py
new file mode 100644
index 0000000..44574d0
--- /dev/null
+++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py
@@ -0,0 +1,131 @@
+"""
+    Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load w/PyHGVS
+
+    @see https://bitbucket.org/sacgf/pyreference/
+
+    Dave Lawrence (davmlaw@gmail.com) on 22/10/2021
+"""
+import gzip
+import json
+from argparse import ArgumentParser
+from typing import Dict
+
+
+def handle_args():
+    parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for PyHGVS')
+    parser.add_argument('--pyreference-json', required=True, nargs="+", action="extend",
+                        help='PyReference JSON.gz - list OLDEST to NEWEST (newest is kept)')
+    parser.add_argument('--output', required=True, help='Output filename')
+    return parser.parse_args()
+
+
+def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict:
+    gene_version_data = {
+        'biotype': ",".join(gene_data["biotype"]),
+        'description': gene_data.get("description"),
+        'gene_symbol': gene_data["name"],
+    }
+
+    if hgnc_str := gene_data.get("HGNC"):
+        # Has HGNC: (5 characters) at start of it
+        gene_version_data["hgnc"] = hgnc_str[5:]
+
+    # Only Ensembl Genes have versions
+    if version := gene_data.get("version"):
+        gene_data["version"] = version
+
+    return gene_version_data
+
+
+def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
+    start = transcript_data["start"]
+    end = transcript_data["stop"]
+    strand = transcript_data["strand"]
+    # PyHGVS has cds_start/cds_end be equal to start/end for non-coding transcripts
+    cds_start = transcript_data.get("cds_start", start)
+    cds_end = transcript_data.get("cds_end", end)
+    # PyHGVS exons are in genomic order, PyReference are in stranded
+    features = transcript_data["features_by_type"]
+    exons = [[ed["start"], ed["stop"]] for ed in features["exon"]]
+    cdna_match = []
+    for cdm in features.get("cDNA_match", []):
+        if "cdna_strand" in cdm:  # None in Human RefSeq so haven't handled
+            raise ValueError("Haven't handled stranded Target alignment")
+        cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]])
+
+    if strand == '-':
+        exons.reverse()
+        cdna_match.reverse()
+
+    pyhgvs_data = {
+        'chrom': transcript_data["chr"],
+        'start': start,
+        'end': end,
+        'strand': strand,
+        'cds_start': cds_start,
+        'cds_end': cds_end,
+        'exons': exons,
+    }
+
+    # Optional stuff
+    if cdna_match:
+        pyhgvs_data["cdna_match"] = cdna_match
+    if transcript_data.get("partial"):
+        pyhgvs_data["partial"] = 1
+
+    # Few extra fields (pyHGVS doesn't currently use)
+    pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"])
+    return pyhgvs_data
+
+
+def main():
+    args = handle_args()
+
+    gene_versions = {}  # We only keep those that are in the latest transcript version
+    transcript_versions = {}
+
+    for pyref_filename in args.pyreference_json:
+        print(f"Loading '{pyref_filename}'")
+        with gzip.open(pyref_filename) as f:
+            pyref_data = json.load(f)
+
+            url = pyref_data["reference_gtf"]["url"]
+
+            # PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes)
+            transcript_gene_version = {}
+
+            for gene_id, gene in pyref_data["genes_by_id"].items():
+                if version := gene.get("version"):
+                    gene_accession = f"{gene_id}.{version}"
+                else:
+                    gene_accession = gene_id
+
+                gene_version = convert_gene_pyreference_to_gene_version_data(gene)
+                gene_version["url"] = url
+                gene_versions[gene_accession] = gene_version
+
+                for transcript_accession in gene["transcripts"]:
+                    transcript_gene_version[transcript_accession] = gene_accession
+
+            for transcript_accession, transcript in pyref_data["transcripts_by_id"].items():
+                hgvs_data = convert_transcript_pyreference_to_pyhgvs(transcript)
+                gene_accession = transcript_gene_version[transcript_accession]
+                gene_version = gene_versions[gene_accession]
+                hgvs_data["id"] = transcript_accession
+                hgvs_data["gene_version"] = gene_accession
+                hgvs_data["gene_name"] = gene_version["gene_symbol"]
+                hgvs_data["url"] = url
+                transcript_versions[transcript_accession] = hgvs_data
+
+    print("Writing PyHGVS data")
+    with gzip.open(args.output, 'w') as outfile:
+        data = {
+            "transcripts": transcript_versions,
+            "genes": gene_versions,
+        }
+        json_str = json.dumps(data)
+        outfile.write(json_str.encode('ascii'))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/generate_transcript_data/refseq_gene_annotation_grch37.sh b/generate_transcript_data/refseq_gene_annotation_grch37.sh
new file mode 100755
index 0000000..c0846fc
--- /dev/null
+++ b/generate_transcript_data/refseq_gene_annotation_grch37.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+pyreference_args=()
+
+# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok
+
+filename=ref_GRCh37.p5_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename}
+pyreference_file=${filename}.json.gz
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+filename=ref_GRCh37.p9_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename}
+pyreference_file=${filename}.json.gz
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+filename=ref_GRCh37.p10_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename}
+pyreference_file=${filename}.json.gz
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+filename=ref_GRCh37.p13_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename}
+pyreference_file=${filename}.json.gz
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+# These all have the same name, so rename them based on release ID
+for release in 105.20190906 105.20201022; do
+  filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz
+  url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
+  pyreference_file=${filename}.json.gz
+  if [[ ! -e ${filename} ]]; then
+    wget ${url} --output-document=${filename}
+  fi
+  if [[ ! -e ${pyreference_file} ]]; then
+    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+  fi
+  pyreference_args+=(--pyreference-json ${pyreference_file})
+done
+
+
+merged_file="pyhgvs_transcripts_refseq_grch37.json.gz"
+if [[ ! -e ${merged_file} ]]; then
+  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
+
+  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
+fi
diff --git a/generate_transcript_data/refseq_gene_annotation_grch38.sh b/generate_transcript_data/refseq_gene_annotation_grch38.sh
new file mode 100755
index 0000000..a4682fe
--- /dev/null
+++ b/generate_transcript_data/refseq_gene_annotation_grch38.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok
+
+filename=ref_GRCh38_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/${filename}
+pyreference_file=${filename}.json.gz
+
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+filename=ref_GRCh38.p2_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/${filename}
+pyreference_file=${filename}.json.gz
+
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+filename=ref_GRCh38.p7_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/${filename}
+pyreference_file=${filename}.json.gz
+
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+filename=ref_GRCh38.p12_top_level.gff3.gz
+url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/${filename}
+pyreference_file=${filename}.json.gz
+
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+filename=GCF_000001405.38_GRCh38.p12_genomic.gff.gz
+url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/${filename}
+pyreference_file=${filename}.json.gz
+
+if [[ ! -e ${filename} ]]; then
+  wget ${url}
+fi
+if [[ ! -e ${pyreference_file} ]]; then
+  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+fi
+pyreference_args+=(--pyreference-json ${pyreference_file})
+
+
+# These all have the same name, so rename them based on release ID
+for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514; do
+  filename=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz
+  url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz
+  pyreference_file=${filename}.json.gz
+  if [[ ! -e ${filename} ]]; then
+    wget ${url} --output-document=${filename}
+  fi
+  if [[ ! -e ${pyreference_file} ]]; then
+    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
+  fi
+  pyreference_args+=(--pyreference-json ${pyreference_file})
+done
+
+
+merged_file="pyhgvs_transcripts_refseq_grch38.json.gz"
+if [[ ! -e ${merged_file} ]]; then
+  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
+
+  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
+fi

From 2e71ba930366cedcf7a728b54d242fa3a479e392 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Wed, 27 Oct 2021 16:17:23 +1030
Subject: [PATCH 23/47] Keep track of what other contigs we have coordinates
 for (eg chrX and chrY)

---
 generate_transcript_data/pyreference_to_pyhgvs_json.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py
index 44574d0..82ebabe 100644
--- a/generate_transcript_data/pyreference_to_pyhgvs_json.py
+++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py
@@ -72,6 +72,9 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
         pyhgvs_data["cdna_match"] = cdna_match
     if transcript_data.get("partial"):
         pyhgvs_data["partial"] = 1
+    other_chroms = transcript_data.get("other_chroms")
+    if other_chroms:
+        pyhgvs_data["other_chroms"] = other_chroms
 
     # Few extra fields (pyHGVS doesn't currently use)
     pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"])

From 1337350e9b1c54077e73b89f064a7cca0e143ef8 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Thu, 28 Oct 2021 15:40:42 +1030
Subject: [PATCH 24/47] For non-coding, PyHGVS has cds_start = end not start
 (so coding length = 0)

---
 generate_transcript_data/pyreference_to_pyhgvs_json.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py
index 82ebabe..d53edf6 100644
--- a/generate_transcript_data/pyreference_to_pyhgvs_json.py
+++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py
@@ -41,8 +41,8 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
     start = transcript_data["start"]
     end = transcript_data["stop"]
     strand = transcript_data["strand"]
-    # PyHGVS has cds_start/cds_end be equal to start/end for non-coding transcripts
-    cds_start = transcript_data.get("cds_start", start)
+    # PyHGVS has cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0)
+    cds_start = transcript_data.get("cds_start", end)
     cds_end = transcript_data.get("cds_end", end)
     # PyHGVS exons are in genomic order, PyReference are in stranded
     features = transcript_data["features_by_type"]

From 6c5e381c6f34ec7288016ab31f68c8d4dd18cd69 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 10 Nov 2021 14:18:18 +1030
Subject: [PATCH 25/47] Use transcript offsets rather than exon lengths so we
 handle gaps properly

---
 pyhgvs/models/transcript.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 89018c9..38a8bbb 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -113,7 +113,7 @@ def find_stop_codon(self, cds_position):
                 cdna_pos += position + cdna_match.get_offset(position)
                 break
             else:
-                cdna_pos += stop - start
+                cdna_pos = cdna_match.cdna_end
         else:
             raise ValueError('Stop codon is not in any of the exons')
 

From 2ca3e08d2437ce9fe709940c2da7160a6ebcc011 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 10 Nov 2021 14:47:24 +1030
Subject: [PATCH 26/47] Simplify and make consistent

---
 pyhgvs/models/transcript.py | 55 +++++++++++++++----------------------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 38a8bbb..8e0d7c0 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -69,7 +69,7 @@ def get_utr5p_size(self):
         # Find the exon containing the start codon.
         start_codon = (self.cds_position.chrom_start if transcript_strand
                        else self.cds_position.chrom_stop - 1)
-        cdna_len = 0
+        cdna_offset = 0
         for cdna_match in self.ordered_cdna_match:
             start = cdna_match.tx_position.chrom_start
             end = cdna_match.tx_position.chrom_stop
@@ -79,19 +79,17 @@ def get_utr5p_size(self):
                     position = start_codon - start
                 else:
                     position = end - start_codon - 1
-                cdna_len += position + cdna_match.get_offset(position)
-                break
-            cdna_len += cdna_match.length
-        else:
-            cdna_matches = []
-            for cdna_match in self.ordered_cdna_match:
-                start = cdna_match.tx_position.chrom_start
-                end = cdna_match.tx_position.chrom_stop
-                cdna_matches.append(f"{start}-{end}")
-            cdna_match_summary = ", ".join(cdna_matches)
-            raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary))
+                return cdna_offset + position + cdna_match.get_offset(position)
+            cdna_offset = cdna_match.cdna_end
 
-        return cdna_len
+        # Couldn't find it
+        cdna_matches = []
+        for cdna_match in self.ordered_cdna_match:
+            start = cdna_match.tx_position.chrom_start
+            end = cdna_match.tx_position.chrom_stop
+            cdna_matches.append(f"{start}-{end}")
+        cdna_match_summary = ", ".join(cdna_matches)
+        raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary))
 
     def find_stop_codon(self, cds_position):
         """Return the position along the cDNA of the base after the stop codon."""
@@ -99,7 +97,7 @@ def find_stop_codon(self, cds_position):
             stop_pos = cds_position.chrom_stop
         else:
             stop_pos = cds_position.chrom_start
-        cdna_pos = 0
+        cdna_offset = 0
         for cdna_match in self.ordered_cdna_match:
             start = cdna_match.tx_position.chrom_start
             stop = cdna_match.tx_position.chrom_stop
@@ -110,14 +108,10 @@ def find_stop_codon(self, cds_position):
                     position = stop_pos - start
                 else:
                     position = stop - stop_pos
-                cdna_pos += position + cdna_match.get_offset(position)
-                break
+                return cdna_offset + position + cdna_match.get_offset(position)
             else:
-                cdna_pos = cdna_match.cdna_end
-        else:
-            raise ValueError('Stop codon is not in any of the exons')
-
-        return cdna_pos
+                cdna_offset = cdna_match.cdna_end
+        raise ValueError('Stop codon is not in any of the exons')
 
     def cdna_to_genomic_coord(self, coord):
         """Convert a HGVS cDNA coordinate to a genomic coordinate."""
@@ -148,11 +142,9 @@ def cdna_to_genomic_coord(self, coord):
                 return self.tx_position.chrom_stop - cdna_pos + 1
 
         # Walk along transcript until we find an exon that contains cdna_pos.
-        cdna_start = 1
         for cdna_match in self.ordered_cdna_match:
-            cdna_end = cdna_start + cdna_match.length - 1
-            if cdna_start <= cdna_pos <= cdna_end:
-                match_pos = cdna_pos - cdna_start
+            if cdna_match.cdna_start <= cdna_pos <= cdna_match.cdna_end:
+                match_pos = cdna_pos - cdna_match.cdna_start
                 match_pos -= cdna_match.get_offset(match_pos)
                 # Compute genomic coordinate using offset.
                 if transcript_strand:
@@ -163,7 +155,6 @@ def cdna_to_genomic_coord(self, coord):
                     # Minus strand.
                     end = cdna_match.tx_position.chrom_stop
                     return end - match_pos - coord.offset
-            cdna_start = cdna_end + 1
         else:
             # 3' flanking sequence (no need to account for gaps)
             if transcript_strand:
@@ -232,10 +223,8 @@ def genomic_to_cdna_coord(self, genomic_coord):
         return cdna_coord
 
     def _exon_genomic_to_cdna_coord(self, genomic_coord):
-        coding_offset = 0
+        cdna_offset = 0
         for cdna_match in self.ordered_cdna_match:
-            cds_start = coding_offset + 1
-
             # Inside the exon.
             if cdna_match.tx_position.chrom_start <= genomic_coord <= cdna_match.tx_position.chrom_stop:
                 if self.strand == "+":
@@ -243,10 +232,10 @@ def _exon_genomic_to_cdna_coord(self, genomic_coord):
                 else:
                     position = cdna_match.tx_position.chrom_stop - genomic_coord
                 offset = cdna_match.get_offset(position, validate=False)
-                return cds_start + position + offset
-            coding_offset += cdna_match.length
-        else:
-            raise ValueError(f"Couldn't find {genomic_coord=}!")
+                return cdna_offset + position + offset + 1
+            cdna_offset = cdna_match.cdna_end
+
+        raise ValueError(f"Couldn't find {genomic_coord=}!")
 
 
 BED6Interval_base = namedtuple(

From b4571a7cff67ff2844668a3a24327aff9c243938 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 10 Nov 2021 17:08:06 +1030
Subject: [PATCH 27/47] Previous change didn't work on negative strand

---
 pyhgvs/models/transcript.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 8e0d7c0..4f7c0b9 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -80,7 +80,7 @@ def get_utr5p_size(self):
                 else:
                     position = end - start_codon - 1
                 return cdna_offset + position + cdna_match.get_offset(position)
-            cdna_offset = cdna_match.cdna_end
+            cdna_offset += cdna_match.length
 
         # Couldn't find it
         cdna_matches = []
@@ -110,7 +110,7 @@ def find_stop_codon(self, cds_position):
                     position = stop - stop_pos
                 return cdna_offset + position + cdna_match.get_offset(position)
             else:
-                cdna_offset = cdna_match.cdna_end
+                cdna_offset += cdna_match.length
         raise ValueError('Stop codon is not in any of the exons')
 
     def cdna_to_genomic_coord(self, coord):
@@ -233,7 +233,7 @@ def _exon_genomic_to_cdna_coord(self, genomic_coord):
                     position = cdna_match.tx_position.chrom_stop - genomic_coord
                 offset = cdna_match.get_offset(position, validate=False)
                 return cdna_offset + position + offset + 1
-            cdna_offset = cdna_match.cdna_end
+            cdna_offset += cdna_match.length
 
         raise ValueError(f"Couldn't find {genomic_coord=}!")
 

From bb17f436bcb522e3c7e98d3c8fc687ff15b8708d Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 10 Nov 2021 18:28:26 +1030
Subject: [PATCH 28/47] Make sure exons are sorted correctly

---
 pyhgvs/utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index 185f49f..e6d6250 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -5,6 +5,7 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
+import operator
 from .models.variants import Position
 from .models.transcript import Transcript, CDNA_Match, Exon
 
@@ -86,7 +87,10 @@ def make_transcript(transcript_json):
             transcript_json['strand'] == '+'))
 
     exons = transcript_json['exons']
+    exons.sort(key=operator.itemgetter(0))
     cdna_match = transcript_json.get('cdna_match', [])
+    cdna_match.sort(key=operator.itemgetter(0))
+
     if not transcript.tx_position.is_forward_strand:
         exons.reverse()
         cdna_match.reverse()

From dbc621f14555cf79be84082fa7e7b635795e67ae Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Mon, 15 Nov 2021 10:58:35 +1030
Subject: [PATCH 29/47] Support LRG transcripts (was setting as gene not
 transcripts if no gene provided)

---
 pyhgvs/models/hgvs_name.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
index cc20682..313bd2e 100644
--- a/pyhgvs/models/hgvs_name.py
+++ b/pyhgvs/models/hgvs_name.py
@@ -364,6 +364,11 @@ def parse_prefix(self, prefix, kind):
             self.transcript = prefix
             return
 
+        # Determine using LRG type.
+        if prefix.startswith('LRG_'):
+            self.transcript = prefix
+            return
+
         # Determine using refseq type.
         refseq_type = get_refseq_type(prefix)
         if refseq_type in ('mRNA', 'RNA'):

From 1409aba014c878d9e58874cdb86a2b04c892e169 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 16 Nov 2021 15:33:18 +1030
Subject: [PATCH 30/47] Use ref for alt on any reference HGVS

---
 pyhgvs/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index 70a9f37..ff31c73 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -52,7 +52,7 @@ def get_vcf_allele(hgvs, genome, transcript=None):
 
     # Sometimes we need to retrieve alt from reference
     # Eg NC_000001.11:g.169549811=
-    if hgvs.mutation_type == "=" and alt == 'N':
+    if hgvs.mutation_type == "=":
         alt = ref
 
     if hgvs.mutation_type in _indel_mutation_types:

From 9ceb6498b8aa99481ea1719589c15118e00919c4 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 16 Nov 2021 18:07:10 +1030
Subject: [PATCH 31/47] Validate coordinate span and reference are the same, as
 eg c.1727_1740dup14 should pass but c.1727_1740dup9999 should fail

---
 pyhgvs/models/hgvs_name.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
index 313bd2e..9581dda 100644
--- a/pyhgvs/models/hgvs_name.py
+++ b/pyhgvs/models/hgvs_name.py
@@ -542,7 +542,7 @@ def parse_genome(self, details):
                 self.ref_allele = groups.get('ref', '')
                 self.alt_allele = groups.get('alt', '')
 
-                # Convert numerical allelles.
+                # Convert numerical alleles.
                 if self.ref_allele.isdigit():
                     self.ref_allele = "N" * int(self.ref_allele)
                 if self.alt_allele.isdigit():
@@ -738,9 +738,6 @@ def get_raw_coords(self, transcript=None):
             end = transcript.cdna_to_genomic_coord(self.cdna_end)
 
             if not transcript.tx_position.is_forward_strand:
-                if end > start:
-                    raise AssertionError(
-                        "cdna_start cannot be greater than cdna_end")
                 start, end = end, start
 
             if start > end:
@@ -755,6 +752,13 @@ def get_raw_coords(self, transcript=None):
                 'Coordinates are not available for this kind of HGVS name "%s"'
                 % self.kind)
 
+        # Check coordinate span is equal to reference bases
+        if self.ref_allele:
+            coordinate_span = end - start + 1  # Ref will always be >=1 base
+            ref_length = len(self.ref_allele)
+            if coordinate_span != ref_length:
+                raise InvalidHGVSName("Coordinate span (%d) not equal to ref length %d" % (coordinate_span, ref_length))
+
         return chrom, start, end
 
     def get_ref_coords(self, transcript=None):

From 27535c355b7e4eb8491f633f0cae45c31ab60393 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Tue, 16 Nov 2021 21:54:03 +1030
Subject: [PATCH 32/47] Support mitochondria

---
 pyhgvs/models/hgvs_name.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
index 9581dda..16b0df6 100644
--- a/pyhgvs/models/hgvs_name.py
+++ b/pyhgvs/models/hgvs_name.py
@@ -416,7 +416,7 @@ def parse_allele(self, allele):
                                           "Non-coding transcript cannot contain '*' (3'UTR) coordinates")
         elif kind == "p":
             self.parse_protein(details)
-        elif kind == "g":
+        elif kind in ("g", 'm'):
             self.parse_genome(details)
         else:
             raise NotImplementedError("unknown kind: %s" % allele)
@@ -578,14 +578,12 @@ def __unicode__(self):
     def format(self, use_prefix=True, use_gene=True, use_counsyl=False):
         """Generate a HGVS name as a string."""
 
-        if self.kind == 'c':
-            allele = 'c.' + self.format_cdna()
-        elif self.kind == 'n':
-            allele = 'n.' + self.format_cdna()
+        if self.kind in ('c', 'n'):
+            allele = self.kind + '.' + self.format_cdna()
         elif self.kind == 'p':
             allele = 'p.' + self.format_protein()
-        elif self.kind == 'g':
-            allele = 'g.' + self.format_genome()
+        elif self.kind in ('g', 'm'):
+            allele = self.kind + '.' + self.format_genome()
         else:
             raise NotImplementedError("not implemented: '%s'" % self.kind)
 
@@ -605,7 +603,7 @@ def format_prefix(self, use_gene=True):
           NM_007294.3(BRCA1):c.2207A>C
         """
 
-        if self.kind == 'g':
+        if self.kind in ('g', 'm'):
             if self.chrom:
                 return self.chrom
 
@@ -743,7 +741,7 @@ def get_raw_coords(self, transcript=None):
             if start > end:
                 raise AssertionError(
                     "cdna_start cannot be greater than cdna_end")
-        elif self.kind == 'g':
+        elif self.kind in ('g', 'm'):
             chrom = self.chrom
             start = self.start
             end = self.end

From a0a8edddfbf36cfa100a650719105b7e4c65fd0d Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 24 Nov 2021 17:26:18 +1030
Subject: [PATCH 33/47] Bump version so I can track it

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c468888..c7dc1c1 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def main():
 
     setup(
         name='pyhgvs',
-        version='0.11.1',
+        version='0.12.0',
         description='HGVS name parsing and formatting',
         long_description=description,
         author='Matt Rasmussen',

From 12924f0cfe7226d238b85f45529e3616bd16dc85 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Thu, 9 Dec 2021 11:26:19 +1030
Subject: [PATCH 34/47] HGVS issue #61 - Reference HGVS without reference base
 leads to wrong coordinates and reference allele as it treats last digit as
 ref digit multiplying N that many times

---
 pyhgvs/models/hgvs_name.py      |  2 ++
 pyhgvs/tests/test_hgvs_names.py | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
index 16b0df6..3d2b649 100644
--- a/pyhgvs/models/hgvs_name.py
+++ b/pyhgvs/models/hgvs_name.py
@@ -128,6 +128,7 @@ class HGVSRegex(object):
     # cDNA allele syntax
     CDNA_ALLELE = [
         # No change
+        CDNA_START + EQUAL,
         CDNA_START + DNA_REF + EQUAL,
 
         # Substitution
@@ -189,6 +190,7 @@ class HGVSRegex(object):
     # Genomic allele syntax
     GENOMIC_ALLELE = [
         # No change
+        COORD_START + EQUAL,
         COORD_START + DNA_REF + EQUAL,
 
         # Substitution
diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py
index 8f06237..06220de 100644
--- a/pyhgvs/tests/test_hgvs_names.py
+++ b/pyhgvs/tests/test_hgvs_names.py
@@ -358,6 +358,18 @@ def test_invalid_coordinates():
          'mutation_type': '=',
      }),
 
+    # cDNA no change, no provided base
+    ('BRCA1:c.101=', True,
+     {
+         'gene': 'BRCA1',
+         'kind': 'c',
+         'cdna_start': CDNACoord(101),
+         'cdna_end': CDNACoord(101),
+         'ref_allele': '',
+         'alt_allele': '',
+         'mutation_type': '=',
+     }),
+
     # cDNA 1bp mutations.
     ('BRCA1:c.101A>C', True,
      {
@@ -542,6 +554,18 @@ def test_invalid_coordinates():
          'mutation_type': '=',
      }),
 
+    # Genomic no change + no ref base
+    ('BRCA1:g.101=', True,
+     {
+         'gene': 'BRCA1',
+         'kind': 'g',
+         'start': 101,
+         'end': 101,
+         'ref_allele': '',
+         'alt_allele': '',
+         'mutation_type': '=',
+     }),
+
     # Genomic 1bp mutations.
     ('BRCA1:g.101A>C', True,
      {

From 70565eb4352555521ff8a36a9742b77410df0a84 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Thu, 9 Dec 2021 14:21:19 +1030
Subject: [PATCH 35/47] Bump version, add changelog

---
 CHANGELOG.md | 10 ++++++++++
 setup.py     |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3355e12..dfe400c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # HGVS library change log
 
+## 0.12.1 (2021-12-09)
+  - Fix issue #61 - Regex for reference HGVS without reference base
+
+## 0.12.0 (2021-11-24)
+  - cDNA gaps
+  - Refactor models
+  - Support non-coding, mitochondria and LRG transcripts
+  - Validate coordinate span equals reference length
+  - Scripts to generate transcripts from RefSeq and Ensembl GTFs on the web
+
 ## 0.9.2 (2014-12-11)
   - Rename package to pyhgvs to avoid naming conflict with other libraries.
 
diff --git a/setup.py b/setup.py
index c7dc1c1..7067c93 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def main():
 
     setup(
         name='pyhgvs',
-        version='0.12.0',
+        version='0.12.1',
         description='HGVS name parsing and formatting',
         long_description=description,
         author='Matt Rasmussen',

From 1ea73ad19a17c7063c6b902a5b58f8792c75e136 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 14 Jan 2022 17:36:14 +1030
Subject: [PATCH 36/47] Use same code for calculating transcript position of
 start/stop codons. Be able to pass in pre-calculated values to save
 calculation if available

---
 pyhgvs/models/transcript.py | 86 ++++++++++++++++++-------------------
 pyhgvs/utils.py             |  5 ++-
 2 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py
index 4f7c0b9..7d93233 100644
--- a/pyhgvs/models/transcript.py
+++ b/pyhgvs/models/transcript.py
@@ -24,15 +24,18 @@ class Transcript(object):
     """
 
     def __init__(self, name, version, gene, tx_position, cds_position,
-                 is_default=False, exons=None, cdna_match=None):
+                 is_default=False, cdna_match=None,
+                 start_codon_transcript_pos=None, stop_codon_transcript_pos=None):
         self.name = name
         self.version = version
         self.gene = Gene(gene)
         self.tx_position = tx_position
         self.cds_position = cds_position
         self.is_default = is_default
-        self.exons = exons or []
         self.cdna_match = cdna_match or []
+        # Optional pre-calculated transcript coordinates
+        self._start_codon_transcript_pos = start_codon_transcript_pos
+        self._stop_codon_transcript_pos = stop_codon_transcript_pos
 
     @property
     def full_name(self):
@@ -61,57 +64,51 @@ def ordered_cdna_match(self):
             cdna_match.reverse()
         return cdna_match
 
-    def get_utr5p_size(self):
-        """Return the size of the 5prime UTR of a transcript."""
+    def get_cds_start_stop(self):
+        (start, stop) = (self.cds_position.chrom_start, self.cds_position.chrom_stop)
+        if not self.tx_position.is_forward_strand:
+            (stop, start) = (start, stop)
+        return start, stop
 
-        transcript_strand = self.tx_position.is_forward_strand
+    @lazy
+    def start_codon(self):
+        """ Transcript position """
 
-        # Find the exon containing the start codon.
-        start_codon = (self.cds_position.chrom_start if transcript_strand
-                       else self.cds_position.chrom_stop - 1)
-        cdna_offset = 0
-        for cdna_match in self.ordered_cdna_match:
-            start = cdna_match.tx_position.chrom_start
-            end = cdna_match.tx_position.chrom_stop
-            if start <= start_codon < end:
-                # We're inside this match
-                if transcript_strand:
-                    position = start_codon - start
-                else:
-                    position = end - start_codon - 1
-                return cdna_offset + position + cdna_match.get_offset(position)
-            cdna_offset += cdna_match.length
+        if self._start_codon_transcript_pos is not None:
+            return self._start_codon_transcript_pos
+
+        start_genomic_coordinate, _ = self.get_cds_start_stop()
+        return self._get_transcript_position(start_genomic_coordinate)
+
+    @lazy
+    def stop_codon(self):
+        """ Transcript position """
+        if self._stop_codon_transcript_pos is not None:
+            return self._stop_codon_transcript_pos
+
+        _, stop_genomic_coordinate = self.get_cds_start_stop()
+        return self._get_transcript_position(stop_genomic_coordinate)
+
+    def _get_transcript_position(self, genomic_coordinate, label=None):
+        transcript_strand = self.tx_position.is_forward_strand
 
-        # Couldn't find it
-        cdna_matches = []
-        for cdna_match in self.ordered_cdna_match:
-            start = cdna_match.tx_position.chrom_start
-            end = cdna_match.tx_position.chrom_stop
-            cdna_matches.append(f"{start}-{end}")
-        cdna_match_summary = ", ".join(cdna_matches)
-        raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary))
-
-    def find_stop_codon(self, cds_position):
-        """Return the position along the cDNA of the base after the stop codon."""
-        if cds_position.is_forward_strand:
-            stop_pos = cds_position.chrom_stop
-        else:
-            stop_pos = cds_position.chrom_start
         cdna_offset = 0
         for cdna_match in self.ordered_cdna_match:
             start = cdna_match.tx_position.chrom_start
             stop = cdna_match.tx_position.chrom_stop
 
-            if start <= stop_pos <= stop:
+            if start <= genomic_coordinate <= stop:
                 # We're inside this match
-                if cds_position.is_forward_strand:
-                    position = stop_pos - start
+                if transcript_strand:
+                    position = genomic_coordinate - start
                 else:
-                    position = stop - stop_pos
+                    position = stop - genomic_coordinate
                 return cdna_offset + position + cdna_match.get_offset(position)
             else:
                 cdna_offset += cdna_match.length
-        raise ValueError('Stop codon is not in any of the exons')
+        if label is None:
+            label = "Genomic coordinate: %d" % genomic_coordinate
+        raise ValueError('%s is not in any of the exons' % label)
 
     def cdna_to_genomic_coord(self, coord):
         """Convert a HGVS cDNA coordinate to a genomic coordinate."""
@@ -119,8 +116,7 @@ def cdna_to_genomic_coord(self, coord):
 
         # compute starting position along spliced transcript.
         if coord.landmark == CDNA_START_CODON:
-            utr5p = (self.get_utr5p_size()
-                     if self.is_coding else 0)
+            utr5p = (self.start_codon if self.is_coding else 0)
 
             if coord.coord > 0:
                 cdna_pos = utr5p + coord.coord
@@ -130,7 +126,7 @@ def cdna_to_genomic_coord(self, coord):
             if coord.coord < 0:
                 raise ValueError('CDNACoord cannot have a negative coord and '
                                  'landmark CDNA_STOP_CODON')
-            cdna_pos = self.find_stop_codon(self.cds_position) + coord.coord
+            cdna_pos = self.stop_codon + coord.coord
         else:
             raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark)
 
@@ -207,13 +203,13 @@ def genomic_to_cdna_coord(self, genomic_coord):
         # Adjust coordinates for coding transcript.
         if self.is_coding:
             # Detect if position before start codon.
-            utr5p = self.get_utr5p_size()
+            utr5p = self.start_codon
             cdna_coord.coord -= utr5p
             if cdna_coord.coord <= 0:
                 cdna_coord.coord -= 1
             else:
                 # Detect if position is after stop_codon.
-                stop_codon = self.find_stop_codon(self.cds_position)
+                stop_codon = self.stop_codon
                 stop_codon -= utr5p
                 if (cdna_coord.coord > stop_codon or
                         cdna_coord.coord == stop_codon and cdna_coord.offset > 0):
diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py
index e6d6250..3fcc320 100644
--- a/pyhgvs/utils.py
+++ b/pyhgvs/utils.py
@@ -84,7 +84,10 @@ def make_transcript(transcript_json):
             transcript_json['chrom'],
             transcript_json['cds_start'],
             transcript_json['cds_end'],
-            transcript_json['strand'] == '+'))
+            transcript_json['strand'] == '+'),
+        start_codon_transcript_pos=transcript_json.get("start_codon_transcript_pos"),
+        stop_codon_transcript_pos=transcript_json.get("stop_codon_transcript_pos"),
+    )
 
     exons = transcript_json['exons']
     exons.sort(key=operator.itemgetter(0))

From f42bd06b6ff30bb6a38686b1592484c135a2fa69 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 14 Jan 2022 17:37:29 +1030
Subject: [PATCH 37/47] Changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dfe400c..c04fe6d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # HGVS library change log
 
+## Unreleased
+  - Use same code for calculating transcript position of start/stop codons.
+  - Be able to pass in pre-calculated start/stop from transcript_json if available
+
 ## 0.12.1 (2021-12-09)
   - Fix issue #61 - Regex for reference HGVS without reference base
 

From d59d079af578c8ba7b4297b7fea8c504d5b409fe Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 14 Jan 2022 22:22:51 +1030
Subject: [PATCH 38/47] No need to sort before storing as we don't rely on
 order and re-sort when loading in utils

---
 generate_transcript_data/pyreference_to_pyhgvs_json.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py
index d53edf6..3ac4d53 100644
--- a/generate_transcript_data/pyreference_to_pyhgvs_json.py
+++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py
@@ -53,10 +53,6 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
             raise ValueError("Haven't handled stranded Target alignment")
         cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]])
 
-    if strand == '-':
-        exons.reverse()
-        cdna_match.reverse()
-
     pyhgvs_data = {
         'chrom': transcript_data["chr"],
         'start': start,

From eef2514c916d3f3a8909f48fca5f3021fa27cfef Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Fri, 14 Jan 2022 22:42:31 +1030
Subject: [PATCH 39/47] Including coding start/end transcript coords if
 available

---
 generate_transcript_data/pyreference_to_pyhgvs_json.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py
index 3ac4d53..a77de4a 100644
--- a/generate_transcript_data/pyreference_to_pyhgvs_json.py
+++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py
@@ -64,6 +64,10 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
     }
 
     # Optional stuff
+    for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]
+        if value := transcript_data.get(optional_key):
+            pyhgvs_data[optional_key] = value
+
     if cdna_match:
         pyhgvs_data["cdna_match"] = cdna_match
     if transcript_data.get("partial"):

From 936d2764c9ae94961c8dc1df6f018cb403f08489 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Mon, 17 Jan 2022 10:53:14 +1030
Subject: [PATCH 40/47] Fix syntax error

---
 generate_transcript_data/pyreference_to_pyhgvs_json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py
index a77de4a..322ef98 100644
--- a/generate_transcript_data/pyreference_to_pyhgvs_json.py
+++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py
@@ -64,7 +64,7 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
     }
 
     # Optional stuff
-    for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]
+    for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]:
         if value := transcript_data.get(optional_key):
             pyhgvs_data[optional_key] = value
 

From 992c4afe3edd29d25ce37e71507a2849eb15a675 Mon Sep 17 00:00:00 2001
From: David Lawrence <davmlaw@gmail.com>
Date: Mon, 1 Aug 2022 15:22:55 +0930
Subject: [PATCH 41/47] Fix issue where dups always matched ref allele

---
 pyhgvs/__init__.py              |  6 +++---
 pyhgvs/models/hgvs_name.py      |  8 +++++---
 pyhgvs/tests/test_hgvs_names.py | 27 ++++++++++++++++++++++++++-
 setup.py                        |  2 +-
 4 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index ff31c73..0fc378d 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -77,9 +77,9 @@ def get_alt_from_sequence(hgvs, genome, transcript):
 
 def matches_ref_allele(hgvs, genome, transcript=None):
     """Return True if reference allele matches genomic sequence."""
-    ref, _ = hgvs.get_ref_alt(
-        transcript.tx_position.is_forward_strand if transcript else True)
-    chrom, start, end = hgvs.get_ref_coords(transcript)
+    is_forward_strand = transcript.tx_position.is_forward_strand if transcript else True
+    ref, _ = hgvs.get_ref_alt(is_forward_strand, raw_dup_alleles=True)  # get raw values so dup isn't always True
+    chrom, start, end = hgvs.get_raw_coords(transcript)
     genome_ref = get_genomic_sequence(genome, chrom, start, end)
     return genome_ref == ref
 
diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
index 3d2b649..85f46bf 100644
--- a/pyhgvs/models/hgvs_name.py
+++ b/pyhgvs/models/hgvs_name.py
@@ -793,15 +793,17 @@ def get_vcf_coords(self, transcript=None):
                                       self.mutation_type)
         return chrom, start, end
 
-    def get_ref_alt(self, is_forward_strand=True):
-        """Return reference and alternate alleles."""
+    def get_ref_alt(self, is_forward_strand=True, raw_dup_alleles=False):
+        """ Return reference and alternate alleles.
+            Original code was for representation - ie it altered dup to look like an insert
+            pass raw_dup_alleles=True to get the raw values """
         if self.kind == 'p':
             raise NotImplementedError(
                 'get_ref_alt is not implemented for protein HGVS names')
         alleles = [self.ref_allele, self.alt_allele]
 
         # Represent duplications are inserts.
-        if self.mutation_type == "dup":
+        if not raw_dup_alleles and self.mutation_type == "dup":
             alleles[0] = ""
             alleles[1] = alleles[1][:len(alleles[1]) // 2]
 
diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py
index 06220de..5754731 100644
--- a/pyhgvs/tests/test_hgvs_names.py
+++ b/pyhgvs/tests/test_hgvs_names.py
@@ -11,7 +11,7 @@
 
 from ..models.cdna import CDNACoord, CDNA_STOP_CODON
 from ..models.hgvs_name import HGVSName, InvalidHGVSName
-from .. import format_hgvs_name
+from .. import format_hgvs_name, matches_ref_allele
 from .. import parse_hgvs_name
 from ..utils import read_transcripts
 from .genome import MockGenomeTestFile
@@ -197,6 +197,31 @@ def test_invalid_coordinates():
     parse_hgvs_name(hgvs_name, genome, get_transcript=get_transcript)
 
 
+def test_matches_ref():
+    genome = MockGenomeTestFile(
+        db_filename='hg19.fa',
+        filename='pyhgvs/tests/data/test_hgvs.genome',
+        create_data=False)
+    # genome = SequenceFileDB('pyhgvs/tests/data/test_hgvs.genome')
+    transcript = get_transcript("NM_000016.4")
+
+    # The base at this position is "T" - used to always work w/dups
+    expected = [
+        ("NM_000016.4:c.1189dupT", True),
+        ("NM_000016.4:c.1189T>TT", True),  # same as dup by both alleles provided
+        ("NM_000016.4:c.1189dupA", False),
+        ("NM_000016.4:c.1189A>AA", False),
+        ("NM_000016.4:c.1189delT", True),
+        ("NM_000016.4:c.1189delC", False),
+        ("NM_000016.4:c.1189delCinsT", False),
+        ("NM_000016.4:c.1189delTinsC", True),
+    ]
+    for hgvs_string, expected_result in expected:
+        hgvs_name = HGVSName(hgvs_string)
+        matches_ref = matches_ref_allele(hgvs_name, genome, transcript)
+        nose.tools.assert_equal(matches_ref, expected_result, hgvs_string + " matches reference base")
+
+
 # Test examples of cDNA coordinates.
 _parse_cdna_coords = [
     ('1001', CDNACoord(1001)),
diff --git a/setup.py b/setup.py
index 7067c93..f08964a 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def main():
 
     setup(
         name='pyhgvs',
-        version='0.12.1',
+        version='0.12.2',
         description='HGVS name parsing and formatting',
         long_description=description,
         author='Matt Rasmussen',

From de3a1d336da05e91b111cd5d2218be5c04af0011 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Thu, 18 Aug 2022 14:39:51 +0930
Subject: [PATCH 42/47] Generating transcripts is now handled via
 https://github.com/SACGF/cdot

---
 .../ensembl_gene_annotation_grch37.sh         |  28 ----
 .../ensembl_gene_annotation_grch38.sh         |  37 -----
 .../pyreference_to_pyhgvs_json.py             | 134 ------------------
 .../refseq_gene_annotation_grch37.sh          |  75 ----------
 .../refseq_gene_annotation_grch38.sh          |  90 ------------
 5 files changed, 364 deletions(-)
 delete mode 100755 generate_transcript_data/ensembl_gene_annotation_grch37.sh
 delete mode 100755 generate_transcript_data/ensembl_gene_annotation_grch38.sh
 delete mode 100644 generate_transcript_data/pyreference_to_pyhgvs_json.py
 delete mode 100755 generate_transcript_data/refseq_gene_annotation_grch37.sh
 delete mode 100755 generate_transcript_data/refseq_gene_annotation_grch38.sh

diff --git a/generate_transcript_data/ensembl_gene_annotation_grch37.sh b/generate_transcript_data/ensembl_gene_annotation_grch37.sh
deleted file mode 100755
index 97ee668..0000000
--- a/generate_transcript_data/ensembl_gene_annotation_grch37.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-# v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them
-
-#82 is first GFF3 for GRCh37
-#83 has no data
-#84 is 82 again
-#86 is 85 again
-pyreference_args=()
-for release in 82 85 87; do
-  filename=Homo_sapiens.GRCh37.${release}.gff3.gz
-  url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename}
-  pyreference_file=${filename}.json.gz
-  if [[ ! -e ${filename} ]]; then
-    wget ${url}
-  fi
-  if [[ ! -e ${pyreference_file} ]]; then
-    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-  fi
-  pyreference_args+=(--pyreference-json ${pyreference_file})
-done
-
-merged_file="pyhgvs_transcripts_ensembl_grch37.json.gz"
-if [[ ! -e ${merged_file} ]]; then
-  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
-
-  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
-fi
diff --git a/generate_transcript_data/ensembl_gene_annotation_grch38.sh b/generate_transcript_data/ensembl_gene_annotation_grch38.sh
deleted file mode 100755
index 9f8c0a0..0000000
--- a/generate_transcript_data/ensembl_gene_annotation_grch38.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-# Skip earlier GTFs as they don't have versions
-#for release in 76 77 78 79 80; do
-#  filename=Homo_sapiens.GRCh38.${release}.gtf.gz
-#  url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename}
-#  if [[ ! -e ${filename} ]]; then
-#    wget ${url}
-#  fi
-
-#  if [[ ! -e ${filename}.json.gz ]]; then
-#    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-#  fi
-#done
-
-#81 is first GFF3 for GRCh38
-pyreference_args=()
-for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do
-  filename=Homo_sapiens.GRCh38.${release}.gff3.gz
-  url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename}
-  pyreference_file=${filename}.json.gz
-
-  if [[ ! -e ${filename} ]]; then
-    wget ${url}
-  fi
-  if [[ ! -e ${filename}.json.gz ]]; then
-    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-  fi
-  pyreference_args+=(--pyreference-json ${pyreference_file})
-done
-
-merged_file="pyhgvs_transcripts_ensembl_grch38.json.gz"
-if [[ ! -e ${merged_file} ]]; then
-  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
-
-  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
-fi
diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py
deleted file mode 100644
index 322ef98..0000000
--- a/generate_transcript_data/pyreference_to_pyhgvs_json.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""
-    Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load w/PyHGVS
-
-    @see https://bitbucket.org/sacgf/pyreference/
-
-    Dave Lawrence (davmlaw@gmail.com) on 22/10/2021
-"""
-import gzip
-import json
-from argparse import ArgumentParser
-from typing import Dict
-
-
-def handle_args():
-    parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for PyHGVS')
-    parser.add_argument('--pyreference-json', required=True, nargs="+", action="extend",
-                        help='PyReference JSON.gz - list OLDEST to NEWEST (newest is kept)')
-    parser.add_argument('--output', required=True, help='Output filename')
-    return parser.parse_args()
-
-
-def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict:
-    gene_version_data = {
-        'biotype': ",".join(gene_data["biotype"]),
-        'description': gene_data.get("description"),
-        'gene_symbol': gene_data["name"],
-    }
-
-    if hgnc_str := gene_data.get("HGNC"):
-        # Has HGNC: (5 characters) at start of it
-        gene_version_data["hgnc"] = hgnc_str[5:]
-
-    # Only Ensembl Genes have versions
-    if version := gene_data.get("version"):
-        gene_data["version"] = version
-
-    return gene_version_data
-
-
-def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict:
-    start = transcript_data["start"]
-    end = transcript_data["stop"]
-    strand = transcript_data["strand"]
-    # PyHGVS has cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0)
-    cds_start = transcript_data.get("cds_start", end)
-    cds_end = transcript_data.get("cds_end", end)
-    # PyHGVS exons are in genomic order, PyReference are in stranded
-    features = transcript_data["features_by_type"]
-    exons = [[ed["start"], ed["stop"]] for ed in features["exon"]]
-    cdna_match = []
-    for cdm in features.get("cDNA_match", []):
-        if "cdna_strand" in cdm:  # None in Human RefSeq so haven't handled
-            raise ValueError("Haven't handled stranded Target alignment")
-        cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]])
-
-    pyhgvs_data = {
-        'chrom': transcript_data["chr"],
-        'start': start,
-        'end': end,
-        'strand': strand,
-        'cds_start': cds_start,
-        'cds_end': cds_end,
-        'exons': exons,
-    }
-
-    # Optional stuff
-    for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]:
-        if value := transcript_data.get(optional_key):
-            pyhgvs_data[optional_key] = value
-
-    if cdna_match:
-        pyhgvs_data["cdna_match"] = cdna_match
-    if transcript_data.get("partial"):
-        pyhgvs_data["partial"] = 1
-    other_chroms = transcript_data.get("other_chroms")
-    if other_chroms:
-        pyhgvs_data["other_chroms"] = other_chroms
-
-    # Few extra fields (pyHGVS doesn't currently use)
-    pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"])
-    return pyhgvs_data
-
-
-def main():
-    args = handle_args()
-
-    gene_versions = {}  # We only keep those that are in the latest transcript version
-    transcript_versions = {}
-
-    for pyref_filename in args.pyreference_json:
-        print(f"Loading '{pyref_filename}'")
-        with gzip.open(pyref_filename) as f:
-            pyref_data = json.load(f)
-
-            url = pyref_data["reference_gtf"]["url"]
-
-            # PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes)
-            transcript_gene_version = {}
-
-            for gene_id, gene in pyref_data["genes_by_id"].items():
-                if version := gene.get("version"):
-                    gene_accession = f"{gene_id}.{version}"
-                else:
-                    gene_accession = gene_id
-
-                gene_version = convert_gene_pyreference_to_gene_version_data(gene)
-                gene_version["url"] = url
-                gene_versions[gene_accession] = gene_version
-
-                for transcript_accession in gene["transcripts"]:
-                    transcript_gene_version[transcript_accession] = gene_accession
-
-            for transcript_accession, transcript in pyref_data["transcripts_by_id"].items():
-                hgvs_data = convert_transcript_pyreference_to_pyhgvs(transcript)
-                gene_accession = transcript_gene_version[transcript_accession]
-                gene_version = gene_versions[gene_accession]
-                hgvs_data["id"] = transcript_accession
-                hgvs_data["gene_version"] = gene_accession
-                hgvs_data["gene_name"] = gene_version["gene_symbol"]
-                hgvs_data["url"] = url
-                transcript_versions[transcript_accession] = hgvs_data
-
-    print("Writing PyHGVS data")
-    with gzip.open(args.output, 'w') as outfile:
-        data = {
-            "transcripts": transcript_versions,
-            "genes": gene_versions,
-        }
-        json_str = json.dumps(data)
-        outfile.write(json_str.encode('ascii'))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/generate_transcript_data/refseq_gene_annotation_grch37.sh b/generate_transcript_data/refseq_gene_annotation_grch37.sh
deleted file mode 100755
index c0846fc..0000000
--- a/generate_transcript_data/refseq_gene_annotation_grch37.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/bin/bash
-
-pyreference_args=()
-
-# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok
-
-filename=ref_GRCh37.p5_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename}
-pyreference_file=${filename}.json.gz
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-filename=ref_GRCh37.p9_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename}
-pyreference_file=${filename}.json.gz
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-filename=ref_GRCh37.p10_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename}
-pyreference_file=${filename}.json.gz
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-filename=ref_GRCh37.p13_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename}
-pyreference_file=${filename}.json.gz
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-# These all have the same name, so rename them based on release ID
-for release in 105.20190906 105.20201022; do
-  filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz
-  url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
-  pyreference_file=${filename}.json.gz
-  if [[ ! -e ${filename} ]]; then
-    wget ${url} --output-document=${filename}
-  fi
-  if [[ ! -e ${pyreference_file} ]]; then
-    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-  fi
-  pyreference_args+=(--pyreference-json ${pyreference_file})
-done
-
-
-merged_file="pyhgvs_transcripts_refseq_grch37.json.gz"
-if [[ ! -e ${merged_file} ]]; then
-  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
-
-  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
-fi
diff --git a/generate_transcript_data/refseq_gene_annotation_grch38.sh b/generate_transcript_data/refseq_gene_annotation_grch38.sh
deleted file mode 100755
index a4682fe..0000000
--- a/generate_transcript_data/refseq_gene_annotation_grch38.sh
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/bin/bash
-
-# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok
-
-filename=ref_GRCh38_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/${filename}
-pyreference_file=${filename}.json.gz
-
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-filename=ref_GRCh38.p2_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/${filename}
-pyreference_file=${filename}.json.gz
-
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-filename=ref_GRCh38.p7_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/${filename}
-pyreference_file=${filename}.json.gz
-
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-filename=ref_GRCh38.p12_top_level.gff3.gz
-url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/${filename}
-pyreference_file=${filename}.json.gz
-
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-filename=GCF_000001405.38_GRCh38.p12_genomic.gff.gz
-url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/${filename}
-pyreference_file=${filename}.json.gz
-
-if [[ ! -e ${filename} ]]; then
-  wget ${url}
-fi
-if [[ ! -e ${pyreference_file} ]]; then
-  pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-fi
-pyreference_args+=(--pyreference-json ${pyreference_file})
-
-
-# These all have the same name, so rename them based on release ID
-for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514; do
-  filename=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz
-  url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz
-  pyreference_file=${filename}.json.gz
-  if [[ ! -e ${filename} ]]; then
-    wget ${url} --output-document=${filename}
-  fi
-  if [[ ! -e ${pyreference_file} ]]; then
-    pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}"
-  fi
-  pyreference_args+=(--pyreference-json ${pyreference_file})
-done
-
-
-merged_file="pyhgvs_transcripts_refseq_grch38.json.gz"
-if [[ ! -e ${merged_file} ]]; then
-  BASE_DIR=$(dirname ${BASH_SOURCE[0]})
-
-  python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file}
-fi

From 715c4f2e29de11272b9a39523460efcb6ea31a52 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 12 Apr 2023 12:19:19 +0930
Subject: [PATCH 43/47] Support for coord range + equals ie
 "NC_000010.10:g.89623915_89623920="

---
 pyhgvs/models/hgvs_name.py      | 1 +
 pyhgvs/tests/test_hgvs_names.py | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py
index 85f46bf..6d6cc46 100644
--- a/pyhgvs/models/hgvs_name.py
+++ b/pyhgvs/models/hgvs_name.py
@@ -204,6 +204,7 @@ class HGVSRegex(object):
         COORD_START + DUP,
 
         # Insertion, deletion, duplication
+        COORD_RANGE + EQUAL,
         COORD_RANGE + INS + DNA_ALT,
         COORD_RANGE + DEL + DNA_REF,
         COORD_RANGE + DUP + DNA_REF,
diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py
index 5754731..579e2a6 100644
--- a/pyhgvs/tests/test_hgvs_names.py
+++ b/pyhgvs/tests/test_hgvs_names.py
@@ -706,6 +706,11 @@ def test_matches_ref():
          'alt_allele': 'TA',
          'mutation_type': 'delins',
      }),
+    #
+    ('NC_000010.10:g.89623915_89623920=', False,
+     {
+         'chrom': 'NC_000010.10',
+     }),
 ]
 
 

From 116217d18876b0882555e621e4fa9122b316f016 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 12 Apr 2023 12:30:37 +0930
Subject: [PATCH 44/47] Bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f08964a..a48f39a 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def main():
 
     setup(
         name='pyhgvs',
-        version='0.12.2',
+        version='0.12.3',
         description='HGVS name parsing and formatting',
         long_description=description,
         author='Matt Rasmussen',

From 7591c13dd89dc9d5e759b3dcf1ed099840c74cd8 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Thu, 25 May 2023 17:00:01 +0930
Subject: [PATCH 45/47] #57 - c. dup converted back to delins

---
 pyhgvs/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index 0fc378d..9dabfeb 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -155,8 +155,9 @@ def hgvs_justify_indel(chrom, offset, ref, alt, strand, genome):
         return chrom, offset, ref, alt
 
     # Get genomic sequence around the lesion.
-    start = max(offset - 100, 0)
-    end = offset + 100
+    size = max(len(ref), len(alt)) + 1
+    start = max(offset - size, 0)
+    end = offset + size
     seq = str(genome[str(chrom)][start - 1:end]).upper()
     cds_offset = offset - start
 

From 8b30efd10ee44830d22501ff9dd1b6ce61c8c68e Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Thu, 1 Jun 2023 14:05:07 +0930
Subject: [PATCH 46/47] bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a48f39a..4f36017 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def main():
 
     setup(
         name='pyhgvs',
-        version='0.12.3',
+        version='0.12.4',
         description='HGVS name parsing and formatting',
         long_description=description,
         author='Matt Rasmussen',

From ef005725a4c9f7dfbb3bb3411a9ff170e177c918 Mon Sep 17 00:00:00 2001
From: Dave Lawrence <davmlaw@gmail.com>
Date: Wed, 14 Jun 2023 13:42:40 +0930
Subject: [PATCH 47/47] Use min of 100 again for normalization window space

---
 pyhgvs/__init__.py | 3 ++-
 setup.py           | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py
index 9dabfeb..0521dab 100644
--- a/pyhgvs/__init__.py
+++ b/pyhgvs/__init__.py
@@ -155,7 +155,8 @@ def hgvs_justify_indel(chrom, offset, ref, alt, strand, genome):
         return chrom, offset, ref, alt
 
     # Get genomic sequence around the lesion.
-    size = max(len(ref), len(alt)) + 1
+    window_size = 100
+    size = window_size + max(len(ref), len(alt))
     start = max(offset - size, 0)
     end = offset + size
     seq = str(genome[str(chrom)][start - 1:end]).upper()
diff --git a/setup.py b/setup.py
index 4f36017..34bcc44 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def main():
 
     setup(
         name='pyhgvs',
-        version='0.12.4',
+        version='0.12.5',
         description='HGVS name parsing and formatting',
         long_description=description,
         author='Matt Rasmussen',