From e65db8330b783bc28e108c8df7c5f6cf66be008b Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Mon, 16 Mar 2020 16:45:01 +1030 Subject: [PATCH 01/47] Patch on top of latest counsyl/hgvs - indel resolution. Load genePred files --- pyhgvs/__init__.py | 100 +++++++++++------- pyhgvs/models.py | 2 +- pyhgvs/tests/data/test_name_to_variant.genome | 11 ++ pyhgvs/tests/data/test_variant_to_name.genome | 6 ++ pyhgvs/tests/genome.py | 6 +- pyhgvs/utils.py | 63 +++++------ pyhgvs/variants.py | 20 ++-- 7 files changed, 125 insertions(+), 83 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index b4df00c..9b28e8f 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -655,7 +655,7 @@ def genomic_to_cdna_coord(transcript, genomic_coord): def get_allele(hgvs, genome, transcript=None): """Get an allele from a HGVSName, a genome, and a transcript.""" - chrom, start, end = hgvs.get_coords(transcript) + chrom, start, end = hgvs.get_ref_coords(transcript) _, alt = hgvs.get_ref_alt( transcript.tx_position.is_forward_strand if transcript else True) ref = get_genomic_sequence(genome, chrom, start, end) @@ -673,16 +673,31 @@ def get_vcf_allele(hgvs, genome, transcript=None): ref = get_genomic_sequence(genome, chrom, start, end) if hgvs.mutation_type in _indel_mutation_types: + if hgvs.mutation_type == 'dup': + # Sometimes we need to retrieve alt from reference, eg: + # No alt supplied: NM_000492.3:c.1155_1156dup + # Number used: NM_004119.2(FLT3):c.1794_1811dup18 + # We *know* what the sequence is for "dup18", but not for "ins18" + if not hgvs.alt_allele or re.match("^N+$", hgvs.alt_allele): + alt = get_alt_from_sequence(hgvs, genome, transcript) + # Left-pad alternate allele. alt = ref[0] + alt return chrom, start, end, ref, alt +def get_alt_from_sequence(hgvs, genome, transcript): + """ returns ready for VCF """ + + chrom, start, end = hgvs.get_raw_coords(transcript) + return get_genomic_sequence(genome, chrom, start, end) + + def matches_ref_allele(hgvs, genome, transcript=None): """Return True if reference allele matches genomic sequence.""" - ref, alt = hgvs.get_ref_alt( + ref, _ = hgvs.get_ref_alt( transcript.tx_position.is_forward_strand if transcript else True) - chrom, start, end = hgvs.get_coords(transcript) + chrom, start, end = hgvs.get_ref_coords(transcript) genome_ref = get_genomic_sequence(genome, chrom, start, end) return genome_ref == ref @@ -1146,8 +1161,8 @@ def format_genome(self): """ return self.format_coords() + self.format_dna_allele() - def get_coords(self, transcript=None): - """Return genomic coordinates of reference allele.""" + def get_raw_coords(self, transcript=None): + """ return genomic coordinates """ if self.kind == 'c': chrom = transcript.tx_position.chrom start = cdna_to_genomic_coord(transcript, self.cdna_start) @@ -1158,27 +1173,14 @@ def get_coords(self, transcript=None): raise AssertionError( "cdna_start cannot be greater than cdna_end") start, end = end, start - else: - if start > end: - raise AssertionError( - "cdna_start cannot be greater than cdna_end") - - if self.mutation_type == "ins": - # Inserts have empty interval. - if start < end: - start += 1 - end -= 1 - else: - end = start - 1 - - elif self.mutation_type == "dup": - end = start - 1 + if start > end: + raise AssertionError( + "cdna_start cannot be greater than cdna_end") elif self.kind == 'g': chrom = self.chrom start = self.start end = self.end - else: raise NotImplementedError( 'Coordinates are not available for this kind of HGVS name "%s"' @@ -1186,9 +1188,26 @@ def get_coords(self, transcript=None): return chrom, start, end + def get_ref_coords(self, transcript=None): + """Return genomic coordinates of reference allele.""" + + chrom, start, end = self.get_raw_coords(transcript) + + if self.mutation_type == "ins": + # Inserts have empty interval. + if start < end: + start += 1 + end -= 1 + else: + end = start - 1 + + elif self.mutation_type == "dup": + end = start - 1 + return chrom, start, end + def get_vcf_coords(self, transcript=None): """Return genomic coordinates of reference allele in VCF-style.""" - chrom, start, end = self.get_coords(transcript) + chrom, start, end = self.get_ref_coords(transcript) # Inserts and deletes require left-padding by 1 base if self.mutation_type in ("=", ">"): @@ -1344,7 +1363,8 @@ def hgvs_normalize_variant(chrom, offset, ref, alt, genome, transcript=None): def parse_hgvs_name(hgvs_name, genome, transcript=None, get_transcript=lambda name: None, - flank_length=30, normalize=True, lazy=False): + flank_length=30, normalize=True, lazy=False, + indels_start_with_same_base=True): """ Parse an HGVS name into (chrom, start, end, ref, alt) @@ -1353,15 +1373,16 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None, transcript: Transcript corresponding to HGVS name. normalize: If True, normalize allele according to VCF standard. lazy: If True, discard version information from incoming transcript/gene. + indels_start_with_same_base: When normalizing, don't strip common prefix from indels """ hgvs = HGVSName(hgvs_name) # Determine transcript. if hgvs.kind == 'c' and not transcript: if '.' in hgvs.transcript and lazy: - hgvs.transcript, version = hgvs.transcript.split('.') + hgvs.transcript, _ = hgvs.transcript.split('.') elif '.' in hgvs.gene and lazy: - hgvs.gene, version = hgvs.gene.split('.') + hgvs.gene, _ = hgvs.gene.split('.') if get_transcript: if hgvs.transcript: transcript = get_transcript(hgvs.transcript) @@ -1377,11 +1398,12 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None, transcript.tx_position.chrom_stop, hgvs.transcript) - chrom, start, end, ref, alt = get_vcf_allele(hgvs, genome, transcript) + chrom, start, _, ref, alt = get_vcf_allele(hgvs, genome, transcript) if normalize: - chrom, start, ref, [alt] = normalize_variant( - chrom, start, ref, [alt], genome, - flank_length=flank_length).variant + nv = normalize_variant(chrom, start, ref, [alt], genome, + flank_length=flank_length, + indels_start_with_same_base=indels_start_with_same_base) + chrom, start, ref, [alt] = nv.variant return (chrom, start, ref, alt) @@ -1408,11 +1430,19 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript, hgvs = HGVSName() # Populate coordinates. + if mutation_type == 'ins': + # Insert uses coordinates around the insert point. + offset_start = offset - 1 + offset_end = offset + else: + offset_start = offset + offset_end = offset + len(ref) - 1 + if not transcript: # Use genomic coordinate when no transcript is available. hgvs.kind = 'g' - hgvs.start = offset - hgvs.end = offset + len(ref) - 1 + hgvs.start = offset_start + hgvs.end = offset_end else: # Use cDNA coordinates. hgvs.kind = 'c' @@ -1425,14 +1455,6 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript, hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset) hgvs.cdna_end = hgvs.cdna_start else: - # Use a range of coordinates. - if mutation_type == 'ins': - # Insert uses coordinates around the insert point. - offset_start = offset - 1 - offset_end = offset - else: - offset_start = offset - offset_end = offset + len(ref) - 1 if transcript.strand == '-': offset_start, offset_end = offset_end, offset_start hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset_start) diff --git a/pyhgvs/models.py b/pyhgvs/models.py index b26b8c4..656e67b 100644 --- a/pyhgvs/models.py +++ b/pyhgvs/models.py @@ -27,7 +27,7 @@ def __init__(self, name): class Transcript(object): - """RefGene Transcripts for hg19 + """ A gene may have multiple transcripts with different combinations of exons. """ diff --git a/pyhgvs/tests/data/test_name_to_variant.genome b/pyhgvs/tests/data/test_name_to_variant.genome index 90eb59e..a36abfb 100644 --- a/pyhgvs/tests/data/test_name_to_variant.genome +++ b/pyhgvs/tests/data/test_name_to_variant.genome @@ -61,6 +61,7 @@ chr1 76205650 76205691 TTTATATATTCAAGGCTTATTGTGTAACAGAACCTGGAGCA chr1 76205639 76205669 TTTCTCTTGTTTTTATATATTCAAGGCTTA chr1 76205670 76205700 TGTGTAACAGAACCTGGAGCAGGCTCTGAT chr1 76227048 76227049 C +chr1 76227049 76227050 T chr1 76227029 76227070 TAATGAGGGATGCCAAAATCTATCAGGTAAGGTTAAAGATG chr1 76227019 76227049 GTAGAAAAACTAATGAGGGATGCCAAAATC chr1 76227049 76227079 TATCAGGTAAGGTTAAAGATGATTTTTTTG @@ -84,7 +85,13 @@ chr7 117176661 117176664 TAT chr7 117176642 117176684 CCTCAGAAATGATTGAAAATATCCAATCTGTTAAGGCATACT chr7 117176630 117176660 GACTTGTGATTACCTCAGAAATGATTGAAA chr7 117176662 117176692 ATCCAATCTGTTAAGGCATACTGCTGGGAA +chr7 117182084 117182126 CAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAA +chr7 117182103 117182104 A +chr7 117182103 117182106 AAT +chr7 117182106 117182109 ATA chr7 117182106 117182107 A +chr7 117182107 117182109 TA +chr7 117182077 117182107 TACAAAAGCAAGAATATAAGACATTGGAATA chr7 117182087 117182129 GAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTA chr7 117182074 117182104 CTTACAAAAGCAAGAATATAAGACATTGGA chr7 117182104 117182134 ATATAACTTAACGACTACAGAAGTAGTGAT @@ -98,7 +105,11 @@ chr17 41245340 41245341 T chr17 41245340 41245341 T chr7 117307164 117307165 A chr11 17496507 17496508 T +chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA +chr17 7126009 7126051 CATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGT chr17 7126027 7126037 AGCAGAGGTG +chr17 7126028 7126029 A +chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG chr1 76216233 76216235 AA chr1 76199213 76199220 AGGTCTT chr1 76199194 76199240 AACATTTTGATACTGTAGGAGGTCTTGGACTTGGAACTTTTGATGC diff --git a/pyhgvs/tests/data/test_variant_to_name.genome b/pyhgvs/tests/data/test_variant_to_name.genome index f310c9c..1f0b257 100644 --- a/pyhgvs/tests/data/test_variant_to_name.genome +++ b/pyhgvs/tests/data/test_variant_to_name.genome @@ -66,3 +66,9 @@ chr7 117188682 117188712 tTTTTTTAACAGGGATTTGGGGAATTATTT chr7 117188582 117188783 CCATGTGCTTTTCAAACTAATTGTACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGAtgtgtgtgtgtgtgtgtgtgtgtTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGTAATTT chr7 117188687 117188689 TT chr7 117188689 117188691 AA +chr17 7125928 7126129 TCCCCCAGTGACAACCTGTTGAACACACCTCTGCTTTCCCACACTGCCCTGACACAGTGGGCCCCCTGAGAAGAAGATGGGCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGTACGGGTGCCATCGGAGAACGTGCTGGGTGAGGTTGGGAGTGGCTTCAAGGTTGCCATGCACATCCTCAACAATGGAAG +chr17 7126008 7126050 GCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAG +chr17 7126027 7126029 AG +chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG +chr17 7126029 7126031 CA +chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA \ No newline at end of file diff --git a/pyhgvs/tests/genome.py b/pyhgvs/tests/genome.py index 5c53da6..0887dfd 100644 --- a/pyhgvs/tests/genome.py +++ b/pyhgvs/tests/genome.py @@ -77,9 +77,11 @@ def __init__(self, lookup=None, filename=None, db_filename=None, if SequenceFileDB is None: raise ValueError('pygr is not available.') self._genome = SequenceFileDB(db_filename) + self._source_filename = db_filename elif filename: # Read genome sequence from lookup table. self.read(filename) + self._source_filename = filename def __contains__(self, chrom): """Return True if genome contains chromosome.""" @@ -113,8 +115,8 @@ def get_seq(self, chrom, start, end): None, end - start)) else: raise MockGenomeError( - 'Sequence not in test data: %s:%d-%d' % - (chrom, start, end)) + 'Sequence not in test data: %s:%d-%d source: %s' % + (chrom, start, end, self._source_filename)) def read(self, filename): """Read a sequence lookup table from a file. diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 83e239e..55b20f4 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -11,57 +11,52 @@ def read_refgene(infile): - """ - Iterate through a refGene file. + """ refGene = genePred with extra column at front (and ignored ones after) """ + return read_genepred(infile, skip_first_column=True) + +def read_genepred(infile, skip_first_column=False): + """ GenePred extension format: http://genome.ucsc.edu/FAQ/FAQformat.html#GenePredExt Column definitions: - 0. uint undocumented id - 1. string name; "Name of gene (usually transcript_id from GTF)" - 2. string chrom; "Chromosome name" - 3. char[1] strand; "+ or - for strand" - 4. uint txStart; "Transcription start position" - 5. uint txEnd; "Transcription end position" - 6. uint cdsStart; "Coding region start" - 7. uint cdsEnd; "Coding region end" - 8. uint exonCount; "Number of exons" - 9. uint[exonCount] exonStarts; "Exon start positions" - 10. uint[exonCount] exonEnds; "Exon end positions" - 11. uint id; "Unique identifier" - 12. string name2; "Alternate name (e.g. gene_id from GTF)" - 13. string cdsStartStat; "enum('none','unk','incmpl','cmpl')" - 14. string cdsEndStat; "enum('none','unk','incmpl','cmpl')" - 15. lstring exonFrames; "Exon frame offsets {0,1,2}" + 0. string name; "Name of gene (usually transcript_id from GTF)" + 1. string chrom; "Chromosome name" + 2. char[1] strand; "+ or - for strand" + 3. uint txStart; "Transcription start position" + 4. uint txEnd; "Transcription end position" + 5. uint cdsStart; "Coding region start" + 6. uint cdsEnd; "Coding region end" + 7. uint exonCount; "Number of exons" + 8. uint[exonCount] exonStarts; "Exon start positions" + 9. uint[exonCount] exonEnds; "Exon end positions" + 10. uint id; "Unique identifier" + 11. string name2; "Alternate name (e.g. gene_id from GTF)" """ for line in infile: # Skip comments. if line.startswith('#'): continue row = line.rstrip('\n').split('\t') - if len(row) != 16: - raise ValueError( - 'File has incorrect number of columns ' - 'in at least one line.') + if skip_first_column: + row = row[1:] # Skip trailing , - exon_starts = list(map(int, row[9].split(',')[:-1])) - exon_ends = list(map(int, row[10].split(',')[:-1])) - exon_frames = list(map(int, row[15].split(',')[:-1])) + exon_starts = list(map(int, row[8].split(',')[:-1])) + exon_ends = list(map(int, row[9].split(',')[:-1])) exons = list(zip(exon_starts, exon_ends)) yield { - 'chrom': row[2], - 'start': int(row[4]), - 'end': int(row[5]), - 'id': row[1], - 'strand': row[3], - 'cds_start': int(row[6]), - 'cds_end': int(row[7]), - 'gene_name': row[12], + 'chrom': row[1], + 'start': int(row[3]), + 'end': int(row[4]), + 'id': row[0], + 'strand': row[2], + 'cds_start': int(row[5]), + 'cds_end': int(row[6]), + 'gene_name': row[11], 'exons': exons, - 'exon_frames': exon_frames } diff --git a/pyhgvs/variants.py b/pyhgvs/variants.py index fca0cdc..36c7383 100644 --- a/pyhgvs/variants.py +++ b/pyhgvs/variants.py @@ -115,7 +115,7 @@ def justify_genomic_indel(genome, chrom, start, end, indel, justify, def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome, - justify='left', flank_length=30): + justify='left', flank_length=30, indels_start_with_same_base=True): """ Normalize variant according to the GATK/VCF standard. @@ -133,7 +133,7 @@ def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome, chrom_stop=end, is_forward_strand=True) return NormalizedVariant(position, ref_sequence, alt_sequences, - genome=genome, justify=justify) + genome=genome, justify=justify, indels_start_with_same_base=indels_start_with_same_base) class NormalizedVariant(object): @@ -142,7 +142,8 @@ class NormalizedVariant(object): """ def __init__(self, position, ref_allele, alt_alleles, - seq_5p='', seq_3p='', genome=None, justify='left'): + seq_5p='', seq_3p='', genome=None, justify='left', + indels_start_with_same_base=True): """ position: a 0-index genomic Position. ref_allele: the reference allele sequence. @@ -150,6 +151,9 @@ def __init__(self, position, ref_allele, alt_alleles, seq_5p: 5 prime flanking sequence of variant. seq_3p: 3 prime flanking sequence of variant. genome: a pygr compatible genome object (optional). + + indels_start_with_same_base: DML - I have no idea why this is required + but am keeping for backwards compat """ self.position = position self.alleles = [ref_allele] + list(alt_alleles) @@ -157,6 +161,7 @@ def __init__(self, position, ref_allele, alt_alleles, self.seq_3p = seq_3p self.genome = genome self.log = [] + self.indels_start_with_same_base = indels_start_with_same_base self._on_forward_strand() self._trim_common_prefix() @@ -275,7 +280,7 @@ def _1bp_pad(self): # Pad sequences with one 5-prime base before the mutation event. empty_seq = any(not allele for allele in self.alleles) uniq_starts = set(allele[0] for allele in self.alleles if allele) - if empty_seq or len(uniq_starts) > 1: + if empty_seq or (self.indels_start_with_same_base and len(uniq_starts) > 1): # Fetch more 5p flanking sequence if needed. if self.genome and self.seq_5p == '': start = self.position.chrom_start @@ -299,9 +304,10 @@ def _1bp_pad(self): self.seq_3p = self.seq_3p[1:] self.position.chrom_stop += 1 - if len(set(a[0] for a in self.alleles)) != 1: - raise AssertionError( - "All INDEL alleles should start with same base.") + if self.indels_start_with_same_base: + if len(set(a[0] for a in self.alleles)) != 1: + raise AssertionError( + "All INDEL alleles should start with same base.") def _set_1based_position(self): """ From e2f7dfb718c79eb00a687634e9b64f6bf2c8d792 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Mon, 16 Mar 2020 17:18:14 +1030 Subject: [PATCH 02/47] Unit tests for previously failing HGVS names --- pyhgvs/tests/test_hgvs_names.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py index be294e0..58cf351 100644 --- a/pyhgvs/tests/test_hgvs_names.py +++ b/pyhgvs/tests/test_hgvs_names.py @@ -123,13 +123,22 @@ def test_variant_to_name(): for (expected_hgvs_name, variant, name_canonical, var_canonical) in _name_variants: if name_canonical: - transcript_name = HGVSName(expected_hgvs_name).transcript - transcript = get_transcript(transcript_name) - assert transcript, transcript_name + hgvs = HGVSName(expected_hgvs_name) + transcript_name = hgvs.transcript + transcript = None + if transcript_name: + transcript = get_transcript(transcript_name) + assert transcript, transcript_name chrom, offset, ref, alt = variant hgvs_name = format_hgvs_name( chrom, offset, ref, alt, genome, transcript, use_gene=False) + + if hgvs.kind == 'g': + # Strip off g.HGVS prefix + i = expected_hgvs_name.find(":g.") + expected_hgvs_name = expected_hgvs_name[i+1:] + nose.tools.assert_equal( hgvs_name, expected_hgvs_name, repr([hgvs_name, expected_hgvs_name, variant])) @@ -723,7 +732,11 @@ def test_invalid_coordinates(): ('NM_000016.4:c.387+1delG', ('chr1', 76199309, 'TG', 'T'), True, True), ('NM_000016.4:c.475delT', ('chr1', 76205669, 'AT', 'A'), True, True), ('NM_000016.4:c.1189dupT', ('chr1', 76227049, 'C', 'CT'), True, True), + # Same as above but without optional trailing base + ('NM_000016.4:c.1189dup', ('chr1', 76227049, 'C', 'CT'), False, True), ('NM_000016.4:c.1191delT', ('chr1', 76227051, 'AT', 'A'), True, True), + # Same as above but without optional trailing base + ('NM_000016.4:c.1191del', ('chr1', 76227051, 'AT', 'A'), False, True), ('NM_000016.4:c.306_307insG', ('chr1', 76199232, 'T', 'TG'), True, True), # Alignment tests for HGVS 3' and VCF left-alignment. @@ -734,6 +747,9 @@ def test_invalid_coordinates(): True, True), ('NM_000492.3:c.1155_1156dupTA', ('chr7', 117182104, 'A', 'AAT'), True, True), + # Same as above but without optional trailing base + ('NM_000492.3:c.1155_1156dup', ('chr7', 117182104, 'A', 'AAT'), + False, True), ('NM_000492.3:c.3889dupT', ('chr7', 117292905, 'A', 'AT'), True, True), # Transcript prefix. @@ -750,6 +766,9 @@ def test_invalid_coordinates(): ('chr11:g.17496508T>C', ('chr11', 17496508, 'T', 'C'), False, True), # Genomic indels. + ('chr17:g.7126029_7126030insTG', + ('chr17', 7126028, 'A', 'AGT'), True, True), + ('chr17:g.7126029_7126037delGCAGAGGTGinsTCAAAGCAC', ('chr17', 7126028, 'AGCAGAGGTG', 'ATCAAAGCAC'), False, True), @@ -757,6 +776,13 @@ def test_invalid_coordinates(): ('chr1:g.76216235delAinsGC', ('chr1', 76216234, 'AA', 'AGC'), False, True), + # Genomic dup + ('chr7:g.117182108_117182109dup', ('chr7', 117182104, 'A', 'AAT'), + False, True), + + ('chr7:g.117182105_117182106dupAT', ('chr7', 117182104, 'A', 'AAT'), + False, True), + # Genomic delete region. ('chr1:g.76199215_76199220delGGTCTT', ('chr1', 76199214, 'AGGTCTT', 'A'), False, True), From 9ffdc7fb9692054957e536243ec1734b2fab5217 Mon Sep 17 00:00:00 2001 From: davmlaw Date: Wed, 22 Apr 2020 15:50:26 +0930 Subject: [PATCH 03/47] Populate alt for values like NC_000001.11:g.169549811= --- pyhgvs/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index 9b28e8f..df6e108 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -672,9 +672,13 @@ def get_vcf_allele(hgvs, genome, transcript=None): transcript.tx_position.is_forward_strand if transcript else True) ref = get_genomic_sequence(genome, chrom, start, end) + # Sometimes we need to retrieve alt from reference + # Eg NC_000001.11:g.169549811= + if hgvs.mutation_type == "=" and alt == 'N': + alt = ref + if hgvs.mutation_type in _indel_mutation_types: if hgvs.mutation_type == 'dup': - # Sometimes we need to retrieve alt from reference, eg: # No alt supplied: NM_000492.3:c.1155_1156dup # Number used: NM_004119.2(FLT3):c.1794_1811dup18 # We *know* what the sequence is for "dup18", but not for "ins18" From 00a22054a4060a5dfb3e05376f9fbf0338352eb2 Mon Sep 17 00:00:00 2001 From: davmlaw Date: Wed, 29 Apr 2020 20:29:50 +0930 Subject: [PATCH 04/47] Support mito (ie "m.") --- pyhgvs/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index df6e108..9744b84 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -853,7 +853,7 @@ def parse_allele(self, allele): self.parse_cdna(details) elif kind == "p": self.parse_protein(details) - elif kind == "g": + elif kind in ["g", "m"]: self.parse_genome(details) else: raise NotImplementedError("unknown kind: %s" % allele) @@ -1019,8 +1019,8 @@ def format(self, use_prefix=True, use_gene=True, use_counsyl=False): allele = 'c.' + self.format_cdna() elif self.kind == 'p': allele = 'p.' + self.format_protein() - elif self.kind == 'g': - allele = 'g.' + self.format_genome() + elif self.kind in ['g', 'm']: + allele = self.kind + '.' + self.format_genome() else: raise NotImplementedError("not implemented: '%s'" % self.kind) @@ -1040,7 +1040,7 @@ def format_prefix(self, use_gene=True): NM_007294.3(BRCA1):c.2207A>C """ - if self.kind == 'g': + if self.kind in ['g', 'm']: if self.chrom: return self.chrom @@ -1181,7 +1181,7 @@ def get_raw_coords(self, transcript=None): if start > end: raise AssertionError( "cdna_start cannot be greater than cdna_end") - elif self.kind == 'g': + elif self.kind in ['g', 'm']: chrom = self.chrom start = self.start end = self.end From 2b4ac8bb64b61410535e79858af4c67582fd9b95 Mon Sep 17 00:00:00 2001 From: davmlaw Date: Wed, 29 Apr 2020 20:33:37 +0930 Subject: [PATCH 05/47] Revert "Support mito (ie "m.")" This reverts commit 00a22054a4060a5dfb3e05376f9fbf0338352eb2. --- pyhgvs/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index 9744b84..df6e108 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -853,7 +853,7 @@ def parse_allele(self, allele): self.parse_cdna(details) elif kind == "p": self.parse_protein(details) - elif kind in ["g", "m"]: + elif kind == "g": self.parse_genome(details) else: raise NotImplementedError("unknown kind: %s" % allele) @@ -1019,8 +1019,8 @@ def format(self, use_prefix=True, use_gene=True, use_counsyl=False): allele = 'c.' + self.format_cdna() elif self.kind == 'p': allele = 'p.' + self.format_protein() - elif self.kind in ['g', 'm']: - allele = self.kind + '.' + self.format_genome() + elif self.kind == 'g': + allele = 'g.' + self.format_genome() else: raise NotImplementedError("not implemented: '%s'" % self.kind) @@ -1040,7 +1040,7 @@ def format_prefix(self, use_gene=True): NM_007294.3(BRCA1):c.2207A>C """ - if self.kind in ['g', 'm']: + if self.kind == 'g': if self.chrom: return self.chrom @@ -1181,7 +1181,7 @@ def get_raw_coords(self, transcript=None): if start > end: raise AssertionError( "cdna_start cannot be greater than cdna_end") - elif self.kind in ['g', 'm']: + elif self.kind == 'g': chrom = self.chrom start = self.start end = self.end From a6e238e4c94023867fed6ce2dfa9991bbcc2ef43 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 12 May 2021 13:03:25 +0930 Subject: [PATCH 06/47] Support n. (non-coding) ie https://github.com/counsyl/hgvs/issues/7 --- pyhgvs/__init__.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index df6e108..fe9aab6 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -849,7 +849,7 @@ def parse_allele(self, allele): self.kind = kind self.mutation_type = None - if kind == "c": + if kind in ("c", 'n'): self.parse_cdna(details) elif kind == "p": self.parse_protein(details) @@ -1017,6 +1017,8 @@ def format(self, use_prefix=True, use_gene=True, use_counsyl=False): if self.kind == 'c': allele = 'c.' + self.format_cdna() + elif self.kind == 'n': + allele = 'n.' + self.format_cdna() elif self.kind == 'p': allele = 'p.' + self.format_protein() elif self.kind == 'g': @@ -1167,7 +1169,7 @@ def format_genome(self): def get_raw_coords(self, transcript=None): """ return genomic coordinates """ - if self.kind == 'c': + if self.kind in ('c', 'n'): chrom = transcript.tx_position.chrom start = cdna_to_genomic_coord(transcript, self.cdna_start) end = cdna_to_genomic_coord(transcript, self.cdna_end) @@ -1382,7 +1384,7 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None, hgvs = HGVSName(hgvs_name) # Determine transcript. - if hgvs.kind == 'c' and not transcript: + if hgvs.kind in ('c', 'n') and not transcript: if '.' in hgvs.transcript and lazy: hgvs.transcript, _ = hgvs.transcript.split('.') elif '.' in hgvs.gene and lazy: @@ -1449,7 +1451,11 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript, hgvs.end = offset_end else: # Use cDNA coordinates. - hgvs.kind = 'c' + if transcript.is_coding: + hgvs.kind = 'c' + else: + hgvs.kind = 'n' + is_single_base_indel = ( (mutation_type == 'ins' and len(alt) == 1) or (mutation_type in ('del', 'delins', 'dup') and len(ref) == 1)) From 957d25c5dfc6d5526ef993f56e869b3461ca156f Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Tue, 7 Sep 2021 10:52:41 +0930 Subject: [PATCH 07/47] Make sure non-coding doesn't have 5' or 3' UTR offsets --- pyhgvs/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index fe9aab6..977ae39 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -851,6 +851,11 @@ def parse_allele(self, allele): if kind in ("c", 'n'): self.parse_cdna(details) + if kind == 'n': # Ensure no 3'UTR or 5'UTR coords in non-coding + if self.cdna_start.coord < 0: + raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain negative (5'UTR) coordinates") + if self.cdna_start.landmark == 'cdna_stop' or self.cdna_end and self.cdna_end.landmark == 'cdna_stop': + raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain '*' (3'UTR) coordinates") elif kind == "p": self.parse_protein(details) elif kind == "g": From 1a8b67fd7f374c9e845c246655ab5c61be7a5550 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Tue, 12 Oct 2021 11:18:01 +1030 Subject: [PATCH 08/47] Refactor - move functions into methods --- pyhgvs/__init__.py | 1179 +------------------------- pyhgvs/models.py | 152 ---- pyhgvs/models/__init__.py | 0 pyhgvs/models/cdna.py | 123 +++ pyhgvs/models/genome.py | 47 + pyhgvs/models/hgvs_name.py | 801 +++++++++++++++++ pyhgvs/models/transcript.py | 335 ++++++++ pyhgvs/{ => models}/variants.py | 21 +- pyhgvs/tests/genome.py | 2 +- pyhgvs/tests/test_hgvs_names.py | 12 +- pyhgvs/tests/test_hgvs_names_long.py | 2 +- pyhgvs/tests/test_variants.py | 2 +- pyhgvs/utils.py | 5 +- requirements-dev.txt | 1 + setup.py | 1 + 15 files changed, 1340 insertions(+), 1343 deletions(-) delete mode 100644 pyhgvs/models.py create mode 100644 pyhgvs/models/__init__.py create mode 100644 pyhgvs/models/cdna.py create mode 100644 pyhgvs/models/genome.py create mode 100644 pyhgvs/models/hgvs_name.py create mode 100644 pyhgvs/models/transcript.py rename pyhgvs/{ => models}/variants.py (96%) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index 977ae39..b3004b2 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -7,503 +7,16 @@ Definition of which transcript to use coding variants: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene - -HGVS language currently implemented. - -HGVS = ALLELE - | PREFIX_NAME : ALLELE - -PREFIX_NAME = TRANSCRIPT - | TRANSCRIPT '(' GENE ')' - -TRANSCRIPT = TRANSCRIPT_NAME - | TRANSCRIPT_NAME '.' TRANSCRIPT_VERSION - -TRANSCRIPT_VERSION = NUMBER - -ALLELE = 'c.' CDNA_ALLELE # cDNA - | 'g.' GENOMIC_ALLELE # genomic - | 'm.' MIT_ALLELE # mitochondrial sequence - | 'n.' NC_ALLELE # non-coding RNA reference sequence - | 'r.' RNA_ALLELE # RNA sequence (like r.76a>u) - | 'p.' PROTEIN_ALLELE # protein sequence (like p.Lys76Asn) - -NC_ALLELE = -RNA_ALLELE = -CDNA_ALLELE = CDNA_COORD SINGLE_BASE_CHANGE - | CDNA_COORD_RANGE MULTI_BASE_CHANGE - -GENOMIC_ALLELE = -MIT_ALLELE = COORD SINGLE_BASE_CHANGE - | COORD_RANGE MULTI_BASE_CHANGE - -SINGLE_BASE_CHANGE = CDNA_ALLELE = CDNA_COORD BASE '=' # no change - | CDNA_COORD BASE '>' BASE # substitution - | CDNA_COORD 'ins' BASE # 1bp insertion - | CDNA_COORD 'del' BASE # 1bp deletion - | CDNA_COORD 'dup' BASE # 1bp duplication - | CDNA_COORD 'ins' # 1bp insertion - | CDNA_COORD 'del' # 1bp deletion - | CDNA_COORD 'dup' # 1bp duplication - | CDNA_COORD 'del' BASE 'ins' BASE # 1bp indel - | CDNA_COORD 'delins' BASE # 1bp indel - -MULTI_BASE_CHANGE = COORD_RANGE 'del' BASES # deletion - | COORD_RANGE 'ins' BASES # insertion - | COORD_RANGE 'dup' BASES # duplication - | COORD_RANGE 'del' # deletion - | COORD_RANGE 'dup' # duplication - | COORD_RANGE 'del' BASES 'ins' BASES # indel - | COORD_RANGE 'delins' BASES # indel - - -AMINO1 = [GAVLIMFWPSTCYNQDEKRH] - -AMINO3 = 'Gly' | 'Ala' | 'Val' | 'Leu' | 'Ile' | 'Met' | 'Phe' | 'Trp' | 'Pro' - | 'Ser' | 'Thr' | 'Cys' | 'Tyr' | 'Asn' | 'Gln' | 'Asp' | 'Glu' | 'Lys' - | 'Arg' | 'His' - -PROTEIN_ALLELE = AMINO3 COORD '=' # no peptide change - | AMINO1 COORD '=' # no peptide change - | AMINO3 COORD AMINO3 PEP_EXTRA # peptide change - | AMINO1 COORD AMINO1 PEP_EXTRA # peptide change - | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA # indel - | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA # indel - | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA AMINO3 # indel - | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA AMINO1 # indel - -# A genomic range: -COORD_RANGE = COORD '_' COORD - -# A cDNA range: -CDNA_COORD_RANGE = CDNA_COORD '_' CDNA_COORD - -# A cDNA coordinate: -CDNA_COORD = COORD_PREFIX COORD - | COORD_PREFIX COORD OFFSET_PREFIX OFFSET -COORD_PREFIX = '' | '-' | '*' -COORD = NUMBER -OFFSET_PREFIX = '-' | '+' -OFFSET = NUMBER - -# Primatives: -NUMBER = \d+ -BASE = [ACGT] -BASES = BASE+ - """ from __future__ import absolute_import from __future__ import unicode_literals import re -from .variants import justify_indel -from .variants import normalize_variant -from .variants import revcomp - - -CHROM_PREFIX = 'chr' -CDNA_START_CODON = 'cdna_start' -CDNA_STOP_CODON = 'cdna_stop' - - -class HGVSRegex(object): - """ - All regular expression for HGVS names. - """ - - # DNA syntax - # http://www.hgvs.org/mutnomen/standards.html#nucleotide - BASE = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]|\d+" - BASES = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]+|\d+" - DNA_REF = "(?P" + BASES + ")" - DNA_ALT = "(?P" + BASES + ")" - - # Mutation types - EQUAL = "(?P=)" - SUB = "(?P>)" - INS = "(?Pins)" - DEL = "(?Pdel)" - DUP = "(?Pdup)" - - # Simple coordinate syntax - COORD_START = "(?P\d+)" - COORD_END = "(?P\d+)" - COORD_RANGE = COORD_START + "_" + COORD_END - - # cDNA coordinate syntax - CDNA_COORD = ("(?P|-|\*)(?P\d+)" - "((?P-|\+)(?P\d+))?") - CDNA_START = ("(?P(?P|-|\*)(?P\d+)" - "((?P-|\+)(?P\d+))?)") - CDNA_END = (r"(?P(?P|-|\*)(?P\d+)" - "((?P-|\+)(?P\d+))?)") - CDNA_RANGE = CDNA_START + "_" + CDNA_END - - # cDNA allele syntax - CDNA_ALLELE = [ - # No change - CDNA_START + DNA_REF + EQUAL, - - # Substitution - CDNA_START + DNA_REF + SUB + DNA_ALT, - - # 1bp insertion, deletion, duplication - CDNA_START + INS + DNA_ALT, - CDNA_START + DEL + DNA_REF, - CDNA_START + DUP + DNA_REF, - CDNA_START + DEL, - CDNA_START + DUP, - - # Insertion, deletion, duplication - CDNA_RANGE + INS + DNA_ALT, - CDNA_RANGE + DEL + DNA_REF, - CDNA_RANGE + DUP + DNA_REF, - CDNA_RANGE + DEL, - CDNA_RANGE + DUP, - - # Indels - "(?P" + CDNA_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + CDNA_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + CDNA_START + 'delins' + DNA_ALT + ")", - "(?P" + CDNA_RANGE + 'delins' + DNA_ALT + ")", - ] - - CDNA_ALLELE_REGEXES = [re.compile("^" + regex + "$") - for regex in CDNA_ALLELE] - - # Peptide syntax - PEP = "([A-Z]([a-z]{2}))+" - PEP_REF = "(?P" + PEP + ")" - PEP_REF2 = "(?P" + PEP + ")" - PEP_ALT = "(?P" + PEP + ")" - - PEP_EXTRA = "(?P(|=|\?)(|fs))" - - # Peptide allele syntax - PEP_ALLELE = [ - # No peptide change - # Example: Glu1161= - PEP_REF + COORD_START + PEP_EXTRA, - - # Peptide change - # Example: Glu1161Ser - PEP_REF + COORD_START + PEP_ALT + PEP_EXTRA, - - # Peptide indel - # Example: Glu1161_Ser1164?fs - "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + - PEP_EXTRA + ")", - "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + - PEP_ALT + PEP_EXTRA + ")", - ] - - PEP_ALLELE_REGEXES = [re.compile("^" + regex + "$") - for regex in PEP_ALLELE] - - # Genomic allele syntax - GENOMIC_ALLELE = [ - # No change - COORD_START + DNA_REF + EQUAL, - - # Substitution - COORD_START + DNA_REF + SUB + DNA_ALT, - - # 1bp insertion, deletion, duplication - COORD_START + INS + DNA_ALT, - COORD_START + DEL + DNA_REF, - COORD_START + DUP + DNA_REF, - COORD_START + DEL, - COORD_START + DUP, - - # Insertion, deletion, duplication - COORD_RANGE + INS + DNA_ALT, - COORD_RANGE + DEL + DNA_REF, - COORD_RANGE + DUP + DNA_REF, - COORD_RANGE + DEL, - COORD_RANGE + DUP, - - # Indels - "(?P" + COORD_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + COORD_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + COORD_START + 'delins' + DNA_ALT + ")", - "(?P" + COORD_RANGE + 'delins' + DNA_ALT + ")", - ] - - GENOMIC_ALLELE_REGEXES = [re.compile("^" + regex + "$") - for regex in GENOMIC_ALLELE] - - -class ChromosomeSubset(object): - """ - Allow direct access to a subset of the chromosome. - """ - - def __init__(self, name, genome=None): - self.name = name - self.genome = genome - - def __getitem__(self, key): - """Return sequence from region [start, end) - - Coordinates are 0-based, end-exclusive.""" - if isinstance(key, slice): - start, end = (key.start, key.stop) - start -= self.genome.start - end -= self.genome.start - return self.genome.genome[self.genome.seqid][start:end] - else: - raise TypeError('Expected a slice object but ' - 'received a {0}.'.format(type(key))) - - def __repr__(self): - return 'ChromosomeSubset("%s")' % self.name - - -class GenomeSubset(object): - """ - Allow the direct access of a subset of the genome. - """ - - def __init__(self, genome, chrom, start, end, seqid): - self.genome = genome - self.chrom = chrom - self.start = start - self.end = end - self.seqid = seqid - self._chroms = {} - - def __getitem__(self, chrom): - """Return a chromosome by its name.""" - if chrom in self._chroms: - return self._chroms[chrom] - else: - chromosome = ChromosomeSubset(chrom, self) - self._chroms[chrom] = chromosome - return chromosome - - -class CDNACoord(object): - """ - A HGVS cDNA-based coordinate. - - A cDNA coordinate can take one of these forms: - - N = nucleotide N in protein coding sequence (e.g. 11A>G) - - -N = nucleotide N 5' of the ATG translation initiation codon (e.g. -4A>G) - NOTE: so located in the 5'UTR or 5' of the transcription initiation - site (upstream of the gene, incl. promoter) - - *N = nucleotide N 3' of the translation stop codon (e.g. *6A>G) - NOTE: so located in the 3'UTR or 3' of the polyA-addition site - (including downstream of the gene) - - N+M = nucleotide M in the intron after (3' of) position N in the coding DNA - reference sequence (e.g. 30+4A>G) - - N-M = nucleotide M in the intron before (5' of) position N in the coding - DNA reference sequence (e.g. 301-2A>G) - - -N+M / -N-M = nucleotide in an intron in the 5'UTR (e.g. -45+4A>G) - - *N+M / *N-M = nucleotide in an intron in the 3'UTR (e.g. *212-2A>G) - """ - - def __init__(self, coord=0, offset=0, landmark=CDNA_START_CODON, - string=''): - """ - coord: main coordinate along cDNA on the same strand as the transcript - - offset: an additional genomic offset from the main coordinate. This - allows referencing non-coding (e.g. intronic) positions. - Offset is also interpreted on the coding strand. - - landmark: ('cdna_start', 'cdna_stop') indicating that 'coord' - is relative to one of these landmarks. - - string: a coordinate from an HGVS name. If given coord, offset, and - landmark should not be specified. - """ - - if string: - if coord != 0 or offset != 0 or landmark != CDNA_START_CODON: - raise ValueError("coord, offset, and landmark should not " - "be given with string argument") - - self.parse(string) - else: - self.coord = coord - self.offset = offset - self.landmark = landmark - - def parse(self, coord_text): - """ - Parse a HGVS formatted cDNA coordinate. - """ - - match = re.match(r"(|-|\*)(\d+)((-|\+)(\d+))?", coord_text) - if not match: - raise ValueError("unknown coordinate format '%s'" % coord_text) - coord_prefix, coord, _, offset_prefix, offset = match.groups() - - self.coord = int(coord) - self.offset = int(offset) if offset else 0 - - if offset_prefix == '-': - self.offset *= -1 - elif offset_prefix == '+' or offset is None: - pass - else: - raise ValueError("unknown offset_prefix '%s'" % offset_prefix) - - if coord_prefix == '': - self.landmark = CDNA_START_CODON - elif coord_prefix == "-": - self.coord *= -1 - self.landmark = CDNA_START_CODON - elif coord_prefix == '*': - self.landmark = CDNA_STOP_CODON - else: - raise ValueError("unknown coord_prefix '%s'" % coord_prefix) - return self - - def __str__(self): - """ - Return a formatted cDNA coordinate - """ - if self.landmark == CDNA_STOP_CODON: - coord_prefix = '*' - else: - coord_prefix = '' - - if self.offset < 0: - offset = '%d' % self.offset - elif self.offset > 0: - offset = '+%d' % self.offset - else: - offset = '' - - return '%s%d%s' % (coord_prefix, self.coord, offset) - - def __eq__(self, other): - """Equality operator.""" - return ((self.coord, self.offset, self.landmark) == - (other.coord, other.offset, other.landmark)) - - def __repr__(self): - """ - Returns a string representation of a cDNA coordinate. - """ - if self.landmark != CDNA_START_CODON: - return "CDNACoord(%d, %d, '%s')" % ( - self.coord, self.offset, self.landmark) - else: - return "CDNACoord(%d, %d)" % (self.coord, self.offset) - - -# The RefSeq standard for naming contigs/transcripts/proteins: -# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8 -REFSEQ_PREFIXES = [ - ('AC_', 'genomic', - 'Complete genomic molecule, usually alternate assembly'), - ('NC_', 'genomic', - 'Complete genomic molecule, usually reference assembly'), - ('NG_', 'genomic', 'Incomplete genomic region'), - ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'), - ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'), - ('NS_', 'genomic', 'Environmental sequence'), - ('NZ_', 'genomic', 'Unfinished WGS'), - ('NM_', 'mRNA', ''), - ('NR_', 'RNA', ''), - ('XM_', 'mRNA', 'Predicted model'), - ('XR_', 'RNA', 'Predicted model'), - ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'), - ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'), - ('YP_', 'Protein', ''), - ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'), - ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'), -] - -REFSEQ_PREFIX_LOOKUP = dict( - (prefix, (kind, description)) - for prefix, kind, description in REFSEQ_PREFIXES -) - - -def get_refseq_type(name): - """ - Return the RefSeq type for a refseq name. - """ - prefix = name[:3] - return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0] - - -def get_exons(transcript): - """Yield exons in coding order.""" - transcript_strand = transcript.tx_position.is_forward_strand - if hasattr(transcript.exons, 'select_related'): - exons = list(transcript.exons.select_related('tx_position')) - else: - exons = list(transcript.exons) - exons.sort(key=lambda exon: exon.tx_position.chrom_start) - if not transcript_strand: - exons.reverse() - return exons - - -def get_coding_exons(transcript): - """Yield non-empty coding exonic regions in coding order.""" - for exon in get_exons(transcript): - region = exon.get_as_interval(coding_only=True) - if region: - yield region - - -def get_utr5p_size(transcript): - """Return the size of the 5prime UTR of a transcript.""" - - transcript_strand = transcript.tx_position.is_forward_strand - exons = get_exons(transcript) - - # Find the exon containing the start codon. - start_codon = (transcript.cds_position.chrom_start if transcript_strand - else transcript.cds_position.chrom_stop - 1) - cdna_len = 0 - for exon in exons: - exon_start = exon.tx_position.chrom_start - exon_end = exon.tx_position.chrom_stop - if exon_start <= start_codon < exon_end: - break - cdna_len += exon_end - exon_start - else: - raise ValueError("transcript contains no exons") - - if transcript_strand: - return cdna_len + (start_codon - exon_start) - else: - return cdna_len + (exon_end - start_codon - 1) - - -def find_stop_codon(exons, cds_position): - """Return the position along the cDNA of the base after the stop codon.""" - if cds_position.is_forward_strand: - stop_pos = cds_position.chrom_stop - else: - stop_pos = cds_position.chrom_start - cdna_pos = 0 - for exon in exons: - exon_start = exon.tx_position.chrom_start - exon_stop = exon.tx_position.chrom_stop - - if exon_start <= stop_pos <= exon_stop: - if cds_position.is_forward_strand: - return cdna_pos + stop_pos - exon_start - else: - return cdna_pos + exon_stop - stop_pos - else: - cdna_pos += exon_stop - exon_start - raise ValueError('Stop codon is not in any of the exons') +from .models.cdna import CDNACoord +from .models.genome import GenomeSubset +from .models.hgvs_name import HGVSName +from .models.variants import justify_indel, normalize_variant, revcomp def get_genomic_sequence(genome, chrom, start, end): @@ -518,141 +31,6 @@ def get_genomic_sequence(genome, chrom, start, end): return str(genome[str(chrom)][start - 1:end]).upper() -def cdna_to_genomic_coord(transcript, coord): - """Convert a HGVS cDNA coordinate to a genomic coordinate.""" - transcript_strand = transcript.tx_position.is_forward_strand - exons = get_exons(transcript) - utr5p = (get_utr5p_size(transcript) - if transcript.is_coding else 0) - - # compute starting position along spliced transcript. - if coord.landmark == CDNA_START_CODON: - if coord.coord > 0: - pos = utr5p + coord.coord - else: - pos = utr5p + coord.coord + 1 - elif coord.landmark == CDNA_STOP_CODON: - if coord.coord < 0: - raise ValueError('CDNACoord cannot have a negative coord and ' - 'landmark CDNA_STOP_CODON') - pos = find_stop_codon(exons, transcript.cds_position) + coord.coord - else: - raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark) - - # 5' flanking sequence. - if pos < 1: - if transcript_strand: - return transcript.tx_position.chrom_start + pos - else: - return transcript.tx_position.chrom_stop - pos + 1 - - # Walk along transcript until we find an exon that contains pos. - cdna_start = 1 - cdna_end = 1 - for exon in exons: - exon_start = exon.tx_position.chrom_start + 1 - exon_end = exon.tx_position.chrom_stop - cdna_end = cdna_start + (exon_end - exon_start) - if cdna_start <= pos <= cdna_end: - break - cdna_start = cdna_end + 1 - else: - # 3' flanking sequence - if transcript_strand: - return transcript.cds_position.chrom_stop + coord.coord - else: - return transcript.cds_position.chrom_start + 1 - coord.coord - - # Compute genomic coordinate using offset. - if transcript_strand: - # Plus strand. - return exon_start + (pos - cdna_start) + coord.offset - else: - # Minus strand. - return exon_end - (pos - cdna_start) - coord.offset - - -def genomic_to_cdna_coord(transcript, genomic_coord): - """Convert a genomic coordinate to a cDNA coordinate and offset. - """ - exons = [exon.get_as_interval() - for exon in get_exons(transcript)] - - if len(exons) == 0: - return None - - strand = transcript.strand - - if strand == "+": - exons.sort(key=lambda exon: exon.chrom_start) - else: - exons.sort(key=lambda exon: -exon.chrom_end) - - distances = [exon.distance(genomic_coord) - for exon in exons] - min_distance_to_exon = min(map(abs, distances)) - - coding_offset = 0 - for exon in exons: - exon_length = exon.chrom_end - exon.chrom_start - distance = exon.distance(genomic_coord) - if abs(distance) == min_distance_to_exon: - if strand == "+": - exon_start_cds_offset = coding_offset + 1 - exon_end_cds_offset = coding_offset + exon_length - else: - exon_start_cds_offset = coding_offset + exon_length - exon_end_cds_offset = coding_offset + 1 - # This is the exon we want to annotate against. - if distance == 0: - # Inside the exon. - if strand == "+": - coord = (exon_start_cds_offset + - (genomic_coord - - (exon.chrom_start + 1))) - else: - coord = (exon_end_cds_offset + - (exon.chrom_end - - genomic_coord)) - cdna_coord = CDNACoord(coord, 0) - else: - # Outside the exon. - if distance > 0: - nearest_exonic = exon_start_cds_offset - else: - nearest_exonic = exon_end_cds_offset - if strand == "+": - distance *= -1 - - # If outside transcript, don't use offset. - if (genomic_coord < transcript.tx_position.chrom_start + 1 or - genomic_coord > transcript.tx_position.chrom_stop): - nearest_exonic += distance - distance = 0 - cdna_coord = CDNACoord(nearest_exonic, distance) - break - coding_offset += exon_length - - # Adjust coordinates for coding transcript. - if transcript.is_coding: - # Detect if position before start codon. - utr5p = get_utr5p_size(transcript) if transcript.is_coding else 0 - cdna_coord.coord -= utr5p - if cdna_coord.coord <= 0: - cdna_coord.coord -= 1 - else: - # Detect if position is after stop_codon. - exons = get_exons(transcript) - stop_codon = find_stop_codon(exons, transcript.cds_position) - stop_codon -= utr5p - if (cdna_coord.coord > stop_codon or - cdna_coord.coord == stop_codon and cdna_coord.offset > 0): - cdna_coord.coord -= stop_codon - cdna_coord.landmark = CDNA_STOP_CODON - - return cdna_coord - - def get_allele(hgvs, genome, transcript=None): """Get an allele from a HGVSName, a genome, and a transcript.""" chrom, start, end = hgvs.get_ref_coords(transcript) @@ -706,549 +84,6 @@ def matches_ref_allele(hgvs, genome, transcript=None): return genome_ref == ref -class InvalidHGVSName(ValueError): - def __init__(self, name='', part='name', reason=''): - if name: - message = 'Invalid HGVS %s "%s"' % (part, name) - else: - message = 'Invalid HGVS %s' % part - if reason: - message += ': ' + reason - super(InvalidHGVSName, self).__init__(message) - - self.name = name - self.part = part - self.reason = reason - - -class HGVSName(object): - """ - Represents a HGVS variant name. - """ - - def __init__(self, name='', prefix='', chrom='', transcript='', gene='', - kind='', mutation_type=None, start=0, end=0, ref_allele='', - ref2_allele='', alt_allele='', - cdna_start=None, cdna_end=None, pep_extra=''): - - # Full HGVS name. - self.name = name - - # Name parts. - self.prefix = prefix - self.chrom = chrom - self.transcript = transcript - self.gene = gene - self.kind = kind - self.mutation_type = mutation_type - self.start = start - self.end = end - self.ref_allele = ref_allele # reference allele - self.ref2_allele = ref2_allele # reference allele at end of pep indel - self.alt_allele = alt_allele # alternate allele - - # cDNA-specific fields - self.cdna_start = cdna_start if cdna_start else CDNACoord() - self.cdna_end = cdna_end if cdna_end else CDNACoord() - - # Protein-specific fields - self.pep_extra = pep_extra - - if name: - self.parse(name) - - def parse(self, name): - """Parse a HGVS name.""" - # Does HGVS name have transcript/gene prefix? - if ':' in name: - prefix, allele = name.split(':', 1) - else: - prefix = '' - allele = name - - self.name = name - - # Parse prefix and allele. - self.parse_allele(allele) - self.parse_prefix(prefix, self.kind) - self._validate() - - def parse_prefix(self, prefix, kind): - """ - Parse a HGVS prefix (gene/transcript/chromosome). - - Some examples of full hgvs names with transcript include: - NM_007294.3:c.2207A>C - NM_007294.3(BRCA1):c.2207A>C - BRCA1{NM_007294.3}:c.2207A>C - """ - - self.prefix = prefix - - # No prefix. - if prefix == '': - self.chrom = '' - self.transcript = '' - self.gene = '' - return - - # Transcript and gene given with parens. - # example: NM_007294.3(BRCA1):c.2207A>C - match = re.match("^(?P[^(]+)\((?P[^)]+)\)$", prefix) - if match: - self.transcript = match.group('transcript') - self.gene = match.group('gene') - return - - # Transcript and gene given with braces. - # example: BRCA1{NM_007294.3}:c.2207A>C - match = re.match("^(?P[^{]+)\{(?P[^}]+)\}$", prefix) - if match: - self.transcript = match.group('transcript') - self.gene = match.group('gene') - return - - # Determine using Ensembl type. - if prefix.startswith('ENST'): - self.transcript = prefix - return - - # Determine using refseq type. - refseq_type = get_refseq_type(prefix) - if refseq_type in ('mRNA', 'RNA'): - self.transcript = prefix - return - - # Determine using refseq type. - if prefix.startswith(CHROM_PREFIX) or refseq_type == 'genomic': - self.chrom = prefix - return - - # Assume gene name. - self.gene = prefix - - def parse_allele(self, allele): - """ - Parse a HGVS allele description. - - Some examples include: - cDNA substitution: c.101A>C, - cDNA indel: c.3428delCinsTA, c.1000_1003delATG, c.1000_1001insATG - No protein change: p.Glu1161= - Protein change: p.Glu1161Ser - Protein frameshift: p.Glu1161_Ser1164?fs - Genomic substitution: g.1000100A>T - Genomic indel: g.1000100_1000102delATG - """ - if '.' not in allele: - raise InvalidHGVSName(allele, 'allele', - 'expected kind "c.", "p.", "g.", etc') - - # Determine HGVS name kind. - kind, details = allele.split('.', 1) - self.kind = kind - self.mutation_type = None - - if kind in ("c", 'n'): - self.parse_cdna(details) - if kind == 'n': # Ensure no 3'UTR or 5'UTR coords in non-coding - if self.cdna_start.coord < 0: - raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain negative (5'UTR) coordinates") - if self.cdna_start.landmark == 'cdna_stop' or self.cdna_end and self.cdna_end.landmark == 'cdna_stop': - raise InvalidHGVSName(allele, 'allele', "Non-coding transcript cannot contain '*' (3'UTR) coordinates") - elif kind == "p": - self.parse_protein(details) - elif kind == "g": - self.parse_genome(details) - else: - raise NotImplementedError("unknown kind: %s" % allele) - - def parse_cdna(self, details): - """ - Parse a HGVS cDNA name. - - Some examples include: - Substitution: 101A>C, - Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG - """ - for regex in HGVSRegex.CDNA_ALLELE_REGEXES: - match = re.match(regex, details) - if match: - groups = match.groupdict() - - # Parse mutation type. - if groups.get('delins'): - self.mutation_type = 'delins' - else: - self.mutation_type = groups['mutation_type'] - - # Parse coordinates. - self.cdna_start = CDNACoord(string=groups.get('start')) - if groups.get('end'): - self.cdna_end = CDNACoord(string=groups.get('end')) - else: - self.cdna_end = CDNACoord(string=groups.get('start')) - - # Parse alleles. - self.ref_allele = groups.get('ref', '') - self.alt_allele = groups.get('alt', '') - - # Convert numerical allelles. - if self.ref_allele.isdigit(): - self.ref_allele = "N" * int(self.ref_allele) - if self.alt_allele.isdigit(): - self.alt_allele = "N" * int(self.alt_allele) - - # Convert duplication alleles. - if self.mutation_type == "dup": - self.alt_allele = self.ref_allele * 2 - - # Convert no match alleles. - if self.mutation_type == "=": - self.alt_allele = self.ref_allele - return - - raise InvalidHGVSName(details, 'cDNA allele') - - def parse_protein(self, details): - """ - Parse a HGVS protein name. - - Some examples include: - No change: Glu1161= - Change: Glu1161Ser - Frameshift: Glu1161_Ser1164?fs - """ - for regex in HGVSRegex.PEP_ALLELE_REGEXES: - match = re.match(regex, details) - if match: - groups = match.groupdict() - - # Parse mutation type. - if groups.get('delins'): - self.mutation_type = 'delins' - else: - self.mutation_type = '>' - - # Parse coordinates. - self.start = int(groups.get('start')) - if groups.get('end'): - self.end = int(groups.get('end')) - else: - self.end = self.start - - # Parse alleles. - self.ref_allele = groups.get('ref', '') - if groups.get('ref2'): - self.ref2_allele = groups.get('ref2') - self.alt_allele = groups.get('alt', '') - else: - # If alt is not given, assume matching with ref - self.ref2_allele = self.ref_allele - self.alt_allele = groups.get( - 'alt', self.ref_allele) - - self.pep_extra = groups.get('extra') - return - - raise InvalidHGVSName(details, 'protein allele') - - def parse_genome(self, details): - """ - Parse a HGVS genomic name. - - Som examples include: - Substitution: 1000100A>T - Indel: 1000100_1000102delATG - """ - - for regex in HGVSRegex.GENOMIC_ALLELE_REGEXES: - match = re.match(regex, details) - if match: - groups = match.groupdict() - - # Parse mutation type. - if groups.get('delins'): - self.mutation_type = 'delins' - else: - self.mutation_type = groups['mutation_type'] - - # Parse coordinates. - self.start = int(groups.get('start')) - if groups.get('end'): - self.end = int(groups.get('end')) - else: - self.end = self.start - - # Parse alleles. - self.ref_allele = groups.get('ref', '') - self.alt_allele = groups.get('alt', '') - - # Convert numerical allelles. - if self.ref_allele.isdigit(): - self.ref_allele = "N" * int(self.ref_allele) - if self.alt_allele.isdigit(): - self.alt_allele = "N" * int(self.alt_allele) - - # Convert duplication alleles. - if self.mutation_type == "dup": - self.alt_allele = self.ref_allele * 2 - - # Convert no match alleles. - if self.mutation_type == "=": - self.alt_allele = self.ref_allele - return - - raise InvalidHGVSName(details, 'genomic allele') - - def _validate(self): - """ - Check for internal inconsistencies in representation - """ - if self.start > self.end: - raise InvalidHGVSName(reason="Coordinates are nonincreasing") - - def __repr__(self): - try: - return "HGVSName('%s')" % self.format() - except NotImplementedError: - return "HGVSName('%s')" % self.name - - def __unicode__(self): - return self.format() - - def format(self, use_prefix=True, use_gene=True, use_counsyl=False): - """Generate a HGVS name as a string.""" - - if self.kind == 'c': - allele = 'c.' + self.format_cdna() - elif self.kind == 'n': - allele = 'n.' + self.format_cdna() - elif self.kind == 'p': - allele = 'p.' + self.format_protein() - elif self.kind == 'g': - allele = 'g.' + self.format_genome() - else: - raise NotImplementedError("not implemented: '%s'" % self.kind) - - prefix = self.format_prefix(use_gene=use_gene) if use_prefix else '' - - if prefix: - return prefix + ':' + allele - else: - return allele - - def format_prefix(self, use_gene=True): - """ - Generate HGVS trancript/gene prefix. - - Some examples of full hgvs names with transcript include: - NM_007294.3:c.2207A>C - NM_007294.3(BRCA1):c.2207A>C - """ - - if self.kind == 'g': - if self.chrom: - return self.chrom - - if self.transcript: - if use_gene and self.gene: - return '%s(%s)' % (self.transcript, self.gene) - else: - return self.transcript - else: - if use_gene: - return self.gene - else: - return '' - - def format_cdna_coords(self): - """ - Generate HGVS cDNA coordinates string. - """ - # Format coordinates. - if self.cdna_start == self.cdna_end: - return str(self.cdna_start) - else: - return "%s_%s" % (self.cdna_start, self.cdna_end) - - def format_dna_allele(self): - """ - Generate HGVS DNA allele. - """ - if self.mutation_type == '=': - # No change. - # example: 101A= - return self.ref_allele + '=' - - if self.mutation_type == '>': - # SNP. - # example: 101A>C - return self.ref_allele + '>' + self.alt_allele - - elif self.mutation_type == 'delins': - # Indel. - # example: 112_117delAGGTCAinsTG, 112_117delinsTG - return 'del' + self.ref_allele + 'ins' + self.alt_allele - - elif self.mutation_type in ('del', 'dup'): - # Delete, duplication. - # example: 1000_1003delATG, 1000_1003dupATG - return self.mutation_type + self.ref_allele - - elif self.mutation_type == 'ins': - # Insert. - # example: 1000_1001insATG - return self.mutation_type + self.alt_allele - - elif self.mutation_type == 'inv': - return self.mutation_type - - else: - raise AssertionError( - "unknown mutation type: '%s'" % self.mutation_type) - - def format_cdna(self): - """ - Generate HGVS cDNA allele. - - Some examples include: - Substitution: 101A>C, - Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG - """ - return self.format_cdna_coords() + self.format_dna_allele() - - def format_protein(self): - """ - Generate HGVS protein name. - - Some examples include: - No change: Glu1161= - Change: Glu1161Ser - Frameshift: Glu1161_Ser1164?fs - """ - if (self.start == self.end and - self.ref_allele == self.ref2_allele == self.alt_allele): - # Match. - # Example: Glu1161= - pep_extra = self.pep_extra if self.pep_extra else '=' - return self.ref_allele + str(self.start) + pep_extra - - elif (self.start == self.end and - self.ref_allele == self.ref2_allele and - self.ref_allele != self.alt_allele): - # Change. - # Example: Glu1161Ser - return (self.ref_allele + str(self.start) + - self.alt_allele + self.pep_extra) - - elif self.start != self.end: - # Range change. - # Example: Glu1161_Ser1164?fs - return (self.ref_allele + str(self.start) + '_' + - self.ref2_allele + str(self.end) + - self.pep_extra) - - else: - raise NotImplementedError('protein name formatting.') - - def format_coords(self): - """ - Generate HGVS cDNA coordinates string. - """ - # Format coordinates. - if self.start == self.end: - return str(self.start) - else: - return "%s_%s" % (self.start, self.end) - - def format_genome(self): - """ - Generate HGVS genomic allele. - - Som examples include: - Substitution: 1000100A>T - Indel: 1000100_1000102delATG - """ - return self.format_coords() + self.format_dna_allele() - - def get_raw_coords(self, transcript=None): - """ return genomic coordinates """ - if self.kind in ('c', 'n'): - chrom = transcript.tx_position.chrom - start = cdna_to_genomic_coord(transcript, self.cdna_start) - end = cdna_to_genomic_coord(transcript, self.cdna_end) - - if not transcript.tx_position.is_forward_strand: - if end > start: - raise AssertionError( - "cdna_start cannot be greater than cdna_end") - start, end = end, start - - if start > end: - raise AssertionError( - "cdna_start cannot be greater than cdna_end") - elif self.kind == 'g': - chrom = self.chrom - start = self.start - end = self.end - else: - raise NotImplementedError( - 'Coordinates are not available for this kind of HGVS name "%s"' - % self.kind) - - return chrom, start, end - - def get_ref_coords(self, transcript=None): - """Return genomic coordinates of reference allele.""" - - chrom, start, end = self.get_raw_coords(transcript) - - if self.mutation_type == "ins": - # Inserts have empty interval. - if start < end: - start += 1 - end -= 1 - else: - end = start - 1 - - elif self.mutation_type == "dup": - end = start - 1 - return chrom, start, end - - def get_vcf_coords(self, transcript=None): - """Return genomic coordinates of reference allele in VCF-style.""" - chrom, start, end = self.get_ref_coords(transcript) - - # Inserts and deletes require left-padding by 1 base - if self.mutation_type in ("=", ">"): - pass - elif self.mutation_type in ("del", "ins", "dup", "delins"): - # Indels have left-padding. - start -= 1 - else: - raise NotImplementedError("Unknown mutation_type '%s'" % - self.mutation_type) - return chrom, start, end - - def get_ref_alt(self, is_forward_strand=True): - """Return reference and alternate alleles.""" - if self.kind == 'p': - raise NotImplementedError( - 'get_ref_alt is not implemented for protein HGVS names') - alleles = [self.ref_allele, self.alt_allele] - - # Represent duplications are inserts. - if self.mutation_type == "dup": - alleles[0] = "" - alleles[1] = alleles[1][:len(alleles[1]) // 2] - - if is_forward_strand: - return alleles - else: - return tuple(map(revcomp, alleles)) - - def hgvs_justify_dup(chrom, offset, ref, alt, genome): """ Determines if allele is a duplication and justifies. @@ -1467,13 +302,13 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript, if mutation_type == '>' or (use_counsyl and is_single_base_indel): # Use a single coordinate. - hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset) + hgvs.cdna_start = transcript.genomic_to_cdna_coord(offset) hgvs.cdna_end = hgvs.cdna_start else: if transcript.strand == '-': offset_start, offset_end = offset_end, offset_start - hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset_start) - hgvs.cdna_end = genomic_to_cdna_coord(transcript, offset_end) + hgvs.cdna_start = transcript.genomic_to_cdna_coord(offset_start) + hgvs.cdna_end = transcript.genomic_to_cdna_coord(offset_end) # Populate prefix. if transcript: diff --git a/pyhgvs/models.py b/pyhgvs/models.py deleted file mode 100644 index 656e67b..0000000 --- a/pyhgvs/models.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Models for representing genomic elements. -""" - -from __future__ import unicode_literals - -from collections import namedtuple - - -class Position(object): - """A position in the genome.""" - - def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand): - self.chrom = chrom - self.chrom_start = chrom_start - self.chrom_stop = chrom_stop - self.is_forward_strand = is_forward_strand - - def __repr__(self): - return "" % ( - self.chrom, self.chrom_start, self.chrom_stop) - - -class Gene(object): - def __init__(self, name): - self.name = name - - -class Transcript(object): - """ - - A gene may have multiple transcripts with different combinations of exons. - """ - - def __init__(self, name, version, gene, tx_position, cds_position, - is_default=False, exons=None): - self.name = name - self.version = version - self.gene = Gene(gene) - self.tx_position = tx_position - self.cds_position = cds_position - self.is_default = is_default - self.exons = exons if exons else [] - - @property - def full_name(self): - if self.version is not None: - return '%s.%d' % (self.name, self.version) - else: - return self.name - - @property - def is_coding(self): - # Coding transcripts have CDS with non-zero length. - return (self.cds_position.chrom_stop - - self.cds_position.chrom_start > 0) - - @property - def strand(self): - return ('+' if self.tx_position.is_forward_strand else '-') - - @property - def coding_exons(self): - return [exon.get_as_interval(coding_only=True) - for exon in self.exons] - - -BED6Interval_base = namedtuple( - "BED6Interval_base", ( - "chrom", - "chrom_start", - "chrom_end", - "name", - "score", - "strand")) - - -class BED6Interval(BED6Interval_base): - def distance(self, offset): - """Return the distance to the interval. - - if offset is inside the exon, distance is zero. - otherwise, distance is the distance to the nearest edge. - - distance is positive if the exon comes after the offset. - distance is negative if the exon comes before the offset. - """ - - start = self.chrom_start + 1 - end = self.chrom_end - - if start <= offset <= end: - return 0 - - start_distance = start - offset - end_distance = offset - end - - if abs(start_distance) < abs(end_distance): - return start_distance - else: - return -end_distance - - -class Exon(object): - def __init__(self, transcript, tx_position, exon_number): - self.transcript = transcript - self.tx_position = tx_position - self.exon_number = exon_number - - @property - def get_exon_name(self): - return "%s.%d" % (self.transcript.name, self.exon_number) - - def get_as_interval(self, coding_only=False): - """Returns the coding region for this exon as a BED6Interval. - - This function returns a BED6Interval objects containing position - information for this exon. This may be used as input for - pybedtools.create_interval_from_list() after casting chrom_start - and chrom_end as strings. - - coding_only: only include exons in the coding region - - """ - - exon_start = self.tx_position.chrom_start - exon_stop = self.tx_position.chrom_stop - - # Get only exon coding region if requested - if coding_only: - if (exon_stop <= self.transcript.cds_position.chrom_start or - exon_start >= self.transcript.cds_position.chrom_stop): - return None - exon_start = max(exon_start, - self.transcript.cds_position.chrom_start) - exon_stop = min( - max(exon_stop, self.transcript.cds_position.chrom_start), - self.transcript.cds_position.chrom_stop) - - return BED6Interval( - self.tx_position.chrom, - exon_start, - exon_stop, - self.get_exon_name, - '.', - self.strand, - ) - - @property - def strand(self): - strand = '+' if self.tx_position.is_forward_strand else '-' - return strand diff --git a/pyhgvs/models/__init__.py b/pyhgvs/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyhgvs/models/cdna.py b/pyhgvs/models/cdna.py new file mode 100644 index 0000000..bc1280f --- /dev/null +++ b/pyhgvs/models/cdna.py @@ -0,0 +1,123 @@ +import re + +CDNA_START_CODON = 'cdna_start' +CDNA_STOP_CODON = 'cdna_stop' + + +class CDNACoord(object): + """ + A HGVS cDNA-based coordinate. + + A cDNA coordinate can take one of these forms: + + N = nucleotide N in protein coding sequence (e.g. 11A>G) + + -N = nucleotide N 5' of the ATG translation initiation codon (e.g. -4A>G) + NOTE: so located in the 5'UTR or 5' of the transcription initiation + site (upstream of the gene, incl. promoter) + + *N = nucleotide N 3' of the translation stop codon (e.g. *6A>G) + NOTE: so located in the 3'UTR or 3' of the polyA-addition site + (including downstream of the gene) + + N+M = nucleotide M in the intron after (3' of) position N in the coding DNA + reference sequence (e.g. 30+4A>G) + + N-M = nucleotide M in the intron before (5' of) position N in the coding + DNA reference sequence (e.g. 301-2A>G) + + -N+M / -N-M = nucleotide in an intron in the 5'UTR (e.g. -45+4A>G) + + *N+M / *N-M = nucleotide in an intron in the 3'UTR (e.g. *212-2A>G) + """ + + def __init__(self, coord=0, offset=0, landmark=CDNA_START_CODON, + string=''): + """ + coord: main coordinate along cDNA on the same strand as the transcript + + offset: an additional genomic offset from the main coordinate. This + allows referencing non-coding (e.g. intronic) positions. + Offset is also interpreted on the coding strand. + + landmark: ('cdna_start', 'cdna_stop') indicating that 'coord' + is relative to one of these landmarks. + + string: a coordinate from an HGVS name. If given coord, offset, and + landmark should not be specified. + """ + + if string: + if coord != 0 or offset != 0 or landmark != CDNA_START_CODON: + raise ValueError("coord, offset, and landmark should not " + "be given with string argument") + + self.parse(string) + else: + self.coord = coord + self.offset = offset + self.landmark = landmark + + def parse(self, coord_text): + """ + Parse a HGVS formatted cDNA coordinate. + """ + + match = re.match(r"(|-|\*)(\d+)((-|\+)(\d+))?", coord_text) + if not match: + raise ValueError("unknown coordinate format '%s'" % coord_text) + coord_prefix, coord, _, offset_prefix, offset = match.groups() + + self.coord = int(coord) + self.offset = int(offset) if offset else 0 + + if offset_prefix == '-': + self.offset *= -1 + elif offset_prefix == '+' or offset is None: + pass + else: + raise ValueError("unknown offset_prefix '%s'" % offset_prefix) + + if coord_prefix == '': + self.landmark = CDNA_START_CODON + elif coord_prefix == "-": + self.coord *= -1 + self.landmark = CDNA_START_CODON + elif coord_prefix == '*': + self.landmark = CDNA_STOP_CODON + else: + raise ValueError("unknown coord_prefix '%s'" % coord_prefix) + return self + + def __str__(self): + """ + Return a formatted cDNA coordinate + """ + if self.landmark == CDNA_STOP_CODON: + coord_prefix = '*' + else: + coord_prefix = '' + + if self.offset < 0: + offset = '%d' % self.offset + elif self.offset > 0: + offset = '+%d' % self.offset + else: + offset = '' + + return '%s%d%s' % (coord_prefix, self.coord, offset) + + def __eq__(self, other): + """Equality operator.""" + return ((self.coord, self.offset, self.landmark) == + (other.coord, other.offset, other.landmark)) + + def __repr__(self): + """ + Returns a string representation of a cDNA coordinate. + """ + if self.landmark != CDNA_START_CODON: + return "CDNACoord(%d, %d, '%s')" % ( + self.coord, self.offset, self.landmark) + else: + return "CDNACoord(%d, %d)" % (self.coord, self.offset) diff --git a/pyhgvs/models/genome.py b/pyhgvs/models/genome.py new file mode 100644 index 0000000..4cfc87d --- /dev/null +++ b/pyhgvs/models/genome.py @@ -0,0 +1,47 @@ +class ChromosomeSubset(object): + """ + Allow direct access to a subset of the chromosome. + """ + + def __init__(self, name, genome=None): + self.name = name + self.genome = genome + + def __getitem__(self, key): + """Return sequence from region [start, end) + + Coordinates are 0-based, end-exclusive.""" + if isinstance(key, slice): + start, end = (key.start, key.stop) + start -= self.genome.start + end -= self.genome.start + return self.genome.genome[self.genome.seqid][start:end] + else: + raise TypeError('Expected a slice object but ' + 'received a {0}.'.format(type(key))) + + def __repr__(self): + return 'ChromosomeSubset("%s")' % self.name + + +class GenomeSubset(object): + """ + Allow the direct access of a subset of the genome. + """ + + def __init__(self, genome, chrom, start, end, seqid): + self.genome = genome + self.chrom = chrom + self.start = start + self.end = end + self.seqid = seqid + self._chroms = {} + + def __getitem__(self, chrom): + """Return a chromosome by its name.""" + if chrom in self._chroms: + return self._chroms[chrom] + else: + chromosome = ChromosomeSubset(chrom, self) + self._chroms[chrom] = chromosome + return chromosome \ No newline at end of file diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py new file mode 100644 index 0000000..efe9471 --- /dev/null +++ b/pyhgvs/models/hgvs_name.py @@ -0,0 +1,801 @@ +""" +HGVS language currently implemented. + +HGVS = ALLELE + | PREFIX_NAME : ALLELE + +PREFIX_NAME = TRANSCRIPT + | TRANSCRIPT '(' GENE ')' + +TRANSCRIPT = TRANSCRIPT_NAME + | TRANSCRIPT_NAME '.' TRANSCRIPT_VERSION + +TRANSCRIPT_VERSION = NUMBER + +ALLELE = 'c.' CDNA_ALLELE # cDNA + | 'g.' GENOMIC_ALLELE # genomic + | 'm.' MIT_ALLELE # mitochondrial sequence + | 'n.' NC_ALLELE # non-coding RNA reference sequence + | 'r.' RNA_ALLELE # RNA sequence (like r.76a>u) + | 'p.' PROTEIN_ALLELE # protein sequence (like p.Lys76Asn) + +NC_ALLELE = +RNA_ALLELE = +CDNA_ALLELE = CDNA_COORD SINGLE_BASE_CHANGE + | CDNA_COORD_RANGE MULTI_BASE_CHANGE + +GENOMIC_ALLELE = +MIT_ALLELE = COORD SINGLE_BASE_CHANGE + | COORD_RANGE MULTI_BASE_CHANGE + +SINGLE_BASE_CHANGE = CDNA_ALLELE = CDNA_COORD BASE '=' # no change + | CDNA_COORD BASE '>' BASE # substitution + | CDNA_COORD 'ins' BASE # 1bp insertion + | CDNA_COORD 'del' BASE # 1bp deletion + | CDNA_COORD 'dup' BASE # 1bp duplication + | CDNA_COORD 'ins' # 1bp insertion + | CDNA_COORD 'del' # 1bp deletion + | CDNA_COORD 'dup' # 1bp duplication + | CDNA_COORD 'del' BASE 'ins' BASE # 1bp indel + | CDNA_COORD 'delins' BASE # 1bp indel + +MULTI_BASE_CHANGE = COORD_RANGE 'del' BASES # deletion + | COORD_RANGE 'ins' BASES # insertion + | COORD_RANGE 'dup' BASES # duplication + | COORD_RANGE 'del' # deletion + | COORD_RANGE 'dup' # duplication + | COORD_RANGE 'del' BASES 'ins' BASES # indel + | COORD_RANGE 'delins' BASES # indel + + +AMINO1 = [GAVLIMFWPSTCYNQDEKRH] + +AMINO3 = 'Gly' | 'Ala' | 'Val' | 'Leu' | 'Ile' | 'Met' | 'Phe' | 'Trp' | 'Pro' + | 'Ser' | 'Thr' | 'Cys' | 'Tyr' | 'Asn' | 'Gln' | 'Asp' | 'Glu' | 'Lys' + | 'Arg' | 'His' + +PROTEIN_ALLELE = AMINO3 COORD '=' # no peptide change + | AMINO1 COORD '=' # no peptide change + | AMINO3 COORD AMINO3 PEP_EXTRA # peptide change + | AMINO1 COORD AMINO1 PEP_EXTRA # peptide change + | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA # indel + | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA # indel + | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA AMINO3 # indel + | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA AMINO1 # indel + +# A genomic range: +COORD_RANGE = COORD '_' COORD + +# A cDNA range: +CDNA_COORD_RANGE = CDNA_COORD '_' CDNA_COORD + +# A cDNA coordinate: +CDNA_COORD = COORD_PREFIX COORD + | COORD_PREFIX COORD OFFSET_PREFIX OFFSET +COORD_PREFIX = '' | '-' | '*' +COORD = NUMBER +OFFSET_PREFIX = '-' | '+' +OFFSET = NUMBER + +# Primatives: +NUMBER = \d+ +BASE = [ACGT] +BASES = BASE+ + +""" + +import re + +from .cdna import CDNACoord +from .variants import revcomp + +CHROM_PREFIX = 'chr' + + +class HGVSRegex(object): + """ + All regular expression for HGVS names. + """ + + # DNA syntax + # http://www.hgvs.org/mutnomen/standards.html#nucleotide + BASE = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]|\d+" + BASES = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]+|\d+" + DNA_REF = "(?P" + BASES + ")" + DNA_ALT = "(?P" + BASES + ")" + + # Mutation types + EQUAL = "(?P=)" + SUB = "(?P>)" + INS = "(?Pins)" + DEL = "(?Pdel)" + DUP = "(?Pdup)" + + # Simple coordinate syntax + COORD_START = "(?P\d+)" + COORD_END = "(?P\d+)" + COORD_RANGE = COORD_START + "_" + COORD_END + + # cDNA coordinate syntax + CDNA_COORD = ("(?P|-|\*)(?P\d+)" + "((?P-|\+)(?P\d+))?") + CDNA_START = ("(?P(?P|-|\*)(?P\d+)" + "((?P-|\+)(?P\d+))?)") + CDNA_END = (r"(?P(?P|-|\*)(?P\d+)" + "((?P-|\+)(?P\d+))?)") + CDNA_RANGE = CDNA_START + "_" + CDNA_END + + # cDNA allele syntax + CDNA_ALLELE = [ + # No change + CDNA_START + DNA_REF + EQUAL, + + # Substitution + CDNA_START + DNA_REF + SUB + DNA_ALT, + + # 1bp insertion, deletion, duplication + CDNA_START + INS + DNA_ALT, + CDNA_START + DEL + DNA_REF, + CDNA_START + DUP + DNA_REF, + CDNA_START + DEL, + CDNA_START + DUP, + + # Insertion, deletion, duplication + CDNA_RANGE + INS + DNA_ALT, + CDNA_RANGE + DEL + DNA_REF, + CDNA_RANGE + DUP + DNA_REF, + CDNA_RANGE + DEL, + CDNA_RANGE + DUP, + + # Indels + "(?P" + CDNA_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + CDNA_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + CDNA_START + 'delins' + DNA_ALT + ")", + "(?P" + CDNA_RANGE + 'delins' + DNA_ALT + ")", + ] + + CDNA_ALLELE_REGEXES = [re.compile("^" + regex + "$") + for regex in CDNA_ALLELE] + + # Peptide syntax + PEP = "([A-Z]([a-z]{2}))+" + PEP_REF = "(?P" + PEP + ")" + PEP_REF2 = "(?P" + PEP + ")" + PEP_ALT = "(?P" + PEP + ")" + + PEP_EXTRA = "(?P(|=|\?)(|fs))" + + # Peptide allele syntax + PEP_ALLELE = [ + # No peptide change + # Example: Glu1161= + PEP_REF + COORD_START + PEP_EXTRA, + + # Peptide change + # Example: Glu1161Ser + PEP_REF + COORD_START + PEP_ALT + PEP_EXTRA, + + # Peptide indel + # Example: Glu1161_Ser1164?fs + "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + + PEP_EXTRA + ")", + "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + + PEP_ALT + PEP_EXTRA + ")", + ] + + PEP_ALLELE_REGEXES = [re.compile("^" + regex + "$") + for regex in PEP_ALLELE] + + # Genomic allele syntax + GENOMIC_ALLELE = [ + # No change + COORD_START + DNA_REF + EQUAL, + + # Substitution + COORD_START + DNA_REF + SUB + DNA_ALT, + + # 1bp insertion, deletion, duplication + COORD_START + INS + DNA_ALT, + COORD_START + DEL + DNA_REF, + COORD_START + DUP + DNA_REF, + COORD_START + DEL, + COORD_START + DUP, + + # Insertion, deletion, duplication + COORD_RANGE + INS + DNA_ALT, + COORD_RANGE + DEL + DNA_REF, + COORD_RANGE + DUP + DNA_REF, + COORD_RANGE + DEL, + COORD_RANGE + DUP, + + # Indels + "(?P" + COORD_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + COORD_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + COORD_START + 'delins' + DNA_ALT + ")", + "(?P" + COORD_RANGE + 'delins' + DNA_ALT + ")", + ] + + GENOMIC_ALLELE_REGEXES = [re.compile("^" + regex + "$") + for regex in GENOMIC_ALLELE] + + +# The RefSeq standard for naming contigs/transcripts/proteins: +# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8 +REFSEQ_PREFIXES = [ + ('AC_', 'genomic', + 'Complete genomic molecule, usually alternate assembly'), + ('NC_', 'genomic', + 'Complete genomic molecule, usually reference assembly'), + ('NG_', 'genomic', 'Incomplete genomic region'), + ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'), + ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'), + ('NS_', 'genomic', 'Environmental sequence'), + ('NZ_', 'genomic', 'Unfinished WGS'), + ('NM_', 'mRNA', ''), + ('NR_', 'RNA', ''), + ('XM_', 'mRNA', 'Predicted model'), + ('XR_', 'RNA', 'Predicted model'), + ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'), + ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'), + ('YP_', 'Protein', ''), + ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'), + ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'), +] + + +REFSEQ_PREFIX_LOOKUP = dict( + (prefix, (kind, description)) + for prefix, kind, description in REFSEQ_PREFIXES +) + + +def get_refseq_type(name): + """ + Return the RefSeq type for a refseq name. + """ + prefix = name[:3] + return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0] + +class InvalidHGVSName(ValueError): + def __init__(self, name='', part='name', reason=''): + if name: + message = 'Invalid HGVS %s "%s"' % (part, name) + else: + message = 'Invalid HGVS %s' % part + if reason: + message += ': ' + reason + super(InvalidHGVSName, self).__init__(message) + + self.name = name + self.part = part + self.reason = reason + + +class HGVSName(object): + """ + Represents a HGVS variant name. + """ + + def __init__(self, name='', prefix='', chrom='', transcript='', gene='', + kind='', mutation_type=None, start=0, end=0, ref_allele='', + ref2_allele='', alt_allele='', + cdna_start=None, cdna_end=None, pep_extra=''): + + # Full HGVS name. + self.name = name + + # Name parts. + self.prefix = prefix + self.chrom = chrom + self.transcript = transcript + self.gene = gene + self.kind = kind + self.mutation_type = mutation_type + self.start = start + self.end = end + self.ref_allele = ref_allele # reference allele + self.ref2_allele = ref2_allele # reference allele at end of pep indel + self.alt_allele = alt_allele # alternate allele + + # cDNA-specific fields + self.cdna_start = cdna_start if cdna_start else CDNACoord() + self.cdna_end = cdna_end if cdna_end else CDNACoord() + + # Protein-specific fields + self.pep_extra = pep_extra + + if name: + self.parse(name) + + def parse(self, name): + """Parse a HGVS name.""" + # Does HGVS name have transcript/gene prefix? + if ':' in name: + prefix, allele = name.split(':', 1) + else: + prefix = '' + allele = name + + self.name = name + + # Parse prefix and allele. + self.parse_allele(allele) + self.parse_prefix(prefix, self.kind) + self._validate() + + def parse_prefix(self, prefix, kind): + """ + Parse a HGVS prefix (gene/transcript/chromosome). + + Some examples of full hgvs names with transcript include: + NM_007294.3:c.2207A>C + NM_007294.3(BRCA1):c.2207A>C + BRCA1{NM_007294.3}:c.2207A>C + """ + + self.prefix = prefix + + # No prefix. + if prefix == '': + self.chrom = '' + self.transcript = '' + self.gene = '' + return + + # Transcript and gene given with parens. + # example: NM_007294.3(BRCA1):c.2207A>C + match = re.match("^(?P[^(]+)\((?P[^)]+)\)$", prefix) + if match: + self.transcript = match.group('transcript') + self.gene = match.group('gene') + return + + # Transcript and gene given with braces. + # example: BRCA1{NM_007294.3}:c.2207A>C + match = re.match("^(?P[^{]+)\{(?P[^}]+)\}$", prefix) + if match: + self.transcript = match.group('transcript') + self.gene = match.group('gene') + return + + # Determine using Ensembl type. + if prefix.startswith('ENST'): + self.transcript = prefix + return + + # Determine using refseq type. + refseq_type = get_refseq_type(prefix) + if refseq_type in ('mRNA', 'RNA'): + self.transcript = prefix + return + + # Determine using refseq type. + if prefix.startswith(CHROM_PREFIX) or refseq_type == 'genomic': + self.chrom = prefix + return + + # Assume gene name. + self.gene = prefix + + def parse_allele(self, allele): + """ + Parse a HGVS allele description. + + Some examples include: + cDNA substitution: c.101A>C, + cDNA indel: c.3428delCinsTA, c.1000_1003delATG, c.1000_1001insATG + No protein change: p.Glu1161= + Protein change: p.Glu1161Ser + Protein frameshift: p.Glu1161_Ser1164?fs + Genomic substitution: g.1000100A>T + Genomic indel: g.1000100_1000102delATG + """ + if '.' not in allele: + raise InvalidHGVSName(allele, 'allele', + 'expected kind "c.", "p.", "g.", etc') + + # Determine HGVS name kind. + kind, details = allele.split('.', 1) + self.kind = kind + self.mutation_type = None + + if kind in ("c", 'n'): + self.parse_cdna(details) + if kind == 'n': # Ensure no 3'UTR or 5'UTR coords in non-coding + if self.cdna_start.coord < 0: + raise InvalidHGVSName(allele, 'allele', + "Non-coding transcript cannot contain negative (5'UTR) coordinates") + if self.cdna_start.landmark == 'cdna_stop' or self.cdna_end and self.cdna_end.landmark == 'cdna_stop': + raise InvalidHGVSName(allele, 'allele', + "Non-coding transcript cannot contain '*' (3'UTR) coordinates") + elif kind == "p": + self.parse_protein(details) + elif kind == "g": + self.parse_genome(details) + else: + raise NotImplementedError("unknown kind: %s" % allele) + + def parse_cdna(self, details): + """ + Parse a HGVS cDNA name. + + Some examples include: + Substitution: 101A>C, + Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG + """ + for regex in HGVSRegex.CDNA_ALLELE_REGEXES: + match = re.match(regex, details) + if match: + groups = match.groupdict() + + # Parse mutation type. + if groups.get('delins'): + self.mutation_type = 'delins' + else: + self.mutation_type = groups['mutation_type'] + + # Parse coordinates. + self.cdna_start = CDNACoord(string=groups.get('start')) + if groups.get('end'): + self.cdna_end = CDNACoord(string=groups.get('end')) + else: + self.cdna_end = CDNACoord(string=groups.get('start')) + + # Parse alleles. + self.ref_allele = groups.get('ref', '') + self.alt_allele = groups.get('alt', '') + + # Convert numerical allelles. + if self.ref_allele.isdigit(): + self.ref_allele = "N" * int(self.ref_allele) + if self.alt_allele.isdigit(): + self.alt_allele = "N" * int(self.alt_allele) + + # Convert duplication alleles. + if self.mutation_type == "dup": + self.alt_allele = self.ref_allele * 2 + + # Convert no match alleles. + if self.mutation_type == "=": + self.alt_allele = self.ref_allele + return + + raise InvalidHGVSName(details, 'cDNA allele') + + def parse_protein(self, details): + """ + Parse a HGVS protein name. + + Some examples include: + No change: Glu1161= + Change: Glu1161Ser + Frameshift: Glu1161_Ser1164?fs + """ + for regex in HGVSRegex.PEP_ALLELE_REGEXES: + match = re.match(regex, details) + if match: + groups = match.groupdict() + + # Parse mutation type. + if groups.get('delins'): + self.mutation_type = 'delins' + else: + self.mutation_type = '>' + + # Parse coordinates. + self.start = int(groups.get('start')) + if groups.get('end'): + self.end = int(groups.get('end')) + else: + self.end = self.start + + # Parse alleles. + self.ref_allele = groups.get('ref', '') + if groups.get('ref2'): + self.ref2_allele = groups.get('ref2') + self.alt_allele = groups.get('alt', '') + else: + # If alt is not given, assume matching with ref + self.ref2_allele = self.ref_allele + self.alt_allele = groups.get( + 'alt', self.ref_allele) + + self.pep_extra = groups.get('extra') + return + + raise InvalidHGVSName(details, 'protein allele') + + def parse_genome(self, details): + """ + Parse a HGVS genomic name. + + Som examples include: + Substitution: 1000100A>T + Indel: 1000100_1000102delATG + """ + + for regex in HGVSRegex.GENOMIC_ALLELE_REGEXES: + match = re.match(regex, details) + if match: + groups = match.groupdict() + + # Parse mutation type. + if groups.get('delins'): + self.mutation_type = 'delins' + else: + self.mutation_type = groups['mutation_type'] + + # Parse coordinates. + self.start = int(groups.get('start')) + if groups.get('end'): + self.end = int(groups.get('end')) + else: + self.end = self.start + + # Parse alleles. + self.ref_allele = groups.get('ref', '') + self.alt_allele = groups.get('alt', '') + + # Convert numerical allelles. + if self.ref_allele.isdigit(): + self.ref_allele = "N" * int(self.ref_allele) + if self.alt_allele.isdigit(): + self.alt_allele = "N" * int(self.alt_allele) + + # Convert duplication alleles. + if self.mutation_type == "dup": + self.alt_allele = self.ref_allele * 2 + + # Convert no match alleles. + if self.mutation_type == "=": + self.alt_allele = self.ref_allele + return + + raise InvalidHGVSName(details, 'genomic allele') + + def _validate(self): + """ + Check for internal inconsistencies in representation + """ + if self.start > self.end: + raise InvalidHGVSName(reason="Coordinates are nonincreasing") + + def __repr__(self): + try: + return "HGVSName('%s')" % self.format() + except NotImplementedError: + return "HGVSName('%s')" % self.name + + def __unicode__(self): + return self.format() + + def format(self, use_prefix=True, use_gene=True, use_counsyl=False): + """Generate a HGVS name as a string.""" + + if self.kind == 'c': + allele = 'c.' + self.format_cdna() + elif self.kind == 'n': + allele = 'n.' + self.format_cdna() + elif self.kind == 'p': + allele = 'p.' + self.format_protein() + elif self.kind == 'g': + allele = 'g.' + self.format_genome() + else: + raise NotImplementedError("not implemented: '%s'" % self.kind) + + prefix = self.format_prefix(use_gene=use_gene) if use_prefix else '' + + if prefix: + return prefix + ':' + allele + else: + return allele + + def format_prefix(self, use_gene=True): + """ + Generate HGVS trancript/gene prefix. + + Some examples of full hgvs names with transcript include: + NM_007294.3:c.2207A>C + NM_007294.3(BRCA1):c.2207A>C + """ + + if self.kind == 'g': + if self.chrom: + return self.chrom + + if self.transcript: + if use_gene and self.gene: + return '%s(%s)' % (self.transcript, self.gene) + else: + return self.transcript + else: + if use_gene: + return self.gene + else: + return '' + + def format_cdna_coords(self): + """ + Generate HGVS cDNA coordinates string. + """ + # Format coordinates. + if self.cdna_start == self.cdna_end: + return str(self.cdna_start) + else: + return "%s_%s" % (self.cdna_start, self.cdna_end) + + def format_dna_allele(self): + """ + Generate HGVS DNA allele. + """ + if self.mutation_type == '=': + # No change. + # example: 101A= + return self.ref_allele + '=' + + if self.mutation_type == '>': + # SNP. + # example: 101A>C + return self.ref_allele + '>' + self.alt_allele + + elif self.mutation_type == 'delins': + # Indel. + # example: 112_117delAGGTCAinsTG, 112_117delinsTG + return 'del' + self.ref_allele + 'ins' + self.alt_allele + + elif self.mutation_type in ('del', 'dup'): + # Delete, duplication. + # example: 1000_1003delATG, 1000_1003dupATG + return self.mutation_type + self.ref_allele + + elif self.mutation_type == 'ins': + # Insert. + # example: 1000_1001insATG + return self.mutation_type + self.alt_allele + + elif self.mutation_type == 'inv': + return self.mutation_type + + else: + raise AssertionError( + "unknown mutation type: '%s'" % self.mutation_type) + + def format_cdna(self): + """ + Generate HGVS cDNA allele. + + Some examples include: + Substitution: 101A>C, + Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG + """ + return self.format_cdna_coords() + self.format_dna_allele() + + def format_protein(self): + """ + Generate HGVS protein name. + + Some examples include: + No change: Glu1161= + Change: Glu1161Ser + Frameshift: Glu1161_Ser1164?fs + """ + if (self.start == self.end and + self.ref_allele == self.ref2_allele == self.alt_allele): + # Match. + # Example: Glu1161= + pep_extra = self.pep_extra if self.pep_extra else '=' + return self.ref_allele + str(self.start) + pep_extra + + elif (self.start == self.end and + self.ref_allele == self.ref2_allele and + self.ref_allele != self.alt_allele): + # Change. + # Example: Glu1161Ser + return (self.ref_allele + str(self.start) + + self.alt_allele + self.pep_extra) + + elif self.start != self.end: + # Range change. + # Example: Glu1161_Ser1164?fs + return (self.ref_allele + str(self.start) + '_' + + self.ref2_allele + str(self.end) + + self.pep_extra) + + else: + raise NotImplementedError('protein name formatting.') + + def format_coords(self): + """ + Generate HGVS cDNA coordinates string. + """ + # Format coordinates. + if self.start == self.end: + return str(self.start) + else: + return "%s_%s" % (self.start, self.end) + + def format_genome(self): + """ + Generate HGVS genomic allele. + + Som examples include: + Substitution: 1000100A>T + Indel: 1000100_1000102delATG + """ + return self.format_coords() + self.format_dna_allele() + + def get_raw_coords(self, transcript=None): + """ return genomic coordinates """ + if self.kind in ('c', 'n'): + chrom = transcript.tx_position.chrom + start = transcript.cdna_to_genomic_coord(self.cdna_start) + end = transcript.cdna_to_genomic_coord(self.cdna_end) + + if not transcript.tx_position.is_forward_strand: + if end > start: + raise AssertionError( + "cdna_start cannot be greater than cdna_end") + start, end = end, start + + if start > end: + raise AssertionError( + "cdna_start cannot be greater than cdna_end") + elif self.kind == 'g': + chrom = self.chrom + start = self.start + end = self.end + else: + raise NotImplementedError( + 'Coordinates are not available for this kind of HGVS name "%s"' + % self.kind) + + return chrom, start, end + + def get_ref_coords(self, transcript=None): + """Return genomic coordinates of reference allele.""" + + chrom, start, end = self.get_raw_coords(transcript) + + if self.mutation_type == "ins": + # Inserts have empty interval. + if start < end: + start += 1 + end -= 1 + else: + end = start - 1 + + elif self.mutation_type == "dup": + end = start - 1 + return chrom, start, end + + def get_vcf_coords(self, transcript=None): + """Return genomic coordinates of reference allele in VCF-style.""" + chrom, start, end = self.get_ref_coords(transcript) + + # Inserts and deletes require left-padding by 1 base + if self.mutation_type in ("=", ">"): + pass + elif self.mutation_type in ("del", "ins", "dup", "delins"): + # Indels have left-padding. + start -= 1 + else: + raise NotImplementedError("Unknown mutation_type '%s'" % + self.mutation_type) + return chrom, start, end + + def get_ref_alt(self, is_forward_strand=True): + """Return reference and alternate alleles.""" + if self.kind == 'p': + raise NotImplementedError( + 'get_ref_alt is not implemented for protein HGVS names') + alleles = [self.ref_allele, self.alt_allele] + + # Represent duplications are inserts. + if self.mutation_type == "dup": + alleles[0] = "" + alleles[1] = alleles[1][:len(alleles[1]) // 2] + + if is_forward_strand: + return alleles + else: + return tuple(map(revcomp, alleles)) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py new file mode 100644 index 0000000..e707375 --- /dev/null +++ b/pyhgvs/models/transcript.py @@ -0,0 +1,335 @@ +""" +Models for representing genomic elements. +""" + +from __future__ import unicode_literals + +from collections import namedtuple + +from lazy import lazy + +from pyhgvs.models.cdna import CDNA_START_CODON, CDNA_STOP_CODON, CDNACoord + + +class Gene(object): + def __init__(self, name): + self.name = name + + +class Transcript(object): + """ + + A gene may have multiple transcripts with different combinations of exons. + """ + + def __init__(self, name, version, gene, tx_position, cds_position, + is_default=False, exons=None): + self.name = name + self.version = version + self.gene = Gene(gene) + self.tx_position = tx_position + self.cds_position = cds_position + self.is_default = is_default + self.exons = exons if exons else [] + + @property + def full_name(self): + if self.version is not None: + return '%s.%d' % (self.name, self.version) + else: + return self.name + + @property + def is_coding(self): + # Coding transcripts have CDS with non-zero length. + return (self.cds_position.chrom_stop - + self.cds_position.chrom_start > 0) + + @property + def strand(self): + return ('+' if self.tx_position.is_forward_strand else '-') + + @property + def coding_exons(self): + return [exon.get_as_interval(coding_only=True) + for exon in self.exons] + + @lazy + def ordered_exons(self): + """Yield exons in coding order.""" + transcript_strand = self.tx_position.is_forward_strand + if hasattr(self.exons, 'select_related'): + exons = list(self.exons.select_related('tx_position')) + else: + exons = list(self.exons) + exons.sort(key=lambda exon: exon.tx_position.chrom_start) + if not transcript_strand: + exons.reverse() + return exons + + def get_coding_exons(self): + """Yield non-empty coding exonic regions in coding order.""" + for exon in self.ordered_exons: + region = exon.get_as_interval(coding_only=True) + if region: + yield region + + def get_utr5p_size(self): + """Return the size of the 5prime UTR of a transcript.""" + + transcript_strand = self.tx_position.is_forward_strand + + # Find the exon containing the start codon. + start_codon = (self.cds_position.chrom_start if transcript_strand + else self.cds_position.chrom_stop - 1) + cdna_len = 0 + for exon in self.ordered_exons: + exon_start = exon.tx_position.chrom_start + exon_end = exon.tx_position.chrom_stop + if exon_start <= start_codon < exon_end: + break + cdna_len += exon_end - exon_start + else: + raise ValueError("transcript contains no exons") + + if transcript_strand: + return cdna_len + (start_codon - exon_start) + else: + return cdna_len + (exon_end - start_codon - 1) + + def find_stop_codon(self, cds_position): + """Return the position along the cDNA of the base after the stop codon.""" + if cds_position.is_forward_strand: + stop_pos = cds_position.chrom_stop + else: + stop_pos = cds_position.chrom_start + cdna_pos = 0 + for exon in self.ordered_exons: + exon_start = exon.tx_position.chrom_start + exon_stop = exon.tx_position.chrom_stop + + if exon_start <= stop_pos <= exon_stop: + if cds_position.is_forward_strand: + return cdna_pos + stop_pos - exon_start + else: + return cdna_pos + exon_stop - stop_pos + else: + cdna_pos += exon_stop - exon_start + raise ValueError('Stop codon is not in any of the exons') + + def cdna_to_genomic_coord(self, coord): + """Convert a HGVS cDNA coordinate to a genomic coordinate.""" + transcript_strand = self.tx_position.is_forward_strand + utr5p = (self.get_utr5p_size() + if self.is_coding else 0) + + # compute starting position along spliced transcript. + if coord.landmark == CDNA_START_CODON: + if coord.coord > 0: + pos = utr5p + coord.coord + else: + pos = utr5p + coord.coord + 1 + elif coord.landmark == CDNA_STOP_CODON: + if coord.coord < 0: + raise ValueError('CDNACoord cannot have a negative coord and ' + 'landmark CDNA_STOP_CODON') + pos = self.find_stop_codon(self.cds_position) + coord.coord + else: + raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark) + + # 5' flanking sequence. + if pos < 1: + if transcript_strand: + return self.tx_position.chrom_start + pos + else: + return self.tx_position.chrom_stop - pos + 1 + + # Walk along transcript until we find an exon that contains pos. + cdna_start = 1 + cdna_end = 1 + for exon in self.ordered_exons: + exon_start = exon.tx_position.chrom_start + 1 + exon_end = exon.tx_position.chrom_stop + cdna_end = cdna_start + (exon_end - exon_start) + if cdna_start <= pos <= cdna_end: + break + cdna_start = cdna_end + 1 + else: + # 3' flanking sequence + if transcript_strand: + return self.cds_position.chrom_stop + coord.coord + else: + return self.cds_position.chrom_start + 1 - coord.coord + + # Compute genomic coordinate using offset. + if transcript_strand: + # Plus strand. + return exon_start + (pos - cdna_start) + coord.offset + else: + # Minus strand. + return exon_end - (pos - cdna_start) - coord.offset + + def genomic_to_cdna_coord(self, genomic_coord): + """Convert a genomic coordinate to a cDNA coordinate and offset. + """ + exons = [exon.get_as_interval() + for exon in self.ordered_exons] + + if len(exons) == 0: + return None + + strand = self.strand + + if strand == "+": + exons.sort(key=lambda exon: exon.chrom_start) + else: + exons.sort(key=lambda exon: -exon.chrom_end) + + distances = [exon.distance(genomic_coord) + for exon in exons] + min_distance_to_exon = min(map(abs, distances)) + + coding_offset = 0 + for exon in exons: + exon_length = exon.chrom_end - exon.chrom_start + distance = exon.distance(genomic_coord) + if abs(distance) == min_distance_to_exon: + if strand == "+": + exon_start_cds_offset = coding_offset + 1 + exon_end_cds_offset = coding_offset + exon_length + else: + exon_start_cds_offset = coding_offset + exon_length + exon_end_cds_offset = coding_offset + 1 + # This is the exon we want to annotate against. + if distance == 0: + # Inside the exon. + if strand == "+": + coord = (exon_start_cds_offset + + (genomic_coord - + (exon.chrom_start + 1))) + else: + coord = (exon_end_cds_offset + + (exon.chrom_end - + genomic_coord)) + cdna_coord = CDNACoord(coord, 0) + else: + # Outside the exon. + if distance > 0: + nearest_exonic = exon_start_cds_offset + else: + nearest_exonic = exon_end_cds_offset + if strand == "+": + distance *= -1 + + # If outside transcript, don't use offset. + if (genomic_coord < self.tx_position.chrom_start + 1 or + genomic_coord > self.tx_position.chrom_stop): + nearest_exonic += distance + distance = 0 + cdna_coord = CDNACoord(nearest_exonic, distance) + break + coding_offset += exon_length + + # Adjust coordinates for coding transcript. + if self.is_coding: + # Detect if position before start codon. + utr5p = self.get_utr5p_size() if self.is_coding else 0 + cdna_coord.coord -= utr5p + if cdna_coord.coord <= 0: + cdna_coord.coord -= 1 + else: + # Detect if position is after stop_codon. + stop_codon = self.find_stop_codon(self.cds_position) + stop_codon -= utr5p + if (cdna_coord.coord > stop_codon or + cdna_coord.coord == stop_codon and cdna_coord.offset > 0): + cdna_coord.coord -= stop_codon + cdna_coord.landmark = CDNA_STOP_CODON + + return cdna_coord + +BED6Interval_base = namedtuple( + "BED6Interval_base", ( + "chrom", + "chrom_start", + "chrom_end", + "name", + "score", + "strand")) + + +class BED6Interval(BED6Interval_base): + def distance(self, offset): + """Return the distance to the interval. + + if offset is inside the exon, distance is zero. + otherwise, distance is the distance to the nearest edge. + + distance is positive if the exon comes after the offset. + distance is negative if the exon comes before the offset. + """ + + start = self.chrom_start + 1 + end = self.chrom_end + + if start <= offset <= end: + return 0 + + start_distance = start - offset + end_distance = offset - end + + if abs(start_distance) < abs(end_distance): + return start_distance + else: + return -end_distance + + +class Exon(object): + def __init__(self, transcript, tx_position, exon_number): + self.transcript = transcript + self.tx_position = tx_position + self.exon_number = exon_number + + @property + def get_exon_name(self): + return "%s.%d" % (self.transcript.name, self.exon_number) + + def get_as_interval(self, coding_only=False): + """Returns the coding region for this exon as a BED6Interval. + + This function returns a BED6Interval objects containing position + information for this exon. This may be used as input for + pybedtools.create_interval_from_list() after casting chrom_start + and chrom_end as strings. + + coding_only: only include exons in the coding region + + """ + + exon_start = self.tx_position.chrom_start + exon_stop = self.tx_position.chrom_stop + + # Get only exon coding region if requested + if coding_only: + if (exon_stop <= self.transcript.cds_position.chrom_start or + exon_start >= self.transcript.cds_position.chrom_stop): + return None + exon_start = max(exon_start, + self.transcript.cds_position.chrom_start) + exon_stop = min( + max(exon_stop, self.transcript.cds_position.chrom_start), + self.transcript.cds_position.chrom_stop) + + return BED6Interval( + self.tx_position.chrom, + exon_start, + exon_stop, + self.get_exon_name, + '.', + self.strand, + ) + + @property + def strand(self): + strand = '+' if self.tx_position.is_forward_strand else '-' + return strand diff --git a/pyhgvs/variants.py b/pyhgvs/models/variants.py similarity index 96% rename from pyhgvs/variants.py rename to pyhgvs/models/variants.py index 36c7383..aced9fc 100644 --- a/pyhgvs/variants.py +++ b/pyhgvs/models/variants.py @@ -5,13 +5,24 @@ from __future__ import absolute_import from __future__ import unicode_literals -from .models import Position - - _COMP = dict(A='T', C='G', G='C', T='A', N='N', a='t', c='g', g='c', t='a', n='n') +class Position(object): + """A position in the genome.""" + + def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand): + self.chrom = chrom + self.chrom_start = chrom_start + self.chrom_stop = chrom_stop + self.is_forward_strand = is_forward_strand + + def __repr__(self): + return "" % ( + self.chrom, self.chrom_start, self.chrom_stop) + + def revcomp(seq): """Reverse complement.""" return ''.join(_COMP[base] for base in reversed(seq)) @@ -151,7 +162,7 @@ def __init__(self, position, ref_allele, alt_alleles, seq_5p: 5 prime flanking sequence of variant. seq_3p: 3 prime flanking sequence of variant. genome: a pygr compatible genome object (optional). - + indels_start_with_same_base: DML - I have no idea why this is required but am keeping for backwards compat """ @@ -208,7 +219,7 @@ def _trim_common_suffix(self): """ minlength = min(map(len, self.alleles)) common_suffix = 0 - for i in range(1, minlength+1): + for i in range(1, minlength + 1): if len(set(allele[-i] for allele in self.alleles)) > 1: # Not all alleles match at this site, so common suffix ends. break diff --git a/pyhgvs/tests/genome.py b/pyhgvs/tests/genome.py index 0887dfd..ecc35ae 100644 --- a/pyhgvs/tests/genome.py +++ b/pyhgvs/tests/genome.py @@ -4,7 +4,7 @@ import itertools import os -from ..variants import revcomp +from ..models.variants import revcomp try: diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py index 58cf351..48bbb99 100644 --- a/pyhgvs/tests/test_hgvs_names.py +++ b/pyhgvs/tests/test_hgvs_names.py @@ -9,13 +9,9 @@ except ImportError: SequenceFileDB = None -from .. import CDNACoord -from .. import CDNA_STOP_CODON -from .. import HGVSName -from .. import InvalidHGVSName -from .. import cdna_to_genomic_coord +from ..models.cdna import CDNACoord, CDNA_STOP_CODON +from ..models.hgvs_name import HGVSName, InvalidHGVSName from .. import format_hgvs_name -from .. import genomic_to_cdna_coord from .. import parse_hgvs_name from ..utils import read_transcripts from .genome import MockGenomeTestFile @@ -50,7 +46,7 @@ def test_genomic_to_cdna_coord(): """ for transcript_name, genomic_coord, cdna_coord_expected in _convert_coords: transcript = get_transcript(transcript_name) - cdna_coord = genomic_to_cdna_coord(transcript, genomic_coord[1]) + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord[1]) nose.tools.assert_equal( cdna_coord, cdna_coord_expected, repr((cdna_coord, cdna_coord_expected, @@ -63,7 +59,7 @@ def test_cdna_to_genomic_coord(): """ for transcript_name, genomic_coord_expected, cdna_coord in _convert_coords: transcript = get_transcript(transcript_name) - genomic_coord = cdna_to_genomic_coord(transcript, cdna_coord) + genomic_coord = transcript.cdna_to_genomic_coord(cdna_coord) nose.tools.assert_equal( genomic_coord, genomic_coord_expected[1], repr((genomic_coord, genomic_coord_expected[1], diff --git a/pyhgvs/tests/test_hgvs_names_long.py b/pyhgvs/tests/test_hgvs_names_long.py index 9a019a7..2bfdfec 100644 --- a/pyhgvs/tests/test_hgvs_names_long.py +++ b/pyhgvs/tests/test_hgvs_names_long.py @@ -2,7 +2,7 @@ from .. import parse_hgvs_name from ..utils import read_transcripts -from ..variants import normalize_variant +from ..models.variants import normalize_variant from .genome import MockGenomeTestFile diff --git a/pyhgvs/tests/test_variants.py b/pyhgvs/tests/test_variants.py index de37a45..c4b9b35 100644 --- a/pyhgvs/tests/test_variants.py +++ b/pyhgvs/tests/test_variants.py @@ -3,7 +3,7 @@ from unittest import TestCase -from ..variants import normalize_variant +from ..models.variants import normalize_variant from .genome import MockGenomeTestFile diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 55b20f4..6d3ac3e 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -5,9 +5,8 @@ from __future__ import absolute_import from __future__ import unicode_literals -from .models import Exon -from .models import Position -from .models import Transcript +from .models.variants import Position +from .models.transcript import Exon, Transcript def read_refgene(infile): diff --git a/requirements-dev.txt b/requirements-dev.txt index 1018450..5b2667d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,4 @@ flake8==2.2.5 +lazy==1.4 nose==1.3.7 pyfaidx==0.5.0.1 diff --git a/setup.py b/setup.py index 3f94c25..6ed2380 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ def main(): install_requires=['pip>=1.2'], tests_require=[ 'flake8==2.2.5', + 'lazy==1.4', 'nose==1.3.7', 'pyfaidx==0.5.0.1', ], From 67ab67f263e34208f57e27a572c521b037e8a619 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Tue, 12 Oct 2021 16:16:37 +1030 Subject: [PATCH 09/47] Start of cDNA matching code changes. Put everything through 1 path as exons are just a special case of alignments with no gaps --- pyhgvs/models/transcript.py | 113 +++++++++++++++++++++++--------- pyhgvs/tests/test_cdna_match.py | 71 ++++++++++++++++++++ pyhgvs/tests/test_hgvs_names.py | 2 +- pyhgvs/utils.py | 66 +++++++++++++++---- 4 files changed, 207 insertions(+), 45 deletions(-) create mode 100644 pyhgvs/tests/test_cdna_match.py diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index e707375..011da10 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -18,19 +18,22 @@ def __init__(self, name): class Transcript(object): """ - A gene may have multiple transcripts with different combinations of exons. """ def __init__(self, name, version, gene, tx_position, cds_position, - is_default=False, exons=None): + is_default=False, cdna_match=None): self.name = name self.version = version self.gene = Gene(gene) self.tx_position = tx_position self.cds_position = cds_position self.is_default = is_default - self.exons = exons if exons else [] + self.cdna_match = cdna_match or [] + + @lazy + def exons(self): + return self.cdna_match @property def full_name(self): @@ -49,10 +52,15 @@ def is_coding(self): def strand(self): return ('+' if self.tx_position.is_forward_strand else '-') - @property - def coding_exons(self): - return [exon.get_as_interval(coding_only=True) - for exon in self.exons] + @lazy + def ordered_cdna_match(self): + """ Coding order """ + transcript_strand = self.tx_position.is_forward_strand + cdna_match = list(self.cdna_match) + cdna_match.sort(key=lambda cm: cm.tx_position.chrom_start) + if not transcript_strand: + cdna_match.reverse() + return cdna_match @lazy def ordered_exons(self): @@ -67,13 +75,6 @@ def ordered_exons(self): exons.reverse() return exons - def get_coding_exons(self): - """Yield non-empty coding exonic regions in coding order.""" - for exon in self.ordered_exons: - region = exon.get_as_interval(coding_only=True) - if region: - yield region - def get_utr5p_size(self): """Return the size of the 5prime UTR of a transcript.""" @@ -83,19 +84,22 @@ def get_utr5p_size(self): start_codon = (self.cds_position.chrom_start if transcript_strand else self.cds_position.chrom_stop - 1) cdna_len = 0 - for exon in self.ordered_exons: - exon_start = exon.tx_position.chrom_start - exon_end = exon.tx_position.chrom_stop - if exon_start <= start_codon < exon_end: + for cdna_match in self.ordered_cdna_match: + start = cdna_match.tx_position.chrom_start + end = cdna_match.tx_position.chrom_stop + if start <= start_codon < end: + # We're inside this match + if transcript_strand: + position = start_codon - start + else: + position = end - start_codon - 1 + cdna_len += position + cdna_match.get_offset(position) break - cdna_len += exon_end - exon_start + cdna_len += cdna_match.length else: - raise ValueError("transcript contains no exons") + raise ValueError("transcript contains no cdna_match") - if transcript_strand: - return cdna_len + (start_codon - exon_start) - else: - return cdna_len + (exon_end - start_codon - 1) + return cdna_len def find_stop_codon(self, cds_position): """Return the position along the cDNA of the base after the stop codon.""" @@ -248,6 +252,7 @@ def genomic_to_cdna_coord(self, genomic_coord): return cdna_coord + BED6Interval_base = namedtuple( "BED6Interval_base", ( "chrom", @@ -284,15 +289,23 @@ def distance(self, offset): return -end_distance -class Exon(object): - def __init__(self, transcript, tx_position, exon_number): +class CDNA_Match(object): + """ An exon is a special case of a cDNA match which has 0 gaps """ + def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number): self.transcript = transcript self.tx_position = tx_position - self.exon_number = exon_number + self.cdna_start = cdna_start + self.cdna_end = cdna_end + self.gap = gap + self.number = number + + @property + def length(self): + return self.cdna_end - self.cdna_start + 1 @property - def get_exon_name(self): - return "%s.%d" % (self.transcript.name, self.exon_number) + def name(self): + return "%s.%d" % (self.transcript.name, self.number) def get_as_interval(self, coding_only=False): """Returns the coding region for this exon as a BED6Interval. @@ -324,7 +337,7 @@ def get_as_interval(self, coding_only=False): self.tx_position.chrom, exon_start, exon_stop, - self.get_exon_name, + self.name, '.', self.strand, ) @@ -333,3 +346,43 @@ def get_as_interval(self, coding_only=False): def strand(self): strand = '+' if self.tx_position.is_forward_strand else '-' return strand + + def get_offset(self, position: int): + """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length + @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute + codes operation + M match + I insert a gap into the reference sequence + D insert a gap into the target (delete from reference) + + If you want the whole exon, then pass the end + """ + + if not self.gap: + return 0 + + match_i = 0 + offset = 0 + for gap_op in self.gap.split(): + code = gap_op[0] + length = int(gap_op[1:]) + if code == "M": + match_i += length + elif code == "I": + if position <= match_i + length: + raise ValueError("Coordinate inside insertion (%s) - no mapping possible!" % gap_op) + offset += length + elif code == "D": + if position <= match_i + length: + raise ValueError("Coordinate inside deletion (%s) - no mapping possible!" % gap_op) + offset -= length + else: + raise ValueError(f"Unknown code in cDNA GAP: {gap_op}") + + #if self.transcript.name == "NM_000016": + # print(f"{position}: {code} {length} - {offset=}, {match_i=}") + + if match_i >= position: + break + + return offset diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py new file mode 100644 index 0000000..ed49b02 --- /dev/null +++ b/pyhgvs/tests/test_cdna_match.py @@ -0,0 +1,71 @@ +""" + RefSeq transcripts don't always match the genome reference, so can align with gaps. +""" + +import nose +from pyhgvs import HGVSName + +from pyhgvs.utils import make_transcript + + +def get_transcript(accession): + transcript_json = _transcripts[accession] + return make_transcript(transcript_json) + + +def test_cdna_to_genomic_coord(): + HGVS_GRCh37_COORDS = [ + # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert + ("NM_015120.4:c.74A>T", 73613070), +# ("NM_015120.4:c.75G>T", None), # No consistent alignment +# ("NM_015120.4:c.78A>T", 73613071), +# ("NM_015120.4:c.79G>C", 73613072), + ] + transcript = get_transcript("NM_015120.4") + print(f"NM_015120.4 UTR = {transcript.get_utr5p_size()}") + for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if expected_genomic_coord: + genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) + nose.tools.assert_equal(genomic_coord, expected_genomic_coord) + else: + pass + # nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) + + +def test_cdna_to_genomic_coord_negative_strand(): + # TODO: + pass + + +_transcripts = { + # GRCh37 + "NM_015120.4": { + "id": "NM_015120.4", + "gene_name": "ALMS1", + "end": 73837046, + "chrom": "NC_000002.11", + "exons": [[73612885, 73613320], [73635749, 73635875], [73646250, 73646446], [73649984, 73650102], + [73651557, 73652030], [73653580, 73653681], [73659325, 73659419], [73675089, 73681194], + [73682288, 73682422], [73716760, 73718625], [73746901, 73747143], [73761950, 73762076], + [73777393, 73777564], [73784346, 73784481], [73786098, 73786269], [73799388, 73800551], + [73826527, 73826648], [73827804, 73828008], [73828321, 73828563], [73829311, 73829495], + [73830367, 73830431], [73835601, 73835701], [73836694, 73837046]], + "start": 73612885, + "strand": "+", + "cds_end": 73836739, + "cds_start": 73612996, + "cdna_match": [[73612885, 73613320, 1, 438, "M185 I3 M250"], [73635749, 73635875, 439, 564, None], + [73646250, 73646446, 565, 760, None], [73649984, 73650102, 761, 878, None], + [73651557, 73652030, 879, 1351, None], [73653580, 73653681, 1352, 1452, None], + [73659325, 73659419, 1453, 1546, None], [73675089, 73681194, 1547, 7654, "M141 I3 M5964"], + [73682288, 73682422, 7655, 7788, None], [73716760, 73718625, 7789, 9653, None], + [73746901, 73747143, 9654, 9895, None], [73761950, 73762076, 9896, 10021, None], + [73777393, 73777564, 10022, 10192, None], [73784346, 73784481, 10193, 10327, None], + [73786098, 73786269, 10328, 10498, None], [73799388, 73800551, 10499, 11661, None], + [73826527, 73826648, 11662, 11782, None], [73827804, 73828008, 11783, 11986, None], + [73828321, 73828563, 11987, 12228, None], [73829311, 73829495, 12229, 12412, None], + [73830367, 73830431, 12413, 12476, None], [73835601, 73835701, 12477, 12576, None], + [73836694, 73837046, 12577, 12928, None]]} + +} diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py index 48bbb99..8f06237 100644 --- a/pyhgvs/tests/test_hgvs_names.py +++ b/pyhgvs/tests/test_hgvs_names.py @@ -211,7 +211,7 @@ def test_invalid_coordinates(): ] -# Test examples of coverting coordinates. +# Test examples of converting coordinates. _convert_coords = [ # Positions near start codon. ('NM_000016.4', ('chr1', 76190473), CDNACoord(1)), diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 6d3ac3e..f691a71 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -6,7 +6,7 @@ from __future__ import unicode_literals from .models.variants import Position -from .models.transcript import Exon, Transcript +from .models.transcript import Transcript, CDNA_Match def read_refgene(infile): @@ -85,23 +85,61 @@ def make_transcript(transcript_json): transcript_json['cds_end'], transcript_json['strand'] == '+')) - exons = transcript_json['exons'] - if not transcript.tx_position.is_forward_strand: - exons = reversed(exons) - - for exon_number, (exon_start, exon_end) in enumerate(exons, 1): - transcript.exons.append( - Exon(transcript=transcript, - tx_position=Position( - transcript_json['chrom'], - exon_start, - exon_end, - transcript_json['strand'] == '+'), - exon_number=exon_number)) + cdna_match = transcript_json.get('cdna_match') + if cdna_match: + if not transcript.tx_position.is_forward_strand: + cdna_match = reversed(cdna_match) + else: + exons = transcript_json['exons'] + if not transcript.tx_position.is_forward_strand: + exons = reversed(exons) + cdna_match = json_perfect_exons_to_cdna_match(exons) # Only use single=True once ALL has been implemented + + for number, (exon_start, exon_end, cdna_start, cdna_end, gap) in enumerate(cdna_match, 1): + transcript.cdna_match.append(CDNA_Match(transcript=transcript, + tx_position=Position( + transcript_json['chrom'], + exon_start, + exon_end, + transcript_json['strand'] == '+'), + cdna_start=cdna_start, + cdna_end=cdna_end, + gap=gap, + number=number)) return transcript +def json_perfect_exons_to_cdna_match(ordered_exons, single=False): + """ Perfectly matched exons are basically a no-gap case of cDNA match """ + cdna_match = [] + if single: + ordered_exons = list(ordered_exons) + start = ordered_exons[0][0] + end = ordered_exons[-1][1] + last_exon_end = None + gap_list = [] + cdna_length = 0 + for (exon_start, exon_end) in ordered_exons: + # end up looking like "M D M D (M=exon, D=intron length)" + if last_exon_end: + intron_length = abs(exon_start - last_exon_end) + gap_list.append("D%d" % intron_length) + exon_length = exon_end - exon_start + cdna_length += exon_length + gap_list.append("M%d" % exon_length) + last_exon_end = exon_end + cdna_match = [[start, end, 1, cdna_length, " ".join(gap_list)]] + else: + cdna_end = 0 + for (exon_start, exon_end) in ordered_exons: + cdna_start = cdna_end + 1 + exon_length = exon_end - exon_start + cdna_end = cdna_start + exon_length - 1 + cdna_match.append([exon_start, exon_end, cdna_start, cdna_end, None]) + return cdna_match + + def read_transcripts(refgene_file): """ Read all transcripts in a RefGene file. From 04ac8a7af6065a605b5989c0c1a2015d0cc8d69b Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Wed, 13 Oct 2021 12:02:11 +1030 Subject: [PATCH 10/47] Handle gaps --- pyhgvs/models/transcript.py | 89 +++++++++++++++++---------------- pyhgvs/tests/test_cdna_match.py | 47 +++++++++++++---- pyhgvs/utils.py | 4 +- 3 files changed, 86 insertions(+), 54 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 011da10..5bddbb9 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -108,18 +108,25 @@ def find_stop_codon(self, cds_position): else: stop_pos = cds_position.chrom_start cdna_pos = 0 - for exon in self.ordered_exons: - exon_start = exon.tx_position.chrom_start - exon_stop = exon.tx_position.chrom_stop + for cdna_match in self.ordered_cdna_match: + start = cdna_match.tx_position.chrom_start + stop = cdna_match.tx_position.chrom_stop - if exon_start <= stop_pos <= exon_stop: + if start <= stop_pos <= stop: + # We're inside this match if cds_position.is_forward_strand: - return cdna_pos + stop_pos - exon_start + position = stop_pos - start else: - return cdna_pos + exon_stop - stop_pos + position = stop - stop_pos + # TODO: I don't think this is right as we need to go in reverse through match on '-' strand?? + cdna_pos += position + cdna_match.get_offset(position) + break else: - cdna_pos += exon_stop - exon_start - raise ValueError('Stop codon is not in any of the exons') + cdna_pos += stop - start + else: + raise ValueError('Stop codon is not in any of the exons') + + return cdna_pos def cdna_to_genomic_coord(self, coord): """Convert a HGVS cDNA coordinate to a genomic coordinate.""" @@ -130,49 +137,49 @@ def cdna_to_genomic_coord(self, coord): # compute starting position along spliced transcript. if coord.landmark == CDNA_START_CODON: if coord.coord > 0: - pos = utr5p + coord.coord + cdna_pos = utr5p + coord.coord else: - pos = utr5p + coord.coord + 1 + cdna_pos = utr5p + coord.coord + 1 elif coord.landmark == CDNA_STOP_CODON: if coord.coord < 0: raise ValueError('CDNACoord cannot have a negative coord and ' 'landmark CDNA_STOP_CODON') - pos = self.find_stop_codon(self.cds_position) + coord.coord + cdna_pos = self.find_stop_codon(self.cds_position) + coord.coord else: raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark) - # 5' flanking sequence. - if pos < 1: + # 5' flanking sequence (no need to account for gaps) + if cdna_pos < 1: if transcript_strand: - return self.tx_position.chrom_start + pos + return self.tx_position.chrom_start + cdna_pos else: - return self.tx_position.chrom_stop - pos + 1 + return self.tx_position.chrom_stop - cdna_pos + 1 - # Walk along transcript until we find an exon that contains pos. + # Walk along transcript until we find an exon that contains cdna_pos. cdna_start = 1 - cdna_end = 1 - for exon in self.ordered_exons: - exon_start = exon.tx_position.chrom_start + 1 - exon_end = exon.tx_position.chrom_stop - cdna_end = cdna_start + (exon_end - exon_start) - if cdna_start <= pos <= cdna_end: - break + for cdna_match in self.ordered_cdna_match: + cdna_end = cdna_start + cdna_match.length - 1 + if cdna_start <= cdna_pos <= cdna_end: + match_pos = cdna_pos - cdna_start + # TODO: I don't think this is right as we need to go in reverse through match on '-' strand?? + match_pos -= cdna_match.get_offset(match_pos) + # Compute genomic coordinate using offset. + if transcript_strand: + # Plus strand. + start = cdna_match.tx_position.chrom_start + 1 + return start + match_pos + coord.offset + else: + # Minus strand. + end = cdna_match.tx_position.chrom_stop + return end - match_pos - coord.offset cdna_start = cdna_end + 1 else: - # 3' flanking sequence + # 3' flanking sequence (no need to account for gaps) if transcript_strand: return self.cds_position.chrom_stop + coord.coord else: return self.cds_position.chrom_start + 1 - coord.coord - # Compute genomic coordinate using offset. - if transcript_strand: - # Plus strand. - return exon_start + (pos - cdna_start) + coord.offset - else: - # Minus strand. - return exon_end - (pos - cdna_start) - coord.offset - def genomic_to_cdna_coord(self, genomic_coord): """Convert a genomic coordinate to a cDNA coordinate and offset. """ @@ -361,28 +368,26 @@ def get_offset(self, position: int): if not self.gap: return 0 - match_i = 0 + position_1_based = position + 1 + cdna_match_index = 1 offset = 0 for gap_op in self.gap.split(): code = gap_op[0] length = int(gap_op[1:]) if code == "M": - match_i += length + cdna_match_index += length elif code == "I": - if position <= match_i + length: - raise ValueError("Coordinate inside insertion (%s) - no mapping possible!" % gap_op) + if position_1_based < cdna_match_index + length: + raise ValueError("Coordinate (%d) inside insertion (%s) - no mapping possible!" % (position_1_based, gap_op)) offset += length elif code == "D": - if position <= match_i + length: - raise ValueError("Coordinate inside deletion (%s) - no mapping possible!" % gap_op) + if position < cdna_match_index + length: + raise ValueError("Coordinate (%d) inside deletion (%s) - no mapping possible!" % (position_1_based, gap_op)) offset -= length else: raise ValueError(f"Unknown code in cDNA GAP: {gap_op}") - #if self.transcript.name == "NM_000016": - # print(f"{position}: {code} {length} - {offset=}, {match_i=}") - - if match_i >= position: + if cdna_match_index > position_1_based: break return offset diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py index ed49b02..ec316e8 100644 --- a/pyhgvs/tests/test_cdna_match.py +++ b/pyhgvs/tests/test_cdna_match.py @@ -17,25 +17,40 @@ def test_cdna_to_genomic_coord(): HGVS_GRCh37_COORDS = [ # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert ("NM_015120.4:c.74A>T", 73613070), -# ("NM_015120.4:c.75G>T", None), # No consistent alignment -# ("NM_015120.4:c.78A>T", 73613071), -# ("NM_015120.4:c.79G>C", 73613072), + ("NM_015120.4:c.75G>T", None), # No consistent alignment + ("NM_015120.4:c.76G>T", None), # No consistent alignment + ("NM_015120.4:c.77A>T", None), # No consistent alignment + ("NM_015120.4:c.78A>T", 73613071), + ("NM_015120.4:c.79G>C", 73613072), ] transcript = get_transcript("NM_015120.4") - print(f"NM_015120.4 UTR = {transcript.get_utr5p_size()}") for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS: hgvs_name = HGVSName(hgvs_str) if expected_genomic_coord: genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) nose.tools.assert_equal(genomic_coord, expected_genomic_coord) else: - pass - # nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) + nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) def test_cdna_to_genomic_coord_negative_strand(): - # TODO: - pass + HGVS_GRCh37_COORDS = [ + # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert + ("NM_001135649.3:c.300C>T", 88751754), + ("NM_001135649.3:c.301A>C", 88751753), + ("NM_001135649.3:c.302C>T", 88751752), + ("NM_001135649.3:c.303C>T", None), # N/A GRCh37 - 1 bp insert + ("NM_001135649.3:c.304T>C", 88751751), + ("NM_001135649.3:c.305T>C", 88751750), + ] + transcript = get_transcript("NM_001135649.3") + for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if expected_genomic_coord: + genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) + nose.tools.assert_equal(genomic_coord, expected_genomic_coord) + else: + nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) _transcripts = { @@ -66,6 +81,18 @@ def test_cdna_to_genomic_coord_negative_strand(): [73826527, 73826648, 11662, 11782, None], [73827804, 73828008, 11783, 11986, None], [73828321, 73828563, 11987, 12228, None], [73829311, 73829495, 12229, 12412, None], [73830367, 73830431, 12413, 12476, None], [73835601, 73835701, 12477, 12576, None], - [73836694, 73837046, 12577, 12928, None]]} - + [73836694, 73837046, 12577, 12928, None]] + }, + "NM_001135649.3": { + "id": "NM_001135649.3", + "gene_name": "FOXI3", + "end": 88752211, + "chrom": "NC_000002.11", + "exons": [[88746305, 88748348], [88751414, 88751751], [88751753, 88752211]], + "start": 88746305, + "strand": "-", + "cds_end": 88752053, + "cds_start": 88747725, + "cdna_match": [[88746305, 88748348, 799, 2841, None], [88751414, 88752211, 1, 798, "M460 I1 M337"]] + } } diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index f691a71..601baa8 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -131,12 +131,12 @@ def json_perfect_exons_to_cdna_match(ordered_exons, single=False): last_exon_end = exon_end cdna_match = [[start, end, 1, cdna_length, " ".join(gap_list)]] else: - cdna_end = 0 + cdna_start = 1 for (exon_start, exon_end) in ordered_exons: - cdna_start = cdna_end + 1 exon_length = exon_end - exon_start cdna_end = cdna_start + exon_length - 1 cdna_match.append([exon_start, exon_end, cdna_start, cdna_end, None]) + cdna_start = cdna_end + 1 return cdna_match From ae264c177f87481d375309ce7422b48b2a2b7be4 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Wed, 13 Oct 2021 17:03:47 +1030 Subject: [PATCH 11/47] Handle gaps - put back exons as we need them to work out flanking --- pyhgvs/models/transcript.py | 36 +++++++++-------- pyhgvs/tests/test_cdna_match.py | 68 ++++++++++++++++++++++----------- pyhgvs/utils.py | 26 ++++++++----- 3 files changed, 82 insertions(+), 48 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 5bddbb9..82de220 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -19,22 +19,21 @@ def __init__(self, name): class Transcript(object): """ A gene may have multiple transcripts with different combinations of exons. + + We need both exons and cdna_match as need to know exact exon boundaries to work out flanking """ def __init__(self, name, version, gene, tx_position, cds_position, - is_default=False, cdna_match=None): + is_default=False, exons=None, cdna_match=None): self.name = name self.version = version self.gene = Gene(gene) self.tx_position = tx_position self.cds_position = cds_position self.is_default = is_default + self.exons = exons or [] self.cdna_match = cdna_match or [] - @lazy - def exons(self): - return self.cdna_match - @property def full_name(self): if self.version is not None: @@ -244,7 +243,7 @@ def genomic_to_cdna_coord(self, genomic_coord): # Adjust coordinates for coding transcript. if self.is_coding: # Detect if position before start codon. - utr5p = self.get_utr5p_size() if self.is_coding else 0 + utr5p = self.get_utr5p_size() cdna_coord.coord -= utr5p if cdna_coord.coord <= 0: cdna_coord.coord -= 1 @@ -296,20 +295,13 @@ def distance(self, offset): return -end_distance -class CDNA_Match(object): - """ An exon is a special case of a cDNA match which has 0 gaps """ - def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number): +class Exon(object): + """ We still need exons to work out the flanking boundaries """ + def __init__(self, transcript, tx_position, number): self.transcript = transcript self.tx_position = tx_position - self.cdna_start = cdna_start - self.cdna_end = cdna_end - self.gap = gap self.number = number - @property - def length(self): - return self.cdna_end - self.cdna_start + 1 - @property def name(self): return "%s.%d" % (self.transcript.name, self.number) @@ -354,6 +346,18 @@ def strand(self): strand = '+' if self.tx_position.is_forward_strand else '-' return strand + +class CDNA_Match(Exon): + def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number): + super(CDNA_Match, self).__init__(transcript, tx_position, number) + self.cdna_start = cdna_start + self.cdna_end = cdna_end + self.gap = gap + + @property + def length(self): + return self.cdna_end - self.cdna_start + 1 + def get_offset(self, position: int): """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py index ec316e8..7e35ea5 100644 --- a/pyhgvs/tests/test_cdna_match.py +++ b/pyhgvs/tests/test_cdna_match.py @@ -7,24 +7,30 @@ from pyhgvs.utils import make_transcript +_HGVS_NM_015120_GRCh37_COORDS = [ + # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert + ("NM_015120.4:c.74A>T", 73613070), + ("NM_015120.4:c.75G>T", None), # No consistent alignment + ("NM_015120.4:c.76G>T", None), # No consistent alignment + ("NM_015120.4:c.77A>T", None), # No consistent alignment + ("NM_015120.4:c.78A>T", 73613071), + ("NM_015120.4:c.79G>C", 73613072), +] -def get_transcript(accession): - transcript_json = _transcripts[accession] - return make_transcript(transcript_json) +_HGVS_NM_001135649_GRCh37_COORDS = [ + # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert + ("NM_001135649.3:c.300C>T", 88751754), + ("NM_001135649.3:c.301A>C", 88751753), + ("NM_001135649.3:c.302C>T", 88751752), + ("NM_001135649.3:c.303C>T", None), # N/A GRCh37 - 1 bp insert + ("NM_001135649.3:c.304T>C", 88751751), + ("NM_001135649.3:c.305T>C", 88751750), +] def test_cdna_to_genomic_coord(): - HGVS_GRCh37_COORDS = [ - # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert - ("NM_015120.4:c.74A>T", 73613070), - ("NM_015120.4:c.75G>T", None), # No consistent alignment - ("NM_015120.4:c.76G>T", None), # No consistent alignment - ("NM_015120.4:c.77A>T", None), # No consistent alignment - ("NM_015120.4:c.78A>T", 73613071), - ("NM_015120.4:c.79G>C", 73613072), - ] transcript = get_transcript("NM_015120.4") - for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS: + for hgvs_str, expected_genomic_coord in _HGVS_NM_015120_GRCh37_COORDS: hgvs_name = HGVSName(hgvs_str) if expected_genomic_coord: genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) @@ -33,18 +39,19 @@ def test_cdna_to_genomic_coord(): nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) +@nose.SkipTest +def test_genomic_to_cdna_coord(): + transcript = get_transcript("NM_015120.4") + for hgvs_str, genomic_coord in _HGVS_NM_015120_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if genomic_coord: + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord) + nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) + + def test_cdna_to_genomic_coord_negative_strand(): - HGVS_GRCh37_COORDS = [ - # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert - ("NM_001135649.3:c.300C>T", 88751754), - ("NM_001135649.3:c.301A>C", 88751753), - ("NM_001135649.3:c.302C>T", 88751752), - ("NM_001135649.3:c.303C>T", None), # N/A GRCh37 - 1 bp insert - ("NM_001135649.3:c.304T>C", 88751751), - ("NM_001135649.3:c.305T>C", 88751750), - ] transcript = get_transcript("NM_001135649.3") - for hgvs_str, expected_genomic_coord in HGVS_GRCh37_COORDS: + for hgvs_str, expected_genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS: hgvs_name = HGVSName(hgvs_str) if expected_genomic_coord: genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) @@ -53,6 +60,21 @@ def test_cdna_to_genomic_coord_negative_strand(): nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) +@nose.SkipTest +def test_genomic_to_cdna_coord_negative_strand(): + transcript = get_transcript("NM_001135649.3") + for hgvs_str, genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if genomic_coord: + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord) + nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) + + +def get_transcript(accession): + transcript_json = _transcripts[accession] + return make_transcript(transcript_json) + + _transcripts = { # GRCh37 "NM_015120.4": { diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 601baa8..4a4f298 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -6,7 +6,7 @@ from __future__ import unicode_literals from .models.variants import Position -from .models.transcript import Transcript, CDNA_Match +from .models.transcript import Transcript, CDNA_Match, Exon def read_refgene(infile): @@ -85,16 +85,24 @@ def make_transcript(transcript_json): transcript_json['cds_end'], transcript_json['strand'] == '+')) - cdna_match = transcript_json.get('cdna_match') - if cdna_match: - if not transcript.tx_position.is_forward_strand: - cdna_match = reversed(cdna_match) - else: - exons = transcript_json['exons'] - if not transcript.tx_position.is_forward_strand: - exons = reversed(exons) + exons = transcript_json['exons'] + cdna_match = transcript_json.get('cdna_match', []) + if not transcript.tx_position.is_forward_strand: + exons.reverse() + cdna_match.reverse() + + if not cdna_match: cdna_match = json_perfect_exons_to_cdna_match(exons) # Only use single=True once ALL has been implemented + for number, (exon_start, exon_end) in enumerate(exons, 1): + transcript.exons.append(Exon(transcript=transcript, + tx_position=Position( + transcript_json['chrom'], + exon_start, + exon_end, + transcript_json['strand'] == '+'), + number=number)) + for number, (exon_start, exon_end, cdna_start, cdna_end, gap) in enumerate(cdna_match, 1): transcript.cdna_match.append(CDNA_Match(transcript=transcript, tx_position=Position( From 5a4b48b2fccfea2c5fd9b64e12a6465009059730 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Thu, 14 Oct 2021 11:49:19 +1030 Subject: [PATCH 12/47] checkpoint - handle flanking --- pyhgvs/models/transcript.py | 78 +++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 82de220..089e79d 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -117,7 +117,6 @@ def find_stop_codon(self, cds_position): position = stop_pos - start else: position = stop - stop_pos - # TODO: I don't think this is right as we need to go in reverse through match on '-' strand?? cdna_pos += position + cdna_match.get_offset(position) break else: @@ -160,7 +159,6 @@ def cdna_to_genomic_coord(self, coord): cdna_end = cdna_start + cdna_match.length - 1 if cdna_start <= cdna_pos <= cdna_end: match_pos = cdna_pos - cdna_start - # TODO: I don't think this is right as we need to go in reverse through match on '-' strand?? match_pos -= cdna_match.get_offset(match_pos) # Compute genomic coordinate using offset. if transcript_strand: @@ -189,56 +187,40 @@ def genomic_to_cdna_coord(self, genomic_coord): return None strand = self.strand - - if strand == "+": - exons.sort(key=lambda exon: exon.chrom_start) - else: - exons.sort(key=lambda exon: -exon.chrom_end) - distances = [exon.distance(genomic_coord) for exon in exons] min_distance_to_exon = min(map(abs, distances)) + if min_distance_to_exon: + # We're outside of exon - so need to find closest point + for exon in exons: + distance = exon.distance(genomic_coord) + if abs(distance) == min_distance_to_exon: + # print(f"{exon}.distance({genomic_coord}) = {distance}") - coding_offset = 0 - for exon in exons: - exon_length = exon.chrom_end - exon.chrom_start - distance = exon.distance(genomic_coord) - if abs(distance) == min_distance_to_exon: - if strand == "+": - exon_start_cds_offset = coding_offset + 1 - exon_end_cds_offset = coding_offset + exon_length - else: - exon_start_cds_offset = coding_offset + exon_length - exon_end_cds_offset = coding_offset + 1 - # This is the exon we want to annotate against. - if distance == 0: - # Inside the exon. - if strand == "+": - coord = (exon_start_cds_offset + - (genomic_coord - - (exon.chrom_start + 1))) - else: - coord = (exon_end_cds_offset + - (exon.chrom_end - - genomic_coord)) - cdna_coord = CDNACoord(coord, 0) - else: # Outside the exon. if distance > 0: - nearest_exonic = exon_start_cds_offset + genomic_nearest_exon = exon.chrom_start + 1 else: - nearest_exonic = exon_end_cds_offset + genomic_nearest_exon = exon.chrom_end + if strand == "+": distance *= -1 + nearest_exon_coord = self._exon_genomic_to_cdna_coord(genomic_nearest_exon) + # If outside transcript, don't use offset. if (genomic_coord < self.tx_position.chrom_start + 1 or genomic_coord > self.tx_position.chrom_stop): - nearest_exonic += distance + print("OUTSIDE TRANSCRIPT - DON'T USE OFFSET") + nearest_exon_coord += distance distance = 0 - cdna_coord = CDNACoord(nearest_exonic, distance) - break - coding_offset += exon_length + cdna_coord = CDNACoord(nearest_exon_coord, distance) + break + else: + raise ValueError("Could not find closest exon!") # Should never happen + else: + coord = self._exon_genomic_to_cdna_coord(genomic_coord) + cdna_coord = CDNACoord(coord, 0) # Adjust coordinates for coding transcript. if self.is_coding: @@ -253,11 +235,31 @@ def genomic_to_cdna_coord(self, genomic_coord): stop_codon -= utr5p if (cdna_coord.coord > stop_codon or cdna_coord.coord == stop_codon and cdna_coord.offset > 0): + print(f"ADJUSTING - 5' UTR size: {utr5p} Stop codon={stop_codon}") cdna_coord.coord -= stop_codon cdna_coord.landmark = CDNA_STOP_CODON return cdna_coord + def _exon_genomic_to_cdna_coord(self, genomic_coord): + coding_offset = 0 + for cdna_match in self.ordered_cdna_match: + exon_length = cdna_match.tx_position.chrom_stop - cdna_match.tx_position.chrom_start + cds_start = coding_offset + 1 + cds_end = coding_offset + exon_length + + # Inside the exon. + if cdna_match.tx_position.chrom_start <= genomic_coord <= cdna_match.tx_position.chrom_stop: + #print(f"{genomic_coord} INSIDE EXON: {exon}") + if self.strand == "+": + position = genomic_coord - (cdna_match.tx_position.chrom_start + 1) + else: + position = cdna_match.tx_position.chrom_stop - genomic_coord + return cds_start + position # - cdna_match.get_offset(position) + coding_offset += exon_length + else: + raise ValueError(f"Couldn't find {genomic_coord=}!") + BED6Interval_base = namedtuple( "BED6Interval_base", ( From 5667511f5f45f6eea4448d292fd878fa59131988 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 15 Oct 2021 10:02:23 +1030 Subject: [PATCH 13/47] Finish off gap handling code --- pyhgvs/models/transcript.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 089e79d..8cd2e10 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -129,11 +129,12 @@ def find_stop_codon(self, cds_position): def cdna_to_genomic_coord(self, coord): """Convert a HGVS cDNA coordinate to a genomic coordinate.""" transcript_strand = self.tx_position.is_forward_strand - utr5p = (self.get_utr5p_size() - if self.is_coding else 0) # compute starting position along spliced transcript. if coord.landmark == CDNA_START_CODON: + utr5p = (self.get_utr5p_size() + if self.is_coding else 0) + if coord.coord > 0: cdna_pos = utr5p + coord.coord else: @@ -178,8 +179,7 @@ def cdna_to_genomic_coord(self, coord): return self.cds_position.chrom_start + 1 - coord.coord def genomic_to_cdna_coord(self, genomic_coord): - """Convert a genomic coordinate to a cDNA coordinate and offset. - """ + """ Convert a genomic coordinate to a cDNA coordinate and offset. """ exons = [exon.get_as_interval() for exon in self.ordered_exons] @@ -195,7 +195,6 @@ def genomic_to_cdna_coord(self, genomic_coord): for exon in exons: distance = exon.distance(genomic_coord) if abs(distance) == min_distance_to_exon: - # print(f"{exon}.distance({genomic_coord}) = {distance}") # Outside the exon. if distance > 0: @@ -211,7 +210,6 @@ def genomic_to_cdna_coord(self, genomic_coord): # If outside transcript, don't use offset. if (genomic_coord < self.tx_position.chrom_start + 1 or genomic_coord > self.tx_position.chrom_stop): - print("OUTSIDE TRANSCRIPT - DON'T USE OFFSET") nearest_exon_coord += distance distance = 0 cdna_coord = CDNACoord(nearest_exon_coord, distance) @@ -235,7 +233,6 @@ def genomic_to_cdna_coord(self, genomic_coord): stop_codon -= utr5p if (cdna_coord.coord > stop_codon or cdna_coord.coord == stop_codon and cdna_coord.offset > 0): - print(f"ADJUSTING - 5' UTR size: {utr5p} Stop codon={stop_codon}") cdna_coord.coord -= stop_codon cdna_coord.landmark = CDNA_STOP_CODON @@ -244,19 +241,16 @@ def genomic_to_cdna_coord(self, genomic_coord): def _exon_genomic_to_cdna_coord(self, genomic_coord): coding_offset = 0 for cdna_match in self.ordered_cdna_match: - exon_length = cdna_match.tx_position.chrom_stop - cdna_match.tx_position.chrom_start cds_start = coding_offset + 1 - cds_end = coding_offset + exon_length # Inside the exon. if cdna_match.tx_position.chrom_start <= genomic_coord <= cdna_match.tx_position.chrom_stop: - #print(f"{genomic_coord} INSIDE EXON: {exon}") if self.strand == "+": position = genomic_coord - (cdna_match.tx_position.chrom_start + 1) else: position = cdna_match.tx_position.chrom_stop - genomic_coord - return cds_start + position # - cdna_match.get_offset(position) - coding_offset += exon_length + return cds_start + position - cdna_match.get_offset(position) + coding_offset += cdna_match.length else: raise ValueError(f"Couldn't find {genomic_coord=}!") From c1f2099e99f28ff42146a3d6340a0c90c7088895 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 15 Oct 2021 10:29:43 +1030 Subject: [PATCH 14/47] linting --- pyhgvs/models/hgvs_name.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py index efe9471..cc20682 100644 --- a/pyhgvs/models/hgvs_name.py +++ b/pyhgvs/models/hgvs_name.py @@ -256,6 +256,7 @@ def get_refseq_type(name): prefix = name[:3] return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0] + class InvalidHGVSName(ValueError): def __init__(self, name='', part='name', reason=''): if name: @@ -344,7 +345,7 @@ def parse_prefix(self, prefix, kind): # Transcript and gene given with parens. # example: NM_007294.3(BRCA1):c.2207A>C - match = re.match("^(?P[^(]+)\((?P[^)]+)\)$", prefix) + match = re.match(r"^(?P[^(]+)\((?P[^)]+)\)$", prefix) if match: self.transcript = match.group('transcript') self.gene = match.group('gene') @@ -352,7 +353,7 @@ def parse_prefix(self, prefix, kind): # Transcript and gene given with braces. # example: BRCA1{NM_007294.3}:c.2207A>C - match = re.match("^(?P[^{]+)\{(?P[^}]+)\}$", prefix) + match = re.match(r"^(?P[^{]+){(?P[^}]+)}$", prefix) if match: self.transcript = match.group('transcript') self.gene = match.group('gene') From dc6d638a761cf676e67cfdbbff2ab95fdbf63246 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 15 Oct 2021 10:31:19 +1030 Subject: [PATCH 15/47] Keep module interface the same --- pyhgvs/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index b3004b2..b092787 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -15,7 +15,7 @@ from .models.cdna import CDNACoord from .models.genome import GenomeSubset -from .models.hgvs_name import HGVSName +from .models.hgvs_name import HGVSName, InvalidHGVSName from .models.variants import justify_indel, normalize_variant, revcomp From 0d6ddd56b089a4daedcacddb11c96172007c0cc7 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 15 Oct 2021 10:38:33 +1030 Subject: [PATCH 16/47] Better errors, note about not using single=True --- pyhgvs/models/transcript.py | 2 +- pyhgvs/utils.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 8cd2e10..d04f917 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -96,7 +96,7 @@ def get_utr5p_size(self): break cdna_len += cdna_match.length else: - raise ValueError("transcript contains no cdna_match") + raise ValueError(f"Couldn't find start_codon in cdna_match: {self.ordered_cdna_match}") return cdna_len diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 4a4f298..ced1e10 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -92,7 +92,7 @@ def make_transcript(transcript_json): cdna_match.reverse() if not cdna_match: - cdna_match = json_perfect_exons_to_cdna_match(exons) # Only use single=True once ALL has been implemented + cdna_match = json_perfect_exons_to_cdna_match(exons) for number, (exon_start, exon_end) in enumerate(exons, 1): transcript.exons.append(Exon(transcript=transcript, @@ -119,7 +119,9 @@ def make_transcript(transcript_json): def json_perfect_exons_to_cdna_match(ordered_exons, single=False): - """ Perfectly matched exons are basically a no-gap case of cDNA match """ + """ Perfectly matched exons are basically a no-gap case of cDNA match + single - use a single cDNA match (deletions for introns) - this is currently broken do not use + """ cdna_match = [] if single: ordered_exons = list(ordered_exons) From 79fa49b380b8e189fa95ff389c5a23ca6d2d4a29 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 15 Oct 2021 12:17:59 +1030 Subject: [PATCH 17/47] Keep module interface the same --- pyhgvs/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index b092787..70a9f37 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -13,7 +13,7 @@ import re -from .models.cdna import CDNACoord +from .models.cdna import CDNACoord, CDNA_START_CODON, CDNA_STOP_CODON from .models.genome import GenomeSubset from .models.hgvs_name import HGVSName, InvalidHGVSName from .models.variants import justify_indel, normalize_variant, revcomp From 2145f96c05d12ebf99ee10f6329ec8bcedf1551e Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 15 Oct 2021 16:58:00 +1030 Subject: [PATCH 18/47] Don't die in gaps when going the other way. Change testing data --- pyhgvs/models/transcript.py | 11 ++++---- pyhgvs/tests/test_cdna_match.py | 49 +++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index d04f917..4207f00 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -215,7 +215,7 @@ def genomic_to_cdna_coord(self, genomic_coord): cdna_coord = CDNACoord(nearest_exon_coord, distance) break else: - raise ValueError("Could not find closest exon!") # Should never happen + raise ValueError("Could not find closest exon!") # Should never happen else: coord = self._exon_genomic_to_cdna_coord(genomic_coord) cdna_coord = CDNACoord(coord, 0) @@ -249,7 +249,8 @@ def _exon_genomic_to_cdna_coord(self, genomic_coord): position = genomic_coord - (cdna_match.tx_position.chrom_start + 1) else: position = cdna_match.tx_position.chrom_stop - genomic_coord - return cds_start + position - cdna_match.get_offset(position) + offset = cdna_match.get_offset(position, validate=False) + return cds_start + position + offset coding_offset += cdna_match.length else: raise ValueError(f"Couldn't find {genomic_coord=}!") @@ -354,7 +355,7 @@ def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number): def length(self): return self.cdna_end - self.cdna_start + 1 - def get_offset(self, position: int): + def get_offset(self, position: int, validate=True): """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute codes operation @@ -377,11 +378,11 @@ def get_offset(self, position: int): if code == "M": cdna_match_index += length elif code == "I": - if position_1_based < cdna_match_index + length: + if validate and position_1_based < cdna_match_index + length: raise ValueError("Coordinate (%d) inside insertion (%s) - no mapping possible!" % (position_1_based, gap_op)) offset += length elif code == "D": - if position < cdna_match_index + length: + if validate and position < cdna_match_index + length: raise ValueError("Coordinate (%d) inside deletion (%s) - no mapping possible!" % (position_1_based, gap_op)) offset -= length else: diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py index 7e35ea5..883b867 100644 --- a/pyhgvs/tests/test_cdna_match.py +++ b/pyhgvs/tests/test_cdna_match.py @@ -18,7 +18,7 @@ ] _HGVS_NM_001135649_GRCh37_COORDS = [ - # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.618 then 3 bases insert + # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.302 then 1 base insert ("NM_001135649.3:c.300C>T", 88751754), ("NM_001135649.3:c.301A>C", 88751753), ("NM_001135649.3:c.302C>T", 88751752), @@ -27,6 +27,13 @@ ("NM_001135649.3:c.305T>C", 88751750), ] +_HGVS_NM_001012755_GRCh37_COORDS = [ + # 133, 6207, "M1574 I1 M4500 - 5'UTR length is 163 so last match is transcript pos (1574+132) ie c.1543 then 1bp ins + ("NM_001012755.5:c.*619G>C", 103348398), # NM_001012755.5:c.1543G>C + ("NM_001012755.5:c.*620G>C", None), # NM_001012755.5:c.1544G>C - No GRCh37 + ("NM_001012755.5:c.*621C>G", 103348397), # NM_001012755.5:c.1545G>C +] + def test_cdna_to_genomic_coord(): transcript = get_transcript("NM_015120.4") @@ -39,7 +46,6 @@ def test_cdna_to_genomic_coord(): nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) -@nose.SkipTest def test_genomic_to_cdna_coord(): transcript = get_transcript("NM_015120.4") for hgvs_str, genomic_coord in _HGVS_NM_015120_GRCh37_COORDS: @@ -60,7 +66,19 @@ def test_cdna_to_genomic_coord_negative_strand(): nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) -@nose.SkipTest +def test_cdna_to_genomic_coord_negative_strand2(): + transcript = get_transcript("NM_001012755.5") + for hgvs_str, expected_genomic_coord in _HGVS_NM_001012755_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if expected_genomic_coord: + genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) + nose.tools.assert_equal(genomic_coord, expected_genomic_coord) + else: + nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) + + + +@nose.SkipTest # Currently fails def test_genomic_to_cdna_coord_negative_strand(): transcript = get_transcript("NM_001135649.3") for hgvs_str, genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS: @@ -70,6 +88,17 @@ def test_genomic_to_cdna_coord_negative_strand(): nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) +def test_genomic_to_cdna_coord_negative_strand2(): + transcript = get_transcript("NM_001012755.5") + + for hgvs_str, genomic_coord in _HGVS_NM_001012755_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if genomic_coord: + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord) + nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) + + + def get_transcript(accession): transcript_json = _transcripts[accession] return make_transcript(transcript_json) @@ -106,6 +135,8 @@ def get_transcript(accession): [73836694, 73837046, 12577, 12928, None]] }, "NM_001135649.3": { + # This is a strange case - there are 3 exons, but 2 cDNA match entries + # I am not sure if this is an error, ie where are the splice sites? "id": "NM_001135649.3", "gene_name": "FOXI3", "end": 88752211, @@ -116,5 +147,17 @@ def get_transcript(accession): "cds_end": 88752053, "cds_start": 88747725, "cdna_match": [[88746305, 88748348, 799, 2841, None], [88751414, 88752211, 1, 798, "M460 I1 M337"]] + }, + "NM_001012755.5": { + "id": "NM_001012755.5", + "gene_name": "SLC25A53", + "end": 103401690, + "chrom": "NC_000023.10", + "exons": [[103343897, 103349971], [103401558, 103401690]], + "start": 103343897, + "strand": "-", + "cds_end": 103349940, + "cds_start": 103349016, + "cdna_match": [[103343897, 103349971, 133, 6207, "M1574 I1 M4500"], [103401558, 103401690, 1, 132, None]] } } From d58789b72ba585f2a451c6d391d73246c83513ae Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 15 Oct 2021 17:08:26 +1030 Subject: [PATCH 19/47] Always use cDNA as exons, to handle those strange cases like eg NM_001135649.3 --- pyhgvs/models/transcript.py | 15 +-------------- pyhgvs/tests/test_cdna_match.py | 6 ++---- pyhgvs/utils.py | 11 ++--------- 3 files changed, 5 insertions(+), 27 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 4207f00..cdee8a4 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -61,19 +61,6 @@ def ordered_cdna_match(self): cdna_match.reverse() return cdna_match - @lazy - def ordered_exons(self): - """Yield exons in coding order.""" - transcript_strand = self.tx_position.is_forward_strand - if hasattr(self.exons, 'select_related'): - exons = list(self.exons.select_related('tx_position')) - else: - exons = list(self.exons) - exons.sort(key=lambda exon: exon.tx_position.chrom_start) - if not transcript_strand: - exons.reverse() - return exons - def get_utr5p_size(self): """Return the size of the 5prime UTR of a transcript.""" @@ -181,7 +168,7 @@ def cdna_to_genomic_coord(self, coord): def genomic_to_cdna_coord(self, genomic_coord): """ Convert a genomic coordinate to a cDNA coordinate and offset. """ exons = [exon.get_as_interval() - for exon in self.ordered_exons] + for exon in self.ordered_cdna_match] if len(exons) == 0: return None diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py index 883b867..bbaf17f 100644 --- a/pyhgvs/tests/test_cdna_match.py +++ b/pyhgvs/tests/test_cdna_match.py @@ -77,8 +77,6 @@ def test_cdna_to_genomic_coord_negative_strand2(): nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) - -@nose.SkipTest # Currently fails def test_genomic_to_cdna_coord_negative_strand(): transcript = get_transcript("NM_001135649.3") for hgvs_str, genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS: @@ -98,7 +96,6 @@ def test_genomic_to_cdna_coord_negative_strand2(): nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) - def get_transcript(accession): transcript_json = _transcripts[accession] return make_transcript(transcript_json) @@ -136,7 +133,8 @@ def get_transcript(accession): }, "NM_001135649.3": { # This is a strange case - there are 3 exons, but 2 cDNA match entries - # I am not sure if this is an error, ie where are the splice sites? + # I think the exons are in error, as eg where is the splice site? + # So - we'll just use the cDNA as exons "id": "NM_001135649.3", "gene_name": "FOXI3", "end": 88752211, diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index ced1e10..185f49f 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -91,18 +91,11 @@ def make_transcript(transcript_json): exons.reverse() cdna_match.reverse() + # We don't use exons, but run everything through cDNA match so there's just 1 path + # exons are treated as a perfect cDNA match if not cdna_match: cdna_match = json_perfect_exons_to_cdna_match(exons) - for number, (exon_start, exon_end) in enumerate(exons, 1): - transcript.exons.append(Exon(transcript=transcript, - tx_position=Position( - transcript_json['chrom'], - exon_start, - exon_end, - transcript_json['strand'] == '+'), - number=number)) - for number, (exon_start, exon_end, cdna_start, cdna_end, gap) in enumerate(cdna_match, 1): transcript.cdna_match.append(CDNA_Match(transcript=transcript, tx_position=Position( From 2dbd69ddf0df58eef6b03b8abddcefd7739ca72d Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Mon, 18 Oct 2021 13:43:21 +1030 Subject: [PATCH 20/47] Make sure to include models in package --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6ed2380..c468888 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def main(): long_description=description, author='Matt Rasmussen', author_email='rasmus@counsyl.com', - packages=[str('pyhgvs'), str('pyhgvs.tests')], + packages=[str('pyhgvs'), str('pyhgvs.models'), str('pyhgvs.tests')], include_package_data=True, scripts=[], install_requires=['pip>=1.2'], From 4271aa7680182dd57356e17d4a04f9b1df59bdfb Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 22 Oct 2021 14:03:28 +1030 Subject: [PATCH 21/47] Better error message --- pyhgvs/models/transcript.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index cdee8a4..89018c9 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -83,7 +83,13 @@ def get_utr5p_size(self): break cdna_len += cdna_match.length else: - raise ValueError(f"Couldn't find start_codon in cdna_match: {self.ordered_cdna_match}") + cdna_matches = [] + for cdna_match in self.ordered_cdna_match: + start = cdna_match.tx_position.chrom_start + end = cdna_match.tx_position.chrom_stop + cdna_matches.append(f"{start}-{end}") + cdna_match_summary = ", ".join(cdna_matches) + raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary)) return cdna_len From cc3de4d3db843182d1a190a8e74103b916592622 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 22 Oct 2021 16:24:54 +1030 Subject: [PATCH 22/47] Data generation scripts --- .../ensembl_gene_annotation_grch37.sh | 28 ++++ .../ensembl_gene_annotation_grch38.sh | 37 +++++ .../pyreference_to_pyhgvs_json.py | 131 ++++++++++++++++++ .../refseq_gene_annotation_grch37.sh | 75 ++++++++++ .../refseq_gene_annotation_grch38.sh | 90 ++++++++++++ 5 files changed, 361 insertions(+) create mode 100755 generate_transcript_data/ensembl_gene_annotation_grch37.sh create mode 100755 generate_transcript_data/ensembl_gene_annotation_grch38.sh create mode 100644 generate_transcript_data/pyreference_to_pyhgvs_json.py create mode 100755 generate_transcript_data/refseq_gene_annotation_grch37.sh create mode 100755 generate_transcript_data/refseq_gene_annotation_grch38.sh diff --git a/generate_transcript_data/ensembl_gene_annotation_grch37.sh b/generate_transcript_data/ensembl_gene_annotation_grch37.sh new file mode 100755 index 0000000..97ee668 --- /dev/null +++ b/generate_transcript_data/ensembl_gene_annotation_grch37.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them + +#82 is first GFF3 for GRCh37 +#83 has no data +#84 is 82 again +#86 is 85 again +pyreference_args=() +for release in 82 85 87; do + filename=Homo_sapiens.GRCh37.${release}.gff3.gz + url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename} + pyreference_file=${filename}.json.gz + if [[ ! -e ${filename} ]]; then + wget ${url} + fi + if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" + fi + pyreference_args+=(--pyreference-json ${pyreference_file}) +done + +merged_file="pyhgvs_transcripts_ensembl_grch37.json.gz" +if [[ ! -e ${merged_file} ]]; then + BASE_DIR=$(dirname ${BASH_SOURCE[0]}) + + python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} +fi diff --git a/generate_transcript_data/ensembl_gene_annotation_grch38.sh b/generate_transcript_data/ensembl_gene_annotation_grch38.sh new file mode 100755 index 0000000..9f8c0a0 --- /dev/null +++ b/generate_transcript_data/ensembl_gene_annotation_grch38.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Skip earlier GTFs as they don't have versions +#for release in 76 77 78 79 80; do +# filename=Homo_sapiens.GRCh38.${release}.gtf.gz +# url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename} +# if [[ ! -e ${filename} ]]; then +# wget ${url} +# fi + +# if [[ ! -e ${filename}.json.gz ]]; then +# pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +# fi +#done + +#81 is first GFF3 for GRCh38 +pyreference_args=() +for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do + filename=Homo_sapiens.GRCh38.${release}.gff3.gz + url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename} + pyreference_file=${filename}.json.gz + + if [[ ! -e ${filename} ]]; then + wget ${url} + fi + if [[ ! -e ${filename}.json.gz ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" + fi + pyreference_args+=(--pyreference-json ${pyreference_file}) +done + +merged_file="pyhgvs_transcripts_ensembl_grch38.json.gz" +if [[ ! -e ${merged_file} ]]; then + BASE_DIR=$(dirname ${BASH_SOURCE[0]}) + + python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} +fi diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py new file mode 100644 index 0000000..44574d0 --- /dev/null +++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py @@ -0,0 +1,131 @@ +""" + Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load w/PyHGVS + + @see https://bitbucket.org/sacgf/pyreference/ + + Dave Lawrence (davmlaw@gmail.com) on 22/10/2021 +""" +import gzip +import json +from argparse import ArgumentParser +from typing import Dict + + +def handle_args(): + parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for PyHGVS') + parser.add_argument('--pyreference-json', required=True, nargs="+", action="extend", + help='PyReference JSON.gz - list OLDEST to NEWEST (newest is kept)') + parser.add_argument('--output', required=True, help='Output filename') + return parser.parse_args() + + +def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict: + gene_version_data = { + 'biotype': ",".join(gene_data["biotype"]), + 'description': gene_data.get("description"), + 'gene_symbol': gene_data["name"], + } + + if hgnc_str := gene_data.get("HGNC"): + # Has HGNC: (5 characters) at start of it + gene_version_data["hgnc"] = hgnc_str[5:] + + # Only Ensembl Genes have versions + if version := gene_data.get("version"): + gene_data["version"] = version + + return gene_version_data + + +def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: + start = transcript_data["start"] + end = transcript_data["stop"] + strand = transcript_data["strand"] + # PyHGVS has cds_start/cds_end be equal to start/end for non-coding transcripts + cds_start = transcript_data.get("cds_start", start) + cds_end = transcript_data.get("cds_end", end) + # PyHGVS exons are in genomic order, PyReference are in stranded + features = transcript_data["features_by_type"] + exons = [[ed["start"], ed["stop"]] for ed in features["exon"]] + cdna_match = [] + for cdm in features.get("cDNA_match", []): + if "cdna_strand" in cdm: # None in Human RefSeq so haven't handled + raise ValueError("Haven't handled stranded Target alignment") + cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]]) + + if strand == '-': + exons.reverse() + cdna_match.reverse() + + pyhgvs_data = { + 'chrom': transcript_data["chr"], + 'start': start, + 'end': end, + 'strand': strand, + 'cds_start': cds_start, + 'cds_end': cds_end, + 'exons': exons, + } + + # Optional stuff + if cdna_match: + pyhgvs_data["cdna_match"] = cdna_match + if transcript_data.get("partial"): + pyhgvs_data["partial"] = 1 + + # Few extra fields (pyHGVS doesn't currently use) + pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"]) + return pyhgvs_data + + +def main(): + args = handle_args() + + gene_versions = {} # We only keep those that are in the latest transcript version + transcript_versions = {} + + for pyref_filename in args.pyreference_json: + print(f"Loading '{pyref_filename}'") + with gzip.open(pyref_filename) as f: + pyref_data = json.load(f) + + url = pyref_data["reference_gtf"]["url"] + + # PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes) + transcript_gene_version = {} + + for gene_id, gene in pyref_data["genes_by_id"].items(): + if version := gene.get("version"): + gene_accession = f"{gene_id}.{version}" + else: + gene_accession = gene_id + + gene_version = convert_gene_pyreference_to_gene_version_data(gene) + gene_version["url"] = url + gene_versions[gene_accession] = gene_version + + for transcript_accession in gene["transcripts"]: + transcript_gene_version[transcript_accession] = gene_accession + + for transcript_accession, transcript in pyref_data["transcripts_by_id"].items(): + hgvs_data = convert_transcript_pyreference_to_pyhgvs(transcript) + gene_accession = transcript_gene_version[transcript_accession] + gene_version = gene_versions[gene_accession] + hgvs_data["id"] = transcript_accession + hgvs_data["gene_version"] = gene_accession + hgvs_data["gene_name"] = gene_version["gene_symbol"] + hgvs_data["url"] = url + transcript_versions[transcript_accession] = hgvs_data + + print("Writing PyHGVS data") + with gzip.open(args.output, 'w') as outfile: + data = { + "transcripts": transcript_versions, + "genes": gene_versions, + } + json_str = json.dumps(data) + outfile.write(json_str.encode('ascii')) + + +if __name__ == '__main__': + main() diff --git a/generate_transcript_data/refseq_gene_annotation_grch37.sh b/generate_transcript_data/refseq_gene_annotation_grch37.sh new file mode 100755 index 0000000..c0846fc --- /dev/null +++ b/generate_transcript_data/refseq_gene_annotation_grch37.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +pyreference_args=() + +# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok + +filename=ref_GRCh37.p5_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename} +pyreference_file=${filename}.json.gz +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +filename=ref_GRCh37.p9_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename} +pyreference_file=${filename}.json.gz +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +filename=ref_GRCh37.p10_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename} +pyreference_file=${filename}.json.gz +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +filename=ref_GRCh37.p13_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename} +pyreference_file=${filename}.json.gz +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +# These all have the same name, so rename them based on release ID +for release in 105.20190906 105.20201022; do + filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz + url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz + pyreference_file=${filename}.json.gz + if [[ ! -e ${filename} ]]; then + wget ${url} --output-document=${filename} + fi + if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" + fi + pyreference_args+=(--pyreference-json ${pyreference_file}) +done + + +merged_file="pyhgvs_transcripts_refseq_grch37.json.gz" +if [[ ! -e ${merged_file} ]]; then + BASE_DIR=$(dirname ${BASH_SOURCE[0]}) + + python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} +fi diff --git a/generate_transcript_data/refseq_gene_annotation_grch38.sh b/generate_transcript_data/refseq_gene_annotation_grch38.sh new file mode 100755 index 0000000..a4682fe --- /dev/null +++ b/generate_transcript_data/refseq_gene_annotation_grch38.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok + +filename=ref_GRCh38_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/${filename} +pyreference_file=${filename}.json.gz + +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +filename=ref_GRCh38.p2_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/${filename} +pyreference_file=${filename}.json.gz + +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +filename=ref_GRCh38.p7_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/${filename} +pyreference_file=${filename}.json.gz + +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +filename=ref_GRCh38.p12_top_level.gff3.gz +url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/${filename} +pyreference_file=${filename}.json.gz + +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +filename=GCF_000001405.38_GRCh38.p12_genomic.gff.gz +url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/${filename} +pyreference_file=${filename}.json.gz + +if [[ ! -e ${filename} ]]; then + wget ${url} +fi +if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" +fi +pyreference_args+=(--pyreference-json ${pyreference_file}) + + +# These all have the same name, so rename them based on release ID +for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514; do + filename=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz + url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz + pyreference_file=${filename}.json.gz + if [[ ! -e ${filename} ]]; then + wget ${url} --output-document=${filename} + fi + if [[ ! -e ${pyreference_file} ]]; then + pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" + fi + pyreference_args+=(--pyreference-json ${pyreference_file}) +done + + +merged_file="pyhgvs_transcripts_refseq_grch38.json.gz" +if [[ ! -e ${merged_file} ]]; then + BASE_DIR=$(dirname ${BASH_SOURCE[0]}) + + python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} +fi From 2e71ba930366cedcf7a728b54d242fa3a479e392 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Wed, 27 Oct 2021 16:17:23 +1030 Subject: [PATCH 23/47] Keep track of what other contigs we have coordinates for (eg chrX and chrY) --- generate_transcript_data/pyreference_to_pyhgvs_json.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py index 44574d0..82ebabe 100644 --- a/generate_transcript_data/pyreference_to_pyhgvs_json.py +++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py @@ -72,6 +72,9 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: pyhgvs_data["cdna_match"] = cdna_match if transcript_data.get("partial"): pyhgvs_data["partial"] = 1 + other_chroms = transcript_data.get("other_chroms") + if other_chroms: + pyhgvs_data["other_chroms"] = other_chroms # Few extra fields (pyHGVS doesn't currently use) pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"]) From 1337350e9b1c54077e73b89f064a7cca0e143ef8 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Thu, 28 Oct 2021 15:40:42 +1030 Subject: [PATCH 24/47] For non-coding, PyHGVS has cds_start = end not start (so coding length = 0) --- generate_transcript_data/pyreference_to_pyhgvs_json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py index 82ebabe..d53edf6 100644 --- a/generate_transcript_data/pyreference_to_pyhgvs_json.py +++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py @@ -41,8 +41,8 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: start = transcript_data["start"] end = transcript_data["stop"] strand = transcript_data["strand"] - # PyHGVS has cds_start/cds_end be equal to start/end for non-coding transcripts - cds_start = transcript_data.get("cds_start", start) + # PyHGVS has cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0) + cds_start = transcript_data.get("cds_start", end) cds_end = transcript_data.get("cds_end", end) # PyHGVS exons are in genomic order, PyReference are in stranded features = transcript_data["features_by_type"] From 6c5e381c6f34ec7288016ab31f68c8d4dd18cd69 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 10 Nov 2021 14:18:18 +1030 Subject: [PATCH 25/47] Use transcript offsets rather than exon lengths so we handle gaps properly --- pyhgvs/models/transcript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 89018c9..38a8bbb 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -113,7 +113,7 @@ def find_stop_codon(self, cds_position): cdna_pos += position + cdna_match.get_offset(position) break else: - cdna_pos += stop - start + cdna_pos = cdna_match.cdna_end else: raise ValueError('Stop codon is not in any of the exons') From 2ca3e08d2437ce9fe709940c2da7160a6ebcc011 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 10 Nov 2021 14:47:24 +1030 Subject: [PATCH 26/47] Simplify and make consistent --- pyhgvs/models/transcript.py | 55 +++++++++++++++---------------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 38a8bbb..8e0d7c0 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -69,7 +69,7 @@ def get_utr5p_size(self): # Find the exon containing the start codon. start_codon = (self.cds_position.chrom_start if transcript_strand else self.cds_position.chrom_stop - 1) - cdna_len = 0 + cdna_offset = 0 for cdna_match in self.ordered_cdna_match: start = cdna_match.tx_position.chrom_start end = cdna_match.tx_position.chrom_stop @@ -79,19 +79,17 @@ def get_utr5p_size(self): position = start_codon - start else: position = end - start_codon - 1 - cdna_len += position + cdna_match.get_offset(position) - break - cdna_len += cdna_match.length - else: - cdna_matches = [] - for cdna_match in self.ordered_cdna_match: - start = cdna_match.tx_position.chrom_start - end = cdna_match.tx_position.chrom_stop - cdna_matches.append(f"{start}-{end}") - cdna_match_summary = ", ".join(cdna_matches) - raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary)) + return cdna_offset + position + cdna_match.get_offset(position) + cdna_offset = cdna_match.cdna_end - return cdna_len + # Couldn't find it + cdna_matches = [] + for cdna_match in self.ordered_cdna_match: + start = cdna_match.tx_position.chrom_start + end = cdna_match.tx_position.chrom_stop + cdna_matches.append(f"{start}-{end}") + cdna_match_summary = ", ".join(cdna_matches) + raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary)) def find_stop_codon(self, cds_position): """Return the position along the cDNA of the base after the stop codon.""" @@ -99,7 +97,7 @@ def find_stop_codon(self, cds_position): stop_pos = cds_position.chrom_stop else: stop_pos = cds_position.chrom_start - cdna_pos = 0 + cdna_offset = 0 for cdna_match in self.ordered_cdna_match: start = cdna_match.tx_position.chrom_start stop = cdna_match.tx_position.chrom_stop @@ -110,14 +108,10 @@ def find_stop_codon(self, cds_position): position = stop_pos - start else: position = stop - stop_pos - cdna_pos += position + cdna_match.get_offset(position) - break + return cdna_offset + position + cdna_match.get_offset(position) else: - cdna_pos = cdna_match.cdna_end - else: - raise ValueError('Stop codon is not in any of the exons') - - return cdna_pos + cdna_offset = cdna_match.cdna_end + raise ValueError('Stop codon is not in any of the exons') def cdna_to_genomic_coord(self, coord): """Convert a HGVS cDNA coordinate to a genomic coordinate.""" @@ -148,11 +142,9 @@ def cdna_to_genomic_coord(self, coord): return self.tx_position.chrom_stop - cdna_pos + 1 # Walk along transcript until we find an exon that contains cdna_pos. - cdna_start = 1 for cdna_match in self.ordered_cdna_match: - cdna_end = cdna_start + cdna_match.length - 1 - if cdna_start <= cdna_pos <= cdna_end: - match_pos = cdna_pos - cdna_start + if cdna_match.cdna_start <= cdna_pos <= cdna_match.cdna_end: + match_pos = cdna_pos - cdna_match.cdna_start match_pos -= cdna_match.get_offset(match_pos) # Compute genomic coordinate using offset. if transcript_strand: @@ -163,7 +155,6 @@ def cdna_to_genomic_coord(self, coord): # Minus strand. end = cdna_match.tx_position.chrom_stop return end - match_pos - coord.offset - cdna_start = cdna_end + 1 else: # 3' flanking sequence (no need to account for gaps) if transcript_strand: @@ -232,10 +223,8 @@ def genomic_to_cdna_coord(self, genomic_coord): return cdna_coord def _exon_genomic_to_cdna_coord(self, genomic_coord): - coding_offset = 0 + cdna_offset = 0 for cdna_match in self.ordered_cdna_match: - cds_start = coding_offset + 1 - # Inside the exon. if cdna_match.tx_position.chrom_start <= genomic_coord <= cdna_match.tx_position.chrom_stop: if self.strand == "+": @@ -243,10 +232,10 @@ def _exon_genomic_to_cdna_coord(self, genomic_coord): else: position = cdna_match.tx_position.chrom_stop - genomic_coord offset = cdna_match.get_offset(position, validate=False) - return cds_start + position + offset - coding_offset += cdna_match.length - else: - raise ValueError(f"Couldn't find {genomic_coord=}!") + return cdna_offset + position + offset + 1 + cdna_offset = cdna_match.cdna_end + + raise ValueError(f"Couldn't find {genomic_coord=}!") BED6Interval_base = namedtuple( From b4571a7cff67ff2844668a3a24327aff9c243938 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 10 Nov 2021 17:08:06 +1030 Subject: [PATCH 27/47] Previous change didn't work on negative strand --- pyhgvs/models/transcript.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 8e0d7c0..4f7c0b9 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -80,7 +80,7 @@ def get_utr5p_size(self): else: position = end - start_codon - 1 return cdna_offset + position + cdna_match.get_offset(position) - cdna_offset = cdna_match.cdna_end + cdna_offset += cdna_match.length # Couldn't find it cdna_matches = [] @@ -110,7 +110,7 @@ def find_stop_codon(self, cds_position): position = stop - stop_pos return cdna_offset + position + cdna_match.get_offset(position) else: - cdna_offset = cdna_match.cdna_end + cdna_offset += cdna_match.length raise ValueError('Stop codon is not in any of the exons') def cdna_to_genomic_coord(self, coord): @@ -233,7 +233,7 @@ def _exon_genomic_to_cdna_coord(self, genomic_coord): position = cdna_match.tx_position.chrom_stop - genomic_coord offset = cdna_match.get_offset(position, validate=False) return cdna_offset + position + offset + 1 - cdna_offset = cdna_match.cdna_end + cdna_offset += cdna_match.length raise ValueError(f"Couldn't find {genomic_coord=}!") From bb17f436bcb522e3c7e98d3c8fc687ff15b8708d Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 10 Nov 2021 18:28:26 +1030 Subject: [PATCH 28/47] Make sure exons are sorted correctly --- pyhgvs/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 185f49f..e6d6250 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -5,6 +5,7 @@ from __future__ import absolute_import from __future__ import unicode_literals +import operator from .models.variants import Position from .models.transcript import Transcript, CDNA_Match, Exon @@ -86,7 +87,10 @@ def make_transcript(transcript_json): transcript_json['strand'] == '+')) exons = transcript_json['exons'] + exons.sort(key=operator.itemgetter(0)) cdna_match = transcript_json.get('cdna_match', []) + cdna_match.sort(key=operator.itemgetter(0)) + if not transcript.tx_position.is_forward_strand: exons.reverse() cdna_match.reverse() From dbc621f14555cf79be84082fa7e7b635795e67ae Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Mon, 15 Nov 2021 10:58:35 +1030 Subject: [PATCH 29/47] Support LRG transcripts (was setting as gene not transcripts if no gene provided) --- pyhgvs/models/hgvs_name.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py index cc20682..313bd2e 100644 --- a/pyhgvs/models/hgvs_name.py +++ b/pyhgvs/models/hgvs_name.py @@ -364,6 +364,11 @@ def parse_prefix(self, prefix, kind): self.transcript = prefix return + # Determine using LRG type. + if prefix.startswith('LRG_'): + self.transcript = prefix + return + # Determine using refseq type. refseq_type = get_refseq_type(prefix) if refseq_type in ('mRNA', 'RNA'): From 1409aba014c878d9e58874cdb86a2b04c892e169 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 16 Nov 2021 15:33:18 +1030 Subject: [PATCH 30/47] Use ref for alt on any reference HGVS --- pyhgvs/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index 70a9f37..ff31c73 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -52,7 +52,7 @@ def get_vcf_allele(hgvs, genome, transcript=None): # Sometimes we need to retrieve alt from reference # Eg NC_000001.11:g.169549811= - if hgvs.mutation_type == "=" and alt == 'N': + if hgvs.mutation_type == "=": alt = ref if hgvs.mutation_type in _indel_mutation_types: From 9ceb6498b8aa99481ea1719589c15118e00919c4 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 16 Nov 2021 18:07:10 +1030 Subject: [PATCH 31/47] Validate coordinate span and reference are the same, as eg c.1727_1740dup14 should pass but c.1727_1740dup9999 should fail --- pyhgvs/models/hgvs_name.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py index 313bd2e..9581dda 100644 --- a/pyhgvs/models/hgvs_name.py +++ b/pyhgvs/models/hgvs_name.py @@ -542,7 +542,7 @@ def parse_genome(self, details): self.ref_allele = groups.get('ref', '') self.alt_allele = groups.get('alt', '') - # Convert numerical allelles. + # Convert numerical alleles. if self.ref_allele.isdigit(): self.ref_allele = "N" * int(self.ref_allele) if self.alt_allele.isdigit(): @@ -738,9 +738,6 @@ def get_raw_coords(self, transcript=None): end = transcript.cdna_to_genomic_coord(self.cdna_end) if not transcript.tx_position.is_forward_strand: - if end > start: - raise AssertionError( - "cdna_start cannot be greater than cdna_end") start, end = end, start if start > end: @@ -755,6 +752,13 @@ def get_raw_coords(self, transcript=None): 'Coordinates are not available for this kind of HGVS name "%s"' % self.kind) + # Check coordinate span is equal to reference bases + if self.ref_allele: + coordinate_span = end - start + 1 # Ref will always be >=1 base + ref_length = len(self.ref_allele) + if coordinate_span != ref_length: + raise InvalidHGVSName("Coordinate span (%d) not equal to ref length %d" % (coordinate_span, ref_length)) + return chrom, start, end def get_ref_coords(self, transcript=None): From 27535c355b7e4eb8491f633f0cae45c31ab60393 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 16 Nov 2021 21:54:03 +1030 Subject: [PATCH 32/47] Support mitochondria --- pyhgvs/models/hgvs_name.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py index 9581dda..16b0df6 100644 --- a/pyhgvs/models/hgvs_name.py +++ b/pyhgvs/models/hgvs_name.py @@ -416,7 +416,7 @@ def parse_allele(self, allele): "Non-coding transcript cannot contain '*' (3'UTR) coordinates") elif kind == "p": self.parse_protein(details) - elif kind == "g": + elif kind in ("g", 'm'): self.parse_genome(details) else: raise NotImplementedError("unknown kind: %s" % allele) @@ -578,14 +578,12 @@ def __unicode__(self): def format(self, use_prefix=True, use_gene=True, use_counsyl=False): """Generate a HGVS name as a string.""" - if self.kind == 'c': - allele = 'c.' + self.format_cdna() - elif self.kind == 'n': - allele = 'n.' + self.format_cdna() + if self.kind in ('c', 'n'): + allele = self.kind + '.' + self.format_cdna() elif self.kind == 'p': allele = 'p.' + self.format_protein() - elif self.kind == 'g': - allele = 'g.' + self.format_genome() + elif self.kind in ('g', 'm'): + allele = self.kind + '.' + self.format_genome() else: raise NotImplementedError("not implemented: '%s'" % self.kind) @@ -605,7 +603,7 @@ def format_prefix(self, use_gene=True): NM_007294.3(BRCA1):c.2207A>C """ - if self.kind == 'g': + if self.kind in ('g', 'm'): if self.chrom: return self.chrom @@ -743,7 +741,7 @@ def get_raw_coords(self, transcript=None): if start > end: raise AssertionError( "cdna_start cannot be greater than cdna_end") - elif self.kind == 'g': + elif self.kind in ('g', 'm'): chrom = self.chrom start = self.start end = self.end From a0a8edddfbf36cfa100a650719105b7e4c65fd0d Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 24 Nov 2021 17:26:18 +1030 Subject: [PATCH 33/47] Bump version so I can track it --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c468888..c7dc1c1 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def main(): setup( name='pyhgvs', - version='0.11.1', + version='0.12.0', description='HGVS name parsing and formatting', long_description=description, author='Matt Rasmussen', From 12924f0cfe7226d238b85f45529e3616bd16dc85 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Thu, 9 Dec 2021 11:26:19 +1030 Subject: [PATCH 34/47] HGVS issue #61 - Reference HGVS without reference base leads to wrong coordinates and reference allele as it treats last digit as ref digit multiplying N that many times --- pyhgvs/models/hgvs_name.py | 2 ++ pyhgvs/tests/test_hgvs_names.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py index 16b0df6..3d2b649 100644 --- a/pyhgvs/models/hgvs_name.py +++ b/pyhgvs/models/hgvs_name.py @@ -128,6 +128,7 @@ class HGVSRegex(object): # cDNA allele syntax CDNA_ALLELE = [ # No change + CDNA_START + EQUAL, CDNA_START + DNA_REF + EQUAL, # Substitution @@ -189,6 +190,7 @@ class HGVSRegex(object): # Genomic allele syntax GENOMIC_ALLELE = [ # No change + COORD_START + EQUAL, COORD_START + DNA_REF + EQUAL, # Substitution diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py index 8f06237..06220de 100644 --- a/pyhgvs/tests/test_hgvs_names.py +++ b/pyhgvs/tests/test_hgvs_names.py @@ -358,6 +358,18 @@ def test_invalid_coordinates(): 'mutation_type': '=', }), + # cDNA no change, no provided base + ('BRCA1:c.101=', True, + { + 'gene': 'BRCA1', + 'kind': 'c', + 'cdna_start': CDNACoord(101), + 'cdna_end': CDNACoord(101), + 'ref_allele': '', + 'alt_allele': '', + 'mutation_type': '=', + }), + # cDNA 1bp mutations. ('BRCA1:c.101A>C', True, { @@ -542,6 +554,18 @@ def test_invalid_coordinates(): 'mutation_type': '=', }), + # Genomic no change + no ref base + ('BRCA1:g.101=', True, + { + 'gene': 'BRCA1', + 'kind': 'g', + 'start': 101, + 'end': 101, + 'ref_allele': '', + 'alt_allele': '', + 'mutation_type': '=', + }), + # Genomic 1bp mutations. ('BRCA1:g.101A>C', True, { From 70565eb4352555521ff8a36a9742b77410df0a84 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Thu, 9 Dec 2021 14:21:19 +1030 Subject: [PATCH 35/47] Bump version, add changelog --- CHANGELOG.md | 10 ++++++++++ setup.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3355e12..dfe400c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # HGVS library change log +## 0.12.1 (2021-12-09) + - Fix issue #61 - Regex for reference HGVS without reference base + +## 0.12.0 (2021-11-24) + - cDNA gaps + - Refactor models + - Support non-coding, mitochondria and LRG transcripts + - Validate coordinate span equals reference length + - Scripts to generate transcripts from RefSeq and Ensembl GTFs on the web + ## 0.9.2 (2014-12-11) - Rename package to pyhgvs to avoid naming conflict with other libraries. diff --git a/setup.py b/setup.py index c7dc1c1..7067c93 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def main(): setup( name='pyhgvs', - version='0.12.0', + version='0.12.1', description='HGVS name parsing and formatting', long_description=description, author='Matt Rasmussen', From 1ea73ad19a17c7063c6b902a5b58f8792c75e136 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 14 Jan 2022 17:36:14 +1030 Subject: [PATCH 36/47] Use same code for calculating transcript position of start/stop codons. Be able to pass in pre-calculated values to save calculation if available --- pyhgvs/models/transcript.py | 86 ++++++++++++++++++------------------- pyhgvs/utils.py | 5 ++- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py index 4f7c0b9..7d93233 100644 --- a/pyhgvs/models/transcript.py +++ b/pyhgvs/models/transcript.py @@ -24,15 +24,18 @@ class Transcript(object): """ def __init__(self, name, version, gene, tx_position, cds_position, - is_default=False, exons=None, cdna_match=None): + is_default=False, cdna_match=None, + start_codon_transcript_pos=None, stop_codon_transcript_pos=None): self.name = name self.version = version self.gene = Gene(gene) self.tx_position = tx_position self.cds_position = cds_position self.is_default = is_default - self.exons = exons or [] self.cdna_match = cdna_match or [] + # Optional pre-calculated transcript coordinates + self._start_codon_transcript_pos = start_codon_transcript_pos + self._stop_codon_transcript_pos = stop_codon_transcript_pos @property def full_name(self): @@ -61,57 +64,51 @@ def ordered_cdna_match(self): cdna_match.reverse() return cdna_match - def get_utr5p_size(self): - """Return the size of the 5prime UTR of a transcript.""" + def get_cds_start_stop(self): + (start, stop) = (self.cds_position.chrom_start, self.cds_position.chrom_stop) + if not self.tx_position.is_forward_strand: + (stop, start) = (start, stop) + return start, stop - transcript_strand = self.tx_position.is_forward_strand + @lazy + def start_codon(self): + """ Transcript position """ - # Find the exon containing the start codon. - start_codon = (self.cds_position.chrom_start if transcript_strand - else self.cds_position.chrom_stop - 1) - cdna_offset = 0 - for cdna_match in self.ordered_cdna_match: - start = cdna_match.tx_position.chrom_start - end = cdna_match.tx_position.chrom_stop - if start <= start_codon < end: - # We're inside this match - if transcript_strand: - position = start_codon - start - else: - position = end - start_codon - 1 - return cdna_offset + position + cdna_match.get_offset(position) - cdna_offset += cdna_match.length + if self._start_codon_transcript_pos is not None: + return self._start_codon_transcript_pos + + start_genomic_coordinate, _ = self.get_cds_start_stop() + return self._get_transcript_position(start_genomic_coordinate) + + @lazy + def stop_codon(self): + """ Transcript position """ + if self._stop_codon_transcript_pos is not None: + return self._stop_codon_transcript_pos + + _, stop_genomic_coordinate = self.get_cds_start_stop() + return self._get_transcript_position(stop_genomic_coordinate) + + def _get_transcript_position(self, genomic_coordinate, label=None): + transcript_strand = self.tx_position.is_forward_strand - # Couldn't find it - cdna_matches = [] - for cdna_match in self.ordered_cdna_match: - start = cdna_match.tx_position.chrom_start - end = cdna_match.tx_position.chrom_stop - cdna_matches.append(f"{start}-{end}") - cdna_match_summary = ", ".join(cdna_matches) - raise ValueError("Couldn't find start_codon (%d) in cdna_match: {%s" % (start_codon, cdna_match_summary)) - - def find_stop_codon(self, cds_position): - """Return the position along the cDNA of the base after the stop codon.""" - if cds_position.is_forward_strand: - stop_pos = cds_position.chrom_stop - else: - stop_pos = cds_position.chrom_start cdna_offset = 0 for cdna_match in self.ordered_cdna_match: start = cdna_match.tx_position.chrom_start stop = cdna_match.tx_position.chrom_stop - if start <= stop_pos <= stop: + if start <= genomic_coordinate <= stop: # We're inside this match - if cds_position.is_forward_strand: - position = stop_pos - start + if transcript_strand: + position = genomic_coordinate - start else: - position = stop - stop_pos + position = stop - genomic_coordinate return cdna_offset + position + cdna_match.get_offset(position) else: cdna_offset += cdna_match.length - raise ValueError('Stop codon is not in any of the exons') + if label is None: + label = "Genomic coordinate: %d" % genomic_coordinate + raise ValueError('%s is not in any of the exons' % label) def cdna_to_genomic_coord(self, coord): """Convert a HGVS cDNA coordinate to a genomic coordinate.""" @@ -119,8 +116,7 @@ def cdna_to_genomic_coord(self, coord): # compute starting position along spliced transcript. if coord.landmark == CDNA_START_CODON: - utr5p = (self.get_utr5p_size() - if self.is_coding else 0) + utr5p = (self.start_codon if self.is_coding else 0) if coord.coord > 0: cdna_pos = utr5p + coord.coord @@ -130,7 +126,7 @@ def cdna_to_genomic_coord(self, coord): if coord.coord < 0: raise ValueError('CDNACoord cannot have a negative coord and ' 'landmark CDNA_STOP_CODON') - cdna_pos = self.find_stop_codon(self.cds_position) + coord.coord + cdna_pos = self.stop_codon + coord.coord else: raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark) @@ -207,13 +203,13 @@ def genomic_to_cdna_coord(self, genomic_coord): # Adjust coordinates for coding transcript. if self.is_coding: # Detect if position before start codon. - utr5p = self.get_utr5p_size() + utr5p = self.start_codon cdna_coord.coord -= utr5p if cdna_coord.coord <= 0: cdna_coord.coord -= 1 else: # Detect if position is after stop_codon. - stop_codon = self.find_stop_codon(self.cds_position) + stop_codon = self.stop_codon stop_codon -= utr5p if (cdna_coord.coord > stop_codon or cdna_coord.coord == stop_codon and cdna_coord.offset > 0): diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index e6d6250..3fcc320 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -84,7 +84,10 @@ def make_transcript(transcript_json): transcript_json['chrom'], transcript_json['cds_start'], transcript_json['cds_end'], - transcript_json['strand'] == '+')) + transcript_json['strand'] == '+'), + start_codon_transcript_pos=transcript_json.get("start_codon_transcript_pos"), + stop_codon_transcript_pos=transcript_json.get("stop_codon_transcript_pos"), + ) exons = transcript_json['exons'] exons.sort(key=operator.itemgetter(0)) From f42bd06b6ff30bb6a38686b1592484c135a2fa69 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 14 Jan 2022 17:37:29 +1030 Subject: [PATCH 37/47] Changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dfe400c..c04fe6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # HGVS library change log +## Unreleased + - Use same code for calculating transcript position of start/stop codons. + - Be able to pass in pre-calculated start/stop from transcript_json if available + ## 0.12.1 (2021-12-09) - Fix issue #61 - Regex for reference HGVS without reference base From d59d079af578c8ba7b4297b7fea8c504d5b409fe Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 14 Jan 2022 22:22:51 +1030 Subject: [PATCH 38/47] No need to sort before storing as we don't rely on order and re-sort when loading in utils --- generate_transcript_data/pyreference_to_pyhgvs_json.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py index d53edf6..3ac4d53 100644 --- a/generate_transcript_data/pyreference_to_pyhgvs_json.py +++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py @@ -53,10 +53,6 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: raise ValueError("Haven't handled stranded Target alignment") cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]]) - if strand == '-': - exons.reverse() - cdna_match.reverse() - pyhgvs_data = { 'chrom': transcript_data["chr"], 'start': start, From eef2514c916d3f3a8909f48fca5f3021fa27cfef Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 14 Jan 2022 22:42:31 +1030 Subject: [PATCH 39/47] Including coding start/end transcript coords if available --- generate_transcript_data/pyreference_to_pyhgvs_json.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py index 3ac4d53..a77de4a 100644 --- a/generate_transcript_data/pyreference_to_pyhgvs_json.py +++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py @@ -64,6 +64,10 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: } # Optional stuff + for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"] + if value := transcript_data.get(optional_key): + pyhgvs_data[optional_key] = value + if cdna_match: pyhgvs_data["cdna_match"] = cdna_match if transcript_data.get("partial"): From 936d2764c9ae94961c8dc1df6f018cb403f08489 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Mon, 17 Jan 2022 10:53:14 +1030 Subject: [PATCH 40/47] Fix syntax error --- generate_transcript_data/pyreference_to_pyhgvs_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py index a77de4a..322ef98 100644 --- a/generate_transcript_data/pyreference_to_pyhgvs_json.py +++ b/generate_transcript_data/pyreference_to_pyhgvs_json.py @@ -64,7 +64,7 @@ def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: } # Optional stuff - for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"] + for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]: if value := transcript_data.get(optional_key): pyhgvs_data[optional_key] = value From 992c4afe3edd29d25ce37e71507a2849eb15a675 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Mon, 1 Aug 2022 15:22:55 +0930 Subject: [PATCH 41/47] Fix issue where dups always matched ref allele --- pyhgvs/__init__.py | 6 +++--- pyhgvs/models/hgvs_name.py | 8 +++++--- pyhgvs/tests/test_hgvs_names.py | 27 ++++++++++++++++++++++++++- setup.py | 2 +- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index ff31c73..0fc378d 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -77,9 +77,9 @@ def get_alt_from_sequence(hgvs, genome, transcript): def matches_ref_allele(hgvs, genome, transcript=None): """Return True if reference allele matches genomic sequence.""" - ref, _ = hgvs.get_ref_alt( - transcript.tx_position.is_forward_strand if transcript else True) - chrom, start, end = hgvs.get_ref_coords(transcript) + is_forward_strand = transcript.tx_position.is_forward_strand if transcript else True + ref, _ = hgvs.get_ref_alt(is_forward_strand, raw_dup_alleles=True) # get raw values so dup isn't always True + chrom, start, end = hgvs.get_raw_coords(transcript) genome_ref = get_genomic_sequence(genome, chrom, start, end) return genome_ref == ref diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py index 3d2b649..85f46bf 100644 --- a/pyhgvs/models/hgvs_name.py +++ b/pyhgvs/models/hgvs_name.py @@ -793,15 +793,17 @@ def get_vcf_coords(self, transcript=None): self.mutation_type) return chrom, start, end - def get_ref_alt(self, is_forward_strand=True): - """Return reference and alternate alleles.""" + def get_ref_alt(self, is_forward_strand=True, raw_dup_alleles=False): + """ Return reference and alternate alleles. + Original code was for representation - ie it altered dup to look like an insert + pass raw_dup_alleles=True to get the raw values """ if self.kind == 'p': raise NotImplementedError( 'get_ref_alt is not implemented for protein HGVS names') alleles = [self.ref_allele, self.alt_allele] # Represent duplications are inserts. - if self.mutation_type == "dup": + if not raw_dup_alleles and self.mutation_type == "dup": alleles[0] = "" alleles[1] = alleles[1][:len(alleles[1]) // 2] diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py index 06220de..5754731 100644 --- a/pyhgvs/tests/test_hgvs_names.py +++ b/pyhgvs/tests/test_hgvs_names.py @@ -11,7 +11,7 @@ from ..models.cdna import CDNACoord, CDNA_STOP_CODON from ..models.hgvs_name import HGVSName, InvalidHGVSName -from .. import format_hgvs_name +from .. import format_hgvs_name, matches_ref_allele from .. import parse_hgvs_name from ..utils import read_transcripts from .genome import MockGenomeTestFile @@ -197,6 +197,31 @@ def test_invalid_coordinates(): parse_hgvs_name(hgvs_name, genome, get_transcript=get_transcript) +def test_matches_ref(): + genome = MockGenomeTestFile( + db_filename='hg19.fa', + filename='pyhgvs/tests/data/test_hgvs.genome', + create_data=False) + # genome = SequenceFileDB('pyhgvs/tests/data/test_hgvs.genome') + transcript = get_transcript("NM_000016.4") + + # The base at this position is "T" - used to always work w/dups + expected = [ + ("NM_000016.4:c.1189dupT", True), + ("NM_000016.4:c.1189T>TT", True), # same as dup by both alleles provided + ("NM_000016.4:c.1189dupA", False), + ("NM_000016.4:c.1189A>AA", False), + ("NM_000016.4:c.1189delT", True), + ("NM_000016.4:c.1189delC", False), + ("NM_000016.4:c.1189delCinsT", False), + ("NM_000016.4:c.1189delTinsC", True), + ] + for hgvs_string, expected_result in expected: + hgvs_name = HGVSName(hgvs_string) + matches_ref = matches_ref_allele(hgvs_name, genome, transcript) + nose.tools.assert_equal(matches_ref, expected_result, hgvs_string + " matches reference base") + + # Test examples of cDNA coordinates. _parse_cdna_coords = [ ('1001', CDNACoord(1001)), diff --git a/setup.py b/setup.py index 7067c93..f08964a 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def main(): setup( name='pyhgvs', - version='0.12.1', + version='0.12.2', description='HGVS name parsing and formatting', long_description=description, author='Matt Rasmussen', From de3a1d336da05e91b111cd5d2218be5c04af0011 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Thu, 18 Aug 2022 14:39:51 +0930 Subject: [PATCH 42/47] Generating transcripts is now handled via https://github.com/SACGF/cdot --- .../ensembl_gene_annotation_grch37.sh | 28 ---- .../ensembl_gene_annotation_grch38.sh | 37 ----- .../pyreference_to_pyhgvs_json.py | 134 ------------------ .../refseq_gene_annotation_grch37.sh | 75 ---------- .../refseq_gene_annotation_grch38.sh | 90 ------------ 5 files changed, 364 deletions(-) delete mode 100755 generate_transcript_data/ensembl_gene_annotation_grch37.sh delete mode 100755 generate_transcript_data/ensembl_gene_annotation_grch38.sh delete mode 100644 generate_transcript_data/pyreference_to_pyhgvs_json.py delete mode 100755 generate_transcript_data/refseq_gene_annotation_grch37.sh delete mode 100755 generate_transcript_data/refseq_gene_annotation_grch38.sh diff --git a/generate_transcript_data/ensembl_gene_annotation_grch37.sh b/generate_transcript_data/ensembl_gene_annotation_grch37.sh deleted file mode 100755 index 97ee668..0000000 --- a/generate_transcript_data/ensembl_gene_annotation_grch37.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -# v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them - -#82 is first GFF3 for GRCh37 -#83 has no data -#84 is 82 again -#86 is 85 again -pyreference_args=() -for release in 82 85 87; do - filename=Homo_sapiens.GRCh37.${release}.gff3.gz - url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gff3/homo_sapiens/${filename} - pyreference_file=${filename}.json.gz - if [[ ! -e ${filename} ]]; then - wget ${url} - fi - if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" - fi - pyreference_args+=(--pyreference-json ${pyreference_file}) -done - -merged_file="pyhgvs_transcripts_ensembl_grch37.json.gz" -if [[ ! -e ${merged_file} ]]; then - BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} -fi diff --git a/generate_transcript_data/ensembl_gene_annotation_grch38.sh b/generate_transcript_data/ensembl_gene_annotation_grch38.sh deleted file mode 100755 index 9f8c0a0..0000000 --- a/generate_transcript_data/ensembl_gene_annotation_grch38.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# Skip earlier GTFs as they don't have versions -#for release in 76 77 78 79 80; do -# filename=Homo_sapiens.GRCh38.${release}.gtf.gz -# url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename} -# if [[ ! -e ${filename} ]]; then -# wget ${url} -# fi - -# if [[ ! -e ${filename}.json.gz ]]; then -# pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -# fi -#done - -#81 is first GFF3 for GRCh38 -pyreference_args=() -for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do - filename=Homo_sapiens.GRCh38.${release}.gff3.gz - url=ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${filename} - pyreference_file=${filename}.json.gz - - if [[ ! -e ${filename} ]]; then - wget ${url} - fi - if [[ ! -e ${filename}.json.gz ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" - fi - pyreference_args+=(--pyreference-json ${pyreference_file}) -done - -merged_file="pyhgvs_transcripts_ensembl_grch38.json.gz" -if [[ ! -e ${merged_file} ]]; then - BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} -fi diff --git a/generate_transcript_data/pyreference_to_pyhgvs_json.py b/generate_transcript_data/pyreference_to_pyhgvs_json.py deleted file mode 100644 index 322ef98..0000000 --- a/generate_transcript_data/pyreference_to_pyhgvs_json.py +++ /dev/null @@ -1,134 +0,0 @@ -""" - Converts PyReference JSON.gz files (created from RefSeq/Ensembl GTF or GFFs) into format easy to load w/PyHGVS - - @see https://bitbucket.org/sacgf/pyreference/ - - Dave Lawrence (davmlaw@gmail.com) on 22/10/2021 -""" -import gzip -import json -from argparse import ArgumentParser -from typing import Dict - - -def handle_args(): - parser = ArgumentParser(description='Convert multiple PyReference json.gz files into one for PyHGVS') - parser.add_argument('--pyreference-json', required=True, nargs="+", action="extend", - help='PyReference JSON.gz - list OLDEST to NEWEST (newest is kept)') - parser.add_argument('--output', required=True, help='Output filename') - return parser.parse_args() - - -def convert_gene_pyreference_to_gene_version_data(gene_data: Dict) -> Dict: - gene_version_data = { - 'biotype': ",".join(gene_data["biotype"]), - 'description': gene_data.get("description"), - 'gene_symbol': gene_data["name"], - } - - if hgnc_str := gene_data.get("HGNC"): - # Has HGNC: (5 characters) at start of it - gene_version_data["hgnc"] = hgnc_str[5:] - - # Only Ensembl Genes have versions - if version := gene_data.get("version"): - gene_data["version"] = version - - return gene_version_data - - -def convert_transcript_pyreference_to_pyhgvs(transcript_data: Dict) -> Dict: - start = transcript_data["start"] - end = transcript_data["stop"] - strand = transcript_data["strand"] - # PyHGVS has cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0) - cds_start = transcript_data.get("cds_start", end) - cds_end = transcript_data.get("cds_end", end) - # PyHGVS exons are in genomic order, PyReference are in stranded - features = transcript_data["features_by_type"] - exons = [[ed["start"], ed["stop"]] for ed in features["exon"]] - cdna_match = [] - for cdm in features.get("cDNA_match", []): - if "cdna_strand" in cdm: # None in Human RefSeq so haven't handled - raise ValueError("Haven't handled stranded Target alignment") - cdna_match.append([cdm.get(k) for k in ["start", "stop", "cdna_start", "cdna_end", "gap"]]) - - pyhgvs_data = { - 'chrom': transcript_data["chr"], - 'start': start, - 'end': end, - 'strand': strand, - 'cds_start': cds_start, - 'cds_end': cds_end, - 'exons': exons, - } - - # Optional stuff - for optional_key in ["start_codon_transcript_pos", "stop_codon_transcript_pos"]: - if value := transcript_data.get(optional_key): - pyhgvs_data[optional_key] = value - - if cdna_match: - pyhgvs_data["cdna_match"] = cdna_match - if transcript_data.get("partial"): - pyhgvs_data["partial"] = 1 - other_chroms = transcript_data.get("other_chroms") - if other_chroms: - pyhgvs_data["other_chroms"] = other_chroms - - # Few extra fields (pyHGVS doesn't currently use) - pyhgvs_data["biotype"] = ",".join(transcript_data["biotype"]) - return pyhgvs_data - - -def main(): - args = handle_args() - - gene_versions = {} # We only keep those that are in the latest transcript version - transcript_versions = {} - - for pyref_filename in args.pyreference_json: - print(f"Loading '{pyref_filename}'") - with gzip.open(pyref_filename) as f: - pyref_data = json.load(f) - - url = pyref_data["reference_gtf"]["url"] - - # PyReference stores transcripts under genes, while PyReference only has transcripts (that contain genes) - transcript_gene_version = {} - - for gene_id, gene in pyref_data["genes_by_id"].items(): - if version := gene.get("version"): - gene_accession = f"{gene_id}.{version}" - else: - gene_accession = gene_id - - gene_version = convert_gene_pyreference_to_gene_version_data(gene) - gene_version["url"] = url - gene_versions[gene_accession] = gene_version - - for transcript_accession in gene["transcripts"]: - transcript_gene_version[transcript_accession] = gene_accession - - for transcript_accession, transcript in pyref_data["transcripts_by_id"].items(): - hgvs_data = convert_transcript_pyreference_to_pyhgvs(transcript) - gene_accession = transcript_gene_version[transcript_accession] - gene_version = gene_versions[gene_accession] - hgvs_data["id"] = transcript_accession - hgvs_data["gene_version"] = gene_accession - hgvs_data["gene_name"] = gene_version["gene_symbol"] - hgvs_data["url"] = url - transcript_versions[transcript_accession] = hgvs_data - - print("Writing PyHGVS data") - with gzip.open(args.output, 'w') as outfile: - data = { - "transcripts": transcript_versions, - "genes": gene_versions, - } - json_str = json.dumps(data) - outfile.write(json_str.encode('ascii')) - - -if __name__ == '__main__': - main() diff --git a/generate_transcript_data/refseq_gene_annotation_grch37.sh b/generate_transcript_data/refseq_gene_annotation_grch37.sh deleted file mode 100755 index c0846fc..0000000 --- a/generate_transcript_data/refseq_gene_annotation_grch37.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash - -pyreference_args=() - -# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok - -filename=ref_GRCh37.p5_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename} -pyreference_file=${filename}.json.gz -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -filename=ref_GRCh37.p9_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename} -pyreference_file=${filename}.json.gz -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -filename=ref_GRCh37.p10_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename} -pyreference_file=${filename}.json.gz -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -filename=ref_GRCh37.p13_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename} -pyreference_file=${filename}.json.gz -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -# These all have the same name, so rename them based on release ID -for release in 105.20190906 105.20201022; do - filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz - url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz - pyreference_file=${filename}.json.gz - if [[ ! -e ${filename} ]]; then - wget ${url} --output-document=${filename} - fi - if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" - fi - pyreference_args+=(--pyreference-json ${pyreference_file}) -done - - -merged_file="pyhgvs_transcripts_refseq_grch37.json.gz" -if [[ ! -e ${merged_file} ]]; then - BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} -fi diff --git a/generate_transcript_data/refseq_gene_annotation_grch38.sh b/generate_transcript_data/refseq_gene_annotation_grch38.sh deleted file mode 100755 index a4682fe..0000000 --- a/generate_transcript_data/refseq_gene_annotation_grch38.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok - -filename=ref_GRCh38_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/${filename} -pyreference_file=${filename}.json.gz - -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -filename=ref_GRCh38.p2_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/${filename} -pyreference_file=${filename}.json.gz - -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -filename=ref_GRCh38.p7_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/${filename} -pyreference_file=${filename}.json.gz - -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -filename=ref_GRCh38.p12_top_level.gff3.gz -url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/${filename} -pyreference_file=${filename}.json.gz - -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -filename=GCF_000001405.38_GRCh38.p12_genomic.gff.gz -url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/${filename} -pyreference_file=${filename}.json.gz - -if [[ ! -e ${filename} ]]; then - wget ${url} -fi -if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" -fi -pyreference_args+=(--pyreference-json ${pyreference_file}) - - -# These all have the same name, so rename them based on release ID -for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514; do - filename=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz - url=http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz - pyreference_file=${filename}.json.gz - if [[ ! -e ${filename} ]]; then - wget ${url} --output-document=${filename} - fi - if [[ ! -e ${pyreference_file} ]]; then - pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" - fi - pyreference_args+=(--pyreference-json ${pyreference_file}) -done - - -merged_file="pyhgvs_transcripts_refseq_grch38.json.gz" -if [[ ! -e ${merged_file} ]]; then - BASE_DIR=$(dirname ${BASH_SOURCE[0]}) - - python3 ${BASE_DIR}/pyreference_to_pyhgvs_json.py ${pyreference_args[@]} --output ${merged_file} -fi From 715c4f2e29de11272b9a39523460efcb6ea31a52 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 12 Apr 2023 12:19:19 +0930 Subject: [PATCH 43/47] Support for coord range + equals ie "NC_000010.10:g.89623915_89623920=" --- pyhgvs/models/hgvs_name.py | 1 + pyhgvs/tests/test_hgvs_names.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py index 85f46bf..6d6cc46 100644 --- a/pyhgvs/models/hgvs_name.py +++ b/pyhgvs/models/hgvs_name.py @@ -204,6 +204,7 @@ class HGVSRegex(object): COORD_START + DUP, # Insertion, deletion, duplication + COORD_RANGE + EQUAL, COORD_RANGE + INS + DNA_ALT, COORD_RANGE + DEL + DNA_REF, COORD_RANGE + DUP + DNA_REF, diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py index 5754731..579e2a6 100644 --- a/pyhgvs/tests/test_hgvs_names.py +++ b/pyhgvs/tests/test_hgvs_names.py @@ -706,6 +706,11 @@ def test_matches_ref(): 'alt_allele': 'TA', 'mutation_type': 'delins', }), + # + ('NC_000010.10:g.89623915_89623920=', False, + { + 'chrom': 'NC_000010.10', + }), ] From 116217d18876b0882555e621e4fa9122b316f016 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 12 Apr 2023 12:30:37 +0930 Subject: [PATCH 44/47] Bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f08964a..a48f39a 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def main(): setup( name='pyhgvs', - version='0.12.2', + version='0.12.3', description='HGVS name parsing and formatting', long_description=description, author='Matt Rasmussen', From 7591c13dd89dc9d5e759b3dcf1ed099840c74cd8 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Thu, 25 May 2023 17:00:01 +0930 Subject: [PATCH 45/47] #57 - c. dup converted back to delins --- pyhgvs/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index 0fc378d..9dabfeb 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -155,8 +155,9 @@ def hgvs_justify_indel(chrom, offset, ref, alt, strand, genome): return chrom, offset, ref, alt # Get genomic sequence around the lesion. - start = max(offset - 100, 0) - end = offset + 100 + size = max(len(ref), len(alt)) + 1 + start = max(offset - size, 0) + end = offset + size seq = str(genome[str(chrom)][start - 1:end]).upper() cds_offset = offset - start From 8b30efd10ee44830d22501ff9dd1b6ce61c8c68e Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Thu, 1 Jun 2023 14:05:07 +0930 Subject: [PATCH 46/47] bump version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a48f39a..4f36017 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def main(): setup( name='pyhgvs', - version='0.12.3', + version='0.12.4', description='HGVS name parsing and formatting', long_description=description, author='Matt Rasmussen', From ef005725a4c9f7dfbb3bb3411a9ff170e177c918 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Wed, 14 Jun 2023 13:42:40 +0930 Subject: [PATCH 47/47] Use min of 100 again for normalization window space --- pyhgvs/__init__.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index 9dabfeb..0521dab 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -155,7 +155,8 @@ def hgvs_justify_indel(chrom, offset, ref, alt, strand, genome): return chrom, offset, ref, alt # Get genomic sequence around the lesion. - size = max(len(ref), len(alt)) + 1 + window_size = 100 + size = window_size + max(len(ref), len(alt)) start = max(offset - size, 0) end = offset + size seq = str(genome[str(chrom)][start - 1:end]).upper() diff --git a/setup.py b/setup.py index 4f36017..34bcc44 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ def main(): setup( name='pyhgvs', - version='0.12.4', + version='0.12.5', description='HGVS name parsing and formatting', long_description=description, author='Matt Rasmussen',