From b319e671bb82710f33b262d0d80e2f1e3ff0cdf2 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Sat, 2 Oct 2021 00:04:02 +0930 Subject: [PATCH] Handle cDNA alignment gaps --- pyhgvs/__init__.py | 43 +++++++++++++++++++++++++++++++++++++++++-- pyhgvs/models.py | 8 +++++--- pyhgvs/utils.py | 7 +++++-- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index 977ae39..09b7968 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -518,6 +518,40 @@ def get_genomic_sequence(genome, chrom, start, end): return str(genome[str(chrom)][start - 1:end]).upper() +def _get_exon_misalignment_offset(cdna_match_gap: str, exon_pos: int): + """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length + @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute + codes operation + M match + I insert a gap into the reference sequence + D insert a gap into the target (delete from reference) + + If you want the whole exon, then pass the end + """ + + exon_i = 0 + misalignment_offset = 0 + for gap_op in cdna_match_gap.split(): + code = gap_op[0] + length = int(gap_op[1:]) + if code == "M": + exon_i += length + elif code == "I": + if exon_pos <= exon_i + length: + raise ValueError("Coordinate inside insertion (%s) - no mapping possible!" % gap_op) + misalignment_offset += length + elif code == "D": + if exon_pos <= exon_i + length: + raise ValueError("Coordinate inside deletion (%s) - no mapping possible!" % gap_op) + misalignment_offset -= length + else: + raise ValueError(f"Unknown code in cDNA GAP: {gap_op}") + + if exon_i >= exon_pos: + break + return misalignment_offset + + def cdna_to_genomic_coord(transcript, coord): """Convert a HGVS cDNA coordinate to a genomic coordinate.""" transcript_strand = transcript.tx_position.is_forward_strand @@ -549,10 +583,15 @@ def cdna_to_genomic_coord(transcript, coord): # Walk along transcript until we find an exon that contains pos. cdna_start = 1 cdna_end = 1 + misalignment_offset = 0 for exon in exons: exon_start = exon.tx_position.chrom_start + 1 exon_end = exon.tx_position.chrom_stop cdna_end = cdna_start + (exon_end - exon_start) + if exon.cdna_match: + # Doesn't matter if it goes off the end, will just count entire exon + exon_pos = pos - cdna_start + 1 + misalignment_offset -= _get_exon_misalignment_offset(exon.cdna_match, exon_pos) if cdna_start <= pos <= cdna_end: break cdna_start = cdna_end + 1 @@ -566,10 +605,10 @@ def cdna_to_genomic_coord(transcript, coord): # Compute genomic coordinate using offset. if transcript_strand: # Plus strand. - return exon_start + (pos - cdna_start) + coord.offset + return exon_start + (pos - cdna_start) + coord.offset + misalignment_offset else: # Minus strand. - return exon_end - (pos - cdna_start) - coord.offset + return exon_end - (pos - cdna_start) - coord.offset + misalignment_offset def genomic_to_cdna_coord(transcript, genomic_coord): diff --git a/pyhgvs/models.py b/pyhgvs/models.py index 656e67b..4dbb182 100644 --- a/pyhgvs/models.py +++ b/pyhgvs/models.py @@ -33,14 +33,15 @@ class Transcript(object): """ def __init__(self, name, version, gene, tx_position, cds_position, - is_default=False, exons=None): + is_default=False, exons=None, cdna_match=None): self.name = name self.version = version self.gene = Gene(gene) self.tx_position = tx_position self.cds_position = cds_position self.is_default = is_default - self.exons = exons if exons else [] + self.exons = exons or [] + self.cdna_match = cdna_match or [] @property def full_name(self): @@ -102,10 +103,11 @@ def distance(self, offset): class Exon(object): - def __init__(self, transcript, tx_position, exon_number): + def __init__(self, transcript, tx_position, exon_number, cdna_match=None): self.transcript = transcript self.tx_position = tx_position self.exon_number = exon_number + self.cdna_match = cdna_match @property def get_exon_name(self): diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 55b20f4..ca7418f 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -87,10 +87,12 @@ def make_transcript(transcript_json): transcript_json['strand'] == '+')) exons = transcript_json['exons'] + cdna_matches = transcript_json.get('cdna_match', [None] * len(exons)) if not transcript.tx_position.is_forward_strand: exons = reversed(exons) + cdna_matches = reversed(cdna_matches) - for exon_number, (exon_start, exon_end) in enumerate(exons, 1): + for exon_number, ((exon_start, exon_end), cdna_match) in enumerate(zip(exons, cdna_matches), 1): transcript.exons.append( Exon(transcript=transcript, tx_position=Position( @@ -98,7 +100,8 @@ def make_transcript(transcript_json): exon_start, exon_end, transcript_json['strand'] == '+'), - exon_number=exon_number)) + exon_number=exon_number, + cdna_match=cdna_match)) return transcript