diff --git a/CHANGELOG.md b/CHANGELOG.md index 3355e12..c04fe6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # HGVS library change log +## Unreleased + - Use same code for calculating transcript position of start/stop codons. + - Be able to pass in pre-calculated start/stop from transcript_json if available + +## 0.12.1 (2021-12-09) + - Fix issue #61 - Regex for reference HGVS without reference base + +## 0.12.0 (2021-11-24) + - cDNA gaps + - Refactor models + - Support non-coding, mitochondria and LRG transcripts + - Validate coordinate span equals reference length + - Scripts to generate transcripts from RefSeq and Ensembl GTFs on the web + ## 0.9.2 (2014-12-11) - Rename package to pyhgvs to avoid naming conflict with other libraries. diff --git a/pyhgvs/__init__.py b/pyhgvs/__init__.py index b4df00c..0521dab 100644 --- a/pyhgvs/__init__.py +++ b/pyhgvs/__init__.py @@ -7,503 +7,16 @@ Definition of which transcript to use coding variants: ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene - -HGVS language currently implemented. - -HGVS = ALLELE - | PREFIX_NAME : ALLELE - -PREFIX_NAME = TRANSCRIPT - | TRANSCRIPT '(' GENE ')' - -TRANSCRIPT = TRANSCRIPT_NAME - | TRANSCRIPT_NAME '.' TRANSCRIPT_VERSION - -TRANSCRIPT_VERSION = NUMBER - -ALLELE = 'c.' CDNA_ALLELE # cDNA - | 'g.' GENOMIC_ALLELE # genomic - | 'm.' MIT_ALLELE # mitochondrial sequence - | 'n.' NC_ALLELE # non-coding RNA reference sequence - | 'r.' RNA_ALLELE # RNA sequence (like r.76a>u) - | 'p.' PROTEIN_ALLELE # protein sequence (like p.Lys76Asn) - -NC_ALLELE = -RNA_ALLELE = -CDNA_ALLELE = CDNA_COORD SINGLE_BASE_CHANGE - | CDNA_COORD_RANGE MULTI_BASE_CHANGE - -GENOMIC_ALLELE = -MIT_ALLELE = COORD SINGLE_BASE_CHANGE - | COORD_RANGE MULTI_BASE_CHANGE - -SINGLE_BASE_CHANGE = CDNA_ALLELE = CDNA_COORD BASE '=' # no change - | CDNA_COORD BASE '>' BASE # substitution - | CDNA_COORD 'ins' BASE # 1bp insertion - | CDNA_COORD 'del' BASE # 1bp deletion - | CDNA_COORD 'dup' BASE # 1bp duplication - | CDNA_COORD 'ins' # 1bp insertion - | CDNA_COORD 'del' # 1bp deletion - | CDNA_COORD 'dup' # 1bp duplication - | CDNA_COORD 'del' BASE 'ins' BASE # 1bp indel - | CDNA_COORD 'delins' BASE # 1bp indel - -MULTI_BASE_CHANGE = COORD_RANGE 'del' BASES # deletion - | COORD_RANGE 'ins' BASES # insertion - | COORD_RANGE 'dup' BASES # duplication - | COORD_RANGE 'del' # deletion - | COORD_RANGE 'dup' # duplication - | COORD_RANGE 'del' BASES 'ins' BASES # indel - | COORD_RANGE 'delins' BASES # indel - - -AMINO1 = [GAVLIMFWPSTCYNQDEKRH] - -AMINO3 = 'Gly' | 'Ala' | 'Val' | 'Leu' | 'Ile' | 'Met' | 'Phe' | 'Trp' | 'Pro' - | 'Ser' | 'Thr' | 'Cys' | 'Tyr' | 'Asn' | 'Gln' | 'Asp' | 'Glu' | 'Lys' - | 'Arg' | 'His' - -PROTEIN_ALLELE = AMINO3 COORD '=' # no peptide change - | AMINO1 COORD '=' # no peptide change - | AMINO3 COORD AMINO3 PEP_EXTRA # peptide change - | AMINO1 COORD AMINO1 PEP_EXTRA # peptide change - | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA # indel - | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA # indel - | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA AMINO3 # indel - | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA AMINO1 # indel - -# A genomic range: -COORD_RANGE = COORD '_' COORD - -# A cDNA range: -CDNA_COORD_RANGE = CDNA_COORD '_' CDNA_COORD - -# A cDNA coordinate: -CDNA_COORD = COORD_PREFIX COORD - | COORD_PREFIX COORD OFFSET_PREFIX OFFSET -COORD_PREFIX = '' | '-' | '*' -COORD = NUMBER -OFFSET_PREFIX = '-' | '+' -OFFSET = NUMBER - -# Primatives: -NUMBER = \d+ -BASE = [ACGT] -BASES = BASE+ - """ from __future__ import absolute_import from __future__ import unicode_literals import re -from .variants import justify_indel -from .variants import normalize_variant -from .variants import revcomp - - -CHROM_PREFIX = 'chr' -CDNA_START_CODON = 'cdna_start' -CDNA_STOP_CODON = 'cdna_stop' - - -class HGVSRegex(object): - """ - All regular expression for HGVS names. - """ - - # DNA syntax - # http://www.hgvs.org/mutnomen/standards.html#nucleotide - BASE = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]|\d+" - BASES = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]+|\d+" - DNA_REF = "(?P" + BASES + ")" - DNA_ALT = "(?P" + BASES + ")" - - # Mutation types - EQUAL = "(?P=)" - SUB = "(?P>)" - INS = "(?Pins)" - DEL = "(?Pdel)" - DUP = "(?Pdup)" - - # Simple coordinate syntax - COORD_START = "(?P\d+)" - COORD_END = "(?P\d+)" - COORD_RANGE = COORD_START + "_" + COORD_END - - # cDNA coordinate syntax - CDNA_COORD = ("(?P|-|\*)(?P\d+)" - "((?P-|\+)(?P\d+))?") - CDNA_START = ("(?P(?P|-|\*)(?P\d+)" - "((?P-|\+)(?P\d+))?)") - CDNA_END = (r"(?P(?P|-|\*)(?P\d+)" - "((?P-|\+)(?P\d+))?)") - CDNA_RANGE = CDNA_START + "_" + CDNA_END - - # cDNA allele syntax - CDNA_ALLELE = [ - # No change - CDNA_START + DNA_REF + EQUAL, - - # Substitution - CDNA_START + DNA_REF + SUB + DNA_ALT, - - # 1bp insertion, deletion, duplication - CDNA_START + INS + DNA_ALT, - CDNA_START + DEL + DNA_REF, - CDNA_START + DUP + DNA_REF, - CDNA_START + DEL, - CDNA_START + DUP, - - # Insertion, deletion, duplication - CDNA_RANGE + INS + DNA_ALT, - CDNA_RANGE + DEL + DNA_REF, - CDNA_RANGE + DUP + DNA_REF, - CDNA_RANGE + DEL, - CDNA_RANGE + DUP, - - # Indels - "(?P" + CDNA_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + CDNA_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + CDNA_START + 'delins' + DNA_ALT + ")", - "(?P" + CDNA_RANGE + 'delins' + DNA_ALT + ")", - ] - - CDNA_ALLELE_REGEXES = [re.compile("^" + regex + "$") - for regex in CDNA_ALLELE] - - # Peptide syntax - PEP = "([A-Z]([a-z]{2}))+" - PEP_REF = "(?P" + PEP + ")" - PEP_REF2 = "(?P" + PEP + ")" - PEP_ALT = "(?P" + PEP + ")" - - PEP_EXTRA = "(?P(|=|\?)(|fs))" - - # Peptide allele syntax - PEP_ALLELE = [ - # No peptide change - # Example: Glu1161= - PEP_REF + COORD_START + PEP_EXTRA, - - # Peptide change - # Example: Glu1161Ser - PEP_REF + COORD_START + PEP_ALT + PEP_EXTRA, - - # Peptide indel - # Example: Glu1161_Ser1164?fs - "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + - PEP_EXTRA + ")", - "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + - PEP_ALT + PEP_EXTRA + ")", - ] - - PEP_ALLELE_REGEXES = [re.compile("^" + regex + "$") - for regex in PEP_ALLELE] - - # Genomic allele syntax - GENOMIC_ALLELE = [ - # No change - COORD_START + DNA_REF + EQUAL, - - # Substitution - COORD_START + DNA_REF + SUB + DNA_ALT, - - # 1bp insertion, deletion, duplication - COORD_START + INS + DNA_ALT, - COORD_START + DEL + DNA_REF, - COORD_START + DUP + DNA_REF, - COORD_START + DEL, - COORD_START + DUP, - - # Insertion, deletion, duplication - COORD_RANGE + INS + DNA_ALT, - COORD_RANGE + DEL + DNA_REF, - COORD_RANGE + DUP + DNA_REF, - COORD_RANGE + DEL, - COORD_RANGE + DUP, - - # Indels - "(?P" + COORD_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + COORD_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", - "(?P" + COORD_START + 'delins' + DNA_ALT + ")", - "(?P" + COORD_RANGE + 'delins' + DNA_ALT + ")", - ] - - GENOMIC_ALLELE_REGEXES = [re.compile("^" + regex + "$") - for regex in GENOMIC_ALLELE] - - -class ChromosomeSubset(object): - """ - Allow direct access to a subset of the chromosome. - """ - - def __init__(self, name, genome=None): - self.name = name - self.genome = genome - - def __getitem__(self, key): - """Return sequence from region [start, end) - - Coordinates are 0-based, end-exclusive.""" - if isinstance(key, slice): - start, end = (key.start, key.stop) - start -= self.genome.start - end -= self.genome.start - return self.genome.genome[self.genome.seqid][start:end] - else: - raise TypeError('Expected a slice object but ' - 'received a {0}.'.format(type(key))) - - def __repr__(self): - return 'ChromosomeSubset("%s")' % self.name - - -class GenomeSubset(object): - """ - Allow the direct access of a subset of the genome. - """ - - def __init__(self, genome, chrom, start, end, seqid): - self.genome = genome - self.chrom = chrom - self.start = start - self.end = end - self.seqid = seqid - self._chroms = {} - - def __getitem__(self, chrom): - """Return a chromosome by its name.""" - if chrom in self._chroms: - return self._chroms[chrom] - else: - chromosome = ChromosomeSubset(chrom, self) - self._chroms[chrom] = chromosome - return chromosome - - -class CDNACoord(object): - """ - A HGVS cDNA-based coordinate. - - A cDNA coordinate can take one of these forms: - - N = nucleotide N in protein coding sequence (e.g. 11A>G) - - -N = nucleotide N 5' of the ATG translation initiation codon (e.g. -4A>G) - NOTE: so located in the 5'UTR or 5' of the transcription initiation - site (upstream of the gene, incl. promoter) - - *N = nucleotide N 3' of the translation stop codon (e.g. *6A>G) - NOTE: so located in the 3'UTR or 3' of the polyA-addition site - (including downstream of the gene) - - N+M = nucleotide M in the intron after (3' of) position N in the coding DNA - reference sequence (e.g. 30+4A>G) - - N-M = nucleotide M in the intron before (5' of) position N in the coding - DNA reference sequence (e.g. 301-2A>G) - - -N+M / -N-M = nucleotide in an intron in the 5'UTR (e.g. -45+4A>G) - - *N+M / *N-M = nucleotide in an intron in the 3'UTR (e.g. *212-2A>G) - """ - - def __init__(self, coord=0, offset=0, landmark=CDNA_START_CODON, - string=''): - """ - coord: main coordinate along cDNA on the same strand as the transcript - - offset: an additional genomic offset from the main coordinate. This - allows referencing non-coding (e.g. intronic) positions. - Offset is also interpreted on the coding strand. - - landmark: ('cdna_start', 'cdna_stop') indicating that 'coord' - is relative to one of these landmarks. - - string: a coordinate from an HGVS name. If given coord, offset, and - landmark should not be specified. - """ - - if string: - if coord != 0 or offset != 0 or landmark != CDNA_START_CODON: - raise ValueError("coord, offset, and landmark should not " - "be given with string argument") - - self.parse(string) - else: - self.coord = coord - self.offset = offset - self.landmark = landmark - - def parse(self, coord_text): - """ - Parse a HGVS formatted cDNA coordinate. - """ - - match = re.match(r"(|-|\*)(\d+)((-|\+)(\d+))?", coord_text) - if not match: - raise ValueError("unknown coordinate format '%s'" % coord_text) - coord_prefix, coord, _, offset_prefix, offset = match.groups() - - self.coord = int(coord) - self.offset = int(offset) if offset else 0 - - if offset_prefix == '-': - self.offset *= -1 - elif offset_prefix == '+' or offset is None: - pass - else: - raise ValueError("unknown offset_prefix '%s'" % offset_prefix) - - if coord_prefix == '': - self.landmark = CDNA_START_CODON - elif coord_prefix == "-": - self.coord *= -1 - self.landmark = CDNA_START_CODON - elif coord_prefix == '*': - self.landmark = CDNA_STOP_CODON - else: - raise ValueError("unknown coord_prefix '%s'" % coord_prefix) - return self - - def __str__(self): - """ - Return a formatted cDNA coordinate - """ - if self.landmark == CDNA_STOP_CODON: - coord_prefix = '*' - else: - coord_prefix = '' - - if self.offset < 0: - offset = '%d' % self.offset - elif self.offset > 0: - offset = '+%d' % self.offset - else: - offset = '' - - return '%s%d%s' % (coord_prefix, self.coord, offset) - - def __eq__(self, other): - """Equality operator.""" - return ((self.coord, self.offset, self.landmark) == - (other.coord, other.offset, other.landmark)) - - def __repr__(self): - """ - Returns a string representation of a cDNA coordinate. - """ - if self.landmark != CDNA_START_CODON: - return "CDNACoord(%d, %d, '%s')" % ( - self.coord, self.offset, self.landmark) - else: - return "CDNACoord(%d, %d)" % (self.coord, self.offset) - - -# The RefSeq standard for naming contigs/transcripts/proteins: -# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8 -REFSEQ_PREFIXES = [ - ('AC_', 'genomic', - 'Complete genomic molecule, usually alternate assembly'), - ('NC_', 'genomic', - 'Complete genomic molecule, usually reference assembly'), - ('NG_', 'genomic', 'Incomplete genomic region'), - ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'), - ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'), - ('NS_', 'genomic', 'Environmental sequence'), - ('NZ_', 'genomic', 'Unfinished WGS'), - ('NM_', 'mRNA', ''), - ('NR_', 'RNA', ''), - ('XM_', 'mRNA', 'Predicted model'), - ('XR_', 'RNA', 'Predicted model'), - ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'), - ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'), - ('YP_', 'Protein', ''), - ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'), - ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'), -] - -REFSEQ_PREFIX_LOOKUP = dict( - (prefix, (kind, description)) - for prefix, kind, description in REFSEQ_PREFIXES -) - - -def get_refseq_type(name): - """ - Return the RefSeq type for a refseq name. - """ - prefix = name[:3] - return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0] - - -def get_exons(transcript): - """Yield exons in coding order.""" - transcript_strand = transcript.tx_position.is_forward_strand - if hasattr(transcript.exons, 'select_related'): - exons = list(transcript.exons.select_related('tx_position')) - else: - exons = list(transcript.exons) - exons.sort(key=lambda exon: exon.tx_position.chrom_start) - if not transcript_strand: - exons.reverse() - return exons - - -def get_coding_exons(transcript): - """Yield non-empty coding exonic regions in coding order.""" - for exon in get_exons(transcript): - region = exon.get_as_interval(coding_only=True) - if region: - yield region - - -def get_utr5p_size(transcript): - """Return the size of the 5prime UTR of a transcript.""" - - transcript_strand = transcript.tx_position.is_forward_strand - exons = get_exons(transcript) - - # Find the exon containing the start codon. - start_codon = (transcript.cds_position.chrom_start if transcript_strand - else transcript.cds_position.chrom_stop - 1) - cdna_len = 0 - for exon in exons: - exon_start = exon.tx_position.chrom_start - exon_end = exon.tx_position.chrom_stop - if exon_start <= start_codon < exon_end: - break - cdna_len += exon_end - exon_start - else: - raise ValueError("transcript contains no exons") - - if transcript_strand: - return cdna_len + (start_codon - exon_start) - else: - return cdna_len + (exon_end - start_codon - 1) - - -def find_stop_codon(exons, cds_position): - """Return the position along the cDNA of the base after the stop codon.""" - if cds_position.is_forward_strand: - stop_pos = cds_position.chrom_stop - else: - stop_pos = cds_position.chrom_start - cdna_pos = 0 - for exon in exons: - exon_start = exon.tx_position.chrom_start - exon_stop = exon.tx_position.chrom_stop - - if exon_start <= stop_pos <= exon_stop: - if cds_position.is_forward_strand: - return cdna_pos + stop_pos - exon_start - else: - return cdna_pos + exon_stop - stop_pos - else: - cdna_pos += exon_stop - exon_start - raise ValueError('Stop codon is not in any of the exons') +from .models.cdna import CDNACoord, CDNA_START_CODON, CDNA_STOP_CODON +from .models.genome import GenomeSubset +from .models.hgvs_name import HGVSName, InvalidHGVSName +from .models.variants import justify_indel, normalize_variant, revcomp def get_genomic_sequence(genome, chrom, start, end): @@ -518,144 +31,9 @@ def get_genomic_sequence(genome, chrom, start, end): return str(genome[str(chrom)][start - 1:end]).upper() -def cdna_to_genomic_coord(transcript, coord): - """Convert a HGVS cDNA coordinate to a genomic coordinate.""" - transcript_strand = transcript.tx_position.is_forward_strand - exons = get_exons(transcript) - utr5p = (get_utr5p_size(transcript) - if transcript.is_coding else 0) - - # compute starting position along spliced transcript. - if coord.landmark == CDNA_START_CODON: - if coord.coord > 0: - pos = utr5p + coord.coord - else: - pos = utr5p + coord.coord + 1 - elif coord.landmark == CDNA_STOP_CODON: - if coord.coord < 0: - raise ValueError('CDNACoord cannot have a negative coord and ' - 'landmark CDNA_STOP_CODON') - pos = find_stop_codon(exons, transcript.cds_position) + coord.coord - else: - raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark) - - # 5' flanking sequence. - if pos < 1: - if transcript_strand: - return transcript.tx_position.chrom_start + pos - else: - return transcript.tx_position.chrom_stop - pos + 1 - - # Walk along transcript until we find an exon that contains pos. - cdna_start = 1 - cdna_end = 1 - for exon in exons: - exon_start = exon.tx_position.chrom_start + 1 - exon_end = exon.tx_position.chrom_stop - cdna_end = cdna_start + (exon_end - exon_start) - if cdna_start <= pos <= cdna_end: - break - cdna_start = cdna_end + 1 - else: - # 3' flanking sequence - if transcript_strand: - return transcript.cds_position.chrom_stop + coord.coord - else: - return transcript.cds_position.chrom_start + 1 - coord.coord - - # Compute genomic coordinate using offset. - if transcript_strand: - # Plus strand. - return exon_start + (pos - cdna_start) + coord.offset - else: - # Minus strand. - return exon_end - (pos - cdna_start) - coord.offset - - -def genomic_to_cdna_coord(transcript, genomic_coord): - """Convert a genomic coordinate to a cDNA coordinate and offset. - """ - exons = [exon.get_as_interval() - for exon in get_exons(transcript)] - - if len(exons) == 0: - return None - - strand = transcript.strand - - if strand == "+": - exons.sort(key=lambda exon: exon.chrom_start) - else: - exons.sort(key=lambda exon: -exon.chrom_end) - - distances = [exon.distance(genomic_coord) - for exon in exons] - min_distance_to_exon = min(map(abs, distances)) - - coding_offset = 0 - for exon in exons: - exon_length = exon.chrom_end - exon.chrom_start - distance = exon.distance(genomic_coord) - if abs(distance) == min_distance_to_exon: - if strand == "+": - exon_start_cds_offset = coding_offset + 1 - exon_end_cds_offset = coding_offset + exon_length - else: - exon_start_cds_offset = coding_offset + exon_length - exon_end_cds_offset = coding_offset + 1 - # This is the exon we want to annotate against. - if distance == 0: - # Inside the exon. - if strand == "+": - coord = (exon_start_cds_offset + - (genomic_coord - - (exon.chrom_start + 1))) - else: - coord = (exon_end_cds_offset + - (exon.chrom_end - - genomic_coord)) - cdna_coord = CDNACoord(coord, 0) - else: - # Outside the exon. - if distance > 0: - nearest_exonic = exon_start_cds_offset - else: - nearest_exonic = exon_end_cds_offset - if strand == "+": - distance *= -1 - - # If outside transcript, don't use offset. - if (genomic_coord < transcript.tx_position.chrom_start + 1 or - genomic_coord > transcript.tx_position.chrom_stop): - nearest_exonic += distance - distance = 0 - cdna_coord = CDNACoord(nearest_exonic, distance) - break - coding_offset += exon_length - - # Adjust coordinates for coding transcript. - if transcript.is_coding: - # Detect if position before start codon. - utr5p = get_utr5p_size(transcript) if transcript.is_coding else 0 - cdna_coord.coord -= utr5p - if cdna_coord.coord <= 0: - cdna_coord.coord -= 1 - else: - # Detect if position is after stop_codon. - exons = get_exons(transcript) - stop_codon = find_stop_codon(exons, transcript.cds_position) - stop_codon -= utr5p - if (cdna_coord.coord > stop_codon or - cdna_coord.coord == stop_codon and cdna_coord.offset > 0): - cdna_coord.coord -= stop_codon - cdna_coord.landmark = CDNA_STOP_CODON - - return cdna_coord - - def get_allele(hgvs, genome, transcript=None): """Get an allele from a HGVSName, a genome, and a transcript.""" - chrom, start, end = hgvs.get_coords(transcript) + chrom, start, end = hgvs.get_ref_coords(transcript) _, alt = hgvs.get_ref_alt( transcript.tx_position.is_forward_strand if transcript else True) ref = get_genomic_sequence(genome, chrom, start, end) @@ -672,553 +50,40 @@ def get_vcf_allele(hgvs, genome, transcript=None): transcript.tx_position.is_forward_strand if transcript else True) ref = get_genomic_sequence(genome, chrom, start, end) + # Sometimes we need to retrieve alt from reference + # Eg NC_000001.11:g.169549811= + if hgvs.mutation_type == "=": + alt = ref + if hgvs.mutation_type in _indel_mutation_types: + if hgvs.mutation_type == 'dup': + # No alt supplied: NM_000492.3:c.1155_1156dup + # Number used: NM_004119.2(FLT3):c.1794_1811dup18 + # We *know* what the sequence is for "dup18", but not for "ins18" + if not hgvs.alt_allele or re.match("^N+$", hgvs.alt_allele): + alt = get_alt_from_sequence(hgvs, genome, transcript) + # Left-pad alternate allele. alt = ref[0] + alt return chrom, start, end, ref, alt +def get_alt_from_sequence(hgvs, genome, transcript): + """ returns ready for VCF """ + + chrom, start, end = hgvs.get_raw_coords(transcript) + return get_genomic_sequence(genome, chrom, start, end) + + def matches_ref_allele(hgvs, genome, transcript=None): """Return True if reference allele matches genomic sequence.""" - ref, alt = hgvs.get_ref_alt( - transcript.tx_position.is_forward_strand if transcript else True) - chrom, start, end = hgvs.get_coords(transcript) + is_forward_strand = transcript.tx_position.is_forward_strand if transcript else True + ref, _ = hgvs.get_ref_alt(is_forward_strand, raw_dup_alleles=True) # get raw values so dup isn't always True + chrom, start, end = hgvs.get_raw_coords(transcript) genome_ref = get_genomic_sequence(genome, chrom, start, end) return genome_ref == ref -class InvalidHGVSName(ValueError): - def __init__(self, name='', part='name', reason=''): - if name: - message = 'Invalid HGVS %s "%s"' % (part, name) - else: - message = 'Invalid HGVS %s' % part - if reason: - message += ': ' + reason - super(InvalidHGVSName, self).__init__(message) - - self.name = name - self.part = part - self.reason = reason - - -class HGVSName(object): - """ - Represents a HGVS variant name. - """ - - def __init__(self, name='', prefix='', chrom='', transcript='', gene='', - kind='', mutation_type=None, start=0, end=0, ref_allele='', - ref2_allele='', alt_allele='', - cdna_start=None, cdna_end=None, pep_extra=''): - - # Full HGVS name. - self.name = name - - # Name parts. - self.prefix = prefix - self.chrom = chrom - self.transcript = transcript - self.gene = gene - self.kind = kind - self.mutation_type = mutation_type - self.start = start - self.end = end - self.ref_allele = ref_allele # reference allele - self.ref2_allele = ref2_allele # reference allele at end of pep indel - self.alt_allele = alt_allele # alternate allele - - # cDNA-specific fields - self.cdna_start = cdna_start if cdna_start else CDNACoord() - self.cdna_end = cdna_end if cdna_end else CDNACoord() - - # Protein-specific fields - self.pep_extra = pep_extra - - if name: - self.parse(name) - - def parse(self, name): - """Parse a HGVS name.""" - # Does HGVS name have transcript/gene prefix? - if ':' in name: - prefix, allele = name.split(':', 1) - else: - prefix = '' - allele = name - - self.name = name - - # Parse prefix and allele. - self.parse_allele(allele) - self.parse_prefix(prefix, self.kind) - self._validate() - - def parse_prefix(self, prefix, kind): - """ - Parse a HGVS prefix (gene/transcript/chromosome). - - Some examples of full hgvs names with transcript include: - NM_007294.3:c.2207A>C - NM_007294.3(BRCA1):c.2207A>C - BRCA1{NM_007294.3}:c.2207A>C - """ - - self.prefix = prefix - - # No prefix. - if prefix == '': - self.chrom = '' - self.transcript = '' - self.gene = '' - return - - # Transcript and gene given with parens. - # example: NM_007294.3(BRCA1):c.2207A>C - match = re.match("^(?P[^(]+)\((?P[^)]+)\)$", prefix) - if match: - self.transcript = match.group('transcript') - self.gene = match.group('gene') - return - - # Transcript and gene given with braces. - # example: BRCA1{NM_007294.3}:c.2207A>C - match = re.match("^(?P[^{]+)\{(?P[^}]+)\}$", prefix) - if match: - self.transcript = match.group('transcript') - self.gene = match.group('gene') - return - - # Determine using Ensembl type. - if prefix.startswith('ENST'): - self.transcript = prefix - return - - # Determine using refseq type. - refseq_type = get_refseq_type(prefix) - if refseq_type in ('mRNA', 'RNA'): - self.transcript = prefix - return - - # Determine using refseq type. - if prefix.startswith(CHROM_PREFIX) or refseq_type == 'genomic': - self.chrom = prefix - return - - # Assume gene name. - self.gene = prefix - - def parse_allele(self, allele): - """ - Parse a HGVS allele description. - - Some examples include: - cDNA substitution: c.101A>C, - cDNA indel: c.3428delCinsTA, c.1000_1003delATG, c.1000_1001insATG - No protein change: p.Glu1161= - Protein change: p.Glu1161Ser - Protein frameshift: p.Glu1161_Ser1164?fs - Genomic substitution: g.1000100A>T - Genomic indel: g.1000100_1000102delATG - """ - if '.' not in allele: - raise InvalidHGVSName(allele, 'allele', - 'expected kind "c.", "p.", "g.", etc') - - # Determine HGVS name kind. - kind, details = allele.split('.', 1) - self.kind = kind - self.mutation_type = None - - if kind == "c": - self.parse_cdna(details) - elif kind == "p": - self.parse_protein(details) - elif kind == "g": - self.parse_genome(details) - else: - raise NotImplementedError("unknown kind: %s" % allele) - - def parse_cdna(self, details): - """ - Parse a HGVS cDNA name. - - Some examples include: - Substitution: 101A>C, - Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG - """ - for regex in HGVSRegex.CDNA_ALLELE_REGEXES: - match = re.match(regex, details) - if match: - groups = match.groupdict() - - # Parse mutation type. - if groups.get('delins'): - self.mutation_type = 'delins' - else: - self.mutation_type = groups['mutation_type'] - - # Parse coordinates. - self.cdna_start = CDNACoord(string=groups.get('start')) - if groups.get('end'): - self.cdna_end = CDNACoord(string=groups.get('end')) - else: - self.cdna_end = CDNACoord(string=groups.get('start')) - - # Parse alleles. - self.ref_allele = groups.get('ref', '') - self.alt_allele = groups.get('alt', '') - - # Convert numerical allelles. - if self.ref_allele.isdigit(): - self.ref_allele = "N" * int(self.ref_allele) - if self.alt_allele.isdigit(): - self.alt_allele = "N" * int(self.alt_allele) - - # Convert duplication alleles. - if self.mutation_type == "dup": - self.alt_allele = self.ref_allele * 2 - - # Convert no match alleles. - if self.mutation_type == "=": - self.alt_allele = self.ref_allele - return - - raise InvalidHGVSName(details, 'cDNA allele') - - def parse_protein(self, details): - """ - Parse a HGVS protein name. - - Some examples include: - No change: Glu1161= - Change: Glu1161Ser - Frameshift: Glu1161_Ser1164?fs - """ - for regex in HGVSRegex.PEP_ALLELE_REGEXES: - match = re.match(regex, details) - if match: - groups = match.groupdict() - - # Parse mutation type. - if groups.get('delins'): - self.mutation_type = 'delins' - else: - self.mutation_type = '>' - - # Parse coordinates. - self.start = int(groups.get('start')) - if groups.get('end'): - self.end = int(groups.get('end')) - else: - self.end = self.start - - # Parse alleles. - self.ref_allele = groups.get('ref', '') - if groups.get('ref2'): - self.ref2_allele = groups.get('ref2') - self.alt_allele = groups.get('alt', '') - else: - # If alt is not given, assume matching with ref - self.ref2_allele = self.ref_allele - self.alt_allele = groups.get( - 'alt', self.ref_allele) - - self.pep_extra = groups.get('extra') - return - - raise InvalidHGVSName(details, 'protein allele') - - def parse_genome(self, details): - """ - Parse a HGVS genomic name. - - Som examples include: - Substitution: 1000100A>T - Indel: 1000100_1000102delATG - """ - - for regex in HGVSRegex.GENOMIC_ALLELE_REGEXES: - match = re.match(regex, details) - if match: - groups = match.groupdict() - - # Parse mutation type. - if groups.get('delins'): - self.mutation_type = 'delins' - else: - self.mutation_type = groups['mutation_type'] - - # Parse coordinates. - self.start = int(groups.get('start')) - if groups.get('end'): - self.end = int(groups.get('end')) - else: - self.end = self.start - - # Parse alleles. - self.ref_allele = groups.get('ref', '') - self.alt_allele = groups.get('alt', '') - - # Convert numerical allelles. - if self.ref_allele.isdigit(): - self.ref_allele = "N" * int(self.ref_allele) - if self.alt_allele.isdigit(): - self.alt_allele = "N" * int(self.alt_allele) - - # Convert duplication alleles. - if self.mutation_type == "dup": - self.alt_allele = self.ref_allele * 2 - - # Convert no match alleles. - if self.mutation_type == "=": - self.alt_allele = self.ref_allele - return - - raise InvalidHGVSName(details, 'genomic allele') - - def _validate(self): - """ - Check for internal inconsistencies in representation - """ - if self.start > self.end: - raise InvalidHGVSName(reason="Coordinates are nonincreasing") - - def __repr__(self): - try: - return "HGVSName('%s')" % self.format() - except NotImplementedError: - return "HGVSName('%s')" % self.name - - def __unicode__(self): - return self.format() - - def format(self, use_prefix=True, use_gene=True, use_counsyl=False): - """Generate a HGVS name as a string.""" - - if self.kind == 'c': - allele = 'c.' + self.format_cdna() - elif self.kind == 'p': - allele = 'p.' + self.format_protein() - elif self.kind == 'g': - allele = 'g.' + self.format_genome() - else: - raise NotImplementedError("not implemented: '%s'" % self.kind) - - prefix = self.format_prefix(use_gene=use_gene) if use_prefix else '' - - if prefix: - return prefix + ':' + allele - else: - return allele - - def format_prefix(self, use_gene=True): - """ - Generate HGVS trancript/gene prefix. - - Some examples of full hgvs names with transcript include: - NM_007294.3:c.2207A>C - NM_007294.3(BRCA1):c.2207A>C - """ - - if self.kind == 'g': - if self.chrom: - return self.chrom - - if self.transcript: - if use_gene and self.gene: - return '%s(%s)' % (self.transcript, self.gene) - else: - return self.transcript - else: - if use_gene: - return self.gene - else: - return '' - - def format_cdna_coords(self): - """ - Generate HGVS cDNA coordinates string. - """ - # Format coordinates. - if self.cdna_start == self.cdna_end: - return str(self.cdna_start) - else: - return "%s_%s" % (self.cdna_start, self.cdna_end) - - def format_dna_allele(self): - """ - Generate HGVS DNA allele. - """ - if self.mutation_type == '=': - # No change. - # example: 101A= - return self.ref_allele + '=' - - if self.mutation_type == '>': - # SNP. - # example: 101A>C - return self.ref_allele + '>' + self.alt_allele - - elif self.mutation_type == 'delins': - # Indel. - # example: 112_117delAGGTCAinsTG, 112_117delinsTG - return 'del' + self.ref_allele + 'ins' + self.alt_allele - - elif self.mutation_type in ('del', 'dup'): - # Delete, duplication. - # example: 1000_1003delATG, 1000_1003dupATG - return self.mutation_type + self.ref_allele - - elif self.mutation_type == 'ins': - # Insert. - # example: 1000_1001insATG - return self.mutation_type + self.alt_allele - - elif self.mutation_type == 'inv': - return self.mutation_type - - else: - raise AssertionError( - "unknown mutation type: '%s'" % self.mutation_type) - - def format_cdna(self): - """ - Generate HGVS cDNA allele. - - Some examples include: - Substitution: 101A>C, - Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG - """ - return self.format_cdna_coords() + self.format_dna_allele() - - def format_protein(self): - """ - Generate HGVS protein name. - - Some examples include: - No change: Glu1161= - Change: Glu1161Ser - Frameshift: Glu1161_Ser1164?fs - """ - if (self.start == self.end and - self.ref_allele == self.ref2_allele == self.alt_allele): - # Match. - # Example: Glu1161= - pep_extra = self.pep_extra if self.pep_extra else '=' - return self.ref_allele + str(self.start) + pep_extra - - elif (self.start == self.end and - self.ref_allele == self.ref2_allele and - self.ref_allele != self.alt_allele): - # Change. - # Example: Glu1161Ser - return (self.ref_allele + str(self.start) + - self.alt_allele + self.pep_extra) - - elif self.start != self.end: - # Range change. - # Example: Glu1161_Ser1164?fs - return (self.ref_allele + str(self.start) + '_' + - self.ref2_allele + str(self.end) + - self.pep_extra) - - else: - raise NotImplementedError('protein name formatting.') - - def format_coords(self): - """ - Generate HGVS cDNA coordinates string. - """ - # Format coordinates. - if self.start == self.end: - return str(self.start) - else: - return "%s_%s" % (self.start, self.end) - - def format_genome(self): - """ - Generate HGVS genomic allele. - - Som examples include: - Substitution: 1000100A>T - Indel: 1000100_1000102delATG - """ - return self.format_coords() + self.format_dna_allele() - - def get_coords(self, transcript=None): - """Return genomic coordinates of reference allele.""" - if self.kind == 'c': - chrom = transcript.tx_position.chrom - start = cdna_to_genomic_coord(transcript, self.cdna_start) - end = cdna_to_genomic_coord(transcript, self.cdna_end) - - if not transcript.tx_position.is_forward_strand: - if end > start: - raise AssertionError( - "cdna_start cannot be greater than cdna_end") - start, end = end, start - else: - if start > end: - raise AssertionError( - "cdna_start cannot be greater than cdna_end") - - if self.mutation_type == "ins": - # Inserts have empty interval. - if start < end: - start += 1 - end -= 1 - else: - end = start - 1 - - elif self.mutation_type == "dup": - end = start - 1 - - elif self.kind == 'g': - chrom = self.chrom - start = self.start - end = self.end - - else: - raise NotImplementedError( - 'Coordinates are not available for this kind of HGVS name "%s"' - % self.kind) - - return chrom, start, end - - def get_vcf_coords(self, transcript=None): - """Return genomic coordinates of reference allele in VCF-style.""" - chrom, start, end = self.get_coords(transcript) - - # Inserts and deletes require left-padding by 1 base - if self.mutation_type in ("=", ">"): - pass - elif self.mutation_type in ("del", "ins", "dup", "delins"): - # Indels have left-padding. - start -= 1 - else: - raise NotImplementedError("Unknown mutation_type '%s'" % - self.mutation_type) - return chrom, start, end - - def get_ref_alt(self, is_forward_strand=True): - """Return reference and alternate alleles.""" - if self.kind == 'p': - raise NotImplementedError( - 'get_ref_alt is not implemented for protein HGVS names') - alleles = [self.ref_allele, self.alt_allele] - - # Represent duplications are inserts. - if self.mutation_type == "dup": - alleles[0] = "" - alleles[1] = alleles[1][:len(alleles[1]) // 2] - - if is_forward_strand: - return alleles - else: - return tuple(map(revcomp, alleles)) - - def hgvs_justify_dup(chrom, offset, ref, alt, genome): """ Determines if allele is a duplication and justifies. @@ -1290,8 +155,10 @@ def hgvs_justify_indel(chrom, offset, ref, alt, strand, genome): return chrom, offset, ref, alt # Get genomic sequence around the lesion. - start = max(offset - 100, 0) - end = offset + 100 + window_size = 100 + size = window_size + max(len(ref), len(alt)) + start = max(offset - size, 0) + end = offset + size seq = str(genome[str(chrom)][start - 1:end]).upper() cds_offset = offset - start @@ -1344,7 +211,8 @@ def hgvs_normalize_variant(chrom, offset, ref, alt, genome, transcript=None): def parse_hgvs_name(hgvs_name, genome, transcript=None, get_transcript=lambda name: None, - flank_length=30, normalize=True, lazy=False): + flank_length=30, normalize=True, lazy=False, + indels_start_with_same_base=True): """ Parse an HGVS name into (chrom, start, end, ref, alt) @@ -1353,15 +221,16 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None, transcript: Transcript corresponding to HGVS name. normalize: If True, normalize allele according to VCF standard. lazy: If True, discard version information from incoming transcript/gene. + indels_start_with_same_base: When normalizing, don't strip common prefix from indels """ hgvs = HGVSName(hgvs_name) # Determine transcript. - if hgvs.kind == 'c' and not transcript: + if hgvs.kind in ('c', 'n') and not transcript: if '.' in hgvs.transcript and lazy: - hgvs.transcript, version = hgvs.transcript.split('.') + hgvs.transcript, _ = hgvs.transcript.split('.') elif '.' in hgvs.gene and lazy: - hgvs.gene, version = hgvs.gene.split('.') + hgvs.gene, _ = hgvs.gene.split('.') if get_transcript: if hgvs.transcript: transcript = get_transcript(hgvs.transcript) @@ -1377,11 +246,12 @@ def parse_hgvs_name(hgvs_name, genome, transcript=None, transcript.tx_position.chrom_stop, hgvs.transcript) - chrom, start, end, ref, alt = get_vcf_allele(hgvs, genome, transcript) + chrom, start, _, ref, alt = get_vcf_allele(hgvs, genome, transcript) if normalize: - chrom, start, ref, [alt] = normalize_variant( - chrom, start, ref, [alt], genome, - flank_length=flank_length).variant + nv = normalize_variant(chrom, start, ref, [alt], genome, + flank_length=flank_length, + indels_start_with_same_base=indels_start_with_same_base) + chrom, start, ref, [alt] = nv.variant return (chrom, start, ref, alt) @@ -1408,35 +278,39 @@ def variant_to_hgvs_name(chrom, offset, ref, alt, genome, transcript, hgvs = HGVSName() # Populate coordinates. + if mutation_type == 'ins': + # Insert uses coordinates around the insert point. + offset_start = offset - 1 + offset_end = offset + else: + offset_start = offset + offset_end = offset + len(ref) - 1 + if not transcript: # Use genomic coordinate when no transcript is available. hgvs.kind = 'g' - hgvs.start = offset - hgvs.end = offset + len(ref) - 1 + hgvs.start = offset_start + hgvs.end = offset_end else: # Use cDNA coordinates. - hgvs.kind = 'c' + if transcript.is_coding: + hgvs.kind = 'c' + else: + hgvs.kind = 'n' + is_single_base_indel = ( (mutation_type == 'ins' and len(alt) == 1) or (mutation_type in ('del', 'delins', 'dup') and len(ref) == 1)) if mutation_type == '>' or (use_counsyl and is_single_base_indel): # Use a single coordinate. - hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset) + hgvs.cdna_start = transcript.genomic_to_cdna_coord(offset) hgvs.cdna_end = hgvs.cdna_start else: - # Use a range of coordinates. - if mutation_type == 'ins': - # Insert uses coordinates around the insert point. - offset_start = offset - 1 - offset_end = offset - else: - offset_start = offset - offset_end = offset + len(ref) - 1 if transcript.strand == '-': offset_start, offset_end = offset_end, offset_start - hgvs.cdna_start = genomic_to_cdna_coord(transcript, offset_start) - hgvs.cdna_end = genomic_to_cdna_coord(transcript, offset_end) + hgvs.cdna_start = transcript.genomic_to_cdna_coord(offset_start) + hgvs.cdna_end = transcript.genomic_to_cdna_coord(offset_end) # Populate prefix. if transcript: diff --git a/pyhgvs/models.py b/pyhgvs/models.py deleted file mode 100644 index b26b8c4..0000000 --- a/pyhgvs/models.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Models for representing genomic elements. -""" - -from __future__ import unicode_literals - -from collections import namedtuple - - -class Position(object): - """A position in the genome.""" - - def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand): - self.chrom = chrom - self.chrom_start = chrom_start - self.chrom_stop = chrom_stop - self.is_forward_strand = is_forward_strand - - def __repr__(self): - return "" % ( - self.chrom, self.chrom_start, self.chrom_stop) - - -class Gene(object): - def __init__(self, name): - self.name = name - - -class Transcript(object): - """RefGene Transcripts for hg19 - - A gene may have multiple transcripts with different combinations of exons. - """ - - def __init__(self, name, version, gene, tx_position, cds_position, - is_default=False, exons=None): - self.name = name - self.version = version - self.gene = Gene(gene) - self.tx_position = tx_position - self.cds_position = cds_position - self.is_default = is_default - self.exons = exons if exons else [] - - @property - def full_name(self): - if self.version is not None: - return '%s.%d' % (self.name, self.version) - else: - return self.name - - @property - def is_coding(self): - # Coding transcripts have CDS with non-zero length. - return (self.cds_position.chrom_stop - - self.cds_position.chrom_start > 0) - - @property - def strand(self): - return ('+' if self.tx_position.is_forward_strand else '-') - - @property - def coding_exons(self): - return [exon.get_as_interval(coding_only=True) - for exon in self.exons] - - -BED6Interval_base = namedtuple( - "BED6Interval_base", ( - "chrom", - "chrom_start", - "chrom_end", - "name", - "score", - "strand")) - - -class BED6Interval(BED6Interval_base): - def distance(self, offset): - """Return the distance to the interval. - - if offset is inside the exon, distance is zero. - otherwise, distance is the distance to the nearest edge. - - distance is positive if the exon comes after the offset. - distance is negative if the exon comes before the offset. - """ - - start = self.chrom_start + 1 - end = self.chrom_end - - if start <= offset <= end: - return 0 - - start_distance = start - offset - end_distance = offset - end - - if abs(start_distance) < abs(end_distance): - return start_distance - else: - return -end_distance - - -class Exon(object): - def __init__(self, transcript, tx_position, exon_number): - self.transcript = transcript - self.tx_position = tx_position - self.exon_number = exon_number - - @property - def get_exon_name(self): - return "%s.%d" % (self.transcript.name, self.exon_number) - - def get_as_interval(self, coding_only=False): - """Returns the coding region for this exon as a BED6Interval. - - This function returns a BED6Interval objects containing position - information for this exon. This may be used as input for - pybedtools.create_interval_from_list() after casting chrom_start - and chrom_end as strings. - - coding_only: only include exons in the coding region - - """ - - exon_start = self.tx_position.chrom_start - exon_stop = self.tx_position.chrom_stop - - # Get only exon coding region if requested - if coding_only: - if (exon_stop <= self.transcript.cds_position.chrom_start or - exon_start >= self.transcript.cds_position.chrom_stop): - return None - exon_start = max(exon_start, - self.transcript.cds_position.chrom_start) - exon_stop = min( - max(exon_stop, self.transcript.cds_position.chrom_start), - self.transcript.cds_position.chrom_stop) - - return BED6Interval( - self.tx_position.chrom, - exon_start, - exon_stop, - self.get_exon_name, - '.', - self.strand, - ) - - @property - def strand(self): - strand = '+' if self.tx_position.is_forward_strand else '-' - return strand diff --git a/pyhgvs/models/__init__.py b/pyhgvs/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyhgvs/models/cdna.py b/pyhgvs/models/cdna.py new file mode 100644 index 0000000..bc1280f --- /dev/null +++ b/pyhgvs/models/cdna.py @@ -0,0 +1,123 @@ +import re + +CDNA_START_CODON = 'cdna_start' +CDNA_STOP_CODON = 'cdna_stop' + + +class CDNACoord(object): + """ + A HGVS cDNA-based coordinate. + + A cDNA coordinate can take one of these forms: + + N = nucleotide N in protein coding sequence (e.g. 11A>G) + + -N = nucleotide N 5' of the ATG translation initiation codon (e.g. -4A>G) + NOTE: so located in the 5'UTR or 5' of the transcription initiation + site (upstream of the gene, incl. promoter) + + *N = nucleotide N 3' of the translation stop codon (e.g. *6A>G) + NOTE: so located in the 3'UTR or 3' of the polyA-addition site + (including downstream of the gene) + + N+M = nucleotide M in the intron after (3' of) position N in the coding DNA + reference sequence (e.g. 30+4A>G) + + N-M = nucleotide M in the intron before (5' of) position N in the coding + DNA reference sequence (e.g. 301-2A>G) + + -N+M / -N-M = nucleotide in an intron in the 5'UTR (e.g. -45+4A>G) + + *N+M / *N-M = nucleotide in an intron in the 3'UTR (e.g. *212-2A>G) + """ + + def __init__(self, coord=0, offset=0, landmark=CDNA_START_CODON, + string=''): + """ + coord: main coordinate along cDNA on the same strand as the transcript + + offset: an additional genomic offset from the main coordinate. This + allows referencing non-coding (e.g. intronic) positions. + Offset is also interpreted on the coding strand. + + landmark: ('cdna_start', 'cdna_stop') indicating that 'coord' + is relative to one of these landmarks. + + string: a coordinate from an HGVS name. If given coord, offset, and + landmark should not be specified. + """ + + if string: + if coord != 0 or offset != 0 or landmark != CDNA_START_CODON: + raise ValueError("coord, offset, and landmark should not " + "be given with string argument") + + self.parse(string) + else: + self.coord = coord + self.offset = offset + self.landmark = landmark + + def parse(self, coord_text): + """ + Parse a HGVS formatted cDNA coordinate. + """ + + match = re.match(r"(|-|\*)(\d+)((-|\+)(\d+))?", coord_text) + if not match: + raise ValueError("unknown coordinate format '%s'" % coord_text) + coord_prefix, coord, _, offset_prefix, offset = match.groups() + + self.coord = int(coord) + self.offset = int(offset) if offset else 0 + + if offset_prefix == '-': + self.offset *= -1 + elif offset_prefix == '+' or offset is None: + pass + else: + raise ValueError("unknown offset_prefix '%s'" % offset_prefix) + + if coord_prefix == '': + self.landmark = CDNA_START_CODON + elif coord_prefix == "-": + self.coord *= -1 + self.landmark = CDNA_START_CODON + elif coord_prefix == '*': + self.landmark = CDNA_STOP_CODON + else: + raise ValueError("unknown coord_prefix '%s'" % coord_prefix) + return self + + def __str__(self): + """ + Return a formatted cDNA coordinate + """ + if self.landmark == CDNA_STOP_CODON: + coord_prefix = '*' + else: + coord_prefix = '' + + if self.offset < 0: + offset = '%d' % self.offset + elif self.offset > 0: + offset = '+%d' % self.offset + else: + offset = '' + + return '%s%d%s' % (coord_prefix, self.coord, offset) + + def __eq__(self, other): + """Equality operator.""" + return ((self.coord, self.offset, self.landmark) == + (other.coord, other.offset, other.landmark)) + + def __repr__(self): + """ + Returns a string representation of a cDNA coordinate. + """ + if self.landmark != CDNA_START_CODON: + return "CDNACoord(%d, %d, '%s')" % ( + self.coord, self.offset, self.landmark) + else: + return "CDNACoord(%d, %d)" % (self.coord, self.offset) diff --git a/pyhgvs/models/genome.py b/pyhgvs/models/genome.py new file mode 100644 index 0000000..4cfc87d --- /dev/null +++ b/pyhgvs/models/genome.py @@ -0,0 +1,47 @@ +class ChromosomeSubset(object): + """ + Allow direct access to a subset of the chromosome. + """ + + def __init__(self, name, genome=None): + self.name = name + self.genome = genome + + def __getitem__(self, key): + """Return sequence from region [start, end) + + Coordinates are 0-based, end-exclusive.""" + if isinstance(key, slice): + start, end = (key.start, key.stop) + start -= self.genome.start + end -= self.genome.start + return self.genome.genome[self.genome.seqid][start:end] + else: + raise TypeError('Expected a slice object but ' + 'received a {0}.'.format(type(key))) + + def __repr__(self): + return 'ChromosomeSubset("%s")' % self.name + + +class GenomeSubset(object): + """ + Allow the direct access of a subset of the genome. + """ + + def __init__(self, genome, chrom, start, end, seqid): + self.genome = genome + self.chrom = chrom + self.start = start + self.end = end + self.seqid = seqid + self._chroms = {} + + def __getitem__(self, chrom): + """Return a chromosome by its name.""" + if chrom in self._chroms: + return self._chroms[chrom] + else: + chromosome = ChromosomeSubset(chrom, self) + self._chroms[chrom] = chromosome + return chromosome \ No newline at end of file diff --git a/pyhgvs/models/hgvs_name.py b/pyhgvs/models/hgvs_name.py new file mode 100644 index 0000000..6d6cc46 --- /dev/null +++ b/pyhgvs/models/hgvs_name.py @@ -0,0 +1,814 @@ +""" +HGVS language currently implemented. + +HGVS = ALLELE + | PREFIX_NAME : ALLELE + +PREFIX_NAME = TRANSCRIPT + | TRANSCRIPT '(' GENE ')' + +TRANSCRIPT = TRANSCRIPT_NAME + | TRANSCRIPT_NAME '.' TRANSCRIPT_VERSION + +TRANSCRIPT_VERSION = NUMBER + +ALLELE = 'c.' CDNA_ALLELE # cDNA + | 'g.' GENOMIC_ALLELE # genomic + | 'm.' MIT_ALLELE # mitochondrial sequence + | 'n.' NC_ALLELE # non-coding RNA reference sequence + | 'r.' RNA_ALLELE # RNA sequence (like r.76a>u) + | 'p.' PROTEIN_ALLELE # protein sequence (like p.Lys76Asn) + +NC_ALLELE = +RNA_ALLELE = +CDNA_ALLELE = CDNA_COORD SINGLE_BASE_CHANGE + | CDNA_COORD_RANGE MULTI_BASE_CHANGE + +GENOMIC_ALLELE = +MIT_ALLELE = COORD SINGLE_BASE_CHANGE + | COORD_RANGE MULTI_BASE_CHANGE + +SINGLE_BASE_CHANGE = CDNA_ALLELE = CDNA_COORD BASE '=' # no change + | CDNA_COORD BASE '>' BASE # substitution + | CDNA_COORD 'ins' BASE # 1bp insertion + | CDNA_COORD 'del' BASE # 1bp deletion + | CDNA_COORD 'dup' BASE # 1bp duplication + | CDNA_COORD 'ins' # 1bp insertion + | CDNA_COORD 'del' # 1bp deletion + | CDNA_COORD 'dup' # 1bp duplication + | CDNA_COORD 'del' BASE 'ins' BASE # 1bp indel + | CDNA_COORD 'delins' BASE # 1bp indel + +MULTI_BASE_CHANGE = COORD_RANGE 'del' BASES # deletion + | COORD_RANGE 'ins' BASES # insertion + | COORD_RANGE 'dup' BASES # duplication + | COORD_RANGE 'del' # deletion + | COORD_RANGE 'dup' # duplication + | COORD_RANGE 'del' BASES 'ins' BASES # indel + | COORD_RANGE 'delins' BASES # indel + + +AMINO1 = [GAVLIMFWPSTCYNQDEKRH] + +AMINO3 = 'Gly' | 'Ala' | 'Val' | 'Leu' | 'Ile' | 'Met' | 'Phe' | 'Trp' | 'Pro' + | 'Ser' | 'Thr' | 'Cys' | 'Tyr' | 'Asn' | 'Gln' | 'Asp' | 'Glu' | 'Lys' + | 'Arg' | 'His' + +PROTEIN_ALLELE = AMINO3 COORD '=' # no peptide change + | AMINO1 COORD '=' # no peptide change + | AMINO3 COORD AMINO3 PEP_EXTRA # peptide change + | AMINO1 COORD AMINO1 PEP_EXTRA # peptide change + | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA # indel + | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA # indel + | AMINO3 COORD '_' AMINO3 COORD PEP_EXTRA AMINO3 # indel + | AMINO1 COORD '_' AMINO1 COORD PEP_EXTRA AMINO1 # indel + +# A genomic range: +COORD_RANGE = COORD '_' COORD + +# A cDNA range: +CDNA_COORD_RANGE = CDNA_COORD '_' CDNA_COORD + +# A cDNA coordinate: +CDNA_COORD = COORD_PREFIX COORD + | COORD_PREFIX COORD OFFSET_PREFIX OFFSET +COORD_PREFIX = '' | '-' | '*' +COORD = NUMBER +OFFSET_PREFIX = '-' | '+' +OFFSET = NUMBER + +# Primatives: +NUMBER = \d+ +BASE = [ACGT] +BASES = BASE+ + +""" + +import re + +from .cdna import CDNACoord +from .variants import revcomp + +CHROM_PREFIX = 'chr' + + +class HGVSRegex(object): + """ + All regular expression for HGVS names. + """ + + # DNA syntax + # http://www.hgvs.org/mutnomen/standards.html#nucleotide + BASE = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]|\d+" + BASES = "[acgtbdhkmnrsvwyACGTBDHKMNRSVWY]+|\d+" + DNA_REF = "(?P" + BASES + ")" + DNA_ALT = "(?P" + BASES + ")" + + # Mutation types + EQUAL = "(?P=)" + SUB = "(?P>)" + INS = "(?Pins)" + DEL = "(?Pdel)" + DUP = "(?Pdup)" + + # Simple coordinate syntax + COORD_START = "(?P\d+)" + COORD_END = "(?P\d+)" + COORD_RANGE = COORD_START + "_" + COORD_END + + # cDNA coordinate syntax + CDNA_COORD = ("(?P|-|\*)(?P\d+)" + "((?P-|\+)(?P\d+))?") + CDNA_START = ("(?P(?P|-|\*)(?P\d+)" + "((?P-|\+)(?P\d+))?)") + CDNA_END = (r"(?P(?P|-|\*)(?P\d+)" + "((?P-|\+)(?P\d+))?)") + CDNA_RANGE = CDNA_START + "_" + CDNA_END + + # cDNA allele syntax + CDNA_ALLELE = [ + # No change + CDNA_START + EQUAL, + CDNA_START + DNA_REF + EQUAL, + + # Substitution + CDNA_START + DNA_REF + SUB + DNA_ALT, + + # 1bp insertion, deletion, duplication + CDNA_START + INS + DNA_ALT, + CDNA_START + DEL + DNA_REF, + CDNA_START + DUP + DNA_REF, + CDNA_START + DEL, + CDNA_START + DUP, + + # Insertion, deletion, duplication + CDNA_RANGE + INS + DNA_ALT, + CDNA_RANGE + DEL + DNA_REF, + CDNA_RANGE + DUP + DNA_REF, + CDNA_RANGE + DEL, + CDNA_RANGE + DUP, + + # Indels + "(?P" + CDNA_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + CDNA_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + CDNA_START + 'delins' + DNA_ALT + ")", + "(?P" + CDNA_RANGE + 'delins' + DNA_ALT + ")", + ] + + CDNA_ALLELE_REGEXES = [re.compile("^" + regex + "$") + for regex in CDNA_ALLELE] + + # Peptide syntax + PEP = "([A-Z]([a-z]{2}))+" + PEP_REF = "(?P" + PEP + ")" + PEP_REF2 = "(?P" + PEP + ")" + PEP_ALT = "(?P" + PEP + ")" + + PEP_EXTRA = "(?P(|=|\?)(|fs))" + + # Peptide allele syntax + PEP_ALLELE = [ + # No peptide change + # Example: Glu1161= + PEP_REF + COORD_START + PEP_EXTRA, + + # Peptide change + # Example: Glu1161Ser + PEP_REF + COORD_START + PEP_ALT + PEP_EXTRA, + + # Peptide indel + # Example: Glu1161_Ser1164?fs + "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + + PEP_EXTRA + ")", + "(?P" + PEP_REF + COORD_START + "_" + PEP_REF2 + COORD_END + + PEP_ALT + PEP_EXTRA + ")", + ] + + PEP_ALLELE_REGEXES = [re.compile("^" + regex + "$") + for regex in PEP_ALLELE] + + # Genomic allele syntax + GENOMIC_ALLELE = [ + # No change + COORD_START + EQUAL, + COORD_START + DNA_REF + EQUAL, + + # Substitution + COORD_START + DNA_REF + SUB + DNA_ALT, + + # 1bp insertion, deletion, duplication + COORD_START + INS + DNA_ALT, + COORD_START + DEL + DNA_REF, + COORD_START + DUP + DNA_REF, + COORD_START + DEL, + COORD_START + DUP, + + # Insertion, deletion, duplication + COORD_RANGE + EQUAL, + COORD_RANGE + INS + DNA_ALT, + COORD_RANGE + DEL + DNA_REF, + COORD_RANGE + DUP + DNA_REF, + COORD_RANGE + DEL, + COORD_RANGE + DUP, + + # Indels + "(?P" + COORD_START + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + COORD_RANGE + 'del' + DNA_REF + 'ins' + DNA_ALT + ")", + "(?P" + COORD_START + 'delins' + DNA_ALT + ")", + "(?P" + COORD_RANGE + 'delins' + DNA_ALT + ")", + ] + + GENOMIC_ALLELE_REGEXES = [re.compile("^" + regex + "$") + for regex in GENOMIC_ALLELE] + + +# The RefSeq standard for naming contigs/transcripts/proteins: +# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8 +REFSEQ_PREFIXES = [ + ('AC_', 'genomic', + 'Complete genomic molecule, usually alternate assembly'), + ('NC_', 'genomic', + 'Complete genomic molecule, usually reference assembly'), + ('NG_', 'genomic', 'Incomplete genomic region'), + ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'), + ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'), + ('NS_', 'genomic', 'Environmental sequence'), + ('NZ_', 'genomic', 'Unfinished WGS'), + ('NM_', 'mRNA', ''), + ('NR_', 'RNA', ''), + ('XM_', 'mRNA', 'Predicted model'), + ('XR_', 'RNA', 'Predicted model'), + ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'), + ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'), + ('YP_', 'Protein', ''), + ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'), + ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'), +] + + +REFSEQ_PREFIX_LOOKUP = dict( + (prefix, (kind, description)) + for prefix, kind, description in REFSEQ_PREFIXES +) + + +def get_refseq_type(name): + """ + Return the RefSeq type for a refseq name. + """ + prefix = name[:3] + return REFSEQ_PREFIX_LOOKUP.get(prefix, (None, ''))[0] + + +class InvalidHGVSName(ValueError): + def __init__(self, name='', part='name', reason=''): + if name: + message = 'Invalid HGVS %s "%s"' % (part, name) + else: + message = 'Invalid HGVS %s' % part + if reason: + message += ': ' + reason + super(InvalidHGVSName, self).__init__(message) + + self.name = name + self.part = part + self.reason = reason + + +class HGVSName(object): + """ + Represents a HGVS variant name. + """ + + def __init__(self, name='', prefix='', chrom='', transcript='', gene='', + kind='', mutation_type=None, start=0, end=0, ref_allele='', + ref2_allele='', alt_allele='', + cdna_start=None, cdna_end=None, pep_extra=''): + + # Full HGVS name. + self.name = name + + # Name parts. + self.prefix = prefix + self.chrom = chrom + self.transcript = transcript + self.gene = gene + self.kind = kind + self.mutation_type = mutation_type + self.start = start + self.end = end + self.ref_allele = ref_allele # reference allele + self.ref2_allele = ref2_allele # reference allele at end of pep indel + self.alt_allele = alt_allele # alternate allele + + # cDNA-specific fields + self.cdna_start = cdna_start if cdna_start else CDNACoord() + self.cdna_end = cdna_end if cdna_end else CDNACoord() + + # Protein-specific fields + self.pep_extra = pep_extra + + if name: + self.parse(name) + + def parse(self, name): + """Parse a HGVS name.""" + # Does HGVS name have transcript/gene prefix? + if ':' in name: + prefix, allele = name.split(':', 1) + else: + prefix = '' + allele = name + + self.name = name + + # Parse prefix and allele. + self.parse_allele(allele) + self.parse_prefix(prefix, self.kind) + self._validate() + + def parse_prefix(self, prefix, kind): + """ + Parse a HGVS prefix (gene/transcript/chromosome). + + Some examples of full hgvs names with transcript include: + NM_007294.3:c.2207A>C + NM_007294.3(BRCA1):c.2207A>C + BRCA1{NM_007294.3}:c.2207A>C + """ + + self.prefix = prefix + + # No prefix. + if prefix == '': + self.chrom = '' + self.transcript = '' + self.gene = '' + return + + # Transcript and gene given with parens. + # example: NM_007294.3(BRCA1):c.2207A>C + match = re.match(r"^(?P[^(]+)\((?P[^)]+)\)$", prefix) + if match: + self.transcript = match.group('transcript') + self.gene = match.group('gene') + return + + # Transcript and gene given with braces. + # example: BRCA1{NM_007294.3}:c.2207A>C + match = re.match(r"^(?P[^{]+){(?P[^}]+)}$", prefix) + if match: + self.transcript = match.group('transcript') + self.gene = match.group('gene') + return + + # Determine using Ensembl type. + if prefix.startswith('ENST'): + self.transcript = prefix + return + + # Determine using LRG type. + if prefix.startswith('LRG_'): + self.transcript = prefix + return + + # Determine using refseq type. + refseq_type = get_refseq_type(prefix) + if refseq_type in ('mRNA', 'RNA'): + self.transcript = prefix + return + + # Determine using refseq type. + if prefix.startswith(CHROM_PREFIX) or refseq_type == 'genomic': + self.chrom = prefix + return + + # Assume gene name. + self.gene = prefix + + def parse_allele(self, allele): + """ + Parse a HGVS allele description. + + Some examples include: + cDNA substitution: c.101A>C, + cDNA indel: c.3428delCinsTA, c.1000_1003delATG, c.1000_1001insATG + No protein change: p.Glu1161= + Protein change: p.Glu1161Ser + Protein frameshift: p.Glu1161_Ser1164?fs + Genomic substitution: g.1000100A>T + Genomic indel: g.1000100_1000102delATG + """ + if '.' not in allele: + raise InvalidHGVSName(allele, 'allele', + 'expected kind "c.", "p.", "g.", etc') + + # Determine HGVS name kind. + kind, details = allele.split('.', 1) + self.kind = kind + self.mutation_type = None + + if kind in ("c", 'n'): + self.parse_cdna(details) + if kind == 'n': # Ensure no 3'UTR or 5'UTR coords in non-coding + if self.cdna_start.coord < 0: + raise InvalidHGVSName(allele, 'allele', + "Non-coding transcript cannot contain negative (5'UTR) coordinates") + if self.cdna_start.landmark == 'cdna_stop' or self.cdna_end and self.cdna_end.landmark == 'cdna_stop': + raise InvalidHGVSName(allele, 'allele', + "Non-coding transcript cannot contain '*' (3'UTR) coordinates") + elif kind == "p": + self.parse_protein(details) + elif kind in ("g", 'm'): + self.parse_genome(details) + else: + raise NotImplementedError("unknown kind: %s" % allele) + + def parse_cdna(self, details): + """ + Parse a HGVS cDNA name. + + Some examples include: + Substitution: 101A>C, + Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG + """ + for regex in HGVSRegex.CDNA_ALLELE_REGEXES: + match = re.match(regex, details) + if match: + groups = match.groupdict() + + # Parse mutation type. + if groups.get('delins'): + self.mutation_type = 'delins' + else: + self.mutation_type = groups['mutation_type'] + + # Parse coordinates. + self.cdna_start = CDNACoord(string=groups.get('start')) + if groups.get('end'): + self.cdna_end = CDNACoord(string=groups.get('end')) + else: + self.cdna_end = CDNACoord(string=groups.get('start')) + + # Parse alleles. + self.ref_allele = groups.get('ref', '') + self.alt_allele = groups.get('alt', '') + + # Convert numerical allelles. + if self.ref_allele.isdigit(): + self.ref_allele = "N" * int(self.ref_allele) + if self.alt_allele.isdigit(): + self.alt_allele = "N" * int(self.alt_allele) + + # Convert duplication alleles. + if self.mutation_type == "dup": + self.alt_allele = self.ref_allele * 2 + + # Convert no match alleles. + if self.mutation_type == "=": + self.alt_allele = self.ref_allele + return + + raise InvalidHGVSName(details, 'cDNA allele') + + def parse_protein(self, details): + """ + Parse a HGVS protein name. + + Some examples include: + No change: Glu1161= + Change: Glu1161Ser + Frameshift: Glu1161_Ser1164?fs + """ + for regex in HGVSRegex.PEP_ALLELE_REGEXES: + match = re.match(regex, details) + if match: + groups = match.groupdict() + + # Parse mutation type. + if groups.get('delins'): + self.mutation_type = 'delins' + else: + self.mutation_type = '>' + + # Parse coordinates. + self.start = int(groups.get('start')) + if groups.get('end'): + self.end = int(groups.get('end')) + else: + self.end = self.start + + # Parse alleles. + self.ref_allele = groups.get('ref', '') + if groups.get('ref2'): + self.ref2_allele = groups.get('ref2') + self.alt_allele = groups.get('alt', '') + else: + # If alt is not given, assume matching with ref + self.ref2_allele = self.ref_allele + self.alt_allele = groups.get( + 'alt', self.ref_allele) + + self.pep_extra = groups.get('extra') + return + + raise InvalidHGVSName(details, 'protein allele') + + def parse_genome(self, details): + """ + Parse a HGVS genomic name. + + Som examples include: + Substitution: 1000100A>T + Indel: 1000100_1000102delATG + """ + + for regex in HGVSRegex.GENOMIC_ALLELE_REGEXES: + match = re.match(regex, details) + if match: + groups = match.groupdict() + + # Parse mutation type. + if groups.get('delins'): + self.mutation_type = 'delins' + else: + self.mutation_type = groups['mutation_type'] + + # Parse coordinates. + self.start = int(groups.get('start')) + if groups.get('end'): + self.end = int(groups.get('end')) + else: + self.end = self.start + + # Parse alleles. + self.ref_allele = groups.get('ref', '') + self.alt_allele = groups.get('alt', '') + + # Convert numerical alleles. + if self.ref_allele.isdigit(): + self.ref_allele = "N" * int(self.ref_allele) + if self.alt_allele.isdigit(): + self.alt_allele = "N" * int(self.alt_allele) + + # Convert duplication alleles. + if self.mutation_type == "dup": + self.alt_allele = self.ref_allele * 2 + + # Convert no match alleles. + if self.mutation_type == "=": + self.alt_allele = self.ref_allele + return + + raise InvalidHGVSName(details, 'genomic allele') + + def _validate(self): + """ + Check for internal inconsistencies in representation + """ + if self.start > self.end: + raise InvalidHGVSName(reason="Coordinates are nonincreasing") + + def __repr__(self): + try: + return "HGVSName('%s')" % self.format() + except NotImplementedError: + return "HGVSName('%s')" % self.name + + def __unicode__(self): + return self.format() + + def format(self, use_prefix=True, use_gene=True, use_counsyl=False): + """Generate a HGVS name as a string.""" + + if self.kind in ('c', 'n'): + allele = self.kind + '.' + self.format_cdna() + elif self.kind == 'p': + allele = 'p.' + self.format_protein() + elif self.kind in ('g', 'm'): + allele = self.kind + '.' + self.format_genome() + else: + raise NotImplementedError("not implemented: '%s'" % self.kind) + + prefix = self.format_prefix(use_gene=use_gene) if use_prefix else '' + + if prefix: + return prefix + ':' + allele + else: + return allele + + def format_prefix(self, use_gene=True): + """ + Generate HGVS trancript/gene prefix. + + Some examples of full hgvs names with transcript include: + NM_007294.3:c.2207A>C + NM_007294.3(BRCA1):c.2207A>C + """ + + if self.kind in ('g', 'm'): + if self.chrom: + return self.chrom + + if self.transcript: + if use_gene and self.gene: + return '%s(%s)' % (self.transcript, self.gene) + else: + return self.transcript + else: + if use_gene: + return self.gene + else: + return '' + + def format_cdna_coords(self): + """ + Generate HGVS cDNA coordinates string. + """ + # Format coordinates. + if self.cdna_start == self.cdna_end: + return str(self.cdna_start) + else: + return "%s_%s" % (self.cdna_start, self.cdna_end) + + def format_dna_allele(self): + """ + Generate HGVS DNA allele. + """ + if self.mutation_type == '=': + # No change. + # example: 101A= + return self.ref_allele + '=' + + if self.mutation_type == '>': + # SNP. + # example: 101A>C + return self.ref_allele + '>' + self.alt_allele + + elif self.mutation_type == 'delins': + # Indel. + # example: 112_117delAGGTCAinsTG, 112_117delinsTG + return 'del' + self.ref_allele + 'ins' + self.alt_allele + + elif self.mutation_type in ('del', 'dup'): + # Delete, duplication. + # example: 1000_1003delATG, 1000_1003dupATG + return self.mutation_type + self.ref_allele + + elif self.mutation_type == 'ins': + # Insert. + # example: 1000_1001insATG + return self.mutation_type + self.alt_allele + + elif self.mutation_type == 'inv': + return self.mutation_type + + else: + raise AssertionError( + "unknown mutation type: '%s'" % self.mutation_type) + + def format_cdna(self): + """ + Generate HGVS cDNA allele. + + Some examples include: + Substitution: 101A>C, + Indel: 3428delCinsTA, 1000_1003delATG, 1000_1001insATG + """ + return self.format_cdna_coords() + self.format_dna_allele() + + def format_protein(self): + """ + Generate HGVS protein name. + + Some examples include: + No change: Glu1161= + Change: Glu1161Ser + Frameshift: Glu1161_Ser1164?fs + """ + if (self.start == self.end and + self.ref_allele == self.ref2_allele == self.alt_allele): + # Match. + # Example: Glu1161= + pep_extra = self.pep_extra if self.pep_extra else '=' + return self.ref_allele + str(self.start) + pep_extra + + elif (self.start == self.end and + self.ref_allele == self.ref2_allele and + self.ref_allele != self.alt_allele): + # Change. + # Example: Glu1161Ser + return (self.ref_allele + str(self.start) + + self.alt_allele + self.pep_extra) + + elif self.start != self.end: + # Range change. + # Example: Glu1161_Ser1164?fs + return (self.ref_allele + str(self.start) + '_' + + self.ref2_allele + str(self.end) + + self.pep_extra) + + else: + raise NotImplementedError('protein name formatting.') + + def format_coords(self): + """ + Generate HGVS cDNA coordinates string. + """ + # Format coordinates. + if self.start == self.end: + return str(self.start) + else: + return "%s_%s" % (self.start, self.end) + + def format_genome(self): + """ + Generate HGVS genomic allele. + + Som examples include: + Substitution: 1000100A>T + Indel: 1000100_1000102delATG + """ + return self.format_coords() + self.format_dna_allele() + + def get_raw_coords(self, transcript=None): + """ return genomic coordinates """ + if self.kind in ('c', 'n'): + chrom = transcript.tx_position.chrom + start = transcript.cdna_to_genomic_coord(self.cdna_start) + end = transcript.cdna_to_genomic_coord(self.cdna_end) + + if not transcript.tx_position.is_forward_strand: + start, end = end, start + + if start > end: + raise AssertionError( + "cdna_start cannot be greater than cdna_end") + elif self.kind in ('g', 'm'): + chrom = self.chrom + start = self.start + end = self.end + else: + raise NotImplementedError( + 'Coordinates are not available for this kind of HGVS name "%s"' + % self.kind) + + # Check coordinate span is equal to reference bases + if self.ref_allele: + coordinate_span = end - start + 1 # Ref will always be >=1 base + ref_length = len(self.ref_allele) + if coordinate_span != ref_length: + raise InvalidHGVSName("Coordinate span (%d) not equal to ref length %d" % (coordinate_span, ref_length)) + + return chrom, start, end + + def get_ref_coords(self, transcript=None): + """Return genomic coordinates of reference allele.""" + + chrom, start, end = self.get_raw_coords(transcript) + + if self.mutation_type == "ins": + # Inserts have empty interval. + if start < end: + start += 1 + end -= 1 + else: + end = start - 1 + + elif self.mutation_type == "dup": + end = start - 1 + return chrom, start, end + + def get_vcf_coords(self, transcript=None): + """Return genomic coordinates of reference allele in VCF-style.""" + chrom, start, end = self.get_ref_coords(transcript) + + # Inserts and deletes require left-padding by 1 base + if self.mutation_type in ("=", ">"): + pass + elif self.mutation_type in ("del", "ins", "dup", "delins"): + # Indels have left-padding. + start -= 1 + else: + raise NotImplementedError("Unknown mutation_type '%s'" % + self.mutation_type) + return chrom, start, end + + def get_ref_alt(self, is_forward_strand=True, raw_dup_alleles=False): + """ Return reference and alternate alleles. + Original code was for representation - ie it altered dup to look like an insert + pass raw_dup_alleles=True to get the raw values """ + if self.kind == 'p': + raise NotImplementedError( + 'get_ref_alt is not implemented for protein HGVS names') + alleles = [self.ref_allele, self.alt_allele] + + # Represent duplications are inserts. + if not raw_dup_alleles and self.mutation_type == "dup": + alleles[0] = "" + alleles[1] = alleles[1][:len(alleles[1]) // 2] + + if is_forward_strand: + return alleles + else: + return tuple(map(revcomp, alleles)) diff --git a/pyhgvs/models/transcript.py b/pyhgvs/models/transcript.py new file mode 100644 index 0000000..7d93233 --- /dev/null +++ b/pyhgvs/models/transcript.py @@ -0,0 +1,372 @@ +""" +Models for representing genomic elements. +""" + +from __future__ import unicode_literals + +from collections import namedtuple + +from lazy import lazy + +from pyhgvs.models.cdna import CDNA_START_CODON, CDNA_STOP_CODON, CDNACoord + + +class Gene(object): + def __init__(self, name): + self.name = name + + +class Transcript(object): + """ + A gene may have multiple transcripts with different combinations of exons. + + We need both exons and cdna_match as need to know exact exon boundaries to work out flanking + """ + + def __init__(self, name, version, gene, tx_position, cds_position, + is_default=False, cdna_match=None, + start_codon_transcript_pos=None, stop_codon_transcript_pos=None): + self.name = name + self.version = version + self.gene = Gene(gene) + self.tx_position = tx_position + self.cds_position = cds_position + self.is_default = is_default + self.cdna_match = cdna_match or [] + # Optional pre-calculated transcript coordinates + self._start_codon_transcript_pos = start_codon_transcript_pos + self._stop_codon_transcript_pos = stop_codon_transcript_pos + + @property + def full_name(self): + if self.version is not None: + return '%s.%d' % (self.name, self.version) + else: + return self.name + + @property + def is_coding(self): + # Coding transcripts have CDS with non-zero length. + return (self.cds_position.chrom_stop - + self.cds_position.chrom_start > 0) + + @property + def strand(self): + return ('+' if self.tx_position.is_forward_strand else '-') + + @lazy + def ordered_cdna_match(self): + """ Coding order """ + transcript_strand = self.tx_position.is_forward_strand + cdna_match = list(self.cdna_match) + cdna_match.sort(key=lambda cm: cm.tx_position.chrom_start) + if not transcript_strand: + cdna_match.reverse() + return cdna_match + + def get_cds_start_stop(self): + (start, stop) = (self.cds_position.chrom_start, self.cds_position.chrom_stop) + if not self.tx_position.is_forward_strand: + (stop, start) = (start, stop) + return start, stop + + @lazy + def start_codon(self): + """ Transcript position """ + + if self._start_codon_transcript_pos is not None: + return self._start_codon_transcript_pos + + start_genomic_coordinate, _ = self.get_cds_start_stop() + return self._get_transcript_position(start_genomic_coordinate) + + @lazy + def stop_codon(self): + """ Transcript position """ + if self._stop_codon_transcript_pos is not None: + return self._stop_codon_transcript_pos + + _, stop_genomic_coordinate = self.get_cds_start_stop() + return self._get_transcript_position(stop_genomic_coordinate) + + def _get_transcript_position(self, genomic_coordinate, label=None): + transcript_strand = self.tx_position.is_forward_strand + + cdna_offset = 0 + for cdna_match in self.ordered_cdna_match: + start = cdna_match.tx_position.chrom_start + stop = cdna_match.tx_position.chrom_stop + + if start <= genomic_coordinate <= stop: + # We're inside this match + if transcript_strand: + position = genomic_coordinate - start + else: + position = stop - genomic_coordinate + return cdna_offset + position + cdna_match.get_offset(position) + else: + cdna_offset += cdna_match.length + if label is None: + label = "Genomic coordinate: %d" % genomic_coordinate + raise ValueError('%s is not in any of the exons' % label) + + def cdna_to_genomic_coord(self, coord): + """Convert a HGVS cDNA coordinate to a genomic coordinate.""" + transcript_strand = self.tx_position.is_forward_strand + + # compute starting position along spliced transcript. + if coord.landmark == CDNA_START_CODON: + utr5p = (self.start_codon if self.is_coding else 0) + + if coord.coord > 0: + cdna_pos = utr5p + coord.coord + else: + cdna_pos = utr5p + coord.coord + 1 + elif coord.landmark == CDNA_STOP_CODON: + if coord.coord < 0: + raise ValueError('CDNACoord cannot have a negative coord and ' + 'landmark CDNA_STOP_CODON') + cdna_pos = self.stop_codon + coord.coord + else: + raise ValueError('unknown CDNACoord landmark "%s"' % coord.landmark) + + # 5' flanking sequence (no need to account for gaps) + if cdna_pos < 1: + if transcript_strand: + return self.tx_position.chrom_start + cdna_pos + else: + return self.tx_position.chrom_stop - cdna_pos + 1 + + # Walk along transcript until we find an exon that contains cdna_pos. + for cdna_match in self.ordered_cdna_match: + if cdna_match.cdna_start <= cdna_pos <= cdna_match.cdna_end: + match_pos = cdna_pos - cdna_match.cdna_start + match_pos -= cdna_match.get_offset(match_pos) + # Compute genomic coordinate using offset. + if transcript_strand: + # Plus strand. + start = cdna_match.tx_position.chrom_start + 1 + return start + match_pos + coord.offset + else: + # Minus strand. + end = cdna_match.tx_position.chrom_stop + return end - match_pos - coord.offset + else: + # 3' flanking sequence (no need to account for gaps) + if transcript_strand: + return self.cds_position.chrom_stop + coord.coord + else: + return self.cds_position.chrom_start + 1 - coord.coord + + def genomic_to_cdna_coord(self, genomic_coord): + """ Convert a genomic coordinate to a cDNA coordinate and offset. """ + exons = [exon.get_as_interval() + for exon in self.ordered_cdna_match] + + if len(exons) == 0: + return None + + strand = self.strand + distances = [exon.distance(genomic_coord) + for exon in exons] + min_distance_to_exon = min(map(abs, distances)) + if min_distance_to_exon: + # We're outside of exon - so need to find closest point + for exon in exons: + distance = exon.distance(genomic_coord) + if abs(distance) == min_distance_to_exon: + + # Outside the exon. + if distance > 0: + genomic_nearest_exon = exon.chrom_start + 1 + else: + genomic_nearest_exon = exon.chrom_end + + if strand == "+": + distance *= -1 + + nearest_exon_coord = self._exon_genomic_to_cdna_coord(genomic_nearest_exon) + + # If outside transcript, don't use offset. + if (genomic_coord < self.tx_position.chrom_start + 1 or + genomic_coord > self.tx_position.chrom_stop): + nearest_exon_coord += distance + distance = 0 + cdna_coord = CDNACoord(nearest_exon_coord, distance) + break + else: + raise ValueError("Could not find closest exon!") # Should never happen + else: + coord = self._exon_genomic_to_cdna_coord(genomic_coord) + cdna_coord = CDNACoord(coord, 0) + + # Adjust coordinates for coding transcript. + if self.is_coding: + # Detect if position before start codon. + utr5p = self.start_codon + cdna_coord.coord -= utr5p + if cdna_coord.coord <= 0: + cdna_coord.coord -= 1 + else: + # Detect if position is after stop_codon. + stop_codon = self.stop_codon + stop_codon -= utr5p + if (cdna_coord.coord > stop_codon or + cdna_coord.coord == stop_codon and cdna_coord.offset > 0): + cdna_coord.coord -= stop_codon + cdna_coord.landmark = CDNA_STOP_CODON + + return cdna_coord + + def _exon_genomic_to_cdna_coord(self, genomic_coord): + cdna_offset = 0 + for cdna_match in self.ordered_cdna_match: + # Inside the exon. + if cdna_match.tx_position.chrom_start <= genomic_coord <= cdna_match.tx_position.chrom_stop: + if self.strand == "+": + position = genomic_coord - (cdna_match.tx_position.chrom_start + 1) + else: + position = cdna_match.tx_position.chrom_stop - genomic_coord + offset = cdna_match.get_offset(position, validate=False) + return cdna_offset + position + offset + 1 + cdna_offset += cdna_match.length + + raise ValueError(f"Couldn't find {genomic_coord=}!") + + +BED6Interval_base = namedtuple( + "BED6Interval_base", ( + "chrom", + "chrom_start", + "chrom_end", + "name", + "score", + "strand")) + + +class BED6Interval(BED6Interval_base): + def distance(self, offset): + """Return the distance to the interval. + + if offset is inside the exon, distance is zero. + otherwise, distance is the distance to the nearest edge. + + distance is positive if the exon comes after the offset. + distance is negative if the exon comes before the offset. + """ + + start = self.chrom_start + 1 + end = self.chrom_end + + if start <= offset <= end: + return 0 + + start_distance = start - offset + end_distance = offset - end + + if abs(start_distance) < abs(end_distance): + return start_distance + else: + return -end_distance + + +class Exon(object): + """ We still need exons to work out the flanking boundaries """ + def __init__(self, transcript, tx_position, number): + self.transcript = transcript + self.tx_position = tx_position + self.number = number + + @property + def name(self): + return "%s.%d" % (self.transcript.name, self.number) + + def get_as_interval(self, coding_only=False): + """Returns the coding region for this exon as a BED6Interval. + + This function returns a BED6Interval objects containing position + information for this exon. This may be used as input for + pybedtools.create_interval_from_list() after casting chrom_start + and chrom_end as strings. + + coding_only: only include exons in the coding region + + """ + + exon_start = self.tx_position.chrom_start + exon_stop = self.tx_position.chrom_stop + + # Get only exon coding region if requested + if coding_only: + if (exon_stop <= self.transcript.cds_position.chrom_start or + exon_start >= self.transcript.cds_position.chrom_stop): + return None + exon_start = max(exon_start, + self.transcript.cds_position.chrom_start) + exon_stop = min( + max(exon_stop, self.transcript.cds_position.chrom_start), + self.transcript.cds_position.chrom_stop) + + return BED6Interval( + self.tx_position.chrom, + exon_start, + exon_stop, + self.name, + '.', + self.strand, + ) + + @property + def strand(self): + strand = '+' if self.tx_position.is_forward_strand else '-' + return strand + + +class CDNA_Match(Exon): + def __init__(self, transcript, tx_position, cdna_start, cdna_end, gap, number): + super(CDNA_Match, self).__init__(transcript, tx_position, number) + self.cdna_start = cdna_start + self.cdna_end = cdna_end + self.gap = gap + + @property + def length(self): + return self.cdna_end - self.cdna_start + 1 + + def get_offset(self, position: int, validate=True): + """ cdna_match GAP attribute looks like: 'M185 I3 M250' which is code/length + @see https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#the-gap-attribute + codes operation + M match + I insert a gap into the reference sequence + D insert a gap into the target (delete from reference) + + If you want the whole exon, then pass the end + """ + + if not self.gap: + return 0 + + position_1_based = position + 1 + cdna_match_index = 1 + offset = 0 + for gap_op in self.gap.split(): + code = gap_op[0] + length = int(gap_op[1:]) + if code == "M": + cdna_match_index += length + elif code == "I": + if validate and position_1_based < cdna_match_index + length: + raise ValueError("Coordinate (%d) inside insertion (%s) - no mapping possible!" % (position_1_based, gap_op)) + offset += length + elif code == "D": + if validate and position < cdna_match_index + length: + raise ValueError("Coordinate (%d) inside deletion (%s) - no mapping possible!" % (position_1_based, gap_op)) + offset -= length + else: + raise ValueError(f"Unknown code in cDNA GAP: {gap_op}") + + if cdna_match_index > position_1_based: + break + + return offset diff --git a/pyhgvs/variants.py b/pyhgvs/models/variants.py similarity index 90% rename from pyhgvs/variants.py rename to pyhgvs/models/variants.py index fca0cdc..aced9fc 100644 --- a/pyhgvs/variants.py +++ b/pyhgvs/models/variants.py @@ -5,13 +5,24 @@ from __future__ import absolute_import from __future__ import unicode_literals -from .models import Position - - _COMP = dict(A='T', C='G', G='C', T='A', N='N', a='t', c='g', g='c', t='a', n='n') +class Position(object): + """A position in the genome.""" + + def __init__(self, chrom, chrom_start, chrom_stop, is_forward_strand): + self.chrom = chrom + self.chrom_start = chrom_start + self.chrom_stop = chrom_stop + self.is_forward_strand = is_forward_strand + + def __repr__(self): + return "" % ( + self.chrom, self.chrom_start, self.chrom_stop) + + def revcomp(seq): """Reverse complement.""" return ''.join(_COMP[base] for base in reversed(seq)) @@ -115,7 +126,7 @@ def justify_genomic_indel(genome, chrom, start, end, indel, justify, def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome, - justify='left', flank_length=30): + justify='left', flank_length=30, indels_start_with_same_base=True): """ Normalize variant according to the GATK/VCF standard. @@ -133,7 +144,7 @@ def normalize_variant(chrom, offset, ref_sequence, alt_sequences, genome, chrom_stop=end, is_forward_strand=True) return NormalizedVariant(position, ref_sequence, alt_sequences, - genome=genome, justify=justify) + genome=genome, justify=justify, indels_start_with_same_base=indels_start_with_same_base) class NormalizedVariant(object): @@ -142,7 +153,8 @@ class NormalizedVariant(object): """ def __init__(self, position, ref_allele, alt_alleles, - seq_5p='', seq_3p='', genome=None, justify='left'): + seq_5p='', seq_3p='', genome=None, justify='left', + indels_start_with_same_base=True): """ position: a 0-index genomic Position. ref_allele: the reference allele sequence. @@ -150,6 +162,9 @@ def __init__(self, position, ref_allele, alt_alleles, seq_5p: 5 prime flanking sequence of variant. seq_3p: 3 prime flanking sequence of variant. genome: a pygr compatible genome object (optional). + + indels_start_with_same_base: DML - I have no idea why this is required + but am keeping for backwards compat """ self.position = position self.alleles = [ref_allele] + list(alt_alleles) @@ -157,6 +172,7 @@ def __init__(self, position, ref_allele, alt_alleles, self.seq_3p = seq_3p self.genome = genome self.log = [] + self.indels_start_with_same_base = indels_start_with_same_base self._on_forward_strand() self._trim_common_prefix() @@ -203,7 +219,7 @@ def _trim_common_suffix(self): """ minlength = min(map(len, self.alleles)) common_suffix = 0 - for i in range(1, minlength+1): + for i in range(1, minlength + 1): if len(set(allele[-i] for allele in self.alleles)) > 1: # Not all alleles match at this site, so common suffix ends. break @@ -275,7 +291,7 @@ def _1bp_pad(self): # Pad sequences with one 5-prime base before the mutation event. empty_seq = any(not allele for allele in self.alleles) uniq_starts = set(allele[0] for allele in self.alleles if allele) - if empty_seq or len(uniq_starts) > 1: + if empty_seq or (self.indels_start_with_same_base and len(uniq_starts) > 1): # Fetch more 5p flanking sequence if needed. if self.genome and self.seq_5p == '': start = self.position.chrom_start @@ -299,9 +315,10 @@ def _1bp_pad(self): self.seq_3p = self.seq_3p[1:] self.position.chrom_stop += 1 - if len(set(a[0] for a in self.alleles)) != 1: - raise AssertionError( - "All INDEL alleles should start with same base.") + if self.indels_start_with_same_base: + if len(set(a[0] for a in self.alleles)) != 1: + raise AssertionError( + "All INDEL alleles should start with same base.") def _set_1based_position(self): """ diff --git a/pyhgvs/tests/data/test_name_to_variant.genome b/pyhgvs/tests/data/test_name_to_variant.genome index 90eb59e..a36abfb 100644 --- a/pyhgvs/tests/data/test_name_to_variant.genome +++ b/pyhgvs/tests/data/test_name_to_variant.genome @@ -61,6 +61,7 @@ chr1 76205650 76205691 TTTATATATTCAAGGCTTATTGTGTAACAGAACCTGGAGCA chr1 76205639 76205669 TTTCTCTTGTTTTTATATATTCAAGGCTTA chr1 76205670 76205700 TGTGTAACAGAACCTGGAGCAGGCTCTGAT chr1 76227048 76227049 C +chr1 76227049 76227050 T chr1 76227029 76227070 TAATGAGGGATGCCAAAATCTATCAGGTAAGGTTAAAGATG chr1 76227019 76227049 GTAGAAAAACTAATGAGGGATGCCAAAATC chr1 76227049 76227079 TATCAGGTAAGGTTAAAGATGATTTTTTTG @@ -84,7 +85,13 @@ chr7 117176661 117176664 TAT chr7 117176642 117176684 CCTCAGAAATGATTGAAAATATCCAATCTGTTAAGGCATACT chr7 117176630 117176660 GACTTGTGATTACCTCAGAAATGATTGAAA chr7 117176662 117176692 ATCCAATCTGTTAAGGCATACTGCTGGGAA +chr7 117182084 117182126 CAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAA +chr7 117182103 117182104 A +chr7 117182103 117182106 AAT +chr7 117182106 117182109 ATA chr7 117182106 117182107 A +chr7 117182107 117182109 TA +chr7 117182077 117182107 TACAAAAGCAAGAATATAAGACATTGGAATA chr7 117182087 117182129 GAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTA chr7 117182074 117182104 CTTACAAAAGCAAGAATATAAGACATTGGA chr7 117182104 117182134 ATATAACTTAACGACTACAGAAGTAGTGAT @@ -98,7 +105,11 @@ chr17 41245340 41245341 T chr17 41245340 41245341 T chr7 117307164 117307165 A chr11 17496507 17496508 T +chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA +chr17 7126009 7126051 CATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGT chr17 7126027 7126037 AGCAGAGGTG +chr17 7126028 7126029 A +chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG chr1 76216233 76216235 AA chr1 76199213 76199220 AGGTCTT chr1 76199194 76199240 AACATTTTGATACTGTAGGAGGTCTTGGACTTGGAACTTTTGATGC diff --git a/pyhgvs/tests/data/test_variant_to_name.genome b/pyhgvs/tests/data/test_variant_to_name.genome index f310c9c..1f0b257 100644 --- a/pyhgvs/tests/data/test_variant_to_name.genome +++ b/pyhgvs/tests/data/test_variant_to_name.genome @@ -66,3 +66,9 @@ chr7 117188682 117188712 tTTTTTTAACAGGGATTTGGGGAATTATTT chr7 117188582 117188783 CCATGTGCTTTTCAAACTAATTGTACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGAtgtgtgtgtgtgtgtgtgtgtgtTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGTAATTT chr7 117188687 117188689 TT chr7 117188689 117188691 AA +chr17 7125928 7126129 TCCCCCAGTGACAACCTGTTGAACACACCTCTGCTTTCCCACACTGCCCTGACACAGTGGGCCCCCTGAGAAGAAGATGGGCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAGTACGGGTGCCATCGGAGAACGTGCTGGGTGAGGTTGGGAGTGGCTTCAAGGTTGCCATGCACATCCTCAACAATGGAAG +chr17 7126008 7126050 GCATCAAGGCTTCAAACACAGCAGAGGTGTTCTTTGATGGAG +chr17 7126027 7126029 AG +chr17 7126028 7126058 GCAGAGGTGTTCTTTGATGGAGTACGGGTG +chr17 7126029 7126031 CA +chr17 7125998 7126028 AAGAAGATGGGCATCAAGGCTTCAAACACA \ No newline at end of file diff --git a/pyhgvs/tests/genome.py b/pyhgvs/tests/genome.py index 5c53da6..ecc35ae 100644 --- a/pyhgvs/tests/genome.py +++ b/pyhgvs/tests/genome.py @@ -4,7 +4,7 @@ import itertools import os -from ..variants import revcomp +from ..models.variants import revcomp try: @@ -77,9 +77,11 @@ def __init__(self, lookup=None, filename=None, db_filename=None, if SequenceFileDB is None: raise ValueError('pygr is not available.') self._genome = SequenceFileDB(db_filename) + self._source_filename = db_filename elif filename: # Read genome sequence from lookup table. self.read(filename) + self._source_filename = filename def __contains__(self, chrom): """Return True if genome contains chromosome.""" @@ -113,8 +115,8 @@ def get_seq(self, chrom, start, end): None, end - start)) else: raise MockGenomeError( - 'Sequence not in test data: %s:%d-%d' % - (chrom, start, end)) + 'Sequence not in test data: %s:%d-%d source: %s' % + (chrom, start, end, self._source_filename)) def read(self, filename): """Read a sequence lookup table from a file. diff --git a/pyhgvs/tests/test_cdna_match.py b/pyhgvs/tests/test_cdna_match.py new file mode 100644 index 0000000..bbaf17f --- /dev/null +++ b/pyhgvs/tests/test_cdna_match.py @@ -0,0 +1,161 @@ +""" + RefSeq transcripts don't always match the genome reference, so can align with gaps. +""" + +import nose +from pyhgvs import HGVSName + +from pyhgvs.utils import make_transcript + +_HGVS_NM_015120_GRCh37_COORDS = [ + # Has Gap=M185 I3 M250 5' UTR length is 111 so last match is c.74 then 3 bases insert + ("NM_015120.4:c.74A>T", 73613070), + ("NM_015120.4:c.75G>T", None), # No consistent alignment + ("NM_015120.4:c.76G>T", None), # No consistent alignment + ("NM_015120.4:c.77A>T", None), # No consistent alignment + ("NM_015120.4:c.78A>T", 73613071), + ("NM_015120.4:c.79G>C", 73613072), +] + +_HGVS_NM_001135649_GRCh37_COORDS = [ + # Has Gap=M460 I1 M337 5' UTR length is 158 so last match is c.302 then 1 base insert + ("NM_001135649.3:c.300C>T", 88751754), + ("NM_001135649.3:c.301A>C", 88751753), + ("NM_001135649.3:c.302C>T", 88751752), + ("NM_001135649.3:c.303C>T", None), # N/A GRCh37 - 1 bp insert + ("NM_001135649.3:c.304T>C", 88751751), + ("NM_001135649.3:c.305T>C", 88751750), +] + +_HGVS_NM_001012755_GRCh37_COORDS = [ + # 133, 6207, "M1574 I1 M4500 - 5'UTR length is 163 so last match is transcript pos (1574+132) ie c.1543 then 1bp ins + ("NM_001012755.5:c.*619G>C", 103348398), # NM_001012755.5:c.1543G>C + ("NM_001012755.5:c.*620G>C", None), # NM_001012755.5:c.1544G>C - No GRCh37 + ("NM_001012755.5:c.*621C>G", 103348397), # NM_001012755.5:c.1545G>C +] + + +def test_cdna_to_genomic_coord(): + transcript = get_transcript("NM_015120.4") + for hgvs_str, expected_genomic_coord in _HGVS_NM_015120_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if expected_genomic_coord: + genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) + nose.tools.assert_equal(genomic_coord, expected_genomic_coord) + else: + nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) + + +def test_genomic_to_cdna_coord(): + transcript = get_transcript("NM_015120.4") + for hgvs_str, genomic_coord in _HGVS_NM_015120_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if genomic_coord: + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord) + nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) + + +def test_cdna_to_genomic_coord_negative_strand(): + transcript = get_transcript("NM_001135649.3") + for hgvs_str, expected_genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if expected_genomic_coord: + genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) + nose.tools.assert_equal(genomic_coord, expected_genomic_coord) + else: + nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) + + +def test_cdna_to_genomic_coord_negative_strand2(): + transcript = get_transcript("NM_001012755.5") + for hgvs_str, expected_genomic_coord in _HGVS_NM_001012755_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if expected_genomic_coord: + genomic_coord = transcript.cdna_to_genomic_coord(hgvs_name.cdna_start) + nose.tools.assert_equal(genomic_coord, expected_genomic_coord) + else: + nose.tools.assert_raises(ValueError, transcript.cdna_to_genomic_coord, hgvs_name.cdna_start) + + +def test_genomic_to_cdna_coord_negative_strand(): + transcript = get_transcript("NM_001135649.3") + for hgvs_str, genomic_coord in _HGVS_NM_001135649_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if genomic_coord: + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord) + nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) + + +def test_genomic_to_cdna_coord_negative_strand2(): + transcript = get_transcript("NM_001012755.5") + + for hgvs_str, genomic_coord in _HGVS_NM_001012755_GRCh37_COORDS: + hgvs_name = HGVSName(hgvs_str) + if genomic_coord: + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord) + nose.tools.assert_equal(cdna_coord, hgvs_name.cdna_start) + + +def get_transcript(accession): + transcript_json = _transcripts[accession] + return make_transcript(transcript_json) + + +_transcripts = { + # GRCh37 + "NM_015120.4": { + "id": "NM_015120.4", + "gene_name": "ALMS1", + "end": 73837046, + "chrom": "NC_000002.11", + "exons": [[73612885, 73613320], [73635749, 73635875], [73646250, 73646446], [73649984, 73650102], + [73651557, 73652030], [73653580, 73653681], [73659325, 73659419], [73675089, 73681194], + [73682288, 73682422], [73716760, 73718625], [73746901, 73747143], [73761950, 73762076], + [73777393, 73777564], [73784346, 73784481], [73786098, 73786269], [73799388, 73800551], + [73826527, 73826648], [73827804, 73828008], [73828321, 73828563], [73829311, 73829495], + [73830367, 73830431], [73835601, 73835701], [73836694, 73837046]], + "start": 73612885, + "strand": "+", + "cds_end": 73836739, + "cds_start": 73612996, + "cdna_match": [[73612885, 73613320, 1, 438, "M185 I3 M250"], [73635749, 73635875, 439, 564, None], + [73646250, 73646446, 565, 760, None], [73649984, 73650102, 761, 878, None], + [73651557, 73652030, 879, 1351, None], [73653580, 73653681, 1352, 1452, None], + [73659325, 73659419, 1453, 1546, None], [73675089, 73681194, 1547, 7654, "M141 I3 M5964"], + [73682288, 73682422, 7655, 7788, None], [73716760, 73718625, 7789, 9653, None], + [73746901, 73747143, 9654, 9895, None], [73761950, 73762076, 9896, 10021, None], + [73777393, 73777564, 10022, 10192, None], [73784346, 73784481, 10193, 10327, None], + [73786098, 73786269, 10328, 10498, None], [73799388, 73800551, 10499, 11661, None], + [73826527, 73826648, 11662, 11782, None], [73827804, 73828008, 11783, 11986, None], + [73828321, 73828563, 11987, 12228, None], [73829311, 73829495, 12229, 12412, None], + [73830367, 73830431, 12413, 12476, None], [73835601, 73835701, 12477, 12576, None], + [73836694, 73837046, 12577, 12928, None]] + }, + "NM_001135649.3": { + # This is a strange case - there are 3 exons, but 2 cDNA match entries + # I think the exons are in error, as eg where is the splice site? + # So - we'll just use the cDNA as exons + "id": "NM_001135649.3", + "gene_name": "FOXI3", + "end": 88752211, + "chrom": "NC_000002.11", + "exons": [[88746305, 88748348], [88751414, 88751751], [88751753, 88752211]], + "start": 88746305, + "strand": "-", + "cds_end": 88752053, + "cds_start": 88747725, + "cdna_match": [[88746305, 88748348, 799, 2841, None], [88751414, 88752211, 1, 798, "M460 I1 M337"]] + }, + "NM_001012755.5": { + "id": "NM_001012755.5", + "gene_name": "SLC25A53", + "end": 103401690, + "chrom": "NC_000023.10", + "exons": [[103343897, 103349971], [103401558, 103401690]], + "start": 103343897, + "strand": "-", + "cds_end": 103349940, + "cds_start": 103349016, + "cdna_match": [[103343897, 103349971, 133, 6207, "M1574 I1 M4500"], [103401558, 103401690, 1, 132, None]] + } +} diff --git a/pyhgvs/tests/test_hgvs_names.py b/pyhgvs/tests/test_hgvs_names.py index be294e0..579e2a6 100644 --- a/pyhgvs/tests/test_hgvs_names.py +++ b/pyhgvs/tests/test_hgvs_names.py @@ -9,13 +9,9 @@ except ImportError: SequenceFileDB = None -from .. import CDNACoord -from .. import CDNA_STOP_CODON -from .. import HGVSName -from .. import InvalidHGVSName -from .. import cdna_to_genomic_coord -from .. import format_hgvs_name -from .. import genomic_to_cdna_coord +from ..models.cdna import CDNACoord, CDNA_STOP_CODON +from ..models.hgvs_name import HGVSName, InvalidHGVSName +from .. import format_hgvs_name, matches_ref_allele from .. import parse_hgvs_name from ..utils import read_transcripts from .genome import MockGenomeTestFile @@ -50,7 +46,7 @@ def test_genomic_to_cdna_coord(): """ for transcript_name, genomic_coord, cdna_coord_expected in _convert_coords: transcript = get_transcript(transcript_name) - cdna_coord = genomic_to_cdna_coord(transcript, genomic_coord[1]) + cdna_coord = transcript.genomic_to_cdna_coord(genomic_coord[1]) nose.tools.assert_equal( cdna_coord, cdna_coord_expected, repr((cdna_coord, cdna_coord_expected, @@ -63,7 +59,7 @@ def test_cdna_to_genomic_coord(): """ for transcript_name, genomic_coord_expected, cdna_coord in _convert_coords: transcript = get_transcript(transcript_name) - genomic_coord = cdna_to_genomic_coord(transcript, cdna_coord) + genomic_coord = transcript.cdna_to_genomic_coord(cdna_coord) nose.tools.assert_equal( genomic_coord, genomic_coord_expected[1], repr((genomic_coord, genomic_coord_expected[1], @@ -123,13 +119,22 @@ def test_variant_to_name(): for (expected_hgvs_name, variant, name_canonical, var_canonical) in _name_variants: if name_canonical: - transcript_name = HGVSName(expected_hgvs_name).transcript - transcript = get_transcript(transcript_name) - assert transcript, transcript_name + hgvs = HGVSName(expected_hgvs_name) + transcript_name = hgvs.transcript + transcript = None + if transcript_name: + transcript = get_transcript(transcript_name) + assert transcript, transcript_name chrom, offset, ref, alt = variant hgvs_name = format_hgvs_name( chrom, offset, ref, alt, genome, transcript, use_gene=False) + + if hgvs.kind == 'g': + # Strip off g.HGVS prefix + i = expected_hgvs_name.find(":g.") + expected_hgvs_name = expected_hgvs_name[i+1:] + nose.tools.assert_equal( hgvs_name, expected_hgvs_name, repr([hgvs_name, expected_hgvs_name, variant])) @@ -192,6 +197,31 @@ def test_invalid_coordinates(): parse_hgvs_name(hgvs_name, genome, get_transcript=get_transcript) +def test_matches_ref(): + genome = MockGenomeTestFile( + db_filename='hg19.fa', + filename='pyhgvs/tests/data/test_hgvs.genome', + create_data=False) + # genome = SequenceFileDB('pyhgvs/tests/data/test_hgvs.genome') + transcript = get_transcript("NM_000016.4") + + # The base at this position is "T" - used to always work w/dups + expected = [ + ("NM_000016.4:c.1189dupT", True), + ("NM_000016.4:c.1189T>TT", True), # same as dup by both alleles provided + ("NM_000016.4:c.1189dupA", False), + ("NM_000016.4:c.1189A>AA", False), + ("NM_000016.4:c.1189delT", True), + ("NM_000016.4:c.1189delC", False), + ("NM_000016.4:c.1189delCinsT", False), + ("NM_000016.4:c.1189delTinsC", True), + ] + for hgvs_string, expected_result in expected: + hgvs_name = HGVSName(hgvs_string) + matches_ref = matches_ref_allele(hgvs_name, genome, transcript) + nose.tools.assert_equal(matches_ref, expected_result, hgvs_string + " matches reference base") + + # Test examples of cDNA coordinates. _parse_cdna_coords = [ ('1001', CDNACoord(1001)), @@ -206,7 +236,7 @@ def test_invalid_coordinates(): ] -# Test examples of coverting coordinates. +# Test examples of converting coordinates. _convert_coords = [ # Positions near start codon. ('NM_000016.4', ('chr1', 76190473), CDNACoord(1)), @@ -353,6 +383,18 @@ def test_invalid_coordinates(): 'mutation_type': '=', }), + # cDNA no change, no provided base + ('BRCA1:c.101=', True, + { + 'gene': 'BRCA1', + 'kind': 'c', + 'cdna_start': CDNACoord(101), + 'cdna_end': CDNACoord(101), + 'ref_allele': '', + 'alt_allele': '', + 'mutation_type': '=', + }), + # cDNA 1bp mutations. ('BRCA1:c.101A>C', True, { @@ -537,6 +579,18 @@ def test_invalid_coordinates(): 'mutation_type': '=', }), + # Genomic no change + no ref base + ('BRCA1:g.101=', True, + { + 'gene': 'BRCA1', + 'kind': 'g', + 'start': 101, + 'end': 101, + 'ref_allele': '', + 'alt_allele': '', + 'mutation_type': '=', + }), + # Genomic 1bp mutations. ('BRCA1:g.101A>C', True, { @@ -652,6 +706,11 @@ def test_invalid_coordinates(): 'alt_allele': 'TA', 'mutation_type': 'delins', }), + # + ('NC_000010.10:g.89623915_89623920=', False, + { + 'chrom': 'NC_000010.10', + }), ] @@ -723,7 +782,11 @@ def test_invalid_coordinates(): ('NM_000016.4:c.387+1delG', ('chr1', 76199309, 'TG', 'T'), True, True), ('NM_000016.4:c.475delT', ('chr1', 76205669, 'AT', 'A'), True, True), ('NM_000016.4:c.1189dupT', ('chr1', 76227049, 'C', 'CT'), True, True), + # Same as above but without optional trailing base + ('NM_000016.4:c.1189dup', ('chr1', 76227049, 'C', 'CT'), False, True), ('NM_000016.4:c.1191delT', ('chr1', 76227051, 'AT', 'A'), True, True), + # Same as above but without optional trailing base + ('NM_000016.4:c.1191del', ('chr1', 76227051, 'AT', 'A'), False, True), ('NM_000016.4:c.306_307insG', ('chr1', 76199232, 'T', 'TG'), True, True), # Alignment tests for HGVS 3' and VCF left-alignment. @@ -734,6 +797,9 @@ def test_invalid_coordinates(): True, True), ('NM_000492.3:c.1155_1156dupTA', ('chr7', 117182104, 'A', 'AAT'), True, True), + # Same as above but without optional trailing base + ('NM_000492.3:c.1155_1156dup', ('chr7', 117182104, 'A', 'AAT'), + False, True), ('NM_000492.3:c.3889dupT', ('chr7', 117292905, 'A', 'AT'), True, True), # Transcript prefix. @@ -750,6 +816,9 @@ def test_invalid_coordinates(): ('chr11:g.17496508T>C', ('chr11', 17496508, 'T', 'C'), False, True), # Genomic indels. + ('chr17:g.7126029_7126030insTG', + ('chr17', 7126028, 'A', 'AGT'), True, True), + ('chr17:g.7126029_7126037delGCAGAGGTGinsTCAAAGCAC', ('chr17', 7126028, 'AGCAGAGGTG', 'ATCAAAGCAC'), False, True), @@ -757,6 +826,13 @@ def test_invalid_coordinates(): ('chr1:g.76216235delAinsGC', ('chr1', 76216234, 'AA', 'AGC'), False, True), + # Genomic dup + ('chr7:g.117182108_117182109dup', ('chr7', 117182104, 'A', 'AAT'), + False, True), + + ('chr7:g.117182105_117182106dupAT', ('chr7', 117182104, 'A', 'AAT'), + False, True), + # Genomic delete region. ('chr1:g.76199215_76199220delGGTCTT', ('chr1', 76199214, 'AGGTCTT', 'A'), False, True), diff --git a/pyhgvs/tests/test_hgvs_names_long.py b/pyhgvs/tests/test_hgvs_names_long.py index 9a019a7..2bfdfec 100644 --- a/pyhgvs/tests/test_hgvs_names_long.py +++ b/pyhgvs/tests/test_hgvs_names_long.py @@ -2,7 +2,7 @@ from .. import parse_hgvs_name from ..utils import read_transcripts -from ..variants import normalize_variant +from ..models.variants import normalize_variant from .genome import MockGenomeTestFile diff --git a/pyhgvs/tests/test_variants.py b/pyhgvs/tests/test_variants.py index de37a45..c4b9b35 100644 --- a/pyhgvs/tests/test_variants.py +++ b/pyhgvs/tests/test_variants.py @@ -3,7 +3,7 @@ from unittest import TestCase -from ..variants import normalize_variant +from ..models.variants import normalize_variant from .genome import MockGenomeTestFile diff --git a/pyhgvs/utils.py b/pyhgvs/utils.py index 83e239e..3fcc320 100644 --- a/pyhgvs/utils.py +++ b/pyhgvs/utils.py @@ -5,63 +5,58 @@ from __future__ import absolute_import from __future__ import unicode_literals -from .models import Exon -from .models import Position -from .models import Transcript +import operator +from .models.variants import Position +from .models.transcript import Transcript, CDNA_Match, Exon def read_refgene(infile): - """ - Iterate through a refGene file. + """ refGene = genePred with extra column at front (and ignored ones after) """ + return read_genepred(infile, skip_first_column=True) + +def read_genepred(infile, skip_first_column=False): + """ GenePred extension format: http://genome.ucsc.edu/FAQ/FAQformat.html#GenePredExt Column definitions: - 0. uint undocumented id - 1. string name; "Name of gene (usually transcript_id from GTF)" - 2. string chrom; "Chromosome name" - 3. char[1] strand; "+ or - for strand" - 4. uint txStart; "Transcription start position" - 5. uint txEnd; "Transcription end position" - 6. uint cdsStart; "Coding region start" - 7. uint cdsEnd; "Coding region end" - 8. uint exonCount; "Number of exons" - 9. uint[exonCount] exonStarts; "Exon start positions" - 10. uint[exonCount] exonEnds; "Exon end positions" - 11. uint id; "Unique identifier" - 12. string name2; "Alternate name (e.g. gene_id from GTF)" - 13. string cdsStartStat; "enum('none','unk','incmpl','cmpl')" - 14. string cdsEndStat; "enum('none','unk','incmpl','cmpl')" - 15. lstring exonFrames; "Exon frame offsets {0,1,2}" + 0. string name; "Name of gene (usually transcript_id from GTF)" + 1. string chrom; "Chromosome name" + 2. char[1] strand; "+ or - for strand" + 3. uint txStart; "Transcription start position" + 4. uint txEnd; "Transcription end position" + 5. uint cdsStart; "Coding region start" + 6. uint cdsEnd; "Coding region end" + 7. uint exonCount; "Number of exons" + 8. uint[exonCount] exonStarts; "Exon start positions" + 9. uint[exonCount] exonEnds; "Exon end positions" + 10. uint id; "Unique identifier" + 11. string name2; "Alternate name (e.g. gene_id from GTF)" """ for line in infile: # Skip comments. if line.startswith('#'): continue row = line.rstrip('\n').split('\t') - if len(row) != 16: - raise ValueError( - 'File has incorrect number of columns ' - 'in at least one line.') + if skip_first_column: + row = row[1:] # Skip trailing , - exon_starts = list(map(int, row[9].split(',')[:-1])) - exon_ends = list(map(int, row[10].split(',')[:-1])) - exon_frames = list(map(int, row[15].split(',')[:-1])) + exon_starts = list(map(int, row[8].split(',')[:-1])) + exon_ends = list(map(int, row[9].split(',')[:-1])) exons = list(zip(exon_starts, exon_ends)) yield { - 'chrom': row[2], - 'start': int(row[4]), - 'end': int(row[5]), - 'id': row[1], - 'strand': row[3], - 'cds_start': int(row[6]), - 'cds_end': int(row[7]), - 'gene_name': row[12], + 'chrom': row[1], + 'start': int(row[3]), + 'end': int(row[4]), + 'id': row[0], + 'strand': row[2], + 'cds_start': int(row[5]), + 'cds_end': int(row[6]), + 'gene_name': row[11], 'exons': exons, - 'exon_frames': exon_frames } @@ -89,25 +84,72 @@ def make_transcript(transcript_json): transcript_json['chrom'], transcript_json['cds_start'], transcript_json['cds_end'], - transcript_json['strand'] == '+')) + transcript_json['strand'] == '+'), + start_codon_transcript_pos=transcript_json.get("start_codon_transcript_pos"), + stop_codon_transcript_pos=transcript_json.get("stop_codon_transcript_pos"), + ) exons = transcript_json['exons'] + exons.sort(key=operator.itemgetter(0)) + cdna_match = transcript_json.get('cdna_match', []) + cdna_match.sort(key=operator.itemgetter(0)) + if not transcript.tx_position.is_forward_strand: - exons = reversed(exons) - - for exon_number, (exon_start, exon_end) in enumerate(exons, 1): - transcript.exons.append( - Exon(transcript=transcript, - tx_position=Position( - transcript_json['chrom'], - exon_start, - exon_end, - transcript_json['strand'] == '+'), - exon_number=exon_number)) + exons.reverse() + cdna_match.reverse() + + # We don't use exons, but run everything through cDNA match so there's just 1 path + # exons are treated as a perfect cDNA match + if not cdna_match: + cdna_match = json_perfect_exons_to_cdna_match(exons) + + for number, (exon_start, exon_end, cdna_start, cdna_end, gap) in enumerate(cdna_match, 1): + transcript.cdna_match.append(CDNA_Match(transcript=transcript, + tx_position=Position( + transcript_json['chrom'], + exon_start, + exon_end, + transcript_json['strand'] == '+'), + cdna_start=cdna_start, + cdna_end=cdna_end, + gap=gap, + number=number)) return transcript +def json_perfect_exons_to_cdna_match(ordered_exons, single=False): + """ Perfectly matched exons are basically a no-gap case of cDNA match + single - use a single cDNA match (deletions for introns) - this is currently broken do not use + """ + cdna_match = [] + if single: + ordered_exons = list(ordered_exons) + start = ordered_exons[0][0] + end = ordered_exons[-1][1] + last_exon_end = None + gap_list = [] + cdna_length = 0 + for (exon_start, exon_end) in ordered_exons: + # end up looking like "M D M D (M=exon, D=intron length)" + if last_exon_end: + intron_length = abs(exon_start - last_exon_end) + gap_list.append("D%d" % intron_length) + exon_length = exon_end - exon_start + cdna_length += exon_length + gap_list.append("M%d" % exon_length) + last_exon_end = exon_end + cdna_match = [[start, end, 1, cdna_length, " ".join(gap_list)]] + else: + cdna_start = 1 + for (exon_start, exon_end) in ordered_exons: + exon_length = exon_end - exon_start + cdna_end = cdna_start + exon_length - 1 + cdna_match.append([exon_start, exon_end, cdna_start, cdna_end, None]) + cdna_start = cdna_end + 1 + return cdna_match + + def read_transcripts(refgene_file): """ Read all transcripts in a RefGene file. diff --git a/requirements-dev.txt b/requirements-dev.txt index 1018450..5b2667d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,4 @@ flake8==2.2.5 +lazy==1.4 nose==1.3.7 pyfaidx==0.5.0.1 diff --git a/setup.py b/setup.py index 3f94c25..34bcc44 100644 --- a/setup.py +++ b/setup.py @@ -20,17 +20,18 @@ def main(): setup( name='pyhgvs', - version='0.11.1', + version='0.12.5', description='HGVS name parsing and formatting', long_description=description, author='Matt Rasmussen', author_email='rasmus@counsyl.com', - packages=[str('pyhgvs'), str('pyhgvs.tests')], + packages=[str('pyhgvs'), str('pyhgvs.models'), str('pyhgvs.tests')], include_package_data=True, scripts=[], install_requires=['pip>=1.2'], tests_require=[ 'flake8==2.2.5', + 'lazy==1.4', 'nose==1.3.7', 'pyfaidx==0.5.0.1', ],