From 42d25b6fe8eb05cf3ca00134e58f7c068fc733aa Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld <50678786+jarbesfeld@users.noreply.github.com> Date: Fri, 1 Mar 2024 09:17:10 -0500 Subject: [PATCH] feat: Determine adjacent exon for fusions with non-exonic breakpoint (#268) Allows for the adjacent exon to be determined for fusions with breakpoints that do not occur on an exon --- src/cool_seq_tool/app.py | 5 +- .../mappers/exon_genomic_coords.py | 166 ++++++++++- src/cool_seq_tool/sources/uta_database.py | 30 ++ tests/conftest.py | 30 ++ tests/mappers/test_exon_genomic_coords.py | 265 ++++++++++++++++++ tests/sources/test_uta_database.py | 16 ++ 6 files changed, 509 insertions(+), 3 deletions(-) diff --git a/src/cool_seq_tool/app.py b/src/cool_seq_tool/app.py index e644babf..9d3444ac 100644 --- a/src/cool_seq_tool/app.py +++ b/src/cool_seq_tool/app.py @@ -86,5 +86,8 @@ def __init__( self.uta_db, ) self.ex_g_coords_mapper = ExonGenomicCoordsMapper( - self.uta_db, self.mane_transcript + self.seqrepo_access, + self.uta_db, + self.mane_transcript, + self.mane_transcript_mappings, ) diff --git a/src/cool_seq_tool/mappers/exon_genomic_coords.py b/src/cool_seq_tool/mappers/exon_genomic_coords.py index 17856d68..7ff4844c 100644 --- a/src/cool_seq_tool/mappers/exon_genomic_coords.py +++ b/src/cool_seq_tool/mappers/exon_genomic_coords.py @@ -2,6 +2,7 @@ import logging from typing import Dict, List, Optional, Tuple, TypeVar, Union +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.mappers.mane_transcript import CdnaRepresentation, ManeTranscript from cool_seq_tool.schemas import ( AnnotationLayer, @@ -13,6 +14,7 @@ TranscriptExonData, TranscriptExonDataResponse, ) +from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings from cool_seq_tool.sources.uta_database import UtaDatabase from cool_seq_tool.utils import get_inter_residue_pos, service_meta @@ -28,7 +30,13 @@ class ExonGenomicCoordsMapper: coordinate representation. """ - def __init__(self, uta_db: UtaDatabase, mane_transcript: ManeTranscript) -> None: + def __init__( + self, + seqrepo_access: SeqRepoAccess, + uta_db: UtaDatabase, + mane_transcript: ManeTranscript, + mane_transcript_mappings: ManeTranscriptMappings, + ) -> None: """Initialize ExonGenomicCoordsMapper class. A lot of resources are required for initialization, so when defaults are enough, @@ -50,11 +58,15 @@ def __init__(self, uta_db: UtaDatabase, mane_transcript: ManeTranscript) -> None >>> result.genomic_data.start, result.genomic_data.end (156864428, 156881456) + :param: seqrepo_access: SeqRepo instance to give access to query SeqRepo database :param uta_db: UtaDatabase instance to give access to query UTA database :param mane_transcript: Instance to align to MANE or compatible representation + :param mane_transcript_mappings: Instance to provide access to ManeTranscriptMappings class """ + self.seqrepo_access = seqrepo_access self.uta_db = uta_db self.mane_transcript = mane_transcript + self.mane_transcript_mappings = mane_transcript_mappings @staticmethod def _return_warnings( @@ -217,6 +229,7 @@ async def genomic_to_transcript_exon_coordinates( end: Optional[int] = None, strand: Optional[Strand] = None, transcript: Optional[str] = None, + get_nearest_transcript_junction: bool = False, gene: Optional[str] = None, residue_mode: Union[ ResidueMode.INTER_RESIDUE, ResidueMode.RESIDUE @@ -254,7 +267,13 @@ async def genomic_to_transcript_exon_coordinates( following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript. See the :ref:`Transcript Selection policy ` page. - :param gene: HGNC gene symbol + param get_nearest_transcript_junction: If ``True``, this will return the + adjacent exon if the position specified by``start`` or ``end`` does not + occur on an exon. For the positive strand, adjacent is defined as the exon + preceding the breakpoint for the 5' end and the exon following the + breakpoint for the 3' end. For the negative strand, adjacent is defined as + the exon following the breakpoint for the 5' end and the exon preceding the + breakpoint for the 3' end. :param residue_mode: Residue mode for ``start`` and ``end`` :return: Genomic data (inter-residue coordinates) """ @@ -280,6 +299,7 @@ async def genomic_to_transcript_exon_coordinates( strand=strand, transcript=transcript, gene=gene, + get_nearest_transcript_junction=get_nearest_transcript_junction, is_start=True, ) if start_data.transcript_exon_data: @@ -299,6 +319,7 @@ async def genomic_to_transcript_exon_coordinates( strand=strand, transcript=transcript, gene=gene, + get_nearest_transcript_junction=get_nearest_transcript_junction, is_start=False, ) if end_data.transcript_exon_data: @@ -453,6 +474,7 @@ async def _genomic_to_transcript_exon_coordinate( strand: Optional[Strand] = None, transcript: Optional[str] = None, gene: Optional[str] = None, + get_nearest_transcript_junction: bool = False, is_start: bool = True, ) -> TranscriptExonDataResponse: """Convert individual genomic data to transcript data @@ -469,6 +491,13 @@ async def _genomic_to_transcript_exon_coordinate( following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining Compatible Transcript :param gene: HGNC gene symbol + :param get_nearest_transcript_junction: If ``True``, this will return the + adjacent exon if the position specified by``start`` or ``end`` does not + occur on an exon. For the positive strand, adjacent is defined as the exon + preceding the breakpoint for the 5' end and the exon following the + breakpoint for the 3' end. For the negative strand, adjacent is defined as + the exon following the breakpoint for the 5' end and the exon preceding the + breakpoint for the 3' end. :param is_start: ``True`` if ``pos`` is start position. ``False`` if ``pos`` is end position. :return: Transcript data (inter-residue coordinates) @@ -484,6 +513,87 @@ async def _genomic_to_transcript_exon_coordinate( params = {key: None for key in TranscriptExonData.model_fields} + if get_nearest_transcript_junction: + if not gene or not strand: + return self._return_warnings( + resp, + "Gene or strand must be provided to select the adjacent transcript junction", + ) + alt_acs, w = self.seqrepo_access.chromosome_to_acs(chromosome) + + if not alt_acs: + return self._return_warnings(resp, w) + alt_ac = alt_acs[0] + + if not transcript: + # Select a transcript if not provided + mane_transcripts = self.mane_transcript_mappings.get_gene_mane_data( + gene + ) + + if mane_transcripts: + transcript = mane_transcripts[0]["RefSeq_nuc"] + else: + # Attempt to find a coding transcript if a MANE transcript + # cannot be found + results = await self.uta_db.get_transcripts( + gene=gene, alt_ac=alt_ac + ) + + if not results.is_empty(): + transcript = results[0]["tx_ac"][0] + else: + # Run if gene is for a noncoding transcript + query = f""" + SELECT DISTINCT tx_ac + FROM {self.uta_db.schema}.tx_exon_aln_v + WHERE hgnc = '{gene}' + AND alt_ac = '{alt_ac}' + """ # noqa: S608 + result = await self.uta_db.execute_query(query) + + if result: + transcript = result[0]["tx_ac"] + else: + return self._return_warnings( + resp, + f"Could not find a transcript for {gene} on {alt_ac}", + ) + + tx_genomic_coords, w = await self.uta_db.get_tx_exons_genomic_coords( + tx_ac=transcript, alt_ac=alt_ac + ) + if not tx_genomic_coords: + return self._return_warnings(resp, w) + + # Check if breakpoint occurs on an exon. + # If not, determine the adjacent exon given the selected transcript + if not self._is_exonic_breakpoint(pos, tx_genomic_coords): + exon = self._get_adjacent_exon( + tx_exons_genomic_coords=tx_genomic_coords, + strand=strand, + start=pos if is_start else None, + end=pos if not is_start else None, + ) + + params["exon"] = exon + params["transcript"] = transcript + params["gene"] = gene + params["pos"] = pos + params["chr"] = alt_ac + + self._set_exon_offset( + params=params, + start=tx_genomic_coords[exon - 1][3], # Start exon coordinate + end=tx_genomic_coords[exon - 1][4], # End exon coordinate + pos=pos, + is_start=is_start, + strand=strand, + ) + params["strand"] = strand.value + resp.transcript_exon_data = TranscriptExonData(**params) + return resp + if alt_ac: # Check if valid accession is given if not await self.uta_db.validate_genomic_ac(alt_ac): @@ -807,3 +917,55 @@ def _get_exon_number(tx_exons: List, tx_pos: int) -> int: break i += 1 return i + + @staticmethod + def _get_adjacent_exon( + tx_exons_genomic_coords: List[Tuple[int, int, int, int, int]], + strand: Strand, + start: Optional[int] = None, + end: Optional[int] = None, + ) -> int: + """Return the adjacent exon given a non-exonic breakpoint. For the positive + strand, adjacent is defined as the exon preceding the breakpoint for the 5' end + and the exon following the breakpoint for the 3' end. For the negative strand, + adjacent is defined as the exon following the breakpoint for the 5' end and the + exon preceding the breakpoint for the 3' end. + + :param: tx_exons_genomic_coords: List of tuples describing exons and genomic + coordinates for a transcript. Each tuple contains the transcript number + (0-indexed), the transcript coordinates for the exon, and the genomic + coordinates for the exon. Pos 0 in the tuple corresponds to the exon + number, pos 1 and pos 2 refer to the start and end transcript coordinates, + respectively, and pos 3 and 4 refer to the start and end genomic + coordinates, respectively. + :param strand: Strand + :param: start: Genomic coordinate of breakpoint + :param: end: Genomic coordinate of breakpoint + :return: Exon number corresponding to adjacent exon. Will be 1-based + """ + for i in range(len(tx_exons_genomic_coords) - 1): + exon = tx_exons_genomic_coords[i] + next_exon = tx_exons_genomic_coords[i + 1] + bp = start if start else end + if strand == strand.POSITIVE: + lte_exon = exon + gte_exon = next_exon + else: + lte_exon = next_exon + gte_exon = exon + if bp >= lte_exon[4] and bp <= gte_exon[3]: + break + # Return current exon if end position is provided, next exon if start position + # is provided. exon[0] needs to be incremented by 1 in both cases as exons are + # 0-based in UTA + return exon[0] + 1 if end else exon[0] + 2 + + @staticmethod + def _is_exonic_breakpoint(pos: int, tx_genomic_coords: List) -> bool: + """Check if a breakpoint occurs on an exon + + :param pos: Genomic breakpoint + :param tx_genomic_coords: A list of genomic coordinates for a transcript + :return: True if the breakpoint occurs on an exon + """ + return any(pos >= exon[3] and pos <= exon[4] for exon in tx_genomic_coords) diff --git a/src/cool_seq_tool/sources/uta_database.py b/src/cool_seq_tool/sources/uta_database.py index 81f108a3..4ff5e3de 100644 --- a/src/cool_seq_tool/sources/uta_database.py +++ b/src/cool_seq_tool/sources/uta_database.py @@ -348,6 +348,36 @@ async def get_tx_exons( tx_exons = [(r["tx_start_i"], r["tx_end_i"]) for r in result] return tx_exons, None + async def get_tx_exons_genomic_coords( + self, + tx_ac: str, + alt_ac: str, + ) -> Tuple[Optional[Tuple[int, int, int, int, int]], Optional[str]]: + """Get exon number, transcript coordinates, and genomic coordinates + + :param tx_ac: Transcript accession + :param alt_ac: RefSeq genomic accession + :return: Tuple of exon numbers, transcript and genomic coordinates, + and warnings if found + """ + query = f""" + SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i + FROM {self.schema}.tx_exon_aln_v + WHERE tx_ac = '{tx_ac}' + AND alt_ac = '{alt_ac}' + """ # noqa: S608 + result = await self.execute_query(query) + + if not result: + msg = f"Unable to get exons and genomic coordinates for {tx_ac} on {alt_ac}" + logger.warning(msg) + return None, msg + tx_exons_genomic_coords = [ + (r["ord"], r["tx_start_i"], r["tx_end_i"], r["alt_start_i"], r["alt_end_i"]) + for r in result + ] + return tx_exons_genomic_coords, None + async def get_alt_ac_start_or_end( self, tx_ac: str, tx_exon_start: int, tx_exon_end: int, gene: Optional[str] ) -> Tuple[Optional[Tuple[str, str, int, int, int]], Optional[str]]: diff --git a/tests/conftest.py b/tests/conftest.py index 26840285..cee55a79 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -61,6 +61,36 @@ def nm_152263_exons(): ] +@pytest.fixture(scope="session") +def nm_152263_exons_genomic_coords(): + """Create test fixture for NM_152263.4 exons and genomic coordinates.""" + return [ + (0, 0, 199, 154191901, 154192100), + (1, 199, 325, 154191185, 154191311), + (2, 325, 459, 154176114, 154176248), + (3, 459, 577, 154173083, 154173201), + (4, 577, 648, 154172907, 154172978), + (5, 648, 724, 154171412, 154171488), + (6, 724, 787, 154170648, 154170711), + (7, 787, 857, 154170399, 154170469), + (8, 857, 936, 154169304, 154169383), + (9, 936, 7064, 154161812, 154167940), + ] + + +@pytest.fixture(scope="session") +def nm_001105539_exons_genomic_coords(): + """Create test fixture for NM_001105539.3 exons and genomic coordinates.""" + return [ + (0, 0, 1557, 80486225, 80487782), + (1, 1557, 2446, 80499493, 80500382), + (2, 2446, 2545, 80513909, 80514008), + (3, 2545, 2722, 80518402, 80518579), + (4, 2722, 2895, 80518781, 80518954), + (5, 2895, 9938, 80519222, 80526265), + ] + + @pytest.fixture(scope="session") def tpm3_1_8_start_genomic(): """Create test fixture for genomic data for exon 1, 8""" diff --git a/tests/mappers/test_exon_genomic_coords.py b/tests/mappers/test_exon_genomic_coords.py index c7910fd4..735eb725 100644 --- a/tests/mappers/test_exon_genomic_coords.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -188,6 +188,114 @@ def ntrk1_exon10_exon17(): return GenomicData(**params) +@pytest.fixture(scope="module") +def zbtb10_exon3_end(): + """Create test fixture for ZBTB10, end of exon 3""" + params = { + "gene": "ZBTB10", + "chr": "NC_000008.11", + "start": None, + "end": 80514009, + "exon_start": None, + "exon_end": 3, + "exon_end_offset": 100, + "exon_start_offset": None, + "transcript": "NM_001105539.3", + "strand": Strand.POSITIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def zbtb10_exon5_start(): + """Create test fixture for ZBTB10, start of exon 5""" + params = { + "gene": "ZBTB10", + "chr": "NC_000008.11", + "start": 80518580, + "end": None, + "exon_start": 5, + "exon_start_offset": -374, + "exon_end": None, + "exon_end_offset": None, + "transcript": "NM_001105539.3", + "strand": Strand.POSITIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def tpm3_exon6_end(): + """Create test fixture for TPM3, end of exon 6""" + params = { + "gene": "TPM3", + "chr": "NC_000001.11", + "start": None, + "end": 154171409, + "exon_start": None, + "exon_start_offset": None, + "exon_end": 6, + "exon_end_offset": 3, + "transcript": "NM_152263.4", + "strand": Strand.NEGATIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def tpm3_exon5_start(): + """Create test fixture for TPM3, start of exon 5""" + params = { + "gene": "TPM3", + "chr": "NC_000001.11", + "start": 154173080, + "end": None, + "exon_start": 5, + "exon_start_offset": -102, + "exon_end": None, + "exon_end_offset": None, + "transcript": "NM_152263.4", + "strand": Strand.NEGATIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def gusbp3_exon2_end(): + """Create test fixture for GUSBP3, end of exon 2""" + params = { + "gene": "GUSBP3", + "chr": "NC_000005.10", + "start": None, + "end": 69680763, + "exon_start": None, + "exon_start_offset": None, + "exon_end": 2, + "exon_end_offset": 2, + "transcript": "NR_027386.2", + "strand": Strand.NEGATIVE, + } + return GenomicData(**params) + + +@pytest.fixture(scope="module") +def gusbp3_exon5_start(): + """Create test fixture for GUSBP3, start of exon 5""" + params = { + "gene": "GUSBP3", + "chr": "NC_000005.10", + "start": 69645878, + "end": None, + "exon_start": 5, + "exon_start_offset": -3589, + "exon_end": None, + "exon_end_offset": None, + "transcript": "NR_027386.2", + "strand": Strand.NEGATIVE, + } + return GenomicData(**params) + + def check_service_meta(actual): """Check that service metadata matches expected @@ -248,6 +356,163 @@ async def test_get_tx_exon_coords(test_egc_mapper, nm_152263_exons): assert resp[1] == "Exon 11 does not exist on NM_152263.3" +@pytest.mark.asyncio() +async def test_get_adjacent_exon( + test_egc_mapper, nm_152263_exons_genomic_coords, nm_001105539_exons_genomic_coords +): + """Test that get_adjacent_exon works properly""" + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_152263_exons_genomic_coords, + end=154191901, + strand=Strand.NEGATIVE, + ) + assert resp == 1 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_152263_exons_genomic_coords, + end=154191184, + strand=Strand.NEGATIVE, + ) + assert resp == 2 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_152263_exons_genomic_coords, + start=154191184, + strand=Strand.NEGATIVE, + ) + assert resp == 3 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_001105539_exons_genomic_coords, + end=80500385, + strand=Strand.POSITIVE, + ) + assert resp == 2 + resp = test_egc_mapper._get_adjacent_exon( + tx_exons_genomic_coords=nm_001105539_exons_genomic_coords, + start=80518580, + strand=Strand.POSITIVE, + ) + assert resp == 5 + + +def test_is_exonic_breakpoint(test_egc_mapper, nm_001105539_exons_genomic_coords): + """Test is breakpoint occurs on exon""" + resp = test_egc_mapper._is_exonic_breakpoint( + 80514010, nm_001105539_exons_genomic_coords + ) + assert resp is False # Breakpoint does not occur on an exon + + resp = test_egc_mapper._is_exonic_breakpoint( + 80499495, nm_001105539_exons_genomic_coords + ) + assert resp is True # Breakpoint does occur on an exon + + +@pytest.mark.asyncio() +async def test_genomic_to_transcript_fusion_context( + test_egc_mapper, + zbtb10_exon3_end, + zbtb10_exon5_start, + tpm3_exon6_end, + tpm3_exon5_start, + gusbp3_exon2_end, + gusbp3_exon5_start, +): + """Test that genomic to transcript works correctly for non-exonic breakpoints""" + inputs = { + "chromosome": "8", + "end": 80514010, + "strand": Strand.POSITIVE, + "gene": "ZBTB10", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, zbtb10_exon3_end) + + inputs = { + "chromosome": "8", + "start": 80518581, + "strand": Strand.POSITIVE, + "gene": "ZBTB10", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, zbtb10_exon5_start) + + inputs = { + "chromosome": "1", + "end": 154171410, + "strand": Strand.NEGATIVE, + "gene": "TPM3", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, tpm3_exon6_end) + + inputs = { + "chromosome": "1", + "start": 154173081, + "strand": Strand.NEGATIVE, + "gene": "TPM3", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, tpm3_exon5_start) + + inputs = { + "chromosome": "5", + "end": 69680764, + "strand": Strand.NEGATIVE, + "gene": "GUSBP3", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, gusbp3_exon2_end) + + inputs = { + "chromosome": "5", + "start": 69645879, + "strand": Strand.NEGATIVE, + "gene": "GUSBP3", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, gusbp3_exon5_start) + + inputs = { # Test when strand is not provided + "chromosome": "5", + "start": 69645879, + "gene": "GUSBP3", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + assert ( + resp.warnings[0] + == "Gene or strand must be provided to select the adjacent transcript junction" + ) + + inputs = { # Test when gene and strand are not provided + "chromosome": "5", + "start": 69645879, + "transcript": "NR_027386.2", + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + assert ( + resp.warnings[0] + == "Gene or strand must be provided to select the adjacent transcript junction" + ) + + inputs = { # Test when transcript is provided + "chromosome": "5", + "start": 69645879, + "gene": "GUSBP3", + "transcript": "NR_027386.2", + "strand": Strand.NEGATIVE, + "get_nearest_transcript_junction": True, + } + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) + genomic_data_assertion_checks(resp, gusbp3_exon5_start) + + @pytest.mark.asyncio() async def test_get_alt_ac_start_and_end( test_egc_mapper, tpm3_1_8_start_genomic, tpm3_1_8_end_genomic diff --git a/tests/sources/test_uta_database.py b/tests/sources/test_uta_database.py index df7d862f..a7b987a2 100644 --- a/tests/sources/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -69,6 +69,22 @@ async def test_get_tx_exons(test_db, nm_152263_exons): assert resp[1] == "Unable to get exons for NM_152263.36" +@pytest.mark.asyncio() +async def test_get_tx_exons_genomic_coords(test_db, nm_152263_exons_genomic_coords): + """Test that get_tx_exons_genomic_coords works correctly.""" + resp = await test_db.get_tx_exons_genomic_coords("NM_152263.4", "NC_000001.11") + assert resp[0] == nm_152263_exons_genomic_coords + assert resp[1] is None + + # Invalid transcript accession given chromosome accession + resp = await test_db.get_tx_exons_genomic_coords("NM_001105539.3", "NC_000001.11") + assert resp[0] is None + assert ( + resp[1] + == "Unable to get exons and genomic coordinates for NM_001105539.3 on NC_000001.11" + ) + + @pytest.mark.asyncio() async def test_get_cds_start_end(test_db): """Test that get_cds_start_end works correctly."""