From 1424f1bd4410fd18f249c7fc94ebed83daa447c1 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Thu, 12 Oct 2023 12:13:04 -0400 Subject: [PATCH] refactor!: rearrange app architecture (#196) * Remove gene-normalizer dependency * Removed `get_mapped_mane_data` (#194), which was the only reason why we needed gene-normalizer * Rearranges app architecture * handlers * `SeqRepoAccess` * `get_fasta_file` is now a method * mappers * `AlignmentMapper` * `MANETranscript` * `ExonGenomicCoordsMapper` * These methods were originally in `CoolSeqTool` * data_sources * `MANETranscriptMappings` * `TranscriptMappings` * `UTADatabase` --- Pipfile | 1 - cool_seq_tool/__init__.py | 3 - cool_seq_tool/app.py | 638 +----------------- cool_seq_tool/data_sources/__init__.py | 8 - cool_seq_tool/data_sources/gene_normalizer.py | 49 -- cool_seq_tool/handlers/__init__.py | 2 + .../seqrepo_access.py | 67 +- cool_seq_tool/mappers/__init__.py | 4 + .../alignment.py} | 4 +- cool_seq_tool/mappers/exon_genomic_coords.py | 541 +++++++++++++++ .../mane_transcript.py | 144 +--- cool_seq_tool/routers/__init__.py | 2 +- cool_seq_tool/routers/default.py | 15 +- cool_seq_tool/routers/mane.py | 59 +- cool_seq_tool/routers/mappings.py | 8 +- cool_seq_tool/schemas.py | 389 ++++------- cool_seq_tool/sources/__init__.py | 4 + .../mane_transcript_mappings.py | 2 +- .../transcript_mappings.py | 0 .../{data_sources => sources}/uta_database.py | 2 +- .../residue_mode.py => utils.py} | 20 +- cool_seq_tool/version.py | 2 +- setup.cfg | 1 - tests/conftest.py | 32 + tests/handlers/test_seqrepo_access.py | 305 +++++++++ .../test_alignment.py} | 5 +- .../test_exon_genomic_coords.py} | 350 +++------- .../{unit => mappers}/test_mane_transcript.py | 74 +- .../test_mane_transcript_mappings.py | 8 - tests/{unit => sources}/test_uta_database.py | 10 - .../test_residue_mode.py => test_utils.py} | 2 +- tests/unit/conftest.py | 12 - tests/unit/test_seqrepo_access.py | 124 ---- 33 files changed, 1255 insertions(+), 1632 deletions(-) 
delete mode 100644 cool_seq_tool/data_sources/__init__.py delete mode 100644 cool_seq_tool/data_sources/gene_normalizer.py create mode 100644 cool_seq_tool/handlers/__init__.py rename cool_seq_tool/{data_sources => handlers}/seqrepo_access.py (72%) create mode 100644 cool_seq_tool/mappers/__init__.py rename cool_seq_tool/{data_sources/alignment_mapper.py => mappers/alignment.py} (98%) create mode 100644 cool_seq_tool/mappers/exon_genomic_coords.py rename cool_seq_tool/{data_sources => mappers}/mane_transcript.py (86%) create mode 100644 cool_seq_tool/sources/__init__.py rename cool_seq_tool/{data_sources => sources}/mane_transcript_mappings.py (98%) rename cool_seq_tool/{data_sources => sources}/transcript_mappings.py (100%) rename cool_seq_tool/{data_sources => sources}/uta_database.py (99%) rename cool_seq_tool/{data_sources/residue_mode.py => utils.py} (70%) create mode 100644 tests/handlers/test_seqrepo_access.py rename tests/{unit/test_alignment_mapper.py => mappers/test_alignment.py} (98%) rename tests/{unit/test_cool_seq_tool.py => mappers/test_exon_genomic_coords.py} (54%) rename tests/{unit => mappers}/test_mane_transcript.py (88%) rename tests/{unit => sources}/test_mane_transcript_mappings.py (96%) rename tests/{unit => sources}/test_uta_database.py (98%) rename tests/{unit/test_residue_mode.py => test_utils.py} (90%) delete mode 100644 tests/unit/conftest.py delete mode 100644 tests/unit/test_seqrepo_access.py diff --git a/Pipfile b/Pipfile index 2295afdb..05f48d94 100644 --- a/Pipfile +++ b/Pipfile @@ -14,7 +14,6 @@ hgvs = "*" pydantic = "*" fastapi = "*" uvicorn = "*" -gene-normalizer = ">=0.1.34, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8" "ga4gh.vrs" = "*" [dev-packages] diff --git a/cool_seq_tool/__init__.py b/cool_seq_tool/__init__.py index 62f45e63..a671b1ee 100644 --- a/cool_seq_tool/__init__.py +++ b/cool_seq_tool/__init__.py @@ -1,5 +1,4 @@ """The cool_seq_tool package""" -from os import environ 
from pathlib import Path import logging @@ -13,5 +12,3 @@ logger.setLevel(logging.DEBUG) LOG_FN = "cool_seq_tool.log" - -from .app import CoolSeqTool # noqa: E402, F401, I202 diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py index b8360028..06749f7b 100644 --- a/cool_seq_tool/app.py +++ b/cool_seq_tool/app.py @@ -1,28 +1,22 @@ """Module for initializing data sources.""" -from datetime import datetime -from typing import Optional, TypeVar, Union, List, Tuple, Dict +from typing import Optional from pathlib import Path import logging from biocommons.seqrepo import SeqRepo -from gene.query import QueryHandler as GeneQueryHandler -from cool_seq_tool.data_sources.alignment_mapper import AlignmentMapper -from cool_seq_tool.data_sources.uta_database import UTA_DB_URL +from cool_seq_tool.mappers import ( + MANETranscript, AlignmentMapper, ExonGenomicCoordsMapper +) +from cool_seq_tool.sources.uta_database import UTA_DB_URL, UTADatabase +from cool_seq_tool.sources.mane_transcript_mappings import MANETranscriptMappings +from cool_seq_tool.sources.transcript_mappings import TranscriptMappings +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess from cool_seq_tool.paths import LRG_REFSEQGENE_PATH, MANE_SUMMARY_PATH, \ SEQREPO_ROOT_DIR, TRANSCRIPT_MAPPINGS_PATH -from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \ - ResidueMode, GenomicDataResponse, ServiceMeta, TranscriptExonDataResponse -from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings, \ - SeqRepoAccess, TranscriptMappings, UTADatabase, GeneNormalizer -from cool_seq_tool.version import __version__ - -logger = logging.getLogger("cool_seq_tool") -CoordinatesResponseType = TypeVar( - "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse -) +logger = logging.getLogger(__name__) class CoolSeqTool: @@ -33,26 +27,17 @@ def __init__( transcript_file_path: Path = TRANSCRIPT_MAPPINGS_PATH, lrg_refseqgene_path: Path = LRG_REFSEQGENE_PATH, 
mane_data_path: Path = MANE_SUMMARY_PATH, - db_url: str = UTA_DB_URL, gene_query_handler: Optional[GeneQueryHandler] = None, - gene_db_url: str = "", gene_db_region: str = "us-east-2", + db_url: str = UTA_DB_URL, sr: Optional[SeqRepo] = None ) -> None: """Initialize CoolSeqTool class - :param Path transcript_file_path: The path to transcript_mapping.tsv - :param Path lrg_refseqgene_path: The path to LRG_RefSeqGene - :param Path mane_data_path: Path to RefSeq MANE summary data - :param str db_url: PostgreSQL connection URL + :param transcript_file_path: The path to transcript_mapping.tsv + :param lrg_refseqgene_path: The path to LRG_RefSeqGene + :param mane_data_path: Path to RefSeq MANE summary data + :param db_url: PostgreSQL connection URL Format: `driver://user:password@host/database/schema` - :param Optional[GeneQueryHandler] gene_query_handler: Gene normalizer query - handler instance. If this is provided, will use a current instance. If this - is not provided, will create a new instance. - :param str gene_db_url: URL to gene normalizer dynamodb. Only used when - `gene_query_handler` is `None`. - :param str gene_db_region: AWS region for gene normalizer db. Only used when - `gene_query_handler` is `None`. - :param Optional[SeqRepo] sr: SeqRepo instance. If this is not provided, will - create a new instance. + :param sr: SeqRepo instance. 
If this is not provided, will create a new instance """ if not sr: sr = SeqRepo(root_dir=SEQREPO_ROOT_DIR) @@ -63,597 +48,10 @@ def __init__( self.mane_transcript_mappings = MANETranscriptMappings( mane_data_path=mane_data_path) self.uta_db = UTADatabase(db_url=db_url) - gene_normalizer = GeneNormalizer(gene_query_handler, gene_db_url, - gene_db_region) - self.gene_query_handler = gene_normalizer.query_handler self.alignment_mapper = AlignmentMapper( self.seqrepo_access, self.transcript_mappings, self.uta_db) self.mane_transcript = MANETranscript( self.seqrepo_access, self.transcript_mappings, - self.mane_transcript_mappings, self.uta_db, gene_normalizer) - - @staticmethod - def service_meta() -> ServiceMeta: - """Return ServiceMeta for cool_seq_tool - - :return: ServiceMeta object - """ - return ServiceMeta( - version=__version__, - response_datetime=datetime.now() - ) - - @staticmethod - def _return_warnings( - resp: CoordinatesResponseType, - warning_msg: str) -> CoordinatesResponseType: - """Add warnings to response object - - :param Union[GenomicDataResponse, TranscriptExonDataResponse] resp: - Response object - :param str warning_msg: Warning message on why `transcript_exon_data` - or `genomic_data` field is None - :return: Response object with warning message - """ - logger.warning(warning_msg) - resp.warnings.append(warning_msg) - return resp - - async def transcript_to_genomic_coordinates( - self, gene: Optional[str] = None, transcript: Optional[str] = None, - exon_start: Optional[int] = None, exon_start_offset: int = 0, # noqa: E501 - exon_end: Optional[int] = None, exon_end_offset: int = 0, - **kwargs) -> GenomicDataResponse: - """Get genomic data given transcript data. 
- Will use GRCh38 coordinates if possible - - :param gene: Gene symbol - :param transcript: Transcript accession - :param exon_start: Starting transcript exon number - :param exon_end: Ending transcript exon number - :param exon_start_offset: Starting exon offset - :param exon_end_offset: Ending exon offset - :return: GRCh38 genomic data (inter-residue coordinates) - """ - resp = GenomicDataResponse( - genomic_data=None, - warnings=[], - service_meta=self.service_meta() - ) - - if not transcript: - return self._return_warnings(resp, "Must provide `transcript`") - else: - transcript = transcript.strip() - - if exon_start is None and exon_end is None: - return self._return_warnings( - resp, "Must provide either `exon_start` or `exon_end`") - - if gene: - gene = gene.upper().strip() - - if exon_start and exon_end: - if exon_start > exon_end: - return self._return_warnings( - resp, - f"Start exon {exon_start} is greater than end exon {exon_end}" # noqa: E501 - ) - - tx_exons, warning = await self.uta_db.get_tx_exons(transcript) - if not tx_exons: - return self._return_warnings(resp, warning or "") - - tx_exon_coords, warning = self.uta_db.get_tx_exon_coords( - transcript, tx_exons, exon_start, exon_end) - if not tx_exon_coords: - return self._return_warnings(resp, warning or "") - tx_exon_start, tx_exon_end = tx_exon_coords - - alt_ac_start_end, warning = await self.uta_db.get_alt_ac_start_and_end( - transcript, tx_exon_start, tx_exon_end, gene=gene) - if not alt_ac_start_end: - return self._return_warnings(resp, warning or "") - alt_ac_start, alt_ac_end = alt_ac_start_end - - gene = alt_ac_start[0] if alt_ac_start else alt_ac_end[0] - chromosome = alt_ac_start[1] if alt_ac_start else alt_ac_end[1] - if gene is None or chromosome is None: - return self._return_warnings( - resp, "Unable to retrieve `gene` or `chromosome` from " - "genomic start or end data") - - start = alt_ac_start[3] if alt_ac_start else None - end = alt_ac_end[2] if alt_ac_end else None - strand = 
alt_ac_start[4] if alt_ac_start else alt_ac_end[4] - - # Using none since could set to 0 - start_exits = start is not None - end_exists = end is not None - - if strand == -1: - start_offset = exon_start_offset * -1 if start_exits else None - end_offset = exon_end_offset * -1 if end_exists else None - else: - start_offset = exon_start_offset if start_exits else None - end_offset = exon_end_offset if end_exists else None - - start = start + start_offset if start_exits else None - end = end + end_offset if end_exists else None - - resp.genomic_data = GenomicData( - gene=gene, - chr=chromosome, - start=start, - end=end, - exon_start=exon_start if start_exits else None, - exon_start_offset=exon_start_offset if start_exits else None, - exon_end=exon_end if end_exists else None, - exon_end_offset=exon_end_offset if end_exists else None, - transcript=transcript, - strand=strand - ) - - return resp - - async def genomic_to_transcript_exon_coordinates( - self, chromosome: Union[str, int], start: Optional[int] = None, - end: Optional[int] = None, strand: Optional[int] = None, - transcript: Optional[str] = None, gene: Optional[str] = None, - residue_mode: ResidueMode = ResidueMode.RESIDUE, - **kwargs) -> GenomicDataResponse: - """Get transcript data for genomic data. - MANE Transcript data will be returned iff `transcript` is not supplied. - `gene` must be supplied in order to retrieve MANE Transcript data. - Liftovers genomic coordinates to GRCh38 - - :param str chromosome: Chromosome. Must either give chromosome number - (i.e. `1`) or accession (i.e. `NC_000001.11`). - :param int start: Start genomic position - :param int end: End genomic position - :param str strand: Strand. Must be either `-1` or `1`. - :param str transcript: The transcript to use. If this is not given, - we will try the following transcripts: MANE Select, MANE Clinical - Plus, Longest Remaining Compatible Transcript - :param str gene: Gene symbol - :param str residue_mode: Default is `resiude` (1-based). 
- Must be either `residue` or `inter-residue` (0-based). - :return: Genomic data (inter-residue coordinates) - """ - resp = GenomicDataResponse( - genomic_data=None, - warnings=[], - service_meta=self.service_meta() - ) - if start is None and end is None: - return self._return_warnings( - resp, "Must provide either `start` or `end`") - - params = {key: None for key in GenomicData.__fields__.keys()} - if gene is not None: - gene = gene.upper().strip() - - if start: - if residue_mode == ResidueMode.RESIDUE: - start -= 1 - start_data = await self._genomic_to_transcript_exon_coordinate( - chromosome, start, strand=strand, transcript=transcript, - gene=gene, is_start=True, - residue_mode=ResidueMode.INTER_RESIDUE - ) - if start_data.transcript_exon_data: - start_data = start_data.transcript_exon_data.dict() - else: - return self._return_warnings(resp, start_data.warnings[0]) - else: - start_data = None - - if end: - if residue_mode == ResidueMode.RESIDUE: - end -= 1 - end_data = await self._genomic_to_transcript_exon_coordinate( - chromosome, end, strand=strand, transcript=transcript, - gene=gene, is_start=False, - residue_mode=ResidueMode.INTER_RESIDUE - ) - if end_data.transcript_exon_data: - end_data = end_data.transcript_exon_data.dict() - else: - return self._return_warnings(resp, end_data.warnings[0]) - else: - end_data = None - - for field in ["transcript", "gene", "chr", "strand"]: - if start_data: - if end_data: - if start_data[field] != end_data[field]: - msg = f"Start `{field}`, {start_data[field]}, does " \ - f"not match End `{field}`, {end_data[field]}" - return self._return_warnings(resp, msg) - params[field] = start_data[field] - else: - params[field] = end_data[field] - - if gene and gene != params["gene"]: - msg = f"Input gene, {gene}, does not match expected output" \ - f"gene, {params['gene']}" - return self._return_warnings(resp, msg) - - for label, data in [("start", start_data), ("end", end_data)]: - if data: - params[label] = data["pos"] - 
params[f"exon_{label}"] = data["exon"] - params[f"exon_{label}_offset"] = data["exon_offset"] - resp.genomic_data = GenomicData(**params) - return resp - - async def _genomic_to_transcript_exon_coordinate( - self, chromosome: Union[str, int], pos: int, strand: int = None, - transcript: str = None, gene: str = None, is_start: bool = True, - residue_mode: ResidueMode = ResidueMode.RESIDUE) -> TranscriptExonDataResponse: # noqa: E501 - """Convert individual genomic data to transcript data - - :param str chromosome: Chromosome. Must either give chromosome number - (i.e. `1`) or accession (i.e. `NC_000001.11`). - :param int pos: Genomic position - :param str strand: Strand. Must be either `-1` or `1`. - :param str transcript: The transcript to use. If this is not given, - we will try the following transcripts: MANE Select, MANE Clinical - Plus, Longest Remaining Compatible Transcript - :param str gene: Gene symbol - :param bool is_start: `True` if `pos` is start position. `False` if - `pos` is end position. - :param str residue_mode: Default is `resiude` (1-based). - Must be either `residue` or `inter-residue` (0-based). - :return: Transcript data (inter-residue coordinates) - """ - resp = TranscriptExonDataResponse( - transcript_exon_data=None, - warnings=[], - service_meta=self.service_meta() - ) - - if transcript is None and gene is None: - return self._return_warnings( - resp, "Must provide either `gene` or `transcript`" - ) - - params = {key: None for key in TranscriptExonData.__fields__.keys()} - - try: - # Check if just chromosome is given. 
If it is, we should - # convert this to the correct accession version - if chromosome == "X": - chromosome = 23 - elif chromosome == "Y": - chromosome = 24 - else: - chromosome = int(chromosome) - except ValueError: - # Check if valid accession is given - if not await self.uta_db.validate_genomic_ac(chromosome): - return self._return_warnings( - resp, f"Invalid chromosome: {chromosome}") - - if isinstance(chromosome, str): - # Accession given - genes_alt_acs, warning = \ - await self.uta_db.chr_to_gene_and_accessions( - chromosome, pos, strand=strand, alt_ac=chromosome, gene=gene) - else: - # Number given - genes_alt_acs, warning = \ - await self.uta_db.chr_to_gene_and_accessions( - chromosome, pos, strand=strand, alt_ac=None, gene=gene) - if not genes_alt_acs: - return self._return_warnings(resp, warning) - - gene_alt_ac, warning = self._get_gene_and_alt_ac(genes_alt_acs, gene) - if not gene_alt_ac: - return self._return_warnings(resp, warning) - gene, alt_ac = gene_alt_ac - - if transcript is None: - warnings = await self._set_mane_genomic_data( - params, gene, alt_ac, pos, strand, is_start, residue_mode) - if warnings: - return self._return_warnings(resp, warnings) - else: - params["transcript"] = transcript - params["gene"] = gene - params["pos"] = pos - params["chr"] = alt_ac - warning = await self._set_genomic_data(params, strand, is_start) - if warning: - return self._return_warnings(resp, warning) - - resp.transcript_exon_data = TranscriptExonData(**params) - return resp - - @staticmethod - def _get_gene_and_alt_ac( - genes_alt_acs: Dict, gene: Optional[str] - ) -> Tuple[Optional[Tuple[str, str]], Optional[str]]: - """Return gene genomic accession - - :param Dict genes_alt_acs: Dictionary containing genes and - genomic accessions - :param Optional[str] gene: Gene symbol - :return: [Gene, Genomic accession] if both exist - """ - alt_acs = genes_alt_acs["alt_acs"] - len_alt_acs = len(alt_acs) - if len_alt_acs > 1: - return None, f"Found more than one 
accessions: {alt_acs}" - elif len_alt_acs == 0: - return None, "No genomic accessions found" - alt_ac = next(iter(alt_acs)) - - genes = genes_alt_acs["genes"] - len_genes = len(genes) - input_gene = gene - output_gene = None - if len_genes == 1: - output_gene = next(iter(genes)) - elif len_genes > 1: - return None, f"Found more than one gene: {genes}" - elif len_genes == 0: - return None, "No genes found" - - if input_gene is not None: - if output_gene != input_gene.upper(): - return None, f"Input gene, {input_gene}, does not match " \ - f"expected output gene, {output_gene}" - - gene = output_gene if output_gene else input_gene - return (gene, alt_ac), None - - async def _set_mane_genomic_data( - self, params: Dict, gene: str, alt_ac: str, pos: int, strand: int, - is_start: bool, residue_mode: str - ) -> Optional[str]: - """Set genomic data in `params` found from MANE. - - :param Dict params: Parameters for response - :param str gene: Gene symbol - :param str alt_ac: Genomic accession - :param int pos: Genomic position - :param int strand: Strand - :param bool is_start: `True` if `pos` is start position. `False` if - `pos` is end position. 
- :param str residue_mode: Residue mode for start/end positions - Must be either `inter-residue` or `residue` - :return: Warnings if found - """ - mane_data = await self.mane_transcript.get_mane_transcript( - alt_ac, pos, "g", gene=gene, - try_longest_compatible=True, residue_mode=residue_mode - ) - if not mane_data: - msg = f"Unable to find mane data for {alt_ac} with position {pos}" - if gene: - msg += f" on gene {gene}" - logger.warning(msg) - return msg - - if mane_data["strand"] == "-": - mane_data["strand"] = -1 - elif mane_data["strand"] == "+": - mane_data["strand"] = 1 - - params["gene"] = mane_data["gene"] - params["transcript"] = mane_data["refseq"] if mane_data["refseq"] \ - else mane_data["ensembl"] if mane_data["ensembl"] else None - tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac) - if not tx_exons: - return f"Unable to get exons for {params['transcript']}" - tx_pos = mane_data["pos"][0] + mane_data["coding_start_site"] - params["exon"] = self._get_exon_number(tx_exons, tx_pos) - - try: - tx_exon = tx_exons[params["exon"] - 1] - except IndexError: - msg = f"{params['transcript']} with position {tx_pos} "\ - f"does not exist on exons: {tx_exons}" - logger.warning(msg) - return msg - - strand_to_use = strand if strand is not None else mane_data["strand"] - params["strand"] = strand_to_use - self._set_exon_offset(params, tx_exon[0], tx_exon[1], tx_pos, - is_start=is_start, strand=strand_to_use) - - # Need to check if we need to change pos for liftover - genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end( - params["transcript"], tx_pos, tx_pos, gene) - if genomic_data is None: - return warnings - - params["chr"] = genomic_data[1] - genomic_coords = genomic_data[2], genomic_data[3] - genomic_pos = genomic_coords[1] if is_start else genomic_coords[0] - params["pos"] = genomic_pos - params["exon_offset"] if \ - strand_to_use == -1 else genomic_pos + params["exon_offset"] - return None - - async def 
_set_genomic_data(self, params: Dict, strand: int, - is_start: bool) -> Optional[str]: - """Set genomic data in `params`. - - :param Dict params: Parameters for response - :param int strand: Strand - :param bool is_start: `True` if `pos` is start position. `False` if - `pos` is end position. - :return: Warnings if found - """ - # We should always try to liftover - grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"]) - if not grch38_ac: - return f"Invalid genomic accession: {params['chr']}" - - grch38_ac = grch38_ac[0] - if grch38_ac != params["chr"]: # params["chr"] is genomic accession - # Liftover to 38 - descr = await self.uta_db.get_chr_assembly(params["chr"]) - if descr is None: - return f"Unable to get chromosome and assembly for " \ - f"{params['chr']}" - - chromosome_number, assembly = descr - liftover_data = self.uta_db.get_liftover( - chromosome_number, params["pos"], Assembly.GRCH38) - if liftover_data is None: - return f"Position {params['pos']} does not exist on " \ - f"chromosome {chromosome_number}" - - params["pos"] = liftover_data[1] - params["chr"] = grch38_ac - - tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac) - if not tx_exons: - return f"Unable to get exons for {params['transcript']}" - data = await self.uta_db.get_tx_exon_aln_v_data( - params["transcript"], params["pos"], params["pos"], - alt_ac=params["chr"], use_tx_pos=False) - if len(data) != 1: - return f"Must find exactly one row for genomic data, " \ - f"but found: {len(data)}" - - # Find exon number - data = data[0] - data_exons = data[2], data[3] - i = 1 - found_tx_exon = False - for exon in tx_exons: - if data_exons == exon: - found_tx_exon = True - break - i += 1 - if not found_tx_exon: - # Either first or last - i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1 - params["exon"] = i - - strand_to_use = strand if strand is not None else data[7] - params["strand"] = strand_to_use - self._set_exon_offset(params, data[5], data[6], 
params["pos"], - is_start=is_start, strand=strand_to_use) - return None - - @staticmethod - def _set_exon_offset(params: Dict, start: int, end: int, pos: int, - is_start: bool, strand: int) -> None: - """Set `exon_offset` in params. - - :param Dict params: Parameters for response - :param int start: Start exon coord (can be transcript or genomic) - :param int end: End exon coord (can be transcript or genomic) - :param int pos: Position change (can be transcript or genomic) - :param bool is_start: `True` if `pos` is start position. - `False` if `pos` is end position - :param int strand: Strand - """ - if is_start: - if strand == -1: - params["exon_offset"] = end - pos - else: - params["exon_offset"] = pos - end - else: - if strand == -1: - params["exon_offset"] = start - pos - else: - params["exon_offset"] = pos - start - - async def _structure_exons( - self, transcript: str, alt_ac: Optional[str] = None - ) -> List[Tuple[int, int]]: - """Structure exons as list of tuples. - - :param str transcript: Transcript accession - :param Optional[str] alt_ac: Genomic accession - :return: List of tuples containing transcript exon coordinates - """ - result = list() - tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac) - if not tx_exons: - return result - for coords in tx_exons: - result.append((coords[0], coords[1])) - return result - - @staticmethod - def _get_exon_number(tx_exons: List, tx_pos: int) -> int: - """Find exon number. - - :param List tx_exons: List of exon coordinates - :param int tx_pos: Transcript position change - :return: Exon number associated to transcript position change - """ - i = 1 - for coords in tx_exons: - if coords[0] <= tx_pos <= coords[1]: - break - i += 1 - return i - - def get_fasta_file( - self, sequence_id: str, outfile_path: Path - ) -> None: - """Retrieve FASTA file containing sequence for requested sequence ID. 
- :param sequence_id: accession ID, sans namespace, eg `NM_152263.3` - :param outfile_path: path to save file to - :return: None, but saves sequence data to `outfile_path` if successful - :raise: KeyError if SeqRepo doesn't have sequence data for the given ID - """ - sequence = self.seqrepo_access.get_reference_sequence(sequence_id)[0] - if not sequence: - raise KeyError - - REFSEQ_PREFIXES = [ - "NC_", - "AC_", - "NZ_", - "NT_", - "NW_", - "NG_", - "NM_", - "XM_", - "NR_", - "XR_", - "NP_", - "AP_", - "XP_", - "YP_", - "WP_" - ] - ENSEMBL_PREFIXES = [ - "ENSE", - "ENSFM", - "ENSG", - "ENSGT", - "ENSP", - "ENSR", - "ENST" - ] - - if sequence_id[:3] in REFSEQ_PREFIXES: - aliases = self.seqrepo_access.translate_identifier( - sequence_id, ["ensembl", "ga4gh"] - ) - header = f">refseq:{sequence_id}|{'|'.join(aliases[0])}" - elif sequence_id[:4] in ENSEMBL_PREFIXES: - aliases = self.seqrepo_access.translate_identifier( - sequence_id, ["refseq", "ga4gh"] - ) - header = f">ensembl:{sequence_id}|{'|'.join(aliases[0])}" - else: - aliases = self.seqrepo_access.translate_identifier( - sequence_id, ["ensembl", "refseq", "ga4gh"] - ) - header = f">gnl|ID|{sequence_id}|{'|'.join(aliases[0])}" - - LINE_LENGTH = 60 - file_data = [header] + [sequence[i: i + LINE_LENGTH] - for i in range(0, len(sequence), LINE_LENGTH)] - text = "\n".join(file_data) - outfile_path.write_text(text) + self.mane_transcript_mappings, self.uta_db) + self.ex_g_coords_mapper = ExonGenomicCoordsMapper(self.uta_db, + self.mane_transcript) diff --git a/cool_seq_tool/data_sources/__init__.py b/cool_seq_tool/data_sources/__init__.py deleted file mode 100644 index 02f01e4b..00000000 --- a/cool_seq_tool/data_sources/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Module for data sources""" -from .seqrepo_access import SeqRepoAccess -from .mane_transcript_mappings import MANETranscriptMappings -from .transcript_mappings import TranscriptMappings -from .uta_database import UTADatabase -from .gene_normalizer import 
GeneNormalizer -from .mane_transcript import MANETranscript -from .alignment_mapper import AlignmentMapper diff --git a/cool_seq_tool/data_sources/gene_normalizer.py b/cool_seq_tool/data_sources/gene_normalizer.py deleted file mode 100644 index 3b761f4a..00000000 --- a/cool_seq_tool/data_sources/gene_normalizer.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Module for accessing Gene Normalizer""" -import logging -from typing import Dict, Optional - -from gene.database.dynamodb import DynamoDbDatabase -from gene.query import QueryHandler -from gene.schemas import SourceName - - -logger = logging.getLogger("cool_seq_tool") - - -class GeneNormalizer: - """Gene Normalizer class for getting gene data""" - - def __init__( - self, query_handler: Optional[QueryHandler] = None, db_url: str = "", - db_region: str = "us-east-2" - ) -> None: - """Initialize gene normalizer class - - :param QueryHandler query_handler: Gene normalizer query handler instance. - If this is provided, will use a current instance. If this is not provided, - will create a new instance. - :param str db_url: URL to gene normalizer dynamodb. Only used when - `query_handler` is `None`. - :param str db_region: AWS region for gene normalizer db. Only used when - `query_handler` is `None`. 
- """ - if query_handler: - self.query_handler = query_handler - else: - ddb = DynamoDbDatabase(db_url=db_url, region_name=db_region) - self.query_handler = QueryHandler(ddb) - - def get_hgnc_data(self, gene: str) -> Dict: - """Return HGNC data for a given gene - - :param str gene: Gene query - :return: HGNC data - """ - hgnc_data = dict() - gene_resp = self.query_handler.normalize_unmerged(gene) - hgnc_matches = gene_resp.source_matches.get(SourceName.HGNC) - if hgnc_matches and hgnc_matches.records: - hgnc_data = hgnc_matches.records[0].dict() - else: - logger.warning(f"Unable to get HGNC symbol for {gene}") - return hgnc_data diff --git a/cool_seq_tool/handlers/__init__.py b/cool_seq_tool/handlers/__init__.py new file mode 100644 index 00000000..d8c49db0 --- /dev/null +++ b/cool_seq_tool/handlers/__init__.py @@ -0,0 +1,2 @@ +"""Module for extending clients""" +from .seqrepo_access import SeqRepoAccess diff --git a/cool_seq_tool/data_sources/seqrepo_access.py b/cool_seq_tool/handlers/seqrepo_access.py similarity index 72% rename from cool_seq_tool/data_sources/seqrepo_access.py rename to cool_seq_tool/handlers/seqrepo_access.py index 14a0a1ab..af73d151 100644 --- a/cool_seq_tool/data_sources/seqrepo_access.py +++ b/cool_seq_tool/handlers/seqrepo_access.py @@ -2,14 +2,15 @@ import logging from typing import Optional, List, Tuple, Union from os import environ +from pathlib import Path from ga4gh.vrs.dataproxy import SeqRepoDataProxy from cool_seq_tool.schemas import ResidueMode -from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos +from cool_seq_tool.utils import get_inter_residue_pos -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) class SeqRepoAccess(SeqRepoDataProxy): @@ -139,3 +140,65 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]: return None, f"Unable to get chromosome for {ac}" else: return aliases, None + + def get_fasta_file( + self, sequence_id: str, outfile_path: Path + 
) -> None: + """Retrieve FASTA file containing sequence for requested sequence ID. + :param sequence_id: accession ID, sans namespace, eg `NM_152263.3` + :param outfile_path: path to save file to + :return: None, but saves sequence data to `outfile_path` if successful + :raise: KeyError if SeqRepo doesn't have sequence data for the given ID + """ + sequence = self.get_reference_sequence(sequence_id)[0] + if not sequence: + raise KeyError + + REFSEQ_PREFIXES = [ + "NC_", + "AC_", + "NZ_", + "NT_", + "NW_", + "NG_", + "NM_", + "XM_", + "NR_", + "XR_", + "NP_", + "AP_", + "XP_", + "YP_", + "WP_" + ] + ENSEMBL_PREFIXES = [ + "ENSE", + "ENSFM", + "ENSG", + "ENSGT", + "ENSP", + "ENSR", + "ENST" + ] + + if sequence_id[:3] in REFSEQ_PREFIXES: + aliases = self.translate_identifier( + sequence_id, ["ensembl", "ga4gh"] + ) + header = f">refseq:{sequence_id}|{'|'.join(aliases[0])}" + elif sequence_id[:4] in ENSEMBL_PREFIXES: + aliases = self.translate_identifier( + sequence_id, ["refseq", "ga4gh"] + ) + header = f">ensembl:{sequence_id}|{'|'.join(aliases[0])}" + else: + aliases = self.translate_identifier( + sequence_id, ["ensembl", "refseq", "ga4gh"] + ) + header = f">gnl|ID|{sequence_id}|{'|'.join(aliases[0])}" + + LINE_LENGTH = 60 + file_data = [header] + [sequence[i: i + LINE_LENGTH] + for i in range(0, len(sequence), LINE_LENGTH)] + text = "\n".join(file_data) + outfile_path.write_text(text) diff --git a/cool_seq_tool/mappers/__init__.py b/cool_seq_tool/mappers/__init__.py new file mode 100644 index 00000000..75ba954e --- /dev/null +++ b/cool_seq_tool/mappers/__init__.py @@ -0,0 +1,4 @@ +"""Module for mapping data""" +from .alignment import AlignmentMapper +from .mane_transcript import MANETranscript +from .exon_genomic_coords import ExonGenomicCoordsMapper diff --git a/cool_seq_tool/data_sources/alignment_mapper.py b/cool_seq_tool/mappers/alignment.py similarity index 98% rename from cool_seq_tool/data_sources/alignment_mapper.py rename to 
cool_seq_tool/mappers/alignment.py index 14839828..ad6454a5 100644 --- a/cool_seq_tool/data_sources/alignment_mapper.py +++ b/cool_seq_tool/mappers/alignment.py @@ -4,8 +4,8 @@ from typing import Optional, Tuple, Dict from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode -from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, \ - UTADatabase +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess +from cool_seq_tool.sources import TranscriptMappings, UTADatabase class AlignmentMapper: diff --git a/cool_seq_tool/mappers/exon_genomic_coords.py b/cool_seq_tool/mappers/exon_genomic_coords.py new file mode 100644 index 00000000..f0980b10 --- /dev/null +++ b/cool_seq_tool/mappers/exon_genomic_coords.py @@ -0,0 +1,541 @@ +"""Module for mapping transcript exon to and from genomic coordinates""" +import logging +from typing import Optional, TypeVar, Union, Dict, Tuple, List + +from cool_seq_tool.schemas import Assembly, GenomicData, TranscriptExonData, \ + ResidueMode, GenomicDataResponse, TranscriptExonDataResponse +from cool_seq_tool.mappers import MANETranscript +from cool_seq_tool.sources.uta_database import UTADatabase +from cool_seq_tool.utils import service_meta + + +CoordinatesResponseType = TypeVar( + "CoordinatesResponseType", GenomicDataResponse, TranscriptExonDataResponse +) + +logger = logging.getLogger(__name__) + + +class ExonGenomicCoordsMapper: + """Class for mapping transcript exon representation to/from genomic coordinate + representation + """ + + def __init__(self, uta_db: UTADatabase, mane_transcript: MANETranscript) -> None: + """Initialize ExonGenomicCoordsMapper class + + :param uta_db: UTADatabase instance to give access to query UTA database + :param mane_transcript: Instance to align to MANE or compatible representation + """ + self.uta_db = uta_db + self.mane_transcript = mane_transcript + + @staticmethod + def _return_warnings(resp: CoordinatesResponseType, + warning_msg: str) -> 
CoordinatesResponseType: + """Add warnings to response object + + :param resp: Response object + :param warning_msg: Warning message on why `transcript_exon_data` or + `genomic_data` field is None + :return: Response object with warning message + """ + logger.warning(warning_msg) + resp.warnings.append(warning_msg) + return resp + + async def transcript_to_genomic_coordinates( + self, gene: Optional[str] = None, transcript: Optional[str] = None, + exon_start: Optional[int] = None, exon_start_offset: int = 0, # noqa: E501 + exon_end: Optional[int] = None, exon_end_offset: int = 0, + **kwargs) -> GenomicDataResponse: + """Get genomic data given transcript data. + Will use GRCh38 coordinates if possible + + :param gene: Gene symbol + :param transcript: Transcript accession + :param exon_start: Starting transcript exon number + :param exon_end: Ending transcript exon number + :param exon_start_offset: Starting exon offset + :param exon_end_offset: Ending exon offset + :return: GRCh38 genomic data (inter-residue coordinates) + """ + resp = GenomicDataResponse( + genomic_data=None, + warnings=[], + service_meta=service_meta() + ) + + if not transcript: + return self._return_warnings(resp, "Must provide `transcript`") + else: + transcript = transcript.strip() + + if exon_start is None and exon_end is None: + return self._return_warnings( + resp, "Must provide either `exon_start` or `exon_end`") + + if gene: + gene = gene.upper().strip() + + if exon_start and exon_end: + if exon_start > exon_end: + return self._return_warnings( + resp, + f"Start exon {exon_start} is greater than end exon {exon_end}" # noqa: E501 + ) + + tx_exons, warning = await self.uta_db.get_tx_exons(transcript) + if not tx_exons: + return self._return_warnings(resp, warning or "") + + tx_exon_coords, warning = self.uta_db.get_tx_exon_coords( + transcript, tx_exons, exon_start, exon_end) + if not tx_exon_coords: + return self._return_warnings(resp, warning or "") + tx_exon_start, tx_exon_end = 
tx_exon_coords + + alt_ac_start_end, warning = await self.uta_db.get_alt_ac_start_and_end( + transcript, tx_exon_start, tx_exon_end, gene=gene) + if not alt_ac_start_end: + return self._return_warnings(resp, warning or "") + alt_ac_start, alt_ac_end = alt_ac_start_end + + gene = alt_ac_start[0] if alt_ac_start else alt_ac_end[0] + chromosome = alt_ac_start[1] if alt_ac_start else alt_ac_end[1] + if gene is None or chromosome is None: + return self._return_warnings( + resp, "Unable to retrieve `gene` or `chromosome` from " + "genomic start or end data") + + start = alt_ac_start[3] if alt_ac_start else None + end = alt_ac_end[2] if alt_ac_end else None + strand = alt_ac_start[4] if alt_ac_start else alt_ac_end[4] + + # Using none since could set to 0 + start_exits = start is not None + end_exists = end is not None + + if strand == -1: + start_offset = exon_start_offset * -1 if start_exits else None + end_offset = exon_end_offset * -1 if end_exists else None + else: + start_offset = exon_start_offset if start_exits else None + end_offset = exon_end_offset if end_exists else None + + start = start + start_offset if start_exits else None + end = end + end_offset if end_exists else None + + resp.genomic_data = GenomicData( + gene=gene, + chr=chromosome, + start=start, + end=end, + exon_start=exon_start if start_exits else None, + exon_start_offset=exon_start_offset if start_exits else None, + exon_end=exon_end if end_exists else None, + exon_end_offset=exon_end_offset if end_exists else None, + transcript=transcript, + strand=strand + ) + + return resp + + async def genomic_to_transcript_exon_coordinates( + self, chromosome: Union[str, int], start: Optional[int] = None, + end: Optional[int] = None, strand: Optional[int] = None, + transcript: Optional[str] = None, gene: Optional[str] = None, + residue_mode: ResidueMode = ResidueMode.RESIDUE, + **kwargs) -> GenomicDataResponse: + """Get transcript data for genomic data. 
+ MANE Transcript data will be returned iff `transcript` is not supplied. + `gene` must be supplied in order to retrieve MANE Transcript data. + Liftovers genomic coordinates to GRCh38 + + :param chromosome: Chromosome. Must either give chromosome number (i.e. `1`) or + accession (i.e. `NC_000001.11`). + :param start: Start genomic position + :param end: End genomic position + :param strand: Strand. Must be either `-1` or `1`. + :param transcript: The transcript to use. If this is not given, we will try the + following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining + Compatible Transcript + :param gene: Gene symbol + :param residue_mode: Default is `residue` (1-based). Must be either `residue` or + `inter-residue` (0-based). + :return: Genomic data (inter-residue coordinates) + """ + resp = GenomicDataResponse( + genomic_data=None, + warnings=[], + service_meta=service_meta() + ) + if start is None and end is None: + return self._return_warnings( + resp, "Must provide either `start` or `end`") + + params = {key: None for key in GenomicData.__fields__.keys()} + if gene is not None: + gene = gene.upper().strip() + + if start: + if residue_mode == ResidueMode.RESIDUE: + start -= 1 + start_data = await self._genomic_to_transcript_exon_coordinate( + chromosome, start, strand=strand, transcript=transcript, + gene=gene, is_start=True, + residue_mode=ResidueMode.INTER_RESIDUE + ) + if start_data.transcript_exon_data: + start_data = start_data.transcript_exon_data.model_dump() + else: + return self._return_warnings(resp, start_data.warnings[0]) + else: + start_data = None + + if end: + if residue_mode == ResidueMode.RESIDUE: + end -= 1 + end_data = await self._genomic_to_transcript_exon_coordinate( + chromosome, end, strand=strand, transcript=transcript, + gene=gene, is_start=False, + residue_mode=ResidueMode.INTER_RESIDUE + ) + if end_data.transcript_exon_data: + end_data = end_data.transcript_exon_data.model_dump() + else: + return 
self._return_warnings(resp, end_data.warnings[0]) + else: + end_data = None + + for field in ["transcript", "gene", "chr", "strand"]: + if start_data: + if end_data: + if start_data[field] != end_data[field]: + msg = f"Start `{field}`, {start_data[field]}, does " \ + f"not match End `{field}`, {end_data[field]}" + return self._return_warnings(resp, msg) + params[field] = start_data[field] + else: + params[field] = end_data[field] + + if gene and gene != params["gene"]: + msg = f"Input gene, {gene}, does not match expected output" \ + f"gene, {params['gene']}" + return self._return_warnings(resp, msg) + + for label, data in [("start", start_data), ("end", end_data)]: + if data: + params[label] = data["pos"] + params[f"exon_{label}"] = data["exon"] + params[f"exon_{label}_offset"] = data["exon_offset"] + resp.genomic_data = GenomicData(**params) + return resp + + async def _genomic_to_transcript_exon_coordinate( + self, chromosome: Union[str, int], pos: int, strand: int = None, + transcript: str = None, gene: str = None, is_start: bool = True, + residue_mode: ResidueMode = ResidueMode.RESIDUE) -> TranscriptExonDataResponse: # noqa: E501 + """Convert individual genomic data to transcript data + + :param chromosome: Chromosome. Must either give chromosome number (i.e. `1`) or + accession (i.e. `NC_000001.11`). + :param pos: Genomic position + :param strand: Strand. Must be either `-1` or `1`. + :param transcript: The transcript to use. If this is not given, we will try the + following transcripts: MANE Select, MANE Clinical Plus, Longest Remaining + Compatible Transcript + :param gene: Gene symbol + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position. + :param residue_mode: Default is `residue` (1-based). Must be either `residue` + or `inter-residue` (0-based). 
+ :return: Transcript data (inter-residue coordinates) + """ + resp = TranscriptExonDataResponse( + transcript_exon_data=None, + warnings=[], + service_meta=service_meta() + ) + + if transcript is None and gene is None: + return self._return_warnings( + resp, "Must provide either `gene` or `transcript`" + ) + + params = {key: None for key in TranscriptExonData.__fields__.keys()} + + try: + # Check if just chromosome is given. If it is, we should + # convert this to the correct accession version + if chromosome == "X": + chromosome = 23 + elif chromosome == "Y": + chromosome = 24 + else: + chromosome = int(chromosome) + except ValueError: + # Check if valid accession is given + if not await self.uta_db.validate_genomic_ac(chromosome): + return self._return_warnings( + resp, f"Invalid chromosome: {chromosome}") + + if isinstance(chromosome, str): + # Accession given + genes_alt_acs, warning = \ + await self.uta_db.chr_to_gene_and_accessions( + chromosome, pos, strand=strand, alt_ac=chromosome, gene=gene) + else: + # Number given + genes_alt_acs, warning = \ + await self.uta_db.chr_to_gene_and_accessions( + chromosome, pos, strand=strand, alt_ac=None, gene=gene) + if not genes_alt_acs: + return self._return_warnings(resp, warning) + + gene_alt_ac, warning = self._get_gene_and_alt_ac(genes_alt_acs, gene) + if not gene_alt_ac: + return self._return_warnings(resp, warning) + gene, alt_ac = gene_alt_ac + + if transcript is None: + warnings = await self._set_mane_genomic_data( + params, gene, alt_ac, pos, strand, is_start, residue_mode) + if warnings: + return self._return_warnings(resp, warnings) + else: + params["transcript"] = transcript + params["gene"] = gene + params["pos"] = pos + params["chr"] = alt_ac + warning = await self._set_genomic_data(params, strand, is_start) + if warning: + return self._return_warnings(resp, warning) + + resp.transcript_exon_data = TranscriptExonData(**params) + return resp + + @staticmethod + def _get_gene_and_alt_ac( + genes_alt_acs: 
Dict, gene: Optional[str] + ) -> Tuple[Optional[Tuple[str, str]], Optional[str]]: + """Return gene genomic accession + + :param genes_alt_acs: Dictionary containing genes and genomic accessions + :param gene: Gene symbol + :return: (Gene, Genomic accession) if both exist + """ + alt_acs = genes_alt_acs["alt_acs"] + len_alt_acs = len(alt_acs) + if len_alt_acs > 1: + return None, f"Found more than one accessions: {alt_acs}" + elif len_alt_acs == 0: + return None, "No genomic accessions found" + alt_ac = next(iter(alt_acs)) + + genes = genes_alt_acs["genes"] + len_genes = len(genes) + input_gene = gene + output_gene = None + if len_genes == 1: + output_gene = next(iter(genes)) + elif len_genes > 1: + return None, f"Found more than one gene: {genes}" + elif len_genes == 0: + return None, "No genes found" + + if input_gene is not None: + if output_gene != input_gene.upper(): + return None, f"Input gene, {input_gene}, does not match " \ + f"expected output gene, {output_gene}" + + gene = output_gene if output_gene else input_gene + return (gene, alt_ac), None + + async def _set_mane_genomic_data( + self, params: Dict, gene: str, alt_ac: str, pos: int, strand: int, + is_start: bool, residue_mode: str + ) -> Optional[str]: + """Set genomic data in `params` found from MANE. + + :param params: Parameters for response + :param gene: Gene symbol + :param alt_ac: Genomic accession + :param pos: Genomic position + :param strand: Strand + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position. + :param residue_mode: Residue mode for start/end positions. 
Must be either + `inter-residue` or `residue` + :return: Warnings if found + """ + mane_data = await self.mane_transcript.get_mane_transcript( + alt_ac, pos, "g", gene=gene, + try_longest_compatible=True, residue_mode=residue_mode + ) + if not mane_data: + msg = f"Unable to find mane data for {alt_ac} with position {pos}" + if gene: + msg += f" on gene {gene}" + logger.warning(msg) + return msg + + if mane_data["strand"] == "-": + mane_data["strand"] = -1 + elif mane_data["strand"] == "+": + mane_data["strand"] = 1 + + params["gene"] = mane_data["gene"] + params["transcript"] = mane_data["refseq"] if mane_data["refseq"] \ + else mane_data["ensembl"] if mane_data["ensembl"] else None + tx_exons = await self._structure_exons(params["transcript"], alt_ac=alt_ac) + if not tx_exons: + return f"Unable to get exons for {params['transcript']}" + tx_pos = mane_data["pos"][0] + mane_data["coding_start_site"] + params["exon"] = self._get_exon_number(tx_exons, tx_pos) + + try: + tx_exon = tx_exons[params["exon"] - 1] + except IndexError: + msg = f"{params['transcript']} with position {tx_pos} "\ + f"does not exist on exons: {tx_exons}" + logger.warning(msg) + return msg + + strand_to_use = strand if strand is not None else mane_data["strand"] + params["strand"] = strand_to_use + self._set_exon_offset(params, tx_exon[0], tx_exon[1], tx_pos, + is_start=is_start, strand=strand_to_use) + + # Need to check if we need to change pos for liftover + genomic_data, warnings = await self.uta_db.get_alt_ac_start_or_end( + params["transcript"], tx_pos, tx_pos, gene) + if genomic_data is None: + return warnings + + params["chr"] = genomic_data[1] + genomic_coords = genomic_data[2], genomic_data[3] + genomic_pos = genomic_coords[1] if is_start else genomic_coords[0] + params["pos"] = genomic_pos - params["exon_offset"] if \ + strand_to_use == -1 else genomic_pos + params["exon_offset"] + return None + + async def _set_genomic_data(self, params: Dict, strand: int, + is_start: bool) -> 
Optional[str]: + """Set genomic data in `params` + + :param params: Parameters for response + :param strand: Strand + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position. + :return: Warnings if found + """ + # We should always try to liftover + grch38_ac = await self.uta_db.get_newest_assembly_ac(params["chr"]) + if not grch38_ac: + return f"Invalid genomic accession: {params['chr']}" + + grch38_ac = grch38_ac[0] + if grch38_ac != params["chr"]: # params["chr"] is genomic accession + # Liftover to 38 + descr = await self.uta_db.get_chr_assembly(params["chr"]) + if descr is None: + return f"Unable to get chromosome and assembly for " \ + f"{params['chr']}" + + chromosome_number, assembly = descr + liftover_data = self.uta_db.get_liftover( + chromosome_number, params["pos"], Assembly.GRCH38) + if liftover_data is None: + return f"Position {params['pos']} does not exist on " \ + f"chromosome {chromosome_number}" + + params["pos"] = liftover_data[1] + params["chr"] = grch38_ac + + tx_exons = await self._structure_exons(params["transcript"], alt_ac=grch38_ac) + if not tx_exons: + return f"Unable to get exons for {params['transcript']}" + data = await self.uta_db.get_tx_exon_aln_v_data( + params["transcript"], params["pos"], params["pos"], + alt_ac=params["chr"], use_tx_pos=False) + if len(data) != 1: + return f"Must find exactly one row for genomic data, " \ + f"but found: {len(data)}" + + # Find exon number + data = data[0] + data_exons = data[2], data[3] + i = 1 + found_tx_exon = False + for exon in tx_exons: + if data_exons == exon: + found_tx_exon = True + break + i += 1 + if not found_tx_exon: + # Either first or last + i = 1 if data_exons == (0, tx_exons[0][1]) else i - 1 + params["exon"] = i + + strand_to_use = strand if strand is not None else data[7] + params["strand"] = strand_to_use + self._set_exon_offset(params, data[5], data[6], params["pos"], + is_start=is_start, strand=strand_to_use) + return None + + @staticmethod + 
def _set_exon_offset(params: Dict, start: int, end: int, pos: int, + is_start: bool, strand: int) -> None: + """Set `exon_offset` in params. + + :param params: Parameters for response + :param start: Start exon coord (can be transcript or genomic) + :param end: End exon coord (can be transcript or genomic) + :param pos: Position change (can be transcript or genomic) + :param is_start: `True` if `pos` is start position. `False` if `pos` is end + position + :param int strand: Strand + """ + if is_start: + if strand == -1: + params["exon_offset"] = end - pos + else: + params["exon_offset"] = pos - end + else: + if strand == -1: + params["exon_offset"] = start - pos + else: + params["exon_offset"] = pos - start + + async def _structure_exons( + self, transcript: str, alt_ac: Optional[str] = None + ) -> List[Tuple[int, int]]: + """Structure exons as list of tuples. + + :param transcript: Transcript accession + :param alt_ac: Genomic accession + :return: List of tuples containing transcript exon coordinates + """ + result = list() + tx_exons, _ = await self.uta_db.get_tx_exons(transcript, alt_ac=alt_ac) + if not tx_exons: + return result + for coords in tx_exons: + result.append((coords[0], coords[1])) + return result + + @staticmethod + def _get_exon_number(tx_exons: List, tx_pos: int) -> int: + """Find exon number. 
+ + :param tx_exons: List of exon coordinates + :param tx_pos: Transcript position change + :return: Exon number associated to transcript position change + """ + i = 1 + for coords in tx_exons: + if coords[0] <= tx_pos <= coords[1]: + break + i += 1 + return i diff --git a/cool_seq_tool/data_sources/mane_transcript.py b/cool_seq_tool/mappers/mane_transcript.py similarity index 86% rename from cool_seq_tool/data_sources/mane_transcript.py rename to cool_seq_tool/mappers/mane_transcript.py index b4d7f292..04520027 100644 --- a/cool_seq_tool/data_sources/mane_transcript.py +++ b/cool_seq_tool/mappers/mane_transcript.py @@ -13,20 +13,17 @@ import pandas as pd -from cool_seq_tool.schemas import AnnotationLayer, Assembly, MappedManeData, \ - ResidueMode, TranscriptPriorityLabel -from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, \ - MANETranscriptMappings, UTADatabase, GeneNormalizer -from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos +from cool_seq_tool.schemas import ( + AnnotationLayer, Assembly, ResidueMode, TranscriptPriorityLabel +) +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess +from cool_seq_tool.sources import ( + TranscriptMappings, MANETranscriptMappings, UTADatabase +) +from cool_seq_tool.utils import get_inter_residue_pos -logger = logging.getLogger("cool_seq_tool") - - -class MANETranscriptError(Exception): - """Custom exception for MANETranscript class""" - - pass +logger = logging.getLogger(__name__) class MANETranscript: @@ -35,24 +32,20 @@ class MANETranscript: def __init__(self, seqrepo_access: SeqRepoAccess, transcript_mappings: TranscriptMappings, mane_transcript_mappings: MANETranscriptMappings, - uta_db: UTADatabase, - gene_normalizer: GeneNormalizer) -> None: + uta_db: UTADatabase) -> None: """Initialize the MANETranscript class. 
- :param SeqRepoAccess seqrepo_access: Access to seqrepo queries - :param TranscriptMappings transcript_mappings: Access to transcript - accession mappings and conversions - :param MANETranscriptMappings mane_transcript_mappings: Access to - MANE Transcript accession mapping data - :param UTADatabase uta_db: UTADatabase instance to give access to query - UTA database - :param GeneNormalizer gene_normalizer: Access to Gene Normalizer + :param seqrepo_access: Access to seqrepo queries + :param transcript_mappings: Access to transcript accession mappings and + conversions + :param mane_transcript_mappings: Access to MANE Transcript accession mapping + data + :param uta_db: UTADatabase instance to give access to query UTA database """ self.seqrepo_access = seqrepo_access self.transcript_mappings = transcript_mappings self.mane_transcript_mappings = mane_transcript_mappings self.uta_db = uta_db - self.gene_normalizer = gene_normalizer @staticmethod def _get_reading_frame(pos: int) -> int: @@ -866,108 +859,3 @@ async def g_to_mane_c( refseq_c_ac=current_mane_data["RefSeq_nuc"], ensembl_c_ac=current_mane_data["Ensembl_nuc"], alt_ac=grch38["ac"] if grch38 else None) - - async def get_mapped_mane_data( - self, gene: str, assembly: Assembly, genomic_position: int, - residue_mode: ResidueMode = ResidueMode.INTER_RESIDUE - ) -> Optional[MappedManeData]: - """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, - will return mapped MANE data. - - :param str gene: Gene symbol or identifier - :param Assembly assembly: Assembly for the provided genomic position - :param int genomic_position: Position on the genomic reference sequence to find - MANE data for - :param ResidueMode residue_mode: Starting residue mode for `start_pos` - and `end_pos`. Will always return coordinates in inter-residue - :return: Mapped MANE or Longest Compatible Remaining data if found/compatible. 
- MANETranscriptError will be raised if unable to get required data for - retrieving mapped MANE data. - """ - hgnc_gene_data = self.gene_normalizer.get_hgnc_data(gene) - if not hgnc_gene_data: - raise MANETranscriptError(f"Unable to get HGNC data for gene: {gene}") - - gene = hgnc_gene_data["symbol"] - - mane_data = self.mane_transcript_mappings.get_gene_mane_data(gene) - if not mane_data: - raise MANETranscriptError(f"Unable to get MANE data for gene: {gene}") - - mane_data_len = len(mane_data) - - alt_ac = None - if hgnc_gene_data["locations"]: - chr = hgnc_gene_data["locations"][0].get("chr") or "" - alt_acs, _ = self.seqrepo_access.translate_identifier( - f"{assembly.value}:{chr}", "refseq" - ) - if alt_acs: - alt_ac = alt_acs[0].split(":")[1] - else: - raise MANETranscriptError(f"Unable to translate identifier for: " - f"{assembly}:{chr}") - - inter_residue_pos, _ = get_inter_residue_pos(genomic_position, residue_mode) - g_pos = inter_residue_pos[0] - - mane_transcripts = set() - for i in range(mane_data_len): - index = mane_data_len - i - 1 - current_mane_data = mane_data[index] - mane_transcripts |= set((current_mane_data["RefSeq_nuc"], - current_mane_data["Ensembl_nuc"])) - mane_c_ac = current_mane_data["RefSeq_nuc"] - - ac_query = mane_c_ac.split(".")[0] - tx_exon_aln_v_data = await self.uta_db.get_tx_exon_aln_v_data( - ac_query, g_pos, g_pos, alt_ac, False, True) - - if not tx_exon_aln_v_data: - continue - else: - len_of_aligned_data = len(tx_exon_aln_v_data) - if len_of_aligned_data == 1: - tx_exon_aln_v_data = tx_exon_aln_v_data[0] - else: - logger.debug(f"Found {len_of_aligned_data} records for aligned " - f"mapped MANE data for {ac_query}, {g_pos}, {alt_ac}") - - # Try checking for MANE match - filter_data = list(filter(lambda x: x[1] == mane_c_ac, - tx_exon_aln_v_data)) - if filter_data: - tx_exon_aln_v_data = filter_data[0] - else: - # Try checking for older versions of MANE - filter_data = list(filter(lambda x: x[1].startswith( - 
mane_c_ac.split(".")[0]), tx_exon_aln_v_data)) - if filter_data: - filter_data.sort(key=lambda x: x[1], reverse=True) - tx_exon_aln_v_data = filter_data[0] - return MappedManeData( - gene=gene, - refseq=current_mane_data["RefSeq_nuc"], - ensembl=current_mane_data["Ensembl_nuc"], - strand="-" if tx_exon_aln_v_data[7] == -1 else "+", - status="_".join(current_mane_data["MANE_status"].split()).lower(), - alt_ac=alt_ac, - assembly=assembly.value - ) - - lcr_data = await self.get_longest_compatible_transcript( - gene, g_pos, g_pos, AnnotationLayer.GENOMIC, - residue_mode=ResidueMode.INTER_RESIDUE, mane_transcripts=mane_transcripts, - alt_ac=alt_ac) - if lcr_data: - return MappedManeData( - gene=gene, - refseq=lcr_data["refseq"], - ensembl=lcr_data["ensembl"], - strand=lcr_data["strand"], - status=lcr_data["status"], - alt_ac=alt_ac, - assembly=assembly.value - ) - - return None diff --git a/cool_seq_tool/routers/__init__.py b/cool_seq_tool/routers/__init__.py index ec2cc781..bafbb9d3 100644 --- a/cool_seq_tool/routers/__init__.py +++ b/cool_seq_tool/routers/__init__.py @@ -1,7 +1,7 @@ """Module for routers""" from enum import Enum -from cool_seq_tool import CoolSeqTool +from cool_seq_tool.app import CoolSeqTool cool_seq_tool = CoolSeqTool() diff --git a/cool_seq_tool/routers/default.py b/cool_seq_tool/routers/default.py index c44d510a..6d42ad37 100644 --- a/cool_seq_tool/routers/default.py +++ b/cool_seq_tool/routers/default.py @@ -14,6 +14,7 @@ UNHANDLED_EXCEPTION_MSG from cool_seq_tool.schemas import GenomicDataResponse, GenomicRequestBody, \ TranscriptRequestBody +from cool_seq_tool.utils import service_meta logger = logging.getLogger("cool_seq_tool") @@ -37,14 +38,14 @@ async def genomic_to_transcript_exon_coordinates( Returns: GenomicDataResponse with data and warnings """ - request_body = request_body.dict() + request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) + 
genomic_data=None, warnings=list(), service_meta=service_meta()) try: response = \ - await cool_seq_tool.genomic_to_transcript_exon_coordinates(**request_body) + await cool_seq_tool.ex_g_coords_mapper.genomic_to_transcript_exon_coordinates(**request_body) # noqa: E501 except Exception as e: logger.error(f"genomic_to_transcript_exon_coordinates unhandled exception {str(e)}") # noqa: E501 response.warnings.append(UNHANDLED_EXCEPTION_MSG) @@ -68,13 +69,13 @@ async def transcript_to_genomic_coordinates( Returns: GenomicDataResponse with data and warnings """ - request_body = request_body.dict() + request_body = request_body.model_dump() response = GenomicDataResponse( - genomic_data=None, warnings=list(), service_meta=cool_seq_tool.service_meta()) + genomic_data=None, warnings=list(), service_meta=service_meta()) try: - response = await cool_seq_tool.transcript_to_genomic_coordinates(**request_body) + response = await cool_seq_tool.ex_g_coords_mapper.transcript_to_genomic_coordinates(**request_body) # noqa: E501 except Exception as e: logger.error(f"transcript_to_genomic_coordinates unhandled exception {str(e)}") response.warnings.append(UNHANDLED_EXCEPTION_MSG) @@ -105,7 +106,7 @@ async def get_sequence( """ _, path = tempfile.mkstemp(suffix=".fasta") try: - cool_seq_tool.get_fasta_file(sequence_id, Path(path)) + cool_seq_tool.seqrepo_access.get_fasta_file(sequence_id, Path(path)) except KeyError: raise HTTPException( status_code=404, diff --git a/cool_seq_tool/routers/mane.py b/cool_seq_tool/routers/mane.py index 366d06b7..73476aef 100644 --- a/cool_seq_tool/routers/mane.py +++ b/cool_seq_tool/routers/mane.py @@ -1,15 +1,14 @@ """Module containing routes related to MANE data""" import logging -from typing import List, Optional +from typing import Optional from fastapi import APIRouter from fastapi import Query from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, \ UNHANDLED_EXCEPTION_MSG, Tags -from cool_seq_tool.data_sources.mane_transcript 
import MANETranscriptError -from cool_seq_tool.schemas import AnnotationLayer, Assembly, ManeDataService, \ - MappedManeDataService, ResidueMode +from cool_seq_tool.schemas import AnnotationLayer, ManeDataService, ResidueMode +from cool_seq_tool.utils import service_meta logger = logging.getLogger("cool_seq_tool") @@ -76,55 +75,5 @@ async def get_mane_data( return ManeDataService( mane_data=mane_data, warnings=warnings, - service_meta=cool_seq_tool.service_meta() - ) - - -@router.get( - "/get_mapped_mane_data", - summary="Retrieve MANE Transcript mapped to a given assembly", - response_description=RESP_DESCR, - description="Return mapped MANE Transcript data to a given assembly", - response_model=MappedManeDataService, - tags=[Tags.MANE_TRANSCRIPT] -) -async def get_mapped_mane_data( - gene: str = Query(..., description="HGNC Symbol or Identifier"), - assembly: Assembly = Query(..., description="Genomic assembly to use"), - genomic_position: int = Query(..., description="Genomic position associated to the given gene and assembly"), # noqa: E501 - residue_mode: ResidueMode = Query(ResidueMode.INTER_RESIDUE, - description="Residue mode for `genomic_position`") -) -> MappedManeDataService: - """Get MANE data for gene, assembly, and position. If GRCh37 assembly is given, - will return mapped MANE data. - - :param str gene: HGNC symbol or identifier - :param Assembly assembly: Assembly for the provided genomic position - :param int genomic_position: Position on the genomic reference sequence to find - MANE data for - :param ResidueMode residue_mode: Starting residue mode for `start_pos` - and `end_pos`. 
Will always return coordinates in inter-residue - :return: Mapped MANE or Longest Compatible Remaining data - """ - warnings: List = list() - mapped_mane_data = None - try: - mapped_mane_data = await cool_seq_tool.mane_transcript.get_mapped_mane_data( - gene, assembly, genomic_position, residue_mode) - if not mapped_mane_data: - warnings.append(f"Unable to find mapped data for gene {gene} at position " - f"{genomic_position} ({residue_mode} coordinates) on " - f"assembly {assembly}") - except MANETranscriptError as e: - e = str(e) - logger.exception(e) - warnings.append(e) - except Exception as e: - logger.exception(f"get_mapped_mane_data unhandled exception {e}") - warnings.append(UNHANDLED_EXCEPTION_MSG) - - return MappedManeDataService( - mapped_mane_data=mapped_mane_data, - warnings=warnings, - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) diff --git a/cool_seq_tool/routers/mappings.py b/cool_seq_tool/routers/mappings.py index 00b340af..38c763fe 100644 --- a/cool_seq_tool/routers/mappings.py +++ b/cool_seq_tool/routers/mappings.py @@ -9,6 +9,8 @@ from cool_seq_tool.routers import cool_seq_tool, SERVICE_NAME, RESP_DESCR, Tags from cool_seq_tool.schemas import Assembly, ToGenomicService, ToCdnaService, \ ResidueMode +from cool_seq_tool.utils import service_meta + logger = logging.getLogger("cool_seq_tool") @@ -51,7 +53,7 @@ async def p_to_c( return ToCdnaService( c_data=c_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) @@ -100,7 +102,7 @@ async def c_to_g( return ToGenomicService( g_data=g_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) @@ -144,5 +146,5 @@ async def p_to_g( return ToGenomicService( g_data=g_data, warnings=[w] if w else [], - service_meta=cool_seq_tool.service_meta() + service_meta=service_meta() ) diff --git a/cool_seq_tool/schemas.py b/cool_seq_tool/schemas.py index c3d5ea73..7984f1c8 100644 --- 
a/cool_seq_tool/schemas.py +++ b/cool_seq_tool/schemas.py @@ -2,11 +2,16 @@ from datetime import datetime from enum import Enum import re -from typing import Literal, Optional, List, Tuple, Union, Dict, Any, Type +from typing import Literal, Optional, List, Tuple, Union -from pydantic import BaseModel, root_validator, validator -from pydantic.main import Extra -from pydantic.types import StrictStr, StrictInt +from pydantic import ( + BaseModel, + model_validator, + field_validator, + StrictStr, + StrictInt, + ConfigDict, +) from cool_seq_tool.version import __version__ @@ -14,9 +19,9 @@ class AnnotationLayer(str, Enum): """Create enum for supported annotation layers""" - PROTEIN = "p" - CDNA = "c" - GENOMIC = "g" + PROTEIN: Literal["p"] = "p" + CDNA: Literal["c"] = "c" + GENOMIC: Literal["g"] = "g" class Strand(str, Enum): @@ -48,14 +53,9 @@ class ResidueMode(str, Enum): INTER_RESIDUE = "inter-residue" -class BaseModelForbidExtra(BaseModel): +class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" - class Config: - """Class configs.""" - - extra = Extra.forbid - class GenomicRequestBody(BaseModelForbidExtra): """Define constraints for genomic to transcript exon coordinates request body""" @@ -68,34 +68,27 @@ class GenomicRequestBody(BaseModelForbidExtra): gene: Optional[StrictStr] = None residue_mode: ResidueMode = ResidueMode.RESIDUE - @root_validator(pre=False) + @model_validator(mode="after") def check_start_and_end(cls, values): """Check that at least one of {`start`, `end`} is set""" msg = "Must provide either `start` or `end`" - start, end = values.get("start"), values.get("end") + start, end = values.start, values.end assert start or end, msg return values - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["GenomicRequestBody"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - 
schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "chromosome": "NC_000001.11", "start": 154192135, "end": None, "strand": -1, "transcript": "NM_152263.3", "gene": "TPM3", - "residue_mode": "residue" + "residue_mode": "residue", } + } + ) class TranscriptRequestBody(BaseModelForbidExtra): @@ -108,26 +101,17 @@ class TranscriptRequestBody(BaseModelForbidExtra): exon_end: Optional[StrictInt] = None exon_end_offset: Optional[StrictInt] = 0 - @root_validator(pre=False) + @model_validator(mode="after") def check_exon_start_and_exon_end(cls, values): """Check that at least one of {`exon_start`, `exon_end`} is set""" msg = "Must provide either `exon_start` or `exon_end`" - exon_start, exon_end = values.get("exon_start"), values.get("exon_end") + exon_start, exon_end = values.exon_start, values.exon_end assert exon_start or exon_end, msg return values - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["TranscriptRequestBody"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "TPM3", "transcript": "NM_152263.3", "exon_start": 1, @@ -135,6 +119,8 @@ def schema_extra(schema: Dict[str, Any], "exon_end": None, "exon_end_offset": None, } + } + ) class TranscriptExonData(BaseModelForbidExtra): @@ -148,26 +134,19 @@ class TranscriptExonData(BaseModelForbidExtra): chr: StrictStr strand: StrictInt - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["TranscriptExonData"]) -> None: - """Configure OpenAPI schema.""" - if "title" in 
schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "chr": "NC_000001.11", "gene": "TPM3", "pos": 154192135, "exon": 1, "exon_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } + } + ) class GenomicData(BaseModelForbidExtra): @@ -184,7 +163,7 @@ class GenomicData(BaseModelForbidExtra): transcript: StrictStr strand: StrictInt - @root_validator(pre=True) + @model_validator(mode="after") def check_start_end(cls, values): """ Check that at least one of {`start`, `end`} is set. @@ -192,35 +171,26 @@ def check_start_end(cls, values): If not set, set corresponding offset to `None` """ msg = "Missing values for `start` or `end`" - start = values.get("start") - end = values.get("end") + start = values.start + end = values.end assert start or end, msg if start: msg = "Missing value `exon_start`" - assert values.get("exon_start"), msg + assert values.exon_start, msg else: - values["exon_start_offset"] = None + values.exon_start_offset = None if end: msg = "Missing value `exon_end`" - assert values.get("exon_end"), msg + assert values.exon_end, msg else: - values["exon_end_offset"] = None + values.exon_end_offset = None return values - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["GenomicData"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "TPM3", "chr": "NC_000001.11", "start": 154192135, @@ -230,8 +200,10 @@ def schema_extra(schema: Dict[str, Any], "exon_start_offset": 0, "exon_end_offset": None, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, } + } + ) 
class ServiceMeta(BaseModelForbidExtra): @@ -240,9 +212,11 @@ class ServiceMeta(BaseModelForbidExtra): name: Literal["cool_seq_tool"] = "cool_seq_tool" version: StrictStr response_datetime: datetime - url: Literal["https://github.com/GenomicMedLab/cool-seq-tool"] = "https://github.com/GenomicMedLab/cool-seq-tool" # noqa: E501 + url: Literal[ + "https://github.com/GenomicMedLab/cool-seq-tool" + ] = "https://github.com/GenomicMedLab/cool-seq-tool" - @validator("version") + @field_validator("version") def validate_version(cls, v): """Check version matches semantic versioning regex pattern. https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string @@ -251,23 +225,16 @@ def validate_version(cls, v): assert bool(re.match(version_regex, v)) return v - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ServiceMeta"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" + "url": "https://github.com/GenomicMedLab/cool-seq-tool", } + } + ) class TranscriptExonDataResponse(BaseModelForbidExtra): @@ -277,18 +244,9 @@ class TranscriptExonDataResponse(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["TranscriptExonDataResponse"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = 
{ + model_config = ConfigDict( + json_schema_extra={ + "example": { "transcript_exon_data": { "chr": "NC_000001.11", "gene": "TPM3", @@ -296,16 +254,18 @@ def schema_extra(schema: Dict[str, Any], "exon": 1, "exon_offset": 0, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class GenomicDataResponse(BaseModelForbidExtra): @@ -315,18 +275,9 @@ class GenomicDataResponse(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["GenomicDataResponse"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "genomic_data": { "gene": "TPM3", "chr": "NC_000001.11", @@ -337,16 +288,18 @@ def schema_extra(schema: Dict[str, Any], "exon_start_offset": 0, "exon_end_offset": None, "transcript": "NM_152263.3", - "strand": -1 + "strand": -1, }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class MappedManeData(BaseModel): @@ -360,26 +313,19 @@ class MappedManeData(BaseModel): alt_ac: StrictStr assembly: Assembly - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["MappedManeData"]) -> None: - 
"""Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "BRAF", "refseq": "NM_001374258.1", "ensembl": "ENST00000644969.2", "strand": "-", "status": "mane_plus_clinical", "alt_ac": "NC_000007.13", - "assembly": "GRCh37" + "assembly": "GRCh37", } + } + ) class MappedManeDataService(BaseModelForbidExtra): @@ -389,18 +335,9 @@ class MappedManeDataService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["MappedManeDataService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "mapped_mane_data": { "gene": "BRAF", "refseq": "NM_001374258.1", @@ -408,16 +345,18 @@ def schema_extra(schema: Dict[str, Any], "strand": "-", "status": "mane_plus_clinical", "alt_ac": "NC_000007.13", - "assembly": "GRCh37" + "assembly": "GRCh37", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class ManeData(BaseModel): @@ -430,25 +369,18 @@ class ManeData(BaseModel): strand: Strand status: TranscriptPriorityLabel - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ManeData"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", 
None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "gene": "BRAF", "refseq": "NP_004324.2", "ensembl": "ENSP00000493543.1", "pos": (598, 598), "strand": "-", - "status": "mane_select" + "status": "mane_select", } + } + ) class ManeDataService(BaseModelForbidExtra): @@ -458,34 +390,27 @@ class ManeDataService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ManeDataService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "mane_data": { "gene": "BRAF", "refseq": "NP_004324.2", "ensembl": "ENSP00000493543.1", "pos": (598, 598), "strand": "-", - "status": "mane_select" + "status": "mane_select", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) # ALIGNMENT MAPPER SERVICE SCHEMAS @@ -494,30 +419,23 @@ def schema_extra(schema: Dict[str, Any], class CdnaRepresentation(BaseModelForbidExtra): """Model response for cDNA representation""" - c_ac: str - c_start_pos: str - c_end_pos: str - cds_start: int - residue_mode = ResidueMode.INTER_RESIDUE.value - - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["CdnaRepresentation"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in 
schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + c_ac: StrictStr + c_start_pos: StrictInt + c_end_pos: StrictInt + cds_start: StrictInt + residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value + + model_config = ConfigDict( + json_schema_extra={ + "example": { "c_ac": "NM_004333.6", "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", } + } + ) class ToCdnaService(BaseModelForbidExtra): @@ -527,33 +445,26 @@ class ToCdnaService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ToCdnaService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "c_data": { "c_ac": "NM_004333.6", "c_start_pos": 1797, "c_end_pos": 1800, "cds_start": 226, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) class GenomicRepresentation(BaseModelForbidExtra): @@ -562,25 +473,18 @@ class GenomicRepresentation(BaseModelForbidExtra): g_ac: str g_start_pos: int g_end_pos: int - residue_mode = ResidueMode.INTER_RESIDUE.value - - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["CdnaRepresentation"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - 
schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + residue_mode: Literal[ResidueMode.INTER_RESIDUE] = ResidueMode.INTER_RESIDUE.value + + model_config = ConfigDict( + json_schema_extra={ + "example": { "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", } + } + ) class ToGenomicService(BaseModelForbidExtra): @@ -590,29 +494,22 @@ class ToGenomicService(BaseModelForbidExtra): warnings: List[StrictStr] = [] service_meta: ServiceMeta - class Config(BaseModelForbidExtra.Config): - """Configure model.""" - - @staticmethod - def schema_extra(schema: Dict[str, Any], - model: Type["ToGenomicService"]) -> None: - """Configure OpenAPI schema.""" - if "title" in schema.keys(): - schema.pop("title", None) - for prop in schema.get("properties", {}).values(): - prop.pop("title", None) - schema["example"] = { + model_config = ConfigDict( + json_schema_extra={ + "example": { "g_data": { "g_ac": "NC_000007.13", "g_start_pos": 140453134, "g_end_pos": 140453137, - "residue_mode": "inter-residue" + "residue_mode": "inter-residue", }, - "warnings": list(), + "warnings": [], "service_meta": { "name": "cool_seq_tool", "version": __version__, "response_datetime": datetime.now(), - "url": "https://github.com/GenomicMedLab/cool-seq-tool" - } + "url": "https://github.com/GenomicMedLab/cool-seq-tool", + }, } + } + ) diff --git a/cool_seq_tool/sources/__init__.py b/cool_seq_tool/sources/__init__.py new file mode 100644 index 00000000..c753f9a3 --- /dev/null +++ b/cool_seq_tool/sources/__init__.py @@ -0,0 +1,4 @@ +"""Module for providing basic acquisition/setup for the various resources""" +from .mane_transcript_mappings import MANETranscriptMappings +from .transcript_mappings import TranscriptMappings +from .uta_database import UTADatabase diff --git a/cool_seq_tool/data_sources/mane_transcript_mappings.py 
b/cool_seq_tool/sources/mane_transcript_mappings.py similarity index 98% rename from cool_seq_tool/data_sources/mane_transcript_mappings.py rename to cool_seq_tool/sources/mane_transcript_mappings.py index 7f9ad7b3..475e4e47 100644 --- a/cool_seq_tool/data_sources/mane_transcript_mappings.py +++ b/cool_seq_tool/sources/mane_transcript_mappings.py @@ -8,7 +8,7 @@ from cool_seq_tool.paths import MANE_SUMMARY_PATH -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) class MANETranscriptMappings: diff --git a/cool_seq_tool/data_sources/transcript_mappings.py b/cool_seq_tool/sources/transcript_mappings.py similarity index 100% rename from cool_seq_tool/data_sources/transcript_mappings.py rename to cool_seq_tool/sources/transcript_mappings.py diff --git a/cool_seq_tool/data_sources/uta_database.py b/cool_seq_tool/sources/uta_database.py similarity index 99% rename from cool_seq_tool/data_sources/uta_database.py rename to cool_seq_tool/sources/uta_database.py index f59b2f14..1a1b0633 100644 --- a/cool_seq_tool/data_sources/uta_database.py +++ b/cool_seq_tool/sources/uta_database.py @@ -27,7 +27,7 @@ UTA_DB_URL = environ.get("UTA_DB_URL", "postgresql://uta_admin:uta@localhost:5433/uta/uta_20210129") -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) class UTADatabase: diff --git a/cool_seq_tool/data_sources/residue_mode.py b/cool_seq_tool/utils.py similarity index 70% rename from cool_seq_tool/data_sources/residue_mode.py rename to cool_seq_tool/utils.py index 27cfea92..1e427f1c 100644 --- a/cool_seq_tool/data_sources/residue_mode.py +++ b/cool_seq_tool/utils.py @@ -1,11 +1,13 @@ -"""Module for converting positions to inter-residue coordinates""" +"""Module for common utilities used throughout the app""" import logging +from datetime import datetime from typing import Optional, Tuple -from cool_seq_tool.schemas import ResidueMode +from cool_seq_tool.schemas import ResidueMode, ServiceMeta +from 
cool_seq_tool.version import __version__ -logger = logging.getLogger("cool_seq_tool") +logger = logging.getLogger(__name__) def get_inter_residue_pos( @@ -36,3 +38,15 @@ def get_inter_residue_pos( logger.warning(msg) return None, msg return (start_pos, end_pos), None + + +@staticmethod +def service_meta() -> ServiceMeta: + """Return ServiceMeta for cool_seq_tool + + :return: ServiceMeta object + """ + return ServiceMeta( + version=__version__, + response_datetime=datetime.now() + ) diff --git a/cool_seq_tool/version.py b/cool_seq_tool/version.py index 44de9d69..c585b2e4 100644 --- a/cool_seq_tool/version.py +++ b/cool_seq_tool/version.py @@ -1 +1 @@ -__version__ = "0.1.14-dev1" +__version__ = "0.3.0-dev0" diff --git a/setup.cfg b/setup.cfg index 55ca5061..279b0142 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,7 +23,6 @@ install_requires = pydantic uvicorn fastapi - gene-normalizer >=0.1.34, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8 ga4gh.vrs [options.package_data] diff --git a/tests/conftest.py b/tests/conftest.py index b86e6a61..3b0d815a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,8 @@ import pytest +from cool_seq_tool.app import CoolSeqTool + @pytest.fixture(scope="session") def event_loop(request): @@ -10,3 +12,33 @@ def event_loop(request): loop = asyncio.get_event_loop_policy().new_event_loop() yield loop loop.close() + + +@pytest.fixture(scope="session") +def test_cool_seq_tool(): + """Create CoolSeqTool test fixture""" + return CoolSeqTool() + + +@pytest.fixture(scope="session") +def test_seqrepo_access(test_cool_seq_tool): + """Create SeqRepoAccess test fixture""" + return test_cool_seq_tool.seqrepo_access + + +@pytest.fixture(scope="session") +def test_db(test_cool_seq_tool): + """Create UTA Database test fixture""" + return test_cool_seq_tool.uta_db + + +@pytest.fixture(scope="session") +def test_transcript_mappings(test_cool_seq_tool): + """Create Transcript Mappings test fixture""" + 
return test_cool_seq_tool.transcript_mappings + + +@pytest.fixture(scope="session") +def test_mane_transcript_mappings(test_cool_seq_tool): + """Create MANE Transcript Mappings test fixture""" + return test_cool_seq_tool.mane_transcript_mappings diff --git a/tests/handlers/test_seqrepo_access.py b/tests/handlers/test_seqrepo_access.py new file mode 100644 index 00000000..f0ee65ff --- /dev/null +++ b/tests/handlers/test_seqrepo_access.py @@ -0,0 +1,305 @@ +"""Module for testing seqrepo access class""" +import pytest + + +def test_get_reference_sequence(test_seqrepo_access): + """Test that get_reference_sequence method works correctly""" + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600) + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 600) + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 601) + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence( + "NP_004324.2", 599, 600, residue_mode="inter-residue") + assert resp == ("V", None) + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 601, 600) + assert resp == ("", "Invalid inter-residue coordinates: start (600)" + " cannot be greater than end (599)") + + resp = test_seqrepo_access.get_reference_sequence("NP_0043241311412", 600) + assert resp == ("", "Accession, NP_0043241311412, not found in SeqRepo") + + resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 800) + assert resp == ("", "End inter-residue coordinate (799) " + "is out of index on NP_004324.2") + + resp = test_seqrepo_access.get_reference_sequence( + "NP_004324.2", 4654645645654, 1) + assert resp == ("", "Start inter-residue coordinate (4654645645653) is " + "out of index on NP_004324.2") + + resp = test_seqrepo_access.get_reference_sequence( + "NP_004324.2", 600, 4654645645654) + assert resp == ("", "End inter-residue coordinate (4654645645653) " + "is out of index 
on NP_004324.2") + + +def test_translate_identifier(test_seqrepo_access): + """Test that translate_identifier method works correctly""" + expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) + resp = test_seqrepo_access.translate_identifier( + "NM_152263.3", target_namespaces="ga4gh") + assert resp == expected + + resp = test_seqrepo_access.translate_identifier( + "refseq:NM_152263.3", target_namespaces="ga4gh") + assert resp == expected + + resp = test_seqrepo_access.translate_identifier("refseq:NM_152263.3") + assert len(resp[0]) > 0 + assert resp[1] is None + assert expected[0][0] in resp[0] + + resp = test_seqrepo_access.translate_identifier("GRCh38:2") + assert len(resp[0]) > 0 + assert resp[1] is None + assert "refseq:NC_000002.12" in resp[0] + + resp = test_seqrepo_access.translate_identifier("NC_000002.12") + assert len(resp[0]) > 0 + assert resp[1] is None + assert "refseq:NC_000002.12" in resp[0] + + resp = test_seqrepo_access.translate_identifier("refseq_152263.3") + assert resp == ([], "SeqRepo unable to get translated identifiers for" + " refseq_152263.3") + + +def test_aliases(test_seqrepo_access): + """Test that aliases method works correctly""" + expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) + resp = test_seqrepo_access.translate_alias("NM_152263.3") + assert len(resp[0]) > 0 + assert resp[1] is None + assert expected[0][0] in resp[0] + + resp = test_seqrepo_access.translate_alias("NC_000002.12") + assert len(resp[0]) > 0 + assert resp[1] is None + assert "GRCh38:2" in resp[0] + + resp = test_seqrepo_access.translate_alias("refseq_152263.3") + assert resp == ([], "SeqRepo could not translate alias refseq_152263.3") + + resp = test_seqrepo_access.translate_alias("GRCh38:2") + assert resp == ([], "SeqRepo could not translate alias GRCh38:2") + + +def test_chromosome_to_acs(test_seqrepo_access): + """Test that chromosome_to_acs method works correctly""" + resp = test_seqrepo_access.chromosome_to_acs("7") + assert resp == 
(["NC_000007.14", "NC_000007.13"], None) + + resp = test_seqrepo_access.chromosome_to_acs("X") + assert resp == (["NC_000023.11", "NC_000023.10"], None) + + resp = test_seqrepo_access.chromosome_to_acs("Y") + assert resp == (["NC_000024.10", "NC_000024.9"], None) + + resp = test_seqrepo_access.chromosome_to_acs("117") + assert resp == (None, "117 is not a valid chromosome") + + +def test_ac_to_chromosome(test_seqrepo_access): + """Test that ac_to_chromosome method works correctly""" + resp = test_seqrepo_access.ac_to_chromosome("NC_000007.13") + assert resp == ("7", None) + + resp = test_seqrepo_access.ac_to_chromosome("NC_000007.1323") + assert resp == (None, "Unable to get chromosome for NC_000007.1323") + + +def test_get_fasta_file(test_seqrepo_access, tmp_path): + """Test get_fasta_file method""" + tpm3 = tmp_path / "NM_002529.3.fasta" + test_seqrepo_access.get_fasta_file("NM_002529.3", tpm3) + tpm3_expected = """>refseq:NM_002529.3|ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA +TGCAGCTGGGAGCGCACAGACGGCTGCCCCGCCTGAGCGAGGCGGGCGCCGCCGCGATGC +TGCGAGGCGGACGGCGCGGGCAGCTTGGCTGGCACAGCTGGGCTGCGGGGCCGGGCAGCC +TGCTGGCTTGGCTGATACTGGCATCTGCGGGCGCCGCACCCTGCCCCGATGCCTGCTGCC +CCCACGGCTCCTCGGGACTGCGATGCACCCGGGATGGGGCCCTGGATAGCCTCCACCACC +TGCCCGGCGCAGAGAACCTGACTGAGCTCTACATCGAGAACCAGCAGCATCTGCAGCATC +TGGAGCTCCGTGATCTGAGGGGCCTGGGGGAGCTGAGAAACCTCACCATCGTGAAGAGTG +GTCTCCGTTTCGTGGCGCCAGATGCCTTCCATTTCACTCCTCGGCTCAGTCGCCTGAATC +TCTCCTTCAACGCTCTGGAGTCTCTCTCCTGGAAAACTGTGCAGGGCCTCTCCTTACAGG +AACTGGTCCTGTCGGGGAACCCTCTGCACTGTTCTTGTGCCCTGCGCTGGCTACAGCGCT +GGGAGGAGGAGGGACTGGGCGGAGTGCCTGAACAGAAGCTGCAGTGTCATGGGCAAGGGC +CCCTGGCCCACATGCCCAATGCCAGCTGTGGTGTGCCCACGCTGAAGGTCCAGGTGCCCA +ATGCCTCGGTGGATGTGGGGGACGACGTGCTGCTGCGGTGCCAGGTGGAGGGGCGGGGCC +TGGAGCAGGCCGGCTGGATCCTCACAGAGCTGGAGCAGTCAGCCACGGTGATGAAATCTG +GGGGTCTGCCATCCCTGGGGCTGACCCTGGCCAATGTCACCAGTGACCTCAACAGGAAGA +ACGTGACGTGCTGGGCAGAGAACGATGTGGGCCGGGCAGAGGTCTCTGTTCAGGTCAACG +TCTCCTTCCCGGCCAGTGTGCAGCTGCACACGGCGGTGGAGATGCACCACTGGTGCATCC 
+CCTTCTCTGTGGATGGGCAGCCGGCACCGTCTCTGCGCTGGCTCTTCAATGGCTCCGTGC +TCAATGAGACCAGCTTCATCTTCACTGAGTTCCTGGAGCCGGCAGCCAATGAGACCGTGC +GGCACGGGTGTCTGCGCCTCAACCAGCCCACCCACGTCAACAACGGCAACTACACGCTGC +TGGCTGCCAACCCCTTCGGCCAGGCCTCCGCCTCCATCATGGCTGCCTTCATGGACAACC +CTTTCGAGTTCAACCCCGAGGACCCCATCCCTGTCTCCTTCTCGCCGGTGGACACTAACA +GCACATCTGGAGACCCGGTGGAGAAGAAGGACGAAACACCTTTTGGGGTCTCGGTGGCTG +TGGGCCTGGCCGTCTTTGCCTGCCTCTTCCTTTCTACGCTGCTCCTTGTGCTCAACAAAT +GTGGACGGAGAAACAAGTTTGGGATCAACCGCCCGGCTGTGCTGGCTCCAGAGGATGGGC +TGGCCATGTCCCTGCATTTCATGACATTGGGTGGCAGCTCCCTGTCCCCCACCGAGGGCA +AAGGCTCTGGGCTCCAAGGCCACATCATCGAGAACCCACAATACTTCAGTGATGCCTGTG +TTCACCACATCAAGCGCCGGGACATCGTGCTCAAGTGGGAGCTGGGGGAGGGCGCCTTTG +GGAAGGTCTTCCTTGCTGAGTGCCACAACCTCCTGCCTGAGCAGGACAAGATGCTGGTGG +CTGTCAAGGCACTGAAGGAGGCGTCCGAGAGTGCTCGGCAGGACTTCCAGCGTGAGGCTG +AGCTGCTCACCATGCTGCAGCACCAGCACATCGTGCGCTTCTTCGGCGTCTGCACCGAGG +GCCGCCCCCTGCTCATGGTCTTTGAGTATATGCGGCACGGGGACCTCAACCGCTTCCTCC +GATCCCATGGACCTGATGCCAAGCTGCTGGCTGGTGGGGAGGATGTGGCTCCAGGCCCCC +TGGGTCTGGGGCAGCTGCTGGCCGTGGCTAGCCAGGTCGCTGCGGGGATGGTGTACCTGG +CGGGTCTGCATTTTGTGCACCGGGACCTGGCCACACGCAACTGTCTAGTGGGCCAGGGAC +TGGTGGTCAAGATTGGTGATTTTGGCATGAGCAGGGATATCTACAGCACCGACTATTACC +GTGTGGGAGGCCGCACCATGCTGCCCATTCGCTGGATGCCGCCCGAGAGCATCCTGTACC +GTAAGTTCACCACCGAGAGCGACGTGTGGAGCTTCGGCGTGGTGCTCTGGGAGATCTTCA +CCTACGGCAAGCAGCCCTGGTACCAGCTCTCCAACACGGAGGCAATCGACTGCATCACGC +AGGGACGTGAGTTGGAGCGGCCACGTGCCTGCCCACCAGAGGTCTACGCCATCATGCGGG +GCTGCTGGCAGCGGGAGCCCCAGCAACGCCACAGCATCAAGGATGTGCACGCCCGGCTGC +AAGCCCTGGCCCAGGCACCTCCTGTCTACCTGGATGTCCTGGGCTAGGGGGCCGGCCCAG +GGGCTGGGAGTGGTTAGCCGGAATACTGGGGCCTGCCCTCAGCATCCCCCATAGCTCCCA +GCAGCCCCAGGGTGATCTCAAAGTATCTAATTCACCCTCAGCATGTGGGAAGGGACAGGT +GGGGGCTGGGAGTAGAGGATGTTCCTGCTTCTCTAGGCAAGGTCCCGTCATAGCAATTAT +ATTTATTATCCCTTGAAAAAAAA""" + assert tpm3.read_text() == tpm3_expected + + limk2 = tmp_path / "ENST00000331728.9.fasta" + test_seqrepo_access.get_fasta_file("ENST00000331728.9", limk2) + limk2_expected = 
""">ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF +GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG +GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG +CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG +ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC +CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC +TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT +ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG +ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG +TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG +GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG +GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA +GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC +ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA +ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC +CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC +CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC +CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG +CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG +CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG +CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT +CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG +GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC +CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC +CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC +TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG +ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC +CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC +TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT +GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG +TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC 
+TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC +TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC +CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC +CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC +CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA +GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC +CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC +TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA +ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC +AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG +ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA +CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT +GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT +GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT +TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA +CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC +TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC +CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG +ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA +GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG +GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG +TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT +GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA +GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC +ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG +GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA +TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT +ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA +TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA +GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA +GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA +CTGGCAA""" # noqa: E501 + assert limk2.read_text() == limk2_expected + + 
limk2_seguid = tmp_path / "SEGUID_LIMK2.fasta" + test_seqrepo_access.get_fasta_file("ugqOFdlaed2cnxrGa7zngGMrLlY", limk2_seguid) + limk2_seguid_expected = """>gnl|ID|ugqOFdlaed2cnxrGa7zngGMrLlY|ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF +GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG +GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG +CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG +ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC +CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC +TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT +ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG +ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG +TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG +GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG +GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA +GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC +ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA +ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC +CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC +CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC +CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG +CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG +CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG +CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT +CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG +GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC +CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC +CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC +TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG +ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC +CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC 
+TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT +GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG +TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC +TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC +TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC +CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC +CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC +CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA +GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC +CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC +TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA +ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC +AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG +ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA +CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT +GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT +GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT +TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA +CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC +TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC +CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG +ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA +GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG +GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG +TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT +GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA +GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC +ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG +GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA +TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT +ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA +TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA 
+GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA +GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA +CTGGCAA""" # noqa: E501 + assert limk2_seguid.read_text() == limk2_seguid_expected + + invalid = tmp_path / "invalid.fasta" + with pytest.raises(KeyError): + test_seqrepo_access.get_fasta_file("NM_2529.3", invalid) diff --git a/tests/unit/test_alignment_mapper.py b/tests/mappers/test_alignment.py similarity index 98% rename from tests/unit/test_alignment_mapper.py rename to tests/mappers/test_alignment.py index 6d155b25..452e6e1e 100644 --- a/tests/unit/test_alignment_mapper.py +++ b/tests/mappers/test_alignment.py @@ -1,14 +1,13 @@ """Module for testing the Alignment Mapper class""" import pytest -from cool_seq_tool.data_sources import AlignmentMapper, TranscriptMappings, UTADatabase from cool_seq_tool.schemas import Assembly, ResidueMode @pytest.fixture(scope="module") -def test_alignment_mapper(test_seqrepo_access): +def test_alignment_mapper(test_cool_seq_tool): """Build AlignmentMapper test fixture""" - return AlignmentMapper(test_seqrepo_access, TranscriptMappings(), UTADatabase()) + return test_cool_seq_tool.alignment_mapper @pytest.fixture(scope="module") diff --git a/tests/unit/test_cool_seq_tool.py b/tests/mappers/test_exon_genomic_coords.py similarity index 54% rename from tests/unit/test_cool_seq_tool.py rename to tests/mappers/test_exon_genomic_coords.py index 5164e173..0ff60df6 100644 --- a/tests/unit/test_cool_seq_tool.py +++ b/tests/mappers/test_exon_genomic_coords.py @@ -5,16 +5,13 @@ import pytest -from cool_seq_tool import CoolSeqTool from cool_seq_tool.schemas import GenomicData, TranscriptExonData @pytest.fixture(scope="module") -async def test_cool_seq_tool(): - """Create a CoolSeqTool test fixture""" - test_cool_seq_tool = CoolSeqTool() - await test_cool_seq_tool.uta_db._create_genomic_table() - return test_cool_seq_tool +def test_egc_mapper(test_cool_seq_tool): + """Build mane ExonGenomicCoordsMapper test fixture.""" + 
return test_cool_seq_tool.ex_g_coords_mapper @pytest.fixture(scope="module") @@ -295,45 +292,45 @@ def transcript_exon_data_assertion_checks(actual, expected=None, @pytest.mark.asyncio -async def test__genomic_to_transcript(test_cool_seq_tool, tpm3_exon1, tpm3_exon8): +async def test__genomic_to_transcript(test_egc_mapper, tpm3_exon1, tpm3_exon8): """Test that _genomic_to_transcript_exon_coordinate method works correctly. """ - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154192135, strand=-1, transcript="NM_152263.3", gene="TPM3" ) transcript_exon_data_assertion_checks(resp, tpm3_exon1) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154192135, strand=-1, transcript="NM_152263.3" ) transcript_exon_data_assertion_checks(resp, tpm3_exon1) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154192135, transcript="NM_152263.3" ) transcript_exon_data_assertion_checks(resp, tpm3_exon1) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154170399, strand=-1, transcript="NM_152263.3", is_start=False ) transcript_exon_data_assertion_checks(resp, tpm3_exon8) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154170399, strand=-1, transcript="NM_152263.3", is_start=False ) transcript_exon_data_assertion_checks(resp, tpm3_exon8) - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( 1, 154170399, transcript="NM_152263.3", is_start=False ) 
transcript_exon_data_assertion_checks(resp, tpm3_exon8) @pytest.mark.asyncio -async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, +async def test_tpm3(test_egc_mapper, tpm3_exon1_exon8, tpm3_exon1_exon8_offset, tpm3_exon1_g, tpm3_exon8_g, tpm3_exon1_exon8_t_to_g): """Test TPM3 genomic_to_transcript_exon_coordinates and @@ -350,18 +347,18 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_t_to_g.start = 154192135 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) inputs["residue_mode"] = "INTER-RESIDUE" inputs["start"] = 154192135 inputs["end"] = 154170399 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_t_to_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # No strand @@ -370,9 +367,9 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, inputs["start"] = 154192136 inputs["end"] = 154170400 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, 
tpm3_exon1_exon8) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Offset, no strand @@ -382,17 +379,17 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_offset_t_to_g = copy.deepcopy(tpm3_exon1_exon8_offset) tpm3_exon1_exon8_offset_t_to_g.start = 154192132 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Offset, strand inputs["strand"] = -1 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_exon8_offset) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_offset_t_to_g) # Test only setting start @@ -407,9 +404,9 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_t_to_g.start = 154192135 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await 
test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon1_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) # Test only setting end @@ -423,14 +420,14 @@ async def test_tpm3(test_cool_seq_tool, tpm3_exon1_exon8, tpm3_exon1_exon8_t_to_g = copy.deepcopy(tpm3_exon8_g) g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, tpm3_exon8_g) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, tpm3_exon1_exon8_t_to_g) @pytest.mark.asyncio -async def test_braf(test_cool_seq_tool, mane_braf): +async def test_braf(test_egc_mapper, mane_braf): """Test BRAF genomic_to_transcript_exon_coordinates and transcript_to_genomic_coordinates. 
""" @@ -443,23 +440,23 @@ async def test_braf(test_cool_seq_tool, mane_braf): } # MANE g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_braf) del inputs["strand"] g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_braf) mane_braf_t_to_g = copy.deepcopy(mane_braf) t_to_g_resp = \ - await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 mane_braf_t_to_g.start = 140808062 genomic_data_assertion_checks(t_to_g_resp, mane_braf_t_to_g) @pytest.mark.asyncio -async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon11): +async def test_wee1(test_egc_mapper, wee1_exon2_exon11, mane_wee1_exon2_exon11): """Test WEE1 genomic_to_transcript_exon_coordinates and transcript_to_genomic_coordinates. 
""" @@ -473,17 +470,17 @@ async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon1 wee1_exon2_exon11_t_to_g = copy.deepcopy(wee1_exon2_exon11) wee1_exon2_exon11_t_to_g.start = 9576092 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) inputs["gene"] = "wee1" del inputs["strand"] g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, wee1_exon2_exon11_t_to_g) # MANE @@ -491,57 +488,57 @@ async def test_wee1(test_cool_seq_tool, wee1_exon2_exon11, mane_wee1_exon2_exon1 mane_wee1_exon2_exon11_t_to_g = copy.deepcopy(mane_wee1_exon2_exon11) mane_wee1_exon2_exon11_t_to_g.start = 9576092 g_to_t_resp = \ - await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) genomic_data_assertion_checks(g_to_t_resp, mane_wee1_exon2_exon11) - t_to_g_resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.dict()) # noqa: E501 + t_to_g_resp = await 
test_egc_mapper.transcript_to_genomic_coordinates(**g_to_t_resp.genomic_data.model_dump()) # noqa: E501 genomic_data_assertion_checks(t_to_g_resp, mane_wee1_exon2_exon11_t_to_g) @pytest.mark.asyncio -async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g, +async def test_transcript_to_genomic(test_egc_mapper, tpm3_exon1_exon8_t_to_g, tpm3_exon1_t_to_g, tpm3_exon8_t_to_g, ntrk1_exon10_exon17): """Test that transcript_to_genomic_coordinates works correctly.""" # TPM3 - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, transcript="NM_152263.3") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=None, transcript="NM_152263.3") genomic_data_assertion_checks(resp, tpm3_exon1_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, transcript="NM_152263.3 ") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, gene="TPM3", transcript="NM_152263.3") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, gene=" TPM3 ", transcript=" NM_152263.3 ") genomic_data_assertion_checks(resp, tpm3_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=8, gene="tpm3", transcript="NM_152263.3") genomic_data_assertion_checks(resp, 
tpm3_exon8_t_to_g) expected = copy.deepcopy(tpm3_exon1_exon8_t_to_g) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, exon_end_offset=-5, transcript="NM_152263.3") expected.exon_end = 8 expected.exon_end_offset = -5 expected.end = 154170404 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, exon_end_offset=5, transcript="NM_152263.3") expected.exon_end_offset = 5 expected.end = 154170394 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=3, exon_end=8, exon_start_offset=3, exon_end_offset=5, transcript="NM_152263.3") expected.exon_start = 3 @@ -549,7 +546,7 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g expected.start = 154176245 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=3, exon_end=8, exon_start_offset=-3, exon_end_offset=5, transcript="NM_152263.3") expected.exon_start_offset = -3 @@ -557,19 +554,19 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g genomic_data_assertion_checks(resp, expected) # NTRK1 - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, transcript="NM_002529.3") genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3") 
genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, gene="NTRK1", transcript="NM_002529.3") genomic_data_assertion_checks(resp, ntrk1_exon10_exon17) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, exon_start_offset=3, transcript="NM_002529.3") expected = copy.deepcopy(ntrk1_exon10_exon17) @@ -577,7 +574,7 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g expected.start = 156874629 genomic_data_assertion_checks(resp, expected) - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=10, exon_end=17, exon_start_offset=-3, transcript="NM_002529.3") expected.exon_start_offset = -3 @@ -586,14 +583,14 @@ async def test_transcript_to_genomic(test_cool_seq_tool, tpm3_exon1_exon8_t_to_g @pytest.mark.asyncio -async def test_valid_inputs(test_cool_seq_tool): +async def test_valid_inputs(test_egc_mapper): """Test that valid inputs don"t return any errors""" inputs = { "gene": "TPM3", "chromosome": "NC_000001.11", "start": 154171413 } - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 assert resp.genomic_data inputs = { @@ -601,22 +598,22 @@ async def test_valid_inputs(test_cool_seq_tool): "chromosome": "NC_000011.9", "end": 9609996 } - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 assert resp.genomic_data inputs["chromosome"] = "11" - resp = await 
test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) # noqa: E501 assert resp.genomic_data inputs = { "transcript": "NM_003390.3", "exon_start": 2 } - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**inputs) + resp = await test_egc_mapper.transcript_to_genomic_coordinates(**inputs) assert resp.genomic_data inputs["gene"] = "WEE1" - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates(**inputs) + resp = await test_egc_mapper.transcript_to_genomic_coordinates(**inputs) assert resp.genomic_data # Test X/Y chromosome bug @@ -628,32 +625,32 @@ async def test_valid_inputs(test_cool_seq_tool): "gene": "GDI1", "residue_mode": "inter-residue" } - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates(**inputs) + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates(**inputs) assert resp.genomic_data - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( gene="PDGFRB", transcript="NM_002609.4", exon_start=11, exon_end=23) assert resp.genomic_data @pytest.mark.asyncio -async def test_invalid(test_cool_seq_tool): +async def test_invalid(test_egc_mapper): """Test that invalid queries return `None`.""" - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( transcript="NM_152263 3", start=154192135, end=154170399, chromosome="NC_000001.11" ) assert resp.warnings == ["Unable to get exons for NM_152263 3"] # start and end not given - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.11", start=None, end=None, strand=-1, transcript="NM_152263.3", gene="TPM3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must 
provide either `start` or `end`"] # Invalid gene - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.11", start=154192135, end=154170399, strand=-1, transcript="NM_152263.3", gene="dummy gene") genomic_data_assertion_checks(resp, is_valid=False) @@ -663,14 +660,14 @@ async def test_invalid(test_cool_seq_tool): "and on gene DUMMY GENE"] # Invalid chromosome - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.200", start=154192135, end=154170399, strand=-1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Invalid chromosome: NC_000001.200"] # Invalid coordinates - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( "NC_000001.11", start=9999999999999, end=9999999999999, strand=-1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) @@ -679,7 +676,7 @@ async def test_invalid(test_cool_seq_tool): "coordinate 9999999999998 is mapped between an exon's start and end " "coordinates on the negative strand"] - resp = await test_cool_seq_tool.genomic_to_transcript_exon_coordinates( + resp = await test_egc_mapper.genomic_to_transcript_exon_coordinates( chromosome="1", start=154170400, strand=-1, transcript="NM_002529.3" ) genomic_data_assertion_checks(resp, is_valid=False) @@ -688,7 +685,7 @@ async def test_invalid(test_cool_seq_tool): ] # Strand does not match - resp = await test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154192135, strand=1, transcript="NM_152263.3", gene="TPM3" ) @@ -700,44 +697,44 @@ async def test_invalid(test_cool_seq_tool): ] # Must supply either gene or transcript - resp = await 
test_cool_seq_tool._genomic_to_transcript_exon_coordinate( + resp = await test_egc_mapper._genomic_to_transcript_exon_coordinate( "NC_000001.11", 154192135, strand=1 ) transcript_exon_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide either `gene` or `transcript`"] # Exon 22 does not exist - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=22, transcript="NM_152263.3", ) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon 22 does not exist on NM_152263.3"] # Start > End - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=8, exon_end=1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Start exon 8 is greater than end exon 1"] # Transcript DNE - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=7, exon_end=None, transcript="NM_12345.6") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Unable to get exons for NM_12345.6"] # Index error for invalid exon - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=-1, exon_end=0, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon -1 does not exist on NM_152263.3"] # Cant supply 0 based exons - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=0, exon_end=1, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Exon 0 does not exist on NM_152263.3"] # Gene that does not match transcript - resp = await 
test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, gene="NTKR1", transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == [ @@ -746,210 +743,19 @@ async def test_invalid(test_cool_seq_tool): "NTKR1"] # No transcript given - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, gene="NTKR1", transcript=None) genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide `transcript`"] # No transcript given - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=1, exon_end=8, gene="NTKR1", transcript="") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide `transcript`"] # No exons given - resp = await test_cool_seq_tool.transcript_to_genomic_coordinates( + resp = await test_egc_mapper.transcript_to_genomic_coordinates( exon_start=None, exon_end=None, transcript="NM_152263.3") genomic_data_assertion_checks(resp, is_valid=False) assert resp.warnings == ["Must provide either `exon_start` or `exon_end`"] - - -def test_get_fasta_file(test_cool_seq_tool, tmp_path): - """Test get_fasta_file method""" - tpm3 = tmp_path / "NM_002529.3.fasta" - test_cool_seq_tool.get_fasta_file("NM_002529.3", tpm3) - tpm3_expected = """>refseq:NM_002529.3|ga4gh:SQ.RSkww1aYmsMiWbNdNnOTnVDAM3ZWp1uA -TGCAGCTGGGAGCGCACAGACGGCTGCCCCGCCTGAGCGAGGCGGGCGCCGCCGCGATGC -TGCGAGGCGGACGGCGCGGGCAGCTTGGCTGGCACAGCTGGGCTGCGGGGCCGGGCAGCC -TGCTGGCTTGGCTGATACTGGCATCTGCGGGCGCCGCACCCTGCCCCGATGCCTGCTGCC -CCCACGGCTCCTCGGGACTGCGATGCACCCGGGATGGGGCCCTGGATAGCCTCCACCACC -TGCCCGGCGCAGAGAACCTGACTGAGCTCTACATCGAGAACCAGCAGCATCTGCAGCATC -TGGAGCTCCGTGATCTGAGGGGCCTGGGGGAGCTGAGAAACCTCACCATCGTGAAGAGTG 
-GTCTCCGTTTCGTGGCGCCAGATGCCTTCCATTTCACTCCTCGGCTCAGTCGCCTGAATC -TCTCCTTCAACGCTCTGGAGTCTCTCTCCTGGAAAACTGTGCAGGGCCTCTCCTTACAGG -AACTGGTCCTGTCGGGGAACCCTCTGCACTGTTCTTGTGCCCTGCGCTGGCTACAGCGCT -GGGAGGAGGAGGGACTGGGCGGAGTGCCTGAACAGAAGCTGCAGTGTCATGGGCAAGGGC -CCCTGGCCCACATGCCCAATGCCAGCTGTGGTGTGCCCACGCTGAAGGTCCAGGTGCCCA -ATGCCTCGGTGGATGTGGGGGACGACGTGCTGCTGCGGTGCCAGGTGGAGGGGCGGGGCC -TGGAGCAGGCCGGCTGGATCCTCACAGAGCTGGAGCAGTCAGCCACGGTGATGAAATCTG -GGGGTCTGCCATCCCTGGGGCTGACCCTGGCCAATGTCACCAGTGACCTCAACAGGAAGA -ACGTGACGTGCTGGGCAGAGAACGATGTGGGCCGGGCAGAGGTCTCTGTTCAGGTCAACG -TCTCCTTCCCGGCCAGTGTGCAGCTGCACACGGCGGTGGAGATGCACCACTGGTGCATCC -CCTTCTCTGTGGATGGGCAGCCGGCACCGTCTCTGCGCTGGCTCTTCAATGGCTCCGTGC -TCAATGAGACCAGCTTCATCTTCACTGAGTTCCTGGAGCCGGCAGCCAATGAGACCGTGC -GGCACGGGTGTCTGCGCCTCAACCAGCCCACCCACGTCAACAACGGCAACTACACGCTGC -TGGCTGCCAACCCCTTCGGCCAGGCCTCCGCCTCCATCATGGCTGCCTTCATGGACAACC -CTTTCGAGTTCAACCCCGAGGACCCCATCCCTGTCTCCTTCTCGCCGGTGGACACTAACA -GCACATCTGGAGACCCGGTGGAGAAGAAGGACGAAACACCTTTTGGGGTCTCGGTGGCTG -TGGGCCTGGCCGTCTTTGCCTGCCTCTTCCTTTCTACGCTGCTCCTTGTGCTCAACAAAT -GTGGACGGAGAAACAAGTTTGGGATCAACCGCCCGGCTGTGCTGGCTCCAGAGGATGGGC -TGGCCATGTCCCTGCATTTCATGACATTGGGTGGCAGCTCCCTGTCCCCCACCGAGGGCA -AAGGCTCTGGGCTCCAAGGCCACATCATCGAGAACCCACAATACTTCAGTGATGCCTGTG -TTCACCACATCAAGCGCCGGGACATCGTGCTCAAGTGGGAGCTGGGGGAGGGCGCCTTTG -GGAAGGTCTTCCTTGCTGAGTGCCACAACCTCCTGCCTGAGCAGGACAAGATGCTGGTGG -CTGTCAAGGCACTGAAGGAGGCGTCCGAGAGTGCTCGGCAGGACTTCCAGCGTGAGGCTG -AGCTGCTCACCATGCTGCAGCACCAGCACATCGTGCGCTTCTTCGGCGTCTGCACCGAGG -GCCGCCCCCTGCTCATGGTCTTTGAGTATATGCGGCACGGGGACCTCAACCGCTTCCTCC -GATCCCATGGACCTGATGCCAAGCTGCTGGCTGGTGGGGAGGATGTGGCTCCAGGCCCCC -TGGGTCTGGGGCAGCTGCTGGCCGTGGCTAGCCAGGTCGCTGCGGGGATGGTGTACCTGG -CGGGTCTGCATTTTGTGCACCGGGACCTGGCCACACGCAACTGTCTAGTGGGCCAGGGAC -TGGTGGTCAAGATTGGTGATTTTGGCATGAGCAGGGATATCTACAGCACCGACTATTACC -GTGTGGGAGGCCGCACCATGCTGCCCATTCGCTGGATGCCGCCCGAGAGCATCCTGTACC -GTAAGTTCACCACCGAGAGCGACGTGTGGAGCTTCGGCGTGGTGCTCTGGGAGATCTTCA -CCTACGGCAAGCAGCCCTGGTACCAGCTCTCCAACACGGAGGCAATCGACTGCATCACGC 
-AGGGACGTGAGTTGGAGCGGCCACGTGCCTGCCCACCAGAGGTCTACGCCATCATGCGGG -GCTGCTGGCAGCGGGAGCCCCAGCAACGCCACAGCATCAAGGATGTGCACGCCCGGCTGC -AAGCCCTGGCCCAGGCACCTCCTGTCTACCTGGATGTCCTGGGCTAGGGGGCCGGCCCAG -GGGCTGGGAGTGGTTAGCCGGAATACTGGGGCCTGCCCTCAGCATCCCCCATAGCTCCCA -GCAGCCCCAGGGTGATCTCAAAGTATCTAATTCACCCTCAGCATGTGGGAAGGGACAGGT -GGGGGCTGGGAGTAGAGGATGTTCCTGCTTCTCTAGGCAAGGTCCCGTCATAGCAATTAT -ATTTATTATCCCTTGAAAAAAAA""" - assert tpm3.read_text() == tpm3_expected - - limk2 = tmp_path / "ENST00000331728.9.fasta" - test_cool_seq_tool.get_fasta_file("ENST00000331728.9", limk2) - limk2_expected = """>ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF -GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG -GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG -CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG -ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC -CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC -TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT -ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG -ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG -TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG -GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG -GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA -GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC -ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA -ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC -CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC -CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC -CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG -CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG -CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG -CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT -CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG 
-GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC -CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC -CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC -TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG -ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC -CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC -TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT -GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG -TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC -TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC -TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC -CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC -CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC -CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA -GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC -CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC -TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA -ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC -AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG -ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA -CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT -GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT -GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT -TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA -CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC -TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC -CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG -ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA -GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG -GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG -TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT -GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA 
-GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC -ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG -GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA -TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT -ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA -TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA -GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA -GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA -CTGGCAA""" # noqa: E501 - assert limk2.read_text() == limk2_expected - - limk2_seguid = tmp_path / "SEGUID_LIMK2.fasta" - test_cool_seq_tool.get_fasta_file("ugqOFdlaed2cnxrGa7zngGMrLlY", limk2_seguid) - limk2_seguid_expected = """>gnl|ID|ugqOFdlaed2cnxrGa7zngGMrLlY|ensembl:ENST00000331728.9|refseq:NM_005569.4|ga4gh:SQ.7_mlQyDN-uWH0RlxTQFvFEv6ykd2D-xF -GTCTTCCCGCGCCTGAGGCGGCGGCGGCAGGAGCTGAGGGGAGTTGTAGGGAACTGAGGG -GAGCTGCTGTGTCCCCCGCCTCCTCCTCCCCATTTCCGCGCTCCCGGGACCATGTCCGCG -CTGGCGGGTGAAGATGTCTGGAGGTGTCCAGGCTGTGGGGACCACATTGCTCCAAGCCAG -ATATGGTACAGGACTGTCAACGAAACCTGGCACGGCTCTTGCTTCCGGTGTTCAGAATGC -CAGGATTCCCTCACCAACTGGTACTATGAGAAGGATGGGAAGCTCTACTGCCCCAAGGAC -TACTGGGGGAAGTTTGGGGAGTTCTGTCATGGGTGCTCCCTGCTGATGACAGGGCCTTTT -ATGGTGGCTGGGGAGTTCAAGTACCACCCAGAGTGCTTTGCCTGTATGAGCTGCAAGGTG -ATCATTGAGGATGGGGATGCATATGCACTGGTGCAGCATGCCACCCTCTACTGTGGGAAG -TGCCACAATGAGGTGGTGCTGGCACCCATGTTTGAGAGACTCTCCACAGAGTCTGTTCAG -GAGCAGCTGCCCTACTCTGTCACGCTCATCTCCATGCCGGCCACCACTGAAGGCAGGCGG -GGCTTCTCCGTGTCCGTGGAGAGTGCCTGCTCCAACTACGCCACCACTGTGCAAGTGAAA -GAGGTCAACCGGATGCACATCAGTCCCAACAATCGAAACGCCATCCACCCTGGGGACCGC -ATCCTGGAGATCAATGGGACCCCCGTCCGCACACTTCGAGTGGAGGAGGTGGAGGATGCA -ATTAGCCAGACGAGCCAGACACTTCAGCTGTTGATTGAACATGACCCCGTCTCCCAACGC -CTGGACCAGCTGCGGCTGGAGGCCCGGCTCGCTCCTCACATGCAGAATGCCGGACACCCC -CACGCCCTCAGCACCCTGGACACCAAGGAGAATCTGGAGGGGACACTGAGGAGACGTTCC -CTAAGGCGCAGTAACAGTATCTCCAAGTCCCCTGGCCCCAGCTCCCCAAAGGAGCCCCTG -CTGTTCAGCCGTGACATCAGCCGCTCAGAATCCCTTCGTTGTTCCAGCAGCTATTCACAG 
-CAGATCTTCCGGCCCTGTGACCTAATCCATGGGGAGGTCCTGGGGAAGGGCTTCTTTGGG -CAGGCTATCAAGGTGACACACAAAGCCACGGGCAAAGTGATGGTCATGAAAGAGTTAATT -CGATGTGATGAGGAGACCCAGAAAACTTTTCTGACTGAGGTGAAAGTGATGCGCAGCCTG -GACCACCCCAATGTGCTCAAGTTCATTGGTGTGCTGTACAAGGATAAGAAGCTGAACCTC -CTGACAGAGTACATTGAGGGGGGCACACTGAAGGACTTTCTGCGCAGTATGGATCCGTTC -CCCTGGCAGCAGAAGGTCAGGTTTGCCAAAGGAATCGCCTCCGGAATGGCCTATTTGCAC -TCTATGTGCATCATCCACCGGGATCTGAACTCGCACAACTGCCTCATCAAGTTGGACAAG -ACTGTGGTGGTGGCAGACTTTGGGCTGTCACGGCTCATAGTGGAAGAGAGGAAAAGGGCC -CCCATGGAGAAGGCCACCACCAAGAAACGCACCTTGCGCAAGAACGACCGCAAGAAGCGC -TACACGGTGGTGGGAAACCCCTACTGGATGGCCCCTGAGATGCTGAACGGAAAGAGCTAT -GATGAGACGGTGGATATCTTCTCCTTTGGGATCGTTCTCTGTGAGATCATTGGGCAGGTG -TATGCAGATCCTGACTGCCTTCCCCGAACACTGGACTTTGGCCTCAACGTGAAGCTTTTC -TGGGAGAAGTTTGTTCCCACAGATTGTCCCCCGGCCTTCTTCCCGCTGGCCGCCATCTGC -TGCAGACTGGAGCCTGAGAGCAGACCAGCATTCTCGAAATTGGAGGACTCCTTTGAGGCC -CTCTCCCTGTACCTGGGGGAGCTGGGCATCCCGCTGCCTGCAGAGCTGGAGGAGTTGGAC -CACACTGTGAGCATGCAGTACGGCCTGACCCGGGACTCACCTCCCTAGCCCTGGCCCAGC -CCCCTGCAGGGGGGTGTTCTACAGCCAGCATTGCCCCTCTGTGCCCCATTCCTGCTGTGA -GCAGGGCCGTCCGGGCTTCCTGTGGATTGGCGGAATGTTTAGAAGCAGAACAAGCCATTC -CTATTACCTCCCCAGGAGGCAAGTGGGCGCAGCACCAGGGAAATGTATCTCCACAGGTTC -TGGGGCCTAGTTACTGTCTGTAAATCCAATACTTGCCTGAAAGCTGTGAAGAAGAAAAAA -ACCCCTGGCCTTTGGGCCAGGAGGAATCTGTTACTCGAATCCACCCAGGAACTCCCTGGC -AGTGGATTGTGGGAGGCTCTTGCTTACACTAATCAGCGTGACCTGGACCTGCTGGGCAGG -ATCCCAGGGTGAACCTGCCTGTGAACTCTGAAGTCACTAGTCCAGCTGGGTGCAGGAGGA -CTTCAAGTGTGTGGACGAAAGAAAGACTGATGGCTCAAAGGGTGTGAAAAAGTCAGTGAT -GCTCCCCCTTTCTACTCCAGATCCTGTCCTTCCTGGAGCAAGGTTGAGGGAGTAGGTTTT -GAAGAGTCCCTTAATATGTGGTGGAACAGGCCAGGAGTTAGAGAAAGGGCTGGCTTCTGT -TTACCTGCTCACTGGCTCTAGCCAGCCCAGGGACCACATCAATGTGAGAGGAAGCCTCCA -CCTCATGTTTTCAAACTTAATACTGGAGACTGGCTGAGAACTTACGGACAACATCCTTTC -TGTCTGAAACAAACAGTCACAAGCAAAGGAAGAGGCTGGGGGACTAGAAAGAGGCCCTGC -CCTCTAGAAAGCTCAGATCTTGGCTTCTGTTACTCATACTCGGGTGGGCTCCTTAGTCAG -ATGCCTAAAACATTTTGCCTAAAGCTCGATGGGTTCTGGAGGACAGTGTGGCTTGTCACA -GGCCTAGAGTCTGAGGGAGGGGAGTGGGAGTCTCAGCAATCTCTTGGTCTTGGCTTCATG 
-GCAACCACTGCTCACCCTTCAACATGCCTGGTTTAGGCAGCAGCTTGGGCTGGGAAGAGG -TGGTGGCAGAGTCTCAAAGCTGAGATGCTGAGAGAGATAGCTCCCTGAGCTGGGCCATCT -GACTTCTACCTCCCATGTTTGCTCTCCCAACTCATTAGCTCCTGGGCAGCATCCTCCTGA -GCCACATGTGCAGGTACTGGAAAACCTCCATCTTGGCTCCCAGAGCTCTAGGAACTCTTC -ATCACAACTAGATTTGCCTCTTCTAAGTGTCTATGAGCTTGCACCATATTTAATAAATTG -GGAATGGGTTTGGGGTATTAATGCAATGTGTGGTGGTTGTATTGGAGCAGGGGGAATTGA -TAAAGGAGAGTGGTTGCTGTTAATATTATCTTATCTATTGGGTGGTATGTGAAATATTGT -ACATAGACCTGATGAGTTGTGGGACCAGATGTCATCTCTGGTCAGAGTTTACTTGCTATA -TAGACTGTACTTATGTGTGAAGTTTGCAAGCTTGCTTTAGGGCTGAGCCCTGGACTCCCA -GCAGCAGCACAGTTCAGCATTGTGTGGCTGGTTGTTTCCTGGCTGTCCCCAGCAAGTGTA -GGAGTGGTGGGCCTGAACTGGGCCATTGATCAGACTAAATAAATTAAGCAGTTAACATAA -CTGGCAA""" # noqa: E501 - assert limk2_seguid.read_text() == limk2_seguid_expected - - invalid = tmp_path / "invalid.fasta" - with pytest.raises(KeyError): - test_cool_seq_tool.get_fasta_file("NM_2529.3", invalid) diff --git a/tests/unit/test_mane_transcript.py b/tests/mappers/test_mane_transcript.py similarity index 88% rename from tests/unit/test_mane_transcript.py rename to tests/mappers/test_mane_transcript.py index 85e1939d..78263b19 100644 --- a/tests/unit/test_mane_transcript.py +++ b/tests/mappers/test_mane_transcript.py @@ -5,17 +5,14 @@ from mock import patch import pandas as pd -from cool_seq_tool.data_sources import MANETranscript, MANETranscriptMappings, \ - SeqRepoAccess, TranscriptMappings, UTADatabase, GeneNormalizer -from cool_seq_tool.data_sources.mane_transcript import MANETranscriptError -from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode +from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess +from cool_seq_tool.schemas import AnnotationLayer @pytest.fixture(scope="module") -def test_mane_transcript(test_seqrepo_access): +def test_mane_transcript(test_cool_seq_tool): """Build mane transcript test fixture.""" - return MANETranscript(test_seqrepo_access, TranscriptMappings(), - MANETranscriptMappings(), UTADatabase(), GeneNormalizer()) + return 
test_cool_seq_tool.mane_transcript @pytest.fixture(scope="module") @@ -567,69 +564,6 @@ async def test_g_to_mane_c(test_mane_transcript, egfr_l858r_mane_c, } -@pytest.mark.asyncio -async def test_get_mapped_mane_data(test_mane_transcript): - """Test that get_mapped_mane_data works correctly""" - resp = await test_mane_transcript.get_mapped_mane_data( - "braf", Assembly.GRCH38, 140785808, ResidueMode.INTER_RESIDUE) - assert resp.dict() == { - "gene": "BRAF", - "refseq": "NM_001374258.1", - "ensembl": "ENST00000644969.2", - "strand": "-", - "status": "mane_plus_clinical", - "alt_ac": "NC_000007.14", - "assembly": "GRCh38" - } - - resp = await test_mane_transcript.get_mapped_mane_data( - "Braf", Assembly.GRCH37, 140485608, ResidueMode.INTER_RESIDUE) - assert resp.dict() == { - "gene": "BRAF", - "refseq": "NM_001374258.1", - "ensembl": "ENST00000644969.2", - "strand": "-", - "status": "mane_plus_clinical", - "alt_ac": "NC_000007.13", - "assembly": "GRCh37" - } - - resp = await test_mane_transcript.get_mapped_mane_data( - "BRAF", Assembly.GRCH38, 140783157, ResidueMode.INTER_RESIDUE) - assert resp.dict() == { - "gene": "BRAF", - "refseq": "NM_004333.6", - "ensembl": "ENST00000646891.2", - "strand": "-", - "status": "mane_select", - "alt_ac": "NC_000007.14", - "assembly": "GRCh38" - } - - resp = await test_mane_transcript.get_mapped_mane_data( - "BRAF", Assembly.GRCH37, 140482958, ResidueMode.RESIDUE) - assert resp.dict() == { - "gene": "BRAF", - "refseq": "NM_004333.6", - "ensembl": "ENST00000646891.2", - "strand": "-", - "status": "mane_select", - "alt_ac": "NC_000007.13", - "assembly": "GRCh37" - } - - # Invalid coord given assembly, so no result should be found - resp = await test_mane_transcript.get_mapped_mane_data( - "BRAF", Assembly.GRCH38, 140482957, ResidueMode.INTER_RESIDUE) - assert resp is None - - # Invalid gene - with pytest.raises(MANETranscriptError) as e: - await test_mane_transcript.get_mapped_mane_data( - "dummy", Assembly.GRCH37, 140482958, 
ResidueMode.RESIDUE) - assert str(e.value) == "Unable to get HGNC data for gene: dummy" - - @pytest.mark.asyncio async def test_valid(test_mane_transcript): """Test that valid queries do not raise any exceptions""" diff --git a/tests/unit/test_mane_transcript_mappings.py b/tests/sources/test_mane_transcript_mappings.py similarity index 96% rename from tests/unit/test_mane_transcript_mappings.py rename to tests/sources/test_mane_transcript_mappings.py index 4f1b3c8d..56863c94 100644 --- a/tests/unit/test_mane_transcript_mappings.py +++ b/tests/sources/test_mane_transcript_mappings.py @@ -1,14 +1,6 @@ """Module for testing MANE Transcript Mapping class.""" import pytest -from cool_seq_tool.data_sources import MANETranscriptMappings - - -@pytest.fixture(scope="module") -def test_mane_transcript_mappings(): - """Build MANE transcript mappings test fixture.""" - return MANETranscriptMappings() - @pytest.fixture(scope="module") def braf_select(): diff --git a/tests/unit/test_uta_database.py b/tests/sources/test_uta_database.py similarity index 98% rename from tests/unit/test_uta_database.py rename to tests/sources/test_uta_database.py index 42a99a0a..3046d96e 100644 --- a/tests/unit/test_uta_database.py +++ b/tests/sources/test_uta_database.py @@ -3,16 +3,6 @@ import pytest -from cool_seq_tool.data_sources import UTADatabase - - -@pytest.fixture(scope="module") -async def test_db(): - """Create uta db test fixture.""" - test_uta_db = UTADatabase() - await test_uta_db._create_genomic_table() - return test_uta_db - @pytest.fixture(scope="module") def nm_152263_exons(): diff --git a/tests/unit/test_residue_mode.py b/tests/test_utils.py similarity index 90% rename from tests/unit/test_residue_mode.py rename to tests/test_utils.py index 3f8bc7e1..d2b5048d 100644 --- a/tests/unit/test_residue_mode.py +++ b/tests/test_utils.py @@ -1,5 +1,5 @@ """Module for testing residue mode""" -from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos +from 
cool_seq_tool.utils import get_inter_residue_pos def test_get_inter_residue_pos(): diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py deleted file mode 100644 index f60af000..00000000 --- a/tests/unit/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -"""Provide utilities for test cases.""" -from biocommons.seqrepo import SeqRepo -import pytest - -from cool_seq_tool.data_sources import SeqRepoAccess -from cool_seq_tool.paths import SEQREPO_ROOT_DIR - - -@pytest.fixture(scope="session") -def test_seqrepo_access(): - """Create SeqRepoAccess test fixture""" - return SeqRepoAccess(SeqRepo(root_dir=SEQREPO_ROOT_DIR)) diff --git a/tests/unit/test_seqrepo_access.py b/tests/unit/test_seqrepo_access.py deleted file mode 100644 index 409db4cc..00000000 --- a/tests/unit/test_seqrepo_access.py +++ /dev/null @@ -1,124 +0,0 @@ -"""Module for testing seqrepo access class""" -import pytest -from biocommons.seqrepo import SeqRepo - -from cool_seq_tool.data_sources import SeqRepoAccess -from cool_seq_tool.paths import SEQREPO_ROOT_DIR - - -@pytest.fixture(scope="module") -def test_seqrepo_access(): - """Create SeqRepoAccess test fixture""" - return SeqRepoAccess(SeqRepo(root_dir=SEQREPO_ROOT_DIR)) - - -def test_get_reference_sequence(test_seqrepo_access): - """Test that get_reference_sequence method works correctly""" - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600) - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 600) - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 601) - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 599, 600, residue_mode="inter-residue") - assert resp == ("V", None) - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 601, 600) - assert resp == ("", "Invalid inter-residue coordinates: start (600)" - " cannot be greater than end (599)") - - resp = 
test_seqrepo_access.get_reference_sequence("NP_0043241311412", 600) - assert resp == ("", "Accession, NP_0043241311412, not found in SeqRepo") - - resp = test_seqrepo_access.get_reference_sequence("NP_004324.2", 600, 800) - assert resp == ("", "End inter-residue coordinate (799) " - "is out of index on NP_004324.2") - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 4654645645654, 1) - assert resp == ("", "Start inter-residue coordinate (4654645645653) is " - "out of index on NP_004324.2") - - resp = test_seqrepo_access.get_reference_sequence( - "NP_004324.2", 600, 4654645645654) - assert resp == ("", "End inter-residue coordinate (4654645645653) " - "is out of index on NP_004324.2") - - -def test_translate_identifier(test_seqrepo_access): - """Test that translate_identifier method works correctly""" - expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) - resp = test_seqrepo_access.translate_identifier( - "NM_152263.3", target_namespaces="ga4gh") - assert resp == expected - - resp = test_seqrepo_access.translate_identifier( - "refseq:NM_152263.3", target_namespaces="ga4gh") - assert resp == expected - - resp = test_seqrepo_access.translate_identifier("refseq:NM_152263.3") - assert len(resp[0]) > 0 - assert resp[1] is None - assert expected[0][0] in resp[0] - - resp = test_seqrepo_access.translate_identifier("GRCh38:2") - assert len(resp[0]) > 0 - assert resp[1] is None - assert "refseq:NC_000002.12" in resp[0] - - resp = test_seqrepo_access.translate_identifier("NC_000002.12") - assert len(resp[0]) > 0 - assert resp[1] is None - assert "refseq:NC_000002.12" in resp[0] - - resp = test_seqrepo_access.translate_identifier("refseq_152263.3") - assert resp == ([], "SeqRepo unable to get translated identifiers for" - " refseq_152263.3") - - -def test_aliases(test_seqrepo_access): - """Test that aliases method works correctly""" - expected = (["ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT"], None) - resp = 
test_seqrepo_access.translate_alias("NM_152263.3") - assert len(resp[0]) > 0 - assert resp[1] is None - assert expected[0][0] in resp[0] - - resp = test_seqrepo_access.translate_alias("NC_000002.12") - assert len(resp[0]) > 0 - assert resp[1] is None - assert "GRCh38:2" in resp[0] - - resp = test_seqrepo_access.translate_alias("refseq_152263.3") - assert resp == ([], "SeqRepo could not translate alias refseq_152263.3") - - resp = test_seqrepo_access.translate_alias("GRCh38:2") - assert resp == ([], "SeqRepo could not translate alias GRCh38:2") - - -def test_chromosome_to_acs(test_seqrepo_access): - """Test that chromosome_to_acs method works correctly""" - resp = test_seqrepo_access.chromosome_to_acs("7") - assert resp == (["NC_000007.14", "NC_000007.13"], None) - - resp = test_seqrepo_access.chromosome_to_acs("X") - assert resp == (["NC_000023.11", "NC_000023.10"], None) - - resp = test_seqrepo_access.chromosome_to_acs("Y") - assert resp == (["NC_000024.10", "NC_000024.9"], None) - - resp = test_seqrepo_access.chromosome_to_acs("117") - assert resp == (None, "117 is not a valid chromosome") - - -def test_ac_to_chromosome(test_seqrepo_access): - """Test that ac_to_chromosome method works correctly""" - resp = test_seqrepo_access.ac_to_chromosome("NC_000007.13") - assert resp == ("7", None) - - resp = test_seqrepo_access.ac_to_chromosome("NC_000007.1323") - assert resp == (None, "Unable to get chromosome for NC_000007.1323")