refactor!: rearrange app architecture (#196)

* Remove gene-normalizer dependency
  * Removed `get_mapped_mane_data` (#194), which was the only reason why we needed gene-normalizer
* Rearranges app architecture
  * handlers
    * `SeqRepoAccess`
      * `get_fasta_file` is now a method
  * mappers
    * `AlignmentMapper`
    * `MANETranscript`
    * `ExonGenomicCoordsMapper`
      * These methods were originally in `CoolSeqTool`
  * data_sources
    * `MANETranscriptMappings`
    * `TranscriptMappings`
    * `UTADatabase`
GenomicMedLab · Oct 12, 2023 · 1424f1b · 1424f1b
1 parent 3aa73de
commit 1424f1b
Show file tree

Hide file tree

Showing 33 changed files with 1,255 additions and 1,632 deletions.
diff --git a/Pipfile b/Pipfile
@@ -14,7 +14,6 @@ hgvs = "*"
 pydantic = "*"
 fastapi = "*"
 uvicorn = "*"
-gene-normalizer = ">=0.1.34, != 0.2.0, != 0.2.1, != 0.2.2, != 0.2.3, != 0.2.4, != 0.2.5, != 0.2.6, != 0.2.7, != 0.2.8"
 "ga4gh.vrs" = "*"
 
 [dev-packages]

diff --git a/cool_seq_tool/__init__.py b/cool_seq_tool/__init__.py
@@ -1,5 +1,4 @@
 """The cool_seq_tool package"""
-from os import environ
 from pathlib import Path
 import logging
 
@@ -13,5 +12,3 @@
 logger.setLevel(logging.DEBUG)
 
 LOG_FN = "cool_seq_tool.log"
-
-from .app import CoolSeqTool  # noqa: E402, F401, I202
diff --git a/cool_seq_tool/app.py b/cool_seq_tool/app.py
diff --git a/cool_seq_tool/data_sources/__init__.py b/cool_seq_tool/data_sources/__init__.py
diff --git a/cool_seq_tool/data_sources/gene_normalizer.py b/cool_seq_tool/data_sources/gene_normalizer.py
diff --git a/cool_seq_tool/handlers/__init__.py b/cool_seq_tool/handlers/__init__.py
@@ -0,0 +1,2 @@
+"""Module for extending clients"""
+from .seqrepo_access import SeqRepoAccess
diff --git a/cool_seq_tool/data_sources/seqrepo_access.py → cool_seq_tool/handlers/seqrepo_access.py b/cool_seq_tool/data_sources/seqrepo_access.py → cool_seq_tool/handlers/seqrepo_access.py
@@ -2,14 +2,15 @@
 import logging
 from typing import Optional, List, Tuple, Union
 from os import environ
+from pathlib import Path
 
 from ga4gh.vrs.dataproxy import SeqRepoDataProxy
 
 from cool_seq_tool.schemas import ResidueMode
-from cool_seq_tool.data_sources.residue_mode import get_inter_residue_pos
+from cool_seq_tool.utils import get_inter_residue_pos
 
 
-logger = logging.getLogger("cool_seq_tool")
+logger = logging.getLogger(__name__)
 
 
 class SeqRepoAccess(SeqRepoDataProxy):
@@ -139,3 +140,65 @@ def ac_to_chromosome(self, ac: str) -> Tuple[Optional[str], Optional[str]]:
             return None, f"Unable to get chromosome for {ac}"
         else:
             return aliases, None
+
+    def get_fasta_file(
+        self, sequence_id: str, outfile_path: Path
+    ) -> None:
+        """Write the sequence for an accession to a FASTA file.
+
+        The header starts with `>refseq:<id>`, `>ensembl:<id>` or
+        `>gnl|ID|<id>` depending on the accession prefix, followed by the
+        aliases returned by `translate_identifier`, all joined with `|`.
+        The sequence is written below it, wrapped at 60 characters per line.
+
+        :param sequence_id: accession ID, sans namespace, eg `NM_152263.3`
+        :param outfile_path: path to save file to
+        :return: None, but saves sequence data to `outfile_path` if successful
+        :raise: KeyError if SeqRepo doesn't have sequence data for the given ID
+        """
+        # Element 0 of the result is the sequence; a falsy value means there
+        # is nothing to write.
+        sequence = self.get_reference_sequence(sequence_id)[0]
+        if not sequence:
+            raise KeyError(sequence_id)
+
+        # Tuples, so they can be handed straight to `str.startswith`.
+        refseq_prefixes = (
+            "NC_", "AC_", "NZ_", "NT_", "NW_",
+            "NG_", "NM_", "XM_", "NR_", "XR_",
+            "NP_", "AP_", "XP_", "YP_", "WP_",
+        )
+
+        ensembl_prefixes = (
+            "ENSE", "ENSFM", "ENSG", "ENSGT", "ENSP", "ENSR", "ENST",
+        )
+
+        # `startswith` copes with prefixes of any length; slicing a fixed
+        # four characters could never match the five-character `ENSFM`.
+        if sequence_id.startswith(refseq_prefixes):
+            namespaces = ["ensembl", "ga4gh"]
+            label = f">refseq:{sequence_id}"
+        elif sequence_id.startswith(ensembl_prefixes):
+            namespaces = ["refseq", "ga4gh"]
+            label = f">ensembl:{sequence_id}"
+        else:
+            # Unrecognized prefix: request aliases from every namespace.
+            namespaces = ["ensembl", "refseq", "ga4gh"]
+            label = f">gnl|ID|{sequence_id}"
+
+        # Only element 0 of the lookup result (the aliases) goes into the
+        # header.
+        aliases = self.translate_identifier(sequence_id, namespaces)
+        header = f"{label}|{'|'.join(aliases[0])}"
+
+        # Header first, then the sequence in fixed-width chunks; no newline
+        # is appended after the final line.
+        line_length = 60
+        file_data = [header]
+        file_data.extend(
+            sequence[i: i + line_length]
+            for i in range(0, len(sequence), line_length)
+        )
+
+        text = "\n".join(file_data)
+        outfile_path.write_text(text)
diff --git a/cool_seq_tool/mappers/__init__.py b/cool_seq_tool/mappers/__init__.py
@@ -0,0 +1,4 @@
+"""Module for mapping data"""
+from .alignment import AlignmentMapper
+from .mane_transcript import MANETranscript
+from .exon_genomic_coords import ExonGenomicCoordsMapper
diff --git a/...seq_tool/data_sources/alignment_mapper.py → cool_seq_tool/mappers/alignment.py b/...seq_tool/data_sources/alignment_mapper.py → cool_seq_tool/mappers/alignment.py
@@ -4,8 +4,8 @@
 from typing import Optional, Tuple, Dict
 
 from cool_seq_tool.schemas import AnnotationLayer, Assembly, ResidueMode
-from cool_seq_tool.data_sources import SeqRepoAccess, TranscriptMappings, \
-    UTADatabase
+from cool_seq_tool.handlers.seqrepo_access import SeqRepoAccess
+from cool_seq_tool.sources import TranscriptMappings, UTADatabase
 
 
 class AlignmentMapper:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		"""Module for extending clients"""
		from .seqrepo_access import SeqRepoAccess