diff --git a/src/wags_tails/__init__.py b/src/wags_tails/__init__.py
index 21092e6..9b8dc28 100644
--- a/src/wags_tails/__init__.py
+++ b/src/wags_tails/__init__.py
@@ -7,11 +7,14 @@
from .drugbank import DrugBankData
from .drugsatfda import DrugsAtFdaData
from .ensembl import EnsemblData
+from .ensembl_transcript_mappings import EnsemblTranscriptMappingData
from .guide_to_pharmacology import GToPLigandData
from .hemonc import HemOncData
from .hgnc import HgncData
from .mondo import MondoData
from .ncbi import NcbiGeneData, NcbiGenomeData
+from .ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData
+from .ncbi_mane_summary import NcbiManeSummaryData
from .ncit import NcitData
from .oncotree import OncoTreeData
from .rxnorm import RxNormData
@@ -26,12 +29,15 @@
"DrugBankData",
"DrugsAtFdaData",
"EnsemblData",
+ "EnsemblTranscriptMappingData",
"GToPLigandData",
"HemOncData",
"HgncData",
"MondoData",
"NcbiGeneData",
"NcbiGenomeData",
+ "NcbiManeSummaryData",
+ "NcbiLrgRefSeqGeneData",
"NcitData",
"OncoTreeData",
"RxNormData",
diff --git a/src/wags_tails/ensembl_transcript_mappings.py b/src/wags_tails/ensembl_transcript_mappings.py
new file mode 100644
index 0000000..5ef3277
--- /dev/null
+++ b/src/wags_tails/ensembl_transcript_mappings.py
@@ -0,0 +1,27 @@
+"""Fetches transcript mapping data from Ensembl BioMart."""
+
+from pathlib import Path
+
+from wags_tails.base_source import UnversionedDataSource
+from wags_tails.utils.downloads import download_http
+
+QUERY = ''
+
+
+class EnsemblTranscriptMappingData(UnversionedDataSource):
+    """Data source for transcript mapping data served by the Ensembl BioMart."""
+
+    _src_name = "ensembl_transcript_mappings"
+    _filetype = "tsv"
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Request the BioMart query result and store it at ``outfile``.
+
+        :param version: version to acquire (not referenced when building the URL)
+        :param outfile: location and filename for final data file
+        """
+        # ``QUERY`` is the module-level constant; it is interpolated verbatim,
+        # with no URL-encoding applied in this method.
+        biomart_url = f"http://ensembl.org/biomart/martservice?query={QUERY}"
+        progress_opts = self._tqdm_params
+        download_http(biomart_url, outfile, tqdm_params=progress_opts)
diff --git a/src/wags_tails/ncbi_lrg_refseqgene.py b/src/wags_tails/ncbi_lrg_refseqgene.py
new file mode 100644
index 0000000..64ceeac
--- /dev/null
+++ b/src/wags_tails/ncbi_lrg_refseqgene.py
@@ -0,0 +1,49 @@
+"""Fetches NCBI LRG_RefSeqGene data."""
+import re
+from pathlib import Path
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http
+
+
+class NcbiLrgRefSeqGeneData(DataSource):
+    """Data source for the LRG_RefSeqGene file in the NCBI RefSeqGene directory."""
+
+    _src_name = "ncbi_lrg_refseqgene"
+    _filetype = "tsv"
+
+    def _get_latest_version(self) -> str:
+        """Look up the date listed next to LRG_RefSeqGene in the directory index.
+
+        :return: first ``YYYY-MM-DD`` date on the listing row, with hyphens removed
+        :raise RemoteDataError: if no row mentions the file or that row has no date
+        """
+        url = "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/"
+        response = requests.get(url, timeout=HTTPS_REQUEST_TIMEOUT)
+        response.raise_for_status()
+        # Only the first listing row that names the file is inspected.
+        listing_row = next(
+            (line for line in response.text.split("\n") if "LRG_RefSeqGene" in line),
+            None,
+        )
+        date_match = (
+            re.search(r"\d\d\d\d-\d\d-\d\d", listing_row)
+            if listing_row is not None
+            else None
+        )
+        if date_match is None:
+            msg = f"Unable to parse LRG_RefSeqGene updated date from directory at {url}"
+            raise RemoteDataError(msg)
+        return date_match.group(0).replace("-", "")
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Download the LRG_RefSeqGene file to the given location.
+
+        :param version: version to acquire (the request URL does not depend on it)
+        :param outfile: location and filename for final data file
+        """
+        # The URL is fixed: it does not include ``version``.
+        file_url = "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene"
+        download_http(file_url, outfile, tqdm_params=self._tqdm_params)
diff --git a/src/wags_tails/ncbi_mane_summary.py b/src/wags_tails/ncbi_mane_summary.py
new file mode 100644
index 0000000..53bb56a
--- /dev/null
+++ b/src/wags_tails/ncbi_mane_summary.py
@@ -0,0 +1,43 @@
+"""Fetches NCBI MANE summary data."""
+from pathlib import Path
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http, handle_gzip
+
+
+class NcbiManeSummaryData(DataSource):
+    """Data source for the MANE summary file published by NCBI."""
+
+    _src_name = "ncbi_mane_summary"
+    _filetype = "txt"
+
+    def _get_latest_version(self) -> str:
+        """Read the latest version from the ``current`` release's versions README.
+
+        :return: second tab-separated field on the first line of the README
+        :raise RemoteDataError: if the first line has no second tab-separated field
+        """
+        latest_readme_url = "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/README_versions.txt"
+        response = requests.get(latest_readme_url, timeout=HTTPS_REQUEST_TIMEOUT)
+        response.raise_for_status()
+        first_line = response.text.split("\n")[0]
+        fields = first_line.split("\t")
+        try:
+            return fields[1]
+        except IndexError as e:
+            msg = f"Unable to parse latest NCBI MANE summary version number from README at {latest_readme_url}"
+            raise RemoteDataError(msg) from e
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Download the summary file for the given release to the given location.
+
+        :param version: version to acquire; used in both the directory and file name
+        :param outfile: location and filename for final data file
+        """
+        summary_url = (
+            f"https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_{version}/MANE.GRCh38.v{version}.summary.txt.gz"
+        )
+        # The remote file is a ``.gz``, so the download is routed through ``handle_gzip``.
+        download_http(summary_url, outfile, handler=handle_gzip, tqdm_params=self._tqdm_params)
diff --git a/tests/fixtures/ncbi_lrg_refseqgene_index.html b/tests/fixtures/ncbi_lrg_refseqgene_index.html
new file mode 100644
index 0000000..6534c12
--- /dev/null
+++ b/tests/fixtures/ncbi_lrg_refseqgene_index.html
@@ -0,0 +1,31 @@
+
+
+
+ Index of /refseq/H_sapiens/RefSeqGene
+
+
+Index of /refseq/H_sapiens/RefSeqGene
+Name Last modified Size
Parent Directory -
+presentations/ 2012-02-14 06:11 -
+Aligned2RefSeqGene 2024-02-01 06:05 789K
+GCF_000001405.25_refseqgene_alignments.gff3 2022-10-04 14:35 2.5M
+LRG_RefSeqGene 2024-02-01 06:07 2.5M
+README.txt 2016-03-29 15:33 6.8K
+gene_RefSeqGene 2024-02-01 06:05 192K
+refseqgene.1.genomic.fna.gz 2024-01-30 10:57 26M
+refseqgene.1.genomic.gbff.gz 2024-01-30 10:58 439M
+refseqgene.2.genomic.fna.gz 2024-01-30 10:58 37M
+refseqgene.2.genomic.gbff.gz 2024-01-30 10:58 428M
+refseqgene.3.genomic.fna.gz 2024-01-30 10:58 33M
+refseqgene.3.genomic.gbff.gz 2024-01-30 10:58 460M
+refseqgene.4.genomic.fna.gz 2024-01-30 10:58 22M
+refseqgene.4.genomic.gbff.gz 2024-01-30 10:58 316M
+refseqgene.5.genomic.fna.gz 2024-01-30 10:58 23M
+refseqgene.5.genomic.gbff.gz 2024-01-30 10:58 337M
+refseqgene.6.genomic.fna.gz 2024-01-30 10:58 33M
+refseqgene.6.genomic.gbff.gz 2024-01-30 10:58 470M
+refseqgene.7.genomic.fna.gz 2024-01-30 10:58 14M
+refseqgene.7.genomic.gbff.gz 2024-01-30 10:58 188M
+refseqgene.files.installed 2024-01-30 10:58 861
+
+HHS Vulnerability Disclosure
diff --git a/tests/fixtures/ncbi_mane_summary_README.txt b/tests/fixtures/ncbi_mane_summary_README.txt
new file mode 100644
index 0000000..a98f65c
--- /dev/null
+++ b/tests/fixtures/ncbi_mane_summary_README.txt
@@ -0,0 +1,3 @@
+MANE Version 1.3
+NCBI RefSeq Annotation Release GCF_000001405.40-RS_2023_10
+Ensembl Release 112
diff --git a/tests/test_ensembl_transcript_mappings.py b/tests/test_ensembl_transcript_mappings.py
new file mode 100644
index 0000000..693346b
--- /dev/null
+++ b/tests/test_ensembl_transcript_mappings.py
@@ -0,0 +1,63 @@
+"""Test Ensembl Transcript Mappings data source."""
+from pathlib import Path
+
+import pytest
+import requests_mock
+
+from wags_tails.ensembl_transcript_mappings import EnsemblTranscriptMappingData
+
+
+@pytest.fixture()
+def mappings_data_dir(base_data_dir: Path):
+    """Create (if missing) and return the Ensembl transcript mappings data directory."""
+    target = base_data_dir.joinpath("ensembl_transcript_mappings")
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+@pytest.fixture()
+def ensembl_transcript_mappings(mappings_data_dir: Path):
+    """Provide an EnsemblTranscriptMappingData built on the test data directory, with ``silent=True``."""
+    return EnsemblTranscriptMappingData(mappings_data_dir, silent=True)
+
+
+def test_get_latest(
+    ensembl_transcript_mappings: EnsemblTranscriptMappingData,
+    mappings_data_dir: Path,
+):
+    """Test EnsemblTranscriptMappingData.get_latest(): argument checks, download, and file reuse."""
+    with pytest.raises(
+        ValueError, match="Cannot set both `force_refresh` and `from_local`"
+    ):
+        ensembl_transcript_mappings.get_latest(from_local=True, force_refresh=True)
+    # A local-only lookup is expected to fail before any download has happened in this test.
+    with pytest.raises(FileNotFoundError):
+        ensembl_transcript_mappings.get_latest(from_local=True)
+    # HTTP requests below are answered by the mocker; ``m.call_count`` is cumulative.
+    with requests_mock.Mocker() as m:
+        m.get(
+            'http://ensembl.org/biomart/martservice?query=',
+            text="",
+        )
+        path, version = ensembl_transcript_mappings.get_latest()
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert version == ""
+        assert m.call_count == 1
+        # Second call: the file is already present, so no further request is expected.
+        path, version = ensembl_transcript_mappings.get_latest()
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert version == ""
+        assert m.call_count == 1, "don't make extra call if data already exists"
+        # Local-only lookup: the call count must stay unchanged.
+        path, version = ensembl_transcript_mappings.get_latest(from_local=True)
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert m.call_count == 1, "don't make extra call if `from_local` == True"
+        # Forced refresh: exactly one more request is expected (call count 1 -> 2).
+        path, version = ensembl_transcript_mappings.get_latest(force_refresh=True)
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert version == ""
+        assert m.call_count == 2, "make extra call if `force_refresh` == True"
diff --git a/tests/test_ncbi_lrg_refseqgene.py b/tests/test_ncbi_lrg_refseqgene.py
new file mode 100644
index 0000000..dd17841
--- /dev/null
+++ b/tests/test_ncbi_lrg_refseqgene.py
@@ -0,0 +1,82 @@
+"""Test NCBI LRG_RefSeqGene data source."""
+from pathlib import Path
+
+import pytest
+import requests_mock
+
+from wags_tails import NcbiLrgRefSeqGeneData
+
+
+@pytest.fixture()
+def ncbi_lrg_refseqgene_data_dir(base_data_dir: Path):
+    """Create (if missing) and return the LRG_RefSeqGene data directory."""
+    target = base_data_dir.joinpath("ncbi_lrg_refseqgene")
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+@pytest.fixture()
+def ncbi_lrg_refseqgene(ncbi_lrg_refseqgene_data_dir: Path):
+    """Provide an NcbiLrgRefSeqGeneData built on the test data directory, with ``silent=True``."""
+    return NcbiLrgRefSeqGeneData(ncbi_lrg_refseqgene_data_dir, silent=True)
+
+
+@pytest.fixture(scope="module")
+def index_html_file(fixture_dir: Path):
+    """Return the text of the saved NIH directory index page used for version lookup."""
+    index_path = fixture_dir / "ncbi_lrg_refseqgene_index.html"
+    return index_path.read_text()
+
+
+def test_get_latest(
+    ncbi_lrg_refseqgene: NcbiLrgRefSeqGeneData,
+    ncbi_lrg_refseqgene_data_dir: Path,
+    index_html_file: str,
+):
+    """Test NcbiLrgRefSeqGeneData.get_latest(): argument checks, download, and file reuse."""
+    with pytest.raises(
+        ValueError, match="Cannot set both `force_refresh` and `from_local`"
+    ):
+        ncbi_lrg_refseqgene.get_latest(from_local=True, force_refresh=True)
+    # A local-only lookup is expected to fail before any download has happened in this test.
+    with pytest.raises(FileNotFoundError):
+        ncbi_lrg_refseqgene.get_latest(from_local=True)
+    # HTTP requests below are answered by the mocker; ``m.call_count`` is cumulative.
+    with requests_mock.Mocker() as m:
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/",
+            text=index_html_file,
+        )
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene",
+            text="",
+        )
+        path, version = ncbi_lrg_refseqgene.get_latest()
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 2  # presumably one index request plus one file request
+        # Repeat call: exactly one more request is expected (call count 2 -> 3).
+        path, version = ncbi_lrg_refseqgene.get_latest()
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 3
+        # Local-only lookup: the call count must stay at 3.
+        path, version = ncbi_lrg_refseqgene.get_latest(from_local=True)
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert m.call_count == 3
+        # A lower-versioned file added locally must not be chosen over the 20240201 file.
+        (ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240131.tsv").touch()
+        path, version = ncbi_lrg_refseqgene.get_latest(from_local=True)
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 3
+        # Forced refresh: two more requests are expected (call count 3 -> 5).
+        path, version = ncbi_lrg_refseqgene.get_latest(force_refresh=True)
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 5
diff --git a/tests/test_ncbi_mane_summary.py b/tests/test_ncbi_mane_summary.py
new file mode 100644
index 0000000..81f4114
--- /dev/null
+++ b/tests/test_ncbi_mane_summary.py
@@ -0,0 +1,83 @@
+"""Test NCBI MANE summary data."""
+from pathlib import Path
+
+import pytest
+import requests_mock
+
+from wags_tails import NcbiManeSummaryData
+
+
+@pytest.fixture()
+def ncbi_mane_summary_data_dir(base_data_dir: Path):
+    """Create (if missing) and return the NCBI MANE summary data directory."""
+    target = base_data_dir.joinpath("ncbi_mane_summary")
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+@pytest.fixture()
+def ncbi_mane_summary(ncbi_mane_summary_data_dir: Path):
+    """Provide an NcbiManeSummaryData built on the test data directory, with ``silent=True``."""
+    return NcbiManeSummaryData(ncbi_mane_summary_data_dir, silent=True)
+
+
+@pytest.fixture(scope="module")
+def mane_summary_readme(fixture_dir: Path):
+    """Return the text of the saved MANE versions README used for version lookup."""
+    readme_path = fixture_dir / "ncbi_mane_summary_README.txt"
+    return readme_path.read_text()
+
+
+def test_get_latest(
+    ncbi_mane_summary: NcbiManeSummaryData,
+    ncbi_mane_summary_data_dir: Path,
+    mane_summary_readme: str,
+):
+    """Test NcbiManeSummaryData.get_latest(): argument checks, download, and file reuse."""
+    with pytest.raises(
+        ValueError, match="Cannot set both `force_refresh` and `from_local`"
+    ):
+        ncbi_mane_summary.get_latest(from_local=True, force_refresh=True)
+    # A local-only lookup is expected to fail before any download has happened in this test.
+    with pytest.raises(FileNotFoundError):
+        ncbi_mane_summary.get_latest(from_local=True)
+    # HTTP requests below are answered by the mocker; ``m.call_count`` is cumulative.
+    with requests_mock.Mocker() as m:
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/README_versions.txt",
+            text=mane_summary_readme,
+        )
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.3/MANE.GRCh38.v1.3.summary.txt.gz",
+            text="",
+        )
+        path, version = ncbi_mane_summary.get_latest()
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 2  # presumably one README request plus one file request
+        # Repeat call: exactly one more request is expected (call count 2 -> 3).
+        path, version = ncbi_mane_summary.get_latest()
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 3
+        # Local-only lookup: the call count must stay at 3.
+        path, version = ncbi_mane_summary.get_latest(from_local=True)
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 3
+        # A lower-versioned file added locally must not be chosen over the 1.3 file.
+        (ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.2.txt").touch()
+        path, version = ncbi_mane_summary.get_latest(from_local=True)
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 3
+        # Forced refresh: two more requests are expected (call count 3 -> 5).
+        path, version = ncbi_mane_summary.get_latest(force_refresh=True)
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 5