diff --git a/src/wags_tails/__init__.py b/src/wags_tails/__init__.py
index 21092e6..9b8dc28 100644
--- a/src/wags_tails/__init__.py
+++ b/src/wags_tails/__init__.py
@@ -7,11 +7,14 @@
 from .drugbank import DrugBankData
 from .drugsatfda import DrugsAtFdaData
 from .ensembl import EnsemblData
+from .ensembl_transcript_mappings import EnsemblTranscriptMappingData
 from .guide_to_pharmacology import GToPLigandData
 from .hemonc import HemOncData
 from .hgnc import HgncData
 from .mondo import MondoData
 from .ncbi import NcbiGeneData, NcbiGenomeData
+from .ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData
+from .ncbi_mane_summary import NcbiManeSummaryData
 from .ncit import NcitData
 from .oncotree import OncoTreeData
 from .rxnorm import RxNormData
@@ -26,12 +29,15 @@
     "DrugBankData",
     "DrugsAtFdaData",
     "EnsemblData",
+    "EnsemblTranscriptMappingData",
     "GToPLigandData",
     "HemOncData",
     "HgncData",
     "MondoData",
     "NcbiGeneData",
     "NcbiGenomeData",
+    "NcbiLrgRefSeqGeneData",
+    "NcbiManeSummaryData",
     "NcitData",
     "OncoTreeData",
     "RxNormData",
diff --git a/src/wags_tails/ensembl_transcript_mappings.py b/src/wags_tails/ensembl_transcript_mappings.py
new file mode 100644
index 0000000..5ef3277
--- /dev/null
+++ b/src/wags_tails/ensembl_transcript_mappings.py
@@ -0,0 +1,29 @@
+"""Fetches transcript mapping data from Ensembl BioMart."""
+
+from pathlib import Path
+
+from .base_source import UnversionedDataSource
+from .utils.downloads import download_http
+
+# NOTE(review): the query string is empty as written, so the request below is
+# sent with a blank `query` parameter -- confirm this is the intended query.
+QUERY = ''
+
+
+class EnsemblTranscriptMappingData(UnversionedDataSource):
+    """Provide access to Ensembl transcript mapping data, from the Ensembl BioMart."""
+
+    _src_name = "ensembl_transcript_mappings"
+    _filetype = "tsv"
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Download data file to specified location.
+
+        :param version: version to acquire (not used to build the request URL)
+        :param outfile: location and filename for final data file
+        """
+        download_http(
+            f"http://ensembl.org/biomart/martservice?query={QUERY}",
+            outfile,
+            tqdm_params=self._tqdm_params,
+        )
diff --git a/src/wags_tails/ncbi_lrg_refseqgene.py b/src/wags_tails/ncbi_lrg_refseqgene.py
new file mode 100644
index 0000000..64ceeac
--- /dev/null
+++ b/src/wags_tails/ncbi_lrg_refseqgene.py
@@ -0,0 +1,48 @@
+"""Fetches NCBI LRG_RefSeqGene data."""
+import re
+from pathlib import Path
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http
+
+
+class NcbiLrgRefSeqGeneData(DataSource):
+    """Provide access to NCBI LRG_RefSeqGene data."""
+
+    _src_name = "ncbi_lrg_refseqgene"
+    _filetype = "tsv"
+
+    def _get_latest_version(self) -> str:
+        """Retrieve latest version value.
+
+        The version is the first ``YYYY-MM-DD`` date found on the ``LRG_RefSeqGene``
+        row of the remote directory listing, with the dashes removed.
+
+        :return: latest release value
+        :raise RemoteDataError: if unable to parse version number from file directory
+        """
+        url = "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/"
+        response = requests.get(url, timeout=HTTPS_REQUEST_TIMEOUT)
+        response.raise_for_status()
+        # Only the first listing row that mentions the file is inspected.
+        rows = response.text.split("\n")
+        row = next((r for r in rows if "LRG_RefSeqGene" in r), "")
+        match = re.search(r"\d{4}-\d{2}-\d{2}", row)
+        if match is None:
+            msg = f"Unable to parse LRG_RefSeqGene updated date from directory at {url}"
+            raise RemoteDataError(msg)
+        return match.group().replace("-", "")
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Download data file to specified location.
+
+        :param version: version to acquire (the remote URL is not versioned)
+        :param outfile: location and filename for final data file
+        """
+        download_http(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene",
+            outfile,
+            tqdm_params=self._tqdm_params,
+        )
diff --git a/src/wags_tails/ncbi_mane_summary.py b/src/wags_tails/ncbi_mane_summary.py
new file mode 100644
index 0000000..53bb56a
--- /dev/null
+++ b/src/wags_tails/ncbi_mane_summary.py
@@ -0,0 +1,48 @@
+"""Fetches NCBI MANE summary data."""
+from pathlib import Path
+
+import requests
+
+from .base_source import DataSource, RemoteDataError
+from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http, handle_gzip
+
+
+class NcbiManeSummaryData(DataSource):
+    """Provide access to NCBI MANE summary file."""
+
+    _src_name = "ncbi_mane_summary"
+    _filetype = "txt"
+
+    def _get_latest_version(self) -> str:
+        """Retrieve latest version value.
+
+        The version is read from the second tab-separated field on the first line
+        of the remote ``README_versions.txt`` file.
+
+        :return: latest release value
+        :raise RemoteDataError: if unable to parse version number from README
+        """
+        latest_readme_url = "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/README_versions.txt"
+        response = requests.get(latest_readme_url, timeout=HTTPS_REQUEST_TIMEOUT)
+        response.raise_for_status()
+        fields = response.text.split("\n", maxsplit=1)[0].split("\t")
+        # Strip so stray whitespace (e.g. "\r" from CRLF line endings) never ends up
+        # in the version, which is later interpolated into the download URL.
+        version = fields[1].strip() if len(fields) > 1 else ""
+        if not version:
+            msg = f"Unable to parse latest NCBI MANE summary version number from README at {latest_readme_url}"
+            raise RemoteDataError(msg)
+        return version
+
+    def _download_data(self, version: str, outfile: Path) -> None:
+        """Download data file to specified location.
+
+        :param version: version to acquire
+        :param outfile: location and filename for final data file
+        """
+        download_http(
+            f"https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_{version}/MANE.GRCh38.v{version}.summary.txt.gz",
+            outfile,
+            handler=handle_gzip,
+            tqdm_params=self._tqdm_params,
+        )
diff --git a/tests/fixtures/ncbi_lrg_refseqgene_index.html b/tests/fixtures/ncbi_lrg_refseqgene_index.html new file mode 100644 index 0000000..6534c12 --- /dev/null +++ b/tests/fixtures/ncbi_lrg_refseqgene_index.html @@ -0,0 +1,31 @@ + + + + Index of /refseq/H_sapiens/RefSeqGene + + +

Index of /refseq/H_sapiens/RefSeqGene

+
Name                                        Last modified      Size  
Parent Directory - +presentations/ 2012-02-14 06:11 - +Aligned2RefSeqGene 2024-02-01 06:05 789K +GCF_000001405.25_refseqgene_alignments.gff3 2022-10-04 14:35 2.5M +LRG_RefSeqGene 2024-02-01 06:07 2.5M +README.txt 2016-03-29 15:33 6.8K +gene_RefSeqGene 2024-02-01 06:05 192K +refseqgene.1.genomic.fna.gz 2024-01-30 10:57 26M +refseqgene.1.genomic.gbff.gz 2024-01-30 10:58 439M +refseqgene.2.genomic.fna.gz 2024-01-30 10:58 37M +refseqgene.2.genomic.gbff.gz 2024-01-30 10:58 428M +refseqgene.3.genomic.fna.gz 2024-01-30 10:58 33M +refseqgene.3.genomic.gbff.gz 2024-01-30 10:58 460M +refseqgene.4.genomic.fna.gz 2024-01-30 10:58 22M +refseqgene.4.genomic.gbff.gz 2024-01-30 10:58 316M +refseqgene.5.genomic.fna.gz 2024-01-30 10:58 23M +refseqgene.5.genomic.gbff.gz 2024-01-30 10:58 337M +refseqgene.6.genomic.fna.gz 2024-01-30 10:58 33M +refseqgene.6.genomic.gbff.gz 2024-01-30 10:58 470M +refseqgene.7.genomic.fna.gz 2024-01-30 10:58 14M +refseqgene.7.genomic.gbff.gz 2024-01-30 10:58 188M +refseqgene.files.installed 2024-01-30 10:58 861 +
+HHS Vulnerability Disclosure diff --git a/tests/fixtures/ncbi_mane_summary_README.txt b/tests/fixtures/ncbi_mane_summary_README.txt new file mode 100644 index 0000000..a98f65c --- /dev/null +++ b/tests/fixtures/ncbi_mane_summary_README.txt @@ -0,0 +1,3 @@ +MANE Version 1.3 +NCBI RefSeq Annotation Release GCF_000001405.40-RS_2023_10 +Ensembl Release 112
diff --git a/tests/test_ensembl_transcript_mappings.py b/tests/test_ensembl_transcript_mappings.py
new file mode 100644
index 0000000..693346b
--- /dev/null
+++ b/tests/test_ensembl_transcript_mappings.py
@@ -0,0 +1,63 @@
+"""Test Ensembl Transcript Mappings data source."""
+from pathlib import Path
+
+import pytest
+import requests_mock
+
+from wags_tails import EnsemblTranscriptMappingData
+
+
+@pytest.fixture()
+def mappings_data_dir(base_data_dir: Path):
+    """Provide ensembl transcript mappings data directory."""
+    directory = base_data_dir / "ensembl_transcript_mappings"
+    directory.mkdir(exist_ok=True, parents=True)
+    return directory
+
+
+@pytest.fixture()
+def ensembl_transcript_mappings(mappings_data_dir: Path):
+    """Provide EnsemblTranscriptMappingData fixture."""
+    return EnsemblTranscriptMappingData(mappings_data_dir, silent=True)
+
+
+def test_get_latest(
+    ensembl_transcript_mappings: EnsemblTranscriptMappingData,
+    mappings_data_dir: Path,
+):
+    """Test EnsemblTranscriptMappingData.get_latest()."""
+    with pytest.raises(
+        ValueError, match="Cannot set both `force_refresh` and `from_local`"
+    ):
+        ensembl_transcript_mappings.get_latest(from_local=True, force_refresh=True)
+
+    with pytest.raises(FileNotFoundError):
+        ensembl_transcript_mappings.get_latest(from_local=True)
+
+    with requests_mock.Mocker() as m:
+        m.get(
+            'http://ensembl.org/biomart/martservice?query=',
+            text="",
+        )
+        path, version = ensembl_transcript_mappings.get_latest()
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert version == ""
+        assert m.call_count == 1
+
+        path, version = ensembl_transcript_mappings.get_latest()
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert version == ""
+        assert m.call_count == 1, "don't make extra call if data already exists"
+
+        path, version = ensembl_transcript_mappings.get_latest(from_local=True)
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert m.call_count == 1, "don't make extra call if `from_local` == True"
+
+        path, version = ensembl_transcript_mappings.get_latest(force_refresh=True)
+        assert path == mappings_data_dir / "ensembl_transcript_mappings.tsv"
+        assert path.exists()
+        assert version == ""
+        assert m.call_count == 2, "make extra call if `force_refresh` == True"
diff --git a/tests/test_ncbi_lrg_refseqgene.py b/tests/test_ncbi_lrg_refseqgene.py
new file mode 100644
index 0000000..dd17841
--- /dev/null
+++ b/tests/test_ncbi_lrg_refseqgene.py
@@ -0,0 +1,82 @@
+"""Test NCBI LRG_RefSeqGene data source."""
+from pathlib import Path
+
+import pytest
+import requests_mock
+
+from wags_tails import NcbiLrgRefSeqGeneData
+
+
+@pytest.fixture()
+def ncbi_lrg_refseqgene_data_dir(base_data_dir: Path):
+    """Provide LRG_RefSeqGene data directory."""
+    directory = base_data_dir / "ncbi_lrg_refseqgene"
+    directory.mkdir(exist_ok=True, parents=True)
+    return directory
+
+
+@pytest.fixture()
+def ncbi_lrg_refseqgene(ncbi_lrg_refseqgene_data_dir: Path):
+    """Provide NcbiLrgRefSeqGeneData fixture."""
+    return NcbiLrgRefSeqGeneData(ncbi_lrg_refseqgene_data_dir, silent=True)
+
+
+@pytest.fixture(scope="module")
+def index_html_file(fixture_dir: Path):
+    """Provide NIH file index page, for getting latest version."""
+    return (fixture_dir / "ncbi_lrg_refseqgene_index.html").read_text()
+
+
+def test_get_latest(
+    ncbi_lrg_refseqgene: NcbiLrgRefSeqGeneData,
+    ncbi_lrg_refseqgene_data_dir: Path,
+    index_html_file: str,
+):
+    """Test NcbiLrgRefSeqGeneData.get_latest()."""
+    with pytest.raises(
+        ValueError, match="Cannot set both `force_refresh` and `from_local`"
+    ):
+        ncbi_lrg_refseqgene.get_latest(from_local=True, force_refresh=True)
+
+    with pytest.raises(FileNotFoundError):
+        ncbi_lrg_refseqgene.get_latest(from_local=True)
+
+    with requests_mock.Mocker() as m:
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/",
+            text=index_html_file,
+        )
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene",
+            text="",
+        )
+        path, version = ncbi_lrg_refseqgene.get_latest()
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 2
+
+        path, version = ncbi_lrg_refseqgene.get_latest()
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 3, "only one more request if data already exists"
+
+        path, version = ncbi_lrg_refseqgene.get_latest(from_local=True)
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 3, "don't make extra call if `from_local` == True"
+
+        (ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240131.tsv").touch()
+        path, version = ncbi_lrg_refseqgene.get_latest(from_local=True)
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 3, "don't make extra call if `from_local` == True"
+
+        path, version = ncbi_lrg_refseqgene.get_latest(force_refresh=True)
+        assert path == ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"
+        assert path.exists()
+        assert version == "20240201"
+        assert m.call_count == 5, "make two more requests if `force_refresh` == True"
diff --git a/tests/test_ncbi_mane_summary.py b/tests/test_ncbi_mane_summary.py
new file mode 100644
index 0000000..81f4114
--- /dev/null
+++ b/tests/test_ncbi_mane_summary.py
@@ -0,0 +1,82 @@
+"""Test NCBI MANE summary data."""
+from pathlib import Path
+
+import pytest
+import requests_mock
+
+from wags_tails import NcbiManeSummaryData
+
+
+@pytest.fixture()
+def ncbi_mane_summary_data_dir(base_data_dir: Path):
+    """Provide NCBI MANE summary data directory."""
+    directory = base_data_dir / "ncbi_mane_summary"
+    directory.mkdir(exist_ok=True, parents=True)
+    return directory
+
+
+@pytest.fixture()
+def ncbi_mane_summary(ncbi_mane_summary_data_dir: Path):
+    """Provide NcbiManeSummaryData fixture."""
+    return NcbiManeSummaryData(ncbi_mane_summary_data_dir, silent=True)
+
+
+@pytest.fixture(scope="module")
+def mane_summary_readme(fixture_dir: Path):
+    """Provide latest MANE summary README fixture, for getting latest version."""
+    return (fixture_dir / "ncbi_mane_summary_README.txt").read_text()
+
+
+def test_get_latest(
+    ncbi_mane_summary: NcbiManeSummaryData,
+    ncbi_mane_summary_data_dir: Path,
+    mane_summary_readme: str,
+):
+    """Test NcbiManeSummaryData.get_latest()."""
+    with pytest.raises(
+        ValueError, match="Cannot set both `force_refresh` and `from_local`"
+    ):
+        ncbi_mane_summary.get_latest(from_local=True, force_refresh=True)
+
+    with pytest.raises(FileNotFoundError):
+        ncbi_mane_summary.get_latest(from_local=True)
+
+    with requests_mock.Mocker() as m:
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/README_versions.txt",
+            text=mane_summary_readme,
+        )
+        m.get(
+            "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.3/MANE.GRCh38.v1.3.summary.txt.gz",
+            text="",
+        )
+        path, version = ncbi_mane_summary.get_latest()
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 2
+
+        path, version = ncbi_mane_summary.get_latest()
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 3, "only one more request if data already exists"
+
+        path, version = ncbi_mane_summary.get_latest(from_local=True)
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 3, "don't make extra call if `from_local` == True"
+
+        (ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.2.txt").touch()
+        path, version = ncbi_mane_summary.get_latest(from_local=True)
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 3, "don't make extra call if `from_local` == True"
+
+        path, version = ncbi_mane_summary.get_latest(force_refresh=True)
+        assert path == ncbi_mane_summary_data_dir / "ncbi_mane_summary_1.3.txt"
+        assert path.exists()
+        assert version == "1.3"
+        assert m.call_count == 5, "make two more requests if `force_refresh` == True"