Skip to content

Commit

Permalink
feat: add Cool-Seq-Tool data (#28)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson authored Mar 18, 2024
1 parent 09ec496 commit 3df7231
Show file tree
Hide file tree
Showing 9 changed files with 387 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/wags_tails/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
from .drugbank import DrugBankData
from .drugsatfda import DrugsAtFdaData
from .ensembl import EnsemblData
from .ensembl_transcript_mappings import EnsemblTranscriptMappingData
from .guide_to_pharmacology import GToPLigandData
from .hemonc import HemOncData
from .hgnc import HgncData
from .mondo import MondoData
from .ncbi import NcbiGeneData, NcbiGenomeData
from .ncbi_lrg_refseqgene import NcbiLrgRefSeqGeneData
from .ncbi_mane_summary import NcbiManeSummaryData
from .ncit import NcitData
from .oncotree import OncoTreeData
from .rxnorm import RxNormData
Expand All @@ -26,12 +29,15 @@
"DrugBankData",
"DrugsAtFdaData",
"EnsemblData",
"EnsemblTranscriptMappingData",
"GToPLigandData",
"HemOncData",
"HgncData",
"MondoData",
"NcbiGeneData",
"NcbiGenomeData",
"NcbiManeSummaryData",
"NcbiLrgRefSeqGeneData",
"NcitData",
"OncoTreeData",
"RxNormData",
Expand Down
27 changes: 27 additions & 0 deletions src/wags_tails/ensembl_transcript_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Fetches transcript mapping data from Ensembl BioMart."""

from pathlib import Path

from wags_tails.base_source import UnversionedDataSource
from wags_tails.utils.downloads import download_http

# BioMart XML query, appended verbatim to the martservice URL. It requests TSV
# output with a header row from the ``hsapiens_gene_ensembl`` dataset, one
# column per <Attribute> listed below.
QUERY = (
    '<Query virtualSchemaName="default" formatter="TSV" header="1" datasetConfigVersion="0.6">'
    '<Dataset name="hsapiens_gene_ensembl" interface="default">'
    '<Attribute name="ensembl_gene_id" />'
    '<Attribute name="ensembl_gene_id_version" />'
    '<Attribute name="ensembl_transcript_id" />'
    '<Attribute name="ensembl_transcript_id_version" />'
    '<Attribute name="ensembl_peptide_id" />'
    '<Attribute name="ensembl_peptide_id_version" />'
    '<Attribute name="transcript_mane_select" />'
    '<Attribute name="external_gene_name" />'
    "</Dataset></Query>"
)


class EnsemblTranscriptMappingData(UnversionedDataSource):
    """Data source for the transcript mapping table served by Ensembl BioMart."""

    _filetype = "tsv"
    _src_name = "ensembl_transcript_mappings"

    def _download_data(self, version: str, outfile: Path) -> None:
        """Fetch the BioMart query result and store it at ``outfile``.

        :param version: version to acquire (not used when building the request URL)
        :param outfile: location and filename for final data file
        """
        # The module-level QUERY XML is passed as the ``query`` URL parameter.
        biomart_url = f"http://ensembl.org/biomart/martservice?query={QUERY}"
        download_http(biomart_url, outfile, tqdm_params=self._tqdm_params)
49 changes: 49 additions & 0 deletions src/wags_tails/ncbi_lrg_refseqgene.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Fetches NCBI LRG_RefSeqGene data."""
import re
from pathlib import Path

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http


class NcbiLrgRefSeqGeneData(DataSource):
    """Data source for the NCBI LRG_RefSeqGene file."""

    _filetype = "tsv"
    _src_name = "ncbi_lrg_refseqgene"

    def _get_latest_version(self) -> str:
        """Derive the latest version from the NCBI RefSeqGene directory listing.

        The listing is scanned for the first row mentioning ``LRG_RefSeqGene``;
        the first ``dddd-dd-dd`` date on that row, with dashes removed, is the
        version.

        :return: date string taken from the listing row, dashes stripped
        :raise RemoteDataError: if no such row exists or it carries no date
        """
        url = "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/"
        response = requests.get(url, timeout=HTTPS_REQUEST_TIMEOUT)
        response.raise_for_status()

        # First listing row that mentions the file, or None when absent.
        listing_row = next(
            (line for line in response.text.split("\n") if "LRG_RefSeqGene" in line),
            None,
        )
        date_match = (
            re.search(r"\d\d\d\d-\d\d-\d\d", listing_row)
            if listing_row is not None
            else None
        )
        # A missing row and a row without a date are reported identically.
        if date_match is None:
            msg = f"Unable to parse LRG_RefSeqGene updated date from directory at {url}"
            raise RemoteDataError(msg)
        return date_match.group(0).replace("-", "")

    def _download_data(self, version: str, outfile: Path) -> None:
        """Fetch the LRG_RefSeqGene file and store it at ``outfile``.

        :param version: version to acquire (the remote URL is fixed and does not
            include it)
        :param outfile: location and filename for final data file
        """
        file_url = (
            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene"
        )
        download_http(file_url, outfile, tqdm_params=self._tqdm_params)
43 changes: 43 additions & 0 deletions src/wags_tails/ncbi_mane_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Fetches NCBI MANE summary data."""
from pathlib import Path

import requests

from .base_source import DataSource, RemoteDataError
from .utils.downloads import HTTPS_REQUEST_TIMEOUT, download_http, handle_gzip


class NcbiManeSummaryData(DataSource):
    """Data source for the NCBI MANE summary file."""

    _filetype = "txt"
    _src_name = "ncbi_mane_summary"

    def _get_latest_version(self) -> str:
        """Read the latest MANE version from the ``current`` README.

        The version is the second tab-separated field on the README's first line.

        :return: latest release value
        :raise RemoteDataError: if the first line has no second tab-separated field
        """
        latest_readme_url = "https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/current/README_versions.txt"
        response = requests.get(latest_readme_url, timeout=HTTPS_REQUEST_TIMEOUT)
        response.raise_for_status()

        # Everything before the first newline (the whole text if there is none).
        header_line, _, _ = response.text.partition("\n")
        try:
            return header_line.split("\t")[1]
        except IndexError as err:
            msg = f"Unable to parse latest NCBI MANE summary version number from README at {latest_readme_url}"
            raise RemoteDataError(msg) from err

    def _download_data(self, version: str, outfile: Path) -> None:
        """Fetch the gzipped MANE summary for ``version`` and store it at ``outfile``.

        :param version: version to acquire; used in both the release directory
            and the file name of the remote URL
        :param outfile: location and filename for final data file
        """
        summary_url = f"https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_{version}/MANE.GRCh38.v{version}.summary.txt.gz"
        # The remote file is a .gz, so the gzip handler is passed to the downloader.
        download_http(
            summary_url,
            outfile,
            handler=handle_gzip,
            tqdm_params=self._tqdm_params,
        )
31 changes: 31 additions & 0 deletions tests/fixtures/ncbi_lrg_refseqgene_index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /refseq/H_sapiens/RefSeqGene</title>
</head>
<body>
<h1>Index of /refseq/H_sapiens/RefSeqGene</h1>
<pre>Name Last modified Size <hr><a href="/refseq/H_sapiens/">Parent Directory</a> -
<a href="presentations/">presentations/</a> 2012-02-14 06:11 -
<a href="Aligned2RefSeqGene">Aligned2RefSeqGene</a> 2024-02-01 06:05 789K
<a href="GCF_000001405.25_refseqgene_alignments.gff3">GCF_000001405.25_refseqgene_alignments.gff3</a> 2022-10-04 14:35 2.5M
<a href="LRG_RefSeqGene">LRG_RefSeqGene</a> 2024-02-01 06:07 2.5M
<a href="README.txt">README.txt</a> 2016-03-29 15:33 6.8K
<a href="gene_RefSeqGene">gene_RefSeqGene</a> 2024-02-01 06:05 192K
<a href="refseqgene.1.genomic.fna.gz">refseqgene.1.genomic.fna.gz</a> 2024-01-30 10:57 26M
<a href="refseqgene.1.genomic.gbff.gz">refseqgene.1.genomic.gbff.gz</a> 2024-01-30 10:58 439M
<a href="refseqgene.2.genomic.fna.gz">refseqgene.2.genomic.fna.gz</a> 2024-01-30 10:58 37M
<a href="refseqgene.2.genomic.gbff.gz">refseqgene.2.genomic.gbff.gz</a> 2024-01-30 10:58 428M
<a href="refseqgene.3.genomic.fna.gz">refseqgene.3.genomic.fna.gz</a> 2024-01-30 10:58 33M
<a href="refseqgene.3.genomic.gbff.gz">refseqgene.3.genomic.gbff.gz</a> 2024-01-30 10:58 460M
<a href="refseqgene.4.genomic.fna.gz">refseqgene.4.genomic.fna.gz</a> 2024-01-30 10:58 22M
<a href="refseqgene.4.genomic.gbff.gz">refseqgene.4.genomic.gbff.gz</a> 2024-01-30 10:58 316M
<a href="refseqgene.5.genomic.fna.gz">refseqgene.5.genomic.fna.gz</a> 2024-01-30 10:58 23M
<a href="refseqgene.5.genomic.gbff.gz">refseqgene.5.genomic.gbff.gz</a> 2024-01-30 10:58 337M
<a href="refseqgene.6.genomic.fna.gz">refseqgene.6.genomic.fna.gz</a> 2024-01-30 10:58 33M
<a href="refseqgene.6.genomic.gbff.gz">refseqgene.6.genomic.gbff.gz</a> 2024-01-30 10:58 470M
<a href="refseqgene.7.genomic.fna.gz">refseqgene.7.genomic.fna.gz</a> 2024-01-30 10:58 14M
<a href="refseqgene.7.genomic.gbff.gz">refseqgene.7.genomic.gbff.gz</a> 2024-01-30 10:58 188M
<a href="refseqgene.files.installed">refseqgene.files.installed</a> 2024-01-30 10:58 861
<hr></pre>
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html">HHS Vulnerability Disclosure</a>
3 changes: 3 additions & 0 deletions tests/fixtures/ncbi_mane_summary_README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
MANE Version 1.3
NCBI RefSeq Annotation Release GCF_000001405.40-RS_2023_10
Ensembl Release 112
63 changes: 63 additions & 0 deletions tests/test_ensembl_transcript_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Test Ensembl Transcript Mappings data source."""
from pathlib import Path

import pytest
import requests_mock

from wags_tails.ensembl_transcript_mappings import EnsemblTranscriptMappingData


@pytest.fixture()
def mappings_data_dir(base_data_dir: Path):
    """Create (if needed) and return the Ensembl transcript mappings data directory."""
    data_dir = base_data_dir.joinpath("ensembl_transcript_mappings")
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir


@pytest.fixture()
def ensembl_transcript_mappings(mappings_data_dir: Path):
    """Build a silent EnsemblTranscriptMappingData rooted at the test data directory."""
    data_source = EnsemblTranscriptMappingData(mappings_data_dir, silent=True)
    return data_source


def test_get_latest(
    ensembl_transcript_mappings: EnsemblTranscriptMappingData,
    mappings_data_dir: Path,
):
    """Exercise EnsemblTranscriptMappingData.get_latest() across its flag combinations."""
    expected_path = mappings_data_dir / "ensembl_transcript_mappings.tsv"

    # Conflicting flags are rejected before anything else happens.
    with pytest.raises(
        ValueError, match="Cannot set both `force_refresh` and `from_local`"
    ):
        ensembl_transcript_mappings.get_latest(from_local=True, force_refresh=True)

    # Nothing has been downloaded yet, so a local-only lookup must fail.
    with pytest.raises(FileNotFoundError):
        ensembl_transcript_mappings.get_latest(from_local=True)

    with requests_mock.Mocker() as mocker:
        mocker.get(
            'http://ensembl.org/biomart/martservice?query=<Query virtualSchemaName="default" formatter="TSV" header="1" datasetConfigVersion="0.6"><Dataset name="hsapiens_gene_ensembl" interface="default"><Attribute name="ensembl_gene_id" /><Attribute name="ensembl_gene_id_version" /><Attribute name="ensembl_transcript_id" /><Attribute name="ensembl_transcript_id_version" /><Attribute name="ensembl_peptide_id" /><Attribute name="ensembl_peptide_id_version" /><Attribute name="transcript_mane_select" /><Attribute name="external_gene_name" /></Dataset></Query>',
            text="",
        )

        # First fetch: one request is made and the file is created.
        path, version = ensembl_transcript_mappings.get_latest()
        assert path == expected_path
        assert path.exists()
        assert version == ""
        assert mocker.call_count == 1

        # Second fetch: the existing file is reused.
        path, version = ensembl_transcript_mappings.get_latest()
        assert path == expected_path
        assert path.exists()
        assert version == ""
        assert mocker.call_count == 1, "don't make extra call if data already exists"

        # Local-only fetch: still no new request.
        path, _ = ensembl_transcript_mappings.get_latest(from_local=True)
        assert path == expected_path
        assert path.exists()
        assert mocker.call_count == 1, "don't make extra call if `from_local` == True"

        # Forced refresh: exactly one more request.
        path, version = ensembl_transcript_mappings.get_latest(force_refresh=True)
        assert path == expected_path
        assert path.exists()
        assert version == ""
        assert mocker.call_count == 2, "make extra call if `force_refresh` == True"
82 changes: 82 additions & 0 deletions tests/test_ncbi_lrg_refseqgene.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Test NCBI LRG_RefSeqGene data source."""
from pathlib import Path

import pytest
import requests_mock

from wags_tails import NcbiLrgRefSeqGeneData


@pytest.fixture()
def ncbi_lrg_refseqgene_data_dir(base_data_dir: Path):
    """Create (if needed) and return the LRG_RefSeqGene data directory."""
    data_dir = base_data_dir.joinpath("ncbi_lrg_refseqgene")
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir


@pytest.fixture()
def ncbi_lrg_refseqgene(ncbi_lrg_refseqgene_data_dir: Path):
    """Build a silent NcbiLrgRefSeqGeneData rooted at the test data directory."""
    data_source = NcbiLrgRefSeqGeneData(ncbi_lrg_refseqgene_data_dir, silent=True)
    return data_source


@pytest.fixture(scope="module")
def index_html_file(fixture_dir: Path):
    """Return the saved NIH directory index page used to resolve the latest version."""
    index_path = fixture_dir / "ncbi_lrg_refseqgene_index.html"
    return index_path.read_text()


def test_get_latest(
    ncbi_lrg_refseqgene: NcbiLrgRefSeqGeneData,
    ncbi_lrg_refseqgene_data_dir: Path,
    index_html_file: str,
):
    """Exercise NcbiLrgRefSeqGeneData.get_latest() across its flag combinations."""
    expected_path = ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240201.tsv"

    # Conflicting flags are rejected before anything else happens.
    with pytest.raises(
        ValueError, match="Cannot set both `force_refresh` and `from_local`"
    ):
        ncbi_lrg_refseqgene.get_latest(from_local=True, force_refresh=True)

    # Nothing has been downloaded yet, so a local-only lookup must fail.
    with pytest.raises(FileNotFoundError):
        ncbi_lrg_refseqgene.get_latest(from_local=True)

    with requests_mock.Mocker() as mocker:
        mocker.get(
            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/",
            text=index_html_file,
        )
        mocker.get(
            "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene",
            text="",
        )

        # First fetch: two requests (presumably index lookup + file download).
        path, version = ncbi_lrg_refseqgene.get_latest()
        assert path == expected_path
        assert path.exists()
        assert version == "20240201"
        assert mocker.call_count == 2

        # Second fetch: one more request only (presumably just the index lookup,
        # since the versioned file is already on disk).
        path, version = ncbi_lrg_refseqgene.get_latest()
        assert path == expected_path
        assert path.exists()
        assert version == "20240201"
        assert mocker.call_count == 3

        # Local-only fetch: no new requests.
        path, _ = ncbi_lrg_refseqgene.get_latest(from_local=True)
        assert path == expected_path
        assert path.exists()
        assert mocker.call_count == 3

        # An older-dated local file must not be chosen over the newer one.
        (ncbi_lrg_refseqgene_data_dir / "ncbi_lrg_refseqgene_20240131.tsv").touch()
        path, version = ncbi_lrg_refseqgene.get_latest(from_local=True)
        assert path == expected_path
        assert path.exists()
        assert version == "20240201"
        assert mocker.call_count == 3

        # Forced refresh: two more requests.
        path, version = ncbi_lrg_refseqgene.get_latest(force_refresh=True)
        assert path == expected_path
        assert path.exists()
        assert version == "20240201"
        assert mocker.call_count == 5
Loading

0 comments on commit 3df7231

Please sign in to comment.