Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Motif search #1549

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ __pycache__

.pypirc
.gitignore
.python-version

chauthorinfo.sh
pre-commit.sh
Expand All @@ -48,3 +49,6 @@ Documentation/*
*log
*BAK
*.sublime*
*.code-workspace
.vscode/
data/
34 changes: 34 additions & 0 deletions prody/database/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,32 @@

.. _GOA: https://www.ebi.ac.uk/GOA/

Swiss-Prot
================================
The following classes and functions can be used to search and retrieve data from the Swiss-Prot database:
* :class:`.SwissProt` - class to handle Swiss-Prot data from Expasy
* :func:`.getCurrentRelease` - gets current Swiss-Prot release version
* :func:`.downloadRelease` - downloads current Swiss-Prot database files
* :func:`.saveRelease` - saves new Swiss-Prot release version
* :func:`.updateRelease` - updates Swiss-Prot local database
* :func:`.getLocalRelease` - checks local Swiss-Prot release version
* :func:`.checkForUpdates` - checks whether there is a newer Swiss-Prot version than the current local one

RefSeq
================================
The following classes and functions can be used to search and retrieve data from the RefSeq database:
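* :class:`.RefSeq` - class to handle RefSeq data
* :func:`.getCurrentRelease` - gets current RefSeq release version
* :func:`.getInstalledFiles` - reads the list of installed files with corresponding check sums
* :func:`.saveInstalledFiles` - saves the installed files list with check sums locally
* :func:`.getLocalFiles` - lists local RefSeq protein FASTA files and corresponding check sums
* :func:`.getFiles` - lists all protein FASTA files on the RefSeq FTP server
* :func:`.pepareDownloadFileList` - prepares the list of files to be downloaded
* :func:`.downloadRelease` - downloads the latest RefSeq database release
* :func:`.saveRelease` - writes the current RefSeq release version to disk
* :func:`.updateRelease` - updates the local RefSeq release to the most recent one
* :func:`.getLocalRelease` - gets the RefSeq release version from local disk
* :func:`.checkForUpdates` - checks whether the local RefSeq version is the most recent one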

"""

Expand Down Expand Up @@ -98,3 +124,11 @@
from . import quartataweb
from .quartataweb import *
__all__.extend(quartataweb.__all__)

from . import swissprot
from .swissprot import *
__all__.extend(swissprot.__all__)

from . import refseq
from .refseq import *
__all__.extend(refseq.__all__)
160 changes: 160 additions & 0 deletions prody/database/refseq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# -*- coding: utf-8 -*-

"""RefSeq database operations."""

import os
import re
from concurrent.futures import ThreadPoolExecutor

import requests
from prody import LOGGER
from prody.utilities.helpers import downloadFile
from prody.utilities.pathtools import PRODY_DATA

__all__ = ["RefSeq"]


class RefSeq:
    """Keep a local copy of the RefSeq protein FASTA files up to date.

    All methods are classmethods; the class only namespaces the operations.
    Local state lives in ``PRODY_DATA/RefSeq``:

    * ``RELEASE`` holds the release number of the last completed update.
    * ``CHECK_SUMS`` holds the ``<checksum> <file name>`` lines of the
      protein FASTA files as listed by the last completed update.

    Network failures are logged through ``LOGGER`` and reported to the
    caller as empty results rather than raised.
    """

    RELEASE = "release.txt"
    CHECK_SUMS = "files_installed.txt"

    BASE_URL = "https://ftp.ncbi.nlm.nih.gov/refseq/release"
    # Seconds to wait for the small text listings before giving up.
    TIMEOUT = 30

    # File names of the protein FASTA volumes inside the installed-files list.
    _PROTEIN_FASTA = re.compile(r"complete\.(\d+\.){1,2}protein\.faa\.gz")
    # One "<checksum><whitespace><file name>" entry per line.
    _CHECKSUM_LINE = re.compile(r"^(\d+)\s+(\S+)$", re.M)

    @classmethod
    def _localDir(cls) -> str:
        """Return the local data directory, creating it when missing."""
        path = os.path.join(PRODY_DATA, cls.__name__)
        os.makedirs(path, exist_ok=True)
        return path

    @classmethod
    def _fetchText(cls, url: str) -> str:
        """Return the body of *url*, or an empty string when the request fails."""
        try:
            response = requests.get(url, timeout=cls.TIMEOUT)
            # HTTP errors (404, 5xx) do not raise on their own; without this
            # check an error page would be processed as if it were data.
            response.raise_for_status()
        except requests.exceptions.RequestException as exception:
            LOGGER.error(str(exception))
            return ""
        return response.text

    @classmethod
    def _filterProteinFiles(cls, listing: str) -> str:
        """Keep only the protein FASTA lines of *listing*, newline-terminated."""
        return "".join(
            line + "\n"
            for line in listing.splitlines()
            if cls._PROTEIN_FASTA.search(line)
        )

    @classmethod
    def _parseChecksums(cls, text: str) -> dict:
        """Map file name to checksum for every checksum line in *text*."""
        return {name: checksum for checksum, name in cls._CHECKSUM_LINE.findall(text)}

    @staticmethod
    def _selectOutdated(remote_files: dict, local_files: dict) -> list:
        """Return remote file names that are missing locally or whose checksum differs."""
        return [
            name
            for name, checksum in remote_files.items()
            if local_files.get(name) != checksum
        ]

    @classmethod
    def getCurrentRelease(cls) -> str:
        """Get current RefSeq db release version.

        Returns:
            str: RefSeq db release number, or an empty string when it could
                not be determined (the reason is logged).
        """
        url = f"{cls.BASE_URL}/RELEASE_NUMBER"
        # Strip surrounding whitespace so a trailing newline does not leak
        # into the catalog URL or the saved release file.
        rs_release = cls._fetchText(url).strip()
        if not re.fullmatch(r"\d+", rs_release):
            LOGGER.error("Couldn't determine release version.")
            return ""
        LOGGER.debug("RefSeq current release: {}".format(rs_release))
        return rs_release

    @classmethod
    def getInstalledFiles(cls) -> str:
        """Read the installed-files list of the current release.

        Returns:
            str: the ``<checksum> <file name>`` lines of the protein FASTA
                files, one per line; empty when the release number or the
                list could not be fetched.
        """
        LOGGER.debug("Downloading installed files with check sums.")
        current_release = cls.getCurrentRelease()
        if not current_release:
            return ""
        url = f"{cls.BASE_URL}/release-catalog/release{current_release}.files.installed"
        return cls._filterProteinFiles(cls._fetchText(url))

    @classmethod
    def saveInstalledFiles(cls) -> None:
        """Save the installed files list with check sums locally.

        Nothing is written when the list could not be fetched, so a failed
        request does not wipe the checksums already on disk.
        """
        installed_files = cls.getInstalledFiles()
        if not installed_files:
            LOGGER.error(f"{cls.CHECK_SUMS} not updated: no installed files list available.")
            return
        path = os.path.join(cls._localDir(), cls.CHECK_SUMS)
        with open(path, "w", encoding="utf-8") as file:
            file.write(installed_files)
        LOGGER.debug(f"{cls.CHECK_SUMS} file saved locally.")

    @classmethod
    def getLocalFiles(cls) -> dict:
        """List local RefSeq FASTA protein files and corresponding check sums.

        Returns:
            dict: file names and corresponding check sums; empty when no
                checksum file has been saved yet.
        """
        LOGGER.debug("Getting local RefSeq protein FASTA files")
        path = os.path.join(cls._localDir(), cls.CHECK_SUMS)
        try:
            with open(path, "r", encoding="utf-8") as file:
                text = file.read()
        except FileNotFoundError:
            # First run: nothing has been downloaded yet.
            return {}
        return cls._parseChecksums(text)

    @classmethod
    def getFiles(cls) -> dict:
        """List all FASTA protein files on RefSeq ftp server.

        Returns:
            dict: FASTA protein files with check sums.
        """
        LOGGER.debug("Getting protein FASTA file list from RefSeq.")
        return cls._parseChecksums(cls.getInstalledFiles())

    # NOTE(review): the name is misspelled ("pepare"); it is kept unchanged
    # because it is part of the public interface.
    @classmethod
    def pepareDownloadFileList(cls) -> list:
        """Prepare the list of files to be downloaded.

        Returns:
            list: names of remote files that are missing locally or whose
                checksum differs from the locally recorded one.
        """
        LOGGER.debug("Preparing file list to be downloaded.")
        remote_files = cls.getFiles()
        local_files = cls.getLocalFiles()
        return cls._selectOutdated(remote_files, local_files)

    @classmethod
    def downloadRelease(cls) -> None:
        """Download the outdated or missing files of the latest RefSeq release."""
        files = cls.pepareDownloadFileList()
        # NOTE(review): assumes the protein volumes live under "complete/" and
        # that downloadFile fetches url + file name - TODO confirm.
        url = f"{cls.BASE_URL}/complete/"
        LOGGER.timeit()
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Submit everything first; waiting on each future right after
            # submitting it would run the downloads one at a time.
            futures = [executor.submit(downloadFile, url, cls.__name__, file) for file in files]
            for future in futures:
                LOGGER.info(future.result())
        LOGGER.report()

    @classmethod
    def saveRelease(cls) -> None:
        """Write current release version to disk.

        Nothing is written when the current release could not be determined.
        """
        current_release = cls.getCurrentRelease()
        if not current_release:
            return
        path = os.path.join(cls._localDir(), cls.RELEASE)
        with open(path, "w", encoding="utf-8") as file:
            file.write(current_release)
        LOGGER.debug("RefSeq release {} saved.".format(current_release))

    @classmethod
    def updateRelease(cls) -> None:
        """Update release to the most recent one."""
        LOGGER.debug("Updating RefSeq release version.")
        cls.downloadRelease()
        # Record the checksums only after downloading, so the next update can
        # skip the files that are already current.
        cls.saveInstalledFiles()
        cls.saveRelease()

    @classmethod
    def getLocalRelease(cls) -> str:
        """Get release version from local disk.

        Returns:
            str: locally recorded release number, or an empty string when no
                release has been saved yet.
        """
        LOGGER.debug("Getting RefSeq local release version.")
        path = os.path.join(cls._localDir(), cls.RELEASE)
        try:
            with open(path, "r", encoding="utf-8") as file:
                return file.readline().strip()
        except FileNotFoundError:
            return ""

    @classmethod
    def checkForUpdates(cls) -> None:
        """Update the local copy when a different release is available."""
        LOGGER.debug("RefSeq checking for updates.")
        current_release = cls.getCurrentRelease()
        # An empty release means the lookup failed; do not update on bad data.
        if current_release and current_release != cls.getLocalRelease():
            cls.updateRelease()
94 changes: 94 additions & 0 deletions prody/database/swissprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-

"""Swiss-Prot database operations."""

import os
import re
from concurrent.futures import ThreadPoolExecutor

import requests
from prody import LOGGER
from prody.utilities.helpers import downloadFile
from prody.utilities.pathtools import PRODY_DATA

__all__ = ["SwissProt"]


class SwissProt:
    """Keep a local copy of the Swiss-Prot release files up to date.

    All methods are classmethods; the class only namespaces the operations.
    The release identifier of the last completed update is stored in
    ``PRODY_DATA/SwissProt/release.txt``.

    Network failures while looking up the release are logged through
    ``LOGGER`` and reported to the caller as an empty string.
    """

    RELEASE = "release.txt"
    # NOTE(review): the "Contact" value looks like a scrubbed e-mail address;
    # confirm it holds a real maintainer address before release.
    HEADERS = {"User-Agent": "Python pattern search agent", "Contact": "[email protected]"}

    BASE_URL = "https://ftp.expasy.org/databases/swiss-prot/release/"
    # Seconds to wait for the release-date file before giving up.
    TIMEOUT = 30

    # Release identifier as it appears in reldate.txt, e.g. "2023_01".
    _RELEASE_PATTERN = re.compile(r"Swiss-Prot Release (\d{4}_\d{2})")

    @classmethod
    def _localDir(cls) -> str:
        """Return the local data directory, creating it when missing."""
        path = os.path.join(PRODY_DATA, cls.__name__)
        os.makedirs(path, exist_ok=True)
        return path

    @classmethod
    def _parseRelease(cls, text: str) -> str:
        """Return the release identifier found in *text*, or an empty string."""
        found = cls._RELEASE_PATTERN.search(text)
        return found[1] if found else ""

    @staticmethod
    def _releaseFiles(types=None) -> list:
        """Return the release file names for *types* (all known types when empty)."""
        extensions = types if types else ["xml", "dat", "fasta"]
        return [f"uniprot_sprot.{extension}.gz" for extension in extensions]

    @classmethod
    def getCurrentRelease(cls) -> str:
        """Get current swiss-prot db release version.

        Returns:
            str: Swiss-prot db release version (``YYYY_MM``), or an empty
                string when it could not be determined (the reason is logged).
        """
        url = f"{cls.BASE_URL}reldate.txt"
        try:
            response = requests.get(url, headers=cls.HEADERS, timeout=cls.TIMEOUT)
            # HTTP errors do not raise on their own; treat them as failures.
            response.raise_for_status()
        except requests.exceptions.RequestException as exception:
            LOGGER.error(str(exception))
            return ""
        sp_release = cls._parseRelease(response.text)
        if not sp_release:
            LOGGER.error("Couldn't determine release version.")
            return ""
        LOGGER.debug("Swiss-Prot current release: {}".format(sp_release))
        return sp_release

    @classmethod
    def downloadRelease(cls, types=None) -> None:
        """Download latest swiss-prot database release.

        Args:
            types (list, optional): Database file types. Defaults to None,
                which selects ``xml``, ``dat`` and ``fasta``.
        """
        files = cls._releaseFiles(types)
        LOGGER.timeit()
        with ThreadPoolExecutor(max_workers=3) as executor:
            # Submit everything first; waiting on each future right after
            # submitting it would run the downloads one at a time.
            futures = [
                executor.submit(downloadFile, cls.BASE_URL, cls.__name__, file, headers=cls.HEADERS)
                for file in files
            ]
            for future in futures:
                LOGGER.info(future.result())
        LOGGER.report()

    @classmethod
    def saveRelease(cls) -> None:
        """Write current release version to disk.

        Nothing is written when the current release could not be determined.
        """
        current_release = cls.getCurrentRelease()
        if not current_release:
            return
        path = os.path.join(cls._localDir(), cls.RELEASE)
        with open(path, "w", encoding="utf-8") as file:
            file.write(current_release)
        LOGGER.debug("Swiss-Prot release {} saved.".format(current_release))

    @classmethod
    def updateRelease(cls) -> None:
        """Update release to the most recent one."""
        LOGGER.debug("Updating Swiss-Prot release version.")
        cls.downloadRelease()
        cls.saveRelease()

    @classmethod
    def getLocalRelease(cls) -> str:
        """Get release version from local disk.

        Returns:
            str: locally recorded release version, or an empty string when no
                release has been saved yet.
        """
        LOGGER.debug("Getting Swiss-Prot local release version.")
        path = os.path.join(cls._localDir(), cls.RELEASE)
        try:
            with open(path, "r", encoding="utf-8") as file:
                return file.readline().strip()
        except FileNotFoundError:
            # First run: nothing has been downloaded yet.
            return ""

    @classmethod
    def checkForUpdates(cls) -> None:
        """Update the local copy when a different release is available."""
        LOGGER.debug("Swiss-Prot checking for updates.")
        current_release = cls.getCurrentRelease()
        # An empty release means the lookup failed; do not update on bad data.
        if current_release and current_release != cls.getLocalRelease():
            cls.updateRelease()
10 changes: 9 additions & 1 deletion prody/sequence/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,19 @@
* :func:`.showShannonEntropy` - plot Shannon entropy
* :func:`.showMSAOccupancy` - plot row (sequence) or column occupancy
* :func:`.showMutinfoMatrix` - show mutual information matrix


Searching
=========
* :func:`.getPdbCodesFromMotif` - get PDB code from MOTIF
"""

__all__ = []

from . import motif
from .motif import *
__all__.extend(motif.__all__)

from . import msa
from .msa import *
__all__.extend(msa.__all__)
Expand All @@ -69,4 +78,3 @@
from . import sequence
from .sequence import *
__all__.extend(sequence.__all__)

Loading