Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stream apis #44

Merged
merged 7 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,23 +50,34 @@ $ pridepy download-all-public-raw-files -a PXD012353 -o /Users/yourname/Download
```

Download single file by name:

```bash
$ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ -f checksum.txt -p globus
```

>**NOTE**: Currently we use Globus URLs (when `-p globus` is used) via HTTPS, not the Globus protocol. For more information about Globus, see [Globus documentation](https://www.globus.org/data-transfer).

Search projects with keywords and filters

```bash
$ pridepy search-projects-by-keywords-and-filters --keyword accession:PXD012353
```
Search files with filters

Search files with filters
```bash
$ pridepy get-files-by-filter --filter fileCategory.value==RAW
```

Stream the metadata of all projects as JSON and write it to a file:
```bash
$ pridepy stream-projects-metadata -o all_pride_projects.json
```

Stream the metadata of all files as JSON and write it to a file. A project accession can optionally be given with `-a` to restrict the output to that project's files:
```bash
$ pridepy stream-files-metadata -o all_pride_files.json
OR
$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011
```

Use the command below to view the list of available commands:

```bash
Expand All @@ -83,7 +94,10 @@ Commands:
get-files-by-project-accession get files by project accession...
get-private-files Get private files by project...
get-projects get paged projects :return:
get-projects-by-accession get projects by accession...
get-projects-by-accession get projects by accession...
stream-files-metadata Stream all files metadata in...
stream-projects-metadata Stream all projects metadata...

```
# NOTE

Expand Down
3 changes: 2 additions & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ dependencies:
- boto3
- botocore
- tqdm
- urllib3
- urllib3
- httpx
18 changes: 18 additions & 0 deletions pridepy/files/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class Files:
This class handles PRIDE API files endpoint.
"""

V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3"
API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2"
API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2"
PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk"
Expand All @@ -62,6 +63,23 @@ class Files:
def __init__(self):
    # Nothing to initialise: the constructor body is intentionally empty.
    pass

async def stream_all_files_metadata(self, output_file, accession=None):
    """
    Stream file metadata from the PRIDE v3 API as JSON and write it to ``output_file``.

    :param output_file: path of the file the streamed lines are written to
    :param accession: optional project accession; when given, the project-scoped
        ``/projects/{accession}/files`` endpoints are used instead of ``/files``
    """
    # Both the count and the stream endpoint hang off the same root.
    if accession is None:
        files_root = f"{self.V3_API_BASE_URL}/files"
    else:
        files_root = f"{self.V3_API_BASE_URL}/projects/{accession}/files"

    accept_json = {"Accept": "application/JSON"}

    # The count is requested first and handed over as the expected number of records.
    expected_total = Util.get_api_call(f"{files_root}/count", accept_json).json()

    # '"fileName"' is the pattern the streaming helper searches for in each line.
    await Util.stream_response_to_file(
        output_file,
        expected_total,
        '"fileName"',
        f"{files_root}/all",
        accept_json,
    )

def get_all_paged_files(
self, query_filter, page_size, page, sort_direction, sort_conditions
):
Expand Down
39 changes: 38 additions & 1 deletion pridepy/pridepy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3

import asyncio
import logging
import click
from pridepy.files.files import Files
Expand Down Expand Up @@ -254,6 +254,43 @@ def search_projects_by_keywords_and_filters(
)


@main.command()
@click.option(
    "-o",
    "--output_file",
    required=True,
    help="output file to save all the projects metadata",
)
def stream_projects_metadata(output_file):
    """
    Stream all projects metadata in JSON format to a file
    """
    # click shows the docstring above as this command's help text, so it is kept
    # free of reST fields such as ":return:" that would otherwise appear in the CLI.
    # asyncio.run drives the streaming coroutine to completion.
    asyncio.run(Project().stream_all_projects(output_file))


@main.command()
@click.option(
    "-o",
    "--output_file",
    required=True,
    help="output file to save all the files metadata",
)
@click.option(
    "-a",
    "--accession",
    required=False,
    help="project accession",
)
def stream_files_metadata(accession, output_file):
    """
    Stream all files metadata in JSON format and write it to a file
    """
    # click shows the docstring above as this command's help text, so it is kept
    # free of reST fields such as ":return:" that would otherwise appear in the CLI.
    # accession is None when -a is not supplied; it is passed through unchanged.
    asyncio.run(Files().stream_all_files_metadata(output_file, accession))

@main.command()
@click.option(
"-ps",
Expand Down
14 changes: 14 additions & 0 deletions pridepy/project/project.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python

from pridepy.authentication.authentication import Authentication
from pridepy.util.api_handling import Util

Expand All @@ -9,6 +10,7 @@ class Project:
"""

API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2/"
V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/"
PRIVATE_API_BASE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2/"

def __init__(self):
Expand Down Expand Up @@ -39,6 +41,18 @@ def get_projects(self, page_size, page, sort_direction, sort_conditions):
response = Util.get_api_call(request_url, headers)
return response.json()

async def stream_all_projects(self, output_file):
    """
    Stream the metadata of all projects from the PRIDE v3 API as JSON and write it to ``output_file``.

    :param output_file: path of the file the streamed lines are written to
    """
    accept_json = {"Accept": "application/JSON"}

    # The count is requested first and handed over as the expected number of records.
    expected_total = Util.get_api_call(
        f"{self.V3_API_BASE_URL}projects/count", accept_json
    ).json()

    # '"projectDescription"' is the pattern the streaming helper searches for in each line.
    await Util.stream_response_to_file(
        output_file,
        expected_total,
        '"projectDescription"',
        f"{self.V3_API_BASE_URL}projects/all",
        accept_json,
    )

def get_reanalysis_projects_by_accession(self, accession):
"""
search PRIDE projects by reanalysis accession
Expand Down
28 changes: 28 additions & 0 deletions pridepy/util/api_handling.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#!/usr/bin/env python
import re
import sys

import httpx
import requests
import logging
from ratelimit import limits, sleep_and_retry
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry


Expand All @@ -30,6 +34,30 @@ def get_api_call(url, headers=None):
)
return response

@staticmethod
@sleep_and_retry
@limits(calls=1000, period=50)
async def stream_response_to_file(out_file, total_records, regex_search_pattern, url, headers=None):
    """
    Stream the body of a GET request to ``out_file`` line by line.

    A tqdm progress bar with ``total_records`` as its total is advanced by one for
    every non-empty line in which ``regex_search_pattern`` is found.

    :param out_file: path of the file to write (opened in write mode, UTF-8 encoded)
    :param total_records: total passed to the progress bar
    :param regex_search_pattern: regular expression searched for in every line
    :param url: URL the GET request is sent to
    :param headers: optional HTTP headers sent with the request
    :raises httpx.HTTPStatusError: if ``raise_for_status`` rejects the response

    If ``out_file`` cannot be written because of a PermissionError, an error
    message is printed and the process exits with status 1.
    """
    # Compile once; the same pattern is applied to every streamed line.
    record_marker = re.compile(regex_search_pattern)

    with tqdm(total=total_records, unit_scale=True) as pbar:
        async with httpx.AsyncClient() as client:
            # client.stream() exposes the body incrementally rather than reading it all at once.
            async with client.stream("GET", url, headers=headers) as response:
                response.raise_for_status()
                try:
                    # Explicit UTF-8: the platform default encoding may be unable
                    # to represent non-ASCII characters in the metadata.
                    with open(out_file, "w", encoding="utf-8") as cfile:
                        async for line in response.aiter_lines():
                            if not line:
                                # Empty lines are neither written nor counted.
                                continue
                            cfile.write(line + "\n")
                            if record_marker.search(line):
                                pbar.update(1)
                except PermissionError:
                    print("[ERROR] No permissions to write to:", out_file)
                    sys.exit(1)

@staticmethod
@sleep_and_retry
@limits(calls=1000, period=50)
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ plotly
boto3
botocore
tqdm
urllib3
urllib3
httpx
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

setup(
name="pridepy",
version="0.0.4",
version="0.0.5",
author="PRIDE Team",
author_email="pride-report@ebi.ac.uk",
author_email="pride-support@ebi.ac.uk",
description="Python Client library for PRIDE Rest API",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
Loading