diff --git a/README.md b/README.md index 6e4a6ad..d41ce93 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,6 @@ $ pridepy download-all-public-raw-files -a PXD012353 -o /Users/yourname/Download ``` Download single file by name: - ```bash $ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/foldername/ -f checksum.txt -p globus ``` @@ -58,15 +57,27 @@ $ pridepy download-file-by-name -a PXD022105 -o /Users/yourname/Downloads/folder >**NOTE**: Currently we use Globus URLs (when `-p globus` is used) via HTTPS, not the Globus protocol. For more information about Globus, see [Globus documentation](https://www.globus.org/data-transfer). Search projects with keywords and filters - ```bash $ pridepy search-projects-by-keywords-and-filters --keyword accession:PXD012353 ``` -Search files with filters +Search files with filters ```bash $ pridepy get-files-by-filter --filter fileCategory.value==RAW ``` + +Stream the metadata of all projects as JSON and write it to a file: +```bash +$ pridepy stream-projects-metadata -o all_pride_projects.json +``` + +Stream the metadata of all files as JSON and write it to a file. A project accession can optionally be given with `-a` to restrict the output to the files of that project: +```bash +$ pridepy stream-files-metadata -o all_pride_files.json +# or, restricted to a single project: +$ pridepy stream-files-metadata -o PXD005011_files.json -a PXD005011 +``` + Use the below command to view a list of commands available: ```bash @@ -83,7 +94,10 @@ Commands: get-files-by-project-accession get files by project accession... get-private-files Get private files by project... get-projects get paged projects :return: - get-projects-by-accession get projects by accession... + get-projects-by-accession get projects by accession... + stream-files-metadata Stream all files metadata in... + stream-projects-metadata Stream all projects metadata... 
+ ``` # NOTE diff --git a/environment.yml b/environment.yml index fb9b158..746fe16 100644 --- a/environment.yml +++ b/environment.yml @@ -13,4 +13,5 @@ dependencies: - boto3 - botocore - tqdm - - urllib3 \ No newline at end of file + - urllib3 + - httpx \ No newline at end of file diff --git a/pridepy/files/files.py b/pridepy/files/files.py index fbf76f2..76fe6da 100644 --- a/pridepy/files/files.py +++ b/pridepy/files/files.py @@ -50,6 +50,7 @@ class Files: This class handles PRIDE API files endpoint. """ + V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3" API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2" API_PRIVATE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2" PRIDE_ARCHIVE_FTP = "ftp.pride.ebi.ac.uk" @@ -62,6 +63,23 @@ class Files: def __init__(self): pass + async def stream_all_files_metadata(self, output_file, accession=None): + """ + Stream the metadata of all files from the PRIDE v3 API as JSON into output_file; when accession is given, only the files of that project are streamed. The record count is fetched first and passed on as the expected total. + """ + if accession is None: + request_url = f"{self.V3_API_BASE_URL}/files/all" + count_request_url = f"{self.V3_API_BASE_URL}/files/count" + else: + request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/all" + count_request_url = f"{self.V3_API_BASE_URL}/projects/{accession}/files/count" + headers = {"Accept": "application/JSON"} + response = Util.get_api_call(count_request_url, headers) + total_records = response.json() + + regex_search_pattern = '"fileName"' + await Util.stream_response_to_file(output_file, total_records, regex_search_pattern, request_url, headers) + def get_all_paged_files( self, query_filter, page_size, page, sort_direction, sort_conditions ): diff --git a/pridepy/pridepy.py b/pridepy/pridepy.py index b15f69c..82dabbf 100644 --- a/pridepy/pridepy.py +++ b/pridepy/pridepy.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 - +import asyncio import logging import click from pridepy.files.files import Files @@ -254,6 +254,43 @@ def search_projects_by_keywords_and_filters( ) +@main.command() +@click.option( + "-o", 
"--output_file", + required=True, + help="output file to save all the projects metadata", +) +def stream_projects_metadata(output_file): + """ + Stream all projects metadata in JSON format to a file + :return: + """ + project = Project() + asyncio.run(project.stream_all_projects(output_file)) + + +@main.command() +@click.option( + "-o", + "--output_file", + required=True, + help="output file to save all the files metadata", +) +@click.option( + "-a", + "--accession", + required=False, + help="project accession", +) +def stream_files_metadata(accession, output_file): + """ + Stream all files metadata in JSON format and write it to a file + :return: + """ + files = Files() + asyncio.run(files.stream_all_files_metadata(output_file, accession)) + @main.command() @click.option( "-ps", diff --git a/pridepy/project/project.py b/pridepy/project/project.py index 68740b4..4e4c67d 100644 --- a/pridepy/project/project.py +++ b/pridepy/project/project.py @@ -1,4 +1,5 @@ #!/usr/bin/env python + from pridepy.authentication.authentication import Authentication from pridepy.util.api_handling import Util @@ -9,6 +10,7 @@ class Project: """ API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v2/" + V3_API_BASE_URL = "https://www.ebi.ac.uk/pride/ws/archive/v3/" PRIVATE_API_BASE_URL = "https://www.ebi.ac.uk/pride/private/ws/archive/v2/" def __init__(self): @@ -39,6 +41,18 @@ def get_projects(self, page_size, page, sort_direction, sort_conditions): response = Util.get_api_call(request_url, headers) return response.json() + async def stream_all_projects(self, output_file): + """ + Stream the metadata of all projects from the PRIDE v3 API as JSON into output_file. The project count is fetched first and passed on as the expected total. + """ + request_url = self.V3_API_BASE_URL + "projects/all" + count_request_url = self.V3_API_BASE_URL + "projects/count" + headers = {"Accept": "application/JSON"} + response = Util.get_api_call(count_request_url, headers) + total_records = response.json() + regex_search_pattern = '"projectDescription"' + await 
Util.stream_response_to_file(output_file, total_records, regex_search_pattern, request_url, headers) + def get_reanalysis_projects_by_accession(self, accession): """ search PRIDE projects by reanalysis accession diff --git a/pridepy/util/api_handling.py b/pridepy/util/api_handling.py index 3362b5c..7f7ace2 100644 --- a/pridepy/util/api_handling.py +++ b/pridepy/util/api_handling.py @@ -1,9 +1,13 @@ #!/usr/bin/env python +import re +import sys +import httpx import requests import logging from ratelimit import limits, sleep_and_retry from requests.adapters import HTTPAdapter +from tqdm import tqdm from urllib3.util.retry import Retry @@ -30,6 +34,31 @@ def get_api_call(url, headers=None): ) return response + @staticmethod + @sleep_and_retry + @limits(calls=1000, period=50) + async def stream_response_to_file(out_file, total_records, regex_search_pattern, url, headers=None): + """Stream the GET response of url line by line into out_file, advancing a progress bar (total=total_records) once per match of regex_search_pattern; raises httpx.HTTPStatusError on a non-2xx response and exits with status 1 if out_file is not writable.""" + # Initialize the progress bar + with tqdm(total=total_records, unit_scale=True) as pbar: + async with httpx.AsyncClient() as client: + # client.stream() yields the body incrementally instead of loading it all into memory + async with client.stream("GET", url, headers=headers) as response: + # Check if the response is successful + response.raise_for_status() + try: + with open(out_file, 'w', encoding='utf-8') as cfile: + # Iterate over the streaming content line by line + async for line in response.aiter_lines(): + if line: # Skip empty lines (common with text/event-stream) + cfile.write(line + "\n") + # Count every occurrence so several records on one line are all tallied + matches = re.findall(regex_search_pattern, line) + pbar.update(len(matches)) # Advance the progress bar once per detected record + except PermissionError: + print("[ERROR] No permissions to write to:", out_file) + sys.exit(1) + @staticmethod @sleep_and_retry @limits(calls=1000, period=50) diff --git a/requirements.txt b/requirements.txt index cbcb2c5..6750e0a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ plotly boto3 botocore tqdm -urllib3 \ No newline at end of file +urllib3 
+httpx \ No newline at end of file diff --git a/setup.py b/setup.py index bc884ff..171aa15 100644 --- a/setup.py +++ b/setup.py @@ -6,9 +6,9 @@ setup( name="pridepy", - version="0.0.4", + version="0.0.5", author="PRIDE Team", - author_email="pride-report@ebi.ac.uk", + author_email="pride-support@ebi.ac.uk", description="Python Client library for PRIDE Rest API", long_description=long_description, long_description_content_type="text/markdown",