Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYNPY-1514] Handle Expired Pre-Signed URLs #1126

Merged
merged 40 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
5cdabde
updates read_response_content
BWMac Aug 27, 2024
fb6bfb8
initial fix
BWMac Aug 28, 2024
fcc1f21
updates integration tests
BWMac Aug 28, 2024
60b8c88
fix kwarg order
BWMac Aug 28, 2024
2271fae
updates unit_test_download.py
BWMac Aug 28, 2024
39740d2
reoganizes download from url tests into a class
BWMac Aug 29, 2024
4513e49
adds test_download_expired_url
BWMac Aug 29, 2024
4cb9c80
async test
BWMac Aug 29, 2024
438a325
adds integration test
BWMac Aug 29, 2024
483c80e
pre-commit
BWMac Aug 29, 2024
db84fd8
fix client tests
BWMac Aug 29, 2024
d97e924
test patch newline
BWMac Aug 30, 2024
b34a317
fix test formatting
BWMac Aug 30, 2024
8114bf3
fix test
BWMac Aug 30, 2024
7b512b3
fix formatting
BWMac Aug 30, 2024
c8dc88e
fix md5 test formatting
BWMac Aug 30, 2024
22d26d1
fixes formatting
BWMac Aug 30, 2024
a9ca732
formatting
BWMac Aug 30, 2024
83963ba
more formatting
BWMac Aug 30, 2024
300b96c
adds missing raise
BWMac Aug 30, 2024
5715c28
fixes download unit tests
BWMac Aug 30, 2024
ad78f23
test formatting again
BWMac Aug 30, 2024
7d417b3
formatting
BWMac Aug 30, 2024
f229298
formatting
BWMac Aug 30, 2024
5229c1c
clean up
BWMac Aug 30, 2024
a78e20e
fix integration test
BWMac Aug 30, 2024
37671bc
formatting
BWMac Aug 30, 2024
a5b2d43
adds initial SSLZeroReturnError handling
BWMac Aug 30, 2024
4ec1a29
revert last
BWMac Aug 30, 2024
e6beb5f
adds SSLZeroReturnError to retry exceptions
BWMac Aug 30, 2024
f368f1f
pre-commit fix
BWMac Aug 30, 2024
53f247f
updates object id and object type names
BWMac Sep 3, 2024
f395d52
pre-commit fix
BWMac Sep 3, 2024
342c26a
adds handling for no expiration datetime
BWMac Sep 3, 2024
fa7ece9
fix test formatting
BWMac Sep 3, 2024
dc259ec
pre-commit
BWMac Sep 3, 2024
4194b54
adds expiration check to retry
BWMac Sep 3, 2024
272e41b
adds test for no expiration url
BWMac Sep 3, 2024
a1dc9bb
fix external bucket integration test
BWMac Sep 3, 2024
0aae5da
update _expire_url
BWMac Sep 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions synapseclient/core/download/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
PresignedUrlInfo,
PresignedUrlProvider,
_MultithreadedDownloader,
_pre_signed_url_expiration_time,
download_file,
)
from .download_functions import (
Expand All @@ -32,4 +33,5 @@
"PresignedUrlInfo",
"PresignedUrlProvider",
"_MultithreadedDownloader",
"_pre_signed_url_expiration_time",
]
7 changes: 6 additions & 1 deletion synapseclient/core/download/download_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,12 @@ def _get_file_size(
if otel_context:
context.attach(otel_context)
with syn._requests_session_storage.stream("GET", url) as response:
_raise_for_status_httpx(response=response, logger=syn.logger, verbose=debug)
_raise_for_status_httpx(
response=response,
logger=syn.logger,
verbose=debug,
read_response_content=False,
)
return int(response.headers["Content-Length"])


Expand Down
94 changes: 79 additions & 15 deletions synapseclient/core/download/download_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""This module handles the various ways that a user can download a file to Synapse."""

import asyncio
import datetime
import errno
import hashlib
import os
Expand All @@ -14,7 +15,10 @@
from tqdm.contrib.logging import logging_redirect_tqdm

from synapseclient.api.configuration_services import get_client_authenticated_s3_profile
from synapseclient.api.file_services import get_file_handle_for_download_async
from synapseclient.api.file_services import (
get_file_handle_for_download,
get_file_handle_for_download_async,
)
from synapseclient.core import exceptions, sts_transfer, utils
from synapseclient.core.constants import concrete_types
from synapseclient.core.constants.method_flags import (
Expand All @@ -25,6 +29,8 @@
from synapseclient.core.download import (
SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE,
DownloadRequest,
PresignedUrlProvider,
_pre_signed_url_expiration_time,
download_file,
)
from synapseclient.core.exceptions import (
Expand Down Expand Up @@ -500,11 +506,14 @@ def download_fn(
progress_bar = get_or_create_download_progress_bar(
file_size=1, postfix=synapse_id, synapse_client=syn
)

downloaded_path = await loop.run_in_executor(
syn._get_thread_pool_executor(asyncio_event_loop=loop),
lambda: download_from_url(
url=file_handle_result["preSignedURL"],
destination=destination,
entity_id=synapse_id,
file_handle_associate_type=entity_type,
file_handle_id=file_handle["id"],
expected_md5=file_handle.get("contentMd5"),
progress_bar=progress_bar,
Expand Down Expand Up @@ -610,6 +619,8 @@ async def download_from_url_multi_threaded(
def download_from_url(
url: str,
destination: str,
entity_id: Optional[str],
file_handle_associate_type: Optional[str],
file_handle_id: Optional[str] = None,
expected_md5: Optional[str] = None,
progress_bar: Optional[tqdm] = None,
Expand All @@ -622,6 +633,11 @@ def download_from_url(
Arguments:
url: The source of download
destination: The destination on local file system
entity_id: The id of the Synapse object that uses the FileHandle
e.g. "syn123"
file_handle_associate_type: The type of the Synapse object that uses the
FileHandle e.g. "FileEntity". Any of
<https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/file/FileHandleAssociateType.html>
file_handle_id: Optional. If given, the file will be given a temporary name that includes the file
handle id which allows resuming partial downloads of the same file from previous
sessions
Expand Down Expand Up @@ -719,21 +735,69 @@ def _ftp_report_hook(
else None
)

response = with_retry(
lambda url=url, range_header=range_header, auth=auth: client._requests_session.get(
url=url,
headers=client._generate_headers(range_header),
stream=True,
allow_redirects=False,
auth=auth,
),
verbose=client.debug,
**STANDARD_RETRY_PARAMS,
)
try:
url_has_expiration = "Expires" in urllib_urlparse.urlparse(url).query
BWMac marked this conversation as resolved.
Show resolved Hide resolved
url_is_expired = False
if url_has_expiration:
url_is_expired = datetime.datetime.now(
tz=datetime.timezone.utc
) + PresignedUrlProvider._TIME_BUFFER >= _pre_signed_url_expiration_time(
url
)
if url_is_expired:
response = get_file_handle_for_download(
file_handle_id=file_handle_id,
synapse_id=entity_id,
entity_type=file_handle_associate_type,
synapse_client=client,
)
url = response["preSignedURL"]
response = with_retry(
lambda url=url, range_header=range_header, auth=auth: client._requests_session.get(
url=url,
headers=client._generate_headers(range_header),
stream=True,
allow_redirects=False,
auth=auth,
),
verbose=client.debug,
**STANDARD_RETRY_PARAMS,
)
exceptions._raise_for_status(response, verbose=client.debug)
except SynapseHTTPError as err:
if err.response.status_code == 404:
if err.response.status_code == 403:
url_has_expiration = (
"Expires" in urllib_urlparse.urlparse(url).query
)
url_is_expired = False
if url_has_expiration:
url_is_expired = datetime.datetime.now(
tz=datetime.timezone.utc
) + PresignedUrlProvider._TIME_BUFFER >= _pre_signed_url_expiration_time(
url
)
if url_is_expired:
BWMac marked this conversation as resolved.
Show resolved Hide resolved
BWMac marked this conversation as resolved.
Show resolved Hide resolved
response = get_file_handle_for_download(
file_handle_id=file_handle_id,
synapse_id=entity_id,
entity_type=file_handle_associate_type,
synapse_client=client,
)
refreshed_url = response["preSignedURL"]
response = with_retry(
lambda url=refreshed_url, range_header=range_header, auth=auth: client._requests_session.get(
url=url,
headers=client._generate_headers(range_header),
stream=True,
allow_redirects=False,
auth=auth,
),
verbose=client.debug,
**STANDARD_RETRY_PARAMS,
)
else:
raise
elif err.response.status_code == 404:
raise SynapseError(f"Could not download the file at {url}") from err
elif (
err.response.status_code == 416
Expand All @@ -745,8 +809,8 @@ def _ftp_report_hook(
# If it fails the user can retry with another download.
shutil.move(temp_destination, destination)
break
raise

else:
raise
# handle redirects
if response.status_code in [301, 302, 303, 307, 308]:
url = response.headers["location"]
Expand Down
2 changes: 2 additions & 0 deletions synapseclient/core/retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@
"TimeoutException",
"ConnectError",
"ConnectTimeout",
# SSL Specific exceptions:
"SSLZeroReturnError",
]

DEBUG_EXCEPTION = "calling %s resulted in an Exception"
Expand Down
61 changes: 60 additions & 1 deletion tests/integration/synapseclient/core/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import random
import shutil
import tempfile
from datetime import datetime
from typing import Callable
from unittest.mock import Mock, patch
from urllib.parse import parse_qs, urlencode, urlparse

import httpx
import pytest
Expand All @@ -15,12 +17,27 @@
import synapseclient
import synapseclient.core.utils as utils
from synapseclient import Synapse
from synapseclient.api.file_services import get_file_handle_for_download
from synapseclient.core import exceptions
from synapseclient.core.download import download_from_url, download_functions
from synapseclient.core.exceptions import SynapseMd5MismatchError
from synapseclient.core.exceptions import SynapseHTTPError, SynapseMd5MismatchError
from synapseclient.models import File, Project
from tests.test_utils import spy_for_async_function


def _expire_url(url: str) -> str:
"""Expire a string URL by setting the expiration date to 1900-01-01T00:00:00Z"""
parsed_url = urlparse(url)
params = parse_qs(parsed_url.query)
expired_date = datetime(1970, 1, 1, 0, 0, 0)
params["X-Amz-Date"] = [expired_date.strftime("%Y%m%dT%H%M%SZ")]
params["X-Amz-Expires"] = ["30"]
params["Expires"] = ["0"]
new_query = urlencode(params, doseq=True)
new_url = parsed_url._replace(query=new_query).geturl()
return new_url


class TestDownloadCollisions:
"""Tests for downloading files with different collision states."""

Expand Down Expand Up @@ -227,6 +244,8 @@ async def test_download_check_md5(
await download_from_url(
url=entity_bad_md5.external_url,
destination=tempfile.gettempdir(),
entity_id=entity_bad_md5.id,
file_handle_associate_type="FileEntity",
file_handle_id=entity_bad_md5.data_file_handle_id,
expected_md5="2345a",
)
Expand Down Expand Up @@ -260,6 +279,8 @@ async def test_download_from_url_resume_partial_download(
path = download_from_url(
url=f"{syn.repoEndpoint}/entity/{file.id}/file",
destination=tempfile.gettempdir(),
entity_id=file.id,
file_handle_associate_type="FileEntity",
file_handle_id=file.data_file_handle_id,
expected_md5=file.file_handle.content_md5,
)
Expand All @@ -271,6 +292,44 @@ async def test_download_from_url_resume_partial_download(
# `caplog: pytest.LogCaptureFixture` would need to be added to the args.
# assert f"Resuming partial download to {tmp_path}. {truncated_size}/{original_size}.0 bytes already transferred." in caplog.text

async def test_download_expired_url(
self,
mocker: MockerFixture,
project_model: Project,
) -> None:
"""Tests that a file download is retried when URL is expired."""
# GIVEN a file stored in synapse
original_file = utils.make_bogus_data_file(40000)
file = await File(path=original_file, parent_id=project_model.id).store_async()

# AND an expired preSignedURL for that file
file_handle_response = get_file_handle_for_download(
file_handle_id=file.data_file_handle_id,
synapse_id=file.id,
entity_type="FileEntity",
)
url = file_handle_response["preSignedURL"]
expired_url = _expire_url(url)

# get_file_handle_for_download only runs if we are handling an expired URL
spy_file_handle = mocker.spy(download_functions, "get_file_handle_for_download")

# WHEN I download the file with a simulated expired url
path = download_from_url(
url=expired_url,
destination=tempfile.gettempdir(),
entity_id=file.id,
file_handle_associate_type="FileEntity",
file_handle_id=file.data_file_handle_id,
expected_md5=file.file_handle.content_md5,
)

# THEN the expired URL is refreshed
spy_file_handle.assert_called_once()

# THEN the file is downloaded anyway AND matches the original
assert filecmp.cmp(original_file, path), "File comparison failed"

async def test_download_via_get(
self,
mocker: MockerFixture,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ async def test_sts_external_storage_location(self) -> None:
Bucket=bucket_name,
Key=remote_key,
)
assert "Access Denied" in str(ex_cm.value)
assert "AccessDenied" in str(ex_cm.value)

# WHEN I put an object directly using the STS read/write credentials
s3_write_client.upload_file(
Expand Down
Loading
Loading