Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYNPY-1514] Handle Expired Pre-Signed URLs #1126

Merged
merged 40 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
5cdabde
updates read_response_content
BWMac Aug 27, 2024
fb6bfb8
initial fix
BWMac Aug 28, 2024
fcc1f21
updates integration tests
BWMac Aug 28, 2024
60b8c88
fix kwarg order
BWMac Aug 28, 2024
2271fae
updates unit_test_download.py
BWMac Aug 28, 2024
39740d2
reoganizes download from url tests into a class
BWMac Aug 29, 2024
4513e49
adds test_download_expired_url
BWMac Aug 29, 2024
4cb9c80
async test
BWMac Aug 29, 2024
438a325
adds integration test
BWMac Aug 29, 2024
483c80e
pre-commit
BWMac Aug 29, 2024
db84fd8
fix client tests
BWMac Aug 29, 2024
d97e924
test patch newline
BWMac Aug 30, 2024
b34a317
fix test formatting
BWMac Aug 30, 2024
8114bf3
fix test
BWMac Aug 30, 2024
7b512b3
fix formatting
BWMac Aug 30, 2024
c8dc88e
fix md5 test formatting
BWMac Aug 30, 2024
22d26d1
fixes formatting
BWMac Aug 30, 2024
a9ca732
formatting
BWMac Aug 30, 2024
83963ba
more formatting
BWMac Aug 30, 2024
300b96c
adds missing raise
BWMac Aug 30, 2024
5715c28
fixes download unit tests
BWMac Aug 30, 2024
ad78f23
test formatting again
BWMac Aug 30, 2024
7d417b3
formatting
BWMac Aug 30, 2024
f229298
formatting
BWMac Aug 30, 2024
5229c1c
clean up
BWMac Aug 30, 2024
a78e20e
fix integration test
BWMac Aug 30, 2024
37671bc
formatting
BWMac Aug 30, 2024
a5b2d43
adds initial SSLZeroReturnError handling
BWMac Aug 30, 2024
4ec1a29
revert last
BWMac Aug 30, 2024
e6beb5f
adds SSLZeroReturnError to retry exceptions
BWMac Aug 30, 2024
f368f1f
pre-commit fix
BWMac Aug 30, 2024
53f247f
updates object id and object type names
BWMac Sep 3, 2024
f395d52
pre-commit fix
BWMac Sep 3, 2024
342c26a
adds handling for no expiration datetime
BWMac Sep 3, 2024
fa7ece9
fix test formatting
BWMac Sep 3, 2024
dc259ec
pre-commit
BWMac Sep 3, 2024
4194b54
adds expiration check to retry
BWMac Sep 3, 2024
272e41b
adds test for no expiration url
BWMac Sep 3, 2024
a1dc9bb
fix external bucket integration test
BWMac Sep 3, 2024
0aae5da
update _expire_url
BWMac Sep 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions synapseclient/core/download/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
PresignedUrlInfo,
PresignedUrlProvider,
_MultithreadedDownloader,
_pre_signed_url_expiration_time,
download_file,
)
from .download_functions import (
Expand All @@ -32,4 +33,5 @@
"PresignedUrlInfo",
"PresignedUrlProvider",
"_MultithreadedDownloader",
"_pre_signed_url_expiration_time",
]
7 changes: 6 additions & 1 deletion synapseclient/core/download/download_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,12 @@ def _get_file_size(
if otel_context:
context.attach(otel_context)
with syn._requests_session_storage.stream("GET", url) as response:
_raise_for_status_httpx(response=response, logger=syn.logger, verbose=debug)
_raise_for_status_httpx(
response=response,
logger=syn.logger,
verbose=debug,
read_response_content=False,
)
return int(response.headers["Content-Length"])


Expand Down
71 changes: 56 additions & 15 deletions synapseclient/core/download/download_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""This module handles the various ways that a user can download a file to Synapse."""

import asyncio
import datetime
import errno
import hashlib
import os
Expand All @@ -14,7 +15,10 @@
from tqdm.contrib.logging import logging_redirect_tqdm

from synapseclient.api.configuration_services import get_client_authenticated_s3_profile
from synapseclient.api.file_services import get_file_handle_for_download_async
from synapseclient.api.file_services import (
get_file_handle_for_download,
get_file_handle_for_download_async,
)
from synapseclient.core import exceptions, sts_transfer, utils
from synapseclient.core.constants import concrete_types
from synapseclient.core.constants.method_flags import (
Expand All @@ -25,6 +29,8 @@
from synapseclient.core.download import (
SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE,
DownloadRequest,
PresignedUrlProvider,
_pre_signed_url_expiration_time,
download_file,
)
from synapseclient.core.exceptions import (
Expand Down Expand Up @@ -500,11 +506,14 @@ def download_fn(
progress_bar = get_or_create_download_progress_bar(
file_size=1, postfix=synapse_id, synapse_client=syn
)

downloaded_path = await loop.run_in_executor(
syn._get_thread_pool_executor(asyncio_event_loop=loop),
lambda: download_from_url(
url=file_handle_result["preSignedURL"],
destination=destination,
object_id=synapse_id,
object_type=entity_type,
file_handle_id=file_handle["id"],
expected_md5=file_handle.get("contentMd5"),
progress_bar=progress_bar,
Expand Down Expand Up @@ -610,6 +619,8 @@ async def download_from_url_multi_threaded(
def download_from_url(
url: str,
destination: str,
object_id: Optional[str],
BWMac marked this conversation as resolved.
Show resolved Hide resolved
object_type: Optional[str],
file_handle_id: Optional[str] = None,
expected_md5: Optional[str] = None,
progress_bar: Optional[tqdm] = None,
Expand All @@ -622,6 +633,11 @@ def download_from_url(
Arguments:
url: The source of download
destination: The destination on local file system
object_id: The id of the Synapse object that uses the FileHandle
e.g. "syn123"
object_type: The type of the Synapse object that uses the
FileHandle e.g. "FileEntity". Any of
<https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/file/FileHandleAssociateType.html>
file_handle_id: Optional. If given, the file will be given a temporary name that includes the file
handle id which allows resuming partial downloads of the same file from previous
sessions
Expand Down Expand Up @@ -719,21 +735,46 @@ def _ftp_report_hook(
else None
)

response = with_retry(
lambda url=url, range_header=range_header, auth=auth: client._requests_session.get(
url=url,
headers=client._generate_headers(range_header),
stream=True,
allow_redirects=False,
auth=auth,
),
verbose=client.debug,
**STANDARD_RETRY_PARAMS,
)
try:
response = with_retry(
lambda url=url, range_header=range_header, auth=auth: client._requests_session.get(
url=url,
headers=client._generate_headers(range_header),
stream=True,
allow_redirects=False,
auth=auth,
),
verbose=client.debug,
**STANDARD_RETRY_PARAMS,
)
exceptions._raise_for_status(response, verbose=client.debug)
except SynapseHTTPError as err:
if err.response.status_code == 404:
if err.response.status_code == 403:
url_is_expired = datetime.datetime.now(
BWMac marked this conversation as resolved.
Show resolved Hide resolved
tz=datetime.timezone.utc
) + PresignedUrlProvider._TIME_BUFFER >= _pre_signed_url_expiration_time(
url
)
if url_is_expired:
BWMac marked this conversation as resolved.
Show resolved Hide resolved
BWMac marked this conversation as resolved.
Show resolved Hide resolved
response = get_file_handle_for_download(
file_handle_id=file_handle_id,
synapse_id=object_id,
entity_type=object_type,
synapse_client=client,
)
refreshed_url = response["preSignedURL"]
response = with_retry(
lambda url=refreshed_url, range_header=range_header, auth=auth: client._requests_session.get(
url=url,
headers=client._generate_headers(range_header),
stream=True,
allow_redirects=False,
auth=auth,
),
verbose=client.debug,
**STANDARD_RETRY_PARAMS,
)
elif err.response.status_code == 404:
raise SynapseError(f"Could not download the file at {url}") from err
elif (
err.response.status_code == 416
Expand All @@ -745,8 +786,8 @@ def _ftp_report_hook(
# If it fails the user can retry with another download.
shutil.move(temp_destination, destination)
break
raise

else:
raise
# handle redirects
if response.status_code in [301, 302, 303, 307, 308]:
url = response.headers["location"]
Expand Down
129 changes: 90 additions & 39 deletions tests/integration/synapseclient/core/test_download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Integration tests around downloading files from Synapse."""

import datetime
import filecmp
import os
import random
Expand All @@ -10,13 +11,15 @@

import httpx
import pytest
import requests
from pytest_mock import MockerFixture

import synapseclient
import synapseclient.core.utils as utils
from synapseclient import Synapse
from synapseclient.core import exceptions
from synapseclient.core.download import download_from_url, download_functions
from synapseclient.core.exceptions import SynapseMd5MismatchError
from synapseclient.core.exceptions import SynapseHTTPError, SynapseMd5MismatchError
from synapseclient.models import File, Project
from tests.test_utils import spy_for_async_function

Expand Down Expand Up @@ -227,6 +230,8 @@ async def test_download_check_md5(
await download_from_url(
url=entity_bad_md5.external_url,
destination=tempfile.gettempdir(),
object_id=entity_bad_md5.id,
object_type="FileEntity",
file_handle_id=entity_bad_md5.data_file_handle_id,
expected_md5="2345a",
)
Expand Down Expand Up @@ -260,6 +265,8 @@ async def test_download_from_url_resume_partial_download(
path = download_from_url(
url=f"{syn.repoEndpoint}/entity/{file.id}/file",
destination=tempfile.gettempdir(),
object_id=file.id,
object_type="FileEntity",
file_handle_id=file.data_file_handle_id,
expected_md5=file.file_handle.content_md5,
)
Expand All @@ -271,6 +278,37 @@ async def test_download_from_url_resume_partial_download(
# `caplog: pytest.LogCaptureFixture` would need to be added to the args.
# assert f"Resuming partial download to {tmp_path}. {truncated_size}/{original_size}.0 bytes already transferred." in caplog.text

async def test_download_expired_url(
self,
syn: Synapse,
project_model: Project,
) -> None:
# simulate expired url for file
with patch.object(
exceptions,
"_raise_for_status",
side_effect=SynapseHTTPError(response=httpx.Response(403)),
):
"""Tests that a file download is retried when ."""
# GIVEN a file stored in synapse
original_file = utils.make_bogus_data_file(40000)
file = await File(
path=original_file, parent_id=project_model.id
).store_async()

# WHEN I download the file with a simulated expired url
path = download_from_url(
url=f"{syn.repoEndpoint}/entity/{file.id}/file",
destination=tempfile.gettempdir(),
object_id=file.id,
object_type="FileEntity",
file_handle_id=file.data_file_handle_id,
expected_md5=file.file_handle.content_md5,
)

# THEN the file is downloaded anyway AND matches the original
assert filecmp.cmp(original_file, path), "File comparison failed"

async def test_download_via_get(
self,
mocker: MockerFixture,
Expand Down Expand Up @@ -383,14 +421,17 @@ async def test_download_from_url_multi_threaded(
os.remove(file_path)
assert not os.path.exists(file_path)

with patch.object(
synapseclient.core.download.download_functions,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
), patch.object(
synapseclient.core.download.download_async,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
with (
patch.object(
synapseclient.core.download.download_functions,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
),
patch.object(
synapseclient.core.download.download_async,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
),
):
# WHEN I download the file with multiple parts
file = await File(id=file.id, path=os.path.dirname(file.path)).get_async()
Expand Down Expand Up @@ -452,21 +493,26 @@ def mock_httpx_send(**kwargs):
# Call the real send function
return client.send(**kwargs)

with patch.object(
synapseclient.core.download.download_functions,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
), patch.object(
synapseclient.core.download.download_async,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
), patch.object(
syn._requests_session_storage,
"send",
mock_httpx_send,
), patch(
"synapseclient.core.download.download_async.DEFAULT_MAX_BACK_OFF_ASYNC",
0.2,
with (
patch.object(
synapseclient.core.download.download_functions,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
),
patch.object(
synapseclient.core.download.download_async,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
),
patch.object(
syn._requests_session_storage,
"send",
mock_httpx_send,
),
patch(
"synapseclient.core.download.download_async.DEFAULT_MAX_BACK_OFF_ASYNC",
0.2,
),
):
# WHEN I download the file with multiple parts
file = await File(id=file.id, path=os.path.dirname(file.path)).get_async()
Expand Down Expand Up @@ -526,21 +572,26 @@ def mock_httpx_send(**kwargs):
# Call the real send function
return client.send(**kwargs)

with patch.object(
synapseclient.core.download.download_functions,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
), patch.object(
synapseclient.core.download.download_async,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
), patch.object(
syn._requests_session_storage,
"send",
mock_httpx_send,
), patch(
"synapseclient.core.download.download_async.DEFAULT_MAX_BACK_OFF_ASYNC",
0.2,
with (
patch.object(
synapseclient.core.download.download_functions,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
),
patch.object(
synapseclient.core.download.download_async,
"SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE",
new=500,
),
patch.object(
syn._requests_session_storage,
"send",
mock_httpx_send,
),
patch(
"synapseclient.core.download.download_async.DEFAULT_MAX_BACK_OFF_ASYNC",
0.2,
),
):
# WHEN I download the file with multiple parts
file = await File(id=file.id, path=os.path.dirname(file.path)).get_async()
Expand Down
Loading
Loading