Skip to content

Commit

Permalink
Send S3 presigned URLs to the client
Browse files Browse the repository at this point in the history
In the currently planned use case for Butler client/server, the server uses S3 for storing artifacts and gives clients access to them by presigning URLs.
  • Loading branch information
dhirving committed Dec 8, 2023
1 parent 925c20b commit a633a1a
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 10 deletions.
15 changes: 14 additions & 1 deletion python/lsst/daf/butler/datastores/fileDatastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -1998,10 +1998,23 @@ def get(
def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
# Docstring inherited

# 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
# client should have time to download a large file with retries if
# needed, but short enough that it will become obvious quickly that
# these URLs expire.
# From a strictly technical standpoint there is no reason this
# shouldn't be a day or more, but there seems to be a political issue
# where people think there is a risk of end users posting presigned
# URLs for people without access rights to download.
url_expiration_time_seconds = 1 * 60 * 60

def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
location, file_info = info
return FileDatastoreGetPayloadFileInfo(
url=location.uri.geturl(), datastoreRecords=file_info.to_simple()
url=location.uri.generate_presigned_get_url(
expiration_time_seconds=url_expiration_time_seconds
),
datastoreRecords=file_info.to_simple(),
)

return FileDatastoreGetPayload(
Expand Down
16 changes: 8 additions & 8 deletions python/lsst/daf/butler/datastores/fileDatastoreClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,18 @@
generate_datastore_get_information,
get_dataset_as_python_object_from_get_info,
)
from pydantic import AnyHttpUrl


class FileDatastoreGetPayloadFileInfo(_BaseModelCompat):
"""Information required to read a single file stored in `FileDatastore`"""

# TODO DM-41879: Allowing arbitrary URLs here is a severe security issue,
# since it allows the server to trick the client into fetching data from
# any file on its local filesystem or from remote storage using credentials
# laying around in the environment. This should be restricted to only
# HTTP, but we don't yet have a means of mocking out HTTP gets in tests.
url: str
"""An absolute URL that can be used to read the file"""
# This is intentionally restricted to HTTP for security reasons. Allowing
# arbitrary URLs here would allow the server to trick the client into
# fetching data from any file on its local filesystem or from remote
# storage using credentials laying around in the environment.
url: AnyHttpUrl
"""An HTTP URL that can be used to read the file"""

datastoreRecords: SerializedStoredFileInfo
"""`FileDatastore` metadata records for this file"""
Expand Down Expand Up @@ -76,7 +76,7 @@ def get_dataset_as_python_object(
The retrieved artifact, converted to a Python object
"""
fileLocations: list[DatasetLocationInformation] = [
(Location(None, file_info.url), StoredFileInfo.from_simple(file_info.datastoreRecords))
(Location(None, str(file_info.url)), StoredFileInfo.from_simple(file_info.datastoreRecords))
for file_info in payload.file_info
]

Expand Down
6 changes: 6 additions & 0 deletions python/lsst/daf/butler/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,12 @@ def addDataset(
datasetType : ``DatasetType``, optional
The dataset type of the added dataset. If `None`, will use the
default dataset type.
Returns
-------
datasetRef : `DatasetRef`
A reference to the added dataset.
"""
if run:
self.butler.registry.registerCollection(run, type=CollectionType.RUN)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def test_get(self):
def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef:
run = "corrupted-run"
ref = repo.addDataset({"instrument": "DummyCamComp", "visit": 423}, run=run)
uris = repo.butler.getURIs(ref, run=run)
uris = repo.butler.getURIs(ref)
oneOfTheComponents = list(uris.componentURIs.values())[0]
oneOfTheComponents.write("corrupted data")
return ref
Expand Down

0 comments on commit a633a1a

Please sign in to comment.