diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index cf2d879447..c0e1860957 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -1998,10 +1998,23 @@ def get( def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload: # Docstring inherited + # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the + # client should have time to download a large file with retries if + # needed, but short enough that it will become obvious quickly that + # these URLs expire. + # From a strictly technical standpoint there is no reason this + # shouldn't be a day or more, but there seems to be a political issue + # where people think there is a risk of end users posting presigned + # URLs for people without access rights to download. + url_expiration_time_seconds = 1 * 60 * 60 + def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo: location, file_info = info return FileDatastoreGetPayloadFileInfo( - url=location.uri.geturl(), datastoreRecords=file_info.to_simple() + url=location.uri.generate_presigned_get_url( + expiration_time_seconds=url_expiration_time_seconds + ), + datastoreRecords=file_info.to_simple(), ) return FileDatastoreGetPayload( diff --git a/python/lsst/daf/butler/datastores/fileDatastoreClient.py b/python/lsst/daf/butler/datastores/fileDatastoreClient.py index 9eb09b52be..556eacb7a4 100644 --- a/python/lsst/daf/butler/datastores/fileDatastoreClient.py +++ b/python/lsst/daf/butler/datastores/fileDatastoreClient.py @@ -12,18 +12,18 @@ generate_datastore_get_information, get_dataset_as_python_object_from_get_info, ) +from pydantic import AnyHttpUrl class FileDatastoreGetPayloadFileInfo(_BaseModelCompat): """Information required to read a single file stored in `FileDatastore`""" - # TODO DM-41879: Allowing arbitrary URLs here is a severe security issue, - # since it allows the server to trick the client into fetching data from - # any file on its local filesystem or from remote storage using credentials - # laying around in the environment. This should be restricted to only - # HTTP, but we don't yet have a means of mocking out HTTP gets in tests. - url: str - """An absolute URL that can be used to read the file""" + # This is intentionally restricted to HTTP for security reasons. Allowing + # arbitrary URLs here would allow the server to trick the client into + # fetching data from any file on its local filesystem or from remote + # storage using credentials laying around in the environment. + url: AnyHttpUrl + """An HTTP URL that can be used to read the file""" datastoreRecords: SerializedStoredFileInfo """`FileDatastore` metadata records for this file""" @@ -76,7 +76,7 @@ def get_dataset_as_python_object( The retrieved artifact, converted to a Python object """ fileLocations: list[DatasetLocationInformation] = [ - (Location(None, file_info.url), StoredFileInfo.from_simple(file_info.datastoreRecords)) + (Location(None, str(file_info.url)), StoredFileInfo.from_simple(file_info.datastoreRecords)) for file_info in payload.file_info ] diff --git a/python/lsst/daf/butler/tests/utils.py b/python/lsst/daf/butler/tests/utils.py index b2042bf8cd..cf7f2d23d0 100644 --- a/python/lsst/daf/butler/tests/utils.py +++ b/python/lsst/daf/butler/tests/utils.py @@ -313,6 +313,12 @@ def addDataset( datasetType : ``DatasetType``, optional The dataset type of the added dataset. If `None`, will use the default dataset type. + + Returns + ------- + datasetRef : `DatasetRef` + A reference to the added dataset. + """ if run: self.butler.registry.registerCollection(run, type=CollectionType.RUN) diff --git a/tests/test_server.py b/tests/test_server.py index 2ff92670cb..00059959c4 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -245,7 +245,7 @@ def test_get(self): def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef: run = "corrupted-run" ref = repo.addDataset({"instrument": "DummyCamComp", "visit": 423}, run=run) - uris = repo.butler.getURIs(ref, run=run) + uris = repo.butler.getURIs(ref) oneOfTheComponents = list(uris.componentURIs.values())[0] oneOfTheComponents.write("corrupted data") return ref