diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py
index cf2d879447..c0e1860957 100644
--- a/python/lsst/daf/butler/datastores/fileDatastore.py
+++ b/python/lsst/daf/butler/datastores/fileDatastore.py
@@ -1998,10 +1998,23 @@ def get(
     def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
         # Docstring inherited

+        # 1 hour. Chosen somewhat arbitrarily -- this is long enough that the
+        # client should have time to download a large file with retries if
+        # needed, but short enough that it will become obvious quickly that
+        # these URLs expire.
+        # From a strictly technical standpoint there is no reason this
+        # shouldn't be a day or more, but there seems to be a political issue
+        # where people think there is a risk of end users posting presigned
+        # URLs for people without access rights to download.
+        url_expiration_time_seconds = 1 * 60 * 60
+
         def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
             location, file_info = info
             return FileDatastoreGetPayloadFileInfo(
-                url=location.uri.geturl(), datastoreRecords=file_info.to_simple()
+                url=location.uri.generate_presigned_get_url(
+                    expiration_time_seconds=url_expiration_time_seconds
+                ),
+                datastoreRecords=file_info.to_simple(),
             )

         return FileDatastoreGetPayload(
diff --git a/python/lsst/daf/butler/datastores/fileDatastoreClient.py b/python/lsst/daf/butler/datastores/fileDatastoreClient.py
index 9eb09b52be..556eacb7a4 100644
--- a/python/lsst/daf/butler/datastores/fileDatastoreClient.py
+++ b/python/lsst/daf/butler/datastores/fileDatastoreClient.py
@@ -12,18 +12,18 @@
     generate_datastore_get_information,
     get_dataset_as_python_object_from_get_info,
 )
+from pydantic import AnyHttpUrl


 class FileDatastoreGetPayloadFileInfo(_BaseModelCompat):
     """Information required to read a single file stored in `FileDatastore`"""

-    # TODO DM-41879: Allowing arbitrary URLs here is a severe security issue,
-    # since it allows the server to trick the client into fetching data from
-    # any file on its local filesystem or from remote storage using credentials
-    # laying around in the environment. This should be restricted to only
-    # HTTP, but we don't yet have a means of mocking out HTTP gets in tests.
-    url: str
-    """An absolute URL that can be used to read the file"""
+    # This is intentionally restricted to HTTP for security reasons. Allowing
+    # arbitrary URLs here would allow the server to trick the client into
+    # fetching data from any file on its local filesystem or from remote
+    # storage using credentials laying around in the environment.
+    url: AnyHttpUrl
+    """An HTTP URL that can be used to read the file"""

     datastoreRecords: SerializedStoredFileInfo
     """`FileDatastore` metadata records for this file"""
@@ -76,7 +76,7 @@ def get_dataset_as_python_object(
         The retrieved artifact, converted to a Python object
     """
     fileLocations: list[DatasetLocationInformation] = [
-        (Location(None, file_info.url), StoredFileInfo.from_simple(file_info.datastoreRecords))
+        (Location(None, str(file_info.url)), StoredFileInfo.from_simple(file_info.datastoreRecords))
         for file_info in payload.file_info
     ]

diff --git a/python/lsst/daf/butler/datastores/file_datastore/get.py b/python/lsst/daf/butler/datastores/file_datastore/get.py
index c10f1914f4..d0b592f426 100644
--- a/python/lsst/daf/butler/datastores/file_datastore/get.py
+++ b/python/lsst/daf/butler/datastores/file_datastore/get.py
@@ -207,13 +207,14 @@ def _read_artifact_into_memory(
     # Do not do this if the size is negative since that indicates
     # we do not know.
     recorded_size = getInfo.info.file_size
-    resource_size = uri.size()
-    if recorded_size >= 0 and resource_size != recorded_size:
-        raise RuntimeError(
-            "Integrity failure in Datastore. "
-            f"Size of file {uri} ({resource_size}) "
-            f"does not match size recorded in registry of {recorded_size}"
-        )
+
+    def check_resource_size(resource_size: int) -> None:
+        if recorded_size >= 0 and resource_size != recorded_size:
+            raise RuntimeError(
+                "Integrity failure in Datastore. "
+                f"Size of file {uri} ({resource_size}) "
+                f"does not match size recorded in registry of {recorded_size}"
+            )

     # For the general case we have choices for how to proceed.
     # 1. Always use a local file (downloading the remote resource to a
@@ -225,7 +226,7 @@
     formatter = getInfo.formatter
     nbytes_max = 10_000_000  # Arbitrary number that we can tune

-    if resource_size <= nbytes_max and formatter.can_read_bytes():
+    if recorded_size >= 0 and recorded_size <= nbytes_max and formatter.can_read_bytes():
         with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file:
             if cached_file is not None:
                 desired_uri = cached_file
@@ -235,6 +236,7 @@
                 msg = ""
             with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)):
                 serializedDataset = desired_uri.read()
+            check_resource_size(len(serializedDataset))
             log.debug(
                 "Deserializing %s from %d bytes from location %s with formatter %s",
                 f"component {getInfo.component}" if isComponent else "",
@@ -271,6 +273,7 @@
            location_updated = True

        with uri.as_local() as local_uri:
+           check_resource_size(local_uri.size())
            can_be_cached = False
            if uri != local_uri:
                # URI was remote and file was downloaded
diff --git a/python/lsst/daf/butler/tests/utils.py b/python/lsst/daf/butler/tests/utils.py
index df4238c254..cf7f2d23d0 100644
--- a/python/lsst/daf/butler/tests/utils.py
+++ b/python/lsst/daf/butler/tests/utils.py
@@ -39,7 +39,7 @@
 import astropy
 from astropy.table import Table as AstropyTable

-from .. import Butler, Config, StorageClassFactory, Timespan
+from .. import Butler, Config, DatasetRef, StorageClassFactory, Timespan
 from ..registry import CollectionType
 from ..tests import MetricsExample, addDatasetType

@@ -221,6 +221,10 @@ class MetricTestRepo:
         The location of the repository, to pass to ``Butler.makeRepo``.
     configFile : `str`
         The path to the config file, to pass to ``Butler.makeRepo``.
+    forceConfigRoot : `bool`, optional
+        If `False`, any values present in the supplied ``config`` that
+        would normally be reset are not overridden and will appear
+        directly in the output config. Passed to ``Butler.makeRepo``.
     """

     METRICS_EXAMPLE_SUMMARY = {"AM1": 5.2, "AM2": 30.6}
@@ -237,9 +241,9 @@ def _makeExampleMetrics() -> MetricsExample:
             [563, 234, 456.7, 752, 8, 9, 27],
         )

-    def __init__(self, root: str, configFile: str) -> None:
+    def __init__(self, root: str, configFile: str, forceConfigRoot: bool = True) -> None:
         self.root = root
-        Butler.makeRepo(self.root, config=Config(configFile))
+        Butler.makeRepo(self.root, config=Config(configFile), forceConfigRoot=forceConfigRoot)
         butlerConfigFile = os.path.join(self.root, "butler.yaml")
         self.storageClassFactory = StorageClassFactory()
         self.storageClassFactory.addFromConfig(butlerConfigFile)
@@ -291,7 +295,7 @@ def __init__(self, root: str, configFile: str) -> None:

     def addDataset(
         self, dataId: dict[str, Any], run: str | None = None, datasetType: DatasetType | None = None
-    ) -> None:
+    ) -> DatasetRef:
         """Create a new example metric and add it to the named run with the
         given dataId.

@@ -309,8 +313,16 @@ def addDataset(
         datasetType : ``DatasetType``, optional
             The dataset type of the added dataset. If `None`, will use
             the default dataset type.
+
+        Returns
+        -------
+        datasetRef : `DatasetRef`
+            A reference to the added dataset.
+
         """
         if run:
             self.butler.registry.registerCollection(run, type=CollectionType.RUN)
         metric = self._makeExampleMetrics()
-        self.butler.put(metric, self.datasetType if datasetType is None else datasetType, dataId, run=run)
+        return self.butler.put(
+            metric, self.datasetType if datasetType is None else datasetType, dataId, run=run
+        )
diff --git a/tests/test_butler.py b/tests/test_butler.py
index 5f6f5f6851..437c3725b4 100644
--- a/tests/test_butler.py
+++ b/tests/test_butler.py
@@ -48,7 +48,7 @@
 try:
     import boto3
     import botocore
-    from lsst.resources.s3utils import setAwsEnvCredentials, unsetAwsEnvCredentials
+    from lsst.resources.s3utils import clean_test_environment_for_s3
     from moto import mock_s3  # type: ignore[import]
 except ImportError:
     boto3 = None
@@ -114,13 +114,7 @@ def mock_s3(*args: Any, **kwargs: Any) -> Any:  # type: ignore[no-untyped-def]


 def clean_environment() -> None:
     """Remove external environment variables that affect the tests."""
-    for k in (
-        "DAF_BUTLER_REPOSITORY_INDEX",
-        "S3_ENDPOINT_URL",
-        "AWS_ACCESS_KEY_ID",
-        "AWS_SECRET_ACCESS_KEY",
-        "AWS_SHARED_CREDENTIALS_FILE",
-    ):
+    for k in ("DAF_BUTLER_REPOSITORY_INDEX",):
         os.environ.pop(k, None)

@@ -1993,11 +1987,9 @@ def setUp(self) -> None:
         self.bucketName = uri.netloc

         # Enable S3 mocking of tests.
+        self.enterContext(clean_test_environment_for_s3())
         self.mock_s3.start()

-        # set up some fake credentials if they do not exist
-        self.usingDummyCredentials = setAwsEnvCredentials()
-
         if self.useTempRoot:
             self.root = self.genRoot()
         rooturi = f"s3://{self.bucketName}/{self.root}"
@@ -2035,10 +2027,6 @@ def tearDown(self) -> None:
         # Stop the S3 mock.
         self.mock_s3.stop()

-        # unset any potentially set dummy credentials
-        if self.usingDummyCredentials:
-            unsetAwsEnvCredentials()
-
         if self.reg_dir is not None and os.path.exists(self.reg_dir):
             shutil.rmtree(self.reg_dir, ignore_errors=True)

diff --git a/tests/test_server.py b/tests/test_server.py
index 8430e91202..00059959c4 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -35,6 +35,8 @@
     from lsst.daf.butler.remote_butler import RemoteButler
     from lsst.daf.butler.remote_butler.server import Factory, app
     from lsst.daf.butler.remote_butler.server._dependencies import factory_dependency
+    from lsst.resources.s3utils import clean_test_environment_for_s3, getS3Client
+    from moto import mock_s3
 except ImportError:
     TestClient = None
     app = None
@@ -68,11 +70,28 @@ class ButlerClientServerTestCase(unittest.TestCase):

     @classmethod
     def setUpClass(cls):
+        # Set up a mock S3 environment using Moto. Moto also monkeypatches the
+        # `requests` library so that any HTTP requests to presigned S3 URLs get
+        # redirected to the mocked S3.
+        # Note that all files are stored in memory.
+        cls.enterClassContext(clean_test_environment_for_s3())
+        cls.enterClassContext(mock_s3())
+        bucket_name = "anybucketname"  # matches s3Datastore.yaml
+        getS3Client().create_bucket(Bucket=bucket_name)
+
         cls.storageClassFactory = StorageClassFactory()

         # First create a butler and populate it.
         cls.root = makeTestTempDir(TESTDIR)
-        cls.repo = MetricTestRepo(root=cls.root, configFile=os.path.join(TESTDIR, "config/basic/butler.yaml"))
+        cls.repo = MetricTestRepo(
+            root=cls.root,
+            configFile=os.path.join(TESTDIR, "config/basic/butler-s3store.yaml"),
+            forceConfigRoot=False,
+        )
+
+        # Add a file with corrupted data for testing error conditions
+        cls.dataset_with_corrupted_data = _create_corrupted_dataset(cls.repo)
+
         # Override the server's Butler initialization to point at our test repo
         server_butler = Butler.from_config(cls.root, writeable=True)

@@ -212,6 +231,9 @@ def test_get(self):
         with self.assertRaises(LookupError):
             self.butler.get(invalid_ref)

+        with self.assertRaises(RuntimeError):
+            self.butler.get(self.dataset_with_corrupted_data)
+
         # Test storage class override
         new_sc = self.storageClassFactory.getStorageClass("MetricsConversion")
         converted = self.butler.get(ref, storageClass=new_sc)
@@ -220,5 +242,14 @@
         self.assertEqual(metric, converted)


+def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef:
+    run = "corrupted-run"
+    ref = repo.addDataset({"instrument": "DummyCamComp", "visit": 423}, run=run)
+    uris = repo.butler.getURIs(ref)
+    oneOfTheComponents = list(uris.componentURIs.values())[0]
+    oneOfTheComponents.write("corrupted data")
+    return ref
+
+
 if __name__ == "__main__":
     unittest.main()
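
For reference, a minimal sketch (not part of the patch) of how a client might consume the presigned URLs carried in the payload produced by prepare_get_for_external_client() above. The payload attribute names follow the models touched in this diff; the helper name download_payload_files and the use of the requests library are illustrative assumptions, not part of the Butler API.

    # Hypothetical client-side usage sketch, assuming a payload obtained from
    # prepare_get_for_external_client() on the server side.
    import requests

    from lsst.daf.butler.datastores.fileDatastoreClient import FileDatastoreGetPayload


    def download_payload_files(payload: FileDatastoreGetPayload) -> list[bytes]:
        """Fetch each file referenced by the payload via its presigned HTTP URL."""
        blobs: list[bytes] = []
        for file_info in payload.file_info:
            # Each URL is presigned and expires after one hour, so download promptly.
            response = requests.get(str(file_info.url), timeout=60)
            response.raise_for_status()
            blobs.append(response.content)
        return blobs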