Skip to content

Commit

Permalink
Implement RemoteButler.get()
Browse files Browse the repository at this point in the history
Added a partial implementation of get() for RemoteButler, which so far
only handles the DatasetRef form of get().  This modifies the DataStore
interface to include a function that serializes the information
necessary to do the get() on the client side.
  • Loading branch information
dhirving committed Nov 29, 2023
1 parent 0cc0120 commit cbedd9a
Show file tree
Hide file tree
Showing 14 changed files with 272 additions and 22 deletions.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,7 +808,7 @@ def get_dataset_type(self, name: str) -> DatasetType:
def get_dataset(
self,
id: DatasetId,
storage_class: str | StorageClass | None,
storage_class: str | StorageClass | None = None,
dimension_records: bool = False,
datastore_records: bool = False,
) -> DatasetRef | None:
Expand Down
17 changes: 17 additions & 0 deletions python/lsst/daf/butler/datastore/_datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,23 @@ def get(
"""
raise NotImplementedError("Must be implemented by subclass")

def prepare_get_for_external_client(self, ref: DatasetRef) -> object:
"""Retrieve serializable data that can be used to execute a get()
Parameters
----------
ref : `DatasetRef`
Reference to the required dataset.
Returns
-------
payload : `object`
Serializable payload containing the information needed to perform a
get() operation. This payload may be sent over the wire to another
system to perform the get().
"""
raise NotImplementedError()

@abstractmethod
def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
"""Write a `InMemoryDataset` with a given `DatasetRef` to the store.
Expand Down
6 changes: 5 additions & 1 deletion python/lsst/daf/butler/datastore/cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,11 @@ class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager):
in lookup keys.
"""

def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse):
def __init__(
self,
config: str | DatastoreCacheManagerConfig | None = None,
universe: DimensionUniverse | None = None,
):
return

def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool:
Expand Down
37 changes: 36 additions & 1 deletion python/lsst/daf/butler/datastore/stored_file_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@

from __future__ import annotations

__all__ = ("StoredDatastoreItemInfo", "StoredFileInfo")
__all__ = ("StoredDatastoreItemInfo", "StoredFileInfo", "SerializedStoredFileInfo")

import inspect
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

from lsst.daf.butler._compat import _BaseModelCompat
from lsst.resources import ResourcePath
from lsst.utils import doImportType
from lsst.utils.introspection import get_full_type_name
Expand Down Expand Up @@ -260,6 +261,13 @@ def to_record(self, **kwargs: Any) -> dict[str, Any]:
**kwargs,
)

def to_simple(self) -> SerializedStoredFileInfo:
record = self.to_record()
# We allow None on the model but the record contains a "null string"
# instead
record["component"] = self.component
return SerializedStoredFileInfo.model_validate(record)

def file_location(self, factory: LocationFactory) -> Location:
"""Return the location of artifact.
Expand Down Expand Up @@ -307,6 +315,10 @@ def from_record(cls: type[StoredFileInfo], record: Mapping[str, Any]) -> StoredF
)
return info

@classmethod
def from_simple(cls: type[StoredFileInfo], model: SerializedStoredFileInfo) -> StoredFileInfo:
return cls.from_record(dict(model))

def update(self, **kwargs: Any) -> StoredFileInfo:
new_args = {}
for k in self.__slots__:
Expand All @@ -320,3 +332,26 @@ def update(self, **kwargs: Any) -> StoredFileInfo:

def __reduce__(self) -> str | tuple[Any, ...]:
return (self.from_record, (self.to_record(),))


class SerializedStoredFileInfo(_BaseModelCompat):
"""Serialized representation of `StoredFileInfo` properties"""

formatter: str
"""Fully-qualified name of Formatter."""

path: str
"""Path to dataset within Datastore."""

storage_class: str
"""Name of the StorageClass associated with Dataset."""

component: str | None
"""Component associated with this file. Can be None if the file does
not refer to a component of a composite."""

checksum: str | None
"""Checksum of the serialized dataset."""

file_size: int
"""Size of the serialized dataset in bytes."""
30 changes: 22 additions & 8 deletions python/lsst/daf/butler/datastores/fileDatastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@
import logging
from collections import defaultdict
from collections.abc import Callable, Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
from typing import TYPE_CHECKING, Any, ClassVar, cast, TypeAlias
from typing import TYPE_CHECKING, Any, ClassVar, cast

from lsst.daf.butler import (
Config,
Expand Down Expand Up @@ -75,8 +74,12 @@
from lsst.daf.butler.datastores.file_datastore.get import (
DatasetLocationInformation,
DatastoreFileGetInformation,
get_dataset_as_python_object,
prepare_for_get,
generate_datastore_get_information,
get_dataset_as_python_object_from_get_info,
)
from lsst.daf.butler.datastores.fileDatastoreClient import (
FileDatastoreGetPayload,
FileDatastoreGetPayloadFileInfo,
)
from lsst.daf.butler.registry.interfaces import (
DatabaseInsertMode,
Expand Down Expand Up @@ -115,8 +118,6 @@ def __init__(self, datasets: Iterable[FileDataset]):
self.datasets = datasets




class FileDatastore(GenericBaseDatastore[StoredFileInfo]):
"""Generic Datastore for file-based implementations.
Expand Down Expand Up @@ -732,7 +733,7 @@ def _prepare_for_direct_get(

# Is this a component request?
refComponent = ref.datasetType.component()
return prepare_for_get(
return generate_datastore_get_information(
fileLocations,
refStorageClass=refStorageClass,
refComponent=refComponent,
Expand Down Expand Up @@ -1993,10 +1994,23 @@ def get(
ref = ref.overrideStorageClass(storageClass)

allGetInfo = self._prepare_for_direct_get(ref, parameters)
return get_dataset_as_python_object(
return get_dataset_as_python_object_from_get_info(
allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager
)

def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload:
# Docstring inherited

def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo:
location, file_info = info
return FileDatastoreGetPayloadFileInfo(url=location.uri.geturl(), metadata=file_info.to_simple())

return FileDatastoreGetPayload(
datastore_type="file",
dataset_ref=ref.to_simple(),
file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)],
)

@transactional
def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None:
"""Write a InMemoryDataset with a given `DatasetRef` to the store.
Expand Down
58 changes: 58 additions & 0 deletions python/lsst/daf/butler/datastores/fileDatastoreClient.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
__all__ = ("get_dataset_as_python_object", "FileDatastoreGetPayload")

from typing import Any, Literal

from lsst.daf.butler import DatasetRef, DimensionUniverse, Location, SerializedDatasetRef, StorageClass
from lsst.daf.butler._compat import _BaseModelCompat
from lsst.daf.butler.datastore.cache_manager import DatastoreDisabledCacheManager
from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo, StoredFileInfo
from lsst.daf.butler.datastores.file_datastore.get import (
DatasetLocationInformation,
Mapping,
generate_datastore_get_information,
get_dataset_as_python_object_from_get_info,
)


class FileDatastoreGetPayloadFileInfo(_BaseModelCompat):
# TODO DM-41879: Allowing arbitrary URLs here is a severe security issue,
# since it allows the server to trick the client into fetching data from
# any file on its local filesystem or from remote storage using credentials
# laying around in the environment. This should be restricted to only
# HTTP, but we don't yet have a means of mocking out HTTP gets in tests.
url: str
metadata: SerializedStoredFileInfo


class FileDatastoreGetPayload(_BaseModelCompat):
datastore_type: Literal["file"]
file_info: list[FileDatastoreGetPayloadFileInfo]
dataset_ref: SerializedDatasetRef


def get_dataset_as_python_object(
model: FileDatastoreGetPayload,
*,
universe: DimensionUniverse,
parameters: Mapping[str, Any] | None,
storageClass: StorageClass | str | None,
) -> Any:
fileLocations: list[DatasetLocationInformation] = [
(Location(None, file_info.url), StoredFileInfo.from_simple(file_info.metadata))
for file_info in model.file_info
]

ref = DatasetRef.from_simple(model.dataset_ref, universe=universe)
if storageClass is not None:
ref = ref.overrideStorageClass(storageClass)

Check warning on line 47 in python/lsst/daf/butler/datastores/fileDatastoreClient.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/datastores/fileDatastoreClient.py#L47

Added line #L47 was not covered by tests

datastore_file_info = generate_datastore_get_information(
fileLocations,
refStorageClass=ref.datasetType.storageClass,
refComponent=ref.datasetType.component(),
parameters=parameters,
dataId=ref.dataId,
)
return get_dataset_as_python_object_from_get_info(
datastore_file_info, ref=ref, parameters=parameters, cache_manager=DatastoreDisabledCacheManager()
)
26 changes: 26 additions & 0 deletions python/lsst/daf/butler/datastores/file_datastore/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
8 changes: 4 additions & 4 deletions python/lsst/daf/butler/datastores/file_datastore/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
__all__ = (
"DatastoreFileGetInformation",
"DatasetLocationInformation",
"prepare_for_get",
"get_dataset_as_python_object",
"generate_datastore_get_information",
"get_dataset_as_python_object_from_get_info",
)

from collections.abc import Mapping
Expand Down Expand Up @@ -77,7 +77,7 @@ class DatastoreFileGetInformation:
"""The `StorageClass` of the dataset being read."""


def prepare_for_get(
def generate_datastore_get_information(
fileLocations: list[DatasetLocationInformation],
*,
refStorageClass: StorageClass,
Expand Down Expand Up @@ -306,7 +306,7 @@ def _read_artifact_into_memory(
)


def get_dataset_as_python_object(
def get_dataset_as_python_object_from_get_info(
allGetInfo: list[DatastoreFileGetInformation],
*,
ref: DatasetRef,
Expand Down
18 changes: 16 additions & 2 deletions python/lsst/daf/butler/remote_butler/_remote_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

import httpx
from lsst.daf.butler import __version__
from lsst.daf.butler.datastores.fileDatastoreClient import get_dataset_as_python_object
from lsst.daf.butler.repo_relocation import replaceRoot
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils.introspection import get_full_type_name
Expand All @@ -56,7 +57,7 @@
from ..transfers import RepoExportContext
from ._authentication import get_authentication_headers, get_authentication_token_from_environment
from ._config import RemoteButlerConfigModel
from .server_models import FindDatasetModel
from .server_models import FindDatasetModel, GetFileRequestModel, GetFileResponseModel

_AnyPydanticModel = TypeVar("_AnyPydanticModel", bound=_BaseModelCompat)

Expand Down Expand Up @@ -207,7 +208,20 @@ def get(
**kwargs: Any,
) -> Any:
# Docstring inherited.
raise NotImplementedError()
if not isinstance(datasetRefOrType, DatasetRef):
raise NotImplementedError("RemoteButler currently only supports get() of a DatasetRef")

dataset_id = datasetRefOrType.id
request = GetFileRequestModel(dataset_id=dataset_id)
response = self._post("get_file", request)
if response.status_code == 404:
raise LookupError(f"Dataset not found with ID ${dataset_id}")
response.raise_for_status()
model = self._parse_model(response, GetFileResponseModel)

return get_dataset_as_python_object(
model, parameters=parameters, storageClass=storageClass, universe=self.dimensions
)

def getURIs(
self,
Expand Down
33 changes: 33 additions & 0 deletions python/lsst/daf/butler/remote_butler/server/_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from fastapi import HTTPException


class NotFoundException(HTTPException):
def __init__(self, message: str = "Not found"):
super().__init__(status_code=404, detail=message)
Loading

0 comments on commit cbedd9a

Please sign in to comment.