From d15e0bcd65bfa1f8bf3ba760d688dc8ef993b1e0 Mon Sep 17 00:00:00 2001
From: "David H. Irving"
Date: Wed, 4 Dec 2024 17:07:20 -0700
Subject: [PATCH] Preload dataset type cache in Butler server

Pre-fetch dataset types the first time a repository is accessed in
Butler server, to avoid re-fetching them in most later operations.
---
 .../byDimensions/_dataset_type_cache.py       | 29 +++++++++++++++++++
 .../datasets/byDimensions/_manager.py         | 11 ++++++-
 .../butler/registry/interfaces/_datasets.py   |  7 +++++
 .../lsst/daf/butler/registry/sql_registry.py  |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py
index 1865736919..75e5c1596e 100644
--- a/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py
+++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py
@@ -60,6 +60,35 @@ def __init__(self) -> None:
         self._full = False
         self._dimensions_full = False
 
+    def clone(self) -> DatasetTypeCache:
+        """Make a copy of the caches that is safe to use in another thread.
+
+        Notes
+        -----
+        After cloning, the ``tables`` cache will be shared between the new
+        instance and the current instance. It is safe to read and update
+        ``tables`` from multiple threads simultaneously -- the cached values
+        are immutable table schemas, and they are looked up one at a time
+        by name.
+
+        The other caches are copied, because their access patterns are more
+        complex.
+
+        ``full`` and ``dimensions_full`` will initially return `False` in
+        the new instance. This preserves the invariant that a Butler is able
+        to see any changes to the database made before the Butler was
+        instantiated. The downside is that the cloned cache will have to be
+        re-fetched before it can be used for glob searches.
+        """
+        clone = DatasetTypeCache()
+        # Share DynamicTablesCache between instances.
+        clone.tables = self.tables
+        # The inner key/value objects are immutable in both of these caches,
+        # so we can shallow-copy the dicts.
+        clone._by_name_cache = self._by_name_cache.copy()
+        clone._by_dimensions_cache = self._by_dimensions_cache.copy()
+        return clone
+
     @property
     def full(self) -> bool:
         """`True` if cache holds all known dataset types (`bool`)."""
diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py
index e7708a8614..288c452fbe 100644
--- a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py
+++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py
@@ -135,6 +135,8 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
         Structure containing tables that summarize the contents of collections.
     registry_schema_version : `VersionTuple` or `None`, optional
         Version of registry schema.
+    _cache : `DatasetTypeCache` or `None`, optional
+        For internal use only.
""" def __init__( @@ -146,6 +148,7 @@ def __init__( static: StaticDatasetTablesTuple, summaries: CollectionSummaryManager, registry_schema_version: VersionTuple | None = None, + _cache: DatasetTypeCache | None = None, ): super().__init__(registry_schema_version=registry_schema_version) self._db = db @@ -153,7 +156,7 @@ def __init__( self._dimensions = dimensions self._static = static self._summaries = summaries - self._cache = DatasetTypeCache() + self._cache = _cache if _cache is not None else DatasetTypeCache() self._use_astropy_ingest_date = self.ingest_date_dtype() is ddl.AstropyTimeNsecTai self._run_key_column = collections.getRunForeignKeyName() @@ -270,6 +273,9 @@ def clone( static=self._static, summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context), registry_schema_version=self._registry_schema_version, + # See notes on DatasetTypeCache.clone() about cache behavior after + # cloning. + _cache=self._cache.clone(), ) def refresh(self) -> None: @@ -502,6 +508,9 @@ def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord: def _dataset_type_from_row(self, row: Mapping) -> DatasetType: return self._record_from_row(row).dataset_type + def preload_cache(self) -> None: + self._fetch_dataset_types() + def _fetch_dataset_types(self) -> list[DatasetType]: """Fetch list of all defined dataset types.""" # This is one of three places we populate the dataset type cache: diff --git a/python/lsst/daf/butler/registry/interfaces/_datasets.py b/python/lsst/daf/butler/registry/interfaces/_datasets.py index 2ace8b61ad..41f1ccef39 100644 --- a/python/lsst/daf/butler/registry/interfaces/_datasets.py +++ b/python/lsst/daf/butler/registry/interfaces/_datasets.py @@ -103,6 +103,13 @@ def clone( """ raise NotImplementedError() + @abstractmethod + def preload_cache(self) -> None: + """Fetch data from the database and use it to pre-populate caches to + speed up later operations. + """ + raise NotImplementedError() + @classmethod @abstractmethod def initialize( diff --git a/python/lsst/daf/butler/registry/sql_registry.py b/python/lsst/daf/butler/registry/sql_registry.py index 5365db1733..4e4c53255b 100644 --- a/python/lsst/daf/butler/registry/sql_registry.py +++ b/python/lsst/daf/butler/registry/sql_registry.py @@ -2485,6 +2485,7 @@ def make_datastore_tables(self, tables: Mapping[str, DatastoreOpaqueTable]) -> N def preload_cache(self) -> None: """Immediately load caches that are used for common operations.""" self.dimension_record_cache.preload_cache() + self._managers.datasets.preload_cache() @property def obsCoreTableManager(self) -> ObsCoreTableManager | None: