From d15e0bcd65bfa1f8bf3ba760d688dc8ef993b1e0 Mon Sep 17 00:00:00 2001
From: "David H. Irving"
Date: Wed, 4 Dec 2024 17:07:20 -0700
Subject: [PATCH] Preload dataset type cache in Butler server

Pre-fetch dataset types the first time a repository is accessed in
Butler server, to avoid re-fetching them in most later operations.
---
 .../byDimensions/_dataset_type_cache.py       | 29 +++++++++++++++++++
 .../datasets/byDimensions/_manager.py         | 11 ++++++-
 .../butler/registry/interfaces/_datasets.py   |  7 +++++
 .../lsst/daf/butler/registry/sql_registry.py  |  1 +
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py
index 1865736919..75e5c1596e 100644
--- a/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py
+++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_dataset_type_cache.py
@@ -60,6 +60,35 @@ def __init__(self) -> None:
         self._full = False
         self._dimensions_full = False
 
+    def clone(self) -> DatasetTypeCache:
+        """Make a copy of the caches that is safe to use in another thread.
+
+        Notes
+        -----
+        After cloning, the ``tables`` cache will be shared between the new
+        instance and the current instance. It is safe to read and update
+        ``tables`` from multiple threads simultaneously -- the cached values
+        are immutable table schemas, and they are looked up one at a time
+        by name.
+
+        The other caches are copied, because their access patterns are more
+        complex.
+
+        ``full`` and ``dimensions_full`` will initially return `False` in
+        the new instance. This preserves the invariant that a Butler is able
+        to see any changes to the database made before the Butler was
+        instantiated. The downside is that the cloned cache will have to be
+        re-fetched before it can be used for glob searches.
+        """
+        clone = DatasetTypeCache()
+        # Share DynamicTablesCache between instances.
+        clone.tables = self.tables
+        # The inner key/value objects are immutable in both of these caches,
+        # so we can shallow-copy the dicts.
+        clone._by_name_cache = self._by_name_cache.copy()
+        clone._by_dimensions_cache = self._by_dimensions_cache.copy()
+        return clone
+
     @property
     def full(self) -> bool:
         """`True` if cache holds all known dataset types (`bool`)."""
diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py
index e7708a8614..288c452fbe 100644
--- a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py
+++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py
@@ -135,6 +135,8 @@ class ByDimensionsDatasetRecordStorageManagerUUID(DatasetRecordStorageManager):
         Structure containing tables that summarize the contents of collections.
     registry_schema_version : `VersionTuple` or `None`, optional
         Version of registry schema.
+    _cache : `DatasetTypeCache` or `None`, optional
+        For internal use only.
""" def __init__( @@ -146,6 +148,7 @@ def __init__( static: StaticDatasetTablesTuple, summaries: CollectionSummaryManager, registry_schema_version: VersionTuple | None = None, + _cache: DatasetTypeCache | None = None, ): super().__init__(registry_schema_version=registry_schema_version) self._db = db @@ -153,7 +156,7 @@ def __init__( self._dimensions = dimensions self._static = static self._summaries = summaries - self._cache = DatasetTypeCache() + self._cache = _cache if _cache is not None else DatasetTypeCache() self._use_astropy_ingest_date = self.ingest_date_dtype() is ddl.AstropyTimeNsecTai self._run_key_column = collections.getRunForeignKeyName() @@ -270,6 +273,9 @@ def clone( static=self._static, summaries=self._summaries.clone(db=db, collections=collections, caching_context=caching_context), registry_schema_version=self._registry_schema_version, + # See notes on DatasetTypeCache.clone() about cache behavior after + # cloning. + _cache=self._cache.clone(), ) def refresh(self) -> None: @@ -502,6 +508,9 @@ def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord: def _dataset_type_from_row(self, row: Mapping) -> DatasetType: return self._record_from_row(row).dataset_type + def preload_cache(self) -> None: + self._fetch_dataset_types() + def _fetch_dataset_types(self) -> list[DatasetType]: """Fetch list of all defined dataset types.""" # This is one of three places we populate the dataset type cache: diff --git a/python/lsst/daf/butler/registry/interfaces/_datasets.py b/python/lsst/daf/butler/registry/interfaces/_datasets.py index 2ace8b61ad..41f1ccef39 100644 --- a/python/lsst/daf/butler/registry/interfaces/_datasets.py +++ b/python/lsst/daf/butler/registry/interfaces/_datasets.py @@ -103,6 +103,13 @@ def clone( """ raise NotImplementedError() + @abstractmethod + def preload_cache(self) -> None: + """Fetch data from the database and use it to pre-populate caches to + speed up later operations. + """ + raise NotImplementedError() + @classmethod @abstractmethod def initialize( diff --git a/python/lsst/daf/butler/registry/sql_registry.py b/python/lsst/daf/butler/registry/sql_registry.py index 5365db1733..4e4c53255b 100644 --- a/python/lsst/daf/butler/registry/sql_registry.py +++ b/python/lsst/daf/butler/registry/sql_registry.py @@ -2485,6 +2485,7 @@ def make_datastore_tables(self, tables: Mapping[str, DatastoreOpaqueTable]) -> N def preload_cache(self) -> None: """Immediately load caches that are used for common operations.""" self.dimension_record_cache.preload_cache() + self._managers.datasets.preload_cache() @property def obsCoreTableManager(self) -> ObsCoreTableManager | None: