From 5833c91f55f4be34daf795f115f44fdbc45a7085 Mon Sep 17 00:00:00 2001 From: Andy Salnikov Date: Wed, 8 Nov 2023 16:13:10 -0800 Subject: [PATCH] Remove dataset type caching from datasets manager. --- .../datasets/byDimensions/_manager.py | 235 +++++++++--------- .../datasets/byDimensions/summaries.py | 37 ++- .../butler/registry/interfaces/_datasets.py | 4 +- .../lsst/daf/butler/registry/sql_registry.py | 3 +- 4 files changed, 142 insertions(+), 137 deletions(-) diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py index 60e7b0f308..88e20c67f2 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py @@ -4,6 +4,7 @@ __all__ = ("ByDimensionsDatasetRecordStorageManagerUUID",) +import dataclasses import logging import warnings from collections import defaultdict @@ -55,16 +56,14 @@ class MissingDatabaseTableError(RuntimeError): """Exception raised when a table is not found in a database.""" -class _ExistingTableFactory: - """Factory for `sqlalchemy.schema.Table` instances that returns already - existing table instance. - """ - - def __init__(self, table: sqlalchemy.schema.Table): - self._table = table +@dataclasses.dataclass +class _DatasetTypeRecord: + """Contents of a single dataset type record.""" - def __call__(self) -> sqlalchemy.schema.Table: - return self._table + dataset_type: DatasetType + dataset_type_id: int + tag_table_name: str + calib_table_name: str | None class _SpecTableFactory: @@ -139,8 +138,6 @@ def __init__( self._dimensions = dimensions self._static = static self._summaries = summaries - self._byName: dict[str, ByDimensionsDatasetRecordStorage] = {} - self._byId: dict[int, ByDimensionsDatasetRecordStorage] = {} @classmethod def initialize( @@ -162,6 +159,7 @@ def initialize( context, collections=collections, dimensions=dimensions, + dataset_type_table=static.dataset_type, ) return cls( db=db, @@ -236,44 +234,33 @@ def addDatasetForeignKey( def refresh(self) -> None: # Docstring inherited from DatasetRecordStorageManager. 
- byName: dict[str, ByDimensionsDatasetRecordStorage] = {} - byId: dict[int, ByDimensionsDatasetRecordStorage] = {} - c = self._static.dataset_type.columns - with self._db.query(self._static.dataset_type.select()) as sql_result: - sql_rows = sql_result.mappings().fetchall() - for row in sql_rows: - name = row[c.name] - dimensions = self._dimensions.loadDimensionGraph(row[c.dimensions_key]) - calibTableName = row[c.calibration_association_table] - datasetType = DatasetType( - name, dimensions, row[c.storage_class], isCalibration=(calibTableName is not None) - ) - tags_spec = makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()) - tags_table_factory = _SpecTableFactory(self._db, row[c.tag_association_table], tags_spec) - calibs_table_factory = None - if calibTableName is not None: - calibs_spec = makeCalibTableSpec( - datasetType, - type(self._collections), - self._db.getTimespanRepresentation(), - self.getIdColumnType(), - ) - calibs_table_factory = _SpecTableFactory(self._db, calibTableName, calibs_spec) - storage = self._recordStorageType( - db=self._db, - datasetType=datasetType, - static=self._static, - summaries=self._summaries, - tags_table_factory=tags_table_factory, - calibs_table_factory=calibs_table_factory, - dataset_type_id=row["id"], - collections=self._collections, - use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, + pass + + def _make_storage(self, record: _DatasetTypeRecord) -> ByDimensionsDatasetRecordStorage: + """Create storage instance for a dataset type record.""" + tags_spec = makeTagTableSpec(record.dataset_type, type(self._collections), self.getIdColumnType()) + tags_table_factory = _SpecTableFactory(self._db, record.tag_table_name, tags_spec) + calibs_table_factory = None + if record.calib_table_name is not None: + calibs_spec = makeCalibTableSpec( + record.dataset_type, + type(self._collections), + self._db.getTimespanRepresentation(), + self.getIdColumnType(), ) - byName[datasetType.name] = storage - byId[storage._dataset_type_id] = storage - self._byName = byName - self._byId = byId + calibs_table_factory = _SpecTableFactory(self._db, record.calib_table_name, calibs_spec) + storage = self._recordStorageType( + db=self._db, + datasetType=record.dataset_type, + static=self._static, + summaries=self._summaries, + tags_table_factory=tags_table_factory, + calibs_table_factory=calibs_table_factory, + dataset_type_id=record.dataset_type_id, + collections=self._collections, + use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, + ) + return storage def remove(self, name: str) -> None: # Docstring inherited from DatasetRecordStorageManager. @@ -296,33 +283,28 @@ def remove(self, name: str) -> None: def find(self, name: str) -> DatasetRecordStorage | None: # Docstring inherited from DatasetRecordStorageManager. - return self._byName.get(name) + record = self._fetch_dataset_type_record(name) + return self._make_storage(record) if record is not None else None - def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: + def register(self, datasetType: DatasetType) -> bool: # Docstring inherited from DatasetRecordStorageManager. if datasetType.isComponent(): raise ValueError( f"Component dataset types can not be stored in registry. 
Rejecting {datasetType.name}" ) - storage = self._byName.get(datasetType.name) - if storage is None: + record = self._fetch_dataset_type_record(datasetType.name) + if record is None: dimensionsKey = self._dimensions.saveDimensionGraph(datasetType.dimensions) tagTableName = makeTagTableName(datasetType, dimensionsKey) - calibTableName = ( - makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None - ) - # The order is important here, we want to create tables first and - # only register them if this operation is successful. We cannot - # wrap it into a transaction because database class assumes that - # DDL is not transaction safe in general. - tags = self._db.ensureTableExists( + self._db.ensureTableExists( tagTableName, makeTagTableSpec(datasetType, type(self._collections), self.getIdColumnType()), ) - tags_table_factory = _ExistingTableFactory(tags) - calibs_table_factory = None + calibTableName = ( + makeCalibTableName(datasetType, dimensionsKey) if datasetType.isCalibration() else None + ) if calibTableName is not None: - calibs = self._db.ensureTableExists( + self._db.ensureTableExists( calibTableName, makeCalibTableSpec( datasetType, @@ -331,8 +313,7 @@ def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool self.getIdColumnType(), ), ) - calibs_table_factory = _ExistingTableFactory(calibs) - row, inserted = self._db.sync( + _, inserted = self._db.sync( self._static.dataset_type, keys={"name": datasetType.name}, compared={ @@ -347,28 +328,17 @@ def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool }, returning=["id", "tag_association_table"], ) - assert row is not None - storage = self._recordStorageType( - db=self._db, - datasetType=datasetType, - static=self._static, - summaries=self._summaries, - tags_table_factory=tags_table_factory, - calibs_table_factory=calibs_table_factory, - dataset_type_id=row["id"], - collections=self._collections, - use_astropy_ingest_date=self.ingest_date_dtype() is ddl.AstropyTimeNsecTai, - ) - self._byName[datasetType.name] = storage - self._byId[storage._dataset_type_id] = storage else: - if datasetType != storage.datasetType: + if datasetType != record.dataset_type: raise ConflictingDefinitionError( f"Given dataset type {datasetType} is inconsistent " - f"with database definition {storage.datasetType}." + f"with database definition {record.dataset_type}." ) inserted = False - return storage, bool(inserted) + # TODO: We return storage instance from this method, but the only + # client that uses this method ignores it. Maybe we should drop it + # and avoid making storage instance above. + return bool(inserted) def resolve_wildcard( self, @@ -422,15 +392,13 @@ def resolve_wildcard( raise TypeError( "Universal wildcard '...' is not permitted for dataset types in this context." 
) - for storage in self._byName.values(): - result[storage.datasetType].add(None) + for datasetType in self._fetch_dataset_types(): + result[datasetType].add(None) if components: try: - result[storage.datasetType].update( - storage.datasetType.storageClass.allComponents().keys() - ) + result[datasetType].update(datasetType.storageClass.allComponents().keys()) if ( - storage.datasetType.storageClass.allComponents() + datasetType.storageClass.allComponents() and not already_warned and components_deprecated ): @@ -442,7 +410,7 @@ def resolve_wildcard( already_warned = True except KeyError as err: _LOG.warning( - f"Could not load storage class {err} for {storage.datasetType.name}; " + f"Could not load storage class {err} for {datasetType.name}; " "if it has components they will not be included in query results.", ) elif wildcard.patterns: @@ -454,29 +422,28 @@ def resolve_wildcard( FutureWarning, stacklevel=find_outside_stacklevel("lsst.daf.butler"), ) - for storage in self._byName.values(): - if any(p.fullmatch(storage.datasetType.name) for p in wildcard.patterns): - result[storage.datasetType].add(None) + dataset_types = self._fetch_dataset_types() + for datasetType in dataset_types: + if any(p.fullmatch(datasetType.name) for p in wildcard.patterns): + result[datasetType].add(None) if components is not False: - for storage in self._byName.values(): - if components is None and storage.datasetType in result: + for datasetType in dataset_types: + if components is None and datasetType in result: continue try: - components_for_parent = storage.datasetType.storageClass.allComponents().keys() + components_for_parent = datasetType.storageClass.allComponents().keys() except KeyError as err: _LOG.warning( - f"Could not load storage class {err} for {storage.datasetType.name}; " + f"Could not load storage class {err} for {datasetType.name}; " "if it has components they will not be included in query results." ) continue for component_name in components_for_parent: if any( - p.fullmatch( - DatasetType.nameWithComponent(storage.datasetType.name, component_name) - ) + p.fullmatch(DatasetType.nameWithComponent(datasetType.name, component_name)) for p in wildcard.patterns ): - result[storage.datasetType].add(component_name) + result[datasetType].add(component_name) if not already_warned and components_deprecated: warnings.warn( deprecation_message, @@ -492,49 +459,77 @@ def getDatasetRef(self, id: DatasetId) -> DatasetRef | None: sqlalchemy.sql.select( self._static.dataset.columns.dataset_type_id, self._static.dataset.columns[self._collections.getRunForeignKeyName()], + *self._static.dataset_type.columns, ) .select_from(self._static.dataset) + .join(self._static.dataset_type) .where(self._static.dataset.columns.id == id) ) with self._db.query(sql) as sql_result: row = sql_result.mappings().fetchone() if row is None: return None - recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) - if recordsForType is None: - self.refresh() - recordsForType = self._byId.get(row[self._static.dataset.columns.dataset_type_id]) - assert recordsForType is not None, "Should be guaranteed by foreign key constraints." 
+        storage = self._make_storage(self._record_from_row(row))
         return DatasetRef(
-            recordsForType.datasetType,
-            dataId=recordsForType.getDataId(id=id),
+            storage.datasetType,
+            dataId=storage.getDataId(id=id),
             id=id,
             run=self._collections[row[self._collections.getRunForeignKeyName()]].name,
         )
 
-    def _dataset_type_factory(self, dataset_type_id: int) -> DatasetType:
-        """Return dataset type given its ID."""
-        return self._byId[dataset_type_id].datasetType
+    def _fetch_dataset_type_record(self, name: str) -> _DatasetTypeRecord | None:
+        """Retrieve the record for a single named dataset type, if it exists.
+
+        Returns
+        -------
+        record : `_DatasetTypeRecord` or `None`
+            Information from the matching database record, or `None`.
+        """
+        c = self._static.dataset_type.columns
+        stmt = self._static.dataset_type.select().where(c.name == name)
+        with self._db.query(stmt) as sql_result:
+            row = sql_result.mappings().one_or_none()
+        if row is None:
+            return None
+        else:
+            return self._record_from_row(row)
+
+    def _record_from_row(self, row: Mapping) -> _DatasetTypeRecord:
+        name = row["name"]
+        dimensions = self._dimensions.loadDimensionGraph(row["dimensions_key"])
+        calibTableName = row["calibration_association_table"]
+        datasetType = DatasetType(
+            name, dimensions, row["storage_class"], isCalibration=(calibTableName is not None)
+        )
+        return _DatasetTypeRecord(
+            dataset_type=datasetType,
+            dataset_type_id=row["id"],
+            tag_table_name=row["tag_association_table"],
+            calib_table_name=calibTableName,
+        )
+
+    def _dataset_type_from_row(self, row: Mapping) -> DatasetType:
+        return self._record_from_row(row).dataset_type
+
+    def _fetch_dataset_types(self) -> list[DatasetType]:
+        """Fetch list of all defined dataset types."""
+        with self._db.query(self._static.dataset_type.select()) as sql_result:
+            sql_rows = sql_result.mappings().fetchall()
+        return [self._record_from_row(row).dataset_type for row in sql_rows]
 
     def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummary:
         # Docstring inherited from DatasetRecordStorageManager.
-        summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_factory)
+        summaries = self._summaries.fetch_summaries([collection], None, self._dataset_type_from_row)
         return summaries[collection.key]
 
     def fetch_summaries(
         self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
     ) -> Mapping[Any, CollectionSummary]:
         # Docstring inherited from DatasetRecordStorageManager.
-        dataset_type_ids: list[int] | None = None
+        dataset_type_names: Iterable[str] | None = None
         if dataset_types is not None:
-            dataset_type_ids = []
-            for dataset_type in dataset_types:
-                if dataset_type.isComponent():
-                    dataset_type = dataset_type.makeCompositeDatasetType()
-                # Assume we know all possible names.
- dataset_type_id = self._byName[dataset_type.name]._dataset_type_id - dataset_type_ids.append(dataset_type_id) - return self._summaries.fetch_summaries(collections, dataset_type_ids, self._dataset_type_factory) + dataset_type_names = set(dataset_type.name for dataset_type in dataset_types) + return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row) _versions: list[VersionTuple] """Schema version for this class.""" diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py b/python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py index 73511ddd0c..41687cb9c2 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/summaries.py @@ -133,6 +133,8 @@ class CollectionSummaryManager: Manager object for the dimensions in this `Registry`. tables : `CollectionSummaryTables` Struct containing the tables that hold collection summaries. + dataset_type_table : `sqlalchemy.schema.Table` + Table containing dataset type definitions. """ def __init__( @@ -142,12 +144,14 @@ def __init__( collections: CollectionManager, dimensions: DimensionRecordStorageManager, tables: CollectionSummaryTables[sqlalchemy.schema.Table], + dataset_type_table: sqlalchemy.schema.Table, ): self._db = db self._collections = collections self._collectionKeyName = collections.getCollectionForeignKeyName() self._dimensions = dimensions self._tables = tables + self._dataset_type_table = dataset_type_table @classmethod def initialize( @@ -157,6 +161,7 @@ def initialize( *, collections: CollectionManager, dimensions: DimensionRecordStorageManager, + dataset_type_table: sqlalchemy.schema.Table, ) -> CollectionSummaryManager: """Create all summary tables (or check that they have been created), returning an object to manage them. @@ -172,6 +177,8 @@ def initialize( Manager object for the collections in this `Registry`. dimensions : `DimensionRecordStorageManager` Manager object for the dimensions in this `Registry`. + dataset_type_table : `sqlalchemy.schema.Table` + Table containing dataset type definitions. Returns ------- @@ -193,6 +200,7 @@ def initialize( collections=collections, dimensions=dimensions, tables=tables, + dataset_type_table=dataset_type_table, ) def update( @@ -240,8 +248,8 @@ def update( def fetch_summaries( self, collections: Iterable[CollectionRecord], - dataset_type_ids: Iterable[int] | None, - dataset_type_factory: Callable[[int], DatasetType], + dataset_type_names: Iterable[str] | None, + dataset_type_factory: Callable[[sqlalchemy.engine.RowMapping], DatasetType], ) -> Mapping[Any, CollectionSummary]: """Fetch collection summaries given their names and dataset types. @@ -249,12 +257,12 @@ def fetch_summaries( ---------- collections : `~collections.abc.Iterable` [`CollectionRecord`] Collection records to query. - dataset_type_ids : `~collections.abc.Iterable` [`int`] - IDs of dataset types to include into returned summaries. If `None` - then all dataset types will be included. + dataset_type_names : `~collections.abc.Iterable` [`str`] + Names of dataset types to include into returned summaries. If + `None` then all dataset types will be included. dataset_type_factory : `Callable` - Method that returns `DatasetType` instance given its dataset type - ID. + Method that takes a table row and make `DatasetType` instance out + of it. Returns ------- @@ -282,8 +290,10 @@ def fetch_summaries( # information at once. 
coll_col = self._tables.datasetType.columns[self._collectionKeyName].label(self._collectionKeyName) dataset_type_id_col = self._tables.datasetType.columns.dataset_type_id.label("dataset_type_id") - columns = [coll_col, dataset_type_id_col] - fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType + columns = [coll_col, dataset_type_id_col] + list(self._dataset_type_table.columns) + fromClause: sqlalchemy.sql.expression.FromClause = self._tables.datasetType.join( + self._dataset_type_table + ) for dimension, table in self._tables.dimensions.items(): columns.append(table.columns[dimension.name].label(dimension.name)) fromClause = fromClause.join( @@ -297,8 +307,8 @@ def fetch_summaries( sql = sqlalchemy.sql.select(*columns).select_from(fromClause) sql = sql.where(coll_col.in_([coll.key for coll in non_chains])) - if dataset_type_ids is not None: - sql = sql.where(dataset_type_id_col.in_(dataset_type_ids)) + if dataset_type_names is not None: + sql = sql.where(self._dataset_type_table.columns["name"].in_(dataset_type_names)) # Run the query and construct CollectionSummary objects from the result # rows. This will never include CHAINED collections or collections @@ -306,13 +316,16 @@ def fetch_summaries( summaries: dict[Any, CollectionSummary] = {} with self._db.query(sql) as sql_result: sql_rows = sql_result.mappings().fetchall() + dataset_type_ids: dict[int, DatasetType] = {} for row in sql_rows: # Collection key should never be None/NULL; it's what we join on. # Extract that and then turn it into a collection name. collectionKey = row[self._collectionKeyName] # dataset_type_id should also never be None/NULL; it's in the first # table we joined. - dataset_type = dataset_type_factory(row["dataset_type_id"]) + dataset_type_id = row["dataset_type_id"] + if (dataset_type := dataset_type_ids.get(dataset_type_id)) is None: + dataset_type_ids[dataset_type_id] = dataset_type = dataset_type_factory(row) # See if we have a summary already for this collection; if not, # make one. summary = summaries.get(collectionKey) diff --git a/python/lsst/daf/butler/registry/interfaces/_datasets.py b/python/lsst/daf/butler/registry/interfaces/_datasets.py index 84a5a735d4..8bafb02274 100644 --- a/python/lsst/daf/butler/registry/interfaces/_datasets.py +++ b/python/lsst/daf/butler/registry/interfaces/_datasets.py @@ -487,7 +487,7 @@ def find(self, name: str) -> DatasetRecordStorage | None: raise NotImplementedError() @abstractmethod - def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool]: + def register(self, datasetType: DatasetType) -> bool: """Ensure that this `Registry` can hold records for the given `DatasetType`, creating new tables as necessary. @@ -499,8 +499,6 @@ def register(self, datasetType: DatasetType) -> tuple[DatasetRecordStorage, bool Returns ------- - records : `DatasetRecordStorage` - The object representing the records for the given dataset type. inserted : `bool` `True` if the dataset type did not exist in the registry before. diff --git a/python/lsst/daf/butler/registry/sql_registry.py b/python/lsst/daf/butler/registry/sql_registry.py index 5e03938b78..07eccf6ef8 100644 --- a/python/lsst/daf/butler/registry/sql_registry.py +++ b/python/lsst/daf/butler/registry/sql_registry.py @@ -697,8 +697,7 @@ def registerDatasetType(self, datasetType: DatasetType) -> bool: This method cannot be called within transactions, as it needs to be able to perform its own transaction to be concurrent. 
""" - _, inserted = self._managers.datasets.register(datasetType) - return inserted + return self._managers.datasets.register(datasetType) def removeDatasetType(self, name: str | tuple[str, ...]) -> None: """Remove the named `DatasetType` from the registry.