Make DatasetTypeCache non-generic
DatasetTypeCache is only used in a single file with a specific set of types, so it no longer needs to be generic.

Upcoming changes to make it thread-safe and cloneable will be easier to reason about with concrete types.
dhirving committed Dec 9, 2024
1 parent f14086a commit 59b45c5
Showing 3 changed files with 29 additions and 47 deletions.
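
As context for the diffs that follow, here is a condensed sketch of the data layout the commit settles on. It is an illustration distilled from the `_dataset_type_cache.py` diff below, not the full class, and the absolute import paths are inferred from the relative imports shown in that diff:

    from lsst.daf.butler import DatasetType
    from lsst.daf.butler.dimensions import DimensionGroup
    from lsst.daf.butler.registry.datasets.byDimensions.tables import DynamicTables

    class DatasetTypeCache:
        def __init__(self) -> None:
            # Dataset type name -> (DatasetType, database integer primary key).
            self._by_name_cache: dict[str, tuple[DatasetType, int]] = {}
            # Dataset type dimensions -> dynamic-table information.
            self._by_dimensions_cache: dict[DimensionGroup, DynamicTables] = {}
            # Whether each mapping is known to hold every entry in the database.
            self._full = False
            self._dimensions_full = False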
22 changes: 4 additions & 18 deletions python/lsst/daf/butler/registry/_caching_context.py
@@ -27,19 +27,14 @@

from __future__ import annotations

__all__ = ["CachingContext", "GenericCachingContext"]

from typing import Generic, TypeAlias, TypeVar
__all__ = ["CachingContext"]

from ._collection_record_cache import CollectionRecordCache
from ._collection_summary_cache import CollectionSummaryCache
from ._dataset_type_cache import DatasetTypeCache

_T = TypeVar("_T")
_U = TypeVar("_U")


class GenericCachingContext(Generic[_T, _U]):
class CachingContext:
"""Collection of caches for various types of records retrieved from
database.
@@ -54,16 +49,10 @@ class is passed to the relevant managers that can use it to query or
Dataset type cache is always enabled for now, this avoids the need for
explicitly enabling caching in pipetask executors.
`GenericCachingContext` is generic over two kinds of opaque dataset type
data, with the expectation that most code will use the ``CachingContext``
type alias (which resolves to `GenericCachingContext[object, object]`);
the `DatasetRecordStorageManager` can then cast this to a
`GenericCachingContext` with the actual opaque data types it uses.
"""

def __init__(self) -> None:
self._dataset_types: DatasetTypeCache[_T, _U] = DatasetTypeCache()
self._dataset_types: DatasetTypeCache = DatasetTypeCache()
self._collection_records: CollectionRecordCache | None = None
self._collection_summaries: CollectionSummaryCache | None = None
self._depth = 0
@@ -109,9 +98,6 @@ def collection_summaries(self) -> CollectionSummaryCache | None:
return self._collection_summaries

@property
def dataset_types(self) -> DatasetTypeCache[_T, _U]:
def dataset_types(self) -> DatasetTypeCache:
"""Cache for dataset types, never disabled (`DatasetTypeCache`)."""
return self._dataset_types


CachingContext: TypeAlias = GenericCachingContext[object, object]
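
The deleted alias above resolved `CachingContext` to `GenericCachingContext[object, object]`, which is why `DatasetRecordStorageManager` implementations had to cast it to the parametrization they actually used. With `CachingContext` now a concrete class, the manager code in the third file below drops the cast:

    # Before: the alias erased the concrete type parameters, forcing a cast.
    self._caching_context = cast(GenericCachingContext[int, DynamicTables], caching_context)
    # After: the concrete class carries its types directly.
    self._caching_context = caching_context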
48 changes: 22 additions & 26 deletions python/lsst/daf/butler/registry/_dataset_type_cache.py
@@ -30,29 +30,24 @@
__all__ = ("DatasetTypeCache",)

from collections.abc import Iterable, Iterator
from typing import Generic, TypeVar
from typing import TYPE_CHECKING

from .._dataset_type import DatasetType
from ..dimensions import DimensionGroup

_T = TypeVar("_T")
_U = TypeVar("_U")
if TYPE_CHECKING:
from .datasets.byDimensions.tables import DynamicTables


class DatasetTypeCache(Generic[_T, _U]):
class DatasetTypeCache:
"""Cache for dataset types.
Notes
-----
This cache is a pair of mappings with different kinds of keys:
- the `DatasetType` itself is cached by name, as is some opaque data used
only by a `DatasetRecordStorageManager` implementation;
- additional opaque data (also used only by `DatasetRecordStorageManager`
implementations) can be cached by the dimensions of dataset types (i.e. a
`DimensionGroup`).
`DatasetTypeCache` is generic over these two opaque data types.
- Dataset type name -> (`DatasetType`, database integer primary key)
- `DimensionGroup` -> database table information
In some contexts (e.g. ``resolve_wildcard``) a full list of dataset types
is needed. To signify that cache content can be used in such contexts,
@@ -62,8 +57,8 @@ class DatasetTypeCache(Generic[_T, _U]):
"""

def __init__(self) -> None:
self._by_name_cache: dict[str, tuple[DatasetType, _T]] = {}
self._by_dimensions_cache: dict[DimensionGroup, _U] = {}
self._by_name_cache: dict[str, tuple[DatasetType, int]] = {}
self._by_dimensions_cache: dict[DimensionGroup, DynamicTables] = {}
self._full = False
self._dimensions_full = False

@@ -77,25 +72,26 @@ def dimensions_full(self) -> bool:
"""`True` if cache holds all known dataset type dimensions (`bool`)."""
return self._dimensions_full

def add(self, dataset_type: DatasetType, extra: _T) -> None:
def add(self, dataset_type: DatasetType, id: int) -> None:
"""Add one record to the cache.
Parameters
----------
dataset_type : `DatasetType`
Dataset type, replaces any existing dataset type with the same
name.
extra : `Any`
Additional opaque object stored with this dataset type.
id : `int`
The dataset type primary key
"""
self._by_name_cache[dataset_type.name] = (dataset_type, extra)
self._by_name_cache[dataset_type.name] = (dataset_type, id)

def set(
self,
data: Iterable[tuple[DatasetType, _T]],
data: Iterable[tuple[DatasetType, int]],
*,
full: bool = False,
dimensions_data: Iterable[tuple[DimensionGroup, _U]] | None = None,
dimensions_data: Iterable[tuple[DimensionGroup, DynamicTables]] | None = None,
dimensions_full: bool = False,
) -> None:
"""Replace cache contents with the new set of dataset types.
@@ -136,7 +132,7 @@ def discard(self, name: str) -> None:
"""
self._by_name_cache.pop(name, None)

def get(self, name: str) -> tuple[DatasetType | None, _T | None]:
def get(self, name: str) -> tuple[DatasetType | None, int | None]:
"""Return cached info given dataset type name.
Parameters
@@ -177,7 +173,7 @@ def get_dataset_type(self, name: str) -> DatasetType | None:
return None
return item[0]

def items(self) -> Iterator[tuple[DatasetType, _T]]:
def items(self) -> Iterator[tuple[DatasetType, int]]:
"""Return iterator for the set of items in the cache, can only be
used if `full` is true.
@@ -195,19 +191,19 @@ def items(self) -> Iterator[tuple[DatasetType, _T]]:
raise RuntimeError("cannot call items() if cache is not full")
return iter(self._by_name_cache.values())

def add_by_dimensions(self, dimensions: DimensionGroup, extra: _U) -> None:
def add_by_dimensions(self, dimensions: DimensionGroup, tables: DynamicTables) -> None:
"""Add information about a set of dataset type dimensions to the cache.
Parameters
----------
dimensions : `DimensionGroup`
Dimensions of one or more dataset types.
extra : `Any`
tables : `DynamicTables`
Additional opaque object stored with these dimensions.
"""
self._by_dimensions_cache[dimensions] = extra
self._by_dimensions_cache[dimensions] = tables

def get_by_dimensions(self, dimensions: DimensionGroup) -> _U | None:
def get_by_dimensions(self, dimensions: DimensionGroup) -> DynamicTables | None:
"""Get information about a set of dataset type dimensions.
Parameters
@@ -217,13 +213,13 @@ def get_by_dimensions(self, dimensions: DimensionGroup) -> _U | None:
Returns
-------
extra : `Any` or `None`
tables : `DynamicTables` or `None`
Additional opaque object stored with these dimensions, or `None` if
these dimensions are not present in the cache.
"""
return self._by_dimensions_cache.get(dimensions)

def by_dimensions_items(self) -> Iterator[tuple[DimensionGroup, _U]]:
def by_dimensions_items(self) -> Iterator[tuple[DimensionGroup, DynamicTables]]:
"""Return iterator for all dimensions-keyed data in the cache.
This can only be called if `dimensions_full` is `True`.
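
Taken together, the rewritten class exposes a small concrete API. A hypothetical usage sketch, assuming `dataset_type` is an existing `DatasetType` whose `.dimensions` is a `DimensionGroup`, `tables` is a `DynamicTables` instance, and `42` stands in for a real database primary key:

    cache = DatasetTypeCache()
    cache.add(dataset_type, id=42)              # by-name entry with its primary key
    dt, key = cache.get(dataset_type.name)      # -> (DatasetType | None, int | None)
    cache.add_by_dimensions(dataset_type.dimensions, tables)
    info = cache.get_by_dimensions(dataset_type.dimensions)  # -> DynamicTables | None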
6 changes: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
import datetime
import logging
from collections.abc import Iterable, Mapping, Sequence, Set
from typing import TYPE_CHECKING, Any, ClassVar, cast
from typing import TYPE_CHECKING, Any, ClassVar

import astropy.time
import sqlalchemy
@@ -24,7 +24,7 @@
from ....dimensions import DataCoordinate, DimensionGroup, DimensionUniverse
from ....direct_query_driver import SqlJoinsBuilder, SqlSelectBuilder # new query system, server+direct only
from ....queries import tree as qt # new query system, both clients + server
from ..._caching_context import CachingContext, GenericCachingContext
from ..._caching_context import CachingContext
from ..._collection_summary import CollectionSummary
from ..._exceptions import ConflictingDefinitionError, DatasetTypeExpressionError, OrphanedRecordError
from ...interfaces import DatasetRecordStorageManager, RunRecord, VersionTuple
@@ -155,7 +155,7 @@ def __init__(
self._dimensions = dimensions
self._static = static
self._summaries = summaries
self._caching_context = cast(GenericCachingContext[int, DynamicTables], caching_context)
self._caching_context = caching_context
self._use_astropy_ingest_date = self.ingest_date_dtype() is ddl.AstropyTimeNsecTai
self._run_key_column = collections.getRunForeignKeyName()

