Sketch out new interfaces for querying multiple dataset types.
TallJimbo committed Aug 21, 2024
1 parent 201ee96 commit 783f3ee
Showing 7 changed files with 655 additions and 7 deletions.
1 change: 1 addition & 0 deletions python/lsst/daf/butler/__init__.py
@@ -38,6 +38,7 @@
from ._butler import *
from ._butler_collections import *
from ._butler_config import *
from ._butler_dataset_types import *
from ._butler_repo_index import *
from ._collection_type import CollectionType
from ._column_categorization import *
103 changes: 98 additions & 5 deletions python/lsst/daf/butler/_butler.py
Expand Up @@ -41,6 +41,7 @@

from ._butler_collections import ButlerCollections
from ._butler_config import ButlerConfig, ButlerType
from ._butler_dataset_types import ButlerDatasetTypes
from ._butler_instance_options import ButlerInstanceOptions
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
@@ -840,6 +841,7 @@ def getURI(
)
return primary

# TODO: RFC deprecating this in favor of butler.dataset_types.get.
@abstractmethod
def get_dataset_type(self, name: str) -> DatasetType:
"""Get the `DatasetType`.
@@ -1448,6 +1450,16 @@ def run(self) -> str | None:
"""
raise NotImplementedError()

# TODO: make this abstract and implement in derived classes.
@property
def dataset_types(self) -> ButlerDatasetTypes:
"""Object with methods for modifying and querying dataset types
(`~lsst.daf.butler.ButlerDatasetTypes`).

Use of this object is preferred over `registry` wherever possible.
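
Examples
--------
A hypothetical sketch, assuming the ``get`` method referenced in the
deprecation TODO on `get_dataset_type` above; the repository path and
dataset type name are illustrative only::

    from lsst.daf.butler import Butler

    butler = Butler("some/repo")
    # Look up a single dataset type by name, replacing
    # butler.get_dataset_type("flat").
    flat = butler.dataset_types.get("flat")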
"""
raise NotImplementedError()

@property
@abstractmethod
def registry(self) -> Registry:
@@ -1572,22 +1584,20 @@ def _query_datasets(
explain: bool = True,
**kwargs: Any,
) -> list[DatasetRef]:
"""Query for dataset references matching user-provided criteria.
"""Query for dataset references of a single dataset type.
Parameters
----------
dataset_type : `str` or `DatasetType`
Dataset type object or name to search for.
collections : collection expression, optional
A collection name or iterable of collection names to search. If not
provided, the default collections are used.
find_first : `bool`, optional
If `True` (default), for each result data ID, only yield one
`DatasetRef` of each `DatasetType`, from the first collection in
which a dataset of that dataset type appears (according to the
order of ``collections`` passed in).
data_id : `dict` or `DataCoordinate`, optional
A data ID whose key-value pairs are used as equality constraints in
the query.
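
A minimal usage sketch of this single-type query, assuming a butler with
default collections configured; the repository path, dataset type, and
data ID values below are illustrative only:

    from lsst.daf.butler import Butler

    butler = Butler("some/repo", collections=["HSC/defaults"])
    # Find-first search for one dataset type.  Additional keyword
    # arguments would be folded into the data ID constraint via
    # DataCoordinate.standardize.
    refs = butler._query_datasets(
        "calexp",
        where="instrument = 'HSC' AND visit = 12345",
    )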
@@ -1739,6 +1749,89 @@ def _query_dimension_records(
raise EmptyQueryResultError(list(result.explain_no_results()))
return dimension_records

def _query_all_datasets(
self,
collections: str | Iterable[str] | None = None,
*,
name: str | Iterable[str] = "*",
at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
exact_dimensions: Iterable[str] | DimensionGroup | None = None,
storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
is_calibration: bool | None = None,
find_first: bool = True,
data_id: DataId | None = None,
where: str = "",
bind: Mapping[str, Any] | None = None,
explain: bool = True,
**kwargs: Any,
) -> list[DatasetRef]:
"""Query for datasets of potentially multiple types.
Parameters
----------
collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
The collection or collections to search, in order. If not provided
or `None`, the default collection search path for this butler is
used.
name : `str` or `~collections.abc.Iterable` [ `str` ], optional
Names or name patterns (glob-style) that returned dataset type
names must match. If an iterable, items are OR'd together. The
default is to include all dataset types in the given collections.
at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
optional
Dimensions that returned dataset types must have as a subset.
exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
optional
Dimensions that returned dataset types must have exactly.
storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\
or `StorageClass` or \
`~collections.abc.Iterable` [ `StorageClass` ], optional
Storage classes or storage class names that returned dataset types
must have. If an iterable, items are OR'd together.
is_calibration : `bool` or `None`, optional
If not `None`, constrain returned dataset types to be or not be
calibrations.
find_first : `bool`, optional
If `True` (default), for each result data ID, only yield one
`DatasetRef` of each `DatasetType`, from the first collection in
which a dataset of that dataset type appears (according to the
order of ``collections`` passed in).
data_id : `dict` or `DataCoordinate`, optional
A data ID whose key-value pairs are used as equality constraints in
the query.
where : `str`, optional
A string expression similar to a SQL WHERE clause. May involve any
column of a dimension table or (as a shortcut for the primary key
column of a dimension table) dimension name. See
:ref:`daf_butler_dimension_expressions` for more information.
bind : `~collections.abc.Mapping`, optional
Mapping containing literal values that should be injected into the
``where`` expression, keyed by the identifiers they replace. Values
of collection type can be expanded in some cases; see
:ref:`daf_butler_dimension_expressions_identifiers` for more
information.
explain : `bool`, optional
If `True` (default), raise `EmptyQueryResultError` when the resulting
list is empty; the exception contains a non-empty list of strings
explaining possible causes for the empty result.
**kwargs
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``data_id``
argument (and may be used to provide a constraining data ID even
when the ``data_id`` argument is `None`).

Returns
-------
refs : `list` [ `DatasetRef` ]
Dataset references matching the given query criteria. Nested data
IDs are guaranteed to include values for all implied dimensions
(i.e. `DataCoordinate.hasFull` will return `True`), but will not
include dimension records (`DataCoordinate.hasRecords` will be
`False`).
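
Examples
--------
A hypothetical sketch of the intended call pattern; the collection,
name pattern, and data ID values are illustrative only::

    refs = butler._query_all_datasets(
        "HSC/runs/RC2",
        name="*_metadata",
        where="instrument = 'HSC' AND visit = my_visit",
        bind={"my_visit": 12345},
    )
    # Unlike _query_datasets, the result may mix multiple dataset
    # types matching the name pattern.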
"""
raise NotImplementedError()

@abstractmethod
def _clone(
self,