From fcd15198071a0f9e7529d0544503f62d9352e6a2 Mon Sep 17 00:00:00 2001 From: Jim Bosch Date: Wed, 21 Aug 2024 12:29:28 -0400 Subject: [PATCH 1/3] Drop out-of-date docstrings. --- python/lsst/daf/butler/_butler.py | 7 ------- python/lsst/daf/butler/queries/_query.py | 3 +-- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index fbfedd6bb7..d0a20cee64 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -1729,13 +1729,6 @@ def query_datasets( collection wildcard is passed when ``find_first`` is `True`, or when ``collections`` is `None` and default butler collections are not defined. - - Notes - ----- - When multiple dataset types are queried in a single call, the results - of this operation are equivalent to querying for each dataset type - separately in turn, and no information about the relationships between - datasets of different types is included. """ if data_id is None: data_id = DataCoordinate.make_empty(self.dimensions) diff --git a/python/lsst/daf/butler/queries/_query.py b/python/lsst/daf/butler/queries/_query.py index 75f8d8bfe7..dbd3f0b4ef 100644 --- a/python/lsst/daf/butler/queries/_query.py +++ b/python/lsst/daf/butler/queries/_query.py @@ -242,8 +242,7 @@ def datasets( If `True` (default), for each result data ID, only yield one `DatasetRef` of each `DatasetType`, from the first collection in which a dataset of that dataset type appears (according to the - order of ``collections`` passed in). If `True`, ``collections`` - must not be ``...``. + order of ``collections`` passed in). Returns ------- From 1e89bfd98b0e5d29680487a84068ad77cfd04a01 Mon Sep 17 00:00:00 2001 From: Jim Bosch Date: Wed, 21 Aug 2024 12:47:02 -0400 Subject: [PATCH 2/3] Sketch out new interfaces for querying multiple dataset types. 
--- python/lsst/daf/butler/__init__.py | 1 + python/lsst/daf/butler/_butler.py | 99 ++++++- .../lsst/daf/butler/_butler_dataset_types.py | 244 ++++++++++++++++++ python/lsst/daf/butler/queries/__init__.py | 2 + .../butler/queries/_dataset_type_results.py | 124 +++++++++ .../queries/_heterogeneous_dataset_results.py | 101 ++++++++ python/lsst/daf/butler/queries/_query.py | 87 ++++++- 7 files changed, 654 insertions(+), 4 deletions(-) create mode 100644 python/lsst/daf/butler/_butler_dataset_types.py create mode 100644 python/lsst/daf/butler/queries/_dataset_type_results.py create mode 100644 python/lsst/daf/butler/queries/_heterogeneous_dataset_results.py diff --git a/python/lsst/daf/butler/__init__.py b/python/lsst/daf/butler/__init__.py index acbfd4e929..5bb08187ae 100644 --- a/python/lsst/daf/butler/__init__.py +++ b/python/lsst/daf/butler/__init__.py @@ -38,6 +38,7 @@ from ._butler import * from ._butler_collections import * from ._butler_config import * +from ._butler_dataset_types import * from ._butler_repo_index import * from ._collection_type import CollectionType from ._column_categorization import * diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index d0a20cee64..e4f02e1da8 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -42,6 +42,7 @@ from ._butler_collections import ButlerCollections from ._butler_config import ButlerConfig, ButlerType +from ._butler_dataset_types import ButlerDatasetTypes from ._butler_instance_options import ButlerInstanceOptions from ._butler_repo_index import ButlerRepoIndex from ._config import Config, ConfigSubset @@ -841,6 +842,7 @@ def getURI( ) return primary + # TODO: RFC deprecating this in favor of butler.dataset_types.get. @abstractmethod def get_dataset_type(self, name: str) -> DatasetType: """Get the `DatasetType`. 
@@ -1505,6 +1507,16 @@ def run(self) -> str | None: """ raise NotImplementedError() + # TODO: make this abstract and implement in derived classes. + @property + def dataset_types(self) -> ButlerDatasetTypes: + """Object with methods for modifying and querying dataset types + (`~lsst.daf.butler.ButlerDatasettypes`). + + Use of this object is preferred over `registry` wherever possible. + """ + raise NotImplementedError() + @property @abstractmethod def registry(self) -> Registry: @@ -1648,7 +1660,7 @@ def query_datasets( explain: bool = True, **kwargs: Any, ) -> list[DatasetRef]: - """Query for dataset references matching user-provided criteria. + """Query for dataset references of a single dataset type. Parameters ---------- @@ -1659,7 +1671,6 @@ def query_datasets( provided, the default collections are used. Can be a wildcard if ``find_first`` is `False` (if find first is requested the order of collections matters and wildcards make the order indeterminate). - See :ref:`daf_butler_collection_expressions` for more information. find_first : `bool`, optional If `True` (default), for each result data ID, only yield one `DatasetRef` of each `DatasetType`, from the first collection in @@ -1871,6 +1882,90 @@ def query_dimension_records( raise EmptyQueryResultError(list(result.explain_no_results())) return dimension_records + def _query_all_datasets( + self, + collections: str | Iterable[str] | None = None, + *, + name: str | Iterable[str] = "*", + at_least_dimensions: Iterable[str] | DimensionGroup | None = None, + exact_dimensions: Iterable[str] | DimensionGroup | None = None, + storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None, + is_calibration: bool | None = None, + find_first: bool = True, + data_id: DataId | None = None, + where: str = "", + bind: Mapping[str, Any] | None = None, + explain: bool = True, + **kwargs: Any, + ) -> list[DatasetRef]: + """Query for datasets of potentially multiple types. 
+ + Parameters + ---------- + collections : `str` or `~collections.abc.Iterable` [ `str` ], optional + The collection or collections to search, in order. If not provided + or `None`, the default collection search path for this butler is + used. + name : `str` or `~collections.abc.Iterable` [ `str` ], optional + Names or name patterns (glob-style) that returned dataset type + names must match. If an iterable, items are OR'd together. The + default is to include all dataset types in the given collections. + at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have as a subset. + at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have exactly. + with_storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ + or `StorageClass` or \ + `~collections.abc.Iterable` [ `StorageClass` ], optional + Storage classes or storage class names that returned dataset types + must have. If an iterable, items are OR'd together. + is_calibration : `bool` or `None`, optional + If `None`, constrain returned dataset types to be or not be + calibrations. + find_first : `bool`, optional + If `True` (default), for each result data ID, only yield one + `DatasetRef` of each `DatasetType`, from the first collection in + which a dataset of that dataset type appears (according to the + order of ``collections`` passed in). + data_id : `dict` or `DataCoordinate`, optional + A data ID whose key-value pairs are used as equality constraints in + the query. + where : `str`, optional + A string expression similar to a SQL WHERE clause. May involve any + column of a dimension table or (as a shortcut for the primary key + column of a dimension table) dimension name. See + :ref:`daf_butler_dimension_expressions` for more information. 
+ bind : `~collections.abc.Mapping`, optional + Mapping containing literal values that should be injected into the + ``where`` expression, keyed by the identifiers they replace. Values + of collection type can be expanded in some cases; see + :ref:`daf_butler_dimension_expressions_identifiers` for more + information. + explain : `bool`, optional + If `True` (default) then `EmptyQueryResultError` exception is + raised when resulting list is empty. The exception contains + non-empty list of strings explaining possible causes for empty + result. + **kwargs + Additional keyword arguments are forwarded to + `DataCoordinate.standardize` when processing the ``data_id`` + argument (and may be used to provide a constraining data ID even + when the ``data_id`` argument is `None`). + + Returns + ------- + refs : `list` [ `DatasetRef` ] + Dataset references matching the given query criteria. Nested data + IDs are guaranteed to include values for all implied dimensions + (i.e. `DataCoordinate.hasFull` will return `True`), but will not + include dimension records (`DataCoordinate.hasRecords` will be + `False`). + """ + raise NotImplementedError() + + @abstractmethod def clone( self, *, diff --git a/python/lsst/daf/butler/_butler_dataset_types.py b/python/lsst/daf/butler/_butler_dataset_types.py new file mode 100644 index 0000000000..399fe27ce4 --- /dev/null +++ b/python/lsst/daf/butler/_butler_dataset_types.py @@ -0,0 +1,244 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. 
If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from __future__ import annotations + +__all__ = ("ButlerDatasetTypes",) + +from abc import ABC, abstractmethod +from collections.abc import Iterable, Sequence + +from ._dataset_type import DatasetType +from ._storage_class import StorageClass +from .dimensions import DimensionGroup + + +class ButlerDatasetTypes(ABC, Sequence): + """Methods for working with the dataset types known to the Butler.""" + + @abstractmethod + def get(self, name: str) -> DatasetType: + """Return the dataset type with the given name. + + Parameters + ---------- + name : `str` + Name of the dataset type. + + Returns + ------- + dataset_type : `DatasetType` + Dataset type object with the given name. + + Raises + ------ + MissingDatasetTypeError + Raised if there is no dataset type with the given name. 
+ """ + raise NotImplementedError() + + @abstractmethod + def query( + self, + name: str | Iterable[str], + *, + at_least_dimensions: Iterable[str] | DimensionGroup | None = None, + exact_dimensions: Iterable[str] | DimensionGroup | None = None, + storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None, + is_calibration: bool | None = None, + ) -> Iterable[DatasetType]: + """Query for dataset types matching the given criteria. + + Parameters + ---------- + name : `str` or `~collections.abc.Iterable` [ `str` ] + Names or name patterns (glob-style) that returned dataset type + names must match. If an iterable, items are OR'd together. + at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have as a subset. + exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have exactly. + storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ + or `StorageClass` or \ + `~collections.abc.Iterable` [ `StorageClass` ], optional + Storage classes or storage class names that returned dataset types + must have. If an iterable, items are OR'd together. + is_calibration : `bool` or `None`, optional + If not `None`, constrain returned dataset types to be or not be + calibrations. + + Returns + ------- + dataset_types : `~collections.abc.Iterable` [ `DatasetType` ] + An iterable of dataset types. This is guaranteed to be a regular + Python in-memory container, not a lazy single-pass iterator, but + the type of container is currently left unspecified in order to + leave room for future convenience behavior. + + Notes + ----- + This method queries all registered dataset types in the registry. 
To query + for the types of datasets that are in a collection, instead use:: + + info = butler.collections.query_info( + collections, + include_summaries=True, + ) + + for a simple summary of the dataset types in each collection (see + `lsst.daf.butler.ButlerCollections.query_info`). Or, for + more complex but powerful queries (including constraints on data IDs or + dataset counts), use:: + + with butler.query() as q: + dataset_types = q.dataset_types(collections) + + See `lsst.daf.butler.queries.Query.dataset_types` for details. + """ + raise NotImplementedError() + + @abstractmethod + def query_names( + self, + name: str | Iterable[str], + *, + at_least_dimensions: Iterable[str] | DimensionGroup | None = None, + exact_dimensions: Iterable[str] | DimensionGroup | None = None, + storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None, + is_calibration: bool | None = None, + ) -> Iterable[str]: + """Query for the names of dataset types matching the given criteria. + + Parameters + ---------- + name : `str` or `~collections.abc.Iterable` [ `str` ] + Names or name patterns (glob-style) that returned dataset type + names must match. If an iterable, items are OR'd together. + at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have as a subset. + exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have exactly. + storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ + or `StorageClass` or \ + `~collections.abc.Iterable` [ `StorageClass` ], optional + Storage classes or storage class names that returned dataset types + must have. If an iterable, items are OR'd together. + is_calibration : `bool` or `None`, optional + If not `None`, constrain returned dataset types to be or not be + calibrations. + + Returns + ------- + names : `~collections.abc.Iterable` [ `str` ] + An iterable of dataset type names. 
+ """ + raise NotImplementedError() + + @abstractmethod + def register( + self, + name_or_type: str, + /, + dimensions: Iterable[str] | DimensionGroup | None = None, + storage_class: str | StorageClass | None = None, + is_calibration: bool | None = None, + ) -> bool: + """Register a dataset type. + + It is not an error to register the same `DatasetType` twice. + + Parameters + ---------- + name_or_type : `str` or `DatasetType` + The name of the dataset type to be added, or a complete + `DatasetType` type object to add. + dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions for the dataset type. Required if the first argument + is just a `str`, and overrides the dimensions if the first argument + is a `DatasetType`. + storage_class : `str` or `StorageClass`, optional + Storage class for the dataset type. Required if the first argument + is just a `str`, and overrides the storage class if the first + argument is a `DatasetType`. + is_calibration : `bool`, optional + Whether the dataset type is a calibration. If the first argument + is a `str`, defaults to `False`. If the first argument is a + `DatasetType` and this argument is not `None`, it overrides the + value on the `DatasetType`. + + Returns + ------- + inserted : `bool` + `True` if a new dataset type was inserted, `False` if an identical + existing dataset type was found. Note that in either case the + dataset type is guaranteed to be defined in the repository + consistently with the given definition. + + Raises + ------ + ValueError + Raised if the dimensions or storage class are invalid. + lsst.daf.butler.registry.ConflictingDefinitionError + Raised if this dataset type is already registered with a different + definition. + """ + raise NotImplementedError() + + @abstractmethod + def remove(self, name: str) -> None: + """Remove the dataset type with the given name. + + .. warning:: + + Butler implementations can cache the dataset type definitions. 
+ This means that deleting the dataset type definition may result in + unexpected behavior from other butler processes that are active + that have not seen the deletion. + + Parameters + ---------- + name : `str` or `tuple` [`str`] + Name of the type to be removed or tuple containing a list of type + names to be removed. Wildcards are allowed. + + Raises + ------ + lsst.daf.butler.registry.OrphanedRecordError + Raised if an attempt is made to remove the dataset type definition + when there are still datasets associated with it. + + Notes + ----- + If the dataset type is not registered the method will return without + action. + """ + raise NotImplementedError() diff --git a/python/lsst/daf/butler/queries/__init__.py b/python/lsst/daf/butler/queries/__init__.py index 720e4ca6d1..ef695eea46 100644 --- a/python/lsst/daf/butler/queries/__init__.py +++ b/python/lsst/daf/butler/queries/__init__.py @@ -28,6 +28,8 @@ from ._base import * from ._data_coordinate_query_results import * from ._dataset_query_results import * +from ._dataset_type_results import * from ._dimension_record_query_results import * from ._general_query_results import * +from ._heterogeneous_dataset_results import * from ._query import * diff --git a/python/lsst/daf/butler/queries/_dataset_type_results.py b/python/lsst/daf/butler/queries/_dataset_type_results.py new file mode 100644 index 0000000000..5e45e80d93 --- /dev/null +++ b/python/lsst/daf/butler/queries/_dataset_type_results.py @@ -0,0 +1,124 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. 
Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from __future__ import annotations + +__all__ = ("DatasetTypeQueryResults",) + +from collections.abc import Iterable, Iterator + +from .._dataset_type import DatasetType + + +class DatasetTypeQueryResults: + """A query result object that summarizes a query for datasets by doing the + equivalent of a SQL GROUP BY on the dataset type. + """ + + def __iter__(self) -> Iterator[DatasetType]: + raise NotImplementedError() + + def names(self) -> Iterable[str]: + """Iterate over the names of the matched dataset types.""" + raise NotImplementedError() + + def by_collection( + self, + *, + flatten_chains: bool = False, + include_chains: bool | None = None, + ) -> Iterable[tuple[str, Iterable[DatasetType]]]: + """Iterate over results while grouping by collection as well as dataset + type. + + Parameters + ---------- + flatten_chains : `bool`, optional + If `True` (`False` is default), expand the child collections of + matching `~CollectionType.CHAINED` collections in the results. 
+ include_chains : `bool` or `None`, optional + If `True`, yield records for matching `~CollectionType.CHAINED` + collections. Default is the opposite of ``flatten_chains``: + include either CHAINED collections or their children, but not both. + + Returns + ------- + rows : `~collections.abc.Iterable` [ `tuple` ] + An iterable of ``(collection, dataset_types)`` pairs. The + ``dataset_types`` values are guaranteed to be regular in-memory + iterables, not lazy single-pass iterators, but the exact type + of iterable is left unspecified to leave room for future + improvements. + """ + raise NotImplementedError() + + def with_counts(self, find_first: bool = True) -> Iterable[tuple[DatasetType, int]]: + """Iterate over results with counts for the number of datasets of each + type. + + Parameters + ---------- + find_first : `bool`, optional + If `True` (default), only count unique dataset type + data ID + combinations, not shadowed datasets. + + Returns + ------- + rows : `~collections.abc.Iterable` [ `tuple` [ `DatasetType`, `int` ] ] + An iterable of ``(dataset_type, count)`` pairs. + """ + raise NotImplementedError() + + def by_collection_with_counts( + self, + *, + flatten_chains: bool = False, + include_chains: bool | None = None, + ) -> Iterable[tuple[str, Iterable[tuple[DatasetType, int]]]]: + """Iterate over results while grouping by collection as well as dataset + type, and counting the number of datasets in each combination. + + Parameters + ---------- + flatten_chains : `bool`, optional + If `True` (`False` is default), expand the child collections of + matching `~CollectionType.CHAINED` collections in the results. + include_chains : `bool` or `None`, optional + If `True`, yield records for matching `~CollectionType.CHAINED` + collections. Default is the opposite of ``flatten_chains``: + include either CHAINED collections or their children, but not both. 
+ + Returns + ------- + rows : `~collections.abc.Iterable` [ `tuple` ] + An iterable of ``(collection, dataset_types_with_counts)`` pairs, + with the latter an iterable of ``(DatasetType, int)``. + These inner iterables are guaranteed to be regular in-memory + iterables, not lazy single-pass iterators, but the exact type of + iterable is left unspecified to leave room for future improvements. + """ + raise NotImplementedError() diff --git a/python/lsst/daf/butler/queries/_heterogeneous_dataset_results.py b/python/lsst/daf/butler/queries/_heterogeneous_dataset_results.py new file mode 100644 index 0000000000..fa5362f0bd --- /dev/null +++ b/python/lsst/daf/butler/queries/_heterogeneous_dataset_results.py @@ -0,0 +1,101 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from __future__ import annotations + +__all__ = ("HeterogeneousDatasetRefQueryResults",) + +from collections.abc import Iterable, Iterator + +from .._dataset_ref import DatasetId, DatasetRef +from ._base import QueryBase + + +class HeterogeneousDatasetRefQueryResults(QueryBase): + """A query result object for datasets with multiple dataset types.""" + + def __iter__(self) -> Iterator[DatasetRef]: + raise NotImplementedError() + + def ids(self) -> Iterable[DatasetId]: + """Iterate over just the dataset IDs. + + This may return a lazy single-pass iterator or a regular in-memory + iterable, in order to allow for the possibility that it may be + upgraded into a query results object in the future. + """ + # In some cases - depending on the WHERE clause and other things joined + # in - this could result in a single query, rather than a Python-side + # aggregation of per-dimension-group queries. + raise NotImplementedError() + + def any(self, *, execute: bool = True, exact: bool = True) -> bool: + # Docstring inherited. + raise NotImplementedError("Base class implementation is not correct for this derived class.") + + def explain_no_results(self, execute: bool = True) -> Iterable[str]: + # Docstring inherited. + raise NotImplementedError("Base class implementation is not correct for this derived class.") + + def count(self, *, exact: bool = True, discard: bool = False) -> int: + """Count the number of rows this query would return. + + Parameters + ---------- + exact : `bool`, optional + If `True`, run the full query and perform post-query filtering if + needed to account for that filtering in the count. If `False`, the + result may be an upper bound. + discard : `bool`, optional + If `True`, compute the exact count even if it would require running + the full query and then throwing away the result rows after + counting them. 
If `False`, this is an error, as the user would + usually be better off executing the query first to fetch its rows + into a new query (or passing ``exact=False``). Ignored if + ``exact=False``. + + Returns + ------- + count : `int` + The number of rows the query would return, or an upper bound if + ``exact=False``. + """ + raise NotImplementedError() + + # This class intentionally lacks some attributes that are defined on other + # QueryResults objects: + # + # - 'dimensions' isn't well-defined in general. + # + # - 'order_by' and 'limit' are hard to implement in the common case where + # we have to run one query for each dimension group. + # + # - 'where' exists on other result objects because the way they are + # constructed adds context (a dataset search join, some dimensions) that + # can help interpret arguments to 'where'. That's not generally true + # here, so calling `Query.where(...).all_datasets()` can do anything that + # `Query.all_datasets().where(...)` might be able to do. 
diff --git a/python/lsst/daf/butler/queries/_query.py b/python/lsst/daf/butler/queries/_query.py index dbd3f0b4ef..4d2c98c111 100644 --- a/python/lsst/daf/butler/queries/_query.py +++ b/python/lsst/daf/butler/queries/_query.py @@ -27,6 +27,9 @@ from __future__ import annotations +from lsst.daf.butler.queries._dataset_type_results import DatasetTypeQueryResults +from lsst.daf.butler.queries._heterogeneous_dataset_results import HeterogeneousDatasetRefQueryResults + __all__ = ("Query",) from collections.abc import Iterable, Mapping, Set @@ -37,7 +40,7 @@ from .._dataset_type import DatasetType from .._exceptions import DimensionNameError, InvalidQueryError -from .._storage_class import StorageClassFactory +from .._storage_class import StorageClass, StorageClassFactory from ..dimensions import DataCoordinate, DataId, DataIdValue, DimensionGroup from ..registry import DatasetTypeError from ._base import QueryBase @@ -228,7 +231,8 @@ def datasets( *, find_first: bool = True, ) -> DatasetRefQueryResults: - """Return a result object that is a `DatasetRef` iterable. + """Return a result object that is a `DatasetRef` iterable with a single + dataset type. Parameters ---------- @@ -447,6 +451,85 @@ def general( ) return GeneralQueryResults(self._driver, tree=tree, spec=result_spec) + def all_datasets( + self, + collections: str | Iterable[str] | None = None, + *, + name: str | Iterable[str] = "*", + at_least_dimensions: Iterable[str] | DimensionGroup | None = None, + exact_dimensions: Iterable[str] | DimensionGroup | None = None, + storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None, + is_calibration: bool | None = None, + find_first: bool = True, + ) -> HeterogeneousDatasetRefQueryResults: + """Return a result object that is a `DatasetRef` iterable whose entries + may have different dataset types. 
+ + Parameters + ---------- + collections : `str` or `~collections.abc.Iterable` [ `str` ], optional + The collection or collections to search, in order. If not provided + or `None`, the default collection search path for this butler is + used. + name : `str` or `~collections.abc.Iterable` [ `str` ], optional + Names or name patterns (glob-style) that returned dataset type + names must match. If an iterable, items are OR'd together. The + default is to include all dataset types in the given collections. + at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have as a subset. + at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ + optional + Dimensions that returned dataset types must have exactly. + with_storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ + or `StorageClass` or \ + `~collections.abc.Iterable` [ `StorageClass` ], optional + Storage classes or storage class names that returned dataset types + must have. If an iterable, items are OR'd together. + is_calibration : `bool` or `None`, optional + If `None`, constrain returned dataset types to be or not be + calibrations. + find_first : `bool`, optional + If `True` (default), for each result data ID, only yield one + `DatasetRef` of each `DatasetType`, from the first collection in + which a dataset of that dataset type appears (according to the + order of ``collections`` passed in). + + Returns + ------- + refs : `.queries.HeterogeneousDatasetRefQueryResults` + Dataset references matching the given query criteria. Nested data + IDs are guaranteed to include values for all implied dimensions + (i.e. `DataCoordinate.hasFull` will return `True`), but will not + include dimension records (`DataCoordinate.hasRecords` will be + `False`). 
+ """ + raise NotImplementedError() + + def dataset_types( + self, + collections: str | Iterable[str] | None = None, + *, + name: str | Iterable[str] = "*", + at_least_dimensions: Iterable[str] | DimensionGroup | None = None, + exact_dimensions: Iterable[str] | DimensionGroup | None = None, + storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None, + is_calibration: bool | None = None, + ) -> DatasetTypeQueryResults: + """Return a result object that groups dataset queries by their dataset + type. + + See `all_datasets` for parameter descriptions. + + Returns + ------- + types : `DatasetTypeQueryResults` + A result object that iterates over `DatasetType` objects and + provides methods for grouping further by collection and/or counting + the number of datasets of each type. + """ + raise NotImplementedError() + def materialize( self, *, From 8bac470062889b919014124751a0c282705ef664 Mon Sep 17 00:00:00 2001 From: Andy Salnikov Date: Wed, 18 Sep 2024 12:22:18 -0700 Subject: [PATCH 3/3] Few fixes for docstrings --- python/lsst/daf/butler/_butler.py | 9 ++++----- python/lsst/daf/butler/queries/_query.py | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index e4f02e1da8..2cb5540368 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -1511,7 +1511,7 @@ def run(self) -> str | None: @property def dataset_types(self) -> ButlerDatasetTypes: """Object with methods for modifying and querying dataset types - (`~lsst.daf.butler.ButlerDatasettypes`). + (`~lsst.daf.butler.ButlerDatasetTypes`). Use of this object is preferred over `registry` wherever possible. """ @@ -1718,7 +1718,7 @@ def query_datasets( Returns ------- - refs : `.queries.DatasetRefQueryResults` + refs : `list` [`DatasetRef`] Dataset references matching the given query criteria. 
Nested data IDs are guaranteed to include values for all implied dimensions (i.e. `DataCoordinate.hasFull` will return `True`). @@ -1913,10 +1913,9 @@ def _query_all_datasets( at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ optional Dimensions that returned dataset types must have as a subset. - at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ - optional + exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`, optional Dimensions that returned dataset types must have exactly. - with_storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ + storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ or `StorageClass` or \ `~collections.abc.Iterable` [ `StorageClass` ], optional Storage classes or storage class names that returned dataset types diff --git a/python/lsst/daf/butler/queries/_query.py b/python/lsst/daf/butler/queries/_query.py index 4d2c98c111..dcc0d0436e 100644 --- a/python/lsst/daf/butler/queries/_query.py +++ b/python/lsst/daf/butler/queries/_query.py @@ -478,10 +478,9 @@ def all_datasets( at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ optional Dimensions that returned dataset types must have as a subset. - at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\ - optional + exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`, optional Dimensions that returned dataset types must have exactly. - with_storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ + storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\ or `StorageClass` or \ `~collections.abc.Iterable` [ `StorageClass` ], optional Storage classes or storage class names that returned dataset types @@ -506,7 +505,7 @@ def all_datasets( """ raise NotImplementedError() - def dataset_types( + def dataset_types( # numpydoc ignore=PR01 self, collections: str | Iterable[str] | None = None, *,