Sketch out new interfaces for querying multiple dataset types.

lsst · Aug 21, 2024 · c57d384 · c57d384
1 parent f86e001
commit c57d384
Show file tree

Hide file tree

Showing 7 changed files with 645 additions and 10 deletions.
diff --git a/python/lsst/daf/butler/__init__.py b/python/lsst/daf/butler/__init__.py
@@ -32,12 +32,16 @@
 # Some components are not auto-imported since they can have additional runtime
 # dependencies.
 
-from . import logging  # most symbols are helpers only
-from . import progress  # most symbols are only used by handler implementors
-from . import ddl, time_utils
+from . import (
+    ddl,
+    logging,  # most symbols are helpers only
+    progress,  # most symbols are only used by handler implementors
+    time_utils,
+)
 from ._butler import *
 from ._butler_collections import *
 from ._butler_config import *
+from ._butler_dataset_types import *
 from ._butler_repo_index import *
 from ._collection_type import CollectionType
 from ._column_categorization import *

diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
@@ -41,6 +41,7 @@
 
 from ._butler_collections import ButlerCollections
 from ._butler_config import ButlerConfig, ButlerType
+from ._butler_dataset_types import ButlerDatasetTypes
 from ._butler_instance_options import ButlerInstanceOptions
 from ._butler_repo_index import ButlerRepoIndex
 from ._config import Config, ConfigSubset
@@ -836,6 +837,7 @@ def getURI(
             )
         return primary
 
+    # TODO: RFC deprecating this in favor of butler.dataset_types.get.
     @abstractmethod
     def get_dataset_type(self, name: str) -> DatasetType:
         """Get the `DatasetType`.
@@ -1444,6 +1446,16 @@ def run(self) -> str | None:
         """
         raise NotImplementedError()
 
+    # TODO: make this abstract and implement in derived classes.
+    @property
+    def dataset_types(self) -> ButlerDatasetTypes:
+        """Object with methods for modifying and querying dataset types
+        (`~lsst.daf.butler.ButlerDatasetTypes`).
+
+        Use of this object is preferred over `registry` wherever possible.
+        """
+        raise NotImplementedError()
+
     @property
     @abstractmethod
     def registry(self) -> Registry:
@@ -1568,22 +1580,20 @@ def _query_datasets(
         explain: bool = True,
         **kwargs: Any,
     ) -> list[DatasetRef]:
-        """Query for dataset references matching user-provided criteria.
+        """Query for dataset references of a single dataset type.
 
         Parameters
         ----------
         dataset_type : `str` or `DatasetType`
             Dataset type object or name to search for.
         collections : collection expression, optional
             A collection name or iterable of collection names to search. If not
-            provided, the default collections are used.  See
-            :ref:`daf_butler_collection_expressions` for more information.
+            provided, the default collections are used.
         find_first : `bool`, optional
             If `True` (default), for each result data ID, only yield one
             `DatasetRef` of each `DatasetType`, from the first collection in
             which a dataset of that dataset type appears (according to the
-            order of ``collections`` passed in).  If `True`, ``collections``
-            must not contain regular expressions and may not be ``...``.
+            order of ``collections`` passed in).
         data_id : `dict` or `DataCoordinate`, optional
             A data ID whose key-value pairs are used as equality constraints in
             the query.
@@ -1735,6 +1745,89 @@ def _query_dimension_records(
             raise EmptyQueryResultError(list(result.explain_no_results()))
         return dimension_records
 
+    def _query_all_datasets(
+        self,
+        collections: str | Iterable[str] | None = None,
+        *,
+        name: str | Iterable[str] = "*",
+        at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
+        exact_dimensions: Iterable[str] | DimensionGroup | None = None,
+        storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
+        is_calibration: bool | None = None,
+        find_first: bool = True,
+        data_id: DataId | None = None,
+        where: str = "",
+        bind: Mapping[str, Any] | None = None,
+        explain: bool = True,
+        **kwargs: Any,
+    ) -> Iterable[DatasetRef]:
+        """Query for datasets of potentially multiple types.
+
+        Parameters
+        ----------
+        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
+            The collection or collections to search, in order.  If not provided
+            or `None`, the default collection search path for this butler is
+            used.
+        name : `str` or `~collections.abc.Iterable` [ `str` ], optional
+            Names or name patterns (glob-style) that returned dataset type
+            names must match.  If an iterable, items are OR'd together.  The
+            default is to include all dataset types in the given collections.
+        at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
+                optional
+            Dimensions that returned dataset types must have as a subset.
+        exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
+                optional
+            Dimensions that returned dataset types must have exactly.
+        storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\
+                or `StorageClass` or \
+                `~collections.abc.Iterable` [ `StorageClass` ], optional
+            Storage classes or storage class names that returned dataset types
+            must have.  If an iterable, items are OR'd together.
+        is_calibration : `bool` or `None`, optional
+            If not `None`, constrain returned dataset types to be or not be
+            calibrations.
+        find_first : `bool`, optional
+            If `True` (default), for each result data ID, only yield one
+            `DatasetRef` of each `DatasetType`, from the first collection in
+            which a dataset of that dataset type appears (according to the
+            order of ``collections`` passed in).
+        data_id : `dict` or `DataCoordinate`, optional
+            A data ID whose key-value pairs are used as equality constraints in
+            the query.
+        where : `str`, optional
+            A string expression similar to a SQL WHERE clause.  May involve any
+            column of a dimension table or (as a shortcut for the primary key
+            column of a dimension table) dimension name.  See
+            :ref:`daf_butler_dimension_expressions` for more information.
+        bind : `~collections.abc.Mapping`, optional
+            Mapping containing literal values that should be injected into the
+            ``where`` expression, keyed by the identifiers they replace. Values
+            of collection type can be expanded in some cases; see
+            :ref:`daf_butler_dimension_expressions_identifiers` for more
+            information.
+        explain : `bool`, optional
+            If `True` (default) then `EmptyQueryResultError` exception is
+            raised when resulting list is empty. The exception contains
+            non-empty list of strings explaining possible causes for empty
+            result.
+        **kwargs
+            Additional keyword arguments are forwarded to
+            `DataCoordinate.standardize` when processing the ``data_id``
+            argument (and may be used to provide a constraining data ID even
+            when the ``data_id`` argument is `None`).
+
+        Returns
+        -------
+        refs : `.queries.HeterogeneousDatasetRefQueryResults`
+            Dataset references matching the given query criteria.  Nested data
+            IDs are guaranteed to include values for all implied dimensions
+            (i.e. `DataCoordinate.hasFull` will return `True`), but will not
+            include dimension records (`DataCoordinate.hasRecords` will be
+            `False`).
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def _clone(
         self,

diff --git a/python/lsst/daf/butler/_butler_dataset_types.py b/python/lsst/daf/butler/_butler_dataset_types.py
@@ -0,0 +1,220 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively.  If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = ("ButlerDatasetTypes",)
+
+from abc import ABC, abstractmethod
+from collections.abc import Iterable, Sequence, Set
+from typing import Any, overload
+
+from pydantic import BaseModel
+
+from ._dataset_type import DatasetType
+from ._storage_class import StorageClass
+from .dimensions import DimensionGroup
+
+
+class ButlerDatasetTypes(ABC, Sequence):
+    """Methods for working with the dataset types known to the Butler."""
+
+    @abstractmethod
+    def get(self, name: str) -> DatasetType:
+        """Return the dataset type with the given name.
+
+        Returns
+        -------
+        dataset_type : `DatasetType`
+            Dataset type object with the given name.
+
+        Raises
+        ------
+        MissingDatasetTypeError
+            Raised if there is no dataset type with the given name.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def query(
+        self,
+        name: str | Iterable[str],
+        *,
+        at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
+        exact_dimensions: Iterable[str] | DimensionGroup | None = None,
+        storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
+        is_calibration: bool | None = None,
+    ) -> Iterable[DatasetType]:
+        """Query for dataset types matching the given criteria.
+
+        Parameters
+        ----------
+        name : `str` or `~collections.abc.Iterable` [ `str` ]
+            Names or name patterns (glob-style) that returned dataset type
+            names must match.  If an iterable, items are OR'd together.
+        at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
+                optional
+            Dimensions that returned dataset types must have as a subset.
+        exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
+                optional
+            Dimensions that returned dataset types must have exactly.
+        storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\
+                or `StorageClass` or \
+                `~collections.abc.Iterable` [ `StorageClass` ], optional
+            Storage classes or storage class names that returned dataset types
+            must have.  If an iterable, items are OR'd together.
+        is_calibration : `bool` or `None`, optional
+            If not `None`, constrain returned dataset types to be or not be
+            calibrations.
+
+        Returns
+        -------
+        dataset_types : `~collections.abc.Iterable` [ `DatasetType` ]
+            An iterable of dataset types.  This is guaranteed to be a regular
+            Python in-memory container, not a lazy single-pass iterator, but
+            the type of container is currently left unspecified in order to
+            leave room for future convenience behavior.
+
+        Notes
+        -----
+        This method queries all registered dataset types in registry.  To query
+        for the types of datasets that are in a collection, instead use::
+
+            info = butler.collections.query_info(
+                collections,
+                include_summaries=True,
+            )
+
+        for a simple summary of the dataset types in each collection (see
+        `lsst.daf.butler.ButlerCollections.query_info`).  Or, for
+        more complex but powerful queries (including constraints on data IDs or
+        dataset counts), use::
+
+            with butler.query() as q:
+                dataset_types = q.dataset_types(collections)
+
+        See `lsst.daf.butler.queries.Query.dataset_types` for details.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def query_names(
+        self,
+        name: str | Iterable[str],
+        *,
+        at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
+        exact_dimensions: Iterable[str] | DimensionGroup | None = None,
+        storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
+        is_calibration: bool | None = None,
+    ) -> Iterable[str]:
+        """Query for the names of dataset types matching the given criteria.
+
+        See `query` for parameter descriptions.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def register(
+        self,
+        name_or_type: str,
+        /,
+        dimensions: Iterable[str] | DimensionGroup | None = None,
+        storage_class: str | StorageClass | None = None,
+        is_calibration: bool | None = None,
+    ) -> bool:
+        """Register a dataset type.
+
+        It is not an error to register the same `DatasetType` twice.
+
+        Parameters
+        ----------
+        name_or_type : `str` or `DatasetType`
+            The name of the dataset type to be added, or a complete
+            `DatasetType` type object to add.
+        dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`,\
+                optional
+            Dimensions for the dataset type.  Required if the first argument
+            is just a `str`, and overrides the dimensions if the first argument
+            is a `DatasetType`.
+        storage_class : `str` or `StorageClass`, optional
+            Storage class for the dataset type.  Required if the first argument
+            is just a `str`, and overrides the storage class if the first
+            argument is a `DatasetType`.
+        is_calibration : `bool`, optional
+            Whether the dataset type is a calibration.  If the first argument
+            is a `str`, defaults to `False`.  If the first argument is a
+            `DatasetType` and this argument is not `None`, it overrides the
+            value on the `DatasetType`.
+
+        Returns
+        -------
+        inserted : `bool`
+            `True` if a new dataset type was inserted, `False` if an identical
+            existing dataset type was found.  Note that in either case the
+            dataset type is guaranteed to be defined in the repository
+            consistently with the given definition.
+
+        Raises
+        ------
+        ValueError
+            Raised if the dimensions or storage class are invalid.
+        lsst.daf.butler.registry.ConflictingDefinitionError
+            Raised if this dataset type is already registered with a different
+            definition.
+
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def remove(self, name: str) -> None:
+        """Remove the dataset type with the given name.
+
+        .. warning::
+
+            Butler implementations can cache the dataset type definitions.
+            This means that deleting the dataset type definition may result in
+            unexpected behavior from other butler processes that are active
+            that have not seen the deletion.
+
+        Parameters
+        ----------
+        name : `str` or `tuple` [`str`]
+            Name of the type to be removed or tuple containing a list of type
+            names to be removed. Wildcards are allowed.
+
+        Raises
+        ------
+        lsst.daf.butler.registry.OrphanedRecordError
+            Raised if an attempt is made to remove the dataset type definition
+            when there are still datasets associated with it.
+
+        Notes
+        -----
+        If the dataset type is not registered the method will return without
+        action.
+        """
+        raise NotImplementedError()
diff --git a/python/lsst/daf/butler/queries/__init__.py b/python/lsst/daf/butler/queries/__init__.py
@@ -28,6 +28,8 @@
 from ._base import *
 from ._data_coordinate_query_results import *
 from ._dataset_query_results import *
+from ._dataset_type_results import *
 from ._dimension_record_query_results import *
 from ._general_query_results import *
+from ._heterogeneous_dataset_results import *
 from ._query import *