DM-45872: Make the new query system public #1068

Merged · 30 commits · Sep 6, 2024

Commits
612276f
Handle empty query 'where' strings.
TallJimbo Aug 22, 2024
034b6ac
Rename Butler._query to Butler.query
timj Sep 3, 2024
0b29c09
Make the ButlerCollections.query methods public
timj Sep 3, 2024
bce90de
Make Butler.query_datasets and others public
timj Sep 3, 2024
372285c
Remove the simple query APIs from hybrid butler
timj Sep 4, 2024
dcf3b11
Add some simple query interface tests
timj Sep 4, 2024
c324dac
Add --limit to query-datasets and related command-line scripts
timj Sep 4, 2024
97885ea
Add --order-by support to query-datasets command-line
timj Sep 4, 2024
01c425c
Add news fragment
timj Sep 4, 2024
501b331
Add a test for query-datasets command-line with limit/order_by
timj Sep 4, 2024
bbf27d7
Be more explicit about the default status of --find-first
timj Sep 5, 2024
5f08e8e
Fix handling of find_first=True with collection wildcards
timj Sep 5, 2024
c309707
Add support for negative limit to Butler.query_datasets
timj Sep 5, 2024
9dfe525
Use same limit for command line query-datasets and butler.query_datasets
timj Sep 5, 2024
ff0d8f9
Change command-line to convert limit=0 to limit=None for API usage
timj Sep 5, 2024
0cd7c5f
Fix copy-paste bug in expression factory dataset timespan access.
TallJimbo Aug 22, 2024
aaca3f3
Change query-datasets command line to use iterators
timj Sep 5, 2024
a287d0b
Fix --show-uri for chained datasstores
timj Sep 5, 2024
75ed7ac
Add support for negative limit to query_dimension_records
timj Sep 6, 2024
8841e2d
Add negative limit support to query_data_ids
timj Sep 6, 2024
b1dc851
Make sure query-datasets complains if collection wildcard with find-f…
timj Sep 6, 2024
c7e55f9
Rename Query.general to x_general to mark as experimental
timj Sep 6, 2024
6e98bfa
Make in-memory dataset URI more informative and remove duplication
timj Sep 6, 2024
c2c7073
Set large but negative limits for query_data_ids and query_dimension_…
timj Sep 6, 2024
e14eae1
Only use ephemeral URIs when we need them in get_many_uris
timj Sep 6, 2024
1dbd433
Change query-datasets command-line to use the butler it creates
timj Sep 6, 2024
3229f5c
Fix mypy validation error by adding explicit type guard function
timj Sep 6, 2024
4eb4204
Allow MetricTestRepo to use a different storage class
timj Sep 6, 2024
8ccae72
Add a test for query-datasets uris without disassembly
timj Sep 6, 2024
e588c74
Revert "Fix --show-uri for chained datasstores"
timj Sep 6, 2024
10 changes: 10 additions & 0 deletions doc/changes/DM-45872.feature.rst
@@ -0,0 +1,10 @@
* A new query system and interface is now available using ``butler.query()`` as a context manager.
This new system is much more flexible, supports far more expressive queries, and no longer requires results to be placed in a `set` to remove duplicates.
* Added ``butler.query_datasets()``, ``butler.query_dimension_records()`` and ``butler.query_data_ids()`` as replacements for the ``butler.registry`` equivalents.
These use the new query system and are preferred over the old interfaces.
* The experimental collections querying interface is now public and called ``butler.collections.query_info`` and ``butler.collections.query``.
* The command line tools ``query-datasets``, ``associate``, ``retrieve-artifacts`` and ``transfer-datasets`` now support a ``--limit`` parameter.
The default for all except ``associate`` (which defaults to no limit) is to limit the number of results to 10,000.
A warning will be issued if the cap is hit.
* The command line tools ``query-datasets``, ``associate``, ``retrieve-artifacts`` and ``transfer-datasets`` now support ``--order-by`` to control the sorting in conjunction with ``--limit``.
For ``query-datasets`` this will also control the sorting of the reported tables.
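A brief usage sketch of the newly public interface (the repository path, dataset type, and collection names here are hypothetical; the method names and the ``limit=-20_000`` API default follow the changeset below):

    from lsst.daf.butler import Butler

    butler = Butler.from_config("my-repo")  # hypothetical repository

    # Convenience method: a negative limit caps the results and warns if
    # the cap is hit (the new API default is limit=-20_000).
    refs = butler.query_datasets(
        "raw",
        collections="HSC/raw/all",
        where="instrument = 'HSC'",
        order_by="exposure",
        limit=100,
    )

    # The full query system, used as a context manager.
    with butler.query() as query:
        results = query.data_ids(["exposure"]).order_by("exposure").limit(5)
        data_ids = list(results)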
125 changes: 100 additions & 25 deletions python/lsst/daf/butler/_butler.py
@@ -1465,13 +1465,13 @@ def registry(self) -> Registry:
raise NotImplementedError()

@abstractmethod
def _query(self) -> AbstractContextManager[Query]:
def query(self) -> AbstractContextManager[Query]:
"""Context manager returning a `Query` object used for construction
and execution of complex queries.
"""
raise NotImplementedError()

def _query_data_ids(
def query_data_ids(
self,
dimensions: DimensionGroup | Iterable[str] | str,
*,
@@ -1480,7 +1480,7 @@ def _query_data_ids(
bind: Mapping[str, Any] | None = None,
with_dimension_records: bool = False,
order_by: Iterable[str] | str | None = None,
limit: int | None = None,
limit: int | None = -20_000,
explain: bool = True,
**kwargs: Any,
) -> list[DataCoordinate]:
@@ -1514,8 +1514,13 @@ def _query_data_ids(
Names of the columns/dimensions to use for ordering returned data
IDs. Column name can be prefixed with minus (``-``) to use
descending ordering.
limit : `int`, optional
Upper limit on the number of returned records.
limit : `int` or `None`, optional
Upper limit on the number of returned records. `None` can be used
if no limit is wanted. A limit of ``0`` means that the query will
be executed and validated but no results will be returned. In this
case there will be no exception even if ``explain`` is `True`.
If a negative value is given a warning will be issued if the number
of results is capped by that limit.
explain : `bool`, optional
If `True` (default) then `EmptyQueryResultError` exception is
raised when resulting list is empty. The exception contains
@@ -1549,21 +1554,33 @@
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
if order_by is None:
order_by = []
query_limit = limit
warn_limit = False
if limit is not None and limit < 0:
query_limit = abs(limit) + 1
warn_limit = True
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.data_ids(dimensions)
.order_by(*ensure_iterable(order_by))
.limit(limit)
.limit(query_limit)
)
if with_dimension_records:
result = result.with_dimension_records()
data_ids = list(result)
if explain and not data_ids:
if warn_limit and len(data_ids) == query_limit:
# We asked for one too many so must remove that from the list.
data_ids.pop(-1)
assert limit is not None # For mypy.
_LOG.warning("More data IDs are available than the requested limit of %d.", abs(limit))
if explain and (limit is None or limit != 0) and not data_ids:
raise EmptyQueryResultError(list(result.explain_no_results()))
return data_ids

def _query_datasets(
def query_datasets(
self,
dataset_type: str | DatasetType,
collections: str | Iterable[str] | None = None,
@@ -1573,6 +1590,8 @@ def _query_datasets(
where: str = "",
bind: Mapping[str, Any] | None = None,
with_dimension_records: bool = False,
order_by: Iterable[str] | str | None = None,
limit: int | None = -20_000,
explain: bool = True,
**kwargs: Any,
) -> list[DatasetRef]:
@@ -1584,14 +1603,16 @@ def _query_datasets(
Dataset type object or name to search for.
collections : collection expression, optional
A collection name or iterable of collection names to search. If not
provided, the default collections are used. See
:ref:`daf_butler_collection_expressions` for more information.
provided, the default collections are used. Can be a wildcard if
``find_first`` is `False` (if find first is requested the order
of collections matters and wildcards make the order indeterminate).
See :ref:`daf_butler_collection_expressions` for more information.
find_first : `bool`, optional
If `True` (default), for each result data ID, only yield one
`DatasetRef` of each `DatasetType`, from the first collection in
which a dataset of that dataset type appears (according to the
order of ``collections`` passed in). If `True`, ``collections``
must not contain regular expressions and may not be ``...``.
must not contain wildcards.
data_id : `dict` or `DataCoordinate`, optional
A data ID whose key-value pairs are used as equality constraints in
the query.
@@ -1609,6 +1630,17 @@ def _query_datasets(
with_dimension_records : `bool`, optional
If `True` (default is `False`) then returned data IDs will have
dimension records.
order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
Names of the columns/dimensions to use for ordering returned data
IDs. Column name can be prefixed with minus (``-``) to use
descending ordering.
limit : `int` or `None`, optional
Upper limit on the number of returned records. `None` can be used
if no limit is wanted. A limit of ``0`` means that the query will
be executed and validated but no results will be returned. In this
case there will be no exception even if ``explain`` is `True`.
If a negative value is given a warning will be issued if the number
of results is capped by that limit.
explain : `bool`, optional
If `True` (default) then `EmptyQueryResultError` exception is
raised when resulting list is empty. The exception contains
@@ -1654,28 +1686,52 @@ def _query_datasets(
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
result = query.where(data_id, where, bind=bind, **kwargs).datasets(
dataset_type,
collections=collections,
find_first=find_first,
if order_by is None:
order_by = []
if collections:
# Wild cards need to be expanded but can only be allowed if
# find_first=False because expanding wildcards does not return
# a guaranteed ordering.
expanded_collections = self.collections.query(collections)
if find_first and set(expanded_collections) != set(ensure_iterable(collections)):
raise RuntimeError(
"Can not use wildcards in collections when find_first=True "
f" (given {collections} which expanded to {expanded_collections})"
)
collections = expanded_collections
query_limit = limit
warn_limit = False
if limit is not None and limit < 0:
query_limit = abs(limit) + 1
warn_limit = True
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.datasets(dataset_type, collections=collections, find_first=find_first)
.order_by(*ensure_iterable(order_by))
.limit(query_limit)
)
if with_dimension_records:
result = result.with_dimension_records()
refs = list(result)
if explain and not refs:
if warn_limit and len(refs) == query_limit:
# We asked for one too many so must remove that from the list.
refs.pop(-1)
assert limit is not None # For mypy.
_LOG.warning("More datasets are available than the requested limit of %d.", abs(limit))
if explain and (limit is None or limit != 0) and not refs:
raise EmptyQueryResultError(list(result.explain_no_results()))
return refs

def _query_dimension_records(
def query_dimension_records(
self,
element: str,
*,
data_id: DataId | None = None,
where: str = "",
bind: Mapping[str, Any] | None = None,
order_by: Iterable[str] | str | None = None,
limit: int | None = None,
limit: int | None = -20_000,
explain: bool = True,
**kwargs: Any,
) -> list[DimensionRecord]:
@@ -1702,8 +1758,13 @@ def _query_dimension_records(
Names of the columns/dimensions to use for ordering returned data
IDs. Column name can be prefixed with minus (``-``) to use
descending ordering.
limit : `int`, optional
Upper limit on the number of returned records.
limit : `int` or `None`, optional
Upper limit on the number of returned records. `None` can be used
if no limit is wanted. A limit of ``0`` means that the query will
be executed and validated but no results will be returned. In this
case there will be no exception even if ``explain`` is `True`.
If a negative value is given a warning will be issued if the number
of results is capped by that limit.
explain : `bool`, optional
If `True` (default) then `EmptyQueryResultError` exception is
raised when resulting list is empty. The exception contains
@@ -1738,15 +1799,29 @@ def _query_dimension_records(
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
if order_by is None:
order_by = []
query_limit = limit
warn_limit = False
if limit is not None and limit < 0:
query_limit = abs(limit) + 1
warn_limit = True
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.dimension_records(element)
.order_by(*ensure_iterable(order_by))
.limit(limit)
.limit(query_limit)
)
dimension_records = list(result)
if explain and not dimension_records:
if warn_limit and len(dimension_records) == query_limit:
# We asked for one too many so must remove that from the list.
dimension_records.pop(-1)
assert limit is not None # For mypy.
_LOG.warning(
"More dimension records are available than the requested limit of %d.", abs(limit)
)
if explain and (limit is None or limit != 0) and not dimension_records:
raise EmptyQueryResultError(list(result.explain_no_results()))
return dimension_records

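The capping convention shared by the three query methods above can be stated on its own; an illustrative sketch of the pattern (not library code):

    import logging

    _LOG = logging.getLogger(__name__)

    def capped_results(rows: list, limit: int | None) -> list:
        # Negative-limit convention: the query was issued for
        # abs(limit) + 1 rows, so an extra returned row reveals that the
        # results were truncated at the cap.
        if limit is None or limit >= 0:
            return rows
        cap = abs(limit)
        if len(rows) > cap:
            del rows[cap:]  # drop the sentinel row
            _LOG.warning("More results are available than the requested limit of %d.", cap)
        return rows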
10 changes: 3 additions & 7 deletions python/lsst/daf/butler/_butler_collections.py
@@ -218,7 +218,7 @@ def remove_from_chain(
"""
raise NotImplementedError()

def x_query(
def query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
@@ -227,8 +227,6 @@ def x_query(
) -> Sequence[str]:
"""Query the butler for collections matching an expression.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
@@ -260,7 +258,7 @@ def x_query(

The default implementation is a wrapper around `x_query_info`.
"""
collections_info = self.x_query_info(
collections_info = self.query_info(
expression,
collection_types=collection_types,
flatten_chains=flatten_chains,
@@ -269,7 +267,7 @@ def x_query_info(
return [info.name for info in collections_info]

@abstractmethod
def x_query_info(
def query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
Expand All @@ -281,8 +279,6 @@ def x_query_info(
"""Query the butler for collections matching an expression and
return detailed information about those collections.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
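With the experimental ``x_`` prefix dropped, the collection query methods are now called directly; a hypothetical usage sketch continuing the example above (collection names invented):

    # ``query`` expands an expression, optionally flattening chained
    # collections into their members.
    names = butler.collections.query("HSC/runs/*", flatten_chains=True)

    # ``query_info`` returns records carrying at least a ``name`` field;
    # the default ``query`` implementation wraps it, as shown above.
    for info in butler.collections.query_info("HSC/runs/*"):
        print(info.name)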
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/cli/cmd/commands.py
@@ -78,7 +78,7 @@
@click.command(cls=ButlerCommand, short_help="Add existing datasets to a tagged collection.")
@repo_argument(required=True)
@collection_argument(help="COLLECTION is the collection the datasets should be associated with.")
@query_datasets_options(repo=False, showUri=False, useArguments=False)
@query_datasets_options(repo=False, showUri=False, useArguments=False, default_limit=0, use_order_by=False)
@options_file_option()
def associate(**kwargs: Any) -> None:
"""Add existing datasets to a tagged collection; searches for datasets with
26 changes: 23 additions & 3 deletions python/lsst/daf/butler/cli/opt/optionGroups.py
@@ -33,7 +33,7 @@

from ..utils import OptionGroup, unwrap, where_help
from .arguments import glob_argument, repo_argument
from .options import collections_option, dataset_type_option, where_option
from .options import collections_option, dataset_type_option, limit_option, order_by_option, where_option


class query_datasets_options(OptionGroup): # noqa: N801
@@ -47,9 +47,20 @@ class query_datasets_options(OptionGroup):  # noqa: N801
Whether to include the dataset URI.
useArguments : `bool`
Whether this is an argument or an option.
default_limit : `int`
The default value to use for the limit parameter.
use_order_by : `bool`
Whether to include an order_by option.
"""

def __init__(self, repo: bool = True, showUri: bool = True, useArguments: bool = True) -> None:
def __init__(
self,
repo: bool = True,
showUri: bool = True,
useArguments: bool = True,
default_limit: int = -20_000,
use_order_by: bool = True,
) -> None:
self.decorators = []
if repo:
if not useArguments:
@@ -78,8 +89,9 @@ def __init__(self, repo: bool = True, showUri: bool = True, useArguments: bool =
collections_option(),
where_option(help=where_help),
click.option(
"--find-first",
"--find-first/--no-find-first",
is_flag=True,
default=False,
help=unwrap(
"""For each result data ID, only yield one DatasetRef of each
DatasetType, from the first collection in which a dataset of that dataset
Expand All @@ -88,8 +100,16 @@ def __init__(self, repo: bool = True, showUri: bool = True, useArguments: bool =
contain wildcards."""
),
),
limit_option(
help="Limit the number of results that are processed. 0 means no limit. A negative "
"value specifies a cap where a warning will be issued if the cap is hit. "
f"Default value is {default_limit}.",
default=default_limit,
),
]
)
if use_order_by:
self.decorators.append(order_by_option())
if showUri:
self.decorators.append(
click.option("--show-uri", is_flag=True, help="Show the dataset URI in results.")
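For context, the option group is applied as a stacked set of decorators, as the ``associate`` command above demonstrates; a minimal hypothetical command using the new knobs might look like:

    from typing import Any

    import click

    from lsst.daf.butler.cli.opt.optionGroups import query_datasets_options

    @click.command()
    @query_datasets_options(repo=True, showUri=False, default_limit=-20_000)
    def demo_query(**kwargs: Any) -> None:
        # kwargs now includes collections, where, find_first, limit and,
        # because use_order_by defaults to True, order_by.
        ...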
14 changes: 14 additions & 0 deletions python/lsst/daf/butler/datastores/chainedDatastore.py
@@ -651,6 +651,8 @@ def getManyURIs(
# Docstring inherited

uris: dict[DatasetRef, DatasetRefURIs] = {}
ephemeral_uris: dict[DatasetRef, DatasetRefURIs] = {}

missing_refs = set(refs)

# If predict is True we don't want to predict a dataset in the first
@@ -666,11 +668,23 @@
except NotImplementedError:
# some datastores may not implement generating URIs
continue
if datastore.isEphemeral:
# Only use these as last resort so do not constrain
# subsequent queries.
ephemeral_uris.update(got_uris)
continue

missing_refs -= got_uris.keys()
uris.update(got_uris)
if not missing_refs:
break

if missing_refs and ephemeral_uris:
ephemeral_refs = missing_refs.intersection(ephemeral_uris.keys())
for ref in ephemeral_refs:
uris[ref] = ephemeral_uris[ref]
missing_refs.remove(ref)

if missing_refs and not allow_missing:
raise FileNotFoundError(f"Dataset(s) {missing_refs} not in this datastore.")
