DM-45872: Make the new query system public #1068

Merged: 30 commits, Sep 6, 2024
Changes from 10 commits
Commits:
612276f
Handle empty query 'where' strings.
TallJimbo Aug 22, 2024
034b6ac
Rename Butler._query to Butler.query
timj Sep 3, 2024
0b29c09
Make the ButlerCollections.query methods public
timj Sep 3, 2024
bce90de
Make Butler.query_datasets and others public
timj Sep 3, 2024
372285c
Remove the simple query APIs from hybrid butler
timj Sep 4, 2024
dcf3b11
Add some simple query interface tests
timj Sep 4, 2024
c324dac
Add --limit to query-datasets and related command-line scripts
timj Sep 4, 2024
97885ea
Add --order-by support to query-datasets command-line
timj Sep 4, 2024
01c425c
Add news fragment
timj Sep 4, 2024
501b331
Add a test for query-datasets command-line with limit/order_by
timj Sep 4, 2024
bbf27d7
Be more explicit about the default status of --find-first
timj Sep 5, 2024
5f08e8e
Fix handling of find_first=True with collection wildcards
timj Sep 5, 2024
c309707
Add support for negative limit to Butler.query_datasets
timj Sep 5, 2024
9dfe525
Use same limit for command line query-datasets and butler.query_datasets
timj Sep 5, 2024
ff0d8f9
Change command-line to convert limit=0 to limit=None for API usage
timj Sep 5, 2024
0cd7c5f
Fix copy-paste bug in expression factory dataset timespan access.
TallJimbo Aug 22, 2024
aaca3f3
Change query-datasets command line to use iterators
timj Sep 5, 2024
a287d0b
Fix --show-uri for chained datastores
timj Sep 5, 2024
75ed7ac
Add support for negative limit to query_dimension_records
timj Sep 6, 2024
8841e2d
Add negative limit support to query_data_ids
timj Sep 6, 2024
b1dc851
Make sure query-datasets complains if collection wildcard with find-f…
timj Sep 6, 2024
c7e55f9
Rename Query.general to x_general to mark as experimental
timj Sep 6, 2024
6e98bfa
Make in-memory dataset URI more informative and remove duplication
timj Sep 6, 2024
c2c7073
Set large but negative limits for query_data_ids and query_dimension_…
timj Sep 6, 2024
e14eae1
Only use ephemeral URIs when we need them in get_many_uris
timj Sep 6, 2024
1dbd433
Change query-datasets command-line to use the butler it creates
timj Sep 6, 2024
3229f5c
Fix mypy validation error by adding explicit type guard function
timj Sep 6, 2024
4eb4204
Allow MetricTestRepo to use a different storage class
timj Sep 6, 2024
8ccae72
Add a test for query-datasets uris without disassembly
timj Sep 6, 2024
e588c74
Revert "Fix --show-uri for chained datastores"
timj Sep 6, 2024
10 changes: 10 additions & 0 deletions doc/changes/DM-45872.feature.rst
@@ -0,0 +1,10 @@
* A new query system and interface is now available using ``butler.query()`` as a context manager.
This new system is much more flexible and supports far more expressive queries, and it no longer requires the results to be placed in a `set` to remove duplication.
* Added ``butler.query_datasets()``, ``butler.query_dimension_records()`` and ``butler.query_data_ids()`` as replacements for the ``butler.registry`` equivalents.
These use the new query system and are preferred over the old interfaces.
* The experimental collections querying interface is now public and called ``butler.collections.query_info`` and ``butler.collections.query``.
* The command line tools ``query-datasets``, ``associate``, ``retrieve-artifacts`` and ``transfer-datasets`` now support a ``--limit`` parameter.
The default for all except ``associate`` (which defaults to no limit) is to limit the number of results to 10,000.
A warning will be issued if the cap is hit.
* The command line tools ``query-datasets``, ``associate``, ``retrieve-artifacts`` and ``transfer-datasets`` now support ``--order-by`` to control the sorting in conjunction with ``--limit``.
For ``query-datasets`` this will also control the sorting of the reported tables.
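Taken together, the pieces in this news fragment combine as follows; a minimal sketch, assuming an existing repository at ``my_repo`` and a ``calexp`` dataset type (both hypothetical):

```python
from lsst.daf.butler import Butler

butler = Butler.from_config("my_repo")

# Simple interface: the public replacement for butler.registry.queryDatasets.
refs = butler.query_datasets(
    "calexp",
    collections="HSC/runs/*",  # collection expression; wildcards are allowed
    where="visit > 100",
    order_by="-visit",         # minus prefix requests descending order
    limit=100,
)

# Full interface: the context manager exposes the flexible query system.
with butler.query() as query:
    for data_id in query.where("instrument = 'HSC'").data_ids(["visit", "detector"]):
        print(data_id)
```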
41 changes: 29 additions & 12 deletions python/lsst/daf/butler/_butler.py
@@ -1465,13 +1465,13 @@ def registry(self) -> Registry:
raise NotImplementedError()

@abstractmethod
def _query(self) -> AbstractContextManager[Query]:
def query(self) -> AbstractContextManager[Query]:
"""Context manager returning a `Query` object used for construction
and execution of complex queries.
"""
raise NotImplementedError()

def _query_data_ids(
def query_data_ids(
self,
dimensions: DimensionGroup | Iterable[str] | str,
*,
@@ -1549,7 +1549,9 @@ def _query_data_ids(
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
if order_by is None:
order_by = []
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.data_ids(dimensions)
@@ -1563,7 +1565,7 @@ def _query_data_ids(
raise EmptyQueryResultError(list(result.explain_no_results()))
return data_ids

def _query_datasets(
def query_datasets(
self,
dataset_type: str | DatasetType,
collections: str | Iterable[str] | None = None,
@@ -1573,6 +1575,8 @@ def _query_datasets(
where: str = "",
bind: Mapping[str, Any] | None = None,
with_dimension_records: bool = False,
order_by: Iterable[str] | str | None = None,
limit: int = 20_000,
explain: bool = True,
**kwargs: Any,
) -> list[DatasetRef]:
@@ -1584,7 +1588,7 @@ def _query_datasets(
Dataset type object or name to search for.
collections : collection expression, optional
A collection name or iterable of collection names to search. If not
provided, the default collections are used. See
provided, the default collections are used. Can be a wildcard. See
:ref:`daf_butler_collection_expressions` for more information.
find_first : `bool`, optional
If `True` (default), for each result data ID, only yield one
@@ -1609,6 +1613,12 @@ def _query_datasets(
with_dimension_records : `bool`, optional
If `True` (default is `False`) then returned data IDs will have
dimension records.
order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
Names of the columns/dimensions to use for ordering returned data
IDs. Column name can be prefixed with minus (``-``) to use
descending ordering.
limit : `int`, optional
Upper limit on the number of returned records.
explain : `bool`, optional
If `True` (default) then `EmptyQueryResultError` exception is
raised when resulting list is empty. The exception contains
@@ -1654,11 +1664,16 @@
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
result = query.where(data_id, where, bind=bind, **kwargs).datasets(
dataset_type,
collections=collections,
find_first=find_first,
if order_by is None:
order_by = []
if collections:
collections = self.collections.query(collections)
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.datasets(dataset_type, collections=collections, find_first=find_first)
.order_by(*ensure_iterable(order_by))
.limit(limit)
)
if with_dimension_records:
result = result.with_dimension_records()
@@ -1667,7 +1682,7 @@ def _query_datasets(
raise EmptyQueryResultError(list(result.explain_no_results()))
return refs

def _query_dimension_records(
def query_dimension_records(
self,
element: str,
*,
@@ -1738,7 +1753,9 @@ def _query_dimension_records(
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
if order_by is None:
order_by = []
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.dimension_records(element)
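The ``query_datasets`` hunk above now expands collection wildcards through the public ``collections.query`` before building the dataset search, which keeps ``find_first=True`` well-defined. A sketch of the equivalent explicit calls (collection and dataset names hypothetical):

```python
# What query_datasets now does internally, spelled out by hand.
resolved = butler.collections.query("HSC/runs/*")  # expand wildcards first
with butler.query() as query:
    result = (
        query.where("instrument = 'HSC' AND visit > 100")
        .datasets("calexp", collections=resolved, find_first=True)
        .order_by("-visit")
        .limit(1000)
    )
    refs = list(result.with_dimension_records())
```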
10 changes: 3 additions & 7 deletions python/lsst/daf/butler/_butler_collections.py
@@ -218,7 +218,7 @@ def remove_from_chain(
"""
raise NotImplementedError()

def x_query(
def query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
@@ -227,8 +227,6 @@ def x_query(
) -> Sequence[str]:
"""Query the butler for collections matching an expression.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
@@ -260,7 +258,7 @@

The default implementation is a wrapper around `query_info`.
"""
collections_info = self.x_query_info(
collections_info = self.query_info(
expression,
collection_types=collection_types,
flatten_chains=flatten_chains,
@@ -269,7 +267,7 @@ def x_query(
return [info.name for info in collections_info]

@abstractmethod
def x_query_info(
def query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
@@ -281,8 +279,6 @@ def x_query_info(
"""Query the butler for collections matching an expression and
return detailed information about those collections.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
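With the ``x_`` prefix dropped, the collection-query helpers are part of the stable public API; a brief sketch, with hypothetical collection names:

```python
# Names of collections matching an expression, flattening chained collections.
names = butler.collections.query("HSC/runs/*", flatten_chains=True)

# Detailed per-collection information, including dataset-type summaries.
for info in butler.collections.query_info("HSC/defaults", include_summary=True):
    print(info.name)
```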
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/cli/cmd/commands.py
@@ -78,7 +78,7 @@
@click.command(cls=ButlerCommand, short_help="Add existing datasets to a tagged collection.")
@repo_argument(required=True)
@collection_argument(help="COLLECTION is the collection the datasets should be associated with.")
@query_datasets_options(repo=False, showUri=False, useArguments=False)
@query_datasets_options(repo=False, showUri=False, useArguments=False, default_limit=0, use_order_by=False)
@options_file_option()
def associate(**kwargs: Any) -> None:
"""Add existing datasets to a tagged collection; searches for datasets with
23 changes: 21 additions & 2 deletions python/lsst/daf/butler/cli/opt/optionGroups.py
@@ -33,7 +33,7 @@

from ..utils import OptionGroup, unwrap, where_help
from .arguments import glob_argument, repo_argument
from .options import collections_option, dataset_type_option, where_option
from .options import collections_option, dataset_type_option, limit_option, order_by_option, where_option


class query_datasets_options(OptionGroup): # noqa: N801
@@ -47,9 +47,20 @@ class query_datasets_options(OptionGroup):  # noqa: N801
Whether to include the dataset URI.
useArguments : `bool`
Whether this is an argument or an option.
default_limit : `int`
The default value to use for the limit parameter.
use_order_by : `bool`
Whether to include an order_by option.
"""

def __init__(self, repo: bool = True, showUri: bool = True, useArguments: bool = True) -> None:
def __init__(
self,
repo: bool = True,
showUri: bool = True,
useArguments: bool = True,
default_limit: int = -10_000,
use_order_by: bool = True,
) -> None:
self.decorators = []
if repo:
if not useArguments:
@@ -88,8 +99,16 @@ def __init__(self, repo: bool = True, showUri: bool = True, useArguments: bool =
contain wildcards."""
),
),
limit_option(
help="Limit the number of results that are processed. 0 means no limit. A negative "
"value specifies a cap where a warning will be issued if the cap is hit. "
f"Default value is {default_limit}.",
default=default_limit,
),
]
)
if use_order_by:
self.decorators.append(order_by_option())
if showUri:
self.decorators.append(
click.option("--show-uri", is_flag=True, help="Show the dataset URI in results.")
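The ``-10_000`` default encodes the convention described in the news fragment: ``0`` means unlimited, a positive value is a plain limit, and a negative value caps results at ``abs(limit)`` and warns when the cap is hit. A minimal sketch of that convention (an illustrative helper, not the actual implementation):

```python
import warnings
from collections.abc import Iterable, Iterator
from typing import Any


def apply_limit(results: Iterable[Any], limit: int) -> Iterator[Any]:
    """Apply the CLI limit convention: 0 = unlimited, negative = warn on cap."""
    if limit == 0:
        yield from results
        return
    cap = abs(limit)
    for n, item in enumerate(results):
        if n >= cap:
            # Only a negative limit requests the warning; a positive
            # limit is a plain hard limit.
            if limit < 0:
                warnings.warn(f"More results were available; truncated at {cap}.")
            return
        yield item
```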
6 changes: 3 additions & 3 deletions python/lsst/daf/butler/direct_butler/_direct_butler.py
@@ -1396,9 +1396,9 @@ def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
collectionType = self._registry.getCollectionType(name)
if collectionType is not CollectionType.RUN:
raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
with self._query() as query:
with self.query() as query:
# Work out the dataset types that are relevant.
collections_info = self.collections.x_query_info(name, include_summary=True)
collections_info = self.collections.query_info(name, include_summary=True)
filtered_dataset_types = self.collections._filter_dataset_types(
all_dataset_types, collections_info
)
@@ -2201,7 +2201,7 @@ def dimensions(self) -> DimensionUniverse:
# Docstring inherited.
return self._registry.dimensions

def _query(self) -> contextlib.AbstractContextManager[Query]:
def query(self) -> contextlib.AbstractContextManager[Query]:
# Docstring inherited.
return self._registry._query()

@@ -81,7 +81,7 @@ def remove_from_chain(
parent_collection_name, list(ensure_iterable(child_collection_names))
)

def x_query(
def query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
@@ -99,7 +99,7 @@ def x_query(
includeChains=include_chains,
)

def x_query_info(
def query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
3 changes: 2 additions & 1 deletion python/lsst/daf/butler/queries/_expression_strings.py
@@ -80,7 +80,8 @@ def convert_expression_string_to_predicate(
tree = parse_expression(expression)
except Exception as exc:
raise InvalidQueryError(f"Failed to parse expression '{expression}'") from exc

if tree is None:
return Predicate.from_bool(True)
converter = _ConversionVisitor(context, universe)
predicate = tree.visit(converter)
assert isinstance(
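Together with the ``parse_expression`` change below, an empty ``where`` string now parses to the always-true predicate instead of raising, so a blank ``--where`` simply matches everything (hypothetical dataset type):

```python
# These two calls are now equivalent: an empty 'where' matches everything.
refs = butler.query_datasets("calexp", where="", limit=100)
refs = butler.query_datasets("calexp", limit=100)
```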
4 changes: 2 additions & 2 deletions python/lsst/daf/butler/queries/_query.py
@@ -143,7 +143,7 @@ def expression_factory(self) -> ExpressionFactory:
variable, and then its (dynamic) attributes can be used to obtain
references to columns that can be included in a query::

with butler._query() as query:
with butler.query() as query:
x = query.expression_factory
query = query.where(
x.instrument == "LSSTCam",
@@ -161,7 +161,7 @@ def expression_factory(self) -> ExpressionFactory:
`~CollectionType.CALIBRATION` collection searches) can be obtained with
dict-like access instead::

with butler._query() as query:
with butler.query() as query:
query = query.order_by(x["raw"].ingest_date)

Expression proxy objects that correspond to scalar columns overload the
@@ -134,6 +134,7 @@ def make_string_expression_predicate(
tree = parse_expression(string)
except Exception as exc:
raise UserExpressionSyntaxError(f"Failed to parse user expression {string!r}.") from exc
assert tree is not None, "Should only happen if the string is empty, and that's handled above."
if bind is None:
bind = {}
if bind:
@@ -31,7 +31,7 @@
from .parserYacc import ParserYacc # type: ignore


def parse_expression(expression: str) -> Node:
def parse_expression(expression: str) -> Node | None:
"""Given a Butler query expression string, convert it to a tree form.

Parameters
@@ -41,7 +41,7 @@ def parse_expression(expression: str) -> Node:

Returns
-------
node : `Node`
node : `Node` or `None`
Tree form of expression query.
"""
parser = ParserYacc()
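Callers must now handle the ``None`` that ``parse_expression`` returns for an empty expression, exactly as the ``_expression_strings.py`` hunk above does; condensed into a sketch (``converter`` is the visitor from that hunk, and the import path is assumed):

```python
from lsst.daf.butler.queries.tree import Predicate  # import path assumed


def expression_to_predicate(expression: str, converter) -> Predicate:
    """Condensed caller pattern for the new Node | None return type."""
    tree = parse_expression(expression)  # now returns Node | None
    if tree is None:
        # Empty expression: trivially-true predicate, matching everything.
        return Predicate.from_bool(True)
    return tree.visit(converter)
```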
8 changes: 4 additions & 4 deletions python/lsst/daf/butler/remote_butler/_registry.py
@@ -421,7 +421,7 @@ def queryDatasets(

query_results = [
QueryDriverDatasetRefQueryResults(
self._butler._query,
self._butler.query,
args,
dataset_type=dt,
find_first=findFirst,
@@ -457,7 +457,7 @@ def queryDataIds(
dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections
)
return QueryDriverDataCoordinateQueryResults(
self._butler._query, dimensions=dimensions, expanded=False, args=args
self._butler.query, dimensions=dimensions, expanded=False, args=args
)

def queryDimensionRecords(
@@ -480,7 +480,7 @@ def queryDimensionRecords(
dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections
)

return QueryDriverDimensionRecordQueryResults(self._butler._query, element, args)
return QueryDriverDimensionRecordQueryResults(self._butler.query, element, args)

def _convert_common_query_arguments(
self,
@@ -519,7 +519,7 @@ def queryDatasetAssociations(
resolved_collections = self.queryCollections(
collections, datasetType=datasetType, collectionTypes=collectionTypes, flattenChains=flattenChains
)
with self._butler._query() as query:
with self._butler.query() as query:
query = query.join_dataset_search(datasetType, resolved_collections)
result = query.general(
datasetType.dimensions,
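After these renames the old ``registry`` query methods are thin wrappers over the same query driver as the public API. Per the news fragment the new methods are preferred and already deduplicate; roughly (hypothetical names; note the differing ``find_first`` and ``limit`` defaults):

```python
# Old style: duplicates across collections had to be removed with set().
refs_old = set(butler.registry.queryDatasets("calexp", collections=["HSC/runs/1"]))

# New style: results arrive deduplicated; explain=False skips the
# EmptyQueryResultError raised when the result list is empty.
refs_new = butler.query_datasets(
    "calexp", collections=["HSC/runs/1"], find_first=False, explain=False
)
```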
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -570,7 +570,7 @@ def registry(self) -> Registry:
return self._registry

@contextmanager
def _query(self) -> Iterator[Query]:
def query(self) -> Iterator[Query]:
driver = RemoteQueryDriver(self, self._connection)
with driver:
query = Query(driver)
@@ -71,7 +71,7 @@ def remove_from_chain(
) -> None:
raise NotImplementedError("Not yet available")

def x_query_info(
def query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
7 changes: 7 additions & 0 deletions python/lsst/daf/butler/script/_associate.py
@@ -40,6 +40,7 @@ def associate(
collections: Iterable[str],
where: str,
find_first: bool,
limit: int,
) -> None:
"""Add existing datasets to a CHAINED collection.

Expand All @@ -57,6 +58,10 @@ def associate(
Query string.
find_first : `bool`
Whether to find the first match or not.
limit : `int`
Limit the number of results to be returned. A value of 0 means
unlimited. A negative value is used to specify a cap where a warning
is issued if that cap is hit.
"""
butler = Butler.from_config(repo, writeable=True, without_datastore=True)

@@ -68,6 +73,8 @@
collections=collections,
where=where,
find_first=find_first,
limit=limit,
order_by=(),
show_uri=False,
repo=None,
)
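A hypothetical call of the updated script function with the new ``limit`` threaded through (the names of the leading parameters are assumed, since the hunk does not show them):

```python
from lsst.daf.butler.script import associate  # import path assumed

associate(
    repo="my_repo",                     # assumed parameter name
    collection="my/tagged-collection",  # assumed parameter name
    dataset_type=("calexp",),           # assumed parameter name
    collections=("HSC/runs/*",),
    where="",
    find_first=True,
    limit=0,  # associate's command-line default: no limit
)
```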