DM-45872: Make the new query system public #1068

Merged: 30 commits, Sep 6, 2024
Changes from 10 commits
Commits:
612276f
Handle empty query 'where' strings.
TallJimbo Aug 22, 2024
034b6ac
Rename Butler._query to Butler.query
timj Sep 3, 2024
0b29c09
Make the ButlerCollections.query methods public
timj Sep 3, 2024
bce90de
Make Butler.query_datasets and others public
timj Sep 3, 2024
372285c
Remove the simple query APIs from hybrid butler
timj Sep 4, 2024
dcf3b11
Add some simple query interface tests
timj Sep 4, 2024
c324dac
Add --limit to query-datasets and related command-line scripts
timj Sep 4, 2024
97885ea
Add --order-by support to query-datasets command-line
timj Sep 4, 2024
01c425c
Add news fragment
timj Sep 4, 2024
501b331
Add a test for query-datasets command-line with limit/order_by
timj Sep 4, 2024
bbf27d7
Be more explicit about the default status of --find-first
timj Sep 5, 2024
5f08e8e
Fix handling of find_first=True with collection wildcards
timj Sep 5, 2024
c309707
Add support for negative limit to Butler.query_datasets
timj Sep 5, 2024
9dfe525
Use same limit for command line query-datasets and butler.query_datasets
timj Sep 5, 2024
ff0d8f9
Change command-line to convert limit=0 to limit=None for API usage
timj Sep 5, 2024
0cd7c5f
Fix copy-paste bug in expression factory dataset timespan access.
TallJimbo Aug 22, 2024
aaca3f3
Change query-datasets command line to use iterators
timj Sep 5, 2024
a287d0b
Fix --show-uri for chained datastores
timj Sep 5, 2024
75ed7ac
Add support for negative limit to query_dimension_records
timj Sep 6, 2024
8841e2d
Add negative limit support to query_data_ids
timj Sep 6, 2024
b1dc851
Make sure query-datasets complains if collection wildcard with find-f…
timj Sep 6, 2024
c7e55f9
Rename Query.general to x_general to mark as experimental
timj Sep 6, 2024
6e98bfa
Make in-memory dataset URI more informative and remove duplication
timj Sep 6, 2024
c2c7073
Set large but negative limits for query_data_ids and query_dimension_…
timj Sep 6, 2024
e14eae1
Only use ephemeral URIs when we need them in get_many_uris
timj Sep 6, 2024
1dbd433
Change query-datasets command-line to use the butler it creates
timj Sep 6, 2024
3229f5c
Fix mypy validation error by adding explicit type guard function
timj Sep 6, 2024
4eb4204
Allow MetricTestRepo to use a different storage class
timj Sep 6, 2024
8ccae72
Add a test for query-datasets uris without disassembly
timj Sep 6, 2024
e588c74
Revert "Fix --show-uri for chained datastores"
timj Sep 6, 2024
10 changes: 10 additions & 0 deletions doc/changes/DM-45872.feature.rst
@@ -0,0 +1,10 @@
* A new query system and interface is now available using ``butler.query()`` as a context manager.
This new system is much more flexible and supports far more expressive queries, and it no longer requires the results to be placed in a `set` to remove duplication.
* Added ``butler.query_datasets()``, ``butler.query_dimension_records()`` and ``butler.query_data_ids()`` as replacements for the ``butler.registry`` equivalents.
These use the new query system and are preferred over the old interfaces.
* The experimental collections querying interface is now public and called ``butler.collections.query_info`` and ``butler.collections.query``.
* The command line tools ``query-datasets``, ``associate``, ``retrieve-artifacts`` and ``transfer-datasets`` now support a ``--limit`` parameter.
The default for all except ``associate`` (which defaults to no limit) is to limit the number of results to 10,000.
A warning will be issued if the cap is hit.
* The command line tools ``query-datasets``, ``associate``, ``retrieve-artifacts`` and ``transfer-datasets`` now support ``--order-by`` to control the sorting in conjunction with ``--limit``.
For ``query-datasets`` this will also control the sorting of the reported tables.
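Taken together, the pieces in this news fragment combine as follows; a minimal sketch, assuming an existing repository at ``my_repo`` and a ``calexp`` dataset type (both hypothetical):

```python
from lsst.daf.butler import Butler

butler = Butler.from_config("my_repo")

# Simple interface: the public replacement for butler.registry.queryDatasets.
refs = butler.query_datasets(
    "calexp",
    collections="HSC/runs/*",  # collection expression; wildcards are allowed
    where="visit > 100",
    order_by="-visit",         # minus prefix requests descending order
    limit=100,
)

# Full interface: the context manager exposes the flexible query system.
with butler.query() as query:
    for data_id in query.where("instrument = 'HSC'").data_ids(["visit", "detector"]):
        print(data_id)
```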
41 changes: 29 additions & 12 deletions python/lsst/daf/butler/_butler.py
@@ -1465,13 +1465,13 @@ def registry(self) -> Registry:
raise NotImplementedError()

@abstractmethod
def _query(self) -> AbstractContextManager[Query]:
def query(self) -> AbstractContextManager[Query]:
"""Context manager returning a `Query` object used for construction
and execution of complex queries.
"""
raise NotImplementedError()

def _query_data_ids(
def query_data_ids(
self,
dimensions: DimensionGroup | Iterable[str] | str,
*,
@@ -1549,7 +1549,9 @@ def _query_data_ids(
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
if order_by is None:
order_by = []
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.data_ids(dimensions)
@@ -1563,7 +1565,7 @@ def _query_data_ids(
raise EmptyQueryResultError(list(result.explain_no_results()))
return data_ids

def _query_datasets(
def query_datasets(
self,
dataset_type: str | DatasetType,
collections: str | Iterable[str] | None = None,
@@ -1573,6 +1575,8 @@ def _query_datasets(
where: str = "",
bind: Mapping[str, Any] | None = None,
with_dimension_records: bool = False,
order_by: Iterable[str] | str | None = None,
limit: int = 20_000,
explain: bool = True,
**kwargs: Any,
) -> list[DatasetRef]:
@@ -1584,7 +1588,7 @@ def _query_datasets(
Dataset type object or name to search for.
collections : collection expression, optional
A collection name or iterable of collection names to search. If not
provided, the default collections are used. See
provided, the default collections are used. Can be a wildcard. See
:ref:`daf_butler_collection_expressions` for more information.
find_first : `bool`, optional
If `True` (default), for each result data ID, only yield one
@@ -1609,6 +1613,12 @@ def _query_datasets(
with_dimension_records : `bool`, optional
If `True` (default is `False`) then returned data IDs will have
dimension records.
order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
Names of the columns/dimensions to use for ordering returned data
IDs. Column name can be prefixed with minus (``-``) to use
descending ordering.
limit : `int`, optional
Upper limit on the number of returned records.
explain : `bool`, optional
If `True` (default) then `EmptyQueryResultError` exception is
raised when resulting list is empty. The exception contains
@@ -1654,11 +1664,16 @@
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
result = query.where(data_id, where, bind=bind, **kwargs).datasets(
dataset_type,
collections=collections,
find_first=find_first,
if order_by is None:
order_by = []
if collections:
collections = self.collections.query(collections)
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.datasets(dataset_type, collections=collections, find_first=find_first)
.order_by(*ensure_iterable(order_by))
.limit(limit)
)
if with_dimension_records:
result = result.with_dimension_records()
@@ -1667,7 +1682,7 @@ def _query_datasets(
raise EmptyQueryResultError(list(result.explain_no_results()))
return refs

def _query_dimension_records(
def query_dimension_records(
self,
element: str,
*,
@@ -1738,7 +1753,9 @@ def _query_dimension_records(
"""
if data_id is None:
data_id = DataCoordinate.make_empty(self.dimensions)
with self._query() as query:
if order_by is None:
order_by = []
with self.query() as query:
result = (
query.where(data_id, where, bind=bind, **kwargs)
.dimension_records(element)
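The ``query_datasets`` hunk above now expands collection wildcards through the public ``collections.query`` before building the dataset search, which keeps ``find_first=True`` well-defined. A sketch of the equivalent explicit calls (collection and dataset names hypothetical):

```python
# What query_datasets now does internally, spelled out by hand.
resolved = butler.collections.query("HSC/runs/*")  # expand wildcards first
with butler.query() as query:
    result = (
        query.where("instrument = 'HSC' AND visit > 100")
        .datasets("calexp", collections=resolved, find_first=True)
        .order_by("-visit")
        .limit(1000)
    )
    refs = list(result.with_dimension_records())
```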
10 changes: 3 additions & 7 deletions python/lsst/daf/butler/_butler_collections.py
@@ -218,7 +218,7 @@ def remove_from_chain(
"""
raise NotImplementedError()

def x_query(
def query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
@@ -227,8 +227,6 @@ def x_query(
) -> Sequence[str]:
"""Query the butler for collections matching an expression.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
@@ -260,7 +258,7 @@

The default implementation is a wrapper around `query_info`.
"""
collections_info = self.x_query_info(
collections_info = self.query_info(
expression,
collection_types=collection_types,
flatten_chains=flatten_chains,
@@ -269,7 +267,7 @@ def x_query(
return [info.name for info in collections_info]

@abstractmethod
def x_query_info(
def query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
@@ -281,8 +279,6 @@ def x_query_info(
"""Query the butler for collections matching an expression and
return detailed information about those collections.

**This is an experimental interface that can change at any time.**

Parameters
----------
expression : `str` or `~collections.abc.Iterable` [ `str` ]
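With the ``x_`` prefix dropped, the collection-query helpers are part of the stable public API; a brief sketch, with hypothetical collection names:

```python
# Names of collections matching an expression, flattening chained collections.
names = butler.collections.query("HSC/runs/*", flatten_chains=True)

# Detailed per-collection information, including dataset-type summaries.
for info in butler.collections.query_info("HSC/defaults", include_summary=True):
    print(info.name)
```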
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/cli/cmd/commands.py
@@ -78,7 +78,7 @@
@click.command(cls=ButlerCommand, short_help="Add existing datasets to a tagged collection.")
@repo_argument(required=True)
@collection_argument(help="COLLECTION is the collection the datasets should be associated with.")
@query_datasets_options(repo=False, showUri=False, useArguments=False)
@query_datasets_options(repo=False, showUri=False, useArguments=False, default_limit=0, use_order_by=False)
@options_file_option()
def associate(**kwargs: Any) -> None:
"""Add existing datasets to a tagged collection; searches for datasets with
23 changes: 21 additions & 2 deletions python/lsst/daf/butler/cli/opt/optionGroups.py
@@ -33,7 +33,7 @@

from ..utils import OptionGroup, unwrap, where_help
from .arguments import glob_argument, repo_argument
from .options import collections_option, dataset_type_option, where_option
from .options import collections_option, dataset_type_option, limit_option, order_by_option, where_option


class query_datasets_options(OptionGroup): # noqa: N801
@@ -47,9 +47,20 @@ class query_datasets_options(OptionGroup):  # noqa: N801
Whether to include the dataset URI.
useArguments : `bool`
Whether this is an argument or an option.
default_limit : `int`
The default value to use for the limit parameter.
use_order_by : `bool`
Whether to include an order_by option.
"""

def __init__(self, repo: bool = True, showUri: bool = True, useArguments: bool = True) -> None:
def __init__(
self,
repo: bool = True,
showUri: bool = True,
useArguments: bool = True,
default_limit: int = -10_000,
use_order_by: bool = True,
) -> None:
self.decorators = []
if repo:
if not useArguments:
@@ -88,8 +99,16 @@ def __init__(self, repo: bool = True, showUri: bool = True, useArguments: bool =
contain wildcards."""
),
),
limit_option(
help="Limit the number of results that are processed. 0 means no limit. A negative "
"value specifies a cap where a warning will be issued if the cap is hit. "
f"Default value is {default_limit}.",
default=default_limit,
),
]
)
if use_order_by:
self.decorators.append(order_by_option())
if showUri:
self.decorators.append(
click.option("--show-uri", is_flag=True, help="Show the dataset URI in results.")
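The ``-10_000`` default encodes the convention described in the news fragment: ``0`` means unlimited, a positive value is a plain limit, and a negative value caps results at ``abs(limit)`` and warns when the cap is hit. A minimal sketch of that convention (an illustrative helper, not the actual implementation):

```python
import warnings
from collections.abc import Iterable, Iterator
from typing import Any


def apply_limit(results: Iterable[Any], limit: int) -> Iterator[Any]:
    """Apply the CLI limit convention: 0 = unlimited, negative = warn on cap."""
    if limit == 0:
        yield from results
        return
    cap = abs(limit)
    for n, item in enumerate(results):
        if n >= cap:
            # Only a negative limit requests the warning; a positive
            # limit is a plain hard limit.
            if limit < 0:
                warnings.warn(f"More results were available; truncated at {cap}.")
            return
        yield item
```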
6 changes: 3 additions & 3 deletions python/lsst/daf/butler/direct_butler/_direct_butler.py
@@ -1396,9 +1396,9 @@ def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
collectionType = self._registry.getCollectionType(name)
if collectionType is not CollectionType.RUN:
raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
with self._query() as query:
with self.query() as query:
# Work out the dataset types that are relevant.
collections_info = self.collections.x_query_info(name, include_summary=True)
collections_info = self.collections.query_info(name, include_summary=True)
filtered_dataset_types = self.collections._filter_dataset_types(
all_dataset_types, collections_info
)
@@ -2201,7 +2201,7 @@ def dimensions(self) -> DimensionUniverse:
# Docstring inherited.
return self._registry.dimensions

def _query(self) -> contextlib.AbstractContextManager[Query]:
def query(self) -> contextlib.AbstractContextManager[Query]:
# Docstring inherited.
return self._registry._query()

@@ -81,7 +81,7 @@ def remove_from_chain(
parent_collection_name, list(ensure_iterable(child_collection_names))
)

def x_query(
def query(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
@@ -99,7 +99,7 @@ def x_query(
includeChains=include_chains,
)

def x_query_info(
def query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
3 changes: 2 additions & 1 deletion python/lsst/daf/butler/queries/_expression_strings.py
@@ -80,7 +80,8 @@ def convert_expression_string_to_predicate(
tree = parse_expression(expression)
except Exception as exc:
raise InvalidQueryError(f"Failed to parse expression '{expression}'") from exc

if tree is None:
return Predicate.from_bool(True)
converter = _ConversionVisitor(context, universe)
predicate = tree.visit(converter)
assert isinstance(
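Together with the ``parse_expression`` change below, an empty ``where`` string now parses to the always-true predicate instead of raising, so a blank ``--where`` simply matches everything (hypothetical dataset type):

```python
# These two calls are now equivalent: an empty 'where' matches everything.
refs = butler.query_datasets("calexp", where="", limit=100)
refs = butler.query_datasets("calexp", limit=100)
```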
4 changes: 2 additions & 2 deletions python/lsst/daf/butler/queries/_query.py
@@ -143,7 +143,7 @@ def expression_factory(self) -> ExpressionFactory:
variable, and then its (dynamic) attributes can be used to obtain
references to columns that can be included in a query::

with butler._query() as query:
with butler.query() as query:
x = query.expression_factory
query = query.where(
x.instrument == "LSSTCam",
@@ -161,7 +161,7 @@ def expression_factory(self) -> ExpressionFactory:
`~CollectionType.CALIBRATION` collection searches) can be obtained with
dict-like access instead::

with butler._query() as query:
with butler.query() as query:
query = query.order_by(x["raw"].ingest_date)

Expression proxy objects that correspond to scalar columns overload the
@@ -134,6 +134,7 @@ def make_string_expression_predicate(
tree = parse_expression(string)
except Exception as exc:
raise UserExpressionSyntaxError(f"Failed to parse user expression {string!r}.") from exc
assert tree is not None, "Should only happen if the string is empty, and that's handled above."
if bind is None:
bind = {}
if bind:
@@ -31,7 +31,7 @@
from .parserYacc import ParserYacc # type: ignore


def parse_expression(expression: str) -> Node:
def parse_expression(expression: str) -> Node | None:
"""Given a Butler query expression string, convert it to a tree form.

Parameters
@@ -41,7 +41,7 @@ def parse_expression(expression: str) -> Node:

Returns
-------
node : `Node`
node : `Node` or `None`
Tree form of expression query.
"""
parser = ParserYacc()
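Callers must now handle the ``None`` that ``parse_expression`` returns for an empty expression, exactly as the ``_expression_strings.py`` hunk above does; condensed into a sketch (``converter`` is the visitor from that hunk, and the import path is assumed):

```python
from lsst.daf.butler.queries.tree import Predicate  # import path assumed


def expression_to_predicate(expression: str, converter) -> Predicate:
    """Condensed caller pattern for the new Node | None return type."""
    tree = parse_expression(expression)  # now returns Node | None
    if tree is None:
        # Empty expression: trivially-true predicate, matching everything.
        return Predicate.from_bool(True)
    return tree.visit(converter)
```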
8 changes: 4 additions & 4 deletions python/lsst/daf/butler/remote_butler/_registry.py
@@ -421,7 +421,7 @@ def queryDatasets(

query_results = [
QueryDriverDatasetRefQueryResults(
self._butler._query,
self._butler.query,
args,
dataset_type=dt,
find_first=findFirst,
@@ -457,7 +457,7 @@ def queryDataIds(
dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections
)
return QueryDriverDataCoordinateQueryResults(
self._butler._query, dimensions=dimensions, expanded=False, args=args
self._butler.query, dimensions=dimensions, expanded=False, args=args
)

def queryDimensionRecords(
@@ -480,7 +480,7 @@ def queryDimensionRecords(
dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections
)

return QueryDriverDimensionRecordQueryResults(self._butler._query, element, args)
return QueryDriverDimensionRecordQueryResults(self._butler.query, element, args)

def _convert_common_query_arguments(
self,
@@ -519,7 +519,7 @@ def queryDatasetAssociations(
resolved_collections = self.queryCollections(
collections, datasetType=datasetType, collectionTypes=collectionTypes, flattenChains=flattenChains
)
with self._butler._query() as query:
with self._butler.query() as query:
query = query.join_dataset_search(datasetType, resolved_collections)
result = query.general(
datasetType.dimensions,
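After these renames the old ``registry`` query methods are thin wrappers over the same query driver as the public API. Per the news fragment the new methods are preferred and already deduplicate; roughly (hypothetical names; note the differing ``find_first`` and ``limit`` defaults):

```python
# Old style: duplicates across collections had to be removed with set().
refs_old = set(butler.registry.queryDatasets("calexp", collections=["HSC/runs/1"]))

# New style: results arrive deduplicated; explain=False skips the
# EmptyQueryResultError raised when the result list is empty.
refs_new = butler.query_datasets(
    "calexp", collections=["HSC/runs/1"], find_first=False, explain=False
)
```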
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -570,7 +570,7 @@ def registry(self) -> Registry:
return self._registry

@contextmanager
def _query(self) -> Iterator[Query]:
def query(self) -> Iterator[Query]:
driver = RemoteQueryDriver(self, self._connection)
with driver:
query = Query(driver)
@@ -71,7 +71,7 @@ def remove_from_chain(
) -> None:
raise NotImplementedError("Not yet available")

def x_query_info(
def query_info(
self,
expression: str | Iterable[str],
collection_types: Set[CollectionType] | CollectionType | None = None,
7 changes: 7 additions & 0 deletions python/lsst/daf/butler/script/_associate.py
@@ -40,6 +40,7 @@ def associate(
collections: Iterable[str],
where: str,
find_first: bool,
limit: int,
) -> None:
"""Add existing datasets to a CHAINED collection.

Expand All @@ -57,6 +58,10 @@ def associate(
Query string.
find_first : `bool`
Whether to find the first match or not.
limit : `int`
Limit the number of results to be returned. A value of 0 means
unlimited. A negative value is used to specify a cap where a warning
is issued if that cap is hit.
"""
butler = Butler.from_config(repo, writeable=True, without_datastore=True)

@@ -68,6 +73,8 @@
collections=collections,
where=where,
find_first=find_first,
limit=limit,
order_by=(),
show_uri=False,
repo=None,
)
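A hypothetical call of the updated script function with the new ``limit`` threaded through (the names of the leading parameters are assumed, since the hunk does not show them):

```python
from lsst.daf.butler.script import associate  # import path assumed

associate(
    repo="my_repo",                     # assumed parameter name
    collection="my/tagged-collection",  # assumed parameter name
    dataset_type=("calexp",),           # assumed parameter name
    collections=("HSC/runs/*",),
    where="",
    find_first=True,
    limit=0,  # associate's command-line default: no limit
)
```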