Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-45489: Add more test cover for RemoteButler queries #1042

Merged
merged 6 commits into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions python/lsst/daf/butler/queries/convert_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from collections.abc import Mapping, Set
from typing import Any

from .._exceptions import InvalidQueryError
from ..dimensions import DataCoordinate, DataId, DimensionGroup
from ._expression_strings import convert_expression_string_to_predicate
from ._identifiers import IdentifierContext, interpret_identifier
Expand Down Expand Up @@ -147,6 +148,8 @@ def convert_order_by_args(
if arg.startswith("-"):
reverse = True
arg = arg[1:]
if len(arg) == 0:
raise InvalidQueryError("Empty dimension name in ORDER BY")
arg = interpret_identifier(context, arg)
if reverse:
arg = Reversed(operand=arg)
Expand Down
92 changes: 67 additions & 25 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1378,12 +1378,20 @@ def testSkyMapDimensions(self):
self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))

# Specifying non-existing skymap is an exception
with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
rows = registry.queryDataIds(
def do_query():
    # Run the query under test (data IDs constrained by a non-existent
    # skymap governor value); wrapped in a function because the expected
    # outcome differs by backend — the old query system raises
    # DataIdValueError, the new one returns zero rows.
    # NOTE(review): relies on closure variables (registry, dimensions,
    # calexpType, mergeType, run) from the enclosing test method.
    return registry.queryDataIds(
        dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
    ).toSet()

if self.supportsQueryGovernorValidation:
# Specifying non-existing skymap is an exception
with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
do_query()
else:
# New query system returns zero rows instead of raising an
# exception.
self.assertEqual(len(do_query()), 0)

def testSpatialJoin(self):
"""Test queries that involve spatial overlap joins."""
registry = self.makeRegistry()
Expand Down Expand Up @@ -2663,11 +2671,12 @@ def testSkipCalibs(self):
)
with self.assertRaises(exception_type):
list(registry.queryDatasets("bias", collections=coll_list, findFirst=True))
# explicit list will raise if there are temporal dimensions
with self.assertRaises(NotImplementedError):
registry.queryDataIds(
["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
).count()
if not self.supportsCalibrationCollectionInFindFirst:
# explicit list will raise if there are temporal dimensions
with self.assertRaises(NotImplementedError):
registry.queryDataIds(
["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
).count()
Comment on lines +2676 to +2679
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the new query system this is returning zero rows, which I don't think is right — it should probably be returning the data IDs for all the bias datasets instead?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the behavior is correct, and the comment above this test is wrong: I don't see anything that inserts exposure dimension records, so the query is going to return no rows because it's got a join to that empty table. Note that the dimensions of bias are just {instrument, detector}; the match to exposure is going to be a temporal join between the CALIBRATION collection's validity range and exposure.timespan.

So I suspect there's supposed to be an exposure record inserted with a timespan that overlaps the validity range of exactly one bias for one or two detectors in this test, as that'd make the test much more interesting for the new system (and it wouldn't affect the behavior of the old query system, which is probably why it wasn't done). And if we do that, the new query system should return rows for that exposure and whatever detectors have a bias with the matching timespan. But it's quite possible that behavior is already covered in other tests of the new query system and hence there's no need to re-check it here.

Note that we don't really care about the case where the exposure's timespan overlaps the validity ranges of multiple biases; this query might still be sound, but it'd make any find-first search for the bias fail, and hence it represents a practically useless validity range.


# chain will skip
datasets = list(registry.queryDatasets("bias", collections=chain))
Expand Down Expand Up @@ -3008,7 +3017,7 @@ def testQueryResultSummaries(self):
]
with self.assertRaises(MissingDatasetTypeError):
# Dataset type name doesn't match any existing dataset types.
registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...)
list(registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...))
with self.assertRaises(MissingDatasetTypeError):
# Dataset type name doesn't match any existing dataset types.
registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...).any()
Expand Down Expand Up @@ -3138,12 +3147,20 @@ def testQueryDataIdsExpressionError(self):
registry = self.makeRegistry()
self.loadData(registry, "base.yaml")
bind = {"time": astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")}
with self.assertRaisesRegex(LookupError, r"No dimension element with name 'foo' in 'foo\.bar'\."):
registry.queryDataIds(["detector"], where="foo.bar = 12")
# The diagnostics raised are slightly different between the old query
# system (ValueError, first error string) and the new query system
# (InvalidQueryError, second error string).
with self.assertRaisesRegex(
LookupError, "Dimension element name cannot be inferred in this context."
(LookupError, InvalidQueryError),
r"(No dimension element with name 'foo' in 'foo\.bar'\.)|(Unrecognized identifier 'foo.bar')",
):
registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind)
list(registry.queryDataIds(["detector"], where="foo.bar = 12"))
with self.assertRaisesRegex(
(LookupError, InvalidQueryError),
"(Dimension element name cannot be inferred in this context.)"
"|(Unrecognized identifier 'timespan')",
):
list(registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind))

def testQueryDataIdsOrderBy(self):
"""Test order_by and limit on result returned by queryDataIds()."""
Expand Down Expand Up @@ -3229,42 +3246,67 @@ def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
with query.materialize():
pass

# errors in a name
# Test exceptions for errors in a name.
# Many of these raise slightly different diagnostics in the old query
# system (ValueError, first error string) than the new query system
# (InvalidQueryError, second error string).
for order_by in ("", "-"):
with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
with self.assertRaisesRegex((ValueError, InvalidQueryError), "Empty dimension name in ORDER BY"):
list(do_query().order_by(order_by))

for order_by in ("undimension.name", "-undimension.name"):
with self.assertRaisesRegex(ValueError, "Unknown dimension element 'undimension'"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Unknown dimension element 'undimension')|(Unrecognized identifier 'undimension.name')",
):
list(do_query().order_by(order_by))

for order_by in ("attract", "-attract"):
with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Metadata 'attract' cannot be found in any dimension)|(Unrecognized identifier 'attract')",
):
list(do_query().order_by(order_by))

with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Metadata 'exposure_time' exists in more than one dimension)"
"|(Ambiguous identifier 'exposure_time' matches multiple fields)",
):
list(do_query(("exposure", "visit")).order_by("exposure_time"))

with self.assertRaisesRegex(
ValueError,
r"Timespan exists in more than one dimension element \(day_obs, exposure, visit\); "
r"qualify timespan with specific dimension name\.",
(ValueError, InvalidQueryError),
r"(Timespan exists in more than one dimension element \(day_obs, exposure, visit\); "
r"qualify timespan with specific dimension name\.)|"
r"(Ambiguous identifier 'timespan' matches multiple fields)",
):
list(do_query(("exposure", "visit")).order_by("timespan.begin"))

with self.assertRaisesRegex(
ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
(ValueError, InvalidQueryError),
"(Cannot find any temporal dimension element for 'timespan.begin')"
"|(Unrecognized identifier 'timespan')",
):
list(do_query("tract").order_by("timespan.begin"))

with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Cannot use 'timespan.begin' with non-temporal element)"
"|(Unrecognized field 'timespan' for tract)",
):
list(do_query("tract").order_by("tract.timespan.begin"))

with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Field 'name' does not exist in 'tract')" "|(Unrecognized field 'name' for tract.)",
):
list(do_query("tract").order_by("tract.name"))

with self.assertRaisesRegex(
ValueError, r"Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?"
(ValueError, InvalidQueryError),
r"(Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?)"
r"|(Unrecognized identifier 'timestamp.begin')",
):
list(do_query("visit").order_by("timestamp.begin"))

Expand Down
4 changes: 4 additions & 0 deletions python/lsst/daf/butler/remote_butler/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
Registry,
RegistryDefaults,
)
from ..registry._exceptions import ArgumentError
from ..registry.queries import (
ChainedDatasetQueryResults,
DataCoordinateQueryResults,
Expand Down Expand Up @@ -448,6 +449,9 @@ def queryDataIds(
check: bool = True,
**kwargs: Any,
) -> DataCoordinateQueryResults:
if collections is not None and datasets is None:
raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

dimensions = self.dimensions.conform(dimensions)
args = self._convert_common_query_arguments(
dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections
Expand Down
52 changes: 31 additions & 21 deletions python/lsst/daf/butler/tests/hybrid_butler_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from __future__ import annotations

import contextlib
from collections.abc import Iterable, Iterator, Mapping, Sequence
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from typing import Any, cast

from .._dataset_association import DatasetAssociation
Expand Down Expand Up @@ -312,17 +312,6 @@ def queryDataIds(
check: bool = True,
**kwargs: Any,
) -> DataCoordinateQueryResults:
direct = self._direct.queryDataIds(
dimensions,
dataId=dataId,
datasets=datasets,
collections=collections,
where=where,
bind=bind,
check=check,
**kwargs,
)

remote = self._remote.queryDataIds(
dimensions,
dataId=dataId,
Expand All @@ -334,8 +323,25 @@ def queryDataIds(
**kwargs,
)

# Defer creation of the DirectButler version until we really need the
# object for handling an unimplemented method. This avoids masking of
# missing exception handling in the RemoteButler side -- otherwise
# exceptions from DirectButler would cause tests to pass.
def create_direct_result() -> DataCoordinateQueryResults:
    # Lazily build the DirectButler-backed result, forwarding the exact
    # arguments the hybrid queryDataIds was called with (captured via
    # closure). Deferring this keeps DirectButler exceptions from masking
    # missing error handling on the RemoteButler side.
    return self._direct.queryDataIds(
        dimensions,
        dataId=dataId,
        datasets=datasets,
        collections=collections,
        where=where,
        bind=bind,
        check=check,
        **kwargs,
    )

return cast(
DataCoordinateQueryResults, _HybridDataCoordinateQueryResults(direct=direct, remote=remote)
DataCoordinateQueryResults,
_HybridDataCoordinateQueryResults(direct=create_direct_result, remote=remote),
)

def queryDimensionRecords(
Expand Down Expand Up @@ -388,7 +394,9 @@ class _HybridDataCoordinateQueryResults:
provide a few methods that aren't implemented yet.
"""

def __init__(self, *, direct: DataCoordinateQueryResults, remote: DataCoordinateQueryResults) -> None:
def __init__(
    self, *, direct: Callable[[], DataCoordinateQueryResults], remote: DataCoordinateQueryResults
) -> None:
    """Pair a remote query result with a lazily-created direct one.

    Parameters
    ----------
    direct : `Callable` [[], `DataCoordinateQueryResults`]
        Zero-argument factory producing the DirectButler result; called
        only when a method unimplemented on the remote side needs it.
    remote : `DataCoordinateQueryResults`
        Eagerly-created RemoteButler query result.
    """
    # Store the factory itself, not its product — creation stays deferred.
    self._direct = direct
    self._remote = remote

Expand All @@ -401,20 +409,20 @@ def __iter__(self) -> Iterator[DataCoordinate]:

def order_by(self, *args: str) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.order_by(*args), remote=self._remote.order_by(*args)
direct=lambda: self._direct().order_by(*args), remote=self._remote.order_by(*args)
)

def limit(self, limit: int, offset: int | None = 0) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.limit(limit, offset), remote=self._remote.limit(limit, offset)
direct=lambda: self._direct().limit(limit, offset), remote=self._remote.limit(limit, offset)
)

def materialize(self) -> contextlib.AbstractContextManager[DataCoordinateQueryResults]:
return self._direct.materialize()
return self._direct().materialize()

def expanded(self) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
remote=self._remote.expanded(), direct=self._direct.expanded()
remote=self._remote.expanded(), direct=lambda: self._direct().expanded()
)

def subset(
Expand All @@ -424,7 +432,7 @@ def subset(
unique: bool = False,
) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.subset(dimensions, unique=unique),
direct=lambda: self._direct().subset(dimensions, unique=unique),
remote=self._remote.subset(dimensions, unique=unique),
)

Expand All @@ -436,7 +444,9 @@ def findDatasets(
findFirst: bool = True,
components: bool = False,
) -> ParentDatasetQueryResults:
return self._direct.findDatasets(datasetType, collections, findFirst=findFirst, components=components)
return self._direct().findDatasets(
datasetType, collections, findFirst=findFirst, components=components
)

def findRelatedDatasets(
self,
Expand All @@ -446,6 +456,6 @@ def findRelatedDatasets(
findFirst: bool = True,
dimensions: DimensionGroup | Iterable[str] | None = None,
) -> Iterable[tuple[DataCoordinate, DatasetRef]]:
return self._direct.findRelatedDatasets(
return self._direct().findRelatedDatasets(
datasetType, collections, findFirst=findFirst, dimensions=dimensions
)
Loading