Skip to content

Commit

Permalink
Allow boolean columns to be used in where strings
Browse files Browse the repository at this point in the history
Fix an issue where boolean metadata columns (like exposure.can_see_sky and exposure.has_simulated) were not usable in "where" clauses for Registry query functions.

These column names can now be used as a boolean expression, for example `where="exposure.can_see_sky"` or `where="NOT exposure.can_see_sky"`.
  • Loading branch information
dhirving committed Aug 8, 2024
1 parent 427f673 commit 5b4498c
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 3 deletions.
4 changes: 4 additions & 0 deletions doc/changes/DM-45680.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix an issue where boolean metadata columns (like `exposure.can_see_sky` and
`exposure.has_simulated`) were not usable in `where` clauses for Registry query
functions. These column names can now be used as a boolean expression, for
example `where="exposure.can_see_sky` or `where="NOT exposure.can_see_sky"`.
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ def visit_reversed(self, expression: qt.Reversed) -> sqlalchemy.ColumnElement[An
# Docstring inherited.
return self.expect_scalar(expression.operand).desc()

def visit_boolean_wrapper(
self, value: qt.ColumnExpression, flags: PredicateVisitFlags
) -> sqlalchemy.ColumnElement[bool]:
return self.expect_scalar(value)

def visit_comparison(
self,
a: qt.ColumnExpression,
Expand Down
9 changes: 8 additions & 1 deletion python/lsst/daf/butler/queries/_expression_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,14 @@ def visitIdentifier(self, name: str, node: Node) -> _VisitorResult:
if categorizeConstant(name) == ExpressionConstant.NULL:
return _Null()

return _ColExpr(interpret_identifier(self.context, name))
column_expression = interpret_identifier(self.context, name)
if column_expression.column_type == "bool":
# Expression-handling code (in this file and elsewhere) expects
# boolean-valued expressions to be represented as Predicate, not a
# column expression.
return Predicate.from_bool_expression(column_expression)
else:
return _ColExpr(column_expression)

def visitNumericLiteral(self, value: str, node: Node) -> _VisitorResult:
numeric: int | float
Expand Down
42 changes: 41 additions & 1 deletion python/lsst/daf/butler/queries/tree/_predicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,26 @@ def from_bool(cls, value: bool) -> Predicate:
#
return cls.model_construct(operands=() if value else ((),))

@classmethod
def from_bool_expression(cls, value: ColumnExpression) -> Predicate:
"""Construct a predicate that wraps a boolean ColumnExpression, taking
on the value of the underlying ColumnExpression.
Parameters
----------
value : `ColumnExpression`
Boolean-valued expression to convert to Predicate.
Returns
-------
predicate : `Predicate`
Predicate representing the expression.
"""
if value.column_type != "bool":
raise ValueError(f"ColumnExpression must have column type 'bool', not '{value.column_type}'")

Check warning on line 174 in python/lsst/daf/butler/queries/tree/_predicate.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/queries/tree/_predicate.py#L174

Added line #L174 was not covered by tests

return cls._from_leaf(BooleanWrapper(operand=value))

@classmethod
def compare(cls, a: ColumnExpression, operator: ComparisonOperator, b: ColumnExpression) -> Predicate:
"""Construct a predicate representing a binary comparison between
Expand Down Expand Up @@ -412,6 +432,26 @@ def visit(self, visitor: PredicateVisitor[_A, _O, _L], flags: PredicateVisitFlag
return visitor._visit_logical_not(self.operand, flags)


class BooleanWrapper(PredicateLeafBase):
"""Pass-through to a pre-existing boolean column expression."""

predicate_type: Literal["boolean_wrapper"] = "boolean_wrapper"

operand: ColumnExpression
"""Wrapped expression that will be used as the value for this predicate."""

def gather_required_columns(self, columns: ColumnSet) -> None:
# Docstring inherited.
self.operand.gather_required_columns(columns)

def __str__(self) -> str:
return f"{self.operand}"

Check warning on line 448 in python/lsst/daf/butler/queries/tree/_predicate.py

View check run for this annotation

Codecov / codecov/patch

python/lsst/daf/butler/queries/tree/_predicate.py#L448

Added line #L448 was not covered by tests

def visit(self, visitor: PredicateVisitor[_A, _O, _L], flags: PredicateVisitFlags) -> _L:
# Docstring inherited.
return visitor.visit_boolean_wrapper(self.operand, flags)


@final
class IsNull(PredicateLeafBase):
"""A boolean column expression that tests whether its operand is NULL."""
Expand Down Expand Up @@ -639,7 +679,7 @@ def _validate_column_types(self) -> InQuery:
return self


LogicalNotOperand: TypeAlias = IsNull | Comparison | InContainer | InRange | InQuery
LogicalNotOperand: TypeAlias = IsNull | Comparison | InContainer | InRange | InQuery | BooleanWrapper
PredicateLeaf: TypeAlias = Annotated[
LogicalNotOperand | LogicalNot, pydantic.Field(discriminator="predicate_type")
]
Expand Down
24 changes: 24 additions & 0 deletions python/lsst/daf/butler/queries/visitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,25 @@ class PredicateVisitor(Generic[_A, _O, _L]):
visit method arguments.
"""

@abstractmethod
def visit_boolean_wrapper(self, value: tree.ColumnExpression, flags: PredicateVisitFlags) -> _L:
"""Visit a boolean-valued column expression.
Parameters
----------
value : `tree.ColumnExpression`
Column expression, guaranteed to have `column_type == "bool"`.
flags : `PredicateVisitFlags`
Information about where this leaf appears in the larger predicate
tree.
Returns
-------
result : `object`
Implementation-defined.
"""
raise NotImplementedError()

@abstractmethod
def visit_comparison(
self,
Expand Down Expand Up @@ -448,6 +467,11 @@ class SimplePredicateVisitor(
return a replacement `Predicate` to construct a new tree.
"""

def visit_boolean_wrapper(
self, value: tree.ColumnExpression, flags: PredicateVisitFlags
) -> tree.Predicate | None:
return None

def visit_comparison(
self,
a: tree.ColumnExpression,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,12 @@ def visitIdentifier(self, name: str, node: Node) -> VisitorResult:
if column == timespan_database_representation.TimespanDatabaseRepresentation.NAME
else element.RecordClass.fields.standard[column].getPythonType()
)
return ColumnExpression.reference(tag, dtype)
if dtype is bool:
# ColumnExpression is for non-boolean columns only. Booleans
# are represented as Predicate.
return Predicate.reference(tag)
else:
return ColumnExpression.reference(tag, dtype)
else:
tag = DimensionKeyColumnTag(element.name)
assert isinstance(element, Dimension)
Expand Down
42 changes: 42 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4133,3 +4133,45 @@ def test_collection_summary(self) -> None:
# Note that instrument governor resurrects here, even though there are
# no datasets left with that governor.
self.assertEqual(summary.governors, {"instrument": {"Cam1"}, "skymap": {"SkyMap1"}})

def test_query_where_string_boolean_expressions(self) -> None:
"""Test that 'where' clauses for queries return the expected results
for boolean columns used as expressions.
"""
registry = self.makeRegistry()
# Exposure is the only dimension that has boolean columns, and this set
# of data has all the pre-requisites for exposure set up.
self.loadData(registry, "hsc-rc2-subset.yaml")
base_data = {"instrument": "HSC", "physical_filter": "HSC-R", "group": "903342", "day_obs": 20130617}

TRUE_ID_1 = 1001
TRUE_ID_2 = 2001
FALSE_ID_1 = 1002
FALSE_ID_2 = 2002
records = [
{"id": TRUE_ID_1, "obs_id": "true-1", "can_see_sky": True},
{"id": TRUE_ID_2, "obs_id": "true-2", "can_see_sky": True},
{"id": FALSE_ID_1, "obs_id": "false-1", "can_see_sky": False},
{"id": FALSE_ID_2, "obs_id": "false-2", "can_see_sky": False},
# There is also a record ID 903342 from the YAML file with a NULL
# value for can_see_sky.
]
for record in records:
registry.insertDimensionData("exposure", base_data | record)

def _run_query(where: str) -> list[str]:
result = list(registry.queryDimensionRecords("exposure", where=where, instrument="HSC"))
return [x.dataId["exposure"] for x in result]

# Boolean columns should be usable standalone as an expression.
self.assertCountEqual(_run_query("exposure.can_see_sky"), [TRUE_ID_1, TRUE_ID_2])

# You can find false values in the column with NOT. The NOT of NULL
# is NULL, consistent with SQL semantics -- so records with NULL
# can_see_sky are not included here.
self.assertCountEqual(_run_query("NOT exposure.can_see_sky"), [FALSE_ID_1, FALSE_ID_2])

# Make sure the bare column composes with other expressions correctly.
self.assertCountEqual(
_run_query("exposure.can_see_sky OR exposure = 1002"), [TRUE_ID_1, TRUE_ID_2, FALSE_ID_1]
)

0 comments on commit 5b4498c

Please sign in to comment.