Skip to content

Commit

Permalink
Merge pull request #1104 from lsst/tickets/DM-46479
Browse files Browse the repository at this point in the history
DM-46479: add low-level support for union queries over multiple dataset types
  • Loading branch information
TallJimbo authored Oct 28, 2024
2 parents 84471a8 + 4649edc commit 6333457
Show file tree
Hide file tree
Showing 19 changed files with 2,251 additions and 1,052 deletions.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/direct_query_driver/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@

from ._driver import DirectQueryDriver
from ._postprocessing import Postprocessing
from ._query_builder import QueryBuilder, QueryJoiner
from ._sql_builders import SqlJoinsBuilder, SqlSelectBuilder
978 changes: 609 additions & 369 deletions python/lsst/daf/butler/direct_query_driver/_driver.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,36 @@
from __future__ import annotations

__all__ = (
"QueryPlan",
"QueryJoinsPlan",
"QueryProjectionPlan",
"QueryFindFirstPlan",
"QueryJoinsAnalysis",
"QueryFindFirstAnalysis",
"ResolvedDatasetSearch",
"QueryCollectionAnalysis",
)

import dataclasses
from collections.abc import Iterator
from collections.abc import Iterator, Mapping
from typing import TYPE_CHECKING, Generic, TypeVar

from ..dimensions import DimensionElement, DimensionGroup
from ..queries import tree as qt
from ..registry import CollectionSummary
from ..registry.interfaces import CollectionRecord

if TYPE_CHECKING:
from ._postprocessing import Postprocessing
from ._sql_builders import SqlSelectBuilder

_T = TypeVar("_T")


@dataclasses.dataclass
class ResolvedDatasetSearch:
class ResolvedDatasetSearch(Generic[_T]):
"""A struct describing a dataset search joined into a query, after
resolving its collection search path.
"""

name: str
"""Name of the dataset type."""
name: _T
"""Name or names of the dataset type(s)."""

dimensions: DimensionGroup
"""Dimensions of the dataset type."""
Expand All @@ -71,15 +78,17 @@ class ResolvedDatasetSearch:
`~CollectionType.CALIBRATION` collection, `False` otherwise.
Since only calibration datasets can be present in
`~CollectionType.CALIBRATION` collections, this also
`~CollectionType.CALIBRATION` collections, this also indicates that the
dataset type is a calibration.
"""


@dataclasses.dataclass
class QueryJoinsPlan:
class QueryJoinsAnalysis:
"""A struct describing the "joins" section of a butler query.
See `QueryPlan` and `QueryPlan.joins` for additional information.
See `DirectQueryDriver.build_query` for an overview of how queries are
transformed into SQL, and the role this object plays in that.
"""

predicate: qt.Predicate
Expand All @@ -95,7 +104,7 @@ class QueryJoinsPlan:
materializations: dict[qt.MaterializationKey, DimensionGroup] = dataclasses.field(default_factory=dict)
"""Materializations to join into the query."""

datasets: dict[str, ResolvedDatasetSearch] = dataclasses.field(default_factory=dict)
datasets: dict[str, ResolvedDatasetSearch[str]] = dataclasses.field(default_factory=dict)
"""Dataset searches to join into the query."""

data_coordinate_uploads: dict[qt.DataCoordinateUploadKey, DimensionGroup] = dataclasses.field(
Expand All @@ -111,14 +120,20 @@ class QueryJoinsPlan:
def __post_init__(self) -> None:
self.predicate.gather_required_columns(self.columns)

def iter_mandatory(self) -> Iterator[DimensionElement]:
def iter_mandatory(self, union_dataset_dimensions: DimensionGroup | None) -> Iterator[DimensionElement]:
"""Return an iterator over the dimension elements that must be joined
into the query.
These elements either provide "field" (non-key) columns or define
relationships that result rows must be consistent with. They do not
necessarily include all dimension keys in `columns`, since each of
those can typically be included in a query in multiple different ways.
Parameters
----------
union_dataset_dimensions : `DimensionGroup` or `None`
Dimensions of the union dataset types, or `None` if this is not
a union dataset query.
"""
for element_name in self.columns.dimensions.elements:
element = self.columns.dimensions.universe[element_name]
Expand All @@ -143,6 +158,11 @@ def iter_mandatory(self) -> Iterator[DimensionElement]:
for dataset_spec in self.datasets.values()
):
continue
if (
union_dataset_dimensions is not None
and element.minimal_group.names <= union_dataset_dimensions.required
):
continue
# Materializations have all key columns for their dimensions.
if any(
element in materialization_dimensions.names
Expand All @@ -153,128 +173,77 @@ def iter_mandatory(self) -> Iterator[DimensionElement]:


@dataclasses.dataclass
class QueryProjectionPlan:
"""A struct describing the "projection" stage of a butler query.
This struct evaluates to `True` in boolean contexts if either
`needs_dimension_distinct` or `needs_dataset_distinct` are `True`. In
other cases the projection is effectively a no-op, because the
"joins"-stage rows are already unique.
See `QueryPlan` and `QueryPlan.projection` for additional information.
"""

columns: qt.ColumnSet
"""The columns present in the query after the projection is applied.
class QueryFindFirstAnalysis(Generic[_T]):
"""A struct describing the "find-first" stage of a butler query.
This is always a subset of `QueryJoinsPlan.columns`.
See `DirectQueryDriver.build_query` for an overview of how queries are
transformed into SQL, and the role this object plays in that.
"""

datasets: dict[str, ResolvedDatasetSearch]
"""Dataset searches to join into the query."""

needs_dimension_distinct: bool = False
"""If `True`, the projection's dimensions do not include all dimensions in
the "joins" stage, and hence a SELECT DISTINCT [ON] or GROUP BY must be
used to make post-projection rows unique.
"""
search: ResolvedDatasetSearch[_T]
"""Information about the dataset type or types being searched for."""

needs_dataset_distinct: bool = False
"""If `True`, the projection columns do not include collection-specific
dataset fields that were present in the "joins" stage, and hence a SELECT
DISTINCT [ON] or GROUP BY must be added to make post-projection rows
unique.
"""
@property
def dataset_type(self) -> _T:
"""Name(s) of the dataset type(s)."""
return self.search.name

def __bool__(self) -> bool:
return self.needs_dimension_distinct or self.needs_dataset_distinct
return len(self.search.collection_records) > 1

find_first_dataset: str | None = None
"""If not `None`, this is a find-first query for this dataset.

This is set even if the find-first search is trivial because there is only
one resolved collection.
@dataclasses.dataclass
class QueryCollectionAnalysis:
"""A struct containing information about all of the collections that appear
in a butler query.
"""

collection_records: Mapping[str, CollectionRecord]
"""All collection records, keyed by collection name.
@dataclasses.dataclass
class QueryFindFirstPlan:
"""A struct describing the "find-first" stage of a butler query.
See `QueryPlan` and `QueryPlan.find_first` for additional information.
This includes CHAINED collections.
"""

search: ResolvedDatasetSearch
"""Information about the dataset being searched for."""

@property
def dataset_type(self) -> str:
"""Name of the dataset type."""
return self.search.name
calibration_dataset_types: set[str | qt.AnyDatasetType] = dataclasses.field(default_factory=set)
"""A set of the anmes of all calibration dataset types.
def __bool__(self) -> bool:
return len(self.search.collection_records) > 1
If ``ANY_DATASET`` appears in the set, the dataset type union includes at
least one calibration dataset type.
"""

summaries_by_dataset_type: dict[
str | qt.AnyDatasetType, list[tuple[CollectionRecord, CollectionSummary]]
] = dataclasses.field(default_factory=dict)
"""Collection records and summaries, in search order, keyed by dataset type
name.
@dataclasses.dataclass
class QueryPlan:
"""A struct that aggregates information about a complete butler query.
Notes
-----
Butler queries are transformed into a combination of SQL and Python-side
postprocessing in three stages, with each corresponding to an attributes of
this class and a method of `DirectQueryDriver`
- In the `joins` stage (`~DirectQueryDriver.apply_query_joins`), we define
the main SQL FROM and WHERE clauses, by joining all tables needed to
bring in any columns, or constrain the keys of its rows.
- In the `projection` stage (`~DirectQueryDriver.apply_query_projection`),
we select only the columns needed for the query's result rows (including
columns needed only by postprocessing and ORDER BY, as well those needed
by the objects returned to users). If the result rows are not naturally
unique given what went into the query in the "joins" stage, the
projection involves a SELECT DISTINCT [ON] or GROUP BY to make them
unique, and in a few rare cases uses aggregate functions with GROUP BY.
- In the `find_first` stage (`~DirectQueryDriver.apply_query_find_first`),
we use a window function (PARTITION BY) subquery to find only the first
dataset in the collection search path for each data ID. This stage does
nothing if there is no find-first dataset search, or if the search is
trivial because there is only one collection.
In `DirectQueryDriver.build_query`, a `QueryPlan` instance is constructed
via `DirectQueryDriver.analyze_query`, which also returns an initial
`QueryBuilder`. After this point the plans are considered frozen, and the
nested plan attributes are then passed to each of the corresponding
`DirectQueryDriver` methods along with the builder, which is mutated (and
occasionally replaced) into the complete SQL/postprocessing form of the
query.
CHAINED collections are flattened out in the nested lists. Lists have been
filtered to be consistent with the dataset types in the summaries, but not
necessarily the governor dimensions in the summaries.
"""

joins: QueryJoinsPlan
"""Description of the "joins" stage of query construction."""

projection: QueryProjectionPlan
"""Description of the "projection" stage of query construction."""
@dataclasses.dataclass
class QueryTreeAnalysis:
"""A struct aggregating all analysis results derived from the query tree.
find_first: QueryFindFirstPlan | None
"""Description of the "find_first" stage of query construction.
See `DirectQueryDriver.build_query` for an overview of how queries are
transformed into SQL, and the role this object plays in that.
"""

This attribute is `None` if there is no find-first search at all, and
`False` in boolean contexts if the search is trivial because there is only
one collection after the collections have been resolved.
joins: QueryJoinsAnalysis
"""Analysis of the "joins" stage, including all joins and columns needed by
``tree``. Additional columns will be added to this plan later.
"""

final_columns: qt.ColumnSet
"""The columns included in the SELECT clause of the complete SQL query
that is actually executed.
union_datasets: list[ResolvedDatasetSearch[list[str]]]
"""Resolved dataset searches that expand `QueryTree.any_dataset` out
into groups of dataset types with the same collection search path.
"""

This is a subset of `QueryProjectionPlan.columns` that differs only in
columns used by the `find_first` stage or an ORDER BY expression.
initial_select_builder: SqlSelectBuilder
"""In-progress SQL query builder, initialized with just spatial and
temporal overlaps."""

Like all other `.queries.tree.ColumnSet` attributes, it does not include
fields added directly to `QueryBuilder.special`, which may also be added
to the SELECT clause.
"""
postprocessing: Postprocessing
"""Struct representing post-query processing to be done in Python."""
Loading

0 comments on commit 6333457

Please sign in to comment.