Add new private filtering method to Butler.collections.
This allows more efficient filtering, returning a per-dataset-type
list of collection names.
andy-slac committed Sep 9, 2024
1 parent cd52c66 commit c88a4bb
Showing 4 changed files with 46 additions and 30 deletions.
30 changes: 29 additions & 1 deletion python/lsst/daf/butler/_butler_collections.py
@@ -30,7 +30,8 @@
 __all__ = ("ButlerCollections", "CollectionInfo")
 
 from abc import ABC, abstractmethod
-from collections.abc import Iterable, Sequence, Set
+from collections import defaultdict
+from collections.abc import Iterable, Mapping, Sequence, Set
 from typing import TYPE_CHECKING, Any, overload
 
 from pydantic import BaseModel
@@ -424,3 +425,30 @@ def _filter_dataset_types(
             collection_dataset_types.update(info.dataset_types)
         dataset_types_set = dataset_types_set.intersection(collection_dataset_types)
         return dataset_types_set
+
+    def _group_by_dataset_type(
+        self, dataset_types: Set[str], collection_infos: Iterable[CollectionInfo]
+    ) -> Mapping[str, list[str]]:
+        """Group collection names by dataset type, based on the dataset type
+        summaries in the collection infos.
+
+        Parameters
+        ----------
+        dataset_types : `~collections.abc.Set` [`str`]
+            Set of dataset type names to extract.
+        collection_infos : `~collections.abc.Iterable` [`CollectionInfo`]
+            Collection infos; must contain dataset type summaries.
+
+        Returns
+        -------
+        filtered : `~collections.abc.Mapping` [`str`, `list` [`str`]]
+            Mapping of the dataset type name to its corresponding list of
+            collection names.
+        """
+        dataset_type_collections: dict[str, list[str]] = defaultdict(list)
+        for info in collection_infos:
+            if info.dataset_types is None:
+                raise RuntimeError("Can only filter by collections if include_summary was True")
+            for dataset_type in info.dataset_types & dataset_types:
+                dataset_type_collections[dataset_type].append(info.name)
+        return dataset_type_collections
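
For illustration, a minimal self-contained sketch of what the new helper computes. FakeCollectionInfo is a hypothetical stand-in for the real CollectionInfo model, reduced to the two fields the grouping logic reads, and group_by_dataset_type mirrors the diff above; neither is the library API itself.

    from collections import defaultdict
    from dataclasses import dataclass


    @dataclass
    class FakeCollectionInfo:
        # Hypothetical stand-in for CollectionInfo; only the fields used here.
        name: str
        dataset_types: frozenset[str] | None


    def group_by_dataset_type(dataset_types, infos):
        # Mirrors _group_by_dataset_type: each dataset type maps to every
        # collection whose summary says it contains that type.
        grouped = defaultdict(list)
        for info in infos:
            if info.dataset_types is None:
                raise RuntimeError("Summaries are required (include_summary=True)")
            for dt in info.dataset_types & dataset_types:
                grouped[dt].append(info.name)
        return dict(grouped)


    infos = [
        FakeCollectionInfo("run/a", frozenset({"raw", "calexp"})),
        FakeCollectionInfo("run/b", frozenset({"calexp"})),
    ]
    print(group_by_dataset_type({"raw", "calexp", "bias"}, infos))
    # e.g. {'raw': ['run/a'], 'calexp': ['run/a', 'run/b']} -- "bias" drops
    # out because no collection summary mentions it; key order may vary.
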
17 changes: 6 additions & 11 deletions python/lsst/daf/butler/script/queryDataIds.py
@@ -179,25 +179,20 @@ def queryDataIds(
         query_collections = collections or "*"
         collections_info = butler.collections.query_info(query_collections, include_summary=True)
         expanded_collections = [info.name for info in collections_info]
-        filtered_dataset_types = list(
-            butler.collections._filter_dataset_types([dt.name for dt in dataset_types], collections_info)
+        dataset_type_collections = butler.collections._group_by_dataset_type(
+            {dt.name for dt in dataset_types}, collections_info
         )
-        if not filtered_dataset_types:
+        if not dataset_type_collections:
             return (
                 None,
                 f"No datasets of type {datasets!r} existed in the specified "
                 f"collections {','.join(expanded_collections)}.",
             )
 
-        sub_query = query.join_dataset_search(
-            filtered_dataset_types.pop(0), collections=expanded_collections
-        )
-        for dt in filtered_dataset_types:
-            sub_query = sub_query.join_dataset_search(dt, collections=expanded_collections)
+        for dt, dt_collections in dataset_type_collections.items():
+            query = query.join_dataset_search(dt, collections=dt_collections)
 
-        results = sub_query.data_ids(dimensions)
-    else:
-        results = query.data_ids(dimensions)
+    results = query.data_ids(dimensions)
 
     if where:
         results = results.where(where)
11 changes: 3 additions & 8 deletions python/lsst/daf/butler/script/queryDatasets.py
@@ -35,7 +35,6 @@
 from astropy.table import Table as AstropyTable
 
 from .._butler import Butler
-from .._collection_type import CollectionType
 from ..cli.utils import sortAstropyTable
 
 if TYPE_CHECKING:
@@ -268,13 +267,9 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
 
         # Only iterate over dataset types that are relevant for the query.
         dataset_type_names = {dataset_type.name for dataset_type in dataset_types}
-        dataset_type_collections: dict[str, list[str]] = defaultdict(list)
-        for coll_info in query_collections_info:
-            # Only care about non-chained collections.
-            if coll_info.type != CollectionType.CHAINED:
-                assert coll_info.dataset_types is not None, "collection summary is missing"
-                for dataset_type in coll_info.dataset_types & dataset_type_names:
-                    dataset_type_collections[dataset_type].append(coll_info.name)
+        dataset_type_collections = self.butler.collections._group_by_dataset_type(
+            dataset_type_names, query_collections_info
+        )
 
         if (n_filtered := len(dataset_type_collections)) != n_dataset_types:
             _LOG.info("Filtered %d dataset types down to %d", n_dataset_types, n_filtered)
18 changes: 8 additions & 10 deletions python/lsst/daf/butler/script/queryDimensionRecords.py
@@ -84,20 +84,18 @@ def queryDimensionRecords(
     if datasets:
         query_collections = collections or "*"
         collections_info = butler.collections.query_info(query_collections, include_summary=True)
-        expanded_collections = [info.name for info in collections_info]
-        dataset_types = [dt.name for dt in butler.registry.queryDatasetTypes(datasets)]
-        dataset_types = list(butler.collections._filter_dataset_types(dataset_types, collections_info))
+        dataset_types = {dt.name for dt in butler.registry.queryDatasetTypes(datasets)}
+        dataset_type_collections = butler.collections._group_by_dataset_type(
+            dataset_types, collections_info
+        )
 
-        if not dataset_types:
+        if not dataset_type_collections:
             return None
 
-        sub_query = query.join_dataset_search(dataset_types.pop(0), collections=expanded_collections)
-        for dt in dataset_types:
-            sub_query = sub_query.join_dataset_search(dt, collections=expanded_collections)
+        for dt, dt_collections in dataset_type_collections.items():
+            query = query.join_dataset_search(dt, collections=dt_collections)
 
-        query_results = sub_query.dimension_records(element)
-    else:
-        query_results = query.dimension_records(element)
+    query_results = query.dimension_records(element)
 
     if where:
         query_results = query_results.where(where)
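
All three call sites now share the same shape: group once, then join each dataset type against only the collections known to contain it. A minimal sketch of that pattern follows; FakeQuery is a hypothetical stand-in for the Butler query object that merely records the joins, and the collection and dataset type names are made up.

    class FakeQuery:
        # Hypothetical stand-in for the Butler query object; records joins.
        def __init__(self):
            self.joins = []

        def join_dataset_search(self, dataset_type, collections):
            self.joins.append((dataset_type, list(collections)))
            return self


    # Per-dataset-type collection lists, shaped like the helper's output.
    dataset_type_collections = {"raw": ["run/a"], "calexp": ["run/a", "run/b"]}

    query = FakeQuery()
    for dt, dt_collections in dataset_type_collections.items():
        # Each dataset type is searched only in its own collections, rather
        # than every dataset type being joined against the full expanded list.
        query = query.join_dataset_search(dt, collections=dt_collections)

    print(query.joins)
    # [('raw', ['run/a']), ('calexp', ['run/a', 'run/b'])]

Because the loop rebinds query directly, the old sub_query bookkeeping and the if/else around the final data_ids/dimension_records call collapse into a single unconditional call.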
