Skip to content

Commit

Permalink
Merge pull request #1074 from lsst/tickets/DM-46129
Browse files Browse the repository at this point in the history
DM-46129: Make collections.query_info a single HTTP call
  • Loading branch information
dhirving committed Sep 11, 2024
2 parents 5a6ff82 + af34091 commit fb8c666
Show file tree
Hide file tree
Showing 15 changed files with 315 additions and 149 deletions.
9 changes: 6 additions & 3 deletions python/lsst/daf/butler/_butler_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
class CollectionInfo(BaseModel):
"""Information about a single Butler collection."""

# This class is serialized for the server API -- any new properties you add
# must have default values provided to preserve backwards compatibility.

name: str
"""Name of the collection."""
type: CollectionType
Expand Down Expand Up @@ -280,7 +283,7 @@ def query_info(
include_parents: bool = False,
include_summary: bool = False,
include_doc: bool = False,
summary_datasets: Iterable[DatasetType] | None = None,
summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Sequence[CollectionInfo]:
"""Query the butler for collections matching an expression and
return detailed information about those collections.
Expand All @@ -307,8 +310,8 @@ def query_info(
include_doc : `bool`, optional
Whether the returned information includes collection documentation
string.
summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ], \
optional
summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ] or \
`~collections.abc.Iterable` [ `str` ], optional
Dataset types to include in returned summaries. Only used if
``include_summary`` is `True`. If not specified then all dataset
types will be included.
Expand Down
22 changes: 22 additions & 0 deletions python/lsst/daf/butler/_dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,3 +796,25 @@ def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetT
arguments as well as positional arguments.
"""
return factory(*args, **kwargs)


def get_dataset_type_name(datasetTypeOrName: DatasetType | str) -> str:
    """Given a `DatasetType` object or a dataset type name, return a dataset
    type name.

    Parameters
    ----------
    datasetTypeOrName : `DatasetType` | `str`
        A DatasetType, or the name of a DatasetType.

    Returns
    -------
    name : `str`
        The name associated with the given DatasetType, or the given string.

    Raises
    ------
    TypeError
        If ``datasetTypeOrName`` is neither a `DatasetType` nor a `str`.
    """
    if isinstance(datasetTypeOrName, DatasetType):
        return datasetTypeOrName.name
    elif isinstance(datasetTypeOrName, str):
        return datasetTypeOrName
    else:
        raise TypeError(f"Expected DatasetType or str, got unexpected object: {datasetTypeOrName}")
89 changes: 44 additions & 45 deletions python/lsst/daf/butler/direct_butler/_direct_butler_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,55 +114,54 @@ def query_info(
include_parents: bool = False,
include_summary: bool = False,
include_doc: bool = False,
summary_datasets: Iterable[DatasetType] | None = None,
summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Sequence[CollectionInfo]:
info = []
with self._registry.caching_context():
if collection_types is None:
collection_types = CollectionType.all()
elif isinstance(collection_types, CollectionType):
collection_types = {collection_types}

records = self._registry._managers.collections.resolve_wildcard(
CollectionWildcard.from_expression(expression),
collection_types=collection_types,
flatten_chains=flatten_chains,
include_chains=include_chains,
)
if collection_types is None:
collection_types = CollectionType.all()
elif isinstance(collection_types, CollectionType):
collection_types = {collection_types}

records = self._registry._managers.collections.resolve_wildcard(
CollectionWildcard.from_expression(expression),
collection_types=collection_types,
flatten_chains=flatten_chains,
include_chains=include_chains,
)

summaries: Mapping[Any, CollectionSummary] = {}
if include_summary:
summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets)

docs: Mapping[Any, str] = {}
if include_doc:
docs = self._registry._managers.collections.get_docs(record.key for record in records)

for record in records:
doc = docs.get(record.key, "")
children: tuple[str, ...] = tuple()
if record.type == CollectionType.CHAINED:
assert isinstance(record, ChainedCollectionRecord)
children = tuple(record.children)
parents: frozenset[str] | None = None
if include_parents:
# TODO: This is non-vectorized, so expensive to do in a
# loop.
parents = frozenset(self._registry.getCollectionParentChains(record.name))
dataset_types: Set[str] | None = None
if summary := summaries.get(record.key):
dataset_types = frozenset([dt.name for dt in summary.dataset_types])

info.append(
CollectionInfo(
name=record.name,
type=record.type,
doc=doc,
parents=parents,
children=children,
dataset_types=dataset_types,
)
summaries: Mapping[Any, CollectionSummary] = {}
if include_summary:
summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets)

docs: Mapping[Any, str] = {}
if include_doc:
docs = self._registry._managers.collections.get_docs(record.key for record in records)

for record in records:
doc = docs.get(record.key, "")
children: tuple[str, ...] = tuple()
if record.type == CollectionType.CHAINED:
assert isinstance(record, ChainedCollectionRecord)
children = tuple(record.children)
parents: frozenset[str] | None = None
if include_parents:
# TODO: This is non-vectorized, so expensive to do in a
# loop.
parents = frozenset(self._registry.getCollectionParentChains(record.name))
dataset_types: Set[str] | None = None
if summary := summaries.get(record.key):
dataset_types = frozenset([dt.name for dt in summary.dataset_types])

info.append(
CollectionInfo(
name=record.name,
type=record.type,
doc=doc,
parents=parents,
children=children,
dataset_types=dataset_types,
)
)

return info

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

import sqlalchemy

from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ...._dataset_type import DatasetType, get_dataset_type_name
from ...._exceptions_legacy import DatasetTypeError
from ....dimensions import DimensionUniverse
from ..._collection_summary import CollectionSummary
Expand Down Expand Up @@ -511,12 +512,14 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummar
return summaries[collection.key]

def fetch_summaries(
self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
self,
collections: Iterable[CollectionRecord],
dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Mapping[Any, CollectionSummary]:
# Docstring inherited from DatasetRecordStorageManager.
dataset_type_names: Iterable[str] | None = None
if dataset_types is not None:
dataset_type_names = set(dataset_type.name for dataset_type in dataset_types)
dataset_type_names = set(get_dataset_type_name(dt) for dt in dataset_types)
return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)

_versions: list[VersionTuple]
Expand Down
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/registry/interfaces/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,9 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummar

@abstractmethod
def fetch_summaries(
self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
self,
collections: Iterable[CollectionRecord],
dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Mapping[Any, CollectionSummary]:
"""Fetch collection summaries given their names and dataset types.
Expand Down
60 changes: 60 additions & 0 deletions python/lsst/daf/butler/remote_butler/_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ..registry import RegistryDefaults


class DefaultsHolder:
    """Holds a `RegistryDefaults` object and allows it to be set.

    Parameters
    ----------
    defaults : `RegistryDefaults`
        Initial value for the defaults object.

    Notes
    -----
    This exists to work around circular dependency issues (RemoteButler,
    ButlerCollections, and Registry all need to know/modify the defaults.)
    """

    def __init__(self, defaults: RegistryDefaults) -> None:
        # Single shared mutable slot; callers hold a reference to this
        # holder rather than to the defaults object itself.
        self._defaults = defaults

    def get(self) -> RegistryDefaults:
        """Retrieve the current registry defaults."""
        return self._defaults

    def set(self, defaults: RegistryDefaults) -> None:
        """Set a new value for the registry defaults.

        Parameters
        ----------
        defaults : `RegistryDefaults`
            New value for defaults object.
        """
        self._defaults = defaults
9 changes: 2 additions & 7 deletions python/lsst/daf/butler/remote_butler/_ref_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from pydantic import TypeAdapter

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from .._dataset_type import DatasetType, get_dataset_type_name
from .._storage_class import StorageClass
from ..dimensions import DataCoordinate, DataId, DataIdValue, SerializedDataId
from .server_models import DatasetTypeName
Expand Down Expand Up @@ -85,12 +85,7 @@ def normalize_dataset_type_name(datasetTypeOrName: DatasetType | str) -> Dataset
A DatasetType, or the name of a DatasetType. This union is a common
parameter in many `Butler` methods.
"""
if isinstance(datasetTypeOrName, DatasetType):
return DatasetTypeName(datasetTypeOrName.name)
elif isinstance(datasetTypeOrName, str):
return DatasetTypeName(datasetTypeOrName)
else:
raise TypeError(f"Got unexpected object for DatasetType: {datasetTypeOrName}")
return DatasetTypeName(get_dataset_type_name(datasetTypeOrName))


def simplify_dataId(dataId: DataId | None, kwargs: dict[str, DataIdValue]) -> SerializedDataId:
Expand Down
Loading

0 comments on commit fb8c666

Please sign in to comment.