Skip to content

Commit

Permalink
Merge pull request #1074 from lsst/tickets/DM-46129
Browse files Browse the repository at this point in the history
DM-46129: Make collections.query_info a single HTTP call
  • Loading branch information
dhirving committed Sep 11, 2024
2 parents 5a6ff82 + af34091 commit fb8c666
Show file tree
Hide file tree
Showing 15 changed files with 315 additions and 149 deletions.
9 changes: 6 additions & 3 deletions python/lsst/daf/butler/_butler_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
class CollectionInfo(BaseModel):
"""Information about a single Butler collection."""

# This class is serialized for the server API -- any new properties you add
# must have default values provided to preserve backwards compatibility.

name: str
"""Name of the collection."""
type: CollectionType
Expand Down Expand Up @@ -280,7 +283,7 @@ def query_info(
include_parents: bool = False,
include_summary: bool = False,
include_doc: bool = False,
summary_datasets: Iterable[DatasetType] | None = None,
summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Sequence[CollectionInfo]:
"""Query the butler for collections matching an expression and
return detailed information about those collections.
Expand All @@ -307,8 +310,8 @@ def query_info(
include_doc : `bool`, optional
Whether the returned information includes collection documentation
string.
summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ], \
optional
summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ] or \
`~collections.abc.Iterable` [ `str` ], optional
Dataset types to include in returned summaries. Only used if
``include_summary`` is `True`. If not specified then all dataset
types will be included.
Expand Down
22 changes: 22 additions & 0 deletions python/lsst/daf/butler/_dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,3 +796,25 @@ def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetT
arguments as well as positional arguments.
"""
return factory(*args, **kwargs)


def get_dataset_type_name(datasetTypeOrName: DatasetType | str) -> str:
    """Given a `DatasetType` object or a dataset type name, return a dataset
    type name.

    Parameters
    ----------
    datasetTypeOrName : `DatasetType` | `str`
        A DatasetType, or the name of a DatasetType.

    Returns
    -------
    name : `str`
        The name associated with the given DatasetType, or the given string.

    Raises
    ------
    TypeError
        If ``datasetTypeOrName`` is neither a `DatasetType` nor a `str`.
    """
    if isinstance(datasetTypeOrName, DatasetType):
        return datasetTypeOrName.name
    elif isinstance(datasetTypeOrName, str):
        return datasetTypeOrName
    else:
        raise TypeError(f"Expected DatasetType or str, got unexpected object: {datasetTypeOrName}")
89 changes: 44 additions & 45 deletions python/lsst/daf/butler/direct_butler/_direct_butler_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,55 +114,54 @@ def query_info(
include_parents: bool = False,
include_summary: bool = False,
include_doc: bool = False,
summary_datasets: Iterable[DatasetType] | None = None,
summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Sequence[CollectionInfo]:
info = []
with self._registry.caching_context():
if collection_types is None:
collection_types = CollectionType.all()
elif isinstance(collection_types, CollectionType):
collection_types = {collection_types}

records = self._registry._managers.collections.resolve_wildcard(
CollectionWildcard.from_expression(expression),
collection_types=collection_types,
flatten_chains=flatten_chains,
include_chains=include_chains,
)
if collection_types is None:
collection_types = CollectionType.all()
elif isinstance(collection_types, CollectionType):
collection_types = {collection_types}

records = self._registry._managers.collections.resolve_wildcard(
CollectionWildcard.from_expression(expression),
collection_types=collection_types,
flatten_chains=flatten_chains,
include_chains=include_chains,
)

summaries: Mapping[Any, CollectionSummary] = {}
if include_summary:
summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets)

docs: Mapping[Any, str] = {}
if include_doc:
docs = self._registry._managers.collections.get_docs(record.key for record in records)

for record in records:
doc = docs.get(record.key, "")
children: tuple[str, ...] = tuple()
if record.type == CollectionType.CHAINED:
assert isinstance(record, ChainedCollectionRecord)
children = tuple(record.children)
parents: frozenset[str] | None = None
if include_parents:
# TODO: This is non-vectorized, so expensive to do in a
# loop.
parents = frozenset(self._registry.getCollectionParentChains(record.name))
dataset_types: Set[str] | None = None
if summary := summaries.get(record.key):
dataset_types = frozenset([dt.name for dt in summary.dataset_types])

info.append(
CollectionInfo(
name=record.name,
type=record.type,
doc=doc,
parents=parents,
children=children,
dataset_types=dataset_types,
)
summaries: Mapping[Any, CollectionSummary] = {}
if include_summary:
summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets)

docs: Mapping[Any, str] = {}
if include_doc:
docs = self._registry._managers.collections.get_docs(record.key for record in records)

for record in records:
doc = docs.get(record.key, "")
children: tuple[str, ...] = tuple()
if record.type == CollectionType.CHAINED:
assert isinstance(record, ChainedCollectionRecord)
children = tuple(record.children)
parents: frozenset[str] | None = None
if include_parents:
# TODO: This is non-vectorized, so expensive to do in a
# loop.
parents = frozenset(self._registry.getCollectionParentChains(record.name))
dataset_types: Set[str] | None = None
if summary := summaries.get(record.key):
dataset_types = frozenset([dt.name for dt in summary.dataset_types])

info.append(
CollectionInfo(
name=record.name,
type=record.type,
doc=doc,
parents=parents,
children=children,
dataset_types=dataset_types,
)
)

return info

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

import sqlalchemy

from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType
from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ...._dataset_type import DatasetType, get_dataset_type_name
from ...._exceptions_legacy import DatasetTypeError
from ....dimensions import DimensionUniverse
from ..._collection_summary import CollectionSummary
Expand Down Expand Up @@ -511,12 +512,14 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummar
return summaries[collection.key]

def fetch_summaries(
self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
self,
collections: Iterable[CollectionRecord],
dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Mapping[Any, CollectionSummary]:
# Docstring inherited from DatasetRecordStorageManager.
dataset_type_names: Iterable[str] | None = None
if dataset_types is not None:
dataset_type_names = set(dataset_type.name for dataset_type in dataset_types)
dataset_type_names = set(get_dataset_type_name(dt) for dt in dataset_types)
return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row)

_versions: list[VersionTuple]
Expand Down
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/registry/interfaces/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,9 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummar

@abstractmethod
def fetch_summaries(
self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None
self,
collections: Iterable[CollectionRecord],
dataset_types: Iterable[DatasetType] | Iterable[str] | None = None,
) -> Mapping[Any, CollectionSummary]:
"""Fetch collection summaries given their names and dataset types.
Expand Down
60 changes: 60 additions & 0 deletions python/lsst/daf/butler/remote_butler/_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ..registry import RegistryDefaults


class DefaultsHolder:
    """Holds a `RegistryDefaults` object and allows it to be set.

    Parameters
    ----------
    defaults : `RegistryDefaults`
        Initial value for the defaults object.

    Notes
    -----
    This exists to work around circular dependency issues (RemoteButler,
    ButlerCollections, and Registry all need to know/modify the defaults.)
    """

    def __init__(self, defaults: RegistryDefaults) -> None:
        # Single shared mutable slot; callers hold a reference to this
        # holder rather than to the defaults object itself.
        self._defaults = defaults

    def get(self) -> RegistryDefaults:
        """Retrieve the current registry defaults."""
        return self._defaults

    def set(self, defaults: RegistryDefaults) -> None:
        """Set a new value for the registry defaults.

        Parameters
        ----------
        defaults : `RegistryDefaults`
            New value for defaults object.
        """
        self._defaults = defaults
9 changes: 2 additions & 7 deletions python/lsst/daf/butler/remote_butler/_ref_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from pydantic import TypeAdapter

from .._dataset_ref import DatasetRef
from .._dataset_type import DatasetType
from .._dataset_type import DatasetType, get_dataset_type_name
from .._storage_class import StorageClass
from ..dimensions import DataCoordinate, DataId, DataIdValue, SerializedDataId
from .server_models import DatasetTypeName
Expand Down Expand Up @@ -85,12 +85,7 @@ def normalize_dataset_type_name(datasetTypeOrName: DatasetType | str) -> Dataset
A DatasetType, or the name of a DatasetType. This union is a common
parameter in many `Butler` methods.
"""
if isinstance(datasetTypeOrName, DatasetType):
return DatasetTypeName(datasetTypeOrName.name)
elif isinstance(datasetTypeOrName, str):
return DatasetTypeName(datasetTypeOrName)
else:
raise TypeError(f"Got unexpected object for DatasetType: {datasetTypeOrName}")
return DatasetTypeName(get_dataset_type_name(datasetTypeOrName))


def simplify_dataId(dataId: DataId | None, kwargs: dict[str, DataIdValue]) -> SerializedDataId:
Expand Down
Loading

0 comments on commit fb8c666

Please sign in to comment.