From bcf3cd2248ddeab39b994f171e850dfa37d3ede2 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 9 Sep 2024 10:41:49 -0700 Subject: [PATCH 1/8] Make collections.query_info a single HTTP call Previously collections.query_info would make an HTTP call for each resolved collection. It now does everything in a single call to the server instead. --- python/lsst/daf/butler/_butler_collections.py | 3 ++ .../butler/remote_butler/_remote_butler.py | 4 +- .../_remote_butler_collections.py | 41 +++++++++++++------ .../server/handlers/_external.py | 20 +++++++++ .../daf/butler/remote_butler/server_models.py | 18 ++++++++ 5 files changed, 71 insertions(+), 15 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 85791b8603..6d4bae4984 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -45,6 +45,9 @@ class CollectionInfo(BaseModel): """Information about a single Butler collection.""" + # This class is serialized for the server API -- any new properties you add + # must have default values provided to preserve backwards compatibility. 
+ name: str """Name of the collection.""" type: CollectionType diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 35c1fad2d8..b1685e135b 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -166,14 +166,14 @@ def collection_chains(self) -> ButlerCollections: """Object with methods for modifying collection chains.""" from ._registry import RemoteButlerRegistry - return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry)) + return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry), self._connection) @property def collections(self) -> ButlerCollections: """Object with methods for modifying and querying collections.""" from ._registry import RemoteButlerRegistry - return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry)) + return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry), self._connection) @property def dimensions(self) -> DimensionUniverse: diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 9d5697b07b..e8ea4a9ae0 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -32,8 +32,13 @@ from collections.abc import Iterable, Sequence, Set from typing import TYPE_CHECKING +from lsst.utils.iteration import ensure_iterable + from .._butler_collections import ButlerCollections, CollectionInfo from .._collection_type import CollectionType +from ._collection_args import convert_collection_arg_to_glob_string_list +from ._http_connection import RemoteButlerHttpConnection, parse_model +from .server_models import QueryCollectionInfoRequestModel, QueryCollectionInfoResponseModel if TYPE_CHECKING: from .._dataset_type import DatasetType @@ -47,10 +52,13 @@ 
class RemoteButlerCollections(ButlerCollections): ---------- registry : `~lsst.daf.butler.registry.sql_registry.SqlRegistry` Registry object used to work with the collections database. + connection : `RemoteButlerHttpConnection` + HTTP connection to Butler server. """ - def __init__(self, registry: RemoteButlerRegistry): + def __init__(self, registry: RemoteButlerRegistry, connection: RemoteButlerHttpConnection): self._registry = registry + self._connection = connection @property def defaults(self) -> Sequence[str]: @@ -83,19 +91,26 @@ def query_info( include_doc: bool = False, summary_datasets: Iterable[DatasetType] | None = None, ) -> Sequence[CollectionInfo]: - # This should become a single call on the server in the future. if collection_types is None: - collection_types = CollectionType.all() - - info = [] - for name in self._registry.queryCollections( - expression, - collectionTypes=collection_types, - flattenChains=flatten_chains, - includeChains=include_chains, - ): - info.append(self.get_info(name, include_parents=include_parents, include_summary=include_summary)) - return info + types = list(CollectionType.all()) + else: + types = list(ensure_iterable(collection_types)) + + if include_chains is None: + include_chains = not flatten_chains + + request = QueryCollectionInfoRequestModel( + expression=convert_collection_arg_to_glob_string_list(expression), + collection_types=types, + flatten_chains=flatten_chains, + include_chains=include_chains, + include_parents=include_parents, + include_summary=include_summary, + ) + response = self._connection.post("query_collection_info", request) + model = parse_model(response, QueryCollectionInfoResponseModel) + + return model.collections def get_info( self, name: str, include_parents: bool = False, include_summary: bool = False diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py index 6d723228aa..1ac350dc91 100644 --- 
a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py +++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py @@ -46,6 +46,8 @@ GetFileByDataIdRequestModel, GetFileResponseModel, GetUniverseResponseModel, + QueryCollectionInfoRequestModel, + QueryCollectionInfoResponseModel, QueryCollectionsRequestModel, QueryCollectionsResponseModel, QueryDatasetTypesRequestModel, @@ -247,6 +249,24 @@ def query_collections( return QueryCollectionsResponseModel(collections=collections) +@external_router.post( + "/v1/query_collection_info", summary="Search for collections with names that match an expression" +) +def query_collection_info( + request: QueryCollectionInfoRequestModel, factory: Factory = Depends(factory_dependency) +) -> QueryCollectionInfoResponseModel: + butler = factory.create_butler() + collections = butler.collections.query_info( + expression=request.expression, + collection_types=set(request.collection_types), + flatten_chains=request.flatten_chains, + include_chains=request.include_chains, + include_parents=request.include_parents, + include_summary=request.include_summary, + ) + return QueryCollectionInfoResponseModel(collections=list(collections)) + + @external_router.post( "/v1/query_dataset_types", summary="Search for dataset types with names that match an expression" ) diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py index 5ae692506f..d1e61a17d3 100644 --- a/python/lsst/daf/butler/remote_butler/server_models.py +++ b/python/lsst/daf/butler/remote_butler/server_models.py @@ -42,6 +42,7 @@ import pydantic from lsst.daf.butler import ( + CollectionInfo, CollectionType, DataIdValue, SerializedDataCoordinate, @@ -189,6 +190,23 @@ class QueryCollectionsResponseModel(pydantic.BaseModel): """Collection names that match the search.""" +class QueryCollectionInfoRequestModel(pydantic.BaseModel): + """Request model for query_collection_info.""" + + expression: 
CollectionList + collection_types: list[CollectionType] + flatten_chains: bool + include_chains: bool + include_parents: bool + include_summary: bool + + +class QueryCollectionInfoResponseModel(pydantic.BaseModel): + """Response model for query_collection_info.""" + + collections: list[CollectionInfo] + + class QueryDatasetTypesRequestModel(pydantic.BaseModel): """Request model for queryDatasetTypes.""" From 063d5726970be41e87942d8f6be2d378197fc61e Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 9 Sep 2024 11:12:31 -0700 Subject: [PATCH 2/8] Make get_info use the same endpoint as query_info There is no reason to add a duplicate API for this, because get_info is identical to calling query_info with a single collection name. --- .../_remote_butler_collections.py | 21 ++++++------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index e8ea4a9ae0..169006f274 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -36,6 +36,7 @@ from .._butler_collections import ButlerCollections, CollectionInfo from .._collection_type import CollectionType +from ..utils import has_globs from ._collection_args import convert_collection_arg_to_glob_string_list from ._http_connection import RemoteButlerHttpConnection, parse_model from .server_models import QueryCollectionInfoRequestModel, QueryCollectionInfoResponseModel @@ -115,21 +116,11 @@ def query_info( def get_info( self, name: str, include_parents: bool = False, include_summary: bool = False ) -> CollectionInfo: - info = self._registry._get_collection_info(name, include_doc=True, include_parents=include_parents) - doc = info.doc or "" - children = info.children or () - dataset_types: Set[str] | None = None - if include_summary: - summary = 
self._registry.getCollectionSummary(name) - dataset_types = frozenset([dt.name for dt in summary.dataset_types]) - return CollectionInfo( - name=name, - type=info.type, - doc=doc, - parents=info.parents, - children=children, - dataset_types=dataset_types, - ) + if has_globs(name): + raise ValueError("Search expressions are not allowed in 'name' parameter to get_info") + results = self.query_info(name, include_parents=include_parents, include_summary=include_summary) + assert len(results) == 1, "Only one result should be returned for get_info." + return results[0] def register(self, name: str, type: CollectionType = CollectionType.RUN, doc: str | None = None) -> bool: raise NotImplementedError("Not yet available.") From 694d360a71185bb3ba1d7b6d9d39cb88506ec872 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 9 Sep 2024 11:46:37 -0700 Subject: [PATCH 3/8] Break RegistryDefaults circular dependency The various Butler sub-objects (registry, collections) need to access the registry defaults. Added a small helper class so that they don't need a reference to RemoteButler or each other in order to fetch the defaults. --- .../daf/butler/remote_butler/_defaults.py | 60 +++++++++++++++++++ .../daf/butler/remote_butler/_registry.py | 14 +++-- .../butler/remote_butler/_remote_butler.py | 34 ++++------- .../_remote_butler_collections.py | 17 +++--- 4 files changed, 89 insertions(+), 36 deletions(-) create mode 100644 python/lsst/daf/butler/remote_butler/_defaults.py diff --git a/python/lsst/daf/butler/remote_butler/_defaults.py b/python/lsst/daf/butler/remote_butler/_defaults.py new file mode 100644 index 0000000000..8f2c743ff0 --- /dev/null +++ b/python/lsst/daf/butler/remote_butler/_defaults.py @@ -0,0 +1,60 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). 
+# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from ..registry import RegistryDefaults + + +class DefaultsHolder: + """Holds a `RegistryDefaults` object and allows it to be set. + + Parameters + ---------- + defaults : `RegistryDefaults` + Initial value for the defaults object. + + Notes + ----- + This exists to work around circular dependency issues (RemoteButler, + ButlerCollections, and Registry all need to know/modify the defaults.) + """ + + def __init__(self, defaults: RegistryDefaults) -> None: + self._defaults = defaults + + def get(self) -> RegistryDefaults: + """Retrieve the current registry defaults.""" + return self._defaults + + def set(self, defaults: RegistryDefaults) -> None: + """Set a new value for the registry defaults. + + Parameters + ---------- + defaults : `RegistryDefaults` + New value for defaults object. 
+ """ + self._defaults = defaults diff --git a/python/lsst/daf/butler/remote_butler/_registry.py b/python/lsst/daf/butler/remote_butler/_registry.py index 74543de1f0..f31f04929b 100644 --- a/python/lsst/daf/butler/remote_butler/_registry.py +++ b/python/lsst/daf/butler/remote_butler/_registry.py @@ -31,6 +31,7 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence from typing import Any +from lsst.daf.butler import Butler from lsst.utils.iteration import ensure_iterable from .._collection_type import CollectionType @@ -69,8 +70,8 @@ convert_collection_arg_to_glob_string_list, convert_dataset_type_arg_to_glob_string_list, ) +from ._defaults import DefaultsHolder from ._http_connection import RemoteButlerHttpConnection, parse_model -from ._remote_butler import RemoteButler from .registry._query_common import CommonQueryArguments from .registry._query_data_coordinates import QueryDriverDataCoordinateQueryResults from .registry._query_datasets import QueryDriverDatasetRefQueryResults @@ -92,15 +93,18 @@ class RemoteButlerRegistry(Registry): Parameters ---------- - butler : `RemoteButler` + butler : `Butler` Butler instance to which this registry delegates operations. + defaults : `DefaultHolder` + Reference to object containing default collections and data ID. connection : `RemoteButlerHttpConnection` HTTP connection to Butler server for looking up data. 
""" - def __init__(self, butler: RemoteButler, connection: RemoteButlerHttpConnection): + def __init__(self, butler: Butler, defaults: DefaultsHolder, connection: RemoteButlerHttpConnection): self._butler = butler self._connection = connection + self._defaults = defaults def isWriteable(self) -> bool: return self._butler.isWriteable() @@ -111,12 +115,12 @@ def dimensions(self) -> DimensionUniverse: @property def defaults(self) -> RegistryDefaults: - return self._butler._registry_defaults + return self._defaults.get() @defaults.setter def defaults(self, value: RegistryDefaults) -> None: value.finish(self) - self._butler._registry_defaults = value + self._defaults.set(value) def refresh(self) -> None: # In theory the server should manage all necessary invalidation of diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index b1685e135b..382a32cd93 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -61,8 +61,11 @@ from ..queries import Query from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults from ._collection_args import convert_collection_arg_to_glob_string_list +from ._defaults import DefaultsHolder +from ._http_connection import RemoteButlerHttpConnection, parse_model, quote_path_variable from ._query_driver import RemoteQueryDriver from ._ref_utils import apply_storage_class_override, normalize_dataset_type_name, simplify_dataId +from ._registry import RemoteButlerRegistry from ._remote_butler_collections import RemoteButlerCollections from .server_models import ( CollectionList, @@ -81,8 +84,6 @@ from ..dimensions import DataId from ..transfers import RepoExportContext -from ._http_connection import RemoteButlerHttpConnection, parse_model, quote_path_variable - class RemoteButler(Butler): # numpydoc ignore=PR02 """A `Butler` that can be used to connect through a remote 
server. @@ -109,10 +110,10 @@ class RemoteButler(Butler): # numpydoc ignore=PR02 `Butler.from_config` or `RemoteButlerFactory`. """ - _registry_defaults: RegistryDefaults + _registry_defaults: DefaultsHolder _connection: RemoteButlerHttpConnection _cache: RemoteButlerCache - _registry: Registry + _registry: RemoteButlerRegistry _datastore_cache_manager: AbstractDatastoreCacheManager | None _use_disabled_datastore_cache: bool @@ -140,15 +141,10 @@ def __new__( self._datastore_cache_manager = None self._use_disabled_datastore_cache = use_disabled_datastore_cache - # Avoid a circular import by deferring this import. - from ._registry import RemoteButlerRegistry - - self._registry = RemoteButlerRegistry(self, self._connection) - - self._registry_defaults = RegistryDefaults( - options.collections, options.run, options.inferDefaults, **options.kwargs - ) - self._registry_defaults.finish(self._registry) + defaults = RegistryDefaults(options.collections, options.run, options.inferDefaults, **options.kwargs) + self._registry_defaults = DefaultsHolder(defaults) + self._registry = RemoteButlerRegistry(self, self._registry_defaults, self._connection) + defaults.finish(self._registry) return self @@ -164,16 +160,12 @@ def isWriteable(self) -> bool: ) def collection_chains(self) -> ButlerCollections: """Object with methods for modifying collection chains.""" - from ._registry import RemoteButlerRegistry - - return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry), self._connection) + return self.collections @property def collections(self) -> ButlerCollections: """Object with methods for modifying and querying collections.""" - from ._registry import RemoteButlerRegistry - - return RemoteButlerCollections(cast(RemoteButlerRegistry, self._registry), self._connection) + return RemoteButlerCollections(self._registry_defaults, self._connection) @property def dimensions(self) -> DimensionUniverse: @@ -563,7 +555,7 @@ def validateConfiguration( @property def run(self) 
-> str | None: # Docstring inherited. - return self._registry_defaults.run + return self._registry_defaults.get().run @property def registry(self) -> Registry: @@ -633,7 +625,7 @@ def _serialize_default_data_id(self) -> SerializedDataId: # dimensions, but knowing what things are implied depends on what the # required dimensions are. - return self._registry_defaults.dataId.to_simple(minimal=True).dataId + return self._registry_defaults.get().dataId.to_simple(minimal=True).dataId def _to_file_payload(get_file_response: GetFileResponseModel) -> FileDatastoreGetPayload: diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 169006f274..43572b63c0 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -30,40 +30,37 @@ __all__ = ("RemoteButlerCollections",) from collections.abc import Iterable, Sequence, Set -from typing import TYPE_CHECKING from lsst.utils.iteration import ensure_iterable from .._butler_collections import ButlerCollections, CollectionInfo from .._collection_type import CollectionType +from .._dataset_type import DatasetType from ..utils import has_globs from ._collection_args import convert_collection_arg_to_glob_string_list +from ._defaults import DefaultsHolder from ._http_connection import RemoteButlerHttpConnection, parse_model from .server_models import QueryCollectionInfoRequestModel, QueryCollectionInfoResponseModel -if TYPE_CHECKING: - from .._dataset_type import DatasetType - from ._registry import RemoteButlerRegistry - class RemoteButlerCollections(ButlerCollections): """Implementation of ButlerCollections for RemoteButler. Parameters ---------- - registry : `~lsst.daf.butler.registry.sql_registry.SqlRegistry` - Registry object used to work with the collections database. 
+ defaults : `DefaultsHolder` + Registry object used to look up default collections. connection : `RemoteButlerHttpConnection` HTTP connection to Butler server. """ - def __init__(self, registry: RemoteButlerRegistry, connection: RemoteButlerHttpConnection): - self._registry = registry + def __init__(self, defaults: DefaultsHolder, connection: RemoteButlerHttpConnection): + self._defaults = defaults self._connection = connection @property def defaults(self) -> Sequence[str]: - return self._registry.defaults.collections + return self._defaults.get().collections def extend_chain(self, parent_collection_name: str, child_collection_names: str | Iterable[str]) -> None: raise NotImplementedError("Not yet available") From a7ea9f2658c4cb3a404fb149e3e69b9e5c8515b9 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 9 Sep 2024 11:59:36 -0700 Subject: [PATCH 4/8] Forward RemoteButlerRegistry to ButlerCollections Now that there is an independent implementation of ButlerCollections for RemoteButler, forward the equivalent Registry methods to it to reduce duplication. 
--- .../daf/butler/remote_butler/_registry.py | 37 ++++++------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/python/lsst/daf/butler/remote_butler/_registry.py b/python/lsst/daf/butler/remote_butler/_registry.py index f31f04929b..f213c59f68 100644 --- a/python/lsst/daf/butler/remote_butler/_registry.py +++ b/python/lsst/daf/butler/remote_butler/_registry.py @@ -79,10 +79,7 @@ from .server_models import ( ExpandDataIdRequestModel, ExpandDataIdResponseModel, - GetCollectionInfoResponseModel, GetCollectionSummaryResponseModel, - QueryCollectionsRequestModel, - QueryCollectionsResponseModel, QueryDatasetTypesRequestModel, QueryDatasetTypesResponseModel, ) @@ -145,7 +142,7 @@ def registerCollection( raise NotImplementedError() def getCollectionType(self, name: str) -> CollectionType: - return self._get_collection_info(name).type + return self._butler.collections.get_info(name).type def registerRun(self, name: str, doc: str | None = None) -> bool: raise NotImplementedError() @@ -154,7 +151,7 @@ def removeCollection(self, name: str) -> None: raise NotImplementedError() def getCollectionChain(self, parent: str) -> Sequence[str]: - info = self._get_collection_info(parent) + info = self._butler.collections.get_info(parent) if info.type is not CollectionType.CHAINED: raise CollectionTypeError(f"Collection '{parent}' has type {info.type.name}, not CHAINED.") return info.children @@ -163,26 +160,19 @@ def setCollectionChain(self, parent: str, children: Any, *, flatten: bool = Fals raise NotImplementedError() def getCollectionParentChains(self, collection: str) -> set[str]: - info = self._get_collection_info(collection, include_parents=True) + info = self._butler.collections.get_info(collection, include_parents=True) assert info.parents is not None, "Requested list of parents from server, but it did not send them." 
- return info.parents + return set(info.parents) def getCollectionDocumentation(self, collection: str) -> str | None: - info = self._get_collection_info(collection, include_doc=True) - return info.doc + doc = self._butler.collections.get_info(collection).doc + if not doc: + return None + return doc def setCollectionDocumentation(self, collection: str, doc: str | None) -> None: raise NotImplementedError() - def _get_collection_info( - self, collection_name: str, include_doc: bool = False, include_parents: bool = False - ) -> GetCollectionInfoResponseModel: - response = self._connection.get( - "collection_info", - {"name": collection_name, "include_doc": include_doc, "include_parents": include_parents}, - ) - return parse_model(response, GetCollectionInfoResponseModel) - def getCollectionSummary(self, collection: str) -> CollectionSummary: response = self._connection.get("collection_summary", {"name": collection}) parsed = parse_model(response, GetCollectionSummaryResponseModel) @@ -356,18 +346,13 @@ def queryCollections( flattenChains: bool = False, includeChains: bool | None = None, ) -> Sequence[str]: - if includeChains is None: - includeChains = not flattenChains - query = QueryCollectionsRequestModel( - search=convert_collection_arg_to_glob_string_list(expression), - collection_types=list(ensure_iterable(collectionTypes)), + return self._butler.collections.query( + expression, + collection_types=set(ensure_iterable(collectionTypes)), flatten_chains=flattenChains, include_chains=includeChains, ) - response = self._connection.post("query_collections", query) - return parse_model(response, QueryCollectionsResponseModel).collections - def queryDatasets( self, datasetType: Any, From 56d6a2781b46933b092d7069fd91f97e6c8935fa Mon Sep 17 00:00:00 2001 From: "David H. 
Irving" Date: Mon, 9 Sep 2024 12:12:55 -0700 Subject: [PATCH 5/8] Mark deprecated REST endpoints --- .../server/handlers/_external.py | 4 ++++ .../daf/butler/remote_butler/server_models.py | 6 ++++++ tests/test_server.py | 21 +++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py index 1ac350dc91..982c008c42 100644 --- a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py +++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py @@ -199,6 +199,8 @@ def _get_file_by_ref(butler: Butler, ref: DatasetRef) -> GetFileResponseModel: return GetFileResponseModel(dataset_ref=ref.to_simple(), artifact=payload) +# TODO DM-46204: This can be removed once the RSP recommended image has been +# upgraded to a version that contains DM-46129. @external_router.get( "/v1/collection_info", summary="Get information about a collection", response_model_exclude_unset=True ) @@ -233,6 +235,8 @@ def get_collection_summary( return GetCollectionSummaryResponseModel(summary=butler.registry.getCollectionSummary(name).to_simple()) +# TODO DM-46204: This can be removed once the RSP recommended image has been +# upgraded to a version that contains DM-46129. @external_router.post( "/v1/query_collections", summary="Search for collections with names that match an expression" ) diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py index d1e61a17d3..6573d70967 100644 --- a/python/lsst/daf/butler/remote_butler/server_models.py +++ b/python/lsst/daf/butler/remote_butler/server_models.py @@ -142,6 +142,8 @@ class ErrorResponseModel(pydantic.BaseModel): """Detailed explanation of the error that will be sent to the client.""" +# TODO DM-46204: This can be removed once the RSP recommended image has been +# upgraded to a version that contains DM-46129. 
class GetCollectionInfoResponseModel(pydantic.BaseModel): """Response model for get_collection_info.""" @@ -174,6 +176,8 @@ class ExpandDataIdResponseModel(pydantic.BaseModel): data_coordinate: SerializedDataCoordinate +# TODO DM-46204: This can be removed once the RSP recommended image has been +# upgraded to a version that contains DM-46129. class QueryCollectionsRequestModel(pydantic.BaseModel): """Request model for query_collections.""" @@ -183,6 +187,8 @@ class QueryCollectionsRequestModel(pydantic.BaseModel): include_chains: bool +# TODO DM-46204: This can be removed once the RSP recommended image has been +# upgraded to a version that contains DM-46129. class QueryCollectionsResponseModel(pydantic.BaseModel): """Response model for query_collections.""" diff --git a/tests/test_server.py b/tests/test_server.py index 4b41c5d01b..be7d0cd81c 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -42,6 +42,7 @@ from lsst.daf.butler.remote_butler._authentication import _EXPLICIT_BUTLER_ACCESS_TOKEN_ENVIRONMENT_KEY from lsst.daf.butler.remote_butler.server import create_app from lsst.daf.butler.remote_butler.server._dependencies import butler_factory_dependency + from lsst.daf.butler.remote_butler.server_models import QueryCollectionsRequestModel from lsst.daf.butler.tests.server import TEST_REPOSITORY_NAME, UnhandledServerError, create_test_server reason_text = "" @@ -426,6 +427,26 @@ def test_query_keepalive(self): self.assertGreaterEqual(mock_timeout.call_count, 3) self.assertGreaterEqual(mock_keep_alive.call_count, 2) + # TODO DM-46204: This can be removed once the RSP recommended image has + # been upgraded to a version that contains DM-46129. + def test_deprecated_collection_endpoints(self): + # These REST endpoints are no longer used by Butler client so they need + # to be checked separately until they can be removed. 
+ json = self.butler._connection.get( + "collection_info", + params={"name": "imported_g", "include_doc": True, "include_parents": True}, + ).json() + self.assertEqual(json["name"], "imported_g") + self.assertEqual(json["type"], 1) + + json = self.butler._connection.post( + "query_collections", + QueryCollectionsRequestModel( + search=["imported_*"], collection_types=[1], flatten_chains=False, include_chains=False + ), + ).json() + self.assertCountEqual(json["collections"], ["imported_g", "imported_r"]) + def _create_corrupted_dataset(repo: MetricTestRepo) -> DatasetRef: run = "corrupted-run" From 8c3ca9c8ec1c5c240a6b7adb2fc6fc5c303a2367 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 10 Sep 2024 10:47:15 -0700 Subject: [PATCH 6/8] Handle collection query optimization parameters Update RemoteButlerCollections to pass the new optimization parameters added in DM-45993 to the server. --- python/lsst/daf/butler/_butler_collections.py | 6 +++--- python/lsst/daf/butler/_dataset_type.py | 18 ++++++++++++++++++ .../_direct_butler_collections.py | 2 +- .../registry/datasets/byDimensions/_manager.py | 7 +++++-- .../butler/registry/interfaces/_datasets.py | 4 +++- .../daf/butler/remote_butler/_ref_utils.py | 9 ++------- .../_remote_butler_collections.py | 14 ++++++++++++-- .../remote_butler/server/handlers/_external.py | 2 ++ .../daf/butler/remote_butler/server_models.py | 2 ++ .../butler/tests/hybrid_butler_collections.py | 2 +- 10 files changed, 49 insertions(+), 17 deletions(-) diff --git a/python/lsst/daf/butler/_butler_collections.py b/python/lsst/daf/butler/_butler_collections.py index 6d4bae4984..504dcc12c3 100644 --- a/python/lsst/daf/butler/_butler_collections.py +++ b/python/lsst/daf/butler/_butler_collections.py @@ -283,7 +283,7 @@ def query_info( include_parents: bool = False, include_summary: bool = False, include_doc: bool = False, - summary_datasets: Iterable[DatasetType] | None = None, + summary_datasets: Iterable[DatasetType] | Iterable[str] | 
None = None, ) -> Sequence[CollectionInfo]: """Query the butler for collections matching an expression and return detailed information about those collections. @@ -310,8 +310,8 @@ def query_info( include_doc : `bool`, optional Whether the returned information includes collection documentation string. - summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ], \ - optional + summary_datasets : `~collections.abc.Iterable` [ `DatasetType` ] or \ + `~collections.abc.Iterable` [ `str` ], optional Dataset types to include in returned summaries. Only used if ``include_summary`` is `True`. If not specified then all dataset types will be included. diff --git a/python/lsst/daf/butler/_dataset_type.py b/python/lsst/daf/butler/_dataset_type.py index 9321ea67c6..47fdacd49d 100644 --- a/python/lsst/daf/butler/_dataset_type.py +++ b/python/lsst/daf/butler/_dataset_type.py @@ -796,3 +796,21 @@ def _unpickle_via_factory(factory: Callable, args: Any, kwargs: Any) -> DatasetT arguments as well as positional arguments. """ return factory(*args, **kwargs) + + +def get_dataset_type_name(datasetTypeOrName: DatasetType | str) -> str: + """Given a `DatasetType` object or a dataset type name, return a dataset + type name. + + Parameters + ---------- + datasetTypeOrName : `DatasetType` | `str` + A DatasetType, or the name of a DatasetType. This union is a common + parameter in many `Butler` methods. 
+ """ + if isinstance(datasetTypeOrName, DatasetType): + return datasetTypeOrName.name + elif isinstance(datasetTypeOrName, str): + return datasetTypeOrName + else: + raise TypeError(f"Expected DatasetType or str, got unexpected object: {datasetTypeOrName}") diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index 1ce6fb63b9..a84cebecde 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -114,7 +114,7 @@ def query_info( include_parents: bool = False, include_summary: bool = False, include_doc: bool = False, - summary_datasets: Iterable[DatasetType] | None = None, + summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None, ) -> Sequence[CollectionInfo]: info = [] with self._registry.caching_context(): diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py index 909761ea19..df0f17a03f 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py @@ -12,6 +12,7 @@ import sqlalchemy from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType +from ...._dataset_type import get_dataset_type_name from ...._exceptions_legacy import DatasetTypeError from ....dimensions import DimensionUniverse from ..._collection_summary import CollectionSummary @@ -511,12 +512,14 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummar return summaries[collection.key] def fetch_summaries( - self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None + self, + collections: Iterable[CollectionRecord], + dataset_types: Iterable[DatasetType] | Iterable[str] | None = None, ) -> Mapping[Any, CollectionSummary]: # 
Docstring inherited from DatasetRecordStorageManager. dataset_type_names: Iterable[str] | None = None if dataset_types is not None: - dataset_type_names = set(dataset_type.name for dataset_type in dataset_types) + dataset_type_names = set(get_dataset_type_name(dt) for dt in dataset_types) return self._summaries.fetch_summaries(collections, dataset_type_names, self._dataset_type_from_row) _versions: list[VersionTuple] diff --git a/python/lsst/daf/butler/registry/interfaces/_datasets.py b/python/lsst/daf/butler/registry/interfaces/_datasets.py index d86fe25414..be8a22f910 100644 --- a/python/lsst/daf/butler/registry/interfaces/_datasets.py +++ b/python/lsst/daf/butler/registry/interfaces/_datasets.py @@ -674,7 +674,9 @@ def getCollectionSummary(self, collection: CollectionRecord) -> CollectionSummar @abstractmethod def fetch_summaries( - self, collections: Iterable[CollectionRecord], dataset_types: Iterable[DatasetType] | None = None + self, + collections: Iterable[CollectionRecord], + dataset_types: Iterable[DatasetType] | Iterable[str] | None = None, ) -> Mapping[Any, CollectionSummary]: """Fetch collection summaries given their names and dataset types. diff --git a/python/lsst/daf/butler/remote_butler/_ref_utils.py b/python/lsst/daf/butler/remote_butler/_ref_utils.py index b5944fa9bb..039805f7b5 100644 --- a/python/lsst/daf/butler/remote_butler/_ref_utils.py +++ b/python/lsst/daf/butler/remote_butler/_ref_utils.py @@ -36,7 +36,7 @@ from pydantic import TypeAdapter from .._dataset_ref import DatasetRef -from .._dataset_type import DatasetType +from .._dataset_type import DatasetType, get_dataset_type_name from .._storage_class import StorageClass from ..dimensions import DataCoordinate, DataId, DataIdValue, SerializedDataId from .server_models import DatasetTypeName @@ -85,12 +85,7 @@ def normalize_dataset_type_name(datasetTypeOrName: DatasetType | str) -> Dataset A DatasetType, or the name of a DatasetType. 
This union is a common parameter in many `Butler` methods. """ - if isinstance(datasetTypeOrName, DatasetType): - return DatasetTypeName(datasetTypeOrName.name) - elif isinstance(datasetTypeOrName, str): - return DatasetTypeName(datasetTypeOrName) - else: - raise TypeError(f"Got unexpected object for DatasetType: {datasetTypeOrName}") + return DatasetTypeName(get_dataset_type_name(datasetTypeOrName)) def simplify_dataId(dataId: DataId | None, kwargs: dict[str, DataIdValue]) -> SerializedDataId: diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py index 43572b63c0..db3f3ca83e 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler_collections.py @@ -40,6 +40,7 @@ from ._collection_args import convert_collection_arg_to_glob_string_list from ._defaults import DefaultsHolder from ._http_connection import RemoteButlerHttpConnection, parse_model +from ._ref_utils import normalize_dataset_type_name from .server_models import QueryCollectionInfoRequestModel, QueryCollectionInfoResponseModel @@ -87,7 +88,7 @@ def query_info( include_parents: bool = False, include_summary: bool = False, include_doc: bool = False, - summary_datasets: Iterable[DatasetType] | None = None, + summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None, ) -> Sequence[CollectionInfo]: if collection_types is None: types = list(CollectionType.all()) @@ -97,6 +98,11 @@ def query_info( if include_chains is None: include_chains = not flatten_chains + if summary_datasets is None: + dataset_types = None + else: + dataset_types = [normalize_dataset_type_name(t) for t in summary_datasets] + request = QueryCollectionInfoRequestModel( expression=convert_collection_arg_to_glob_string_list(expression), collection_types=types, @@ -104,6 +110,8 @@ def query_info( include_chains=include_chains, include_parents=include_parents, 
include_summary=include_summary, + include_doc=include_doc, + summary_datasets=dataset_types, ) response = self._connection.post("query_collection_info", request) model = parse_model(response, QueryCollectionInfoResponseModel) @@ -115,7 +123,9 @@ def get_info( ) -> CollectionInfo: if has_globs(name): raise ValueError("Search expressions are not allowed in 'name' parameter to get_info") - results = self.query_info(name, include_parents=include_parents, include_summary=include_summary) + results = self.query_info( + name, include_parents=include_parents, include_summary=include_summary, include_doc=True + ) assert len(results) == 1, "Only one result should be returned for get_info." return results[0] diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py index 982c008c42..bfa90b2d8a 100644 --- a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py +++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py @@ -267,6 +267,8 @@ def query_collection_info( include_chains=request.include_chains, include_parents=request.include_parents, include_summary=request.include_summary, + include_doc=request.include_doc, + summary_datasets=request.summary_datasets, ) return QueryCollectionInfoResponseModel(collections=list(collections)) diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py index 6573d70967..1c688fd213 100644 --- a/python/lsst/daf/butler/remote_butler/server_models.py +++ b/python/lsst/daf/butler/remote_butler/server_models.py @@ -205,6 +205,8 @@ class QueryCollectionInfoRequestModel(pydantic.BaseModel): include_chains: bool include_parents: bool include_summary: bool + include_doc: bool + summary_datasets: list[DatasetTypeName] | None class QueryCollectionInfoResponseModel(pydantic.BaseModel): diff --git a/python/lsst/daf/butler/tests/hybrid_butler_collections.py 
b/python/lsst/daf/butler/tests/hybrid_butler_collections.py index 2ca5522dc7..c375b27af3 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler_collections.py +++ b/python/lsst/daf/butler/tests/hybrid_butler_collections.py @@ -87,7 +87,7 @@ def query_info( include_parents: bool = False, include_summary: bool = False, include_doc: bool = False, - summary_datasets: Iterable[DatasetType] | None = None, + summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None, ) -> Sequence[CollectionInfo]: return self._hybrid._remote_butler.collections.query_info( expression, From 701bd480db461e5ea21cd07ef46d9755e1f390b6 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 10 Sep 2024 15:37:38 -0700 Subject: [PATCH 7/8] Fix summary_datasets being ignored When attempting to add a test for the summary_datasets parameter to ButlerCollectionInfo for RemoteButler, it turned out that it was not doing anything in the DirectButler version. This was occurring because a caching context causes this parameter to be ignored, and the cache was always enabled in query_info previously. This cache does not have a benefit for the new implementation. 
--- .../_direct_butler_collections.py | 87 +++++++++---------- .../lsst/daf/butler/tests/butler_queries.py | 15 ++++ 2 files changed, 58 insertions(+), 44 deletions(-) diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py index a84cebecde..0d2d549972 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler_collections.py @@ -117,52 +117,51 @@ def query_info( summary_datasets: Iterable[DatasetType] | Iterable[str] | None = None, ) -> Sequence[CollectionInfo]: info = [] - with self._registry.caching_context(): - if collection_types is None: - collection_types = CollectionType.all() - elif isinstance(collection_types, CollectionType): - collection_types = {collection_types} - - records = self._registry._managers.collections.resolve_wildcard( - CollectionWildcard.from_expression(expression), - collection_types=collection_types, - flatten_chains=flatten_chains, - include_chains=include_chains, - ) + if collection_types is None: + collection_types = CollectionType.all() + elif isinstance(collection_types, CollectionType): + collection_types = {collection_types} + + records = self._registry._managers.collections.resolve_wildcard( + CollectionWildcard.from_expression(expression), + collection_types=collection_types, + flatten_chains=flatten_chains, + include_chains=include_chains, + ) - summaries: Mapping[Any, CollectionSummary] = {} - if include_summary: - summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets) - - docs: Mapping[Any, str] = {} - if include_doc: - docs = self._registry._managers.collections.get_docs(record.key for record in records) - - for record in records: - doc = docs.get(record.key, "") - children: tuple[str, ...] 
= tuple() - if record.type == CollectionType.CHAINED: - assert isinstance(record, ChainedCollectionRecord) - children = tuple(record.children) - parents: frozenset[str] | None = None - if include_parents: - # TODO: This is non-vectorized, so expensive to do in a - # loop. - parents = frozenset(self._registry.getCollectionParentChains(record.name)) - dataset_types: Set[str] | None = None - if summary := summaries.get(record.key): - dataset_types = frozenset([dt.name for dt in summary.dataset_types]) - - info.append( - CollectionInfo( - name=record.name, - type=record.type, - doc=doc, - parents=parents, - children=children, - dataset_types=dataset_types, - ) + summaries: Mapping[Any, CollectionSummary] = {} + if include_summary: + summaries = self._registry._managers.datasets.fetch_summaries(records, summary_datasets) + + docs: Mapping[Any, str] = {} + if include_doc: + docs = self._registry._managers.collections.get_docs(record.key for record in records) + + for record in records: + doc = docs.get(record.key, "") + children: tuple[str, ...] = tuple() + if record.type == CollectionType.CHAINED: + assert isinstance(record, ChainedCollectionRecord) + children = tuple(record.children) + parents: frozenset[str] | None = None + if include_parents: + # TODO: This is non-vectorized, so expensive to do in a + # loop. 
+ parents = frozenset(self._registry.getCollectionParentChains(record.name)) + dataset_types: Set[str] | None = None + if summary := summaries.get(record.key): + dataset_types = frozenset([dt.name for dt in summary.dataset_types]) + + info.append( + CollectionInfo( + name=record.name, + type=record.type, + doc=doc, + parents=parents, + children=children, + dataset_types=dataset_types, ) + ) return info diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py index 34ae03be95..1165e4e054 100644 --- a/python/lsst/daf/butler/tests/butler_queries.py +++ b/python/lsst/daf/butler/tests/butler_queries.py @@ -1804,6 +1804,21 @@ def test_calibration_join_queries(self) -> None: ], ) + def test_collection_query_info(self) -> None: + butler = self.make_butler("base.yaml", "datasets.yaml") + + info = butler.collections.query_info("imported_g", include_summary=True) + self.assertEqual(len(info), 1) + dataset_types = info[0].dataset_types + assert dataset_types is not None + self.assertCountEqual(dataset_types, ["flat", "bias"]) + + info = butler.collections.query_info("imported_g", include_summary=True, summary_datasets=["flat"]) + self.assertEqual(len(info), 1) + dataset_types = info[0].dataset_types + assert dataset_types is not None + self.assertCountEqual(dataset_types, ["flat"]) + def _get_exposure_ids_from_dimension_records(dimension_records: Iterable[DimensionRecord]) -> list[int]: output = [] From af340911623bf13a72625bda4d312b0146900e60 Mon Sep 17 00:00:00 2001 From: "David H. 
Irving" Date: Tue, 10 Sep 2024 17:02:31 -0700 Subject: [PATCH 8/8] Fix minor review comments --- python/lsst/daf/butler/_dataset_type.py | 8 ++++++-- .../daf/butler/registry/datasets/byDimensions/_manager.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/lsst/daf/butler/_dataset_type.py b/python/lsst/daf/butler/_dataset_type.py index 47fdacd49d..c7b546cc94 100644 --- a/python/lsst/daf/butler/_dataset_type.py +++ b/python/lsst/daf/butler/_dataset_type.py @@ -805,8 +805,12 @@ def get_dataset_type_name(datasetTypeOrName: DatasetType | str) -> str: Parameters ---------- datasetTypeOrName : `DatasetType` | `str` - A DatasetType, or the name of a DatasetType. This union is a common - parameter in many `Butler` methods. + A DatasetType, or the name of a DatasetType. + + Returns + ------- + name + The name associated with the given DatasetType, or the given string. """ if isinstance(datasetTypeOrName, DatasetType): return datasetTypeOrName.name diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py index df0f17a03f..568f0a88a9 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_manager.py @@ -11,8 +11,8 @@ import sqlalchemy -from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, DatasetType -from ...._dataset_type import get_dataset_type_name +from ...._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef +from ...._dataset_type import DatasetType, get_dataset_type_name from ...._exceptions_legacy import DatasetTypeError from ....dimensions import DimensionUniverse from ..._collection_summary import CollectionSummary