From 97cc0737d932e5fe562b633d4029121453cd29e4 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 27 Nov 2023 15:22:32 -0700 Subject: [PATCH 1/9] Prepare to decouple get() from FileDatastore In preparation for extracting portions of the FileDatastore get() method to a free function to be used for client/server, remove references to 'self' and split the function into a few pieces. --- .../lsst/daf/butler/datastore/generic_base.py | 98 +++++++++---------- .../daf/butler/datastores/fileDatastore.py | 72 ++++++++++---- .../butler/datastores/inMemoryDatastore.py | 4 +- 3 files changed, 102 insertions(+), 72 deletions(-) diff --git a/python/lsst/daf/butler/datastore/generic_base.py b/python/lsst/daf/butler/datastore/generic_base.py index 9fe2c71f96..ca72d2e32e 100644 --- a/python/lsst/daf/butler/datastore/generic_base.py +++ b/python/lsst/daf/butler/datastore/generic_base.py @@ -29,7 +29,7 @@ from __future__ import annotations -__all__ = ("GenericBaseDatastore",) +__all__ = ("GenericBaseDatastore", "post_process_get") import logging from collections.abc import Mapping @@ -54,54 +54,6 @@ class GenericBaseDatastore(Datastore, Generic[_InfoType]): Should always be sub-classed since key abstract methods are missing. """ - def _post_process_get( - self, - inMemoryDataset: object, - readStorageClass: StorageClass, - assemblerParams: Mapping[str, Any] | None = None, - isComponent: bool = False, - ) -> object: - """Given the Python object read from the datastore, manipulate - it based on the supplied parameters and ensure the Python - type is correct. - - Parameters - ---------- - inMemoryDataset : `object` - Dataset to check. - readStorageClass: `StorageClass` - The `StorageClass` used to obtain the assembler and to - check the python type. - assemblerParams : `dict`, optional - Parameters to pass to the assembler. Can be `None`. - isComponent : `bool`, optional - If this is a component, allow the inMemoryDataset to be `None`. - - Returns - ------- - dataset : `object` - In-memory dataset, potentially converted to expected type. - """ - # Process any left over parameters - if assemblerParams: - inMemoryDataset = readStorageClass.delegate().handleParameters(inMemoryDataset, assemblerParams) - - # Validate the returned data type matches the expected data type - pytype = readStorageClass.pytype - - allowedTypes = [] - if pytype: - allowedTypes.append(pytype) - - # Special case components to allow them to be None - if isComponent: - allowedTypes.append(type(None)) - - if allowedTypes and not isinstance(inMemoryDataset, tuple(allowedTypes)): - inMemoryDataset = readStorageClass.coerce_type(inMemoryDataset) - - return inMemoryDataset - def _validate_put_parameters(self, inMemoryDataset: object, ref: DatasetRef) -> None: """Validate the supplied arguments for put. @@ -173,3 +125,51 @@ def transfer(self, inputDatastore: Datastore, ref: DatasetRef) -> None: assert inputDatastore is not self # unless we want it for renames? inMemoryDataset = inputDatastore.get(ref) return self.put(inMemoryDataset, ref) + + +def post_process_get( + inMemoryDataset: object, + readStorageClass: StorageClass, + assemblerParams: Mapping[str, Any] | None = None, + isComponent: bool = False, +) -> object: + """Given the Python object read from the datastore, manipulate + it based on the supplied parameters and ensure the Python + type is correct. + + Parameters + ---------- + inMemoryDataset : `object` + Dataset to check. 
+ readStorageClass: `StorageClass` + The `StorageClass` used to obtain the assembler and to + check the python type. + assemblerParams : `dict`, optional + Parameters to pass to the assembler. Can be `None`. + isComponent : `bool`, optional + If this is a component, allow the inMemoryDataset to be `None`. + + Returns + ------- + dataset : `object` + In-memory dataset, potentially converted to expected type. + """ + # Process any left over parameters + if assemblerParams: + inMemoryDataset = readStorageClass.delegate().handleParameters(inMemoryDataset, assemblerParams) + + # Validate the returned data type matches the expected data type + pytype = readStorageClass.pytype + + allowedTypes = [] + if pytype: + allowedTypes.append(pytype) + + # Special case components to allow them to be None + if isComponent: + allowedTypes.append(type(None)) + + if allowedTypes and not isinstance(inMemoryDataset, tuple(allowedTypes)): + inMemoryDataset = readStorageClass.coerce_type(inMemoryDataset) + + return inMemoryDataset diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index c07862ea2b..f90cb7805c 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -37,10 +37,11 @@ from collections import defaultdict from collections.abc import Callable, Iterable, Mapping, Sequence from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, ClassVar, cast +from typing import TYPE_CHECKING, Any, ClassVar, cast, TypeAlias from lsst.daf.butler import ( Config, + DataCoordinate, DatasetId, DatasetRef, DatasetType, @@ -69,7 +70,7 @@ ) from lsst.daf.butler.datastore.composites import CompositesMap from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError -from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore +from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore, post_process_get from lsst.daf.butler.datastore.record_data import DatastoreRecordData from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo from lsst.daf.butler.registry.interfaces import ( @@ -138,6 +139,9 @@ class DatastoreFileGetInformation: """The `StorageClass` of the dataset being read.""" +DatasetLocationInformation: TypeAlias = tuple[Location, StoredFileInfo] + + class FileDatastore(GenericBaseDatastore[StoredFileInfo]): """Generic Datastore for file-based implementations. @@ -577,7 +581,7 @@ def removeStoredItemInfo(self, ref: DatasetIdRef) -> None: def _get_dataset_locations_info( self, ref: DatasetIdRef, ignore_datastore_records: bool = False - ) -> list[tuple[Location, StoredFileInfo]]: + ) -> list[DatasetLocationInformation]: r"""Find all the `Location`\ s of the requested dataset in the `Datastore` and the associated stored file information. @@ -702,7 +706,7 @@ def _get_expected_dataset_locations_info(self, ref: DatasetRef) -> list[tuple[Lo for location, formatter, storageClass, component in all_info ] - def _prepare_for_get( + def _prepare_for_direct_get( self, ref: DatasetRef, parameters: Mapping[str, Any] | None = None ) -> list[DatastoreFileGetInformation]: """Check parameters for ``get`` and obtain formatter and @@ -738,8 +742,6 @@ def _prepare_for_get( fileLocations = self._get_expected_dataset_locations_info(ref) if len(fileLocations) > 1: - disassembled = True - # If trust is involved it is possible that there will be # components listed here that do not exist in the datastore. 
# Explicitly check for file artifact existence and filter out any @@ -753,12 +755,26 @@ def _prepare_for_get( if not fileLocations: raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") - else: - disassembled = False - # Is this a component request? refComponent = ref.datasetType.component() + return self._prepare_for_get( + fileLocations, + refStorageClass=refStorageClass, + refComponent=refComponent, + parameters=parameters, + dataId=ref.dataId, + ) + def _prepare_for_get( + self, + fileLocations: list[DatasetLocationInformation], + *, + refStorageClass: StorageClass, + refComponent: str | None, + parameters: Mapping[str, Any] | None, + dataId: DataCoordinate, + ) -> list[DatastoreFileGetInformation]: + disassembled = len(fileLocations) > 1 fileGetInfo = [] for location, storedFileInfo in fileLocations: # The storage class used to write the file @@ -778,7 +794,7 @@ def _prepare_for_get( storageClass=writeStorageClass, parameters=parameters, ), - ref.dataId, + dataId, ) formatterParams, notFormatterParams = formatter.segregateParameters() @@ -1346,6 +1362,7 @@ def _read_artifact_into_memory( self, getInfo: DatastoreFileGetInformation, ref: DatasetRef, + cache_manager: AbstractDatastoreCacheManager, isComponent: bool = False, cache_ref: DatasetRef | None = None, ) -> Any: @@ -1408,7 +1425,7 @@ def _read_artifact_into_memory( formatter = getInfo.formatter nbytes_max = 10_000_000 # Arbitrary number that we can tune if resource_size <= nbytes_max and formatter.can_read_bytes(): - with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: + with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: if cached_file is not None: desired_uri = cached_file msg = f" (cached version of {uri})" @@ -1446,7 +1463,7 @@ def _read_artifact_into_memory( # The cache will only be relevant for remote resources but # no harm in always asking. Context manager ensures that cache # file is not deleted during cache expiration. - with self.cacheManager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: + with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: if cached_file is not None: msg = f"(via cache read of remote file {uri})" uri = cached_file @@ -1459,7 +1476,7 @@ def _read_artifact_into_memory( cache_msg = "" location_updated = True - if self.cacheManager.should_be_cached(cache_ref): + if cache_manager.should_be_cached(cache_ref): # In this scenario we want to ask if the downloaded # file should be cached but we should not cache # it until after we've used it (to ensure it can't @@ -1507,9 +1524,9 @@ def _read_artifact_into_memory( # File was read successfully so can move to cache if can_be_cached: - self.cacheManager.move_to_cache(local_uri, cache_ref) + cache_manager.move_to_cache(local_uri, cache_ref) - return self._post_process_get( + return post_process_get( result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent ) @@ -2228,11 +2245,20 @@ def get( # type conversion. 
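A minimal usage sketch of the override handled just below (the storage class name is illustrative and assumed to be compatible with the one the dataset was registered with; "datastore" stands in for a FileDatastore instance):

    # Ask for the dataset coerced to a different but compatible storage class;
    # the Python type of the returned object then follows the requested class.
    dataset = datastore.get(ref, storageClass="ArrowAstropy")
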
if storageClass is not None: ref = ref.overrideStorageClass(storageClass) - refStorageClass = ref.datasetType.storageClass - allGetInfo = self._prepare_for_get(ref, parameters) - refComponent = ref.datasetType.component() + allGetInfo = self._prepare_for_direct_get(ref, parameters) + return self._execute_get(allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager) + def _execute_get( + self, + allGetInfo: list[DatastoreFileGetInformation], + *, + ref: DatasetRef, + parameters: Mapping[str, Any] | None, + cache_manager: AbstractDatastoreCacheManager, + ) -> Any: + refStorageClass = ref.datasetType.storageClass + refComponent = ref.datasetType.component() # Create mapping from component name to related info allComponents = {i.component: i for i in allGetInfo} @@ -2289,7 +2315,7 @@ def get( # standalone dataset -- always tell reader it is not a # component. components[component] = self._read_artifact_into_memory( - getInfo, ref.makeComponentRef(component), isComponent=False + getInfo, ref.makeComponentRef(component), cache_manager, isComponent=False ) inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) @@ -2376,7 +2402,9 @@ def get( refStorageClass, ) - return self._read_artifact_into_memory(readInfo, ref, isComponent=True, cache_ref=cache_ref) + return self._read_artifact_into_memory( + readInfo, ref, cache_manager, isComponent=True, cache_ref=cache_ref + ) else: # Single file request or component from that composite file @@ -2410,7 +2438,9 @@ def get( # the composite storage class getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) - return self._read_artifact_into_memory(getInfo, ref, isComponent=isComponent, cache_ref=cache_ref) + return self._read_artifact_into_memory( + getInfo, ref, cache_manager, isComponent=isComponent, cache_ref=cache_ref + ) @transactional def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index 33b65d71a0..aac3a681df 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -40,7 +40,7 @@ from lsst.daf.butler import DatasetId, DatasetRef, StorageClass from lsst.daf.butler.datastore import DatasetRefURIs -from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore +from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore, post_process_get from lsst.daf.butler.datastore.record_data import DatastoreRecordData from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo from lsst.daf.butler.utils import transactional @@ -349,7 +349,7 @@ def get( # Since there is no formatter to process parameters, they all must be # passed to the assembler. - inMemoryDataset = self._post_process_get( + inMemoryDataset = post_process_get( inMemoryDataset, refStorageClass, parameters, isComponent=component is not None ) From c16f8b3d20ffbac7c7d92b08bf43a0add42f6130 Mon Sep 17 00:00:00 2001 From: "David H. 
Irving" Date: Mon, 27 Nov 2023 15:47:36 -0700 Subject: [PATCH 2/9] Extract portions of FileDatastore.get() Convert portions of FileDatastore.get() to free functions, to allow the get operation to be called on the client side without instantiating a FileDatastore object --- .../daf/butler/datastores/fileDatastore.py | 471 +---------------- .../datastores/file_datastore/__init__.py | 0 .../butler/datastores/file_datastore/get.py | 493 ++++++++++++++++++ 3 files changed, 506 insertions(+), 458 deletions(-) create mode 100644 python/lsst/daf/butler/datastores/file_datastore/__init__.py create mode 100644 python/lsst/daf/butler/datastores/file_datastore/get.py diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index f90cb7805c..199c893b79 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -36,12 +36,11 @@ import logging from collections import defaultdict from collections.abc import Callable, Iterable, Mapping, Sequence -from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast from typing import TYPE_CHECKING, Any, ClassVar, cast, TypeAlias from lsst.daf.butler import ( Config, - DataCoordinate, DatasetId, DatasetRef, DatasetType, @@ -70,9 +69,15 @@ ) from lsst.daf.butler.datastore.composites import CompositesMap from lsst.daf.butler.datastore.file_templates import FileTemplates, FileTemplateValidationError -from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore, post_process_get +from lsst.daf.butler.datastore.generic_base import GenericBaseDatastore from lsst.daf.butler.datastore.record_data import DatastoreRecordData from lsst.daf.butler.datastore.stored_file_info import StoredDatastoreItemInfo, StoredFileInfo +from lsst.daf.butler.datastores.file_datastore.get import ( + DatasetLocationInformation, + DatastoreFileGetInformation, + get_dataset_as_python_object, + prepare_for_get, +) from lsst.daf.butler.registry.interfaces import ( DatabaseInsertMode, DatastoreRegistryBridge, @@ -82,12 +87,11 @@ from lsst.daf.butler.repo_relocation import replaceRoot from lsst.daf.butler.utils import transactional from lsst.resources import ResourcePath, ResourcePathExpression -from lsst.utils.introspection import get_class_of, get_instance_of +from lsst.utils.introspection import get_class_of from lsst.utils.iteration import chunk_iterable # For VERBOSE logging usage. from lsst.utils.logging import VERBOSE, getLogger -from lsst.utils.timer import time_this from sqlalchemy import BigInteger, String if TYPE_CHECKING: @@ -111,35 +115,6 @@ def __init__(self, datasets: Iterable[FileDataset]): self.datasets = datasets -@dataclass(frozen=True) -class DatastoreFileGetInformation: - """Collection of useful parameters needed to retrieve a file from - a Datastore. 
- """ - - location: Location - """The location from which to read the dataset.""" - - formatter: Formatter - """The `Formatter` to use to deserialize the dataset.""" - - info: StoredFileInfo - """Stored information about this file and its formatter.""" - - assemblerParams: Mapping[str, Any] - """Parameters to use for post-processing the retrieved dataset.""" - - formatterParams: Mapping[str, Any] - """Parameters that were understood by the associated formatter.""" - - component: str | None - """The component to be retrieved (can be `None`).""" - - readStorageClass: StorageClass - """The `StorageClass` of the dataset being read.""" - - -DatasetLocationInformation: TypeAlias = tuple[Location, StoredFileInfo] class FileDatastore(GenericBaseDatastore[StoredFileInfo]): @@ -757,7 +732,7 @@ def _prepare_for_direct_get( # Is this a component request? refComponent = ref.datasetType.component() - return self._prepare_for_get( + return prepare_for_get( fileLocations, refStorageClass=refStorageClass, refComponent=refComponent, @@ -765,63 +740,6 @@ def _prepare_for_direct_get( dataId=ref.dataId, ) - def _prepare_for_get( - self, - fileLocations: list[DatasetLocationInformation], - *, - refStorageClass: StorageClass, - refComponent: str | None, - parameters: Mapping[str, Any] | None, - dataId: DataCoordinate, - ) -> list[DatastoreFileGetInformation]: - disassembled = len(fileLocations) > 1 - fileGetInfo = [] - for location, storedFileInfo in fileLocations: - # The storage class used to write the file - writeStorageClass = storedFileInfo.storageClass - - # If this has been disassembled we need read to match the write - if disassembled: - readStorageClass = writeStorageClass - else: - readStorageClass = refStorageClass - - formatter = get_instance_of( - storedFileInfo.formatter, - FileDescriptor( - location, - readStorageClass=readStorageClass, - storageClass=writeStorageClass, - parameters=parameters, - ), - dataId, - ) - - formatterParams, notFormatterParams = formatter.segregateParameters() - - # Of the remaining parameters, extract the ones supported by - # this StorageClass (for components not all will be handled) - assemblerParams = readStorageClass.filterParameters(notFormatterParams) - - # The ref itself could be a component if the dataset was - # disassembled by butler, or we disassembled in datastore and - # components came from the datastore records - component = storedFileInfo.component if storedFileInfo.component else refComponent - - fileGetInfo.append( - DatastoreFileGetInformation( - location, - formatter, - storedFileInfo, - assemblerParams, - formatterParams, - component, - readStorageClass, - ) - ) - - return fileGetInfo - def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: """Check the arguments for ``put`` and obtain formatter and location. @@ -1358,178 +1276,6 @@ def _removeFileExists(uri: ResourcePath) -> None: # URI is needed to resolve what ingest case are we dealing with return self._extractIngestInfo(uri, ref, formatter=formatter) - def _read_artifact_into_memory( - self, - getInfo: DatastoreFileGetInformation, - ref: DatasetRef, - cache_manager: AbstractDatastoreCacheManager, - isComponent: bool = False, - cache_ref: DatasetRef | None = None, - ) -> Any: - """Read the artifact from datastore into in memory object. - - Parameters - ---------- - getInfo : `DatastoreFileGetInformation` - Information about the artifact within the datastore. - ref : `DatasetRef` - The registry information associated with this artifact. 
- isComponent : `bool` - Flag to indicate if a component is being read from this artifact. - cache_ref : `DatasetRef`, optional - The DatasetRef to use when looking up the file in the cache. - This ref must have the same ID as the supplied ref but can - be a parent ref or component ref to indicate to the cache whether - a composite file is being requested from the cache or a component - file. Without this the cache will default to the supplied ref but - it can get confused with read-only derived components for - disassembled composites. - - Returns - ------- - inMemoryDataset : `object` - The artifact as a python object. - """ - location = getInfo.location - uri = location.uri - log.debug("Accessing data from %s", uri) - - if cache_ref is None: - cache_ref = ref - if cache_ref.id != ref.id: - raise ValueError( - "The supplied cache dataset ref refers to a different dataset than expected:" - f" {ref.id} != {cache_ref.id}" - ) - - # Cannot recalculate checksum but can compare size as a quick check - # Do not do this if the size is negative since that indicates - # we do not know. - recorded_size = getInfo.info.file_size - resource_size = uri.size() - if recorded_size >= 0 and resource_size != recorded_size: - raise RuntimeError( - "Integrity failure in Datastore. " - f"Size of file {uri} ({resource_size}) " - f"does not match size recorded in registry of {recorded_size}" - ) - - # For the general case we have choices for how to proceed. - # 1. Always use a local file (downloading the remote resource to a - # temporary file if needed). - # 2. Use a threshold size and read into memory and use bytes. - # Use both for now with an arbitrary hand off size. - # This allows small datasets to be downloaded from remote object - # stores without requiring a temporary file. - - formatter = getInfo.formatter - nbytes_max = 10_000_000 # Arbitrary number that we can tune - if resource_size <= nbytes_max and formatter.can_read_bytes(): - with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: - if cached_file is not None: - desired_uri = cached_file - msg = f" (cached version of {uri})" - else: - desired_uri = uri - msg = "" - with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): - serializedDataset = desired_uri.read() - log.debug( - "Deserializing %s from %d bytes from location %s with formatter %s", - f"component {getInfo.component}" if isComponent else "", - len(serializedDataset), - uri, - formatter.name(), - ) - try: - result = formatter.fromBytes( - serializedDataset, component=getInfo.component if isComponent else None - ) - except Exception as e: - raise ValueError( - f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" - f" ({ref.datasetType.name} from {uri}): {e}" - ) from e - else: - # Read from file. - - # Have to update the Location associated with the formatter - # because formatter.read does not allow an override. - # This could be improved. - location_updated = False - msg = "" - - # First check in cache for local version. - # The cache will only be relevant for remote resources but - # no harm in always asking. Context manager ensures that cache - # file is not deleted during cache expiration. 
- with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: - if cached_file is not None: - msg = f"(via cache read of remote file {uri})" - uri = cached_file - location_updated = True - - with uri.as_local() as local_uri: - can_be_cached = False - if uri != local_uri: - # URI was remote and file was downloaded - cache_msg = "" - location_updated = True - - if cache_manager.should_be_cached(cache_ref): - # In this scenario we want to ask if the downloaded - # file should be cached but we should not cache - # it until after we've used it (to ensure it can't - # be expired whilst we are using it). - can_be_cached = True - - # Say that it is "likely" to be cached because - # if the formatter read fails we will not be - # caching this file. - cache_msg = " and likely cached" - - msg = f"(via download to local file{cache_msg})" - - # Calculate the (possibly) new location for the formatter - # to use. - newLocation = Location(*local_uri.split()) if location_updated else None - - log.debug( - "Reading%s from location %s %s with formatter %s", - f" component {getInfo.component}" if isComponent else "", - uri, - msg, - formatter.name(), - ) - try: - with ( - formatter._updateLocation(newLocation), - time_this( - log, - msg="Reading%s from location %s %s with formatter %s", - args=( - f" component {getInfo.component}" if isComponent else "", - uri, - msg, - formatter.name(), - ), - ), - ): - result = formatter.read(component=getInfo.component if isComponent else None) - except Exception as e: - raise ValueError( - f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" - f" ({ref.datasetType.name} from {uri}): {e}" - ) from e - - # File was read successfully so can move to cache - if can_be_cached: - cache_manager.move_to_cache(local_uri, cache_ref) - - return post_process_get( - result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent - ) - def knows(self, ref: DatasetRef) -> bool: """Check if the dataset is known to the datastore. @@ -2247,200 +1993,9 @@ def get( ref = ref.overrideStorageClass(storageClass) allGetInfo = self._prepare_for_direct_get(ref, parameters) - return self._execute_get(allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager) - - def _execute_get( - self, - allGetInfo: list[DatastoreFileGetInformation], - *, - ref: DatasetRef, - parameters: Mapping[str, Any] | None, - cache_manager: AbstractDatastoreCacheManager, - ) -> Any: - refStorageClass = ref.datasetType.storageClass - refComponent = ref.datasetType.component() - # Create mapping from component name to related info - allComponents = {i.component: i for i in allGetInfo} - - # By definition the dataset is disassembled if we have more - # than one record for it. - isDisassembled = len(allGetInfo) > 1 - - # Look for the special case where we are disassembled but the - # component is a derived component that was not written during - # disassembly. For this scenario we need to check that the - # component requested is listed as a derived component for the - # composite storage class - isDisassembledReadOnlyComponent = False - if isDisassembled and refComponent: - # The composite storage class should be accessible through - # the component dataset type - compositeStorageClass = ref.datasetType.parentStorageClass - - # In the unlikely scenario where the composite storage - # class is not known, we can only assume that this is a - # normal component. 
If that assumption is wrong then the - # branch below that reads a persisted component will fail - # so there is no need to complain here. - if compositeStorageClass is not None: - isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents - - if isDisassembled and not refComponent: - # This was a disassembled dataset spread over multiple files - # and we need to put them all back together again. - # Read into memory and then assemble - - # Check that the supplied parameters are suitable for the type read - refStorageClass.validateParameters(parameters) - - # We want to keep track of all the parameters that were not used - # by formatters. We assume that if any of the component formatters - # use a parameter that we do not need to apply it again in the - # assembler. - usedParams = set() - - components: dict[str, Any] = {} - for getInfo in allGetInfo: - # assemblerParams are parameters not understood by the - # associated formatter. - usedParams.update(set(getInfo.formatterParams)) - - component = getInfo.component - - if component is None: - raise RuntimeError(f"Internal error in datastore assembly of {ref}") - - # We do not want the formatter to think it's reading - # a component though because it is really reading a - # standalone dataset -- always tell reader it is not a - # component. - components[component] = self._read_artifact_into_memory( - getInfo, ref.makeComponentRef(component), cache_manager, isComponent=False - ) - - inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) - - # Any unused parameters will have to be passed to the assembler - if parameters: - unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} - else: - unusedParams = {} - - # Process parameters - return ref.datasetType.storageClass.delegate().handleParameters( - inMemoryDataset, parameters=unusedParams - ) - - elif isDisassembledReadOnlyComponent: - compositeStorageClass = ref.datasetType.parentStorageClass - if compositeStorageClass is None: - raise RuntimeError( - f"Unable to retrieve derived component '{refComponent}' since" - "no composite storage class is available." - ) - - if refComponent is None: - # Mainly for mypy - raise RuntimeError(f"Internal error in datastore {self.name}: component can not be None here") - - # Assume that every derived component can be calculated by - # forwarding the request to a single read/write component. - # Rather than guessing which rw component is the right one by - # scanning each for a derived component of the same name, - # we ask the storage class delegate directly which one is best to - # use. - compositeDelegate = compositeStorageClass.delegate() - forwardedComponent = compositeDelegate.selectResponsibleComponent( - refComponent, set(allComponents) - ) - - # Select the relevant component - rwInfo = allComponents[forwardedComponent] - - # For now assume that read parameters are validated against - # the real component and not the requested component - forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass - forwardedStorageClass.validateParameters(parameters) - - # The reference to use for the caching must refer to the forwarded - # component and not the derived component. - cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) - - # Unfortunately the FileDescriptor inside the formatter will have - # the wrong write storage class so we need to create a new one - # given the immutability constraint. 
- writeStorageClass = rwInfo.info.storageClass - - # We may need to put some thought into parameters for read - # components but for now forward them on as is - readFormatter = type(rwInfo.formatter)( - FileDescriptor( - rwInfo.location, - readStorageClass=refStorageClass, - storageClass=writeStorageClass, - parameters=parameters, - ), - ref.dataId, - ) - - # The assembler can not receive any parameter requests for a - # derived component at this time since the assembler will - # see the storage class of the derived component and those - # parameters will have to be handled by the formatter on the - # forwarded storage class. - assemblerParams: dict[str, Any] = {} - - # Need to created a new info that specifies the derived - # component and associated storage class - readInfo = DatastoreFileGetInformation( - rwInfo.location, - readFormatter, - rwInfo.info, - assemblerParams, - {}, - refComponent, - refStorageClass, - ) - - return self._read_artifact_into_memory( - readInfo, ref, cache_manager, isComponent=True, cache_ref=cache_ref - ) - - else: - # Single file request or component from that composite file - for lookup in (refComponent, None): - if lookup in allComponents: - getInfo = allComponents[lookup] - break - else: - raise FileNotFoundError( - f"Component {refComponent} not found for ref {ref} in datastore {self.name}" - ) - - # Do not need the component itself if already disassembled - if isDisassembled: - isComponent = False - else: - isComponent = getInfo.component is not None - - # For a component read of a composite we want the cache to - # be looking at the composite ref itself. - cache_ref = ref.makeCompositeRef() if isComponent else ref - - # For a disassembled component we can validate parameters against - # the component storage class directly - if isDisassembled: - refStorageClass.validateParameters(parameters) - else: - # For an assembled composite this could be a derived - # component derived from a real component. The validity - # of the parameters is not clear. For now validate against - # the composite storage class - getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) - - return self._read_artifact_into_memory( - getInfo, ref, cache_manager, isComponent=isComponent, cache_ref=cache_ref - ) + return get_dataset_as_python_object( + allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager + ) @transactional def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: diff --git a/python/lsst/daf/butler/datastores/file_datastore/__init__.py b/python/lsst/daf/butler/datastores/file_datastore/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/lsst/daf/butler/datastores/file_datastore/get.py b/python/lsst/daf/butler/datastores/file_datastore/get.py new file mode 100644 index 0000000000..e13b1ed54f --- /dev/null +++ b/python/lsst/daf/butler/datastores/file_datastore/get.py @@ -0,0 +1,493 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. 
If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +__all__ = ( + "DatastoreFileGetInformation", + "DatasetLocationInformation", + "prepare_for_get", + "get_dataset_as_python_object", +) + +from collections.abc import Mapping +from dataclasses import dataclass +from typing import Any, TypeAlias + +from lsst.daf.butler import DataCoordinate, DatasetRef, FileDescriptor, Formatter, Location, StorageClass +from lsst.daf.butler.datastore.cache_manager import AbstractDatastoreCacheManager +from lsst.daf.butler.datastore.generic_base import post_process_get +from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo +from lsst.utils.introspection import get_instance_of +from lsst.utils.logging import getLogger +from lsst.utils.timer import time_this + +log = getLogger(__name__) + +DatasetLocationInformation: TypeAlias = tuple[Location, StoredFileInfo] + + +@dataclass(frozen=True) +class DatastoreFileGetInformation: + """Collection of useful parameters needed to retrieve a file from + a Datastore. + """ + + location: Location + """The location from which to read the dataset.""" + + formatter: Formatter + """The `Formatter` to use to deserialize the dataset.""" + + info: StoredFileInfo + """Stored information about this file and its formatter.""" + + assemblerParams: Mapping[str, Any] + """Parameters to use for post-processing the retrieved dataset.""" + + formatterParams: Mapping[str, Any] + """Parameters that were understood by the associated formatter.""" + + component: str | None + """The component to be retrieved (can be `None`).""" + + readStorageClass: StorageClass + """The `StorageClass` of the dataset being read.""" + + +def prepare_for_get( + fileLocations: list[DatasetLocationInformation], + *, + refStorageClass: StorageClass, + refComponent: str | None, + parameters: Mapping[str, Any] | None, + dataId: DataCoordinate, +) -> list[DatastoreFileGetInformation]: + disassembled = len(fileLocations) > 1 + fileGetInfo = [] + for location, storedFileInfo in fileLocations: + # The storage class used to write the file + writeStorageClass = storedFileInfo.storageClass + + # If this has been disassembled we need read to match the write + if disassembled: + readStorageClass = writeStorageClass + else: + readStorageClass = refStorageClass + + formatter = get_instance_of( + storedFileInfo.formatter, + FileDescriptor( + location, + readStorageClass=readStorageClass, + storageClass=writeStorageClass, + parameters=parameters, + ), + dataId, + ) + + formatterParams, notFormatterParams = formatter.segregateParameters() + + # Of the remaining parameters, extract the ones supported by + # this StorageClass (for components not all will be handled) + assemblerParams = readStorageClass.filterParameters(notFormatterParams) + + # The ref itself could be a component if the dataset was + # disassembled by 
butler, or we disassembled in datastore and + # components came from the datastore records + component = storedFileInfo.component if storedFileInfo.component else refComponent + + fileGetInfo.append( + DatastoreFileGetInformation( + location, + formatter, + storedFileInfo, + assemblerParams, + formatterParams, + component, + readStorageClass, + ) + ) + + return fileGetInfo + + +def _read_artifact_into_memory( + getInfo: DatastoreFileGetInformation, + ref: DatasetRef, + cache_manager: AbstractDatastoreCacheManager, + isComponent: bool = False, + cache_ref: DatasetRef | None = None, +) -> Any: + """Read the artifact from datastore into in memory object. + + Parameters + ---------- + getInfo : `DatastoreFileGetInformation` + Information about the artifact within the datastore. + ref : `DatasetRef` + The registry information associated with this artifact. + isComponent : `bool` + Flag to indicate if a component is being read from this artifact. + cache_ref : `DatasetRef`, optional + The DatasetRef to use when looking up the file in the cache. + This ref must have the same ID as the supplied ref but can + be a parent ref or component ref to indicate to the cache whether + a composite file is being requested from the cache or a component + file. Without this the cache will default to the supplied ref but + it can get confused with read-only derived components for + disassembled composites. + + Returns + ------- + inMemoryDataset : `object` + The artifact as a python object. + """ + location = getInfo.location + uri = location.uri + log.debug("Accessing data from %s", uri) + + if cache_ref is None: + cache_ref = ref + if cache_ref.id != ref.id: + raise ValueError( + "The supplied cache dataset ref refers to a different dataset than expected:" + f" {ref.id} != {cache_ref.id}" + ) + + # Cannot recalculate checksum but can compare size as a quick check + # Do not do this if the size is negative since that indicates + # we do not know. + recorded_size = getInfo.info.file_size + resource_size = uri.size() + if recorded_size >= 0 and resource_size != recorded_size: + raise RuntimeError( + "Integrity failure in Datastore. " + f"Size of file {uri} ({resource_size}) " + f"does not match size recorded in registry of {recorded_size}" + ) + + # For the general case we have choices for how to proceed. + # 1. Always use a local file (downloading the remote resource to a + # temporary file if needed). + # 2. Use a threshold size and read into memory and use bytes. + # Use both for now with an arbitrary hand off size. + # This allows small datasets to be downloaded from remote object + # stores without requiring a temporary file. 
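In outline, the hand-off implemented below reduces to this small decision helper (a sketch only; the threshold mirrors the nbytes_max constant used in the surrounding code, and the helper name is illustrative):

    def choose_read_strategy(resource_size: int, can_read_bytes: bool, nbytes_max: int = 10_000_000) -> str:
        # Small artifacts whose formatter supports byte reads are pulled straight
        # into memory; everything else goes through a local file, downloading to
        # a temporary file when the URI is remote.
        if resource_size <= nbytes_max and can_read_bytes:
            return "read-bytes"
        return "read-local-file"
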
+ + formatter = getInfo.formatter + nbytes_max = 10_000_000 # Arbitrary number that we can tune + if resource_size <= nbytes_max and formatter.can_read_bytes(): + with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: + if cached_file is not None: + desired_uri = cached_file + msg = f" (cached version of {uri})" + else: + desired_uri = uri + msg = "" + with time_this(log, msg="Reading bytes from %s%s", args=(desired_uri, msg)): + serializedDataset = desired_uri.read() + log.debug( + "Deserializing %s from %d bytes from location %s with formatter %s", + f"component {getInfo.component}" if isComponent else "", + len(serializedDataset), + uri, + formatter.name(), + ) + try: + result = formatter.fromBytes( + serializedDataset, component=getInfo.component if isComponent else None + ) + except Exception as e: + raise ValueError( + f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" + f" ({ref.datasetType.name} from {uri}): {e}" + ) from e + else: + # Read from file. + + # Have to update the Location associated with the formatter + # because formatter.read does not allow an override. + # This could be improved. + location_updated = False + msg = "" + + # First check in cache for local version. + # The cache will only be relevant for remote resources but + # no harm in always asking. Context manager ensures that cache + # file is not deleted during cache expiration. + with cache_manager.find_in_cache(cache_ref, uri.getExtension()) as cached_file: + if cached_file is not None: + msg = f"(via cache read of remote file {uri})" + uri = cached_file + location_updated = True + + with uri.as_local() as local_uri: + can_be_cached = False + if uri != local_uri: + # URI was remote and file was downloaded + cache_msg = "" + location_updated = True + + if cache_manager.should_be_cached(cache_ref): + # In this scenario we want to ask if the downloaded + # file should be cached but we should not cache + # it until after we've used it (to ensure it can't + # be expired whilst we are using it). + can_be_cached = True + + # Say that it is "likely" to be cached because + # if the formatter read fails we will not be + # caching this file. + cache_msg = " and likely cached" + + msg = f"(via download to local file{cache_msg})" + + # Calculate the (possibly) new location for the formatter + # to use. 
+ newLocation = Location(*local_uri.split()) if location_updated else None + + log.debug( + "Reading%s from location %s %s with formatter %s", + f" component {getInfo.component}" if isComponent else "", + uri, + msg, + formatter.name(), + ) + try: + with ( + formatter._updateLocation(newLocation), + time_this( + log, + msg="Reading%s from location %s %s with formatter %s", + args=( + f" component {getInfo.component}" if isComponent else "", + uri, + msg, + formatter.name(), + ), + ), + ): + result = formatter.read(component=getInfo.component if isComponent else None) + except Exception as e: + raise ValueError( + f"Failure from formatter '{formatter.name()}' for dataset {ref.id}" + f" ({ref.datasetType.name} from {uri}): {e}" + ) from e + + # File was read successfully so can move to cache + if can_be_cached: + cache_manager.move_to_cache(local_uri, cache_ref) + + return post_process_get( + result, ref.datasetType.storageClass, getInfo.assemblerParams, isComponent=isComponent + ) + + +def get_dataset_as_python_object( + allGetInfo: list[DatastoreFileGetInformation], + *, + ref: DatasetRef, + parameters: Mapping[str, Any] | None, + cache_manager: AbstractDatastoreCacheManager, +) -> Any: + refStorageClass = ref.datasetType.storageClass + refComponent = ref.datasetType.component() + # Create mapping from component name to related info + allComponents = {i.component: i for i in allGetInfo} + + # By definition the dataset is disassembled if we have more + # than one record for it. + isDisassembled = len(allGetInfo) > 1 + + # Look for the special case where we are disassembled but the + # component is a derived component that was not written during + # disassembly. For this scenario we need to check that the + # component requested is listed as a derived component for the + # composite storage class + isDisassembledReadOnlyComponent = False + if isDisassembled and refComponent: + # The composite storage class should be accessible through + # the component dataset type + compositeStorageClass = ref.datasetType.parentStorageClass + + # In the unlikely scenario where the composite storage + # class is not known, we can only assume that this is a + # normal component. If that assumption is wrong then the + # branch below that reads a persisted component will fail + # so there is no need to complain here. + if compositeStorageClass is not None: + isDisassembledReadOnlyComponent = refComponent in compositeStorageClass.derivedComponents + + if isDisassembled and not refComponent: + # This was a disassembled dataset spread over multiple files + # and we need to put them all back together again. + # Read into memory and then assemble + + # Check that the supplied parameters are suitable for the type read + refStorageClass.validateParameters(parameters) + + # We want to keep track of all the parameters that were not used + # by formatters. We assume that if any of the component formatters + # use a parameter that we do not need to apply it again in the + # assembler. + usedParams = set() + + components: dict[str, Any] = {} + for getInfo in allGetInfo: + # assemblerParams are parameters not understood by the + # associated formatter. + usedParams.update(set(getInfo.formatterParams)) + + component = getInfo.component + + if component is None: + raise RuntimeError(f"Internal error in datastore assembly of {ref}") + + # We do not want the formatter to think it's reading + # a component though because it is really reading a + # standalone dataset -- always tell reader it is not a + # component. 
+ components[component] = _read_artifact_into_memory( + getInfo, ref.makeComponentRef(component), cache_manager, isComponent=False + ) + + inMemoryDataset = ref.datasetType.storageClass.delegate().assemble(components) + + # Any unused parameters will have to be passed to the assembler + if parameters: + unusedParams = {k: v for k, v in parameters.items() if k not in usedParams} + else: + unusedParams = {} + + # Process parameters + return ref.datasetType.storageClass.delegate().handleParameters( + inMemoryDataset, parameters=unusedParams + ) + + elif isDisassembledReadOnlyComponent: + compositeStorageClass = ref.datasetType.parentStorageClass + if compositeStorageClass is None: + raise RuntimeError( + f"Unable to retrieve derived component '{refComponent}' since" + "no composite storage class is available." + ) + + if refComponent is None: + # Mainly for mypy + raise RuntimeError("Internal error in datastore: component can not be None here") + + # Assume that every derived component can be calculated by + # forwarding the request to a single read/write component. + # Rather than guessing which rw component is the right one by + # scanning each for a derived component of the same name, + # we ask the storage class delegate directly which one is best to + # use. + compositeDelegate = compositeStorageClass.delegate() + forwardedComponent = compositeDelegate.selectResponsibleComponent(refComponent, set(allComponents)) + + # Select the relevant component + rwInfo = allComponents[forwardedComponent] + + # For now assume that read parameters are validated against + # the real component and not the requested component + forwardedStorageClass = rwInfo.formatter.fileDescriptor.readStorageClass + forwardedStorageClass.validateParameters(parameters) + + # The reference to use for the caching must refer to the forwarded + # component and not the derived component. + cache_ref = ref.makeCompositeRef().makeComponentRef(forwardedComponent) + + # Unfortunately the FileDescriptor inside the formatter will have + # the wrong write storage class so we need to create a new one + # given the immutability constraint. + writeStorageClass = rwInfo.info.storageClass + + # We may need to put some thought into parameters for read + # components but for now forward them on as is + readFormatter = type(rwInfo.formatter)( + FileDescriptor( + rwInfo.location, + readStorageClass=refStorageClass, + storageClass=writeStorageClass, + parameters=parameters, + ), + ref.dataId, + ) + + # The assembler can not receive any parameter requests for a + # derived component at this time since the assembler will + # see the storage class of the derived component and those + # parameters will have to be handled by the formatter on the + # forwarded storage class. 
+ assemblerParams: dict[str, Any] = {} + + # Need to created a new info that specifies the derived + # component and associated storage class + readInfo = DatastoreFileGetInformation( + rwInfo.location, + readFormatter, + rwInfo.info, + assemblerParams, + {}, + refComponent, + refStorageClass, + ) + + return _read_artifact_into_memory(readInfo, ref, cache_manager, isComponent=True, cache_ref=cache_ref) + + else: + # Single file request or component from that composite file + for lookup in (refComponent, None): + if lookup in allComponents: + getInfo = allComponents[lookup] + break + else: + raise FileNotFoundError(f"Component {refComponent} not found for ref {ref} in datastore") + + # Do not need the component itself if already disassembled + if isDisassembled: + isComponent = False + else: + isComponent = getInfo.component is not None + + # For a component read of a composite we want the cache to + # be looking at the composite ref itself. + cache_ref = ref.makeCompositeRef() if isComponent else ref + + # For a disassembled component we can validate parameters against + # the component storage class directly + if isDisassembled: + refStorageClass.validateParameters(parameters) + else: + # For an assembled composite this could be a derived + # component derived from a real component. The validity + # of the parameters is not clear. For now validate against + # the composite storage class + getInfo.formatter.fileDescriptor.storageClass.validateParameters(parameters) + + return _read_artifact_into_memory( + getInfo, ref, cache_manager, isComponent=isComponent, cache_ref=cache_ref + ) From 2cf08cd3584732943862b83923541667e22868c0 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Tue, 28 Nov 2023 17:17:44 -0700 Subject: [PATCH 3/9] Add Pydantic utilities in RemoteButler Added re-usable functions for sending a POST using a Pydantic model, and for parsing a response using the model. 
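In outline, the helpers added by this patch follow the pattern sketched here (a sketch only, written against httpx and a Pydantic-v2-style model API to match the diff below; the standalone function names are illustrative stand-ins for the new RemoteButler methods):

    from typing import TypeVar

    import httpx
    from pydantic import BaseModel

    _AnyPydanticModel = TypeVar("_AnyPydanticModel", bound=BaseModel)


    def post_model(client: httpx.Client, url: str, model: BaseModel) -> httpx.Response:
        # Serialize the request body explicitly so unset fields are omitted.
        content = model.model_dump_json(exclude_unset=True).encode("utf-8")
        return client.post(url, content=content, headers={"content-type": "application/json"})


    def parse_model(response: httpx.Response, model: type[_AnyPydanticModel]) -> _AnyPydanticModel:
        # Validate the raw response bytes directly into the requested model type.
        return model.model_validate_json(response.content)
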
--- .../butler/remote_butler/_remote_butler.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 53a28bc631..f563c5e961 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -31,7 +31,7 @@ from collections.abc import Collection, Iterable, Mapping, Sequence from contextlib import AbstractContextManager -from typing import TYPE_CHECKING, Any, TextIO +from typing import TYPE_CHECKING, Any, TextIO, Type, TypeVar import httpx from lsst.daf.butler import __version__ @@ -41,7 +41,9 @@ from .._butler import Butler from .._butler_config import ButlerConfig -from .._dataset_ref import DatasetRef, SerializedDatasetRef +from .._compat import _BaseModelCompat +from .._config import Config +from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, SerializedDatasetRef from .._dataset_type import DatasetType, SerializedDatasetType from .._storage_class import StorageClass from ..dimensions import DataCoordinate, DimensionConfig, DimensionUniverse, SerializedDataCoordinate @@ -52,9 +54,7 @@ from .server_models import FindDatasetModel if TYPE_CHECKING: - from .._config import Config from .._dataset_existence import DatasetExistence - from .._dataset_ref import DatasetId, DatasetIdGenEnum from .._deferredDatasetHandle import DeferredDatasetHandle from .._file_dataset import FileDataset from .._limited_butler import LimitedButler @@ -66,6 +66,9 @@ from ..transfers import RepoExportContext +_AnyPydanticModel = TypeVar("_AnyPydanticModel", bound=_BaseModelCompat) + + class RemoteButler(Butler): def __init__( self, @@ -328,12 +331,12 @@ def find_dataset( ) path = f"find_dataset/{dataset_type}" - response = self._client.post( - self._get_url(path), json=query.model_dump(mode="json", exclude_unset=True, exclude_defaults=True) - ) + response = self._post(path, query) response.raise_for_status() - return DatasetRef.from_simple(SerializedDatasetRef(**response.json()), universe=self.dimensions) + return DatasetRef.from_simple( + self._parse_model(response, SerializedDatasetRef), universe=self.dimensions + ) def retrieveArtifacts( self, @@ -530,3 +533,11 @@ def _get_url(self, path: str, version: str = "v1") -> str: The full path to the endpoint. """ return f"{version}/{path}" + + def _post(self, path: str, model: _BaseModelCompat) -> httpx.Response: + json = model.model_dump_json(exclude_unset=True).encode("utf-8") + url = self._get_url(path) + return self._client.post(url, content=json, headers={"content-type": "application/json"}) + + def _parse_model(self, response: httpx.Response, model: Type[_AnyPydanticModel]) -> _AnyPydanticModel: + return model.model_validate_json(response.content) From 6a1aa9e1dfd758f01c3ffda7b37bc13e5e13b8c2 Mon Sep 17 00:00:00 2001 From: "David H. 
Irving" Date: Wed, 22 Nov 2023 10:43:26 -0700 Subject: [PATCH 4/9] Fix dicts in RemoteButler data IDs Fixed an issue where UnboundLocalError would be thrown when passing a dict as a DataId to a RemoteButler function --- python/lsst/daf/butler/remote_butler/_remote_butler.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index f563c5e961..ad05352078 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -159,14 +159,9 @@ def _simplify_dataId( if isinstance(dataId, DataCoordinate): return dataId.to_simple() - if dataId is None: - data_id = kwargs - elif kwargs: - # Change variable because DataId is immutable and mypy complains. - data_id = dict(dataId) - data_id.update(kwargs) - # Assume we can treat it as a dict. + data_id = dict(dataId) if dataId is not None else {} + data_id.update(kwargs) return SerializedDataCoordinate(dataId=data_id) def _caching_context(self) -> AbstractContextManager[None]: From dfe02c4c530873f6001e45bdf0fcad5afe25c138 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Wed, 29 Nov 2023 11:58:34 -0700 Subject: [PATCH 5/9] Implement RemoteButler.get() Added a partial implementation of get() for RemoteButler, which so far only handles the DatasetRef form of get(). This modifies the DataStore interface to include a function that serializes the information necessary to do the get() on the client side. --- python/lsst/daf/butler/_butler.py | 2 +- .../lsst/daf/butler/datastore/_datastore.py | 17 ++++++ .../daf/butler/datastore/cache_manager.py | 6 +- .../daf/butler/datastore/stored_file_info.py | 37 +++++++++++- .../daf/butler/datastores/fileDatastore.py | 30 +++++++--- .../butler/datastores/fileDatastoreClient.py | 58 +++++++++++++++++++ .../datastores/file_datastore/__init__.py | 26 +++++++++ .../butler/datastores/file_datastore/get.py | 8 +-- .../butler/remote_butler/_remote_butler.py | 18 +++++- .../remote_butler/server/_exceptions.py | 33 +++++++++++ .../server/handlers/_external.py | 20 ++++++- .../daf/butler/remote_butler/server_models.py | 12 +++- python/lsst/daf/butler/tests/utils.py | 7 ++- tests/test_server.py | 32 ++++++++-- 14 files changed, 280 insertions(+), 26 deletions(-) create mode 100644 python/lsst/daf/butler/datastores/fileDatastoreClient.py create mode 100644 python/lsst/daf/butler/remote_butler/server/_exceptions.py diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index e2804dd0aa..02323ac4d2 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -814,7 +814,7 @@ def get_dataset_type(self, name: str) -> DatasetType: def get_dataset( self, id: DatasetId, - storage_class: str | StorageClass | None, + storage_class: str | StorageClass | None = None, dimension_records: bool = False, datastore_records: bool = False, ) -> DatasetRef | None: diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index f0226643f3..9dfd8f8327 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -540,6 +540,23 @@ def get( """ raise NotImplementedError("Must be implemented by subclass") + def prepare_get_for_external_client(self, ref: DatasetRef) -> object: + """Retrieve serializable data that can be used to execute a get() + + Parameters + ---------- + ref : `DatasetRef` + 
Reference to the required dataset. + + Returns + ------- + payload : `object` + Serializable payload containing the information needed to perform a + get() operation. This payload may be sent over the wire to another + system to perform the get(). + """ + raise NotImplementedError() + @abstractmethod def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None: """Write a `InMemoryDataset` with a given `DatasetRef` to the store. diff --git a/python/lsst/daf/butler/datastore/cache_manager.py b/python/lsst/daf/butler/datastore/cache_manager.py index 8196cf2a45..034f8ac3d3 100644 --- a/python/lsst/daf/butler/datastore/cache_manager.py +++ b/python/lsst/daf/butler/datastore/cache_manager.py @@ -1015,7 +1015,11 @@ class DatastoreDisabledCacheManager(AbstractDatastoreCacheManager): in lookup keys. """ - def __init__(self, config: str | DatastoreCacheManagerConfig, universe: DimensionUniverse): + def __init__( + self, + config: str | DatastoreCacheManagerConfig | None = None, + universe: DimensionUniverse | None = None, + ): return def should_be_cached(self, entity: DatasetRef | DatasetType | StorageClass) -> bool: diff --git a/python/lsst/daf/butler/datastore/stored_file_info.py b/python/lsst/daf/butler/datastore/stored_file_info.py index ab1572d901..b173656b48 100644 --- a/python/lsst/daf/butler/datastore/stored_file_info.py +++ b/python/lsst/daf/butler/datastore/stored_file_info.py @@ -27,13 +27,14 @@ from __future__ import annotations -__all__ = ("StoredDatastoreItemInfo", "StoredFileInfo") +__all__ = ("StoredDatastoreItemInfo", "StoredFileInfo", "SerializedStoredFileInfo") import inspect from collections.abc import Iterable, Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Any +from lsst.daf.butler._compat import _BaseModelCompat from lsst.resources import ResourcePath from lsst.utils import doImportType from lsst.utils.introspection import get_full_type_name @@ -260,6 +261,13 @@ def to_record(self, **kwargs: Any) -> dict[str, Any]: **kwargs, ) + def to_simple(self) -> SerializedStoredFileInfo: + record = self.to_record() + # We allow None on the model but the record contains a "null string" + # instead + record["component"] = self.component + return SerializedStoredFileInfo.model_validate(record) + def file_location(self, factory: LocationFactory) -> Location: """Return the location of artifact. @@ -307,6 +315,10 @@ def from_record(cls: type[StoredFileInfo], record: Mapping[str, Any]) -> StoredF ) return info + @classmethod + def from_simple(cls: type[StoredFileInfo], model: SerializedStoredFileInfo) -> StoredFileInfo: + return cls.from_record(dict(model)) + def update(self, **kwargs: Any) -> StoredFileInfo: new_args = {} for k in self.__slots__: @@ -320,3 +332,26 @@ def update(self, **kwargs: Any) -> StoredFileInfo: def __reduce__(self) -> str | tuple[Any, ...]: return (self.from_record, (self.to_record(),)) + + +class SerializedStoredFileInfo(_BaseModelCompat): + """Serialized representation of `StoredFileInfo` properties""" + + formatter: str + """Fully-qualified name of Formatter.""" + + path: str + """Path to dataset within Datastore.""" + + storage_class: str + """Name of the StorageClass associated with Dataset.""" + + component: str | None + """Component associated with this file. 
Can be None if the file does + not refer to a component of a composite.""" + + checksum: str | None + """Checksum of the serialized dataset.""" + + file_size: int + """Size of the serialized dataset in bytes.""" diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 199c893b79..990e1d69cf 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -36,8 +36,7 @@ import logging from collections import defaultdict from collections.abc import Callable, Iterable, Mapping, Sequence -from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast -from typing import TYPE_CHECKING, Any, ClassVar, cast, TypeAlias +from typing import TYPE_CHECKING, Any, ClassVar, cast from lsst.daf.butler import ( Config, @@ -75,8 +74,12 @@ from lsst.daf.butler.datastores.file_datastore.get import ( DatasetLocationInformation, DatastoreFileGetInformation, - get_dataset_as_python_object, - prepare_for_get, + generate_datastore_get_information, + get_dataset_as_python_object_from_get_info, +) +from lsst.daf.butler.datastores.fileDatastoreClient import ( + FileDatastoreGetPayload, + FileDatastoreGetPayloadFileInfo, ) from lsst.daf.butler.registry.interfaces import ( DatabaseInsertMode, @@ -115,8 +118,6 @@ def __init__(self, datasets: Iterable[FileDataset]): self.datasets = datasets - - class FileDatastore(GenericBaseDatastore[StoredFileInfo]): """Generic Datastore for file-based implementations. @@ -732,7 +733,7 @@ def _prepare_for_direct_get( # Is this a component request? refComponent = ref.datasetType.component() - return prepare_for_get( + return generate_datastore_get_information( fileLocations, refStorageClass=refStorageClass, refComponent=refComponent, @@ -1993,10 +1994,23 @@ def get( ref = ref.overrideStorageClass(storageClass) allGetInfo = self._prepare_for_direct_get(ref, parameters) - return get_dataset_as_python_object( + return get_dataset_as_python_object_from_get_info( allGetInfo, ref=ref, parameters=parameters, cache_manager=self.cacheManager ) + def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPayload: + # Docstring inherited + + def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo: + location, file_info = info + return FileDatastoreGetPayloadFileInfo(url=location.uri.geturl(), metadata=file_info.to_simple()) + + return FileDatastoreGetPayload( + datastore_type="file", + dataset_ref=ref.to_simple(), + file_info=[to_file_info_payload(info) for info in self._get_dataset_locations_info(ref)], + ) + @transactional def put(self, inMemoryDataset: Any, ref: DatasetRef) -> None: """Write a InMemoryDataset with a given `DatasetRef` to the store. 
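As a quick illustration of how the new hook is meant to be consumed, the following is a minimal sketch of a server-side helper that builds the payload for a client. The helper name and its error handling are hypothetical; the authoritative version is the FastAPI get_file handler added later in this patch. Only calls introduced by this series are used: Butler.get_dataset() with datastore_records=True and FileDatastore.prepare_get_for_external_client() reached through the private _datastore attribute, exactly as the server handler does.

from lsst.daf.butler import Butler, DatasetId
from lsst.daf.butler.datastores.fileDatastoreClient import FileDatastoreGetPayload


def build_get_file_payload(butler: Butler, dataset_id: DatasetId) -> FileDatastoreGetPayload:
    # Hypothetical helper, for illustration only.
    # Datastore records must be requested so that file locations and stored
    # file info can be serialized without another registry round trip.
    ref = butler.get_dataset(dataset_id, datastore_records=True)
    if ref is None:
        raise LookupError(f"Dataset {dataset_id} not found")
    # FileDatastore gathers absolute URLs plus the per-file datastore
    # records the client needs to instantiate formatters on its side.
    return butler._datastore.prepare_get_for_external_client(ref)

The resulting model is what the new fileDatastoreClient module introduced below deserializes on the client side.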
diff --git a/python/lsst/daf/butler/datastores/fileDatastoreClient.py b/python/lsst/daf/butler/datastores/fileDatastoreClient.py new file mode 100644 index 0000000000..9dcaed323d --- /dev/null +++ b/python/lsst/daf/butler/datastores/fileDatastoreClient.py @@ -0,0 +1,58 @@ +__all__ = ("get_dataset_as_python_object", "FileDatastoreGetPayload") + +from typing import Any, Literal + +from lsst.daf.butler import DatasetRef, DimensionUniverse, Location, SerializedDatasetRef, StorageClass +from lsst.daf.butler._compat import _BaseModelCompat +from lsst.daf.butler.datastore.cache_manager import DatastoreDisabledCacheManager +from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo, StoredFileInfo +from lsst.daf.butler.datastores.file_datastore.get import ( + DatasetLocationInformation, + Mapping, + generate_datastore_get_information, + get_dataset_as_python_object_from_get_info, +) + + +class FileDatastoreGetPayloadFileInfo(_BaseModelCompat): + # TODO DM-41879: Allowing arbitrary URLs here is a severe security issue, + # since it allows the server to trick the client into fetching data from + # any file on its local filesystem or from remote storage using credentials + # laying around in the environment. This should be restricted to only + # HTTP, but we don't yet have a means of mocking out HTTP gets in tests. + url: str + metadata: SerializedStoredFileInfo + + +class FileDatastoreGetPayload(_BaseModelCompat): + datastore_type: Literal["file"] + file_info: list[FileDatastoreGetPayloadFileInfo] + dataset_ref: SerializedDatasetRef + + +def get_dataset_as_python_object( + model: FileDatastoreGetPayload, + *, + universe: DimensionUniverse, + parameters: Mapping[str, Any] | None, + storageClass: StorageClass | str | None, +) -> Any: + fileLocations: list[DatasetLocationInformation] = [ + (Location(None, file_info.url), StoredFileInfo.from_simple(file_info.metadata)) + for file_info in model.file_info + ] + + ref = DatasetRef.from_simple(model.dataset_ref, universe=universe) + if storageClass is not None: + ref = ref.overrideStorageClass(storageClass) + + datastore_file_info = generate_datastore_get_information( + fileLocations, + refStorageClass=ref.datasetType.storageClass, + refComponent=ref.datasetType.component(), + parameters=parameters, + dataId=ref.dataId, + ) + return get_dataset_as_python_object_from_get_info( + datastore_file_info, ref=ref, parameters=parameters, cache_manager=DatastoreDisabledCacheManager() + ) diff --git a/python/lsst/daf/butler/datastores/file_datastore/__init__.py b/python/lsst/daf/butler/datastores/file_datastore/__init__.py index e69de29bb2..36d1294e36 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/__init__.py +++ b/python/lsst/daf/butler/datastores/file_datastore/__init__.py @@ -0,0 +1,26 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. 
If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . diff --git a/python/lsst/daf/butler/datastores/file_datastore/get.py b/python/lsst/daf/butler/datastores/file_datastore/get.py index e13b1ed54f..dc15df1065 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/get.py +++ b/python/lsst/daf/butler/datastores/file_datastore/get.py @@ -28,8 +28,8 @@ __all__ = ( "DatastoreFileGetInformation", "DatasetLocationInformation", - "prepare_for_get", - "get_dataset_as_python_object", + "generate_datastore_get_information", + "get_dataset_as_python_object_from_get_info", ) from collections.abc import Mapping @@ -77,7 +77,7 @@ class DatastoreFileGetInformation: """The `StorageClass` of the dataset being read.""" -def prepare_for_get( +def generate_datastore_get_information( fileLocations: list[DatasetLocationInformation], *, refStorageClass: StorageClass, @@ -306,7 +306,7 @@ def _read_artifact_into_memory( ) -def get_dataset_as_python_object( +def get_dataset_as_python_object_from_get_info( allGetInfo: list[DatastoreFileGetInformation], *, ref: DatasetRef, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index ad05352078..1eebc2ca7f 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -35,6 +35,7 @@ import httpx from lsst.daf.butler import __version__ +from lsst.daf.butler.datastores.fileDatastoreClient import get_dataset_as_python_object from lsst.daf.butler.repo_relocation import replaceRoot from lsst.resources import ResourcePath, ResourcePathExpression from lsst.utils.introspection import get_full_type_name @@ -51,7 +52,7 @@ from ..registry.wildcards import CollectionWildcard from ._authentication import get_authentication_headers, get_authentication_token_from_environment from ._config import RemoteButlerConfigModel -from .server_models import FindDatasetModel +from .server_models import FindDatasetModel, GetFileRequestModel, GetFileResponseModel if TYPE_CHECKING: from .._dataset_existence import DatasetExistence @@ -215,7 +216,20 @@ def get( **kwargs: Any, ) -> Any: # Docstring inherited. 
- raise NotImplementedError() + if not isinstance(datasetRefOrType, DatasetRef): + raise NotImplementedError("RemoteButler currently only supports get() of a DatasetRef") + + dataset_id = datasetRefOrType.id + request = GetFileRequestModel(dataset_id=dataset_id) + response = self._post("get_file", request) + if response.status_code == 404: + raise LookupError(f"Dataset not found with ID ${dataset_id}") + response.raise_for_status() + model = self._parse_model(response, GetFileResponseModel) + + return get_dataset_as_python_object( + model, parameters=parameters, storageClass=storageClass, universe=self.dimensions + ) def getURIs( self, diff --git a/python/lsst/daf/butler/remote_butler/server/_exceptions.py b/python/lsst/daf/butler/remote_butler/server/_exceptions.py new file mode 100644 index 0000000000..2514cd9046 --- /dev/null +++ b/python/lsst/daf/butler/remote_butler/server/_exceptions.py @@ -0,0 +1,33 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +from fastapi import HTTPException + + +class NotFoundException(HTTPException): + def __init__(self, message: str = "Not found"): + super().__init__(status_code=404, detail=message) diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py index 7b36133008..a765e55c1e 100644 --- a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py +++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py @@ -32,9 +32,14 @@ from fastapi import APIRouter, Depends from lsst.daf.butler import SerializedDatasetRef, SerializedDatasetType -from lsst.daf.butler.remote_butler.server_models import FindDatasetModel +from lsst.daf.butler.remote_butler.server_models import ( + FindDatasetModel, + GetFileRequestModel, + GetFileResponseModel, +) from .._dependencies import factory_dependency +from .._exceptions import NotFoundException from .._factory import Factory external_router = APIRouter() @@ -152,3 +157,16 @@ def find_dataset( **data_id, ) return ref.to_simple() if ref else None + + +@external_router.post("/v1/get_file") +def get_file( + request: GetFileRequestModel, + factory: Factory = Depends(factory_dependency), +) -> GetFileResponseModel: + butler = factory.create_butler() + ref = butler.get_dataset(request.dataset_id, datastore_records=True) + if ref is None: + raise NotFoundException(f"Dataset ID {request.dataset_id} not found") + payload = butler._datastore.prepare_get_for_external_client(ref) + return GetFileResponseModel.model_validate(payload) diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py index 1b4d59b5c1..5b9fa957c6 100644 --- a/python/lsst/daf/butler/remote_butler/server_models.py +++ b/python/lsst/daf/butler/remote_butler/server_models.py @@ -27,9 +27,10 @@ """Models used for client/server communication.""" -__all__ = ["FindDatasetModel"] +__all__ = ["FindDatasetModel", "GetFileResponseModel"] -from lsst.daf.butler import SerializedDataCoordinate +from lsst.daf.butler import DatasetId, SerializedDataCoordinate +from lsst.daf.butler.datastores.fileDatastoreClient import FileDatastoreGetPayload from .._compat import _BaseModelCompat @@ -42,3 +43,10 @@ class FindDatasetModel(_BaseModelCompat): storage_class: str | None dimension_records: bool = False datastore_records: bool = False + + +class GetFileRequestModel(_BaseModelCompat): + dataset_id: DatasetId + + +GetFileResponseModel = FileDatastoreGetPayload diff --git a/python/lsst/daf/butler/tests/utils.py b/python/lsst/daf/butler/tests/utils.py index 84e5d1a650..df4238c254 100644 --- a/python/lsst/daf/butler/tests/utils.py +++ b/python/lsst/daf/butler/tests/utils.py @@ -223,11 +223,16 @@ class MetricTestRepo: The path to the config file, to pass to ``Butler.makeRepo``. 
""" + METRICS_EXAMPLE_SUMMARY = {"AM1": 5.2, "AM2": 30.6} + """The summary data included in ``MetricsExample`` objects stored in the + test repo + """ + @staticmethod def _makeExampleMetrics() -> MetricsExample: """Make an object to put into the repository.""" return MetricsExample( - {"AM1": 5.2, "AM2": 30.6}, + MetricTestRepo.METRICS_EXAMPLE_SUMMARY, {"a": [1, 2, 3], "b": {"blue": 5, "red": "green"}}, [563, 234, 456.7, 752, 8, 9, 27], ) diff --git a/tests/test_server.py b/tests/test_server.py index c050b97320..8430e91202 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -43,7 +43,7 @@ from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, MissingDatasetTypeError, StorageClassFactory from lsst.daf.butler.tests import DatastoreMock -from lsst.daf.butler.tests.utils import MetricTestRepo, makeTestTempDir, removeTestTempDir +from lsst.daf.butler.tests.utils import MetricsExample, MetricTestRepo, makeTestTempDir, removeTestTempDir from lsst.resources.http import HttpResourcePath TESTDIR = os.path.abspath(os.path.dirname(__file__)) @@ -76,9 +76,6 @@ def setUpClass(cls): # Override the server's Butler initialization to point at our test repo server_butler = Butler.from_config(cls.root, writeable=True) - # Not yet testing butler.get() - DatastoreMock.apply(server_butler) - def create_factory_dependency(): return Factory(butler=server_butler) @@ -89,7 +86,9 @@ def create_factory_dependency(): cls.client.base_url = "http://test.example/api/butler/" cls.butler = _make_remote_butler(cls.client) - # Populate the test server. + # Populate the test server. The DatastoreMock is required because the + # datasets referenced in these imports do not point at real files + DatastoreMock.apply(server_butler) server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml")) server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "datasets.yaml")) @@ -197,6 +196,29 @@ def override_read(http_resource_path): assert isinstance(butler, RemoteButler) assert str(butler._config.remote_butler.url) == "https://test.example/api/butler/" + def test_get(self): + # Test get() of a DatasetRef + ref = self.butler.find_dataset( + "test_metric_comp", + {"instrument": "DummyCamComp", "visit": 423}, + collections="ingest/run", + ) + metric = self.butler.get(ref) + self.assertIsInstance(metric, MetricsExample) + self.assertEqual(metric.summary, MetricTestRepo.METRICS_EXAMPLE_SUMMARY) + + # Test looking up a non-existent ref + invalid_ref = ref.replace(id=uuid.uuid4()) + with self.assertRaises(LookupError): + self.butler.get(invalid_ref) + + # Test storage class override + new_sc = self.storageClassFactory.getStorageClass("MetricsConversion") + converted = self.butler.get(ref, storageClass=new_sc) + self.assertNotEqual(type(metric), type(converted)) + self.assertIs(type(converted), new_sc.pytype) + self.assertEqual(metric, converted) + if __name__ == "__main__": unittest.main() From b779efb2935e695ab1ca9dd9434ece54da663657 Mon Sep 17 00:00:00 2001 From: "David H. 
Irving" Date: Wed, 29 Nov 2023 15:46:07 -0700 Subject: [PATCH 6/9] Deduplicate FileDatastore functions Remove a small amount of duplicated code by simplifying the function parameters --- .../lsst/daf/butler/datastores/fileDatastore.py | 7 ++----- .../butler/datastores/fileDatastoreClient.py | 4 +--- .../daf/butler/datastores/file_datastore/get.py | 17 ++++++++++------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 990e1d69cf..685eee7168 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -731,14 +731,11 @@ def _prepare_for_direct_get( if not fileLocations: raise FileNotFoundError(f"None of the component files for dataset {ref} exist.") - # Is this a component request? - refComponent = ref.datasetType.component() return generate_datastore_get_information( fileLocations, - refStorageClass=refStorageClass, - refComponent=refComponent, + readStorageClass=refStorageClass, + ref=ref, parameters=parameters, - dataId=ref.dataId, ) def _prepare_for_put(self, inMemoryDataset: Any, ref: DatasetRef) -> tuple[Location, Formatter]: diff --git a/python/lsst/daf/butler/datastores/fileDatastoreClient.py b/python/lsst/daf/butler/datastores/fileDatastoreClient.py index 9dcaed323d..525e396b02 100644 --- a/python/lsst/daf/butler/datastores/fileDatastoreClient.py +++ b/python/lsst/daf/butler/datastores/fileDatastoreClient.py @@ -48,10 +48,8 @@ def get_dataset_as_python_object( datastore_file_info = generate_datastore_get_information( fileLocations, - refStorageClass=ref.datasetType.storageClass, - refComponent=ref.datasetType.component(), + ref=ref, parameters=parameters, - dataId=ref.dataId, ) return get_dataset_as_python_object_from_get_info( datastore_file_info, ref=ref, parameters=parameters, cache_manager=DatastoreDisabledCacheManager() diff --git a/python/lsst/daf/butler/datastores/file_datastore/get.py b/python/lsst/daf/butler/datastores/file_datastore/get.py index dc15df1065..27474c12c1 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/get.py +++ b/python/lsst/daf/butler/datastores/file_datastore/get.py @@ -36,7 +36,7 @@ from dataclasses import dataclass from typing import Any, TypeAlias -from lsst.daf.butler import DataCoordinate, DatasetRef, FileDescriptor, Formatter, Location, StorageClass +from lsst.daf.butler import DatasetRef, FileDescriptor, Formatter, Location, StorageClass from lsst.daf.butler.datastore.cache_manager import AbstractDatastoreCacheManager from lsst.daf.butler.datastore.generic_base import post_process_get from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo @@ -80,11 +80,16 @@ class DatastoreFileGetInformation: def generate_datastore_get_information( fileLocations: list[DatasetLocationInformation], *, - refStorageClass: StorageClass, - refComponent: str | None, + ref: DatasetRef, parameters: Mapping[str, Any] | None, - dataId: DataCoordinate, + readStorageClass: StorageClass | None = None, ) -> list[DatastoreFileGetInformation]: + if readStorageClass is None: + readStorageClass = ref.datasetType.storageClass + + # Is this a component request? 
+ refComponent = ref.datasetType.component() + disassembled = len(fileLocations) > 1 fileGetInfo = [] for location, storedFileInfo in fileLocations: @@ -94,8 +99,6 @@ def generate_datastore_get_information( # If this has been disassembled we need read to match the write if disassembled: readStorageClass = writeStorageClass - else: - readStorageClass = refStorageClass formatter = get_instance_of( storedFileInfo.formatter, @@ -105,7 +108,7 @@ def generate_datastore_get_information( storageClass=writeStorageClass, parameters=parameters, ), - dataId, + ref.dataId, ) formatterParams, notFormatterParams = formatter.segregateParameters() From 833686aa2266c9b77c75597ff44a2564cb629319 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Wed, 29 Nov 2023 16:34:09 -0700 Subject: [PATCH 7/9] Add documentation --- .../daf/butler/datastores/fileDatastore.py | 4 +- .../butler/datastores/fileDatastoreClient.py | 47 +++++++++++++++++-- .../butler/datastores/file_datastore/get.py | 42 +++++++++++++++++ 3 files changed, 87 insertions(+), 6 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 685eee7168..cf2d879447 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2000,7 +2000,9 @@ def prepare_get_for_external_client(self, ref: DatasetRef) -> FileDatastoreGetPa def to_file_info_payload(info: DatasetLocationInformation) -> FileDatastoreGetPayloadFileInfo: location, file_info = info - return FileDatastoreGetPayloadFileInfo(url=location.uri.geturl(), metadata=file_info.to_simple()) + return FileDatastoreGetPayloadFileInfo( + url=location.uri.geturl(), datastoreRecords=file_info.to_simple() + ) return FileDatastoreGetPayload( datastore_type="file", diff --git a/python/lsst/daf/butler/datastores/fileDatastoreClient.py b/python/lsst/daf/butler/datastores/fileDatastoreClient.py index 525e396b02..a13cb83851 100644 --- a/python/lsst/daf/butler/datastores/fileDatastoreClient.py +++ b/python/lsst/daf/butler/datastores/fileDatastoreClient.py @@ -15,34 +15,71 @@ class FileDatastoreGetPayloadFileInfo(_BaseModelCompat): + """Information required to read a single file stored in `FileDatastore`""" + # TODO DM-41879: Allowing arbitrary URLs here is a severe security issue, # since it allows the server to trick the client into fetching data from # any file on its local filesystem or from remote storage using credentials # laying around in the environment. This should be restricted to only # HTTP, but we don't yet have a means of mocking out HTTP gets in tests. url: str - metadata: SerializedStoredFileInfo + """An absolute URL that can be used to read the file""" + + datastoreRecords: SerializedStoredFileInfo + """`FileDatastore` metadata records for this file""" class FileDatastoreGetPayload(_BaseModelCompat): + """A serializable representation of the data needed for retrieving an + artifact and converting it to a python object. 
+ """ + datastore_type: Literal["file"] + file_info: list[FileDatastoreGetPayloadFileInfo] + """List of retrieval information for each file associated with this + artifact + """ + dataset_ref: SerializedDatasetRef + """Registry information associated with this artifact""" def get_dataset_as_python_object( - model: FileDatastoreGetPayload, + payload: FileDatastoreGetPayload, *, universe: DimensionUniverse, parameters: Mapping[str, Any] | None, storageClass: StorageClass | str | None, ) -> Any: + """Retrieve an artifact from storage and return it as a Python object + + Parameters + ---------- + payload : `FileDatastoreGetPayload` + Pre-processed information about each file associated with this artifact + universe: `DimensionUniverse` + The universe of dimensions associated with the `DatasetRef` contained + in `payload`. + parameters : `Mapping`[`str`, `Any`] + `StorageClass` and `Formatter` parameters to be used when converting + the artifact to a Python object + storageClass: `StorageClass` | `str` | `None` + Overrides the `StorageClass` to be used when converting the artifact to + a Python object. If `None`, uses the `StorageClass` specified by + `payload`. + + Returns + ------- + python_object: `Any` + The retrieved artifact, converted to a Python object + """ fileLocations: list[DatasetLocationInformation] = [ - (Location(None, file_info.url), StoredFileInfo.from_simple(file_info.metadata)) - for file_info in model.file_info + (Location(None, file_info.url), StoredFileInfo.from_simple(file_info.datastoreRecords)) + for file_info in payload.file_info ] - ref = DatasetRef.from_simple(model.dataset_ref, universe=universe) + ref = DatasetRef.from_simple(payload.dataset_ref, universe=universe) if storageClass is not None: ref = ref.overrideStorageClass(storageClass) diff --git a/python/lsst/daf/butler/datastores/file_datastore/get.py b/python/lsst/daf/butler/datastores/file_datastore/get.py index 27474c12c1..7019582e3b 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/get.py +++ b/python/lsst/daf/butler/datastores/file_datastore/get.py @@ -84,6 +84,27 @@ def generate_datastore_get_information( parameters: Mapping[str, Any] | None, readStorageClass: StorageClass | None = None, ) -> list[DatastoreFileGetInformation]: + """Process parameters and instantiate formatters for in preparation for + retrieving an artifact and converting it to a Python object + + Parameters + ---------- + fileLocations : `list`[`DatasetLocationInformation`] + List of file locations for this artifact and their associated datastore + records + ref : `DatasetRef` + The registry information associated with this artifact. + parameters : `Mapping`[`str`, `Any`] + `StorageClass` and `Formatter` parameters + readStorageClass : `StorageClass` | `None`, optional + The StorageClass to use when ultimately returning the resulting object + from the get. Defaults to the `StorageClass` specified by `ref` + + Returns + ------- + getInfo : `list[DatastoreFileGetInformation]` + The parameters needed to retrieve each file + """ if readStorageClass is None: readStorageClass = ref.datasetType.storageClass @@ -154,6 +175,8 @@ def _read_artifact_into_memory( The registry information associated with this artifact. isComponent : `bool` Flag to indicate if a component is being read from this artifact. + cache_manager: `AbstractDatastoreCacheManager` + The cache manager to use for caching retrieved files cache_ref : `DatasetRef`, optional The DatasetRef to use when looking up the file in the cache. 
This ref must have the same ID as the supplied ref but can @@ -316,6 +339,25 @@ def get_dataset_as_python_object_from_get_info( parameters: Mapping[str, Any] | None, cache_manager: AbstractDatastoreCacheManager, ) -> Any: + """Retrieve an artifact from storage and return it as a Python object + + Parameters + ---------- + allGetInfo : `list`[`DatastoreFileGetInformation`] + Pre-processed information about each file associated with this artifact + ref : `DatasetRef` + The registry information associated with this artifact. + parameters : `Mapping`[`str`, `Any`] + `StorageClass` and `Formatter` parameters + cache_manager: `AbstractDatastoreCacheManager` + The cache manager to use for caching retrieved files + + Returns + ------- + python_object: `Any` + The retrieved artifact, converted to a Python object according to the + `StorageClass` specified in `ref`. + """ refStorageClass = ref.datasetType.storageClass refComponent = ref.datasetType.component() # Create mapping from component name to related info From 77b8e9920fca9705b1538f6779d1a066d6db2876 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Thu, 7 Dec 2023 10:44:02 -0700 Subject: [PATCH 8/9] Require keyword arguments for get_dataset --- python/lsst/daf/butler/_butler.py | 1 + python/lsst/daf/butler/direct_butler.py | 1 + python/lsst/daf/butler/remote_butler/_remote_butler.py | 1 + 3 files changed, 3 insertions(+) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 02323ac4d2..04eae2647e 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -814,6 +814,7 @@ def get_dataset_type(self, name: str) -> DatasetType: def get_dataset( self, id: DatasetId, + *, storage_class: str | StorageClass | None = None, dimension_records: bool = False, datastore_records: bool = False, diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py index ab857d9fbd..9b20102c81 100644 --- a/python/lsst/daf/butler/direct_butler.py +++ b/python/lsst/daf/butler/direct_butler.py @@ -1337,6 +1337,7 @@ def get_dataset_type(self, name: str) -> DatasetType: def get_dataset( self, id: DatasetId, + *, storage_class: str | StorageClass | None = None, dimension_records: bool = False, datastore_records: bool = False, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 1eebc2ca7f..2d0c451e9b 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -274,6 +274,7 @@ def get_dataset_type(self, name: str) -> DatasetType: def get_dataset( self, id: DatasetId, + *, storage_class: str | StorageClass | None = None, dimension_records: bool = False, datastore_records: bool = False, From d718a09bc6748ff0c60dc2faed10a22feda0d3dd Mon Sep 17 00:00:00 2001 From: "David H. 
Irving" Date: Thu, 7 Dec 2023 10:14:37 -0700 Subject: [PATCH 9/9] Tweak comments per review Co-authored-by: Tim Jenness --- python/lsst/daf/butler/datastore/stored_file_info.py | 4 ++-- python/lsst/daf/butler/datastores/fileDatastoreClient.py | 7 ++++--- python/lsst/daf/butler/datastores/file_datastore/get.py | 8 ++++---- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/lsst/daf/butler/datastore/stored_file_info.py b/python/lsst/daf/butler/datastore/stored_file_info.py index b173656b48..73eb3c75a4 100644 --- a/python/lsst/daf/butler/datastore/stored_file_info.py +++ b/python/lsst/daf/butler/datastore/stored_file_info.py @@ -215,7 +215,7 @@ def __init__( """StorageClass associated with Dataset.""" component: str | None - """Component associated with this file. Can be None if the file does + """Component associated with this file. Can be `None` if the file does not refer to a component of a composite.""" checksum: str | None @@ -347,7 +347,7 @@ class SerializedStoredFileInfo(_BaseModelCompat): """Name of the StorageClass associated with Dataset.""" component: str | None - """Component associated with this file. Can be None if the file does + """Component associated with this file. Can be `None` if the file does not refer to a component of a composite.""" checksum: str | None diff --git a/python/lsst/daf/butler/datastores/fileDatastoreClient.py b/python/lsst/daf/butler/datastores/fileDatastoreClient.py index a13cb83851..9eb09b52be 100644 --- a/python/lsst/daf/butler/datastores/fileDatastoreClient.py +++ b/python/lsst/daf/butler/datastores/fileDatastoreClient.py @@ -57,11 +57,12 @@ def get_dataset_as_python_object( Parameters ---------- payload : `FileDatastoreGetPayload` - Pre-processed information about each file associated with this artifact + Pre-processed information about each file associated with this + artifact. universe: `DimensionUniverse` The universe of dimensions associated with the `DatasetRef` contained - in `payload`. - parameters : `Mapping`[`str`, `Any`] + in ``payload``. + parameters : `Mapping`[`str`, `typing.Any`] `StorageClass` and `Formatter` parameters to be used when converting the artifact to a Python object storageClass: `StorageClass` | `str` | `None` diff --git a/python/lsst/daf/butler/datastores/file_datastore/get.py b/python/lsst/daf/butler/datastores/file_datastore/get.py index 7019582e3b..c10f1914f4 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/get.py +++ b/python/lsst/daf/butler/datastores/file_datastore/get.py @@ -102,7 +102,7 @@ def generate_datastore_get_information( Returns ------- - getInfo : `list[DatastoreFileGetInformation]` + getInfo : `list` [`DatastoreFileGetInformation`] The parameters needed to retrieve each file """ if readStorageClass is None: @@ -349,14 +349,14 @@ def get_dataset_as_python_object_from_get_info( The registry information associated with this artifact. parameters : `Mapping`[`str`, `Any`] `StorageClass` and `Formatter` parameters - cache_manager: `AbstractDatastoreCacheManager` + cache_manager : `AbstractDatastoreCacheManager` The cache manager to use for caching retrieved files Returns ------- - python_object: `Any` + python_object : `typing.Any` The retrieved artifact, converted to a Python object according to the - `StorageClass` specified in `ref`. + `StorageClass` specified in ``ref``. """ refStorageClass = ref.datasetType.storageClass refComponent = ref.datasetType.component()