diff --git a/doc/changes/DM-41116.api.md b/doc/changes/DM-41116.api.md
new file mode 100644
index 0000000000..7dbbf2837e
--- /dev/null
+++ b/doc/changes/DM-41116.api.md
@@ -0,0 +1,3 @@
+- The `Butler` class is now an abstract base class; the original `Butler` implementation has been renamed to `DirectButler`.
+- Clients that need access to the `DirectButler` class have to import it from `lsst.daf.butler.direct_butler`.
+- `Butler.from_config(...)` should be used to construct `Butler` instances. `Butler(...)` still works and is identical to `Butler.from_config(...)`, but it will generate `mypy` errors.
diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index a8a3e0a18d..a01541f9ad 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -25,32 +25,18 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
-"""Butler top level classes.
-"""
 from __future__ import annotations
 
-__all__ = (
-    "Butler",
-    "ButlerValidationError",
-)
-
-import collections.abc
-import contextlib
-import io
-import logging
-import numbers
-import os
-import warnings
-from collections import Counter, defaultdict
-from collections.abc import Iterable, Iterator, MutableMapping, Sequence
-from typing import TYPE_CHECKING, Any, ClassVar, TextIO
-
-from deprecated.sphinx import deprecated
+__all__ = ["Butler"]
+
+from abc import abstractmethod
+from collections.abc import Collection, Iterable, Sequence
+from contextlib import AbstractContextManager
+from typing import Any, TextIO
+
 from lsst.resources import ResourcePath, ResourcePathExpression
 from lsst.utils import doImportType
-from lsst.utils.introspection import get_class_of
-from lsst.utils.logging import VERBOSE, getLogger
-from sqlalchemy.exc import IntegrityError
+from lsst.utils.logging import getLogger
 
 from ._butler_config import ButlerConfig
 from ._butler_repo_index import ButlerRepoIndex
@@ -59,69 +45,30 @@
 from ._dataset_ref import DatasetIdGenEnum, DatasetRef
 from ._dataset_type import DatasetType
 from ._deferredDatasetHandle import DeferredDatasetHandle
-from ._exceptions import ValidationError
 from ._file_dataset import FileDataset
 from ._limited_butler import LimitedButler
-from ._registry_shim import RegistryShim
-from ._storage_class import StorageClass, StorageClassFactory
-from ._timespan import Timespan
-from .datastore import DatasetRefURIs, Datastore, NullDatastore
-from .dimensions import (
-    DataCoordinate,
-    DataId,
-    DataIdValue,
-    Dimension,
-    DimensionConfig,
-    DimensionElement,
-    DimensionRecord,
-    DimensionUniverse,
-)
-from .progress import Progress
-from .registry import (
-    CollectionType,
-    ConflictingDefinitionError,
-    DataIdError,
-    MissingDatasetTypeError,
-    NoDefaultCollectionError,
-    Registry,
-    RegistryConfig,
-    RegistryDefaults,
-    _ButlerRegistry,
-    _RegistryFactory,
-)
+from ._storage_class import StorageClass
+from .datastore import DatasetRefURIs, Datastore
+from .dimensions import DataId, DimensionConfig
+from .registry import Registry, RegistryConfig, _RegistryFactory
 from .repo_relocation import BUTLER_ROOT_TAG
 from .transfers import RepoExportContext
-from .utils import transactional
-
-if TYPE_CHECKING:
-    from lsst.resources import ResourceHandleProtocol
-
-    from .transfers import RepoImportBackend
-
-log = getLogger(__name__)
-
-
-class ButlerValidationError(ValidationError):
-    """There is a problem with the Butler configuration."""
-
-    pass
+_LOG = getLogger(__name__)
 
 
 class Butler(LimitedButler):
-    """Main entry point for the data
access system. + """Interface for data butler and factory for Butler instances. Parameters ---------- config : `ButlerConfig`, `Config` or `str`, optional. - Configuration. Anything acceptable to the - `ButlerConfig` constructor. If a directory path - is given the configuration will be read from a ``butler.yaml`` file in - that location. If `None` is given default values will be used. - butler : `Butler`, optional. - If provided, construct a new Butler that uses the same registry and - datastore as the given one, but with the given collection and run. - Incompatible with the ``config``, ``searchPaths``, and ``writeable`` - arguments. + Configuration. Anything acceptable to the `ButlerConfig` constructor. + If a directory path is given the configuration will be read from a + ``butler.yaml`` file in that location. If `None` is given default + values will be used. If ``config`` contains "cls" key then its value is + used as a name of butler class and it must be a sub-class of this + class, otherwise `DirectButler` is instantiated. collections : `str` or `~collections.abc.Iterable` [ `str` ], optional An expression specifying the collections to be searched (in order) when reading datasets. @@ -151,179 +98,182 @@ class Butler(LimitedButler): the default for that dimension. Nonexistent collections are ignored. If a default value is provided explicitly for a governor dimension via ``**kwargs``, no default will be inferred for that dimension. - without_datastore : `bool`, optional - If `True` do not attach a datastore to this butler. Any attempts - to use a datastore will fail. - **kwargs : `str` - Default data ID key-value pairs. These may only identify "governor" - dimensions like ``instrument`` and ``skymap``. - - Examples - -------- - While there are many ways to control exactly how a `Butler` interacts with - the collections in its `Registry`, the most common cases are still simple. - - For a read-only `Butler` that searches one collection, do:: - - butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"]) - - For a read-write `Butler` that writes to and reads from a - `~CollectionType.RUN` collection:: - - butler = Butler("/path/to/repo", run="u/alice/DM-50000/a") - - The `Butler` passed to a ``PipelineTask`` is often much more complex, - because we want to write to one `~CollectionType.RUN` collection but read - from several others (as well):: - - butler = Butler("/path/to/repo", run="u/alice/DM-50000/a", - collections=["u/alice/DM-50000/a", - "u/bob/DM-49998", - "HSC/defaults"]) - - This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. - Datasets will be read first from that run (since it appears first in the - chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``. - - Finally, one can always create a `Butler` with no collections:: - - butler = Butler("/path/to/repo", writeable=True) - - This can be extremely useful when you just want to use ``butler.registry``, - e.g. for inserting dimension data or managing collections, or when the - collections you want to use with the butler are not consistent. - Passing ``writeable`` explicitly here is only necessary if you want to be - able to make changes to the repo - usually the value for ``writeable`` can - be guessed from the collection arguments provided, but it defaults to - `False` when there are not collection arguments. + **kwargs : `Any` + Additional keyword arguments passed to a constructor of actual butler + class. 
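+        For the default `DirectButler` these are typically default data ID
+        key-value pairs identifying "governor" dimensions such as
+        ``instrument`` and ``skymap``.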
+ + Notes + ----- + The preferred way to instantiate Butler is via the `from_config` method. + The call to ``Butler(...)`` is equivalent to ``Butler.from_config(...)``, + but ``mypy`` will complain about the former. """ - def __init__( - self, + def __new__( + cls, config: Config | ResourcePathExpression | None = None, *, - butler: Butler | None = None, collections: Any = None, run: str | None = None, searchPaths: Sequence[ResourcePathExpression] | None = None, writeable: bool | None = None, inferDefaults: bool = True, - without_datastore: bool = False, - **kwargs: str, - ): - defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) - # Load registry, datastore, etc. from config or existing butler. - if butler is not None: - if config is not None or searchPaths is not None or writeable is not None: - raise TypeError( - "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." - ) - self._registry = butler._registry.copy(defaults) - self._datastore = butler._datastore - self.storageClasses = butler.storageClasses - self._config: ButlerConfig = butler._config - else: - self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore) - try: - butlerRoot = self._config.get("root", self._config.configDir) - if writeable is None: - writeable = run is not None - self._registry = _RegistryFactory(self._config).from_config( - butlerRoot=butlerRoot, writeable=writeable, defaults=defaults - ) - if without_datastore: - self._datastore = NullDatastore(None, None) - else: - self._datastore = Datastore.fromConfig( - self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot - ) - # TODO: Once datastore drops dependency on registry we can - # construct datastore first and pass opaque tables to registry - # constructor. - self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions()) - self.storageClasses = StorageClassFactory() - self.storageClasses.addFromConfig(self._config) - except Exception: - # Failures here usually mean that configuration is incomplete, - # just issue an error message which includes config file URI. - log.error(f"Failed to instantiate Butler from config {self._config.configFile}.") - raise - - # For execution butler the datastore needs a special - # dependency-inversion trick. This is not used by regular butler, - # but we do not have a way to distinguish regular butler from execution - # butler. - self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) - - if "run" in self._config or "collection" in self._config: - raise ValueError("Passing a run or collection via configuration is no longer supported.") - - self._registry_shim = RegistryShim(self) - - GENERATION: ClassVar[int] = 3 - """This is a Generation 3 Butler. - - This attribute may be removed in the future, once the Generation 2 Butler - interface has been fully retired; it should only be used in transitional - code. - """ + **kwargs: Any, + ) -> Butler: + if cls is Butler: + cls = cls._find_butler_class(config, searchPaths) + # Note: we do not pass any parameters to __new__, Python will pass them + # to __init__ after __new__ returns sub-class instance. 
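+        # For example, ``Butler("/path/to/repo")`` loads the configuration,
+        # finds no "cls" key in it, and therefore returns a ``DirectButler``
+        # instance; that subclass's ``__init__`` is then called with the same
+        # arguments.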
+ return super().__new__(cls) - def _retrieve_dataset_type(self, name: str) -> DatasetType | None: - """Return DatasetType defined in registry given dataset type name.""" - try: - return self._registry.getDatasetType(name) - except MissingDatasetTypeError: - return None + @staticmethod + def _find_butler_class( + config: Config | ResourcePathExpression | None = None, + searchPaths: Sequence[ResourcePathExpression] | None = None, + ) -> type[Butler]: + """Find actual class to instantiate.""" + butler_class_name: str | None = None + if config is not None: + # Check for optional "cls" key in config. + if not isinstance(config, Config): + config = ButlerConfig(config, searchPaths=searchPaths) + butler_class_name = config.get("cls") + + # Make DirectButler if class is not specified. + butler_class: type[Butler] + if butler_class_name is None: + from .direct_butler import DirectButler + + butler_class = DirectButler + else: + butler_class = doImportType(butler_class_name) + if not issubclass(butler_class, Butler): + raise TypeError(f"{butler_class_name} is not a subclass of Butler") + return butler_class @classmethod - def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: - """Look up the label in a butler repository index. + def from_config( + cls, + config: Config | ResourcePathExpression | None = None, + *, + collections: Any = None, + run: str | None = None, + searchPaths: Sequence[ResourcePathExpression] | None = None, + writeable: bool | None = None, + inferDefaults: bool = True, + **kwargs: Any, + ) -> Butler: + """Create butler instance from configuration. Parameters ---------- - label : `str` - Label of the Butler repository to look up. - return_label : `bool`, optional - If ``label`` cannot be found in the repository index (either - because index is not defined or ``label`` is not in the index) and - ``return_label`` is `True` then return ``ResourcePath(label)``. - If ``return_label`` is `False` (default) then an exception will be - raised instead. - - Returns - ------- - uri : `lsst.resources.ResourcePath` - URI to the Butler repository associated with the given label or - default value if it is provided. - - Raises - ------ - KeyError - Raised if the label is not found in the index, or if an index - is not defined, and ``return_label`` is `False`. + config : `ButlerConfig`, `Config` or `str`, optional. + Configuration. Anything acceptable to the `ButlerConfig` + constructor. If a directory path is given the configuration will be + read from a ``butler.yaml`` file in that location. If `None` is + given default values will be used. If ``config`` contains "cls" key + then its value is used as a name of butler class and it must be a + sub-class of this class, otherwise `DirectButler` is instantiated. + collections : `str` or `~collections.abc.Iterable` [ `str` ], optional + An expression specifying the collections to be searched (in order) + when reading datasets. + This may be a `str` collection name or an iterable thereof. + See :ref:`daf_butler_collection_expressions` for more information. + These collections are not registered automatically and must be + manually registered before they are used by any method, but they + may be manually registered after the `Butler` is initialized. + run : `str`, optional + Name of the `~CollectionType.RUN` collection new datasets should be + inserted into. If ``collections`` is `None` and ``run`` is not + `None`, ``collections`` will be set to ``[run]``. If not `None`, + this collection will automatically be registered. 
If this is not + set (and ``writeable`` is not set either), a read-only butler will + be created. + searchPaths : `list` of `str`, optional + Directory paths to search when calculating the full Butler + configuration. Not used if the supplied config is already a + `ButlerConfig`. + writeable : `bool`, optional + Explicitly sets whether the butler supports write operations. If + not provided, a read-write butler is created if any of ``run``, + ``tags``, or ``chains`` is non-empty. + inferDefaults : `bool`, optional + If `True` (default) infer default data ID values from the values + present in the datasets in ``collections``: if all collections have + the same value (or no value) for a governor dimension, that value + will be the default for that dimension. Nonexistent collections + are ignored. If a default value is provided explicitly for a + governor dimension via ``**kwargs``, no default will be inferred + for that dimension. + **kwargs : `Any` + Additional keyword arguments passed to a constructor of actual + butler class. Notes ----- - See `~lsst.daf.butler.ButlerRepoIndex` for details on how the - information is discovered. - """ - return ButlerRepoIndex.get_repo_uri(label, return_label) + Calling this factory method is identical to calling + ``Butler(config, ...)``. Its only raison d'ĂȘtre is that ``mypy`` + complains about ``Butler()`` call. - @classmethod - def get_known_repos(cls) -> set[str]: - """Retrieve the list of known repository labels. + Examples + -------- + While there are many ways to control exactly how a `Butler` interacts + with the collections in its `Registry`, the most common cases are still + simple. - Returns - ------- - repos : `set` of `str` - All the known labels. Can be empty if no index can be found. + For a read-only `Butler` that searches one collection, do:: - Notes - ----- - See `~lsst.daf.butler.ButlerRepoIndex` for details on how the - information is discovered. + butler = Butler.from_config( + "/path/to/repo", collections=["u/alice/DM-50000"] + ) + + For a read-write `Butler` that writes to and reads from a + `~CollectionType.RUN` collection:: + + butler = Butler.from_config( + "/path/to/repo", run="u/alice/DM-50000/a" + ) + + The `Butler` passed to a ``PipelineTask`` is often much more complex, + because we want to write to one `~CollectionType.RUN` collection but + read from several others (as well):: + + butler = Butler.from_config( + "/path/to/repo", + run="u/alice/DM-50000/a", + collections=[ + "u/alice/DM-50000/a", "u/bob/DM-49998", "HSC/defaults" + ] + ) + + This butler will `put` new datasets to the run ``u/alice/DM-50000/a``. + Datasets will be read first from that run (since it appears first in + the chain), and then from ``u/bob/DM-49998`` and finally + ``HSC/defaults``. + + Finally, one can always create a `Butler` with no collections:: + + butler = Butler.from_config("/path/to/repo", writeable=True) + + This can be extremely useful when you just want to use + ``butler.registry``, e.g. for inserting dimension data or managing + collections, or when the collections you want to use with the butler + are not consistent. Passing ``writeable`` explicitly here is only + necessary if you want to be able to make changes to the repo - usually + the value for ``writeable`` can be guessed from the collection + arguments provided, but it defaults to `False` when there are not + collection arguments. 
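+
+        A default data ID value for a "governor" dimension such as
+        ``instrument`` can also be supplied as a keyword argument (the
+        instrument name used here is only illustrative)::
+
+            butler = Butler.from_config(
+                "/path/to/repo",
+                collections=["HSC/defaults"],
+                instrument="HSC",
+            )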
""" - return ButlerRepoIndex.get_known_repos() + cls = cls._find_butler_class(config, searchPaths) + return cls( + config, + collections=collections, + run=run, + searchPaths=searchPaths, + writeable=writeable, + inferDefaults=inferDefaults, + **kwargs, + ) @staticmethod def makeRepo( @@ -477,668 +427,69 @@ def makeRepo( dimensionConfig=dimensionConfig, butlerRoot=root_uri ) - log.verbose("Wrote new Butler configuration file to %s", configURI) + _LOG.verbose("Wrote new Butler configuration file to %s", configURI) return config @classmethod - def _unpickle( - cls, - config: ButlerConfig, - collections: tuple[str, ...] | None, - run: str | None, - defaultDataId: dict[str, str], - writeable: bool, - ) -> Butler: - """Callable used to unpickle a Butler. - - We prefer not to use ``Butler.__init__`` directly so we can force some - of its many arguments to be keyword-only (note that ``__reduce__`` - can only invoke callables with positional arguments). + def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: + """Look up the label in a butler repository index. Parameters ---------- - config : `ButlerConfig` - Butler configuration, already coerced into a true `ButlerConfig` - instance (and hence after any search paths for overrides have been - utilized). - collections : `tuple` [ `str` ] - Names of the default collections to read from. - run : `str`, optional - Name of the default `~CollectionType.RUN` collection to write to. - defaultDataId : `dict` [ `str`, `str` ] - Default data ID values. - writeable : `bool` - Whether the Butler should support write operations. + label : `str` + Label of the Butler repository to look up. + return_label : `bool`, optional + If ``label`` cannot be found in the repository index (either + because index is not defined or ``label`` is not in the index) and + ``return_label`` is `True` then return ``ResourcePath(label)``. + If ``return_label`` is `False` (default) then an exception will be + raised instead. Returns ------- - butler : `Butler` - A new `Butler` instance. - """ - # MyPy doesn't recognize that the kwargs below are totally valid; it - # seems to think '**defaultDataId* is a _positional_ argument! - return cls( - config=config, - collections=collections, - run=run, - writeable=writeable, - **defaultDataId, # type: ignore - ) - - def __reduce__(self) -> tuple: - """Support pickling.""" - return ( - Butler._unpickle, - ( - self._config, - self.collections, - self.run, - self._registry.defaults.dataId.byName(), - self._registry.isWriteable(), - ), - ) - - def __str__(self) -> str: - return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( - self.collections, self.run, self._datastore, self._registry - ) - - def isWriteable(self) -> bool: - """Return `True` if this `Butler` supports write operations.""" - return self._registry.isWriteable() + uri : `lsst.resources.ResourcePath` + URI to the Butler repository associated with the given label or + default value if it is provided. - @contextlib.contextmanager - def transaction(self) -> Iterator[None]: - """Context manager supporting `Butler` transactions. + Raises + ------ + KeyError + Raised if the label is not found in the index, or if an index + is not defined, and ``return_label`` is `False`. - Transactions can be nested. + Notes + ----- + See `~lsst.daf.butler.ButlerRepoIndex` for details on how the + information is discovered. 
""" - with self._registry.transaction(), self._datastore.transaction(): - yield - - def _standardizeArgs( - self, - datasetRefOrType: DatasetRef | DatasetType | str, - dataId: DataId | None = None, - for_put: bool = True, - **kwargs: Any, - ) -> tuple[DatasetType, DataId | None]: - """Standardize the arguments passed to several Butler APIs. + return ButlerRepoIndex.get_repo_uri(label, return_label) - Parameters - ---------- - datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` - When `DatasetRef` the `dataId` should be `None`. - Otherwise the `DatasetType` or name thereof. - dataId : `dict` or `DataCoordinate` - A `dict` of `Dimension` link name, value pairs that label the - `DatasetRef` within a Collection. When `None`, a `DatasetRef` - should be provided as the second argument. - for_put : `bool`, optional - If `True` this call is invoked as part of a `Butler.put()`. - Otherwise it is assumed to be part of a `Butler.get()`. This - parameter is only relevant if there is dataset type - inconsistency. - **kwargs - Additional keyword arguments used to augment or construct a - `DataCoordinate`. See `DataCoordinate.standardize` - parameters. + @classmethod + def get_known_repos(cls) -> set[str]: + """Retrieve the list of known repository labels. Returns ------- - datasetType : `DatasetType` - A `DatasetType` instance extracted from ``datasetRefOrType``. - dataId : `dict` or `DataId`, optional - Argument that can be used (along with ``kwargs``) to construct a - `DataId`. + repos : `set` of `str` + All the known labels. Can be empty if no index can be found. Notes ----- - Butler APIs that conceptually need a DatasetRef also allow passing a - `DatasetType` (or the name of one) and a `DataId` (or a dict and - keyword arguments that can be used to construct one) separately. This - method accepts those arguments and always returns a true `DatasetType` - and a `DataId` or `dict`. - - Standardization of `dict` vs `DataId` is best handled by passing the - returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are - generally similarly flexible. - """ - externalDatasetType: DatasetType | None = None - internalDatasetType: DatasetType | None = None - if isinstance(datasetRefOrType, DatasetRef): - if dataId is not None or kwargs: - raise ValueError("DatasetRef given, cannot use dataId as well") - externalDatasetType = datasetRefOrType.datasetType - dataId = datasetRefOrType.dataId - else: - # Don't check whether DataId is provided, because Registry APIs - # can usually construct a better error message when it wasn't. - if isinstance(datasetRefOrType, DatasetType): - externalDatasetType = datasetRefOrType - else: - internalDatasetType = self._registry.getDatasetType(datasetRefOrType) - - # Check that they are self-consistent - if externalDatasetType is not None: - internalDatasetType = self._registry.getDatasetType(externalDatasetType.name) - if externalDatasetType != internalDatasetType: - # We can allow differences if they are compatible, depending - # on whether this is a get or a put. A get requires that - # the python type associated with the datastore can be - # converted to the user type. A put requires that the user - # supplied python type can be converted to the internal - # type expected by registry. 
- relevantDatasetType = internalDatasetType - if for_put: - is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) - else: - is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) - relevantDatasetType = externalDatasetType - if not is_compatible: - raise ValueError( - f"Supplied dataset type ({externalDatasetType}) inconsistent with " - f"registry definition ({internalDatasetType})" - ) - # Override the internal definition. - internalDatasetType = relevantDatasetType - - assert internalDatasetType is not None - return internalDatasetType, dataId - - def _rewrite_data_id( - self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any - ) -> tuple[DataId | None, dict[str, Any]]: - """Rewrite a data ID taking into account dimension records. - - Take a Data ID and keyword args and rewrite it if necessary to - allow the user to specify dimension records rather than dimension - primary values. - - This allows a user to include a dataId dict with keys of - ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving - the integer exposure ID. It also allows a string to be given - for a dimension value rather than the integer ID if that is more - convenient. For example, rather than having to specifying the - detector with ``detector.full_name``, a string given for ``detector`` - will be interpreted as the full name and converted to the integer - value. - - Keyword arguments can also use strings for dimensions like detector - and exposure but python does not allow them to include ``.`` and - so the ``exposure.day_obs`` syntax can not be used in a keyword - argument. - - Parameters - ---------- - dataId : `dict` or `DataCoordinate` - A `dict` of `Dimension` link name, value pairs that will label the - `DatasetRef` within a Collection. - datasetType : `DatasetType` - The dataset type associated with this dataId. Required to - determine the relevant dimensions. - **kwargs - Additional keyword arguments used to augment or construct a - `DataId`. See `DataId` parameters. - - Returns - ------- - dataId : `dict` or `DataCoordinate` - The, possibly rewritten, dataId. If given a `DataCoordinate` and - no keyword arguments, the original dataId will be returned - unchanged. - **kwargs : `dict` - Any unused keyword arguments (would normally be empty dict). + See `~lsst.daf.butler.ButlerRepoIndex` for details on how the + information is discovered. """ - # Do nothing if we have a standalone DataCoordinate. - if isinstance(dataId, DataCoordinate) and not kwargs: - return dataId, kwargs - - # Process dimension records that are using record information - # rather than ids - newDataId: dict[str, DataIdValue] = {} - byRecord: dict[str, dict[str, Any]] = defaultdict(dict) - - # if all the dataId comes from keyword parameters we do not need - # to do anything here because they can't be of the form - # exposure.obs_id because a "." is not allowed in a keyword parameter. - if dataId: - for k, v in dataId.items(): - # If we have a Dimension we do not need to do anything - # because it cannot be a compound key. - if isinstance(k, str) and "." in k: - # Someone is using a more human-readable dataId - dimensionName, record = k.split(".", 1) - byRecord[dimensionName][record] = v - elif isinstance(k, Dimension): - newDataId[k.name] = v - else: - newDataId[k] = v - - # Go through the updated dataId and check the type in case someone is - # using an alternate key. We have already filtered out the compound - # keys dimensions.record format. 
- not_dimensions = {} - - # Will need to look in the dataId and the keyword arguments - # and will remove them if they need to be fixed or are unrecognized. - for dataIdDict in (newDataId, kwargs): - # Use a list so we can adjust the dict safely in the loop - for dimensionName in list(dataIdDict): - value = dataIdDict[dimensionName] - try: - dimension = self.dimensions.getStaticDimensions()[dimensionName] - except KeyError: - # This is not a real dimension - not_dimensions[dimensionName] = value - del dataIdDict[dimensionName] - continue - - # Convert an integral type to an explicit int to simplify - # comparisons here - if isinstance(value, numbers.Integral): - value = int(value) - - if not isinstance(value, dimension.primaryKey.getPythonType()): - for alternate in dimension.alternateKeys: - if isinstance(value, alternate.getPythonType()): - byRecord[dimensionName][alternate.name] = value - del dataIdDict[dimensionName] - log.debug( - "Converting dimension %s to %s.%s=%s", - dimensionName, - dimensionName, - alternate.name, - value, - ) - break - else: - log.warning( - "Type mismatch found for value '%r' provided for dimension %s. " - "Could not find matching alternative (primary key has type %s) " - "so attempting to use as-is.", - value, - dimensionName, - dimension.primaryKey.getPythonType(), - ) - - # By this point kwargs and newDataId should only include valid - # dimensions. Merge kwargs in to the new dataId and log if there - # are dimensions in both (rather than calling update). - for k, v in kwargs.items(): - if k in newDataId and newDataId[k] != v: - log.debug( - "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v - ) - newDataId[k] = v - # No need to retain any values in kwargs now. - kwargs = {} - - # If we have some unrecognized dimensions we have to try to connect - # them to records in other dimensions. This is made more complicated - # by some dimensions having records with clashing names. A mitigation - # is that we can tell by this point which dimensions are missing - # for the DatasetType but this does not work for calibrations - # where additional dimensions can be used to constrain the temporal - # axis. - if not_dimensions: - # Search for all dimensions even if we have been given a value - # explicitly. In some cases records are given as well as the - # actually dimension and this should not be an error if they - # match. - mandatoryDimensions = datasetType.dimensions.names # - provided - - candidateDimensions: set[str] = set() - candidateDimensions.update(mandatoryDimensions) - - # For calibrations we may well be needing temporal dimensions - # so rather than always including all dimensions in the scan - # restrict things a little. It is still possible for there - # to be confusion over day_obs in visit vs exposure for example. - # If we are not searching calibration collections things may - # fail but they are going to fail anyway because of the - # ambiguousness of the dataId... - if datasetType.isCalibration(): - for dim in self.dimensions.getStaticDimensions(): - if dim.temporal: - candidateDimensions.add(str(dim)) - - # Look up table for the first association with a dimension - guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) - - # Keep track of whether an item is associated with multiple - # dimensions. 
- counter: Counter[str] = Counter() - assigned: dict[str, set[str]] = defaultdict(set) - - # Go through the missing dimensions and associate the - # given names with records within those dimensions - matched_dims = set() - for dimensionName in candidateDimensions: - dimension = self.dimensions.getStaticDimensions()[dimensionName] - fields = dimension.metadata.names | dimension.uniqueKeys.names - for field in not_dimensions: - if field in fields: - guessedAssociation[dimensionName][field] = not_dimensions[field] - counter[dimensionName] += 1 - assigned[field].add(dimensionName) - matched_dims.add(field) - - # Calculate the fields that matched nothing. - never_found = set(not_dimensions) - matched_dims - - if never_found: - raise ValueError(f"Unrecognized keyword args given: {never_found}") - - # There is a chance we have allocated a single dataId item - # to multiple dimensions. Need to decide which should be retained. - # For now assume that the most popular alternative wins. - # This means that day_obs with seq_num will result in - # exposure.day_obs and not visit.day_obs - # Also prefer an explicitly missing dimension over an inferred - # temporal dimension. - for fieldName, assignedDimensions in assigned.items(): - if len(assignedDimensions) > 1: - # Pick the most popular (preferring mandatory dimensions) - requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) - if requiredButMissing: - candidateDimensions = requiredButMissing - else: - candidateDimensions = assignedDimensions - - # If this is a choice between visit and exposure and - # neither was a required part of the dataset type, - # (hence in this branch) always prefer exposure over - # visit since exposures are always defined and visits - # are defined from exposures. - if candidateDimensions == {"exposure", "visit"}: - candidateDimensions = {"exposure"} - - # Select the relevant items and get a new restricted - # counter. - theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} - duplicatesCounter: Counter[str] = Counter() - duplicatesCounter.update(theseCounts) - - # Choose the most common. If they are equally common - # we will pick the one that was found first. - # Returns a list of tuples - selected = duplicatesCounter.most_common(1)[0][0] - - log.debug( - "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." - " Removed ambiguity by choosing dimension %s.", - fieldName, - ", ".join(assignedDimensions), - selected, - ) - - for candidateDimension in assignedDimensions: - if candidateDimension != selected: - del guessedAssociation[candidateDimension][fieldName] - - # Update the record look up dict with the new associations - for dimensionName, values in guessedAssociation.items(): - if values: # A dict might now be empty - log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values) - byRecord[dimensionName].update(values) - - if byRecord: - # Some record specifiers were found so we need to convert - # them to the Id form - for dimensionName, values in byRecord.items(): - if dimensionName in newDataId: - log.debug( - "DataId specified explicit %s dimension value of %s in addition to" - " general record specifiers for it of %s. Ignoring record information.", - dimensionName, - newDataId[dimensionName], - str(values), - ) - # Get the actual record and compare with these values. 
- try: - recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) - except DataIdError: - raise ValueError( - f"Could not find dimension '{dimensionName}'" - f" with dataId {newDataId} as part of comparing with" - f" record values {byRecord[dimensionName]}" - ) from None - if len(recs) == 1: - errmsg: list[str] = [] - for k, v in values.items(): - if (recval := getattr(recs[0], k)) != v: - errmsg.append(f"{k}({recval} != {v})") - if errmsg: - raise ValueError( - f"Dimension {dimensionName} in dataId has explicit value" - " inconsistent with records: " + ", ".join(errmsg) - ) - else: - # Multiple matches for an explicit dimension - # should never happen but let downstream complain. - pass - continue - - # Build up a WHERE expression - bind = dict(values.items()) - where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) - - # Hopefully we get a single record that matches - records = set( - self._registry.queryDimensionRecords( - dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs - ) - ) - - if len(records) != 1: - if len(records) > 1: - # visit can have an ambiguous answer without involving - # visit_system. The default visit_system is defined - # by the instrument. - if ( - dimensionName == "visit" - and "visit_system_membership" in self.dimensions - and "visit_system" in self.dimensions["instrument"].metadata - ): - instrument_records = list( - self._registry.queryDimensionRecords( - "instrument", - dataId=newDataId, - **kwargs, - ) - ) - if len(instrument_records) == 1: - visit_system = instrument_records[0].visit_system - if visit_system is None: - # Set to a value that will never match. - visit_system = -1 - - # Look up each visit in the - # visit_system_membership records. - for rec in records: - membership = list( - self._registry.queryDimensionRecords( - # Use bind to allow zero results. - # This is a fully-specified query. - "visit_system_membership", - where="instrument = inst AND visit_system = system AND visit = v", - bind=dict( - inst=instrument_records[0].name, system=visit_system, v=rec.id - ), - ) - ) - if membership: - # This record is the right answer. - records = {rec} - break - - # The ambiguity may have been resolved so check again. - if len(records) > 1: - log.debug("Received %d records from constraints of %s", len(records), str(values)) - for r in records: - log.debug("- %s", str(r)) - raise ValueError( - f"DataId specification for dimension {dimensionName} is not" - f" uniquely constrained to a single dataset by {values}." - f" Got {len(records)} results." - ) - else: - raise ValueError( - f"DataId specification for dimension {dimensionName} matched no" - f" records when constrained by {values}" - ) - - # Get the primary key from the real dimension object - dimension = self.dimensions.getStaticDimensions()[dimensionName] - if not isinstance(dimension, Dimension): - raise RuntimeError( - f"{dimension.name} is not a true dimension, and cannot be used in data IDs." - ) - newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) - - return newDataId, kwargs - - def _findDatasetRef( - self, - datasetRefOrType: DatasetRef | DatasetType | str, - dataId: DataId | None = None, - *, - collections: Any = None, - predict: bool = False, - run: str | None = None, - datastore_records: bool = False, - **kwargs: Any, - ) -> DatasetRef: - """Shared logic for methods that start with a search for a dataset in - the registry. 
- - Parameters - ---------- - datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` - When `DatasetRef` the `dataId` should be `None`. - Otherwise the `DatasetType` or name thereof. - dataId : `dict` or `DataCoordinate`, optional - A `dict` of `Dimension` link name, value pairs that label the - `DatasetRef` within a Collection. When `None`, a `DatasetRef` - should be provided as the first argument. - collections : Any, optional - Collections to be searched, overriding ``self.collections``. - Can be any of the types supported by the ``collections`` argument - to butler construction. - predict : `bool`, optional - If `True`, return a newly created `DatasetRef` with a unique - dataset ID if finding a reference in the `Registry` fails. - Defaults to `False`. - run : `str`, optional - Run collection name to use for creating `DatasetRef` for predicted - datasets. Only used if ``predict`` is `True`. - datastore_records : `bool`, optional - If `True` add datastore records to returned `DatasetRef`. - **kwargs - Additional keyword arguments used to augment or construct a - `DataId`. See `DataId` parameters. + return ButlerRepoIndex.get_known_repos() - Returns - ------- - ref : `DatasetRef` - A reference to the dataset identified by the given arguments. - This can be the same dataset reference as given if it was - resolved. + @abstractmethod + def transaction(self) -> AbstractContextManager[None]: + """Context manager supporting `Butler` transactions. - Raises - ------ - LookupError - Raised if no matching dataset exists in the `Registry` (and - ``predict`` is `False`). - ValueError - Raised if a resolved `DatasetRef` was passed as an input, but it - differs from the one found in the registry. - TypeError - Raised if no collections were provided. + Transactions can be nested. """ - datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) - if isinstance(datasetRefOrType, DatasetRef): - if collections is not None: - warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) - # May need to retrieve datastore records if requested. - if datastore_records and datasetRefOrType._datastore_records is None: - datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) - return datasetRefOrType - timespan: Timespan | None = None - - dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) - - if datasetType.isCalibration(): - # Because this is a calibration dataset, first try to make a - # standardize the data ID without restricting the dimensions to - # those of the dataset type requested, because there may be extra - # dimensions that provide temporal information for a validity-range - # lookup. - dataId = DataCoordinate.standardize( - dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs - ) - if dataId.graph.temporal: - dataId = self._registry.expandDataId(dataId) - timespan = dataId.timespan - else: - # Standardize the data ID to just the dimensions of the dataset - # type instead of letting registry.findDataset do it, so we get the - # result even if no dataset is found. - dataId = DataCoordinate.standardize( - dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs - ) - # Always lookup the DatasetRef, even if one is given, to ensure it is - # present in the current collection. 
- ref = self._registry.findDataset( - datasetType, - dataId, - collections=collections, - timespan=timespan, - datastore_records=datastore_records, - ) - if ref is None: - if predict: - if run is None: - run = self.run - if run is None: - raise TypeError("Cannot predict dataset ID/location with run=None.") - return DatasetRef(datasetType, dataId, run=run) - else: - if collections is None: - collections = self._registry.defaults.collections - raise LookupError( - f"Dataset {datasetType.name} with data ID {dataId} " - f"could not be found in collections {collections}." - ) - if datasetType != ref.datasetType: - # If they differ it is because the user explicitly specified - # a compatible dataset type to this call rather than using the - # registry definition. The DatasetRef must therefore be recreated - # using the user definition such that the expected type is - # returned. - ref = DatasetRef( - datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records - ) + raise NotImplementedError() - return ref - - # TODO: remove on DM-40067. - @transactional - @deprecated( - reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." - " Please use Butler.put(). Be aware that you may need to adjust your usage if you" - " were relying on the run parameter to determine the run." - " Will be removed after v26.0.", - version="v26.0", - category=FutureWarning, - ) - def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: - # Docstring inherited. - return self.put(obj, ref) - - @transactional + @abstractmethod def put( self, obj: Any, @@ -1182,127 +533,9 @@ def put( TypeError Raised if the butler is read-only or if no run has been provided. """ - if isinstance(datasetRefOrType, DatasetRef): - # This is a direct put of predefined DatasetRef. - log.debug("Butler put direct: %s", datasetRefOrType) - if run is not None: - warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) - # If registry already has a dataset with the same dataset ID, - # dataset type and DataId, then _importDatasets will do nothing and - # just return an original ref. We have to raise in this case, there - # is a datastore check below for that. - self._registry._importDatasets([datasetRefOrType], expand=True) - # Before trying to write to the datastore check that it does not - # know this dataset. This is prone to races, of course. - if self._datastore.knows(datasetRefOrType): - raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") - # Try to write dataset to the datastore, if it fails due to a race - # with another write, the content of stored data may be - # unpredictable. - try: - self._datastore.put(obj, datasetRefOrType) - except IntegrityError as e: - raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e - return datasetRefOrType - - log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) - if not self.isWriteable(): - raise TypeError("Butler is read-only.") - datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) - - # Handle dimension records in dataId - dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) - - # Add Registry Dataset entry. - dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) - (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) - self._datastore.put(obj, ref) - - return ref - - # TODO: remove on DM-40067. 
- @deprecated( - reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." - " Please use Butler.get(). Will be removed after v26.0.", - version="v26.0", - category=FutureWarning, - ) - def getDirect( - self, - ref: DatasetRef, - *, - parameters: dict[str, Any] | None = None, - storageClass: StorageClass | str | None = None, - ) -> Any: - """Retrieve a stored dataset. - - Parameters - ---------- - ref : `DatasetRef` - Resolved reference to an already stored dataset. - parameters : `dict` - Additional StorageClass-defined options to control reading, - typically used to efficiently read only a subset of the dataset. - storageClass : `StorageClass` or `str`, optional - The storage class to be used to override the Python type - returned by this method. By default the returned type matches - the dataset type definition for this dataset. Specifying a - read `StorageClass` can force a different type to be returned. - This type must be compatible with the original type. - - Returns - ------- - obj : `object` - The dataset. - """ - return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) - - # TODO: remove on DM-40067. - @deprecated( - reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " - "Please use Butler.getDeferred(). Will be removed after v26.0.", - version="v26.0", - category=FutureWarning, - ) - def getDirectDeferred( - self, - ref: DatasetRef, - *, - parameters: dict | None = None, - storageClass: str | StorageClass | None = None, - ) -> DeferredDatasetHandle: - """Create a `DeferredDatasetHandle` which can later retrieve a dataset, - from a resolved `DatasetRef`. - - Parameters - ---------- - ref : `DatasetRef` - Resolved reference to an already stored dataset. - parameters : `dict` - Additional StorageClass-defined options to control reading, - typically used to efficiently read only a subset of the dataset. - storageClass : `StorageClass` or `str`, optional - The storage class to be used to override the Python type - returned by this method. By default the returned type matches - the dataset type definition for this dataset. Specifying a - read `StorageClass` can force a different type to be returned. - This type must be compatible with the original type. - - Returns - ------- - obj : `DeferredDatasetHandle` - A handle which can be used to retrieve a dataset at a later time. - - Raises - ------ - LookupError - Raised if no matching dataset exists in the `Registry`. - """ - # Check that dataset is known to the datastore. - if not self._datastore.knows(ref): - raise LookupError(f"Dataset reference {ref} is not known to datastore.") - return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) + raise NotImplementedError() + @abstractmethod def getDeferred( self, datasetRefOrType: DatasetRef | DatasetType | str, @@ -1359,19 +592,9 @@ def getDeferred( TypeError Raised if no collections were provided. """ - if isinstance(datasetRefOrType, DatasetRef): - # Do the quick check first and if that fails, check for artifact - # existence. This is necessary for datastores that are configured - # in trust mode where there won't be a record but there will be - # a file. 
- if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): - ref = datasetRefOrType - else: - raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") - else: - ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) - return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) + raise NotImplementedError() + @abstractmethod def get( self, datasetRefOrType: DatasetRef | DatasetType | str, @@ -1436,12 +659,9 @@ def get( fetched with a ``{instrument, detector, exposure}`` data ID, because ``exposure`` is a temporal dimension. """ - log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) - ref = self._findDatasetRef( - datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs - ) - return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) + raise NotImplementedError() + @abstractmethod def getURIs( self, datasetRefOrType: DatasetRef | DatasetType | str, @@ -1486,11 +706,9 @@ def getURIs( `None`), and the URIs to any components associated with the dataset artifact. (can be empty if there are no components). """ - ref = self._findDatasetRef( - datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs - ) - return self._datastore.getURIs(ref, predict) + raise NotImplementedError() + @abstractmethod def getURI( self, datasetRefOrType: DatasetRef | DatasetType | str, @@ -1552,17 +770,9 @@ def getURI( Raised if a URI is requested for a dataset that consists of multiple artifacts. """ - primary, components = self.getURIs( - datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs - ) - - if primary is None or components: - raise RuntimeError( - f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " - "Use Butler.getURIs() instead." - ) - return primary + raise NotImplementedError() + @abstractmethod def retrieveArtifacts( self, refs: Iterable[DatasetRef], @@ -1606,14 +816,9 @@ def retrieveArtifacts( a hierarchical data structure in a NoSQL database may well be stored as a JSON file. """ - return self._datastore.retrieveArtifacts( - refs, - ResourcePath(destination), - transfer=transfer, - preserve_path=preserve_path, - overwrite=overwrite, - ) + raise NotImplementedError() + @abstractmethod def exists( self, dataset_ref_or_type: DatasetRef | DatasetType | str, @@ -1658,49 +863,9 @@ def exists( datastore. Evaluates to `True` if the dataset is present and known to both. """ - existence = DatasetExistence.UNRECOGNIZED - - if isinstance(dataset_ref_or_type, DatasetRef): - if collections is not None: - warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) - if data_id is not None: - warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) - ref = dataset_ref_or_type - registry_ref = self._registry.getDataset(dataset_ref_or_type.id) - if registry_ref is not None: - existence |= DatasetExistence.RECORDED - - if dataset_ref_or_type != registry_ref: - # This could mean that storage classes differ, so we should - # check for that but use the registry ref for the rest of - # the method. - if registry_ref.is_compatible_with(dataset_ref_or_type): - # Use the registry version from now on. - ref = registry_ref - else: - raise ValueError( - f"The ref given to exists() ({ref}) has the same dataset ID as one " - f"in registry but has different incompatible values ({registry_ref})." 
- ) - else: - try: - ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) - except (LookupError, TypeError, NoDefaultCollectionError): - return existence - existence |= DatasetExistence.RECORDED - - if self._datastore.knows(ref): - existence |= DatasetExistence.DATASTORE - - if full_check: - if self._datastore.exists(ref): - existence |= DatasetExistence._ARTIFACT - elif existence.value != DatasetExistence.UNRECOGNIZED.value: - # Do not add this flag if we have no other idea about a dataset. - existence |= DatasetExistence(DatasetExistence._ASSUMED) - - return existence + raise NotImplementedError() + @abstractmethod def _exists_many( self, refs: Iterable[DatasetRef], @@ -1732,97 +897,9 @@ def _exists_many( Each value evaluates to `True` if the dataset is present and known to both. """ - existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} - - # Registry does not have a bulk API to check for a ref. - for ref in refs: - registry_ref = self._registry.getDataset(ref.id) - if registry_ref is not None: - # It is possible, albeit unlikely, that the given ref does - # not match the one in registry even though the UUID matches. - # When checking a single ref we raise, but it's impolite to - # do that when potentially hundreds of refs are being checked. - # We could change the API to only accept UUIDs and that would - # remove the ability to even check and remove the worry - # about differing storage classes. Given the ongoing discussion - # on refs vs UUIDs and whether to raise or have a new - # private flag, treat this as a private API for now. - existence[ref] |= DatasetExistence.RECORDED - - # Ask datastore if it knows about these refs. - knows = self._datastore.knows_these(refs) - for ref, known in knows.items(): - if known: - existence[ref] |= DatasetExistence.DATASTORE - - if full_check: - mexists = self._datastore.mexists(refs) - for ref, exists in mexists.items(): - if exists: - existence[ref] |= DatasetExistence._ARTIFACT - else: - # Do not set this flag if nothing is known about the dataset. - for ref in existence: - if existence[ref] != DatasetExistence.UNRECOGNIZED: - existence[ref] |= DatasetExistence._ASSUMED - - return existence - - # TODO: remove on DM-40079. - @deprecated( - reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", - version="v26.0", - category=FutureWarning, - ) - def datasetExists( - self, - datasetRefOrType: DatasetRef | DatasetType | str, - dataId: DataId | None = None, - *, - collections: Any = None, - **kwargs: Any, - ) -> bool: - """Return True if the Dataset is actually present in the Datastore. - - Parameters - ---------- - datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` - When `DatasetRef` the `dataId` should be `None`. - Otherwise the `DatasetType` or name thereof. - dataId : `dict` or `DataCoordinate` - A `dict` of `Dimension` link name, value pairs that label the - `DatasetRef` within a Collection. When `None`, a `DatasetRef` - should be provided as the first argument. - collections : Any, optional - Collections to be searched, overriding ``self.collections``. - Can be any of the types supported by the ``collections`` argument - to butler construction. - **kwargs - Additional keyword arguments used to augment or construct a - `DataCoordinate`. See `DataCoordinate.standardize` - parameters. - - Raises - ------ - LookupError - Raised if the dataset is not even present in the Registry. 
- ValueError - Raised if a resolved `DatasetRef` was passed as an input, but it - differs from the one found in the registry. - NoDefaultCollectionError - Raised if no collections were provided. - """ - # A resolved ref may be given that is not known to this butler. - if isinstance(datasetRefOrType, DatasetRef): - ref = self._registry.getDataset(datasetRefOrType.id) - if ref is None: - raise LookupError( - f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." - ) - else: - ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) - return self._datastore.exists(ref) + raise NotImplementedError() + @abstractmethod def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: """Remove one or more `~CollectionType.RUN` collections and the datasets within them. @@ -1844,92 +921,9 @@ def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: Raised if one or more collections are not of type `~CollectionType.RUN`. """ - if not self.isWriteable(): - raise TypeError("Butler is read-only.") - names = list(names) - refs: list[DatasetRef] = [] - for name in names: - collectionType = self._registry.getCollectionType(name) - if collectionType is not CollectionType.RUN: - raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") - refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) - with self._datastore.transaction(), self._registry.transaction(): - if unstore: - self._datastore.trash(refs) - else: - self._datastore.forget(refs) - for name in names: - self._registry.removeCollection(name) - if unstore: - # Point of no return for removing artifacts - self._datastore.emptyTrash() - - def pruneDatasets( - self, - refs: Iterable[DatasetRef], - *, - disassociate: bool = True, - unstore: bool = False, - tags: Iterable[str] = (), - purge: bool = False, - ) -> None: - # docstring inherited from LimitedButler - - if not self.isWriteable(): - raise TypeError("Butler is read-only.") - if purge: - if not disassociate: - raise TypeError("Cannot pass purge=True without disassociate=True.") - if not unstore: - raise TypeError("Cannot pass purge=True without unstore=True.") - elif disassociate: - tags = tuple(tags) - if not tags: - raise TypeError("No tags provided but disassociate=True.") - for tag in tags: - collectionType = self._registry.getCollectionType(tag) - if collectionType is not CollectionType.TAGGED: - raise TypeError( - f"Cannot disassociate from collection '{tag}' " - f"of non-TAGGED type {collectionType.name}." - ) - # Transform possibly-single-pass iterable into something we can iterate - # over multiple times. - refs = list(refs) - # Pruning a component of a DatasetRef makes no sense since registry - # doesn't know about components and datastore might not store - # components in a separate file - for ref in refs: - if ref.datasetType.component(): - raise ValueError(f"Can not prune a component of a dataset (ref={ref})") - # We don't need an unreliable Datastore transaction for this, because - # we've been extra careful to ensure that Datastore.trash only involves - # mutating the Registry (it can _look_ at Datastore-specific things, - # but shouldn't change them), and hence all operations here are - # Registry operations. - with self._datastore.transaction(), self._registry.transaction(): - if unstore: - self._datastore.trash(refs) - if purge: - self._registry.removeDatasets(refs) - elif disassociate: - assert tags, "Guaranteed by earlier logic in this function." 
- for tag in tags: - self._registry.disassociate(tag, refs) - # We've exited the Registry transaction, and apparently committed. - # (if there was an exception, everything rolled back, and it's as if - # nothing happened - and we never get here). - # Datastore artifacts are not yet gone, but they're clearly marked - # as trash, so if we fail to delete now because of (e.g.) filesystem - # problems we can try again later, and if manual administrative - # intervention is required, it's pretty clear what that should entail: - # deleting everything on disk and in private Datastore tables that is - # in the dataset_location_trash table. - if unstore: - # Point of no return for removing artifacts - self._datastore.emptyTrash() - - @transactional + raise NotImplementedError() + + @abstractmethod def ingest( self, *datasets: FileDataset, @@ -1998,122 +992,9 @@ def ingest( filesystem operations as well, but this cannot be implemented rigorously for most datastores. """ - if not self.isWriteable(): - raise TypeError("Butler is read-only.") - - log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") - if not datasets: - return - - if idGenerationMode is not None: - warnings.warn( - "The idGenerationMode parameter is no longer used and is ignored. " - " Will be removed after v26.0", - FutureWarning, - stacklevel=2, - ) + raise NotImplementedError() - progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) - - # We need to reorganize all the inputs so that they are grouped - # by dataset type and run. Multiple refs in a single FileDataset - # are required to share the run and dataset type. - GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] - groupedData: GroupedData = defaultdict(list) - - # Track DataIDs that are being ingested so we can spot issues early - # with duplication. Retain previous FileDataset so we can report it. - groupedDataIds: MutableMapping[ - tuple[DatasetType, str], dict[DataCoordinate, FileDataset] - ] = defaultdict(dict) - - used_run = False - - # And the nested loop that populates it: - for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): - # Somewhere to store pre-existing refs if we have an - # execution butler. - existingRefs: list[DatasetRef] = [] - - for ref in dataset.refs: - assert ref.run is not None # For mypy - group_key = (ref.datasetType, ref.run) - - if ref.dataId in groupedDataIds[group_key]: - raise ConflictingDefinitionError( - f"Ingest conflict. Dataset {dataset.path} has same" - " DataId as other ingest dataset" - f" {groupedDataIds[group_key][ref.dataId].path} " - f" ({ref.dataId})" - ) - - groupedDataIds[group_key][ref.dataId] = dataset - - if existingRefs: - if len(dataset.refs) != len(existingRefs): - # Keeping track of partially pre-existing datasets is hard - # and should generally never happen. For now don't allow - # it. - raise ConflictingDefinitionError( - f"For dataset {dataset.path} some dataIds already exist" - " in registry but others do not. This is not supported." - ) - - # Store expanded form in the original FileDataset. - dataset.refs = existingRefs - else: - groupedData[group_key].append(dataset) - - if not used_run and run is not None: - warnings.warn( - "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " - f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", - category=FutureWarning, - stacklevel=3, # Take into account the @transactional decorator. 
- ) - - # Now we can bulk-insert into Registry for each DatasetType. - for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( - groupedData.items(), desc="Bulk-inserting datasets by type" - ): - refs_to_import = [] - for dataset in grouped_datasets: - refs_to_import.extend(dataset.refs) - - n_refs = len(refs_to_import) - log.verbose( - "Importing %d ref%s of dataset type %r into run %r", - n_refs, - "" if n_refs == 1 else "s", - datasetType.name, - this_run, - ) - - # Import the refs and expand the DataCoordinates since we can't - # guarantee that they are expanded and Datastore will need - # the records. - imported_refs = self._registry._importDatasets(refs_to_import, expand=True) - assert set(imported_refs) == set(refs_to_import) - - # Replace all the refs in the FileDataset with expanded versions. - # Pull them off in the order we put them on the list. - for dataset in grouped_datasets: - n_dataset_refs = len(dataset.refs) - dataset.refs = imported_refs[:n_dataset_refs] - del imported_refs[:n_dataset_refs] - - # Bulk-insert everything into Datastore. - # We do not know if any of the registry entries already existed - # (_importDatasets only complains if they exist but differ) so - # we have to catch IntegrityError explicitly. - try: - self._datastore.ingest( - *datasets, transfer=transfer, record_validation_info=record_validation_info - ) - except IntegrityError as e: - raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e - - @contextlib.contextmanager + @abstractmethod def export( self, *, @@ -2121,7 +1002,7 @@ def export( filename: str | None = None, format: str | None = None, transfer: str | None = None, - ) -> Iterator[RepoExportContext]: + ) -> AbstractContextManager[RepoExportContext]: """Export datasets from the repository represented by this `Butler`. This method is a context manager that returns a helper object @@ -2165,38 +1046,9 @@ def export( # their associated data ID information. export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*")) """ - if directory is None and transfer is not None: - raise TypeError("Cannot transfer without providing a directory.") - if transfer == "move": - raise TypeError("Transfer may not be 'move': export is read-only") - if format is None: - if filename is None: - raise TypeError("At least one of 'filename' or 'format' must be provided.") - else: - _, format = os.path.splitext(filename) - if not format: - raise ValueError("Please specify a file extension to determine export format.") - format = format[1:] # Strip leading "."" - elif filename is None: - filename = f"export.{format}" - if directory is not None: - filename = os.path.join(directory, filename) - formats = self._config["repo_transfer_formats"] - if format not in formats: - raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") - BackendClass = get_class_of(formats[format, "export"]) - with open(filename, "w") as stream: - backend = BackendClass(stream, universe=self.dimensions) - try: - helper = RepoExportContext( - self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer - ) - yield helper - except BaseException: - raise - else: - helper._finish() + raise NotImplementedError() + @abstractmethod def import_( self, *, @@ -2237,66 +1089,9 @@ def import_( Raised if the set of arguments passed is inconsistent, or if the butler is read-only. 
""" - if not self.isWriteable(): - raise TypeError("Butler is read-only.") - if format is None: - if filename is None: - raise TypeError("At least one of 'filename' or 'format' must be provided.") - else: - _, format = os.path.splitext(filename) # type: ignore - elif filename is None: - filename = ResourcePath(f"export.{format}", forceAbsolute=False) - if directory is not None: - directory = ResourcePath(directory, forceDirectory=True) - # mypy doesn't think this will work but it does in python >= 3.10. - if isinstance(filename, ResourcePathExpression): # type: ignore - filename = ResourcePath(filename, forceAbsolute=False) # type: ignore - if not filename.isabs() and directory is not None: - potential = directory.join(filename) - exists_in_cwd = filename.exists() - exists_in_dir = potential.exists() - if exists_in_cwd and exists_in_dir: - log.warning( - "A relative path for filename was specified (%s) which exists relative to cwd. " - "Additionally, the file exists relative to the given search directory (%s). " - "Using the export file in the given directory.", - filename, - potential, - ) - # Given they specified an explicit directory and that - # directory has the export file in it, assume that that - # is what was meant despite the file in cwd. - filename = potential - elif exists_in_dir: - filename = potential - elif not exists_in_cwd and not exists_in_dir: - # Raise early. - raise FileNotFoundError( - f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." - ) - BackendClass: type[RepoImportBackend] = get_class_of( - self._config["repo_transfer_formats"][format]["import"] - ) - - def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: - backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] - backend.register() - with self.transaction(): - backend.load( - self._datastore, - directory=directory, - transfer=transfer, - skip_dimensions=skip_dimensions, - ) - - if isinstance(filename, ResourcePath): - # We can not use open() here at the moment because of - # DM-38589 since yaml does stream.read(8192) in a loop. - stream = io.StringIO(filename.read().decode()) - doImport(stream) - else: - doImport(filename) # type: ignore + raise NotImplementedError() + @abstractmethod def transfer_from( self, source_butler: LimitedButler, @@ -2305,7 +1100,7 @@ def transfer_from( skip_missing: bool = True, register_dataset_types: bool = False, transfer_dimensions: bool = False, - ) -> collections.abc.Collection[DatasetRef]: + ) -> Collection[DatasetRef]: """Transfer datasets to this Butler from a run in another Butler. Parameters @@ -2348,171 +1143,9 @@ def transfer_from( This means that it is possible for a dataset type to be registered even though transfer has failed. """ - if not self.isWriteable(): - raise TypeError("Butler is read-only.") - progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) - - # Will iterate through the refs multiple times so need to convert - # to a list if this isn't a collection. - if not isinstance(source_refs, collections.abc.Collection): - source_refs = list(source_refs) - - original_count = len(source_refs) - log.info("Transferring %d datasets into %s", original_count, str(self)) - - # In some situations the datastore artifact may be missing - # and we do not want that registry entry to be imported. - # Asking datastore is not sufficient, the records may have been - # purged, we have to ask for the (predicted) URI and check - # existence explicitly. 
Execution butler is set up exactly like - # this with no datastore records. - artifact_existence: dict[ResourcePath, bool] = {} - if skip_missing: - dataset_existence = source_butler._datastore.mexists( - source_refs, artifact_existence=artifact_existence - ) - source_refs = [ref for ref, exists in dataset_existence.items() if exists] - filtered_count = len(source_refs) - n_missing = original_count - filtered_count - log.verbose( - "%d dataset%s removed because the artifact does not exist. Now have %d.", - n_missing, - "" if n_missing == 1 else "s", - filtered_count, - ) - - # Importing requires that we group the refs by dataset type and run - # before doing the import. - source_dataset_types = set() - grouped_refs = defaultdict(list) - for ref in source_refs: - grouped_refs[ref.datasetType, ref.run].append(ref) - source_dataset_types.add(ref.datasetType) - - # Check to see if the dataset type in the source butler has - # the same definition in the target butler and register missing - # ones if requested. Registration must happen outside a transaction. - newly_registered_dataset_types = set() - for datasetType in source_dataset_types: - if register_dataset_types: - # Let this raise immediately if inconsistent. Continuing - # on to find additional inconsistent dataset types - # might result in additional unwanted dataset types being - # registered. - if self._registry.registerDatasetType(datasetType): - newly_registered_dataset_types.add(datasetType) - else: - # If the dataset type is missing, let it fail immediately. - target_dataset_type = self._registry.getDatasetType(datasetType.name) - if target_dataset_type != datasetType: - raise ConflictingDefinitionError( - "Source butler dataset type differs from definition" - f" in target butler: {datasetType} !=" - f" {target_dataset_type}" - ) - if newly_registered_dataset_types: - # We may have registered some even if there were inconsistencies - # but should let people know (or else remove them again). - log.log( - VERBOSE, - "Registered the following dataset types in the target Butler: %s", - ", ".join(d.name for d in newly_registered_dataset_types), - ) - else: - log.log(VERBOSE, "All required dataset types are known to the target Butler") - - dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) - if transfer_dimensions: - # Collect all the dimension records for these refs. - # All dimensions are to be copied but the list of valid dimensions - # come from this butler's universe. - elements = frozenset( - element - for element in self.dimensions.getStaticElements() - if element.hasTable() and element.viewOf is None - ) - dataIds = {ref.dataId for ref in source_refs} - # This logic comes from saveDataIds. - for dataId in dataIds: - # Need an expanded record, if not expanded that we need a full - # butler with registry (allow mocks with registry too). - if not dataId.hasRecords(): - if registry := getattr(source_butler, "registry", None): - dataId = registry.expandDataId(dataId) - else: - raise TypeError("Input butler needs to be a full butler to expand DataId.") - # If this butler doesn't know about a dimension in the source - # butler things will break later. - for record in dataId.records.values(): - if record is not None and record.definition in elements: - dimension_records[record.definition].setdefault(record.dataId, record) - - handled_collections: set[str] = set() - - # Do all the importing in a single transaction. 
- with self.transaction(): - if dimension_records: - log.verbose("Ensuring that dimension records exist for transferred datasets.") - for element, r in dimension_records.items(): - records = [r[dataId] for dataId in r] - # Assume that if the record is already present that we can - # use it without having to check that the record metadata - # is consistent. - self._registry.insertDimensionData(element, *records, skip_existing=True) - - n_imported = 0 - for (datasetType, run), refs_to_import in progress.iter_item_chunks( - grouped_refs.items(), desc="Importing to registry by run and dataset type" - ): - if run not in handled_collections: - # May need to create output collection. If source butler - # has a registry, ask for documentation string. - run_doc = None - if registry := getattr(source_butler, "registry", None): - run_doc = registry.getCollectionDocumentation(run) - registered = self._registry.registerRun(run, doc=run_doc) - handled_collections.add(run) - if registered: - log.log(VERBOSE, "Creating output run %s", run) - - n_refs = len(refs_to_import) - log.verbose( - "Importing %d ref%s of dataset type %s into run %s", - n_refs, - "" if n_refs == 1 else "s", - datasetType.name, - run, - ) - - # Assume we are using UUIDs and the source refs will match - # those imported. - imported_refs = self._registry._importDatasets(refs_to_import, expand=False) - assert set(imported_refs) == set(refs_to_import) - n_imported += len(imported_refs) - - assert len(source_refs) == n_imported - log.verbose("Imported %d datasets into destination butler", n_imported) - - # Ask the datastore to transfer. The datastore has to check that - # the source datastore is compatible with the target datastore. - accepted, rejected = self._datastore.transfer_from( - source_butler._datastore, - source_refs, - transfer=transfer, - artifact_existence=artifact_existence, - ) - if rejected: - # For now, accept the registry entries but not the files. - log.warning( - "%d datasets were rejected and %d accepted for dataset type %s in run %r.", - len(rejected), - len(accepted), - datasetType, - run, - ) - - return source_refs + raise NotImplementedError() + @abstractmethod def validateConfiguration( self, logFailures: bool = False, @@ -2543,141 +1176,26 @@ def validateConfiguration( Raised if there is some inconsistency with how this Butler is configured. """ - if datasetTypeNames: - datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] - else: - datasetTypes = list(self._registry.queryDatasetTypes()) - - # filter out anything from the ignore list - if ignore: - ignore = set(ignore) - datasetTypes = [ - e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore - ] - else: - ignore = set() - - # For each datasetType that has an instrument dimension, create - # a DatasetRef for each defined instrument - datasetRefs = [] - - # Find all the registered instruments (if "instrument" is in the - # universe). - if "instrument" in self.dimensions: - instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} - - for datasetType in datasetTypes: - if "instrument" in datasetType.dimensions: - # In order to create a conforming dataset ref, create - # fake DataCoordinate values for the non-instrument - # dimensions. The type of the value does not matter here. 
- dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} - - for instrument in instruments: - datasetRef = DatasetRef( - datasetType, - DataCoordinate.standardize( - dataId, instrument=instrument, graph=datasetType.dimensions - ), - run="validate", - ) - datasetRefs.append(datasetRef) - - entities: list[DatasetType | DatasetRef] = [] - entities.extend(datasetTypes) - entities.extend(datasetRefs) - - datastoreErrorStr = None - try: - self._datastore.validateConfiguration(entities, logFailures=logFailures) - except ValidationError as e: - datastoreErrorStr = str(e) - - # Also check that the LookupKeys used by the datastores match - # registry and storage class definitions - keys = self._datastore.getLookupKeys() - - failedNames = set() - failedDataId = set() - for key in keys: - if key.name is not None: - if key.name in ignore: - continue - - # skip if specific datasetType names were requested and this - # name does not match - if datasetTypeNames and key.name not in datasetTypeNames: - continue - - # See if it is a StorageClass or a DatasetType - if key.name in self.storageClasses: - pass - else: - try: - self._registry.getDatasetType(key.name) - except KeyError: - if logFailures: - log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key) - failedNames.add(key) - else: - # Dimensions are checked for consistency when the Butler - # is created and rendezvoused with a universe. - pass - - # Check that the instrument is a valid instrument - # Currently only support instrument so check for that - if key.dataId: - dataIdKeys = set(key.dataId) - if {"instrument"} != dataIdKeys: - if logFailures: - log.critical("Key '%s' has unsupported DataId override", key) - failedDataId.add(key) - elif key.dataId["instrument"] not in instruments: - if logFailures: - log.critical("Key '%s' has unknown instrument", key) - failedDataId.add(key) - - messages = [] - - if datastoreErrorStr: - messages.append(datastoreErrorStr) - - for failed, msg in ( - (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), - (failedDataId, "Keys with bad DataId entries: "), - ): - if failed: - msg += ", ".join(str(k) for k in failed) - messages.append(msg) - - if messages: - raise ValidationError(";\n".join(messages)) + raise NotImplementedError() @property + @abstractmethod def collections(self) -> Sequence[str]: """The collections to search by default, in order (`~collections.abc.Sequence` [ `str` ]). - - This is an alias for ``self.registry.defaults.collections``. It cannot - be set directly in isolation, but all defaults may be changed together - by assigning a new `RegistryDefaults` instance to - ``self.registry.defaults``. """ - return self._registry.defaults.collections + raise NotImplementedError() @property + @abstractmethod def run(self) -> str | None: """Name of the run this butler writes outputs to by default (`str` or `None`). - - This is an alias for ``self.registry.defaults.run``. It cannot be set - directly in isolation, but all defaults may be changed together by - assigning a new `RegistryDefaults` instance to - ``self.registry.defaults``. """ - return self._registry.defaults.run + raise NotImplementedError() @property + @abstractmethod def registry(self) -> Registry: """The object that manages dataset metadata and relationships (`Registry`). @@ -2686,30 +1204,4 @@ def registry(self) -> Registry: are accessible only via `Registry` methods. Eventually these methods will be replaced by equivalent `Butler` methods. 
""" - return self._registry_shim - - @property - def dimensions(self) -> DimensionUniverse: - # Docstring inherited. - return self._registry.dimensions - - _registry: _ButlerRegistry - """The object that manages dataset metadata and relationships - (`_ButlerRegistry`). - - Most operations that don't involve reading or writing butler datasets are - accessible only via `Registry` methods. - """ - - datastore: Datastore - """The object that manages actual dataset storage (`Datastore`). - - Direct user access to the datastore should rarely be necessary; the primary - exception is the case where a `Datastore` implementation provides extra - functionality beyond what the base class defines. - """ - - storageClasses: StorageClassFactory - """An object that maps known storage class names to objects that fully - describe them (`StorageClassFactory`). - """ + raise NotImplementedError() diff --git a/python/lsst/daf/butler/_quantum_backed.py b/python/lsst/daf/butler/_quantum_backed.py index 5d3edb797d..fd33107e23 100644 --- a/python/lsst/daf/butler/_quantum_backed.py +++ b/python/lsst/daf/butler/_quantum_backed.py @@ -661,7 +661,7 @@ class QuantumProvenanceData(_BaseModelCompat): def collect_and_transfer( butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData] ) -> None: - """Transfer output datasets from multiple quanta to a more permantent + """Transfer output datasets from multiple quanta to a more permanent `Butler` repository. Parameters diff --git a/python/lsst/daf/butler/_registry_shim.py b/python/lsst/daf/butler/_registry_shim.py index 4ba989e829..67f50a16e1 100644 --- a/python/lsst/daf/butler/_registry_shim.py +++ b/python/lsst/daf/butler/_registry_shim.py @@ -54,7 +54,7 @@ from .registry.queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults if TYPE_CHECKING: - from ._butler import Butler + from .direct_butler import DirectButler from .registry._registry import CollectionArgType from .registry.interfaces import ObsCoreTableManager @@ -64,7 +64,7 @@ class RegistryShim(Registry): Parameters ---------- - butler : `Butler` + butler : `DirectButler` Data butler instance. Notes @@ -75,7 +75,7 @@ class RegistryShim(Registry): while we perform re-structuring of Registry and Butler implementations. """ - def __init__(self, butler: Butler): + def __init__(self, butler: DirectButler): self._butler = butler self._registry = butler._registry diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py new file mode 100644 index 0000000000..68619848fd --- /dev/null +++ b/python/lsst/daf/butler/direct_butler.py @@ -0,0 +1,2167 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. 
If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +"""Butler top level classes. +""" +from __future__ import annotations + +__all__ = ( + "DirectButler", + "ButlerValidationError", +) + +import collections.abc +import contextlib +import io +import logging +import numbers +import os +import warnings +from collections import Counter, defaultdict +from collections.abc import Iterable, Iterator, MutableMapping, Sequence +from typing import TYPE_CHECKING, Any, ClassVar, TextIO + +from deprecated.sphinx import deprecated +from lsst.resources import ResourcePath, ResourcePathExpression +from lsst.utils.introspection import get_class_of +from lsst.utils.logging import VERBOSE, getLogger +from sqlalchemy.exc import IntegrityError + +from ._butler import Butler +from ._butler_config import ButlerConfig +from ._config import Config +from ._dataset_existence import DatasetExistence +from ._dataset_ref import DatasetIdGenEnum, DatasetRef +from ._dataset_type import DatasetType +from ._deferredDatasetHandle import DeferredDatasetHandle +from ._exceptions import ValidationError +from ._file_dataset import FileDataset +from ._limited_butler import LimitedButler +from ._registry_shim import RegistryShim +from ._storage_class import StorageClass, StorageClassFactory +from ._timespan import Timespan +from .datastore import DatasetRefURIs, Datastore, NullDatastore +from .dimensions import ( + DataCoordinate, + DataId, + DataIdValue, + Dimension, + DimensionElement, + DimensionRecord, + DimensionUniverse, +) +from .progress import Progress +from .registry import ( + CollectionType, + ConflictingDefinitionError, + DataIdError, + MissingDatasetTypeError, + NoDefaultCollectionError, + Registry, + RegistryDefaults, + _ButlerRegistry, + _RegistryFactory, +) +from .transfers import RepoExportContext +from .utils import transactional + +if TYPE_CHECKING: + from lsst.resources import ResourceHandleProtocol + + from .transfers import RepoImportBackend + +_LOG = getLogger(__name__) + + +class ButlerValidationError(ValidationError): + """There is a problem with the Butler configuration.""" + + pass + + +class DirectButler(Butler): + """Main entry point for the data access system. + + Parameters + ---------- + config : `ButlerConfig`, `Config` or `str`, optional. + Configuration. Anything acceptable to the + `ButlerConfig` constructor. If a directory path + is given the configuration will be read from a ``butler.yaml`` file in + that location. If `None` is given default values will be used. + butler : `DirectButler`, optional. + If provided, construct a new Butler that uses the same registry and + datastore as the given one, but with the given collection and run. + Incompatible with the ``config``, ``searchPaths``, and ``writeable`` + arguments. 
+ collections : `str` or `~collections.abc.Iterable` [ `str` ], optional + An expression specifying the collections to be searched (in order) when + reading datasets. + This may be a `str` collection name or an iterable thereof. + See :ref:`daf_butler_collection_expressions` for more information. + These collections are not registered automatically and must be + manually registered before they are used by any method, but they may be + manually registered after the `Butler` is initialized. + run : `str`, optional + Name of the `~CollectionType.RUN` collection new datasets should be + inserted into. If ``collections`` is `None` and ``run`` is not `None`, + ``collections`` will be set to ``[run]``. If not `None`, this + collection will automatically be registered. If this is not set (and + ``writeable`` is not set either), a read-only butler will be created. + searchPaths : `list` of `str`, optional + Directory paths to search when calculating the full Butler + configuration. Not used if the supplied config is already a + `ButlerConfig`. + writeable : `bool`, optional + Explicitly sets whether the butler supports write operations. If not + provided, a read-write butler is created if any of ``run``, ``tags``, + or ``chains`` is non-empty. + inferDefaults : `bool`, optional + If `True` (default) infer default data ID values from the values + present in the datasets in ``collections``: if all collections have the + same value (or no value) for a governor dimension, that value will be + the default for that dimension. Nonexistent collections are ignored. + If a default value is provided explicitly for a governor dimension via + ``**kwargs``, no default will be inferred for that dimension. + without_datastore : `bool`, optional + If `True` do not attach a datastore to this butler. Any attempts + to use a datastore will fail. + **kwargs : `str` + Default data ID key-value pairs. These may only identify "governor" + dimensions like ``instrument`` and ``skymap``. + """ + + def __init__( + self, + config: Config | ResourcePathExpression | None = None, + *, + butler: DirectButler | None = None, + collections: Any = None, + run: str | None = None, + searchPaths: Sequence[ResourcePathExpression] | None = None, + writeable: bool | None = None, + inferDefaults: bool = True, + without_datastore: bool = False, + **kwargs: str, + ): + defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs) + # Load registry, datastore, etc. from config or existing butler. + if butler is not None: + if config is not None or searchPaths is not None or writeable is not None: + raise TypeError( + "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument." 
+ ) + self._registry = butler._registry.copy(defaults) + self._datastore = butler._datastore + self.storageClasses = butler.storageClasses + self._config: ButlerConfig = butler._config + else: + self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore) + try: + butlerRoot = self._config.get("root", self._config.configDir) + if writeable is None: + writeable = run is not None + self._registry = _RegistryFactory(self._config).from_config( + butlerRoot=butlerRoot, writeable=writeable, defaults=defaults + ) + if without_datastore: + self._datastore = NullDatastore(None, None) + else: + self._datastore = Datastore.fromConfig( + self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot + ) + # TODO: Once datastore drops dependency on registry we can + # construct datastore first and pass opaque tables to registry + # constructor. + self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions()) + self.storageClasses = StorageClassFactory() + self.storageClasses.addFromConfig(self._config) + except Exception: + # Failures here usually mean that configuration is incomplete, + # just issue an error message which includes config file URI. + _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.") + raise + + # For execution butler the datastore needs a special + # dependency-inversion trick. This is not used by regular butler, + # but we do not have a way to distinguish regular butler from execution + # butler. + self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type) + + if "run" in self._config or "collection" in self._config: + raise ValueError("Passing a run or collection via configuration is no longer supported.") + + self._registry_shim = RegistryShim(self) + + GENERATION: ClassVar[int] = 3 + """This is a Generation 3 Butler. + + This attribute may be removed in the future, once the Generation 2 Butler + interface has been fully retired; it should only be used in transitional + code. + """ + + def _retrieve_dataset_type(self, name: str) -> DatasetType | None: + """Return DatasetType defined in registry given dataset type name.""" + try: + return self._registry.getDatasetType(name) + except MissingDatasetTypeError: + return None + + @classmethod + def _unpickle( + cls, + config: ButlerConfig, + collections: tuple[str, ...] | None, + run: str | None, + defaultDataId: dict[str, str], + writeable: bool, + ) -> DirectButler: + """Callable used to unpickle a Butler. + + We prefer not to use ``Butler.__init__`` directly so we can force some + of its many arguments to be keyword-only (note that ``__reduce__`` + can only invoke callables with positional arguments). + + Parameters + ---------- + config : `ButlerConfig` + Butler configuration, already coerced into a true `ButlerConfig` + instance (and hence after any search paths for overrides have been + utilized). + collections : `tuple` [ `str` ] + Names of the default collections to read from. + run : `str`, optional + Name of the default `~CollectionType.RUN` collection to write to. + defaultDataId : `dict` [ `str`, `str` ] + Default data ID values. + writeable : `bool` + Whether the Butler should support write operations. + + Returns + ------- + butler : `Butler` + A new `Butler` instance. + """ + # MyPy doesn't recognize that the kwargs below are totally valid; it + # seems to think '**defaultDataId* is a _positional_ argument! 
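+ # ``config`` is already a full `ButlerConfig` at this point, so the
+ # constructor will not repeat any search-path resolution on unpickle.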
+ return cls( + config=config, + collections=collections, + run=run, + writeable=writeable, + **defaultDataId, # type: ignore + ) + + def __reduce__(self) -> tuple: + """Support pickling.""" + return ( + DirectButler._unpickle, + ( + self._config, + self.collections, + self.run, + self._registry.defaults.dataId.byName(), + self._registry.isWriteable(), + ), + ) + + def __str__(self) -> str: + return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format( + self.collections, self.run, self._datastore, self._registry + ) + + def isWriteable(self) -> bool: + # Docstring inherited. + return self._registry.isWriteable() + + @contextlib.contextmanager + def transaction(self) -> Iterator[None]: + """Context manager supporting `Butler` transactions. + + Transactions can be nested. + """ + with self._registry.transaction(), self._datastore.transaction(): + yield + + def _standardizeArgs( + self, + datasetRefOrType: DatasetRef | DatasetType | str, + dataId: DataId | None = None, + for_put: bool = True, + **kwargs: Any, + ) -> tuple[DatasetType, DataId | None]: + """Standardize the arguments passed to several Butler APIs. + + Parameters + ---------- + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` the `dataId` should be `None`. + Otherwise the `DatasetType` or name thereof. + dataId : `dict` or `DataCoordinate` + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the second argument. + for_put : `bool`, optional + If `True` this call is invoked as part of a `Butler.put()`. + Otherwise it is assumed to be part of a `Butler.get()`. This + parameter is only relevant if there is dataset type + inconsistency. + **kwargs + Additional keyword arguments used to augment or construct a + `DataCoordinate`. See `DataCoordinate.standardize` + parameters. + + Returns + ------- + datasetType : `DatasetType` + A `DatasetType` instance extracted from ``datasetRefOrType``. + dataId : `dict` or `DataId`, optional + Argument that can be used (along with ``kwargs``) to construct a + `DataId`. + + Notes + ----- + Butler APIs that conceptually need a DatasetRef also allow passing a + `DatasetType` (or the name of one) and a `DataId` (or a dict and + keyword arguments that can be used to construct one) separately. This + method accepts those arguments and always returns a true `DatasetType` + and a `DataId` or `dict`. + + Standardization of `dict` vs `DataId` is best handled by passing the + returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are + generally similarly flexible. + """ + externalDatasetType: DatasetType | None = None + internalDatasetType: DatasetType | None = None + if isinstance(datasetRefOrType, DatasetRef): + if dataId is not None or kwargs: + raise ValueError("DatasetRef given, cannot use dataId as well") + externalDatasetType = datasetRefOrType.datasetType + dataId = datasetRefOrType.dataId + else: + # Don't check whether DataId is provided, because Registry APIs + # can usually construct a better error message when it wasn't. 
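+ # At this point the argument is either a `DatasetType` instance, which
+ # can be used as given, or a dataset type name that must be looked up
+ # in the registry.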
+ if isinstance(datasetRefOrType, DatasetType): + externalDatasetType = datasetRefOrType + else: + internalDatasetType = self._registry.getDatasetType(datasetRefOrType) + + # Check that they are self-consistent + if externalDatasetType is not None: + internalDatasetType = self._registry.getDatasetType(externalDatasetType.name) + if externalDatasetType != internalDatasetType: + # We can allow differences if they are compatible, depending + # on whether this is a get or a put. A get requires that + # the python type associated with the datastore can be + # converted to the user type. A put requires that the user + # supplied python type can be converted to the internal + # type expected by registry. + relevantDatasetType = internalDatasetType + if for_put: + is_compatible = internalDatasetType.is_compatible_with(externalDatasetType) + else: + is_compatible = externalDatasetType.is_compatible_with(internalDatasetType) + relevantDatasetType = externalDatasetType + if not is_compatible: + raise ValueError( + f"Supplied dataset type ({externalDatasetType}) inconsistent with " + f"registry definition ({internalDatasetType})" + ) + # Override the internal definition. + internalDatasetType = relevantDatasetType + + assert internalDatasetType is not None + return internalDatasetType, dataId + + def _rewrite_data_id( + self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any + ) -> tuple[DataId | None, dict[str, Any]]: + """Rewrite a data ID taking into account dimension records. + + Take a Data ID and keyword args and rewrite it if necessary to + allow the user to specify dimension records rather than dimension + primary values. + + This allows a user to include a dataId dict with keys of + ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving + the integer exposure ID. It also allows a string to be given + for a dimension value rather than the integer ID if that is more + convenient. For example, rather than having to specifying the + detector with ``detector.full_name``, a string given for ``detector`` + will be interpreted as the full name and converted to the integer + value. + + Keyword arguments can also use strings for dimensions like detector + and exposure but python does not allow them to include ``.`` and + so the ``exposure.day_obs`` syntax can not be used in a keyword + argument. + + Parameters + ---------- + dataId : `dict` or `DataCoordinate` + A `dict` of `Dimension` link name, value pairs that will label the + `DatasetRef` within a Collection. + datasetType : `DatasetType` + The dataset type associated with this dataId. Required to + determine the relevant dimensions. + **kwargs + Additional keyword arguments used to augment or construct a + `DataId`. See `DataId` parameters. + + Returns + ------- + dataId : `dict` or `DataCoordinate` + The, possibly rewritten, dataId. If given a `DataCoordinate` and + no keyword arguments, the original dataId will be returned + unchanged. + **kwargs : `dict` + Any unused keyword arguments (would normally be empty dict). + """ + # Do nothing if we have a standalone DataCoordinate. + if isinstance(dataId, DataCoordinate) and not kwargs: + return dataId, kwargs + + # Process dimension records that are using record information + # rather than ids + newDataId: dict[str, DataIdValue] = {} + byRecord: dict[str, dict[str, Any]] = defaultdict(dict) + + # if all the dataId comes from keyword parameters we do not need + # to do anything here because they can't be of the form + # exposure.obs_id because a "." 
is not allowed in a keyword parameter. + if dataId: + for k, v in dataId.items(): + # If we have a Dimension we do not need to do anything + # because it cannot be a compound key. + if isinstance(k, str) and "." in k: + # Someone is using a more human-readable dataId + dimensionName, record = k.split(".", 1) + byRecord[dimensionName][record] = v + elif isinstance(k, Dimension): + newDataId[k.name] = v + else: + newDataId[k] = v + + # Go through the updated dataId and check the type in case someone is + # using an alternate key. We have already filtered out the compound + # keys dimensions.record format. + not_dimensions = {} + + # Will need to look in the dataId and the keyword arguments + # and will remove them if they need to be fixed or are unrecognized. + for dataIdDict in (newDataId, kwargs): + # Use a list so we can adjust the dict safely in the loop + for dimensionName in list(dataIdDict): + value = dataIdDict[dimensionName] + try: + dimension = self.dimensions.getStaticDimensions()[dimensionName] + except KeyError: + # This is not a real dimension + not_dimensions[dimensionName] = value + del dataIdDict[dimensionName] + continue + + # Convert an integral type to an explicit int to simplify + # comparisons here + if isinstance(value, numbers.Integral): + value = int(value) + + if not isinstance(value, dimension.primaryKey.getPythonType()): + for alternate in dimension.alternateKeys: + if isinstance(value, alternate.getPythonType()): + byRecord[dimensionName][alternate.name] = value + del dataIdDict[dimensionName] + _LOG.debug( + "Converting dimension %s to %s.%s=%s", + dimensionName, + dimensionName, + alternate.name, + value, + ) + break + else: + _LOG.warning( + "Type mismatch found for value '%r' provided for dimension %s. " + "Could not find matching alternative (primary key has type %s) " + "so attempting to use as-is.", + value, + dimensionName, + dimension.primaryKey.getPythonType(), + ) + + # By this point kwargs and newDataId should only include valid + # dimensions. Merge kwargs in to the new dataId and log if there + # are dimensions in both (rather than calling update). + for k, v in kwargs.items(): + if k in newDataId and newDataId[k] != v: + _LOG.debug( + "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v + ) + newDataId[k] = v + # No need to retain any values in kwargs now. + kwargs = {} + + # If we have some unrecognized dimensions we have to try to connect + # them to records in other dimensions. This is made more complicated + # by some dimensions having records with clashing names. A mitigation + # is that we can tell by this point which dimensions are missing + # for the DatasetType but this does not work for calibrations + # where additional dimensions can be used to constrain the temporal + # axis. + if not_dimensions: + # Search for all dimensions even if we have been given a value + # explicitly. In some cases records are given as well as the + # actually dimension and this should not be an error if they + # match. + mandatoryDimensions = datasetType.dimensions.names # - provided + + candidateDimensions: set[str] = set() + candidateDimensions.update(mandatoryDimensions) + + # For calibrations we may well be needing temporal dimensions + # so rather than always including all dimensions in the scan + # restrict things a little. It is still possible for there + # to be confusion over day_obs in visit vs exposure for example. 
+ # If we are not searching calibration collections things may + # fail but they are going to fail anyway because of the + # ambiguousness of the dataId... + if datasetType.isCalibration(): + for dim in self.dimensions.getStaticDimensions(): + if dim.temporal: + candidateDimensions.add(str(dim)) + + # Look up table for the first association with a dimension + guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict) + + # Keep track of whether an item is associated with multiple + # dimensions. + counter: Counter[str] = Counter() + assigned: dict[str, set[str]] = defaultdict(set) + + # Go through the missing dimensions and associate the + # given names with records within those dimensions + matched_dims = set() + for dimensionName in candidateDimensions: + dimension = self.dimensions.getStaticDimensions()[dimensionName] + fields = dimension.metadata.names | dimension.uniqueKeys.names + for field in not_dimensions: + if field in fields: + guessedAssociation[dimensionName][field] = not_dimensions[field] + counter[dimensionName] += 1 + assigned[field].add(dimensionName) + matched_dims.add(field) + + # Calculate the fields that matched nothing. + never_found = set(not_dimensions) - matched_dims + + if never_found: + raise ValueError(f"Unrecognized keyword args given: {never_found}") + + # There is a chance we have allocated a single dataId item + # to multiple dimensions. Need to decide which should be retained. + # For now assume that the most popular alternative wins. + # This means that day_obs with seq_num will result in + # exposure.day_obs and not visit.day_obs + # Also prefer an explicitly missing dimension over an inferred + # temporal dimension. + for fieldName, assignedDimensions in assigned.items(): + if len(assignedDimensions) > 1: + # Pick the most popular (preferring mandatory dimensions) + requiredButMissing = assignedDimensions.intersection(mandatoryDimensions) + if requiredButMissing: + candidateDimensions = requiredButMissing + else: + candidateDimensions = assignedDimensions + + # If this is a choice between visit and exposure and + # neither was a required part of the dataset type, + # (hence in this branch) always prefer exposure over + # visit since exposures are always defined and visits + # are defined from exposures. + if candidateDimensions == {"exposure", "visit"}: + candidateDimensions = {"exposure"} + + # Select the relevant items and get a new restricted + # counter. + theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions} + duplicatesCounter: Counter[str] = Counter() + duplicatesCounter.update(theseCounts) + + # Choose the most common. If they are equally common + # we will pick the one that was found first. + # Returns a list of tuples + selected = duplicatesCounter.most_common(1)[0][0] + + _LOG.debug( + "Ambiguous dataId entry '%s' associated with multiple dimensions: %s." 
+ " Removed ambiguity by choosing dimension %s.", + fieldName, + ", ".join(assignedDimensions), + selected, + ) + + for candidateDimension in assignedDimensions: + if candidateDimension != selected: + del guessedAssociation[candidateDimension][fieldName] + + # Update the record look up dict with the new associations + for dimensionName, values in guessedAssociation.items(): + if values: # A dict might now be empty + _LOG.debug( + "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values + ) + byRecord[dimensionName].update(values) + + if byRecord: + # Some record specifiers were found so we need to convert + # them to the Id form + for dimensionName, values in byRecord.items(): + if dimensionName in newDataId: + _LOG.debug( + "DataId specified explicit %s dimension value of %s in addition to" + " general record specifiers for it of %s. Ignoring record information.", + dimensionName, + newDataId[dimensionName], + str(values), + ) + # Get the actual record and compare with these values. + try: + recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId)) + except DataIdError: + raise ValueError( + f"Could not find dimension '{dimensionName}'" + f" with dataId {newDataId} as part of comparing with" + f" record values {byRecord[dimensionName]}" + ) from None + if len(recs) == 1: + errmsg: list[str] = [] + for k, v in values.items(): + if (recval := getattr(recs[0], k)) != v: + errmsg.append(f"{k}({recval} != {v})") + if errmsg: + raise ValueError( + f"Dimension {dimensionName} in dataId has explicit value" + " inconsistent with records: " + ", ".join(errmsg) + ) + else: + # Multiple matches for an explicit dimension + # should never happen but let downstream complain. + pass + continue + + # Build up a WHERE expression + bind = dict(values.items()) + where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind) + + # Hopefully we get a single record that matches + records = set( + self._registry.queryDimensionRecords( + dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs + ) + ) + + if len(records) != 1: + if len(records) > 1: + # visit can have an ambiguous answer without involving + # visit_system. The default visit_system is defined + # by the instrument. + if ( + dimensionName == "visit" + and "visit_system_membership" in self.dimensions + and "visit_system" in self.dimensions["instrument"].metadata + ): + instrument_records = list( + self._registry.queryDimensionRecords( + "instrument", + dataId=newDataId, + **kwargs, + ) + ) + if len(instrument_records) == 1: + visit_system = instrument_records[0].visit_system + if visit_system is None: + # Set to a value that will never match. + visit_system = -1 + + # Look up each visit in the + # visit_system_membership records. + for rec in records: + membership = list( + self._registry.queryDimensionRecords( + # Use bind to allow zero results. + # This is a fully-specified query. + "visit_system_membership", + where="instrument = inst AND visit_system = system AND visit = v", + bind=dict( + inst=instrument_records[0].name, system=visit_system, v=rec.id + ), + ) + ) + if membership: + # This record is the right answer. + records = {rec} + break + + # The ambiguity may have been resolved so check again. 
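+ # Anything that is still ambiguous after the visit_system check is
+ # reported to the caller rather than guessed at.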
+ if len(records) > 1: + _LOG.debug( + "Received %d records from constraints of %s", len(records), str(values) + ) + for r in records: + _LOG.debug("- %s", str(r)) + raise ValueError( + f"DataId specification for dimension {dimensionName} is not" + f" uniquely constrained to a single dataset by {values}." + f" Got {len(records)} results." + ) + else: + raise ValueError( + f"DataId specification for dimension {dimensionName} matched no" + f" records when constrained by {values}" + ) + + # Get the primary key from the real dimension object + dimension = self.dimensions.getStaticDimensions()[dimensionName] + if not isinstance(dimension, Dimension): + raise RuntimeError( + f"{dimension.name} is not a true dimension, and cannot be used in data IDs." + ) + newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name) + + return newDataId, kwargs + + def _findDatasetRef( + self, + datasetRefOrType: DatasetRef | DatasetType | str, + dataId: DataId | None = None, + *, + collections: Any = None, + predict: bool = False, + run: str | None = None, + datastore_records: bool = False, + **kwargs: Any, + ) -> DatasetRef: + """Shared logic for methods that start with a search for a dataset in + the registry. + + Parameters + ---------- + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` the `dataId` should be `None`. + Otherwise the `DatasetType` or name thereof. + dataId : `dict` or `DataCoordinate`, optional + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the first argument. + collections : Any, optional + Collections to be searched, overriding ``self.collections``. + Can be any of the types supported by the ``collections`` argument + to butler construction. + predict : `bool`, optional + If `True`, return a newly created `DatasetRef` with a unique + dataset ID if finding a reference in the `Registry` fails. + Defaults to `False`. + run : `str`, optional + Run collection name to use for creating `DatasetRef` for predicted + datasets. Only used if ``predict`` is `True`. + datastore_records : `bool`, optional + If `True` add datastore records to returned `DatasetRef`. + **kwargs + Additional keyword arguments used to augment or construct a + `DataId`. See `DataId` parameters. + + Returns + ------- + ref : `DatasetRef` + A reference to the dataset identified by the given arguments. + This can be the same dataset reference as given if it was + resolved. + + Raises + ------ + LookupError + Raised if no matching dataset exists in the `Registry` (and + ``predict`` is `False`). + ValueError + Raised if a resolved `DatasetRef` was passed as an input, but it + differs from the one found in the registry. + TypeError + Raised if no collections were provided. + """ + datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs) + if isinstance(datasetRefOrType, DatasetRef): + if collections is not None: + warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3) + # May need to retrieve datastore records if requested. 
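+ # A resolved ref bypasses the registry search below; only the optional
+ # datastore records need to be attached before it is returned.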
+ if datastore_records and datasetRefOrType._datastore_records is None: + datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType) + return datasetRefOrType + timespan: Timespan | None = None + + dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) + + if datasetType.isCalibration(): + # Because this is a calibration dataset, first try to + # standardize the data ID without restricting the dimensions to + # those of the dataset type requested, because there may be extra + # dimensions that provide temporal information for a validity-range + # lookup. + dataId = DataCoordinate.standardize( + dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs + ) + if dataId.graph.temporal: + dataId = self._registry.expandDataId(dataId) + timespan = dataId.timespan + else: + # Standardize the data ID to just the dimensions of the dataset + # type instead of letting registry.findDataset do it, so we get the + # result even if no dataset is found. + dataId = DataCoordinate.standardize( + dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs + ) + # Always look up the DatasetRef, even if one is given, to ensure it is + # present in the current collection. + ref = self._registry.findDataset( + datasetType, + dataId, + collections=collections, + timespan=timespan, + datastore_records=datastore_records, + ) + if ref is None: + if predict: + if run is None: + run = self.run + if run is None: + raise TypeError("Cannot predict dataset ID/location with run=None.") + return DatasetRef(datasetType, dataId, run=run) + else: + if collections is None: + collections = self._registry.defaults.collections + raise LookupError( + f"Dataset {datasetType.name} with data ID {dataId} " + f"could not be found in collections {collections}." + ) + if datasetType != ref.datasetType: + # If they differ it is because the user explicitly specified + # a compatible dataset type to this call rather than using the + # registry definition. The DatasetRef must therefore be recreated + # using the user definition such that the expected type is + # returned. + ref = DatasetRef( + datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records + ) + + return ref + + # TODO: remove on DM-40067. + @transactional + @deprecated( + reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef." + " Please use Butler.put(). Be aware that you may need to adjust your usage if you" + " were relying on the run parameter to determine the run." + " Will be removed after v26.0.", + version="v26.0", + category=FutureWarning, + ) + def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef: + # Docstring inherited. + return self.put(obj, ref) + + @transactional + def put( + self, + obj: Any, + datasetRefOrType: DatasetRef | DatasetType | str, + /, + dataId: DataId | None = None, + *, + run: str | None = None, + **kwargs: Any, + ) -> DatasetRef: + """Store and register a dataset. + + Parameters + ---------- + obj : `object` + The dataset. + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` is provided, ``dataId`` should be `None`. + Otherwise the `DatasetType` or name thereof. If a fully resolved + `DatasetRef` is given the run and ID are used directly. + dataId : `dict` or `DataCoordinate` + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the second argument.
+ run : `str`, optional + The name of the run the dataset should be added to, overriding + ``self.run``. Not used if a resolved `DatasetRef` is provided. + **kwargs + Additional keyword arguments used to augment or construct a + `DataCoordinate`. See `DataCoordinate.standardize` + parameters. Not used if a resolved `DatasetRef` is provided. + + Returns + ------- + ref : `DatasetRef` + A reference to the stored dataset, updated with the correct id if + given. + + Raises + ------ + TypeError + Raised if the butler is read-only or if no run has been provided. + """ + if isinstance(datasetRefOrType, DatasetRef): + # This is a direct put of predefined DatasetRef. + _LOG.debug("Butler put direct: %s", datasetRefOrType) + if run is not None: + warnings.warn("Run collection is not used for DatasetRef", stacklevel=3) + # If registry already has a dataset with the same dataset ID, + # dataset type and DataId, then _importDatasets will do nothing and + # just return an original ref. We have to raise in this case, there + # is a datastore check below for that. + self._registry._importDatasets([datasetRefOrType], expand=True) + # Before trying to write to the datastore check that it does not + # know this dataset. This is prone to races, of course. + if self._datastore.knows(datasetRefOrType): + raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}") + # Try to write dataset to the datastore; if it fails due to a race + # with another write, the content of stored data may be + # unpredictable. + try: + self._datastore.put(obj, datasetRefOrType) + except IntegrityError as e: + raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e + return datasetRefOrType + + _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run) + if not self.isWriteable(): + raise TypeError("Butler is read-only.") + datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs) + + # Handle dimension records in dataId + dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs) + + # Add Registry Dataset entry. + dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs) + (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId]) + self._datastore.put(obj, ref) + + return ref + + # TODO: remove on DM-40067. + @deprecated( + reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef." + " Please use Butler.get(). Will be removed after v26.0.", + version="v26.0", + category=FutureWarning, + ) + def getDirect( + self, + ref: DatasetRef, + *, + parameters: dict[str, Any] | None = None, + storageClass: StorageClass | str | None = None, + ) -> Any: + """Retrieve a stored dataset. + + Parameters + ---------- + ref : `DatasetRef` + Resolved reference to an already stored dataset. + parameters : `dict` + Additional StorageClass-defined options to control reading, + typically used to efficiently read only a subset of the dataset. + storageClass : `StorageClass` or `str`, optional + The storage class to be used to override the Python type + returned by this method. By default the returned type matches + the dataset type definition for this dataset. Specifying a + read `StorageClass` can force a different type to be returned. + This type must be compatible with the original type. + + Returns + ------- + obj : `object` + The dataset. + """ + return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) + + # TODO: remove on DM-40067.
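+ # As with getDirect above, this method is kept only as a deprecated
+ # alias; getDeferred() now accepts a resolved DatasetRef directly.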
+ @deprecated( + reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. " + "Please use Butler.getDeferred(). Will be removed after v26.0.", + version="v26.0", + category=FutureWarning, + ) + def getDirectDeferred( + self, + ref: DatasetRef, + *, + parameters: dict[str, Any] | None = None, + storageClass: str | StorageClass | None = None, + ) -> DeferredDatasetHandle: + """Create a `DeferredDatasetHandle` which can later retrieve a dataset, + from a resolved `DatasetRef`. + + Parameters + ---------- + ref : `DatasetRef` + Resolved reference to an already stored dataset. + parameters : `dict` + Additional StorageClass-defined options to control reading, + typically used to efficiently read only a subset of the dataset. + storageClass : `StorageClass` or `str`, optional + The storage class to be used to override the Python type + returned by this method. By default the returned type matches + the dataset type definition for this dataset. Specifying a + read `StorageClass` can force a different type to be returned. + This type must be compatible with the original type. + + Returns + ------- + obj : `DeferredDatasetHandle` + A handle which can be used to retrieve a dataset at a later time. + + Raises + ------ + LookupError + Raised if no matching dataset exists in the `Registry`. + """ + # Check that dataset is known to the datastore. + if not self._datastore.knows(ref): + raise LookupError(f"Dataset reference {ref} is not known to datastore.") + return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) + + def getDeferred( + self, + datasetRefOrType: DatasetRef | DatasetType | str, + /, + dataId: DataId | None = None, + *, + parameters: dict | None = None, + collections: Any = None, + storageClass: str | StorageClass | None = None, + **kwargs: Any, + ) -> DeferredDatasetHandle: + """Create a `DeferredDatasetHandle` which can later retrieve a dataset, + after an immediate registry lookup. + + Parameters + ---------- + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` the `dataId` should be `None`. + Otherwise the `DatasetType` or name thereof. + dataId : `dict` or `DataCoordinate`, optional + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the first argument. + parameters : `dict` + Additional StorageClass-defined options to control reading, + typically used to efficiently read only a subset of the dataset. + collections : Any, optional + Collections to be searched, overriding ``self.collections``. + Can be any of the types supported by the ``collections`` argument + to butler construction. + storageClass : `StorageClass` or `str`, optional + The storage class to be used to override the Python type + returned by this method. By default the returned type matches + the dataset type definition for this dataset. Specifying a + read `StorageClass` can force a different type to be returned. + This type must be compatible with the original type. + **kwargs + Additional keyword arguments used to augment or construct a + `DataId`. See `DataId` parameters. + + Returns + ------- + obj : `DeferredDatasetHandle` + A handle which can be used to retrieve a dataset at a later time. + + Raises + ------ + LookupError + Raised if no matching dataset exists in the `Registry` or + datastore. 
+ ValueError + Raised if a resolved `DatasetRef` was passed as an input, but it + differs from the one found in the registry. + TypeError + Raised if no collections were provided. + """ + if isinstance(datasetRefOrType, DatasetRef): + # Do the quick check first and if that fails, check for artifact + # existence. This is necessary for datastores that are configured + # in trust mode where there won't be a record but there will be + # a file. + if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType): + ref = datasetRefOrType + else: + raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.") + else: + ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) + return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass) + + def get( + self, + datasetRefOrType: DatasetRef | DatasetType | str, + /, + dataId: DataId | None = None, + *, + parameters: dict[str, Any] | None = None, + collections: Any = None, + storageClass: StorageClass | str | None = None, + **kwargs: Any, + ) -> Any: + """Retrieve a stored dataset. + + Parameters + ---------- + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` the `dataId` should be `None`. + Otherwise the `DatasetType` or name thereof. + If a resolved `DatasetRef`, the associated dataset + is returned directly without additional querying. + dataId : `dict` or `DataCoordinate` + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the first argument. + parameters : `dict` + Additional StorageClass-defined options to control reading, + typically used to efficiently read only a subset of the dataset. + collections : Any, optional + Collections to be searched, overriding ``self.collections``. + Can be any of the types supported by the ``collections`` argument + to butler construction. + storageClass : `StorageClass` or `str`, optional + The storage class to be used to override the Python type + returned by this method. By default the returned type matches + the dataset type definition for this dataset. Specifying a + read `StorageClass` can force a different type to be returned. + This type must be compatible with the original type. + **kwargs + Additional keyword arguments used to augment or construct a + `DataCoordinate`. See `DataCoordinate.standardize` + parameters. + + Returns + ------- + obj : `object` + The dataset. + + Raises + ------ + LookupError + Raised if no matching dataset exists in the `Registry`. + TypeError + Raised if no collections were provided. + + Notes + ----- + When looking up datasets in a `~CollectionType.CALIBRATION` collection, + this method requires that the given data ID include temporal dimensions + beyond the dimensions of the dataset type itself, in order to find the + dataset with the appropriate validity range. For example, a "bias" + dataset with native dimensions ``{instrument, detector}`` could be + fetched with a ``{instrument, detector, exposure}`` data ID, because + ``exposure`` is a temporal dimension. 
+ """ + _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters) + ref = self._findDatasetRef( + datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs + ) + return self._datastore.get(ref, parameters=parameters, storageClass=storageClass) + + def getURIs( + self, + datasetRefOrType: DatasetRef | DatasetType | str, + /, + dataId: DataId | None = None, + *, + predict: bool = False, + collections: Any = None, + run: str | None = None, + **kwargs: Any, + ) -> DatasetRefURIs: + """Return the URIs associated with the dataset. + + Parameters + ---------- + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` the `dataId` should be `None`. + Otherwise the `DatasetType` or name thereof. + dataId : `dict` or `DataCoordinate` + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the first argument. + predict : `bool` + If `True`, allow URIs to be returned of datasets that have not + been written. + collections : Any, optional + Collections to be searched, overriding ``self.collections``. + Can be any of the types supported by the ``collections`` argument + to butler construction. + run : `str`, optional + Run to use for predictions, overriding ``self.run``. + **kwargs + Additional keyword arguments used to augment or construct a + `DataCoordinate`. See `DataCoordinate.standardize` + parameters. + + Returns + ------- + uris : `DatasetRefURIs` + The URI to the primary artifact associated with this dataset (if + the dataset was disassembled within the datastore this may be + `None`), and the URIs to any components associated with the dataset + artifact. (can be empty if there are no components). + """ + ref = self._findDatasetRef( + datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs + ) + return self._datastore.getURIs(ref, predict) + + def getURI( + self, + datasetRefOrType: DatasetRef | DatasetType | str, + /, + dataId: DataId | None = None, + *, + predict: bool = False, + collections: Any = None, + run: str | None = None, + **kwargs: Any, + ) -> ResourcePath: + """Return the URI to the Dataset. + + Parameters + ---------- + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` the `dataId` should be `None`. + Otherwise the `DatasetType` or name thereof. + dataId : `dict` or `DataCoordinate` + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the first argument. + predict : `bool` + If `True`, allow URIs to be returned of datasets that have not + been written. + collections : Any, optional + Collections to be searched, overriding ``self.collections``. + Can be any of the types supported by the ``collections`` argument + to butler construction. + run : `str`, optional + Run to use for predictions, overriding ``self.run``. + **kwargs + Additional keyword arguments used to augment or construct a + `DataCoordinate`. See `DataCoordinate.standardize` + parameters. + + Returns + ------- + uri : `lsst.resources.ResourcePath` + URI pointing to the Dataset within the datastore. If the + Dataset does not exist in the datastore, and if ``predict`` is + `True`, the URI will be a prediction and will include a URI + fragment "#predicted". + If the datastore does not have entities that relate well + to the concept of a URI the returned URI string will be + descriptive. 
The returned URI is not guaranteed to be obtainable. + + Raises + ------ + LookupError + A URI has been requested for a dataset that does not exist and + guessing is not allowed. + ValueError + Raised if a resolved `DatasetRef` was passed as an input, but it + differs from the one found in the registry. + TypeError + Raised if no collections were provided. + RuntimeError + Raised if a URI is requested for a dataset that consists of + multiple artifacts. + """ + primary, components = self.getURIs( + datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs + ) + + if primary is None or components: + raise RuntimeError( + f"Dataset ({datasetRefOrType}) includes distinct URIs for components. " + "Use Butler.getURIs() instead." + ) + return primary + + def retrieveArtifacts( + self, + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + transfer: str = "auto", + preserve_path: bool = True, + overwrite: bool = False, + ) -> list[ResourcePath]: + # Docstring inherited. + return self._datastore.retrieveArtifacts( + refs, + ResourcePath(destination), + transfer=transfer, + preserve_path=preserve_path, + overwrite=overwrite, + ) + + def exists( + self, + dataset_ref_or_type: DatasetRef | DatasetType | str, + /, + data_id: DataId | None = None, + *, + full_check: bool = True, + collections: Any = None, + **kwargs: Any, + ) -> DatasetExistence: + # Docstring inherited. + existence = DatasetExistence.UNRECOGNIZED + + if isinstance(dataset_ref_or_type, DatasetRef): + if collections is not None: + warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2) + if data_id is not None: + warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2) + ref = dataset_ref_or_type + registry_ref = self._registry.getDataset(dataset_ref_or_type.id) + if registry_ref is not None: + existence |= DatasetExistence.RECORDED + + if dataset_ref_or_type != registry_ref: + # This could mean that storage classes differ, so we should + # check for that but use the registry ref for the rest of + # the method. + if registry_ref.is_compatible_with(dataset_ref_or_type): + # Use the registry version from now on. + ref = registry_ref + else: + raise ValueError( + f"The ref given to exists() ({ref}) has the same dataset ID as one " + f"in registry but has different incompatible values ({registry_ref})." + ) + else: + try: + ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs) + except (LookupError, TypeError, NoDefaultCollectionError): + return existence + existence |= DatasetExistence.RECORDED + + if self._datastore.knows(ref): + existence |= DatasetExistence.DATASTORE + + if full_check: + if self._datastore.exists(ref): + existence |= DatasetExistence._ARTIFACT + elif existence.value != DatasetExistence.UNRECOGNIZED.value: + # Do not add this flag if we have no other idea about a dataset. + existence |= DatasetExistence(DatasetExistence._ASSUMED) + + return existence + + def _exists_many( + self, + refs: Iterable[DatasetRef], + /, + *, + full_check: bool = True, + ) -> dict[DatasetRef, DatasetExistence]: + # Docstring inherited. + existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs} + + # Registry does not have a bulk API to check for a ref. + for ref in refs: + registry_ref = self._registry.getDataset(ref.id) + if registry_ref is not None: + # It is possible, albeit unlikely, that the given ref does + # not match the one in registry even though the UUID matches. 
+ # When checking a single ref we raise, but it's impolite to + # do that when potentially hundreds of refs are being checked. + # We could change the API to only accept UUIDs and that would + # remove the ability to even check and remove the worry + # about differing storage classes. Given the ongoing discussion + # on refs vs UUIDs and whether to raise or have a new + # private flag, treat this as a private API for now. + existence[ref] |= DatasetExistence.RECORDED + + # Ask datastore if it knows about these refs. + knows = self._datastore.knows_these(refs) + for ref, known in knows.items(): + if known: + existence[ref] |= DatasetExistence.DATASTORE + + if full_check: + mexists = self._datastore.mexists(refs) + for ref, exists in mexists.items(): + if exists: + existence[ref] |= DatasetExistence._ARTIFACT + else: + # Do not set this flag if nothing is known about the dataset. + for ref in existence: + if existence[ref] != DatasetExistence.UNRECOGNIZED: + existence[ref] |= DatasetExistence._ASSUMED + + return existence + + # TODO: remove on DM-40079. + @deprecated( + reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.", + version="v26.0", + category=FutureWarning, + ) + def datasetExists( + self, + datasetRefOrType: DatasetRef | DatasetType | str, + dataId: DataId | None = None, + *, + collections: Any = None, + **kwargs: Any, + ) -> bool: + """Return True if the Dataset is actually present in the Datastore. + + Parameters + ---------- + datasetRefOrType : `DatasetRef`, `DatasetType`, or `str` + When `DatasetRef` the `dataId` should be `None`. + Otherwise the `DatasetType` or name thereof. + dataId : `dict` or `DataCoordinate` + A `dict` of `Dimension` link name, value pairs that label the + `DatasetRef` within a Collection. When `None`, a `DatasetRef` + should be provided as the first argument. + collections : Any, optional + Collections to be searched, overriding ``self.collections``. + Can be any of the types supported by the ``collections`` argument + to butler construction. + **kwargs + Additional keyword arguments used to augment or construct a + `DataCoordinate`. See `DataCoordinate.standardize` + parameters. + + Raises + ------ + LookupError + Raised if the dataset is not even present in the Registry. + ValueError + Raised if a resolved `DatasetRef` was passed as an input, but it + differs from the one found in the registry. + NoDefaultCollectionError + Raised if no collections were provided. + """ + # A resolved ref may be given that is not known to this butler. + if isinstance(datasetRefOrType, DatasetRef): + ref = self._registry.getDataset(datasetRefOrType.id) + if ref is None: + raise LookupError( + f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry." + ) + else: + ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs) + return self._datastore.exists(ref) + + def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: + # Docstring inherited. 
+ if not self.isWriteable(): + raise TypeError("Butler is read-only.") + names = list(names) + refs: list[DatasetRef] = [] + for name in names: + collectionType = self._registry.getCollectionType(name) + if collectionType is not CollectionType.RUN: + raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.") + refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True)) + with self._datastore.transaction(), self._registry.transaction(): + if unstore: + self._datastore.trash(refs) + else: + self._datastore.forget(refs) + for name in names: + self._registry.removeCollection(name) + if unstore: + # Point of no return for removing artifacts + self._datastore.emptyTrash() + + def pruneDatasets( + self, + refs: Iterable[DatasetRef], + *, + disassociate: bool = True, + unstore: bool = False, + tags: Iterable[str] = (), + purge: bool = False, + ) -> None: + # docstring inherited from LimitedButler + + if not self.isWriteable(): + raise TypeError("Butler is read-only.") + if purge: + if not disassociate: + raise TypeError("Cannot pass purge=True without disassociate=True.") + if not unstore: + raise TypeError("Cannot pass purge=True without unstore=True.") + elif disassociate: + tags = tuple(tags) + if not tags: + raise TypeError("No tags provided but disassociate=True.") + for tag in tags: + collectionType = self._registry.getCollectionType(tag) + if collectionType is not CollectionType.TAGGED: + raise TypeError( + f"Cannot disassociate from collection '{tag}' " + f"of non-TAGGED type {collectionType.name}." + ) + # Transform possibly-single-pass iterable into something we can iterate + # over multiple times. + refs = list(refs) + # Pruning a component of a DatasetRef makes no sense since registry + # doesn't know about components and datastore might not store + # components in a separate file + for ref in refs: + if ref.datasetType.component(): + raise ValueError(f"Can not prune a component of a dataset (ref={ref})") + # We don't need an unreliable Datastore transaction for this, because + # we've been extra careful to ensure that Datastore.trash only involves + # mutating the Registry (it can _look_ at Datastore-specific things, + # but shouldn't change them), and hence all operations here are + # Registry operations. + with self._datastore.transaction(), self._registry.transaction(): + if unstore: + self._datastore.trash(refs) + if purge: + self._registry.removeDatasets(refs) + elif disassociate: + assert tags, "Guaranteed by earlier logic in this function." + for tag in tags: + self._registry.disassociate(tag, refs) + # We've exited the Registry transaction, and apparently committed. + # (if there was an exception, everything rolled back, and it's as if + # nothing happened - and we never get here). + # Datastore artifacts are not yet gone, but they're clearly marked + # as trash, so if we fail to delete now because of (e.g.) filesystem + # problems we can try again later, and if manual administrative + # intervention is required, it's pretty clear what that should entail: + # deleting everything on disk and in private Datastore tables that is + # in the dataset_location_trash table. + if unstore: + # Point of no return for removing artifacts + self._datastore.emptyTrash() + + @transactional + def ingest( + self, + *datasets: FileDataset, + transfer: str | None = "auto", + run: str | None = None, + idGenerationMode: DatasetIdGenEnum | None = None, + record_validation_info: bool = True, + ) -> None: + # Docstring inherited. 
+ if not self.isWriteable(): + raise TypeError("Butler is read-only.") + + _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") + if not datasets: + return + + if idGenerationMode is not None: + warnings.warn( + "The idGenerationMode parameter is no longer used and is ignored. " + " Will be removed after v26.0", + FutureWarning, + stacklevel=2, + ) + + progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG) + + # We need to reorganize all the inputs so that they are grouped + # by dataset type and run. Multiple refs in a single FileDataset + # are required to share the run and dataset type. + GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]] + groupedData: GroupedData = defaultdict(list) + + # Track DataIDs that are being ingested so we can spot issues early + # with duplication. Retain previous FileDataset so we can report it. + groupedDataIds: MutableMapping[ + tuple[DatasetType, str], dict[DataCoordinate, FileDataset] + ] = defaultdict(dict) + + used_run = False + + # And the nested loop that populates it: + for dataset in progress.wrap(datasets, desc="Grouping by dataset type"): + # Somewhere to store pre-existing refs if we have an + # execution butler. + existingRefs: list[DatasetRef] = [] + + for ref in dataset.refs: + assert ref.run is not None # For mypy + group_key = (ref.datasetType, ref.run) + + if ref.dataId in groupedDataIds[group_key]: + raise ConflictingDefinitionError( + f"Ingest conflict. Dataset {dataset.path} has same" + " DataId as other ingest dataset" + f" {groupedDataIds[group_key][ref.dataId].path} " + f" ({ref.dataId})" + ) + + groupedDataIds[group_key][ref.dataId] = dataset + + if existingRefs: + if len(dataset.refs) != len(existingRefs): + # Keeping track of partially pre-existing datasets is hard + # and should generally never happen. For now don't allow + # it. + raise ConflictingDefinitionError( + f"For dataset {dataset.path} some dataIds already exist" + " in registry but others do not. This is not supported." + ) + + # Store expanded form in the original FileDataset. + dataset.refs = existingRefs + else: + groupedData[group_key].append(dataset) + + if not used_run and run is not None: + warnings.warn( + "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the " + f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.", + category=FutureWarning, + stacklevel=3, # Take into account the @transactional decorator. + ) + + # Now we can bulk-insert into Registry for each DatasetType. + for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks( + groupedData.items(), desc="Bulk-inserting datasets by type" + ): + refs_to_import = [] + for dataset in grouped_datasets: + refs_to_import.extend(dataset.refs) + + n_refs = len(refs_to_import) + _LOG.verbose( + "Importing %d ref%s of dataset type %r into run %r", + n_refs, + "" if n_refs == 1 else "s", + datasetType.name, + this_run, + ) + + # Import the refs and expand the DataCoordinates since we can't + # guarantee that they are expanded and Datastore will need + # the records. + imported_refs = self._registry._importDatasets(refs_to_import, expand=True) + assert set(imported_refs) == set(refs_to_import) + + # Replace all the refs in the FileDataset with expanded versions. + # Pull them off in the order we put them on the list. 
+ for dataset in grouped_datasets: + n_dataset_refs = len(dataset.refs) + dataset.refs = imported_refs[:n_dataset_refs] + del imported_refs[:n_dataset_refs] + + # Bulk-insert everything into Datastore. + # We do not know if any of the registry entries already existed + # (_importDatasets only complains if they exist but differ) so + # we have to catch IntegrityError explicitly. + try: + self._datastore.ingest( + *datasets, transfer=transfer, record_validation_info=record_validation_info + ) + except IntegrityError as e: + raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e + + @contextlib.contextmanager + def export( + self, + *, + directory: str | None = None, + filename: str | None = None, + format: str | None = None, + transfer: str | None = None, + ) -> Iterator[RepoExportContext]: + # Docstring inherited. + if directory is None and transfer is not None: + raise TypeError("Cannot transfer without providing a directory.") + if transfer == "move": + raise TypeError("Transfer may not be 'move': export is read-only") + if format is None: + if filename is None: + raise TypeError("At least one of 'filename' or 'format' must be provided.") + else: + _, format = os.path.splitext(filename) + if not format: + raise ValueError("Please specify a file extension to determine export format.") + format = format[1:] # Strip leading "."" + elif filename is None: + filename = f"export.{format}" + if directory is not None: + filename = os.path.join(directory, filename) + formats = self._config["repo_transfer_formats"] + if format not in formats: + raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}") + BackendClass = get_class_of(formats[format, "export"]) + with open(filename, "w") as stream: + backend = BackendClass(stream, universe=self.dimensions) + try: + helper = RepoExportContext( + self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer + ) + yield helper + except BaseException: + raise + else: + helper._finish() + + def import_( + self, + *, + directory: ResourcePathExpression | None = None, + filename: ResourcePathExpression | TextIO | None = None, + format: str | None = None, + transfer: str | None = None, + skip_dimensions: set | None = None, + ) -> None: + # Docstring inherited. + if not self.isWriteable(): + raise TypeError("Butler is read-only.") + if format is None: + if filename is None: + raise TypeError("At least one of 'filename' or 'format' must be provided.") + else: + _, format = os.path.splitext(filename) # type: ignore + elif filename is None: + filename = ResourcePath(f"export.{format}", forceAbsolute=False) + if directory is not None: + directory = ResourcePath(directory, forceDirectory=True) + # mypy doesn't think this will work but it does in python >= 3.10. + if isinstance(filename, ResourcePathExpression): # type: ignore + filename = ResourcePath(filename, forceAbsolute=False) # type: ignore + if not filename.isabs() and directory is not None: + potential = directory.join(filename) + exists_in_cwd = filename.exists() + exists_in_dir = potential.exists() + if exists_in_cwd and exists_in_dir: + _LOG.warning( + "A relative path for filename was specified (%s) which exists relative to cwd. " + "Additionally, the file exists relative to the given search directory (%s). 
" + "Using the export file in the given directory.", + filename, + potential, + ) + # Given they specified an explicit directory and that + # directory has the export file in it, assume that that + # is what was meant despite the file in cwd. + filename = potential + elif exists_in_dir: + filename = potential + elif not exists_in_cwd and not exists_in_dir: + # Raise early. + raise FileNotFoundError( + f"Export file could not be found in {filename.abspath()} or {potential.abspath()}." + ) + BackendClass: type[RepoImportBackend] = get_class_of( + self._config["repo_transfer_formats"][format]["import"] + ) + + def doImport(importStream: TextIO | ResourceHandleProtocol) -> None: + backend = BackendClass(importStream, self._registry) # type: ignore[call-arg] + backend.register() + with self.transaction(): + backend.load( + self._datastore, + directory=directory, + transfer=transfer, + skip_dimensions=skip_dimensions, + ) + + if isinstance(filename, ResourcePath): + # We can not use open() here at the moment because of + # DM-38589 since yaml does stream.read(8192) in a loop. + stream = io.StringIO(filename.read().decode()) + doImport(stream) + else: + doImport(filename) # type: ignore + + def transfer_from( + self, + source_butler: LimitedButler, + source_refs: Iterable[DatasetRef], + transfer: str = "auto", + skip_missing: bool = True, + register_dataset_types: bool = False, + transfer_dimensions: bool = False, + ) -> collections.abc.Collection[DatasetRef]: + # Docstring inherited. + if not self.isWriteable(): + raise TypeError("Butler is read-only.") + progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE) + + # Will iterate through the refs multiple times so need to convert + # to a list if this isn't a collection. + if not isinstance(source_refs, collections.abc.Collection): + source_refs = list(source_refs) + + original_count = len(source_refs) + _LOG.info("Transferring %d datasets into %s", original_count, str(self)) + + # In some situations the datastore artifact may be missing + # and we do not want that registry entry to be imported. + # Asking datastore is not sufficient, the records may have been + # purged, we have to ask for the (predicted) URI and check + # existence explicitly. Execution butler is set up exactly like + # this with no datastore records. + artifact_existence: dict[ResourcePath, bool] = {} + if skip_missing: + dataset_existence = source_butler._datastore.mexists( + source_refs, artifact_existence=artifact_existence + ) + source_refs = [ref for ref, exists in dataset_existence.items() if exists] + filtered_count = len(source_refs) + n_missing = original_count - filtered_count + _LOG.verbose( + "%d dataset%s removed because the artifact does not exist. Now have %d.", + n_missing, + "" if n_missing == 1 else "s", + filtered_count, + ) + + # Importing requires that we group the refs by dataset type and run + # before doing the import. + source_dataset_types = set() + grouped_refs = defaultdict(list) + for ref in source_refs: + grouped_refs[ref.datasetType, ref.run].append(ref) + source_dataset_types.add(ref.datasetType) + + # Check to see if the dataset type in the source butler has + # the same definition in the target butler and register missing + # ones if requested. Registration must happen outside a transaction. + newly_registered_dataset_types = set() + for datasetType in source_dataset_types: + if register_dataset_types: + # Let this raise immediately if inconsistent. 
Continuing + # on to find additional inconsistent dataset types + # might result in additional unwanted dataset types being + # registered. + if self._registry.registerDatasetType(datasetType): + newly_registered_dataset_types.add(datasetType) + else: + # If the dataset type is missing, let it fail immediately. + target_dataset_type = self._registry.getDatasetType(datasetType.name) + if target_dataset_type != datasetType: + raise ConflictingDefinitionError( + "Source butler dataset type differs from definition" + f" in target butler: {datasetType} !=" + f" {target_dataset_type}" + ) + if newly_registered_dataset_types: + # We may have registered some even if there were inconsistencies + # but should let people know (or else remove them again). + _LOG.verbose( + "Registered the following dataset types in the target Butler: %s", + ", ".join(d.name for d in newly_registered_dataset_types), + ) + else: + _LOG.verbose("All required dataset types are known to the target Butler") + + dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict) + if transfer_dimensions: + # Collect all the dimension records for these refs. + # All dimensions are to be copied but the list of valid dimensions + # come from this butler's universe. + elements = frozenset( + element + for element in self.dimensions.getStaticElements() + if element.hasTable() and element.viewOf is None + ) + dataIds = {ref.dataId for ref in source_refs} + # This logic comes from saveDataIds. + for dataId in dataIds: + # Need an expanded record, if not expanded that we need a full + # butler with registry (allow mocks with registry too). + if not dataId.hasRecords(): + if registry := getattr(source_butler, "registry", None): + dataId = registry.expandDataId(dataId) + else: + raise TypeError("Input butler needs to be a full butler to expand DataId.") + # If this butler doesn't know about a dimension in the source + # butler things will break later. + for record in dataId.records.values(): + if record is not None and record.definition in elements: + dimension_records[record.definition].setdefault(record.dataId, record) + + handled_collections: set[str] = set() + + # Do all the importing in a single transaction. + with self.transaction(): + if dimension_records: + _LOG.verbose("Ensuring that dimension records exist for transferred datasets.") + for element, r in dimension_records.items(): + records = [r[dataId] for dataId in r] + # Assume that if the record is already present that we can + # use it without having to check that the record metadata + # is consistent. + self._registry.insertDimensionData(element, *records, skip_existing=True) + + n_imported = 0 + for (datasetType, run), refs_to_import in progress.iter_item_chunks( + grouped_refs.items(), desc="Importing to registry by run and dataset type" + ): + if run not in handled_collections: + # May need to create output collection. If source butler + # has a registry, ask for documentation string. + run_doc = None + if registry := getattr(source_butler, "registry", None): + run_doc = registry.getCollectionDocumentation(run) + registered = self._registry.registerRun(run, doc=run_doc) + handled_collections.add(run) + if registered: + _LOG.verbose("Creating output run %s", run) + + n_refs = len(refs_to_import) + _LOG.verbose( + "Importing %d ref%s of dataset type %s into run %s", + n_refs, + "" if n_refs == 1 else "s", + datasetType.name, + run, + ) + + # Assume we are using UUIDs and the source refs will match + # those imported. 
+ imported_refs = self._registry._importDatasets(refs_to_import, expand=False) + assert set(imported_refs) == set(refs_to_import) + n_imported += len(imported_refs) + + assert len(source_refs) == n_imported + _LOG.verbose("Imported %d datasets into destination butler", n_imported) + + # Ask the datastore to transfer. The datastore has to check that + # the source datastore is compatible with the target datastore. + accepted, rejected = self._datastore.transfer_from( + source_butler._datastore, + source_refs, + transfer=transfer, + artifact_existence=artifact_existence, + ) + if rejected: + # For now, accept the registry entries but not the files. + _LOG.warning( + "%d datasets were rejected and %d accepted for dataset type %s in run %r.", + len(rejected), + len(accepted), + datasetType, + run, + ) + + return source_refs + + def validateConfiguration( + self, + logFailures: bool = False, + datasetTypeNames: Iterable[str] | None = None, + ignore: Iterable[str] | None = None, + ) -> None: + # Docstring inherited. + if datasetTypeNames: + datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] + else: + datasetTypes = list(self._registry.queryDatasetTypes()) + + # filter out anything from the ignore list + if ignore: + ignore = set(ignore) + datasetTypes = [ + e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore + ] + else: + ignore = set() + + # For each datasetType that has an instrument dimension, create + # a DatasetRef for each defined instrument + datasetRefs = [] + + # Find all the registered instruments (if "instrument" is in the + # universe). + if "instrument" in self.dimensions: + instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")} + + for datasetType in datasetTypes: + if "instrument" in datasetType.dimensions: + # In order to create a conforming dataset ref, create + # fake DataCoordinate values for the non-instrument + # dimensions. The type of the value does not matter here. + dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"} + + for instrument in instruments: + datasetRef = DatasetRef( + datasetType, + DataCoordinate.standardize( + dataId, instrument=instrument, graph=datasetType.dimensions + ), + run="validate", + ) + datasetRefs.append(datasetRef) + + entities: list[DatasetType | DatasetRef] = [] + entities.extend(datasetTypes) + entities.extend(datasetRefs) + + datastoreErrorStr = None + try: + self._datastore.validateConfiguration(entities, logFailures=logFailures) + except ValidationError as e: + datastoreErrorStr = str(e) + + # Also check that the LookupKeys used by the datastores match + # registry and storage class definitions + keys = self._datastore.getLookupKeys() + + failedNames = set() + failedDataId = set() + for key in keys: + if key.name is not None: + if key.name in ignore: + continue + + # skip if specific datasetType names were requested and this + # name does not match + if datasetTypeNames and key.name not in datasetTypeNames: + continue + + # See if it is a StorageClass or a DatasetType + if key.name in self.storageClasses: + pass + else: + try: + self._registry.getDatasetType(key.name) + except KeyError: + if logFailures: + _LOG.critical( + "Key '%s' does not correspond to a DatasetType or StorageClass", key + ) + failedNames.add(key) + else: + # Dimensions are checked for consistency when the Butler + # is created and rendezvoused with a universe. 
+ pass + + # Check that the instrument is a valid instrument + # Currently only support instrument so check for that + if key.dataId: + dataIdKeys = set(key.dataId) + if {"instrument"} != dataIdKeys: + if logFailures: + _LOG.critical("Key '%s' has unsupported DataId override", key) + failedDataId.add(key) + elif key.dataId["instrument"] not in instruments: + if logFailures: + _LOG.critical("Key '%s' has unknown instrument", key) + failedDataId.add(key) + + messages = [] + + if datastoreErrorStr: + messages.append(datastoreErrorStr) + + for failed, msg in ( + (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "), + (failedDataId, "Keys with bad DataId entries: "), + ): + if failed: + msg += ", ".join(str(k) for k in failed) + messages.append(msg) + + if messages: + raise ValidationError(";\n".join(messages)) + + @property + def collections(self) -> Sequence[str]: + """The collections to search by default, in order + (`~collections.abc.Sequence` [ `str` ]). + + This is an alias for ``self.registry.defaults.collections``. It cannot + be set directly in isolation, but all defaults may be changed together + by assigning a new `RegistryDefaults` instance to + ``self.registry.defaults``. + """ + return self._registry.defaults.collections + + @property + def run(self) -> str | None: + """Name of the run this butler writes outputs to by default (`str` or + `None`). + + This is an alias for ``self.registry.defaults.run``. It cannot be set + directly in isolation, but all defaults may be changed together by + assigning a new `RegistryDefaults` instance to + ``self.registry.defaults``. + """ + return self._registry.defaults.run + + @property + def registry(self) -> Registry: + """The object that manages dataset metadata and relationships + (`Registry`). + + Many operations that don't involve reading or writing butler datasets + are accessible only via `Registry` methods. Eventually these methods + will be replaced by equivalent `Butler` methods. + """ + return self._registry_shim + + @property + def dimensions(self) -> DimensionUniverse: + # Docstring inherited. + return self._registry.dimensions + + _registry: _ButlerRegistry + """The object that manages dataset metadata and relationships + (`_ButlerRegistry`). + + Most operations that don't involve reading or writing butler datasets are + accessible only via `Registry` methods. + """ + + datastore: Datastore + """The object that manages actual dataset storage (`Datastore`). + + Direct user access to the datastore should rarely be necessary; the primary + exception is the case where a `Datastore` implementation provides extra + functionality beyond what the base class defines. + """ + + storageClasses: StorageClassFactory + """An object that maps known storage class names to objects that fully + describe them (`StorageClassFactory`). 
+ """ diff --git a/python/lsst/daf/butler/script/_associate.py b/python/lsst/daf/butler/script/_associate.py index ef6ceb878d..5e1943b981 100644 --- a/python/lsst/daf/butler/script/_associate.py +++ b/python/lsst/daf/butler/script/_associate.py @@ -42,7 +42,7 @@ def associate( find_first: bool, ) -> None: """Add existing datasets to a CHAINED collection.""" - butler = Butler(repo, writeable=True) + butler = Butler.from_config(repo, writeable=True) butler.registry.registerCollection(collection, CollectionType.TAGGED) diff --git a/python/lsst/daf/butler/script/_pruneDatasets.py b/python/lsst/daf/butler/script/_pruneDatasets.py index 9b1c318af2..17f27d9bc7 100644 --- a/python/lsst/daf/butler/script/_pruneDatasets.py +++ b/python/lsst/daf/butler/script/_pruneDatasets.py @@ -218,7 +218,7 @@ def pruneDatasets( if not collections: return PruneDatasetsResult(state=PruneDatasetsResult.State.ERR_NO_COLLECTION_RESTRICTION) - butler = Butler(repo) + butler = Butler.from_config(repo) # If purging, verify that the collection to purge is RUN type collection. if purge_run: @@ -253,7 +253,7 @@ def pruneDatasets( return result def doPruneDatasets() -> PruneDatasetsResult: - butler = Butler(repo, writeable=True) + butler = Butler.from_config(repo, writeable=True) butler.pruneDatasets( refs=datasets_found.getDatasets(), disassociate=disassociate, diff --git a/python/lsst/daf/butler/script/butlerImport.py b/python/lsst/daf/butler/script/butlerImport.py index 37aba0662c..a4af72b85a 100644 --- a/python/lsst/daf/butler/script/butlerImport.py +++ b/python/lsst/daf/butler/script/butlerImport.py @@ -59,7 +59,7 @@ def butlerImport( skip_dimensions : `list`, or `None` Dimensions that should be skipped. """ - butler = Butler(repo, writeable=True) + butler = Butler.from_config(repo, writeable=True) if skip_dimensions is not None: skip_dimensions = set(skip_dimensions) diff --git a/python/lsst/daf/butler/script/certifyCalibrations.py b/python/lsst/daf/butler/script/certifyCalibrations.py index 6f99f0fc06..42bdb53458 100644 --- a/python/lsst/daf/butler/script/certifyCalibrations.py +++ b/python/lsst/daf/butler/script/certifyCalibrations.py @@ -69,7 +69,7 @@ def certifyCalibrations( Search all children of the inputCollection if it is a CHAINED collection, instead of just the most recent one. """ - butler = Butler(repo, writeable=True, without_datastore=True) + butler = Butler.from_config(repo, writeable=True, without_datastore=True) registry = butler.registry timespan = Timespan( begin=astropy.time.Time(begin_date, scale="tai") if begin_date is not None else None, diff --git a/python/lsst/daf/butler/script/collectionChain.py b/python/lsst/daf/butler/script/collectionChain.py index ba6d53ecd5..888baede11 100644 --- a/python/lsst/daf/butler/script/collectionChain.py +++ b/python/lsst/daf/butler/script/collectionChain.py @@ -71,7 +71,7 @@ def collectionChain( chain : `tuple` of `str` The collections in the chain following this command. """ - butler = Butler(repo, writeable=True, without_datastore=True) + butler = Butler.from_config(repo, writeable=True, without_datastore=True) # Every mode needs children except pop. if not children and mode != "pop": diff --git a/python/lsst/daf/butler/script/configValidate.py b/python/lsst/daf/butler/script/configValidate.py index 83b6c0b921..2f71319302 100644 --- a/python/lsst/daf/butler/script/configValidate.py +++ b/python/lsst/daf/butler/script/configValidate.py @@ -52,7 +52,7 @@ def configValidate(repo: str, quiet: bool, dataset_type: list[str], ignore: list error. 
""" logFailures = not quiet - butler = Butler(config=repo) + butler = Butler.from_config(config=repo) is_good = True try: butler.validateConfiguration(logFailures=logFailures, datasetTypeNames=dataset_type, ignore=ignore) diff --git a/python/lsst/daf/butler/script/exportCalibs.py b/python/lsst/daf/butler/script/exportCalibs.py index ae28e8b35d..1406f0a132 100644 --- a/python/lsst/daf/butler/script/exportCalibs.py +++ b/python/lsst/daf/butler/script/exportCalibs.py @@ -122,7 +122,7 @@ def exportCalibs( RuntimeError : Raised if the output directory already exists. """ - butler = Butler(repo, writeable=False) + butler = Butler.from_config(repo, writeable=False) dataset_type_query = dataset_type or ... collections_query = collections or ... diff --git a/python/lsst/daf/butler/script/ingest_files.py b/python/lsst/daf/butler/script/ingest_files.py index df51f25da6..e4e645229b 100644 --- a/python/lsst/daf/butler/script/ingest_files.py +++ b/python/lsst/daf/butler/script/ingest_files.py @@ -105,7 +105,7 @@ def ingest_files( id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode] # Create the butler with the relevant run attached. - butler = Butler(repo, run=run) + butler = Butler.from_config(repo, run=run) datasetType = butler.registry.getDatasetType(dataset_type) diff --git a/python/lsst/daf/butler/script/queryCollections.py b/python/lsst/daf/butler/script/queryCollections.py index a0977d1d97..4358d23c7d 100644 --- a/python/lsst/daf/butler/script/queryCollections.py +++ b/python/lsst/daf/butler/script/queryCollections.py @@ -68,7 +68,7 @@ def _getTable( names=("Name", typeCol, descriptionCol), dtype=(str, str, str), ) - butler = Butler(repo) + butler = Butler.from_config(repo) names = sorted( butler.registry.queryCollections(collectionTypes=frozenset(collection_type), expression=glob or ...) ) @@ -140,7 +140,7 @@ def _getTree( names=("Name", "Type"), dtype=(str, str), ) - butler = Butler(repo, without_datastore=True) + butler = Butler.from_config(repo, without_datastore=True) def addCollection(name: str, level: int = 0) -> None: collectionType = butler.registry.getCollectionType(name) @@ -168,7 +168,7 @@ def _getFlatten( glob: Iterable[str], collection_type: Iterable[CollectionType], ) -> Table: - butler = Butler(repo) + butler = Butler.from_config(repo) collectionNames = list( butler.registry.queryCollections( collectionTypes=frozenset(collection_type), flattenChains=True, expression=glob or ... diff --git a/python/lsst/daf/butler/script/queryDataIds.py b/python/lsst/daf/butler/script/queryDataIds.py index cb70f114d3..415d2652d8 100644 --- a/python/lsst/daf/butler/script/queryDataIds.py +++ b/python/lsst/daf/butler/script/queryDataIds.py @@ -34,8 +34,9 @@ import numpy as np from astropy.table import Table as AstropyTable -from .._butler import Butler, DataCoordinate +from .._butler import Butler from ..cli.utils import sortAstropyTable +from ..dimensions import DataCoordinate if TYPE_CHECKING: from lsst.daf.butler import DimensionGraph @@ -109,7 +110,7 @@ def queryDataIds( Docstring for supported parameters is the same as `~lsst.daf.butler.Registry.queryDataIds`. """ - butler = Butler(repo, without_datastore=True) + butler = Butler.from_config(repo, without_datastore=True) if datasets and collections and not dimensions: # Determine the dimensions relevant to all given dataset types. 
diff --git a/python/lsst/daf/butler/script/queryDatasetTypes.py b/python/lsst/daf/butler/script/queryDatasetTypes.py index 4c1eafd5e2..efe9aeaeb0 100644 --- a/python/lsst/daf/butler/script/queryDatasetTypes.py +++ b/python/lsst/daf/butler/script/queryDatasetTypes.py @@ -61,7 +61,7 @@ def queryDatasetTypes(repo: str, verbose: bool, glob: Iterable[str], components: A dict whose key is "datasetTypes" and whose value is a list of collection names. """ - butler = Butler(repo, without_datastore=True) + butler = Butler.from_config(repo, without_datastore=True) expression = glob or ... datasetTypes = butler.registry.queryDatasetTypes(components=components, expression=expression) if verbose: diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py index e6b17a79ca..4a7cac38f3 100644 --- a/python/lsst/daf/butler/script/queryDatasets.py +++ b/python/lsst/daf/butler/script/queryDatasets.py @@ -175,7 +175,7 @@ def __init__( raise RuntimeError("One of repo and butler must be provided and the other must be None.") # show_uri requires a datastore. without_datastore = not show_uri - self.butler = butler or Butler(repo, without_datastore=without_datastore) + self.butler = butler or Butler.from_config(repo, without_datastore=without_datastore) self._getDatasets(glob, collections, where, find_first) self.showUri = show_uri diff --git a/python/lsst/daf/butler/script/queryDimensionRecords.py b/python/lsst/daf/butler/script/queryDimensionRecords.py index 8f26af86be..88197cf2bf 100644 --- a/python/lsst/daf/butler/script/queryDimensionRecords.py +++ b/python/lsst/daf/butler/script/queryDimensionRecords.py @@ -54,7 +54,7 @@ def queryDimensionRecords( `~lsst.daf.butler.Registry.queryDimensionRecords` except for ``no_check``, which is the inverse of ``check``. """ - butler = Butler(repo, without_datastore=True) + butler = Butler.from_config(repo, without_datastore=True) query_collections: Iterable[str] | EllipsisType | None = None if datasets: diff --git a/python/lsst/daf/butler/script/register_dataset_type.py b/python/lsst/daf/butler/script/register_dataset_type.py index 4de6f31a6d..f46fda8817 100644 --- a/python/lsst/daf/butler/script/register_dataset_type.py +++ b/python/lsst/daf/butler/script/register_dataset_type.py @@ -69,7 +69,7 @@ def register_dataset_type( be created by this command. They are always derived from the composite dataset type. """ - butler = Butler(repo, writeable=True, without_datastore=True) + butler = Butler.from_config(repo, writeable=True, without_datastore=True) composite, component = DatasetType.splitDatasetTypeName(dataset_type) if component: diff --git a/python/lsst/daf/butler/script/removeCollections.py b/python/lsst/daf/butler/script/removeCollections.py index e0ee80e21d..8dc49015ed 100644 --- a/python/lsst/daf/butler/script/removeCollections.py +++ b/python/lsst/daf/butler/script/removeCollections.py @@ -82,7 +82,7 @@ def _getCollectionInfo( collectionInfo : `CollectionInfo` Contains tables with run and non-run collection info. 
""" - butler = Butler(repo, without_datastore=True) + butler = Butler.from_config(repo, without_datastore=True) try: names = sorted( butler.registry.queryCollections( @@ -135,7 +135,7 @@ def removeCollections( def doRemove(collections: Table) -> None: """Perform the prune collection step.""" - butler = Butler(repo, writeable=True, without_datastore=True) + butler = Butler.from_config(repo, writeable=True, without_datastore=True) for name in collections["Collection"]: butler.registry.removeCollection(name) diff --git a/python/lsst/daf/butler/script/removeDatasetType.py b/python/lsst/daf/butler/script/removeDatasetType.py index 3279a6cc6e..4fe9e020b3 100644 --- a/python/lsst/daf/butler/script/removeDatasetType.py +++ b/python/lsst/daf/butler/script/removeDatasetType.py @@ -43,5 +43,5 @@ def removeDatasetType(repo: str, dataset_type_name: tuple[str, ...]) -> None: datasetTypeName : `str` The name of the dataset type to be removed. """ - butler = Butler(repo, writeable=True, without_datastore=True) + butler = Butler.from_config(repo, writeable=True, without_datastore=True) butler.registry.removeDatasetType(dataset_type_name) diff --git a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py index 8259f9984e..1186e53b05 100644 --- a/python/lsst/daf/butler/script/removeRuns.py +++ b/python/lsst/daf/butler/script/removeRuns.py @@ -85,7 +85,7 @@ def _getCollectionInfo( datasets : `dict` [`str`, `int`] The dataset types and and how many will be removed. """ - butler = Butler(repo) + butler = Butler.from_config(repo) try: collectionNames = list( butler.registry.queryCollections( @@ -132,7 +132,7 @@ def removeRuns( def doRemove(runs: Sequence[RemoveRun]) -> None: """Perform the remove step.""" - butler = Butler(repo, writeable=True) + butler = Butler.from_config(repo, writeable=True) with butler.transaction(): for run in runs: for parent in run.parents: diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py index 10edf446ac..01a4d4a11f 100644 --- a/python/lsst/daf/butler/script/retrieveArtifacts.py +++ b/python/lsst/daf/butler/script/retrieveArtifacts.py @@ -86,7 +86,7 @@ def retrieveArtifacts( query_types = dataset_type or ... query_collections: tuple[str, ...] | EllipsisType = collections or ... - butler = Butler(repo, writeable=False) + butler = Butler.from_config(repo, writeable=False) # Need to store in list so we can count the number to give some feedback # to caller. diff --git a/python/lsst/daf/butler/script/transferDatasets.py b/python/lsst/daf/butler/script/transferDatasets.py index c63835e109..845f37b87d 100644 --- a/python/lsst/daf/butler/script/transferDatasets.py +++ b/python/lsst/daf/butler/script/transferDatasets.py @@ -74,8 +74,8 @@ def transferDatasets( datasets. It can be more efficient to disable this if it is known that all dimensions exist. """ - source_butler = Butler(source, writeable=False) - dest_butler = Butler(dest, writeable=True) + source_butler = Butler.from_config(source, writeable=False) + dest_butler = Butler.from_config(dest, writeable=True) dataset_type_expr = dataset_type or ... collections_expr: tuple[str, ...] | EllipsisType = collections or ... 
diff --git a/python/lsst/daf/butler/server.py b/python/lsst/daf/butler/server.py index 7ee3a387f5..1839838954 100644 --- a/python/lsst/daf/butler/server.py +++ b/python/lsst/daf/butler/server.py @@ -84,21 +84,21 @@ def _generate_next_value_(name, start, count, last_values) -> str: # type: igno def _make_global_butler() -> None: global GLOBAL_READONLY_BUTLER, GLOBAL_READWRITE_BUTLER if GLOBAL_READONLY_BUTLER is None: - GLOBAL_READONLY_BUTLER = Butler(BUTLER_ROOT, writeable=False) + GLOBAL_READONLY_BUTLER = Butler.from_config(BUTLER_ROOT, writeable=False) if GLOBAL_READWRITE_BUTLER is None: - GLOBAL_READWRITE_BUTLER = Butler(BUTLER_ROOT, writeable=True) + GLOBAL_READWRITE_BUTLER = Butler.from_config(BUTLER_ROOT, writeable=True) def butler_readonly_dependency() -> Butler: """Return global read-only butler.""" _make_global_butler() - return Butler(butler=GLOBAL_READONLY_BUTLER) + return Butler.from_config(butler=GLOBAL_READONLY_BUTLER) def butler_readwrite_dependency() -> Butler: """Return read-write butler.""" _make_global_butler() - return Butler(butler=GLOBAL_READWRITE_BUTLER) + return Butler.from_config(butler=GLOBAL_READWRITE_BUTLER) def unpack_dataId(butler: Butler, data_id: SerializedDataCoordinate | None) -> DataCoordinate | None: diff --git a/python/lsst/daf/butler/tests/_testRepo.py b/python/lsst/daf/butler/tests/_testRepo.py index af121db9e6..eba08df974 100644 --- a/python/lsst/daf/butler/tests/_testRepo.py +++ b/python/lsst/daf/butler/tests/_testRepo.py @@ -116,7 +116,7 @@ def makeTestRepo( # not be ignored. # newConfig guards against location-related keywords like outfile newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs) - butler = Butler(newConfig, writeable=True) + butler = Butler.from_config(newConfig, writeable=True) dimensionRecords = _makeRecords(dataIds, butler.dimensions) for dimension, records in dimensionRecords.items(): if butler.dimensions[dimension].viewOf is None: @@ -154,7 +154,7 @@ def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler: # Speed matters more than cryptographic guarantees uniqueId = str(random.randrange(1_000_000_000)) collection = "test_" + uniqueId - return Butler(butler=repo, run=collection) + return Butler.from_config(butler=repo, run=collection) def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]: diff --git a/python/lsst/daf/butler/tests/utils.py b/python/lsst/daf/butler/tests/utils.py index 802498a0d0..fe1ccc7965 100644 --- a/python/lsst/daf/butler/tests/utils.py +++ b/python/lsst/daf/butler/tests/utils.py @@ -243,7 +243,7 @@ def __init__(self, root: str, configFile: str) -> None: # tag when looking up datasets. 
run = "ingest/run" tag = "ingest" - self.butler = Butler(butlerConfigFile, run=run, collections=[tag]) + self.butler = Butler.from_config(butlerConfigFile, run=run, collections=[tag]) self.butler.registry.registerCollection(tag, CollectionType.TAGGED) # Create and register a DatasetType diff --git a/tests/test_butler.py b/tests/test_butler.py index 9d185221ce..0dda39a51f 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -87,6 +87,7 @@ def mock_s3(*args: Any, **kwargs: Any) -> Any: # type: ignore[no-untyped-def] from lsst.daf.butler.datastore import NullDatastore from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError from lsst.daf.butler.datastores.fileDatastore import FileDatastore +from lsst.daf.butler.direct_butler import DirectButler from lsst.daf.butler.registries.sql import SqlRegistry from lsst.daf.butler.registry import ( CollectionError, @@ -210,8 +211,9 @@ def tearDown(self) -> None: def create_butler( self, run: str, storageClass: StorageClass | str, datasetTypeName: str - ) -> tuple[Butler, DatasetType]: - butler = Butler(self.tmpConfigFile, run=run) + ) -> tuple[DirectButler, DatasetType]: + butler = Butler.from_config(self.tmpConfigFile, run=run) + assert isinstance(butler, DirectButler), "Expect DirectButler in configuration" collections = set(butler.registry.queryCollections()) self.assertEqual(collections, {run}) @@ -258,7 +260,7 @@ def create_butler( ) return butler, datasetType - def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler: + def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> DirectButler: # New datasets will be added to run and tag, but we will only look in # tag when looking up datasets. run = self.default_run @@ -512,7 +514,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> But def testDeferredCollectionPassing(self) -> None: # Construct a butler with no run or collection, but make it writeable. - butler = Butler(self.tmpConfigFile, writeable=True) + butler = Butler.from_config(self.tmpConfigFile, writeable=True) # Create and register a DatasetType dimensions = butler.dimensions.extract(["instrument", "visit"]) datasetType = self.addDatasetType( @@ -576,17 +578,17 @@ def setUp(self) -> None: def testConstructor(self) -> None: """Independent test of constructor.""" - butler = Butler(self.tmpConfigFile, run=self.default_run) + butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) self.assertIsInstance(butler, Butler) # Check that butler.yaml is added automatically. if self.tmpConfigFile.endswith(end := "/butler.yaml"): config_dir = self.tmpConfigFile[: -len(end)] - butler = Butler(config_dir, run=self.default_run) + butler = Butler.from_config(config_dir, run=self.default_run) self.assertIsInstance(butler, Butler) # Even with a ResourcePath. - butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run) + butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run) self.assertIsInstance(butler, Butler) collections = set(butler.registry.queryCollections()) @@ -594,11 +596,11 @@ def testConstructor(self) -> None: # Check that some special characters can be included in run name. 
special_run = "u@b.c-A" - butler_special = Butler(butler=butler, run=special_run) + butler_special = Butler.from_config(butler=butler, run=special_run) collections = set(butler_special.registry.queryCollections("*@*")) self.assertEqual(collections, {special_run}) - butler2 = Butler(butler=butler, collections=["other"]) + butler2 = Butler.from_config(butler=butler, collections=["other"]) self.assertEqual(butler2.collections, ("other",)) self.assertIsNone(butler2.run) self.assertIs(butler._datastore, butler2._datastore) @@ -619,17 +621,17 @@ def testConstructor(self) -> None: uri = Butler.get_repo_uri("bad_label") self.assertEqual(uri, ResourcePath(bad_label)) uri = Butler.get_repo_uri("label") - butler = Butler(uri, writeable=False) + butler = Butler.from_config(uri, writeable=False) self.assertIsInstance(butler, Butler) - butler = Butler("label", writeable=False) + butler = Butler.from_config("label", writeable=False) self.assertIsInstance(butler, Butler) with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"): - Butler("not_there", writeable=False) + Butler.from_config("not_there", writeable=False) with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"): - Butler("bad_label") + Butler.from_config("bad_label") with self.assertRaises(FileNotFoundError): # Should ignore aliases. - Butler(ResourcePath("label", forceAbsolute=False)) + Butler.from_config(ResourcePath("label", forceAbsolute=False)) with self.assertRaises(KeyError) as cm: Butler.get_repo_uri("missing") self.assertEqual( @@ -644,24 +646,24 @@ def testConstructor(self) -> None: butler_index.dumpToUri(temp_file) with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"): - Butler("label") + Butler.from_config("label") with ResourcePath.temporary_uri(suffix=suffix) as temp_file: # Now with bad contents. with open(temp_file.ospath, "w") as fh: print("'", file=fh) with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}): with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"): - Butler("label") + Butler.from_config("label") with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}): with self.assertRaises(FileNotFoundError): Butler.get_repo_uri("label") self.assertEqual(Butler.get_known_repos(), set()) with self.assertRaisesRegex(FileNotFoundError, "index file not found"): - Butler("label") + Butler.from_config("label") # Check that we can create Butler when the alias file is not found. - butler = Butler(self.tmpConfigFile, writeable=False) + butler = Butler.from_config(self.tmpConfigFile, writeable=False) self.assertIsInstance(butler, Butler) with self.assertRaises(KeyError) as cm: # No environment variable set. @@ -670,7 +672,7 @@ def testConstructor(self) -> None: self.assertIn("No repository index defined", str(cm.exception)) with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"): # No aliases registered. 
- Butler("not_there") + Butler.from_config("not_there") self.assertEqual(Butler.get_known_repos(), set()) def testBasicPutGet(self) -> None: @@ -842,7 +844,7 @@ def testPytypePutCoercion(self) -> None: self.assertEqual(get_full_type_name(test_dict3), "dict") def testIngest(self) -> None: - butler = Butler(self.tmpConfigFile, run=self.default_run) + butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) # Create and register a DatasetType dimensions = butler.dimensions.extract(["instrument", "visit", "detector"]) @@ -994,7 +996,8 @@ def testIngest(self) -> None: def testPickle(self) -> None: """Test pickle support.""" - butler = Butler(self.tmpConfigFile, run=self.default_run) + butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) + assert isinstance(butler, DirectButler), "Expect DirectButler in configuration" butlerOut = pickle.loads(pickle.dumps(butler)) self.assertIsInstance(butlerOut, Butler) self.assertEqual(butlerOut._config, butler._config) @@ -1002,7 +1005,7 @@ def testPickle(self) -> None: self.assertEqual(butlerOut.run, butler.run) def testGetDatasetTypes(self) -> None: - butler = Butler(self.tmpConfigFile, run=self.default_run) + butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) dimensions = butler.dimensions.extract(["instrument", "visit", "physical_filter"]) dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [ ( @@ -1076,7 +1079,7 @@ def testGetDatasetTypes(self) -> None: ) def testTransaction(self) -> None: - butler = Butler(self.tmpConfigFile, run=self.default_run) + butler = Butler.from_config(self.tmpConfigFile, run=self.default_run) datasetTypeName = "test_metric" dimensions = butler.dimensions.extract(["instrument", "visit"]) dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = ( @@ -1133,10 +1136,12 @@ def testMakeRepo(self) -> None: butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile)) limited = Config(self.configFile) - butler1 = Butler(butlerConfig) + butler1 = Butler.from_config(butlerConfig) + assert isinstance(butler1, DirectButler), "Expect DirectButler in configuration" butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile)) full = Config(self.tmpConfigFile) - butler2 = Butler(butlerConfig) + butler2 = Butler.from_config(butlerConfig) + assert isinstance(butler2, DirectButler), "Expect DirectButler in configuration" # Butlers should have the same configuration regardless of whether # defaults were expanded. 
         self.assertEqual(butler1._config, butler2._config)
@@ -1156,13 +1161,13 @@ def testMakeRepo(self) -> None:
         # work properly with relocatable Butler repo
         butlerConfig.configFile = None
         with self.assertRaises(ValueError):
-            Butler(butlerConfig)
+            Butler.from_config(butlerConfig)

         with self.assertRaises(FileExistsError):
             Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)

     def testStringification(self) -> None:
-        butler = Butler(self.tmpConfigFile, run=self.default_run)
+        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
         butlerStr = str(butler)

         if self.datastoreStr is not None:
@@ -1178,7 +1183,7 @@ def testStringification(self) -> None:

     def testButlerRewriteDataId(self) -> None:
         """Test that dataIds can be rewritten based on dimension records."""
-        butler = Butler(self.tmpConfigFile, run=self.default_run)
+        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)

         storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
         datasetTypeName = "random_data"
@@ -1244,7 +1249,7 @@ def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath)

     def testPutTemplates(self) -> None:
         storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
-        butler = Butler(self.tmpConfigFile, run=self.default_run)
+        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)

         # Add needed Dimensions
         butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
@@ -1380,7 +1385,7 @@ def runImportExportTest(self, storageClass: StorageClass) -> None:
                     transfer="auto",
                     skip_dimensions=None,
                 )
-                importButler = Butler(importDir, run=self.default_run)
+                importButler = Butler.from_config(importDir, run=self.default_run)
                 for ref in datasets:
                     with self.subTest(ref=ref):
                         # Test for existence by passing in the DatasetType and
@@ -1393,7 +1398,7 @@ def runImportExportTest(self, storageClass: StorageClass) -> None:

     def testRemoveRuns(self) -> None:
         storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
-        butler = Butler(self.tmpConfigFile, writeable=True)
+        butler = Butler.from_config(self.tmpConfigFile, writeable=True)
         # Load registry data with dimensions to hang datasets off of.
         registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
         butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
@@ -1453,12 +1458,12 @@ class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):

     def testPathConstructor(self) -> None:
         """Independent test of constructor using PathLike."""
-        butler = Butler(self.tmpConfigFile, run=self.default_run)
+        butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
         self.assertIsInstance(butler, Butler)

         # And again with a Path object with the butler yaml
         path = pathlib.Path(self.tmpConfigFile)
-        butler = Butler(path, writeable=False)
+        butler = Butler.from_config(path, writeable=False)
         self.assertIsInstance(butler, Butler)

         # And again with a Path object without the butler yaml
@@ -1466,7 +1471,7 @@ def testPathConstructor(self) -> None:
         # in butler.yaml -- which is the case for a subclass)
         if self.tmpConfigFile.endswith("butler.yaml"):
             path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
-            butler = Butler(path, writeable=False)
+            butler = Butler.from_config(path, writeable=False)
             self.assertIsInstance(butler, Butler)

     def testExportTransferCopy(self) -> None:
@@ -1500,7 +1505,7 @@ def testExportTransferCopy(self) -> None:

     def testPruneDatasets(self) -> None:
         storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
-        butler = Butler(self.tmpConfigFile, writeable=True)
+        butler = Butler.from_config(self.tmpConfigFile, writeable=True)
         assert isinstance(butler._datastore, FileDatastore)
         # Load registry data with dimensions to hang datasets off of.
         registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
@@ -2064,7 +2069,9 @@ def tearDown(self) -> None:
     def create_butler(self, manager: str, label: str) -> Butler:
         config = Config(self.configFile)
         config["registry", "managers", "datasets"] = manager
-        return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
+        return Butler.from_config(
+            Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True
+        )

     def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None:
         default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
@@ -2192,7 +2199,7 @@ def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "St
         # we are rewriting integer dataset ids in the target if necessary.
         # Will not be relevant for UUID.
         run = "distraction"
-        butler = Butler(butler=self.source_butler, run=run)
+        butler = Butler.from_config(butler=self.source_butler, run=run)
         butler.put(
             makeExampleMetrics(),
             datasetTypeName,
@@ -2202,7 +2209,7 @@ def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "St
         )

         # Write some example metrics to the source
-        butler = Butler(butler=self.source_butler)
+        butler = Butler.from_config(butler=self.source_butler)

         # Set of DatasetRefs that should be in the list of refs to transfer
         # but which will not be transferred.
@@ -2383,9 +2390,9 @@ def test_fallback(self) -> None:
         bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore"

         with self.assertRaises(RuntimeError):
-            Butler(bad_config)
+            Butler.from_config(bad_config)

-        butler = Butler(bad_config, writeable=True, without_datastore=True)
+        butler = Butler.from_config(bad_config, writeable=True, without_datastore=True)
         self.assertIsInstance(butler._datastore, NullDatastore)

         # Check that registry is working.
diff --git a/tests/test_cliCmdIngestFiles.py b/tests/test_cliCmdIngestFiles.py
index fbb48f7ef0..29b9730297 100644
--- a/tests/test_cliCmdIngestFiles.py
+++ b/tests/test_cliCmdIngestFiles.py
@@ -104,7 +104,7 @@ def assertIngest(self, table, options):
         )
         self.assertEqual(result.exit_code, 0, clickResultMsg(result))

-        butler = Butler(self.root)
+        butler = Butler.from_config(self.root)
         refs = list(butler.registry.queryDatasets("test_metric_comp", collections=run))
         self.assertEqual(len(refs), 2)
diff --git a/tests/test_cliCmdPruneDatasets.py b/tests/test_cliCmdPruneDatasets.py
index 7d4c5901a9..e77961994d 100644
--- a/tests/test_cliCmdPruneDatasets.py
+++ b/tests/test_cliCmdPruneDatasets.py
@@ -35,7 +35,6 @@
 import lsst.daf.butler.registries.sql
 import lsst.daf.butler.script
 from astropy.table import Table
-from lsst.daf.butler import Butler
 from lsst.daf.butler.cli.butler import cli as butlerCli
 from lsst.daf.butler.cli.cmd.commands import (
     pruneDatasets_askContinueMsg,
@@ -54,6 +53,7 @@
     pruneDatasets_wouldRemoveMsg,
 )
 from lsst.daf.butler.cli.utils import LogCliRunner, astropyTablesToStr, clickResultMsg
+from lsst.daf.butler.direct_butler import DirectButler
 from lsst.daf.butler.registry import CollectionType
 from lsst.daf.butler.script import QueryDatasets
@@ -118,7 +118,7 @@ def makePruneDatasetsArgs(**kwargs):
     @patch.object(lsst.daf.butler.script._pruneDatasets, "QueryDatasets", side_effect=makeQueryDatasets)
     # Mock the pruneDatasets butler command so we can test for expected calls
     # to it, without dealing with setting up a full repo with data for it.
-    @patch.object(Butler, "pruneDatasets")
+    @patch.object(DirectButler, "pruneDatasets")
     def run_test(
         self,
         mockPruneDatasets,
diff --git a/tests/test_cliCmdQueryCollections.py b/tests/test_cliCmdQueryCollections.py
index 1d88b40e1d..47eeb16cfa 100644
--- a/tests/test_cliCmdQueryCollections.py
+++ b/tests/test_cliCmdQueryCollections.py
@@ -98,7 +98,7 @@ def testGetCollections(self):
         with self.runner.isolated_filesystem():
             butlerCfg = Butler.makeRepo("here")
             # the purpose of this call is to create some collections
-            butler = Butler(butlerCfg, run=run, collections=[tag], writeable=True)
+            butler = Butler.from_config(butlerCfg, run=run, collections=[tag], writeable=True)
             butler.registry.registerCollection(tag, CollectionType.TAGGED)

             # Verify collections that were created are found by
@@ -140,7 +140,7 @@ def testChained(self):

             # Create a butler and add some chained collections:
             butlerCfg = Butler.makeRepo("here")
-            butler1 = Butler(butlerCfg, writeable=True)
+            butler1 = Butler.from_config(butlerCfg, writeable=True)

             # Replace datastore functions with mocks:
             DatastoreMock.apply(butler1)
diff --git a/tests/test_cliCmdQueryDataIds.py b/tests/test_cliCmdQueryDataIds.py
index f0535ab2ac..56cfa69e49 100644
--- a/tests/test_cliCmdQueryDataIds.py
+++ b/tests/test_cliCmdQueryDataIds.py
@@ -70,7 +70,7 @@ def loadData(self, *filenames: str) -> Butler:
         """Load registry test data from ``TESTDIR/data/registry/``,
         which should be a YAML import/export file.
""" - butler = Butler(self.repo, writeable=True) + butler = Butler.from_config(self.repo, writeable=True) for filename in filenames: with open(os.path.join(TESTDIR, "data", "registry", filename)) as stream: # Go behind the back of the import code a bit to deal with diff --git a/tests/test_cliCmdQueryDimensionRecords.py b/tests/test_cliCmdQueryDimensionRecords.py index 3f982f5789..876a77453d 100644 --- a/tests/test_cliCmdQueryDimensionRecords.py +++ b/tests/test_cliCmdQueryDimensionRecords.py @@ -166,7 +166,7 @@ def testWhere(self): self.assertAstropyTablesEqual(readTable(result.output), expected) def testCollection(self): - butler = Butler(self.root, run="foo") + butler = Butler.from_config(self.root, run="foo") # try replacing the testRepo's butler with the one with the "foo" run. self.testRepo.butler = butler @@ -273,7 +273,7 @@ def testCollection(self): self.assertAstropyTablesEqual(readTable(result.output), expected) def testSkymap(self): - butler = Butler(self.root, run="foo") + butler = Butler.from_config(self.root, run="foo") # try replacing the testRepo's butler with the one with the "foo" run. self.testRepo.butler = butler diff --git a/tests/test_cliCmdRemoveCollections.py b/tests/test_cliCmdRemoveCollections.py index 080e78816e..ec20e316f5 100644 --- a/tests/test_cliCmdRemoveCollections.py +++ b/tests/test_cliCmdRemoveCollections.py @@ -220,7 +220,7 @@ def testRemoveCmd(self): # verify chained-run-1 was removed: - butler = Butler(self.root) + butler = Butler.from_config(self.root) collections = butler.registry.queryCollections( collectionTypes=frozenset( ( diff --git a/tests/test_logFormatter.py b/tests/test_logFormatter.py index 8f3d0a4d1d..a166ebadfc 100644 --- a/tests/test_logFormatter.py +++ b/tests/test_logFormatter.py @@ -49,7 +49,7 @@ def setUp(self): Butler.makeRepo(self.root) self.run = "testrun" - self.butler = Butler(self.root, run=self.run) + self.butler = Butler.from_config(self.root, run=self.run) self.datasetType = DatasetType("test_logs", [], "ButlerLogRecords", universe=self.butler.dimensions) self.butler.registry.registerDatasetType(self.datasetType) diff --git a/tests/test_matplotlibFormatter.py b/tests/test_matplotlibFormatter.py index 8851d095f9..78b5f887d8 100644 --- a/tests/test_matplotlibFormatter.py +++ b/tests/test_matplotlibFormatter.py @@ -65,7 +65,7 @@ def tearDown(self): removeTestTempDir(self.root) def testMatplotlibFormatter(self): - butler = Butler(self.root, run="testrun") + butler = Butler.from_config(self.root, run="testrun") datasetType = DatasetType("test_plot", [], "Plot", universe=butler.dimensions) butler.registry.registerDatasetType(datasetType) # Does not have to be a random image diff --git a/tests/test_packages.py b/tests/test_packages.py index 16b395c93f..1f602304ad 100644 --- a/tests/test_packages.py +++ b/tests/test_packages.py @@ -45,7 +45,7 @@ def setUp(self): """Create a new butler root for each test.""" self.root = makeTestTempDir(TESTDIR) Butler.makeRepo(self.root) - self.butler = Butler(self.root, run="test_run") + self.butler = Butler.from_config(self.root, run="test_run") # No dimensions in dataset type so we don't have to worry about # inserting dimension data or defining data IDs. 
         self.datasetType = DatasetType(
diff --git a/tests/test_parquet.py b/tests/test_parquet.py
index 93753cc1c8..b39a0af407 100644
--- a/tests/test_parquet.py
+++ b/tests/test_parquet.py
@@ -306,7 +306,9 @@ def setUp(self):
         self.root = makeTestTempDir(TESTDIR)
         config = Config(self.configFile)
         self.run = "test_run"
-        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
+        self.butler = Butler.from_config(
+            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
+        )
         # No dimensions in dataset type so we don't have to worry about
         # inserting dimension data or defining data IDs.
         self.datasetType = DatasetType(
@@ -726,7 +728,9 @@ def setUp(self):
         self.root = makeTestTempDir(TESTDIR)
         config = Config(self.configFile)
         self.run = "test_run"
-        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
+        self.butler = Butler.from_config(
+            Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
+        )
         # No dimensions in dataset type so we don't have to worry about
         # inserting dimension data or defining data IDs.
         self.datasetType = DatasetType(
@@ -1053,7 +1057,9 @@ def setUp(self):
         """Create a new butler root for each test."""
         self.root = makeTestTempDir(TESTDIR)
         config = Config(self.configFile)
-        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+        self.butler = Butler.from_config(
+            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+        )
         # No dimensions in dataset type so we don't have to worry about
         # inserting dimension data or defining data IDs.
         self.datasetType = DatasetType(
@@ -1313,7 +1319,9 @@ def setUp(self):
         """Create a new butler root for each test."""
         self.root = makeTestTempDir(TESTDIR)
         config = Config(self.configFile)
-        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+        self.butler = Butler.from_config(
+            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+        )
         # No dimensions in dataset type so we don't have to worry about
         # inserting dimension data or defining data IDs.
         self.datasetType = DatasetType(
@@ -1634,7 +1642,9 @@ def setUp(self):
         """Create a new butler root for each test."""
         self.root = makeTestTempDir(TESTDIR)
         config = Config(self.configFile)
-        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+        self.butler = Butler.from_config(
+            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+        )
         # No dimensions in dataset type so we don't have to worry about
         # inserting dimension data or defining data IDs.
         self.datasetType = DatasetType(
@@ -1787,7 +1797,9 @@ def setUp(self):
         """Create a new butler root for each test."""
         self.root = makeTestTempDir(TESTDIR)
         config = Config(self.configFile)
-        self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+        self.butler = Butler.from_config(
+            Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+        )
         # No dimensions in dataset type so we don't have to worry about
         # inserting dimension data or defining data IDs.
         self.datasetType = DatasetType(
diff --git a/tests/test_quantumBackedButler.py b/tests/test_quantumBackedButler.py
index 1cf801fdf9..423ee7e083 100644
--- a/tests/test_quantumBackedButler.py
+++ b/tests/test_quantumBackedButler.py
@@ -43,6 +43,7 @@
     RegistryConfig,
     StorageClass,
 )
+from lsst.daf.butler.direct_butler import DirectButler
 from lsst.daf.butler.registry import _RegistryFactory
 from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir
 from lsst.resources import ResourcePath
@@ -62,7 +63,9 @@ def setUp(self) -> None:
         # Make a butler and import dimension definitions.
         registryConfig = RegistryConfig(self.config.get("registry"))
         _RegistryFactory(registryConfig).create_from_config(butlerRoot=self.root)
-        self.butler = Butler(self.config, writeable=True, run="RUN")
+        butler = Butler.from_config(self.config, writeable=True, run="RUN")
+        assert isinstance(butler, DirectButler)
+        self.butler = butler
         self.butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))

         # make all dataset types
diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py
index a564e4b08b..483d7d03ef 100644
--- a/tests/test_simpleButler.py
+++ b/tests/test_simpleButler.py
@@ -79,7 +79,7 @@ def makeButler(self, **kwargs: Any) -> Butler:
         registryConfig = RegistryConfig(config.get("registry"))
         _RegistryFactory(registryConfig).create_from_config()

-        butler = Butler(config, **kwargs)
+        butler = Butler.from_config(config, **kwargs)
         DatastoreMock.apply(butler)
         return butler
@@ -549,13 +549,13 @@ def testRegistryDefaults(self):
         # Initialize a new butler with `imported_g` as its default run.
         # This should not have a default instrument, because there are two.
         # Pass run instead of collections; this should set both.
-        butler2 = Butler(butler=butler, run="imported_g")
+        butler2 = Butler.from_config(butler=butler, run="imported_g")
         self.assertEqual(list(butler2.registry.defaults.collections), ["imported_g"])
         self.assertEqual(butler2.registry.defaults.run, "imported_g")
         self.assertFalse(butler2.registry.defaults.dataId)
         # Initialize a new butler with an instrument default explicitly given.
         # Set collections instead of run, which should then be None.
-        butler3 = Butler(butler=butler, collections=["imported_g"], instrument="Cam2")
+        butler3 = Butler.from_config(butler=butler, collections=["imported_g"], instrument="Cam2")
         self.assertEqual(list(butler3.registry.defaults.collections), ["imported_g"])
         self.assertIsNone(butler3.registry.defaults.run, None)
         self.assertEqual(butler3.registry.defaults.dataId.byName(), {"instrument": "Cam2"})
diff --git a/tests/test_testRepo.py b/tests/test_testRepo.py
index faf9518291..71f40e7e6f 100644
--- a/tests/test_testRepo.py
+++ b/tests/test_testRepo.py
@@ -211,7 +211,7 @@ def testRegisterMetricsExampleChained(self):
             ]
             repo = lsst.daf.butler.Butler.makeRepo(temp, config=config)

-            butler = lsst.daf.butler.Butler(repo, run="chainedExample")
+            butler = lsst.daf.butler.Butler.from_config(repo, run="chainedExample")
             registerMetricsExample(butler)
             addDatasetType(butler, "DummyType", {}, "StructuredDataNoComponents")
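Editor's note: every hunk above applies the same mechanical substitution; call sites that build a butler from configuration move from `Butler(...)` to `Butler.from_config(...)`, and code that needs registry or datastore internals first narrows the instance to `DirectButler`. A minimal sketch of the resulting calling pattern follows. The repository path and run name are placeholders, not values taken from this patch.

    from lsst.daf.butler import Butler
    from lsst.daf.butler.direct_butler import DirectButler

    # "/path/to/repo" and "example_run" are hypothetical; point at a real butler.yaml location.
    butler = Butler.from_config("/path/to/repo", run="example_run", writeable=True)

    # Narrow to DirectButler only where direct access to internals is required,
    # mirroring the asserts added in the tests above.
    assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"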