diff --git a/doc/changes/DM-41116.api.md b/doc/changes/DM-41116.api.md
new file mode 100644
index 0000000000..7dbbf2837e
--- /dev/null
+++ b/doc/changes/DM-41116.api.md
@@ -0,0 +1,3 @@
+- `Butler` class is now an abstract base class; the original implementation has been renamed to `DirectButler`.
+- Clients that need access to the `DirectButler` class must import it from `lsst.daf.butler.direct_butler`.
+- `Butler.from_config(...)` should be used to construct `Butler` instances. `Butler(...)` still works and is identical to `Butler.from_config(...)`, but it will generate `mypy` errors.
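In practice the migration for client code looks like the sketch below; the repository path is a placeholder, the collection name is reused from the docstring examples, and the `isinstance` check is shown only to illustrate that `DirectButler` remains the default implementation:

```python
from lsst.daf.butler import Butler

# Preferred construction path after this change.
butler = Butler.from_config("/path/to/repo", collections=["u/alice/DM-50000"])

# Code that needs the concrete implementation must now import it from its
# new module rather than from the package top level.
from lsst.daf.butler.direct_butler import DirectButler

assert isinstance(butler, DirectButler)
```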
diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index a8a3e0a18d..a01541f9ad 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -25,32 +25,18 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
-"""Butler top level classes.
-"""
from __future__ import annotations
-__all__ = (
- "Butler",
- "ButlerValidationError",
-)
-
-import collections.abc
-import contextlib
-import io
-import logging
-import numbers
-import os
-import warnings
-from collections import Counter, defaultdict
-from collections.abc import Iterable, Iterator, MutableMapping, Sequence
-from typing import TYPE_CHECKING, Any, ClassVar, TextIO
-
-from deprecated.sphinx import deprecated
+__all__ = ["Butler"]
+
+from abc import abstractmethod
+from collections.abc import Collection, Iterable, Sequence
+from contextlib import AbstractContextManager
+from typing import Any, TextIO
+
from lsst.resources import ResourcePath, ResourcePathExpression
from lsst.utils import doImportType
-from lsst.utils.introspection import get_class_of
-from lsst.utils.logging import VERBOSE, getLogger
-from sqlalchemy.exc import IntegrityError
+from lsst.utils.logging import getLogger
from ._butler_config import ButlerConfig
from ._butler_repo_index import ButlerRepoIndex
@@ -59,69 +45,30 @@
from ._dataset_ref import DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
-from ._exceptions import ValidationError
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
-from ._registry_shim import RegistryShim
-from ._storage_class import StorageClass, StorageClassFactory
-from ._timespan import Timespan
-from .datastore import DatasetRefURIs, Datastore, NullDatastore
-from .dimensions import (
- DataCoordinate,
- DataId,
- DataIdValue,
- Dimension,
- DimensionConfig,
- DimensionElement,
- DimensionRecord,
- DimensionUniverse,
-)
-from .progress import Progress
-from .registry import (
- CollectionType,
- ConflictingDefinitionError,
- DataIdError,
- MissingDatasetTypeError,
- NoDefaultCollectionError,
- Registry,
- RegistryConfig,
- RegistryDefaults,
- _ButlerRegistry,
- _RegistryFactory,
-)
+from ._storage_class import StorageClass
+from .datastore import DatasetRefURIs, Datastore
+from .dimensions import DataId, DimensionConfig
+from .registry import Registry, RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .transfers import RepoExportContext
-from .utils import transactional
-
-if TYPE_CHECKING:
- from lsst.resources import ResourceHandleProtocol
-
- from .transfers import RepoImportBackend
-
-log = getLogger(__name__)
-
-class ButlerValidationError(ValidationError):
- """There is a problem with the Butler configuration."""
-
- pass
+_LOG = getLogger(__name__)
class Butler(LimitedButler):
- """Main entry point for the data access system.
+ """Interface for data butler and factory for Butler instances.
Parameters
----------
config : `ButlerConfig`, `Config` or `str`, optional.
- Configuration. Anything acceptable to the
- `ButlerConfig` constructor. If a directory path
- is given the configuration will be read from a ``butler.yaml`` file in
- that location. If `None` is given default values will be used.
- butler : `Butler`, optional.
- If provided, construct a new Butler that uses the same registry and
- datastore as the given one, but with the given collection and run.
- Incompatible with the ``config``, ``searchPaths``, and ``writeable``
- arguments.
+ Configuration. Anything acceptable to the `ButlerConfig` constructor.
+        If a directory path is given, the configuration will be read from a
+        ``butler.yaml`` file in that location. If `None` is given, default
+        values will be used. If ``config`` contains a "cls" key, its value is
+        used as the name of the butler class to instantiate; it must be a
+        subclass of this class. Otherwise `DirectButler` is instantiated.
collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
An expression specifying the collections to be searched (in order) when
reading datasets.
@@ -151,179 +98,182 @@ class Butler(LimitedButler):
the default for that dimension. Nonexistent collections are ignored.
If a default value is provided explicitly for a governor dimension via
``**kwargs``, no default will be inferred for that dimension.
- without_datastore : `bool`, optional
- If `True` do not attach a datastore to this butler. Any attempts
- to use a datastore will fail.
- **kwargs : `str`
- Default data ID key-value pairs. These may only identify "governor"
- dimensions like ``instrument`` and ``skymap``.
-
- Examples
- --------
- While there are many ways to control exactly how a `Butler` interacts with
- the collections in its `Registry`, the most common cases are still simple.
-
- For a read-only `Butler` that searches one collection, do::
-
- butler = Butler("/path/to/repo", collections=["u/alice/DM-50000"])
-
- For a read-write `Butler` that writes to and reads from a
- `~CollectionType.RUN` collection::
-
- butler = Butler("/path/to/repo", run="u/alice/DM-50000/a")
-
- The `Butler` passed to a ``PipelineTask`` is often much more complex,
- because we want to write to one `~CollectionType.RUN` collection but read
- from several others (as well)::
-
- butler = Butler("/path/to/repo", run="u/alice/DM-50000/a",
- collections=["u/alice/DM-50000/a",
- "u/bob/DM-49998",
- "HSC/defaults"])
-
- This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
- Datasets will be read first from that run (since it appears first in the
- chain), and then from ``u/bob/DM-49998`` and finally ``HSC/defaults``.
-
- Finally, one can always create a `Butler` with no collections::
-
- butler = Butler("/path/to/repo", writeable=True)
-
- This can be extremely useful when you just want to use ``butler.registry``,
- e.g. for inserting dimension data or managing collections, or when the
- collections you want to use with the butler are not consistent.
- Passing ``writeable`` explicitly here is only necessary if you want to be
- able to make changes to the repo - usually the value for ``writeable`` can
- be guessed from the collection arguments provided, but it defaults to
- `False` when there are not collection arguments.
+ **kwargs : `Any`
+        Additional keyword arguments passed to the constructor of the actual
+        butler class.
+
+ Notes
+ -----
+    The preferred way to instantiate a `Butler` is via the `from_config`
+    method. The call ``Butler(...)`` is equivalent to
+    ``Butler.from_config(...)``, but ``mypy`` will complain about the former.
"""
- def __init__(
- self,
+ def __new__(
+ cls,
config: Config | ResourcePathExpression | None = None,
*,
- butler: Butler | None = None,
collections: Any = None,
run: str | None = None,
searchPaths: Sequence[ResourcePathExpression] | None = None,
writeable: bool | None = None,
inferDefaults: bool = True,
- without_datastore: bool = False,
- **kwargs: str,
- ):
- defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
- # Load registry, datastore, etc. from config or existing butler.
- if butler is not None:
- if config is not None or searchPaths is not None or writeable is not None:
- raise TypeError(
- "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
- )
- self._registry = butler._registry.copy(defaults)
- self._datastore = butler._datastore
- self.storageClasses = butler.storageClasses
- self._config: ButlerConfig = butler._config
- else:
- self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
- try:
- butlerRoot = self._config.get("root", self._config.configDir)
- if writeable is None:
- writeable = run is not None
- self._registry = _RegistryFactory(self._config).from_config(
- butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
- )
- if without_datastore:
- self._datastore = NullDatastore(None, None)
- else:
- self._datastore = Datastore.fromConfig(
- self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
- )
- # TODO: Once datastore drops dependency on registry we can
- # construct datastore first and pass opaque tables to registry
- # constructor.
- self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions())
- self.storageClasses = StorageClassFactory()
- self.storageClasses.addFromConfig(self._config)
- except Exception:
- # Failures here usually mean that configuration is incomplete,
- # just issue an error message which includes config file URI.
- log.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
- raise
-
- # For execution butler the datastore needs a special
- # dependency-inversion trick. This is not used by regular butler,
- # but we do not have a way to distinguish regular butler from execution
- # butler.
- self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
-
- if "run" in self._config or "collection" in self._config:
- raise ValueError("Passing a run or collection via configuration is no longer supported.")
-
- self._registry_shim = RegistryShim(self)
-
- GENERATION: ClassVar[int] = 3
- """This is a Generation 3 Butler.
-
- This attribute may be removed in the future, once the Generation 2 Butler
- interface has been fully retired; it should only be used in transitional
- code.
- """
+ **kwargs: Any,
+ ) -> Butler:
+ if cls is Butler:
+ cls = cls._find_butler_class(config, searchPaths)
+        # Note: we do not pass any parameters to __new__; Python will pass
+        # them to __init__ after __new__ returns the subclass instance.
+ return super().__new__(cls)
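For readers unfamiliar with this ``__new__`` dispatch, here is a self-contained sketch of the same pattern using hypothetical classes; as the comment above notes, Python re-passes the original call arguments to ``__init__`` on whatever instance ``__new__`` returns:

```python
class Base:
    def __new__(cls, flavor: str = "a", **kwargs):
        # Redirect to a subclass only when the base class itself is called.
        if cls is Base:
            cls = SubA if flavor == "a" else SubB
        # Only ``cls`` is forwarded; the original arguments reach __init__.
        return super().__new__(cls)

    def __init__(self, flavor: str = "a", **kwargs):
        self.flavor = flavor


class SubA(Base):
    pass


class SubB(Base):
    pass


assert type(Base(flavor="b")) is SubB   # dispatched to the subclass
assert Base().flavor == "a"             # __init__ still received the arguments
```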
- def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
- """Return DatasetType defined in registry given dataset type name."""
- try:
- return self._registry.getDatasetType(name)
- except MissingDatasetTypeError:
- return None
+ @staticmethod
+ def _find_butler_class(
+ config: Config | ResourcePathExpression | None = None,
+ searchPaths: Sequence[ResourcePathExpression] | None = None,
+ ) -> type[Butler]:
+        """Find the actual butler class to instantiate."""
+ butler_class_name: str | None = None
+ if config is not None:
+ # Check for optional "cls" key in config.
+ if not isinstance(config, Config):
+ config = ButlerConfig(config, searchPaths=searchPaths)
+ butler_class_name = config.get("cls")
+
+ # Make DirectButler if class is not specified.
+ butler_class: type[Butler]
+ if butler_class_name is None:
+ from .direct_butler import DirectButler
+
+ butler_class = DirectButler
+ else:
+ butler_class = doImportType(butler_class_name)
+ if not issubclass(butler_class, Butler):
+ raise TypeError(f"{butler_class_name} is not a subclass of Butler")
+ return butler_class
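For illustration, the resolution step above can be exercised directly; the dotted path is the default implementation named in the changelog, and a ``cls`` value in a repository's ``butler.yaml`` would be resolved the same way (this is a sketch, assuming the LSST stack is importable):

```python
from lsst.daf.butler import Butler
from lsst.utils import doImportType

# Equivalent to what _find_butler_class does for a config containing
#   cls: lsst.daf.butler.direct_butler.DirectButler
butler_class = doImportType("lsst.daf.butler.direct_butler.DirectButler")
assert issubclass(butler_class, Butler)
```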
@classmethod
- def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
- """Look up the label in a butler repository index.
+ def from_config(
+ cls,
+ config: Config | ResourcePathExpression | None = None,
+ *,
+ collections: Any = None,
+ run: str | None = None,
+ searchPaths: Sequence[ResourcePathExpression] | None = None,
+ writeable: bool | None = None,
+ inferDefaults: bool = True,
+ **kwargs: Any,
+ ) -> Butler:
+ """Create butler instance from configuration.
Parameters
----------
- label : `str`
- Label of the Butler repository to look up.
- return_label : `bool`, optional
- If ``label`` cannot be found in the repository index (either
- because index is not defined or ``label`` is not in the index) and
- ``return_label`` is `True` then return ``ResourcePath(label)``.
- If ``return_label`` is `False` (default) then an exception will be
- raised instead.
-
- Returns
- -------
- uri : `lsst.resources.ResourcePath`
- URI to the Butler repository associated with the given label or
- default value if it is provided.
-
- Raises
- ------
- KeyError
- Raised if the label is not found in the index, or if an index
- is not defined, and ``return_label`` is `False`.
+ config : `ButlerConfig`, `Config` or `str`, optional.
+ Configuration. Anything acceptable to the `ButlerConfig`
+            constructor. If a directory path is given, the configuration will
+            be read from a ``butler.yaml`` file in that location. If `None` is
+            given, default values will be used. If ``config`` contains a
+            "cls" key, its value is used as the name of the butler class to
+            instantiate; it must be a subclass of this class. Otherwise
+            `DirectButler` is instantiated.
+ collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
+ An expression specifying the collections to be searched (in order)
+ when reading datasets.
+ This may be a `str` collection name or an iterable thereof.
+ See :ref:`daf_butler_collection_expressions` for more information.
+ These collections are not registered automatically and must be
+ manually registered before they are used by any method, but they
+ may be manually registered after the `Butler` is initialized.
+ run : `str`, optional
+ Name of the `~CollectionType.RUN` collection new datasets should be
+ inserted into. If ``collections`` is `None` and ``run`` is not
+ `None`, ``collections`` will be set to ``[run]``. If not `None`,
+ this collection will automatically be registered. If this is not
+ set (and ``writeable`` is not set either), a read-only butler will
+ be created.
+ searchPaths : `list` of `str`, optional
+ Directory paths to search when calculating the full Butler
+ configuration. Not used if the supplied config is already a
+ `ButlerConfig`.
+ writeable : `bool`, optional
+ Explicitly sets whether the butler supports write operations. If
+ not provided, a read-write butler is created if any of ``run``,
+ ``tags``, or ``chains`` is non-empty.
+ inferDefaults : `bool`, optional
+ If `True` (default) infer default data ID values from the values
+ present in the datasets in ``collections``: if all collections have
+ the same value (or no value) for a governor dimension, that value
+ will be the default for that dimension. Nonexistent collections
+ are ignored. If a default value is provided explicitly for a
+ governor dimension via ``**kwargs``, no default will be inferred
+ for that dimension.
+ **kwargs : `Any`
+            Additional keyword arguments passed to the constructor of the
+            actual butler class.
Notes
-----
- See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
- information is discovered.
- """
- return ButlerRepoIndex.get_repo_uri(label, return_label)
+ Calling this factory method is identical to calling
+        ``Butler(config, ...)``. Its only raison d'ĂȘtre is that ``mypy``
+        complains about a direct ``Butler(...)`` call.
- @classmethod
- def get_known_repos(cls) -> set[str]:
- """Retrieve the list of known repository labels.
+ Examples
+ --------
+ While there are many ways to control exactly how a `Butler` interacts
+ with the collections in its `Registry`, the most common cases are still
+ simple.
- Returns
- -------
- repos : `set` of `str`
- All the known labels. Can be empty if no index can be found.
+ For a read-only `Butler` that searches one collection, do::
- Notes
- -----
- See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
- information is discovered.
+ butler = Butler.from_config(
+ "/path/to/repo", collections=["u/alice/DM-50000"]
+ )
+
+ For a read-write `Butler` that writes to and reads from a
+ `~CollectionType.RUN` collection::
+
+ butler = Butler.from_config(
+ "/path/to/repo", run="u/alice/DM-50000/a"
+ )
+
+ The `Butler` passed to a ``PipelineTask`` is often much more complex,
+ because we want to write to one `~CollectionType.RUN` collection but
+ read from several others (as well)::
+
+ butler = Butler.from_config(
+ "/path/to/repo",
+ run="u/alice/DM-50000/a",
+ collections=[
+ "u/alice/DM-50000/a", "u/bob/DM-49998", "HSC/defaults"
+ ]
+ )
+
+ This butler will `put` new datasets to the run ``u/alice/DM-50000/a``.
+ Datasets will be read first from that run (since it appears first in
+ the chain), and then from ``u/bob/DM-49998`` and finally
+ ``HSC/defaults``.
+
+ Finally, one can always create a `Butler` with no collections::
+
+ butler = Butler.from_config("/path/to/repo", writeable=True)
+
+ This can be extremely useful when you just want to use
+ ``butler.registry``, e.g. for inserting dimension data or managing
+ collections, or when the collections you want to use with the butler
+ are not consistent. Passing ``writeable`` explicitly here is only
+        necessary if you want to be able to make changes to the repo; usually
+        the value for ``writeable`` can be guessed from the collection
+        arguments provided, but it defaults to `False` when there are no
+        collection arguments.
"""
- return ButlerRepoIndex.get_known_repos()
+ cls = cls._find_butler_class(config, searchPaths)
+ return cls(
+ config,
+ collections=collections,
+ run=run,
+ searchPaths=searchPaths,
+ writeable=writeable,
+ inferDefaults=inferDefaults,
+ **kwargs,
+ )
@staticmethod
def makeRepo(
@@ -477,668 +427,69 @@ def makeRepo(
dimensionConfig=dimensionConfig, butlerRoot=root_uri
)
- log.verbose("Wrote new Butler configuration file to %s", configURI)
+ _LOG.verbose("Wrote new Butler configuration file to %s", configURI)
return config
@classmethod
- def _unpickle(
- cls,
- config: ButlerConfig,
- collections: tuple[str, ...] | None,
- run: str | None,
- defaultDataId: dict[str, str],
- writeable: bool,
- ) -> Butler:
- """Callable used to unpickle a Butler.
-
- We prefer not to use ``Butler.__init__`` directly so we can force some
- of its many arguments to be keyword-only (note that ``__reduce__``
- can only invoke callables with positional arguments).
+ def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath:
+ """Look up the label in a butler repository index.
Parameters
----------
- config : `ButlerConfig`
- Butler configuration, already coerced into a true `ButlerConfig`
- instance (and hence after any search paths for overrides have been
- utilized).
- collections : `tuple` [ `str` ]
- Names of the default collections to read from.
- run : `str`, optional
- Name of the default `~CollectionType.RUN` collection to write to.
- defaultDataId : `dict` [ `str`, `str` ]
- Default data ID values.
- writeable : `bool`
- Whether the Butler should support write operations.
+ label : `str`
+ Label of the Butler repository to look up.
+ return_label : `bool`, optional
+ If ``label`` cannot be found in the repository index (either
+ because index is not defined or ``label`` is not in the index) and
+ ``return_label`` is `True` then return ``ResourcePath(label)``.
+ If ``return_label`` is `False` (default) then an exception will be
+ raised instead.
Returns
-------
- butler : `Butler`
- A new `Butler` instance.
- """
- # MyPy doesn't recognize that the kwargs below are totally valid; it
- # seems to think '**defaultDataId* is a _positional_ argument!
- return cls(
- config=config,
- collections=collections,
- run=run,
- writeable=writeable,
- **defaultDataId, # type: ignore
- )
-
- def __reduce__(self) -> tuple:
- """Support pickling."""
- return (
- Butler._unpickle,
- (
- self._config,
- self.collections,
- self.run,
- self._registry.defaults.dataId.byName(),
- self._registry.isWriteable(),
- ),
- )
-
- def __str__(self) -> str:
- return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
- self.collections, self.run, self._datastore, self._registry
- )
-
- def isWriteable(self) -> bool:
- """Return `True` if this `Butler` supports write operations."""
- return self._registry.isWriteable()
+ uri : `lsst.resources.ResourcePath`
+ URI to the Butler repository associated with the given label or
+ default value if it is provided.
- @contextlib.contextmanager
- def transaction(self) -> Iterator[None]:
- """Context manager supporting `Butler` transactions.
+ Raises
+ ------
+ KeyError
+ Raised if the label is not found in the index, or if an index
+ is not defined, and ``return_label`` is `False`.
- Transactions can be nested.
+ Notes
+ -----
+ See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
+ information is discovered.
"""
- with self._registry.transaction(), self._datastore.transaction():
- yield
-
- def _standardizeArgs(
- self,
- datasetRefOrType: DatasetRef | DatasetType | str,
- dataId: DataId | None = None,
- for_put: bool = True,
- **kwargs: Any,
- ) -> tuple[DatasetType, DataId | None]:
- """Standardize the arguments passed to several Butler APIs.
+ return ButlerRepoIndex.get_repo_uri(label, return_label)
- Parameters
- ----------
- datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
- When `DatasetRef` the `dataId` should be `None`.
- Otherwise the `DatasetType` or name thereof.
- dataId : `dict` or `DataCoordinate`
- A `dict` of `Dimension` link name, value pairs that label the
- `DatasetRef` within a Collection. When `None`, a `DatasetRef`
- should be provided as the second argument.
- for_put : `bool`, optional
- If `True` this call is invoked as part of a `Butler.put()`.
- Otherwise it is assumed to be part of a `Butler.get()`. This
- parameter is only relevant if there is dataset type
- inconsistency.
- **kwargs
- Additional keyword arguments used to augment or construct a
- `DataCoordinate`. See `DataCoordinate.standardize`
- parameters.
+ @classmethod
+ def get_known_repos(cls) -> set[str]:
+ """Retrieve the list of known repository labels.
Returns
-------
- datasetType : `DatasetType`
- A `DatasetType` instance extracted from ``datasetRefOrType``.
- dataId : `dict` or `DataId`, optional
- Argument that can be used (along with ``kwargs``) to construct a
- `DataId`.
+ repos : `set` of `str`
+ All the known labels. Can be empty if no index can be found.
Notes
-----
- Butler APIs that conceptually need a DatasetRef also allow passing a
- `DatasetType` (or the name of one) and a `DataId` (or a dict and
- keyword arguments that can be used to construct one) separately. This
- method accepts those arguments and always returns a true `DatasetType`
- and a `DataId` or `dict`.
-
- Standardization of `dict` vs `DataId` is best handled by passing the
- returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
- generally similarly flexible.
- """
- externalDatasetType: DatasetType | None = None
- internalDatasetType: DatasetType | None = None
- if isinstance(datasetRefOrType, DatasetRef):
- if dataId is not None or kwargs:
- raise ValueError("DatasetRef given, cannot use dataId as well")
- externalDatasetType = datasetRefOrType.datasetType
- dataId = datasetRefOrType.dataId
- else:
- # Don't check whether DataId is provided, because Registry APIs
- # can usually construct a better error message when it wasn't.
- if isinstance(datasetRefOrType, DatasetType):
- externalDatasetType = datasetRefOrType
- else:
- internalDatasetType = self._registry.getDatasetType(datasetRefOrType)
-
- # Check that they are self-consistent
- if externalDatasetType is not None:
- internalDatasetType = self._registry.getDatasetType(externalDatasetType.name)
- if externalDatasetType != internalDatasetType:
- # We can allow differences if they are compatible, depending
- # on whether this is a get or a put. A get requires that
- # the python type associated with the datastore can be
- # converted to the user type. A put requires that the user
- # supplied python type can be converted to the internal
- # type expected by registry.
- relevantDatasetType = internalDatasetType
- if for_put:
- is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
- else:
- is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
- relevantDatasetType = externalDatasetType
- if not is_compatible:
- raise ValueError(
- f"Supplied dataset type ({externalDatasetType}) inconsistent with "
- f"registry definition ({internalDatasetType})"
- )
- # Override the internal definition.
- internalDatasetType = relevantDatasetType
-
- assert internalDatasetType is not None
- return internalDatasetType, dataId
-
- def _rewrite_data_id(
- self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
- ) -> tuple[DataId | None, dict[str, Any]]:
- """Rewrite a data ID taking into account dimension records.
-
- Take a Data ID and keyword args and rewrite it if necessary to
- allow the user to specify dimension records rather than dimension
- primary values.
-
- This allows a user to include a dataId dict with keys of
- ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
- the integer exposure ID. It also allows a string to be given
- for a dimension value rather than the integer ID if that is more
- convenient. For example, rather than having to specifying the
- detector with ``detector.full_name``, a string given for ``detector``
- will be interpreted as the full name and converted to the integer
- value.
-
- Keyword arguments can also use strings for dimensions like detector
- and exposure but python does not allow them to include ``.`` and
- so the ``exposure.day_obs`` syntax can not be used in a keyword
- argument.
-
- Parameters
- ----------
- dataId : `dict` or `DataCoordinate`
- A `dict` of `Dimension` link name, value pairs that will label the
- `DatasetRef` within a Collection.
- datasetType : `DatasetType`
- The dataset type associated with this dataId. Required to
- determine the relevant dimensions.
- **kwargs
- Additional keyword arguments used to augment or construct a
- `DataId`. See `DataId` parameters.
-
- Returns
- -------
- dataId : `dict` or `DataCoordinate`
- The, possibly rewritten, dataId. If given a `DataCoordinate` and
- no keyword arguments, the original dataId will be returned
- unchanged.
- **kwargs : `dict`
- Any unused keyword arguments (would normally be empty dict).
+ See `~lsst.daf.butler.ButlerRepoIndex` for details on how the
+ information is discovered.
"""
- # Do nothing if we have a standalone DataCoordinate.
- if isinstance(dataId, DataCoordinate) and not kwargs:
- return dataId, kwargs
-
- # Process dimension records that are using record information
- # rather than ids
- newDataId: dict[str, DataIdValue] = {}
- byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
-
- # if all the dataId comes from keyword parameters we do not need
- # to do anything here because they can't be of the form
- # exposure.obs_id because a "." is not allowed in a keyword parameter.
- if dataId:
- for k, v in dataId.items():
- # If we have a Dimension we do not need to do anything
- # because it cannot be a compound key.
- if isinstance(k, str) and "." in k:
- # Someone is using a more human-readable dataId
- dimensionName, record = k.split(".", 1)
- byRecord[dimensionName][record] = v
- elif isinstance(k, Dimension):
- newDataId[k.name] = v
- else:
- newDataId[k] = v
-
- # Go through the updated dataId and check the type in case someone is
- # using an alternate key. We have already filtered out the compound
- # keys dimensions.record format.
- not_dimensions = {}
-
- # Will need to look in the dataId and the keyword arguments
- # and will remove them if they need to be fixed or are unrecognized.
- for dataIdDict in (newDataId, kwargs):
- # Use a list so we can adjust the dict safely in the loop
- for dimensionName in list(dataIdDict):
- value = dataIdDict[dimensionName]
- try:
- dimension = self.dimensions.getStaticDimensions()[dimensionName]
- except KeyError:
- # This is not a real dimension
- not_dimensions[dimensionName] = value
- del dataIdDict[dimensionName]
- continue
-
- # Convert an integral type to an explicit int to simplify
- # comparisons here
- if isinstance(value, numbers.Integral):
- value = int(value)
-
- if not isinstance(value, dimension.primaryKey.getPythonType()):
- for alternate in dimension.alternateKeys:
- if isinstance(value, alternate.getPythonType()):
- byRecord[dimensionName][alternate.name] = value
- del dataIdDict[dimensionName]
- log.debug(
- "Converting dimension %s to %s.%s=%s",
- dimensionName,
- dimensionName,
- alternate.name,
- value,
- )
- break
- else:
- log.warning(
- "Type mismatch found for value '%r' provided for dimension %s. "
- "Could not find matching alternative (primary key has type %s) "
- "so attempting to use as-is.",
- value,
- dimensionName,
- dimension.primaryKey.getPythonType(),
- )
-
- # By this point kwargs and newDataId should only include valid
- # dimensions. Merge kwargs in to the new dataId and log if there
- # are dimensions in both (rather than calling update).
- for k, v in kwargs.items():
- if k in newDataId and newDataId[k] != v:
- log.debug(
- "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
- )
- newDataId[k] = v
- # No need to retain any values in kwargs now.
- kwargs = {}
-
- # If we have some unrecognized dimensions we have to try to connect
- # them to records in other dimensions. This is made more complicated
- # by some dimensions having records with clashing names. A mitigation
- # is that we can tell by this point which dimensions are missing
- # for the DatasetType but this does not work for calibrations
- # where additional dimensions can be used to constrain the temporal
- # axis.
- if not_dimensions:
- # Search for all dimensions even if we have been given a value
- # explicitly. In some cases records are given as well as the
- # actually dimension and this should not be an error if they
- # match.
- mandatoryDimensions = datasetType.dimensions.names # - provided
-
- candidateDimensions: set[str] = set()
- candidateDimensions.update(mandatoryDimensions)
-
- # For calibrations we may well be needing temporal dimensions
- # so rather than always including all dimensions in the scan
- # restrict things a little. It is still possible for there
- # to be confusion over day_obs in visit vs exposure for example.
- # If we are not searching calibration collections things may
- # fail but they are going to fail anyway because of the
- # ambiguousness of the dataId...
- if datasetType.isCalibration():
- for dim in self.dimensions.getStaticDimensions():
- if dim.temporal:
- candidateDimensions.add(str(dim))
-
- # Look up table for the first association with a dimension
- guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
-
- # Keep track of whether an item is associated with multiple
- # dimensions.
- counter: Counter[str] = Counter()
- assigned: dict[str, set[str]] = defaultdict(set)
-
- # Go through the missing dimensions and associate the
- # given names with records within those dimensions
- matched_dims = set()
- for dimensionName in candidateDimensions:
- dimension = self.dimensions.getStaticDimensions()[dimensionName]
- fields = dimension.metadata.names | dimension.uniqueKeys.names
- for field in not_dimensions:
- if field in fields:
- guessedAssociation[dimensionName][field] = not_dimensions[field]
- counter[dimensionName] += 1
- assigned[field].add(dimensionName)
- matched_dims.add(field)
-
- # Calculate the fields that matched nothing.
- never_found = set(not_dimensions) - matched_dims
-
- if never_found:
- raise ValueError(f"Unrecognized keyword args given: {never_found}")
-
- # There is a chance we have allocated a single dataId item
- # to multiple dimensions. Need to decide which should be retained.
- # For now assume that the most popular alternative wins.
- # This means that day_obs with seq_num will result in
- # exposure.day_obs and not visit.day_obs
- # Also prefer an explicitly missing dimension over an inferred
- # temporal dimension.
- for fieldName, assignedDimensions in assigned.items():
- if len(assignedDimensions) > 1:
- # Pick the most popular (preferring mandatory dimensions)
- requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
- if requiredButMissing:
- candidateDimensions = requiredButMissing
- else:
- candidateDimensions = assignedDimensions
-
- # If this is a choice between visit and exposure and
- # neither was a required part of the dataset type,
- # (hence in this branch) always prefer exposure over
- # visit since exposures are always defined and visits
- # are defined from exposures.
- if candidateDimensions == {"exposure", "visit"}:
- candidateDimensions = {"exposure"}
-
- # Select the relevant items and get a new restricted
- # counter.
- theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
- duplicatesCounter: Counter[str] = Counter()
- duplicatesCounter.update(theseCounts)
-
- # Choose the most common. If they are equally common
- # we will pick the one that was found first.
- # Returns a list of tuples
- selected = duplicatesCounter.most_common(1)[0][0]
-
- log.debug(
- "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
- " Removed ambiguity by choosing dimension %s.",
- fieldName,
- ", ".join(assignedDimensions),
- selected,
- )
-
- for candidateDimension in assignedDimensions:
- if candidateDimension != selected:
- del guessedAssociation[candidateDimension][fieldName]
-
- # Update the record look up dict with the new associations
- for dimensionName, values in guessedAssociation.items():
- if values: # A dict might now be empty
- log.debug("Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values)
- byRecord[dimensionName].update(values)
-
- if byRecord:
- # Some record specifiers were found so we need to convert
- # them to the Id form
- for dimensionName, values in byRecord.items():
- if dimensionName in newDataId:
- log.debug(
- "DataId specified explicit %s dimension value of %s in addition to"
- " general record specifiers for it of %s. Ignoring record information.",
- dimensionName,
- newDataId[dimensionName],
- str(values),
- )
- # Get the actual record and compare with these values.
- try:
- recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
- except DataIdError:
- raise ValueError(
- f"Could not find dimension '{dimensionName}'"
- f" with dataId {newDataId} as part of comparing with"
- f" record values {byRecord[dimensionName]}"
- ) from None
- if len(recs) == 1:
- errmsg: list[str] = []
- for k, v in values.items():
- if (recval := getattr(recs[0], k)) != v:
- errmsg.append(f"{k}({recval} != {v})")
- if errmsg:
- raise ValueError(
- f"Dimension {dimensionName} in dataId has explicit value"
- " inconsistent with records: " + ", ".join(errmsg)
- )
- else:
- # Multiple matches for an explicit dimension
- # should never happen but let downstream complain.
- pass
- continue
-
- # Build up a WHERE expression
- bind = dict(values.items())
- where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
-
- # Hopefully we get a single record that matches
- records = set(
- self._registry.queryDimensionRecords(
- dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
- )
- )
-
- if len(records) != 1:
- if len(records) > 1:
- # visit can have an ambiguous answer without involving
- # visit_system. The default visit_system is defined
- # by the instrument.
- if (
- dimensionName == "visit"
- and "visit_system_membership" in self.dimensions
- and "visit_system" in self.dimensions["instrument"].metadata
- ):
- instrument_records = list(
- self._registry.queryDimensionRecords(
- "instrument",
- dataId=newDataId,
- **kwargs,
- )
- )
- if len(instrument_records) == 1:
- visit_system = instrument_records[0].visit_system
- if visit_system is None:
- # Set to a value that will never match.
- visit_system = -1
-
- # Look up each visit in the
- # visit_system_membership records.
- for rec in records:
- membership = list(
- self._registry.queryDimensionRecords(
- # Use bind to allow zero results.
- # This is a fully-specified query.
- "visit_system_membership",
- where="instrument = inst AND visit_system = system AND visit = v",
- bind=dict(
- inst=instrument_records[0].name, system=visit_system, v=rec.id
- ),
- )
- )
- if membership:
- # This record is the right answer.
- records = {rec}
- break
-
- # The ambiguity may have been resolved so check again.
- if len(records) > 1:
- log.debug("Received %d records from constraints of %s", len(records), str(values))
- for r in records:
- log.debug("- %s", str(r))
- raise ValueError(
- f"DataId specification for dimension {dimensionName} is not"
- f" uniquely constrained to a single dataset by {values}."
- f" Got {len(records)} results."
- )
- else:
- raise ValueError(
- f"DataId specification for dimension {dimensionName} matched no"
- f" records when constrained by {values}"
- )
-
- # Get the primary key from the real dimension object
- dimension = self.dimensions.getStaticDimensions()[dimensionName]
- if not isinstance(dimension, Dimension):
- raise RuntimeError(
- f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
- )
- newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
-
- return newDataId, kwargs
-
- def _findDatasetRef(
- self,
- datasetRefOrType: DatasetRef | DatasetType | str,
- dataId: DataId | None = None,
- *,
- collections: Any = None,
- predict: bool = False,
- run: str | None = None,
- datastore_records: bool = False,
- **kwargs: Any,
- ) -> DatasetRef:
- """Shared logic for methods that start with a search for a dataset in
- the registry.
-
- Parameters
- ----------
- datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
- When `DatasetRef` the `dataId` should be `None`.
- Otherwise the `DatasetType` or name thereof.
- dataId : `dict` or `DataCoordinate`, optional
- A `dict` of `Dimension` link name, value pairs that label the
- `DatasetRef` within a Collection. When `None`, a `DatasetRef`
- should be provided as the first argument.
- collections : Any, optional
- Collections to be searched, overriding ``self.collections``.
- Can be any of the types supported by the ``collections`` argument
- to butler construction.
- predict : `bool`, optional
- If `True`, return a newly created `DatasetRef` with a unique
- dataset ID if finding a reference in the `Registry` fails.
- Defaults to `False`.
- run : `str`, optional
- Run collection name to use for creating `DatasetRef` for predicted
- datasets. Only used if ``predict`` is `True`.
- datastore_records : `bool`, optional
- If `True` add datastore records to returned `DatasetRef`.
- **kwargs
- Additional keyword arguments used to augment or construct a
- `DataId`. See `DataId` parameters.
+ return ButlerRepoIndex.get_known_repos()
- Returns
- -------
- ref : `DatasetRef`
- A reference to the dataset identified by the given arguments.
- This can be the same dataset reference as given if it was
- resolved.
+ @abstractmethod
+ def transaction(self) -> AbstractContextManager[None]:
+ """Context manager supporting `Butler` transactions.
- Raises
- ------
- LookupError
- Raised if no matching dataset exists in the `Registry` (and
- ``predict`` is `False`).
- ValueError
- Raised if a resolved `DatasetRef` was passed as an input, but it
- differs from the one found in the registry.
- TypeError
- Raised if no collections were provided.
+ Transactions can be nested.
"""
- datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
- if isinstance(datasetRefOrType, DatasetRef):
- if collections is not None:
- warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
- # May need to retrieve datastore records if requested.
- if datastore_records and datasetRefOrType._datastore_records is None:
- datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
- return datasetRefOrType
- timespan: Timespan | None = None
-
- dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
-
- if datasetType.isCalibration():
- # Because this is a calibration dataset, first try to make a
- # standardize the data ID without restricting the dimensions to
- # those of the dataset type requested, because there may be extra
- # dimensions that provide temporal information for a validity-range
- # lookup.
- dataId = DataCoordinate.standardize(
- dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
- )
- if dataId.graph.temporal:
- dataId = self._registry.expandDataId(dataId)
- timespan = dataId.timespan
- else:
- # Standardize the data ID to just the dimensions of the dataset
- # type instead of letting registry.findDataset do it, so we get the
- # result even if no dataset is found.
- dataId = DataCoordinate.standardize(
- dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs
- )
- # Always lookup the DatasetRef, even if one is given, to ensure it is
- # present in the current collection.
- ref = self._registry.findDataset(
- datasetType,
- dataId,
- collections=collections,
- timespan=timespan,
- datastore_records=datastore_records,
- )
- if ref is None:
- if predict:
- if run is None:
- run = self.run
- if run is None:
- raise TypeError("Cannot predict dataset ID/location with run=None.")
- return DatasetRef(datasetType, dataId, run=run)
- else:
- if collections is None:
- collections = self._registry.defaults.collections
- raise LookupError(
- f"Dataset {datasetType.name} with data ID {dataId} "
- f"could not be found in collections {collections}."
- )
- if datasetType != ref.datasetType:
- # If they differ it is because the user explicitly specified
- # a compatible dataset type to this call rather than using the
- # registry definition. The DatasetRef must therefore be recreated
- # using the user definition such that the expected type is
- # returned.
- ref = DatasetRef(
- datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
- )
+ raise NotImplementedError()
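A generator-based context manager satisfies the ``AbstractContextManager[None]`` return annotation, which is how the removed ``DirectButler`` code below implements it. A generic stand-in sketch (``Demo`` and ``DemoImpl`` are hypothetical names, not part of the package):

```python
import contextlib
from collections.abc import Iterator
from contextlib import AbstractContextManager


class Demo:
    def transaction(self) -> AbstractContextManager[None]:
        raise NotImplementedError()


class DemoImpl(Demo):
    @contextlib.contextmanager
    def transaction(self) -> Iterator[None]:
        # Open registry/datastore transactions here (illustrative);
        # commit on a clean exit, roll back on an exception.
        yield


with DemoImpl().transaction():
    pass
```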
- return ref
-
- # TODO: remove on DM-40067.
- @transactional
- @deprecated(
- reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
- " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
- " were relying on the run parameter to determine the run."
- " Will be removed after v26.0.",
- version="v26.0",
- category=FutureWarning,
- )
- def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
- # Docstring inherited.
- return self.put(obj, ref)
-
- @transactional
+ @abstractmethod
def put(
self,
obj: Any,
@@ -1182,127 +533,9 @@ def put(
TypeError
Raised if the butler is read-only or if no run has been provided.
"""
- if isinstance(datasetRefOrType, DatasetRef):
- # This is a direct put of predefined DatasetRef.
- log.debug("Butler put direct: %s", datasetRefOrType)
- if run is not None:
- warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
- # If registry already has a dataset with the same dataset ID,
- # dataset type and DataId, then _importDatasets will do nothing and
- # just return an original ref. We have to raise in this case, there
- # is a datastore check below for that.
- self._registry._importDatasets([datasetRefOrType], expand=True)
- # Before trying to write to the datastore check that it does not
- # know this dataset. This is prone to races, of course.
- if self._datastore.knows(datasetRefOrType):
- raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
- # Try to write dataset to the datastore, if it fails due to a race
- # with another write, the content of stored data may be
- # unpredictable.
- try:
- self._datastore.put(obj, datasetRefOrType)
- except IntegrityError as e:
- raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
- return datasetRefOrType
-
- log.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
- if not self.isWriteable():
- raise TypeError("Butler is read-only.")
- datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
-
- # Handle dimension records in dataId
- dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
-
- # Add Registry Dataset entry.
- dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
- (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
- self._datastore.put(obj, ref)
-
- return ref
-
- # TODO: remove on DM-40067.
- @deprecated(
- reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
- " Please use Butler.get(). Will be removed after v26.0.",
- version="v26.0",
- category=FutureWarning,
- )
- def getDirect(
- self,
- ref: DatasetRef,
- *,
- parameters: dict[str, Any] | None = None,
- storageClass: StorageClass | str | None = None,
- ) -> Any:
- """Retrieve a stored dataset.
-
- Parameters
- ----------
- ref : `DatasetRef`
- Resolved reference to an already stored dataset.
- parameters : `dict`
- Additional StorageClass-defined options to control reading,
- typically used to efficiently read only a subset of the dataset.
- storageClass : `StorageClass` or `str`, optional
- The storage class to be used to override the Python type
- returned by this method. By default the returned type matches
- the dataset type definition for this dataset. Specifying a
- read `StorageClass` can force a different type to be returned.
- This type must be compatible with the original type.
-
- Returns
- -------
- obj : `object`
- The dataset.
- """
- return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
-
- # TODO: remove on DM-40067.
- @deprecated(
- reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
- "Please use Butler.getDeferred(). Will be removed after v26.0.",
- version="v26.0",
- category=FutureWarning,
- )
- def getDirectDeferred(
- self,
- ref: DatasetRef,
- *,
- parameters: dict | None = None,
- storageClass: str | StorageClass | None = None,
- ) -> DeferredDatasetHandle:
- """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
- from a resolved `DatasetRef`.
-
- Parameters
- ----------
- ref : `DatasetRef`
- Resolved reference to an already stored dataset.
- parameters : `dict`
- Additional StorageClass-defined options to control reading,
- typically used to efficiently read only a subset of the dataset.
- storageClass : `StorageClass` or `str`, optional
- The storage class to be used to override the Python type
- returned by this method. By default the returned type matches
- the dataset type definition for this dataset. Specifying a
- read `StorageClass` can force a different type to be returned.
- This type must be compatible with the original type.
-
- Returns
- -------
- obj : `DeferredDatasetHandle`
- A handle which can be used to retrieve a dataset at a later time.
-
- Raises
- ------
- LookupError
- Raised if no matching dataset exists in the `Registry`.
- """
- # Check that dataset is known to the datastore.
- if not self._datastore.knows(ref):
- raise LookupError(f"Dataset reference {ref} is not known to datastore.")
- return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
+ raise NotImplementedError()
+ @abstractmethod
def getDeferred(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
@@ -1359,19 +592,9 @@ def getDeferred(
TypeError
Raised if no collections were provided.
"""
- if isinstance(datasetRefOrType, DatasetRef):
- # Do the quick check first and if that fails, check for artifact
- # existence. This is necessary for datastores that are configured
- # in trust mode where there won't be a record but there will be
- # a file.
- if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
- ref = datasetRefOrType
- else:
- raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
- else:
- ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
- return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
+ raise NotImplementedError()
+ @abstractmethod
def get(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
@@ -1436,12 +659,9 @@ def get(
fetched with a ``{instrument, detector, exposure}`` data ID, because
``exposure`` is a temporal dimension.
"""
- log.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
- ref = self._findDatasetRef(
- datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
- )
- return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
+ raise NotImplementedError()
+ @abstractmethod
def getURIs(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
@@ -1486,11 +706,9 @@ def getURIs(
`None`), and the URIs to any components associated with the dataset
artifact. (can be empty if there are no components).
"""
- ref = self._findDatasetRef(
- datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
- )
- return self._datastore.getURIs(ref, predict)
+ raise NotImplementedError()
+ @abstractmethod
def getURI(
self,
datasetRefOrType: DatasetRef | DatasetType | str,
@@ -1552,17 +770,9 @@ def getURI(
Raised if a URI is requested for a dataset that consists of
multiple artifacts.
"""
- primary, components = self.getURIs(
- datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
- )
-
- if primary is None or components:
- raise RuntimeError(
- f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
- "Use Butler.getURIs() instead."
- )
- return primary
+ raise NotImplementedError()
+ @abstractmethod
def retrieveArtifacts(
self,
refs: Iterable[DatasetRef],
@@ -1606,14 +816,9 @@ def retrieveArtifacts(
a hierarchical data structure in a NoSQL database may well be stored
as a JSON file.
"""
- return self._datastore.retrieveArtifacts(
- refs,
- ResourcePath(destination),
- transfer=transfer,
- preserve_path=preserve_path,
- overwrite=overwrite,
- )
+ raise NotImplementedError()
+ @abstractmethod
def exists(
self,
dataset_ref_or_type: DatasetRef | DatasetType | str,
@@ -1658,49 +863,9 @@ def exists(
datastore. Evaluates to `True` if the dataset is present and known
to both.
"""
- existence = DatasetExistence.UNRECOGNIZED
-
- if isinstance(dataset_ref_or_type, DatasetRef):
- if collections is not None:
- warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
- if data_id is not None:
- warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
- ref = dataset_ref_or_type
- registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
- if registry_ref is not None:
- existence |= DatasetExistence.RECORDED
-
- if dataset_ref_or_type != registry_ref:
- # This could mean that storage classes differ, so we should
- # check for that but use the registry ref for the rest of
- # the method.
- if registry_ref.is_compatible_with(dataset_ref_or_type):
- # Use the registry version from now on.
- ref = registry_ref
- else:
- raise ValueError(
- f"The ref given to exists() ({ref}) has the same dataset ID as one "
- f"in registry but has different incompatible values ({registry_ref})."
- )
- else:
- try:
- ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
- except (LookupError, TypeError, NoDefaultCollectionError):
- return existence
- existence |= DatasetExistence.RECORDED
-
- if self._datastore.knows(ref):
- existence |= DatasetExistence.DATASTORE
-
- if full_check:
- if self._datastore.exists(ref):
- existence |= DatasetExistence._ARTIFACT
- elif existence.value != DatasetExistence.UNRECOGNIZED.value:
- # Do not add this flag if we have no other idea about a dataset.
- existence |= DatasetExistence(DatasetExistence._ASSUMED)
-
- return existence
+ raise NotImplementedError()
+ @abstractmethod
def _exists_many(
self,
refs: Iterable[DatasetRef],
@@ -1732,97 +897,9 @@ def _exists_many(
Each value evaluates to `True` if the dataset is present and known
to both.
"""
- existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
-
- # Registry does not have a bulk API to check for a ref.
- for ref in refs:
- registry_ref = self._registry.getDataset(ref.id)
- if registry_ref is not None:
- # It is possible, albeit unlikely, that the given ref does
- # not match the one in registry even though the UUID matches.
- # When checking a single ref we raise, but it's impolite to
- # do that when potentially hundreds of refs are being checked.
- # We could change the API to only accept UUIDs and that would
- # remove the ability to even check and remove the worry
- # about differing storage classes. Given the ongoing discussion
- # on refs vs UUIDs and whether to raise or have a new
- # private flag, treat this as a private API for now.
- existence[ref] |= DatasetExistence.RECORDED
-
- # Ask datastore if it knows about these refs.
- knows = self._datastore.knows_these(refs)
- for ref, known in knows.items():
- if known:
- existence[ref] |= DatasetExistence.DATASTORE
-
- if full_check:
- mexists = self._datastore.mexists(refs)
- for ref, exists in mexists.items():
- if exists:
- existence[ref] |= DatasetExistence._ARTIFACT
- else:
- # Do not set this flag if nothing is known about the dataset.
- for ref in existence:
- if existence[ref] != DatasetExistence.UNRECOGNIZED:
- existence[ref] |= DatasetExistence._ASSUMED
-
- return existence
-
- # TODO: remove on DM-40079.
- @deprecated(
- reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.",
- version="v26.0",
- category=FutureWarning,
- )
- def datasetExists(
- self,
- datasetRefOrType: DatasetRef | DatasetType | str,
- dataId: DataId | None = None,
- *,
- collections: Any = None,
- **kwargs: Any,
- ) -> bool:
- """Return True if the Dataset is actually present in the Datastore.
-
- Parameters
- ----------
- datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
- When `DatasetRef` the `dataId` should be `None`.
- Otherwise the `DatasetType` or name thereof.
- dataId : `dict` or `DataCoordinate`
- A `dict` of `Dimension` link name, value pairs that label the
- `DatasetRef` within a Collection. When `None`, a `DatasetRef`
- should be provided as the first argument.
- collections : Any, optional
- Collections to be searched, overriding ``self.collections``.
- Can be any of the types supported by the ``collections`` argument
- to butler construction.
- **kwargs
- Additional keyword arguments used to augment or construct a
- `DataCoordinate`. See `DataCoordinate.standardize`
- parameters.
-
- Raises
- ------
- LookupError
- Raised if the dataset is not even present in the Registry.
- ValueError
- Raised if a resolved `DatasetRef` was passed as an input, but it
- differs from the one found in the registry.
- NoDefaultCollectionError
- Raised if no collections were provided.
- """
- # A resolved ref may be given that is not known to this butler.
- if isinstance(datasetRefOrType, DatasetRef):
- ref = self._registry.getDataset(datasetRefOrType.id)
- if ref is None:
- raise LookupError(
- f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
- )
- else:
- ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
- return self._datastore.exists(ref)
+ raise NotImplementedError()
+ @abstractmethod
def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
"""Remove one or more `~CollectionType.RUN` collections and the
datasets within them.
@@ -1844,92 +921,9 @@ def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
Raised if one or more collections are not of type
`~CollectionType.RUN`.
"""
- if not self.isWriteable():
- raise TypeError("Butler is read-only.")
- names = list(names)
- refs: list[DatasetRef] = []
- for name in names:
- collectionType = self._registry.getCollectionType(name)
- if collectionType is not CollectionType.RUN:
- raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
- refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
- with self._datastore.transaction(), self._registry.transaction():
- if unstore:
- self._datastore.trash(refs)
- else:
- self._datastore.forget(refs)
- for name in names:
- self._registry.removeCollection(name)
- if unstore:
- # Point of no return for removing artifacts
- self._datastore.emptyTrash()
-
- def pruneDatasets(
- self,
- refs: Iterable[DatasetRef],
- *,
- disassociate: bool = True,
- unstore: bool = False,
- tags: Iterable[str] = (),
- purge: bool = False,
- ) -> None:
- # docstring inherited from LimitedButler
-
- if not self.isWriteable():
- raise TypeError("Butler is read-only.")
- if purge:
- if not disassociate:
- raise TypeError("Cannot pass purge=True without disassociate=True.")
- if not unstore:
- raise TypeError("Cannot pass purge=True without unstore=True.")
- elif disassociate:
- tags = tuple(tags)
- if not tags:
- raise TypeError("No tags provided but disassociate=True.")
- for tag in tags:
- collectionType = self._registry.getCollectionType(tag)
- if collectionType is not CollectionType.TAGGED:
- raise TypeError(
- f"Cannot disassociate from collection '{tag}' "
- f"of non-TAGGED type {collectionType.name}."
- )
- # Transform possibly-single-pass iterable into something we can iterate
- # over multiple times.
- refs = list(refs)
- # Pruning a component of a DatasetRef makes no sense since registry
- # doesn't know about components and datastore might not store
- # components in a separate file
- for ref in refs:
- if ref.datasetType.component():
- raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
- # We don't need an unreliable Datastore transaction for this, because
- # we've been extra careful to ensure that Datastore.trash only involves
- # mutating the Registry (it can _look_ at Datastore-specific things,
- # but shouldn't change them), and hence all operations here are
- # Registry operations.
- with self._datastore.transaction(), self._registry.transaction():
- if unstore:
- self._datastore.trash(refs)
- if purge:
- self._registry.removeDatasets(refs)
- elif disassociate:
- assert tags, "Guaranteed by earlier logic in this function."
- for tag in tags:
- self._registry.disassociate(tag, refs)
- # We've exited the Registry transaction, and apparently committed.
- # (if there was an exception, everything rolled back, and it's as if
- # nothing happened - and we never get here).
- # Datastore artifacts are not yet gone, but they're clearly marked
- # as trash, so if we fail to delete now because of (e.g.) filesystem
- # problems we can try again later, and if manual administrative
- # intervention is required, it's pretty clear what that should entail:
- # deleting everything on disk and in private Datastore tables that is
- # in the dataset_location_trash table.
- if unstore:
- # Point of no return for removing artifacts
- self._datastore.emptyTrash()
-
- @transactional
+ raise NotImplementedError()
+
+ @abstractmethod
def ingest(
self,
*datasets: FileDataset,
@@ -1998,122 +992,9 @@ def ingest(
filesystem operations as well, but this cannot be implemented
rigorously for most datastores.
"""
- if not self.isWriteable():
- raise TypeError("Butler is read-only.")
-
- log.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
- if not datasets:
- return
-
- if idGenerationMode is not None:
- warnings.warn(
- "The idGenerationMode parameter is no longer used and is ignored. "
- " Will be removed after v26.0",
- FutureWarning,
- stacklevel=2,
- )
+ raise NotImplementedError()
- progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
-
- # We need to reorganize all the inputs so that they are grouped
- # by dataset type and run. Multiple refs in a single FileDataset
- # are required to share the run and dataset type.
- GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]]
- groupedData: GroupedData = defaultdict(list)
-
- # Track DataIDs that are being ingested so we can spot issues early
- # with duplication. Retain previous FileDataset so we can report it.
- groupedDataIds: MutableMapping[
- tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
- ] = defaultdict(dict)
-
- used_run = False
-
- # And the nested loop that populates it:
- for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
- # Somewhere to store pre-existing refs if we have an
- # execution butler.
- existingRefs: list[DatasetRef] = []
-
- for ref in dataset.refs:
- assert ref.run is not None # For mypy
- group_key = (ref.datasetType, ref.run)
-
- if ref.dataId in groupedDataIds[group_key]:
- raise ConflictingDefinitionError(
- f"Ingest conflict. Dataset {dataset.path} has same"
- " DataId as other ingest dataset"
- f" {groupedDataIds[group_key][ref.dataId].path} "
- f" ({ref.dataId})"
- )
-
- groupedDataIds[group_key][ref.dataId] = dataset
-
- if existingRefs:
- if len(dataset.refs) != len(existingRefs):
- # Keeping track of partially pre-existing datasets is hard
- # and should generally never happen. For now don't allow
- # it.
- raise ConflictingDefinitionError(
- f"For dataset {dataset.path} some dataIds already exist"
- " in registry but others do not. This is not supported."
- )
-
- # Store expanded form in the original FileDataset.
- dataset.refs = existingRefs
- else:
- groupedData[group_key].append(dataset)
-
- if not used_run and run is not None:
- warnings.warn(
- "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
- f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
- category=FutureWarning,
- stacklevel=3, # Take into account the @transactional decorator.
- )
-
- # Now we can bulk-insert into Registry for each DatasetType.
- for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
- groupedData.items(), desc="Bulk-inserting datasets by type"
- ):
- refs_to_import = []
- for dataset in grouped_datasets:
- refs_to_import.extend(dataset.refs)
-
- n_refs = len(refs_to_import)
- log.verbose(
- "Importing %d ref%s of dataset type %r into run %r",
- n_refs,
- "" if n_refs == 1 else "s",
- datasetType.name,
- this_run,
- )
-
- # Import the refs and expand the DataCoordinates since we can't
- # guarantee that they are expanded and Datastore will need
- # the records.
- imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
- assert set(imported_refs) == set(refs_to_import)
-
- # Replace all the refs in the FileDataset with expanded versions.
- # Pull them off in the order we put them on the list.
- for dataset in grouped_datasets:
- n_dataset_refs = len(dataset.refs)
- dataset.refs = imported_refs[:n_dataset_refs]
- del imported_refs[:n_dataset_refs]
-
- # Bulk-insert everything into Datastore.
- # We do not know if any of the registry entries already existed
- # (_importDatasets only complains if they exist but differ) so
- # we have to catch IntegrityError explicitly.
- try:
- self._datastore.ingest(
- *datasets, transfer=transfer, record_validation_info=record_validation_info
- )
- except IntegrityError as e:
- raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
-
- @contextlib.contextmanager
+ @abstractmethod
def export(
self,
*,
@@ -2121,7 +1002,7 @@ def export(
filename: str | None = None,
format: str | None = None,
transfer: str | None = None,
- ) -> Iterator[RepoExportContext]:
+ ) -> AbstractContextManager[RepoExportContext]:
"""Export datasets from the repository represented by this `Butler`.
This method is a context manager that returns a helper object
@@ -2165,38 +1046,9 @@ def export(
# their associated data ID information.
export.saveDatasets(butler.registry.queryDatasets("deepCoadd_*"))
"""
- if directory is None and transfer is not None:
- raise TypeError("Cannot transfer without providing a directory.")
- if transfer == "move":
- raise TypeError("Transfer may not be 'move': export is read-only")
- if format is None:
- if filename is None:
- raise TypeError("At least one of 'filename' or 'format' must be provided.")
- else:
- _, format = os.path.splitext(filename)
- if not format:
- raise ValueError("Please specify a file extension to determine export format.")
- format = format[1:] # Strip leading ".""
- elif filename is None:
- filename = f"export.{format}"
- if directory is not None:
- filename = os.path.join(directory, filename)
- formats = self._config["repo_transfer_formats"]
- if format not in formats:
- raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
- BackendClass = get_class_of(formats[format, "export"])
- with open(filename, "w") as stream:
- backend = BackendClass(stream, universe=self.dimensions)
- try:
- helper = RepoExportContext(
- self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
- )
- yield helper
- except BaseException:
- raise
- else:
- helper._finish()
+ raise NotImplementedError()
+
+ @abstractmethod
def import_(
self,
*,
@@ -2237,66 +1089,9 @@ def import_(
Raised if the set of arguments passed is inconsistent, or if the
butler is read-only.
"""
- if not self.isWriteable():
- raise TypeError("Butler is read-only.")
- if format is None:
- if filename is None:
- raise TypeError("At least one of 'filename' or 'format' must be provided.")
- else:
- _, format = os.path.splitext(filename) # type: ignore
- elif filename is None:
- filename = ResourcePath(f"export.{format}", forceAbsolute=False)
- if directory is not None:
- directory = ResourcePath(directory, forceDirectory=True)
- # mypy doesn't think this will work but it does in python >= 3.10.
- if isinstance(filename, ResourcePathExpression): # type: ignore
- filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
- if not filename.isabs() and directory is not None:
- potential = directory.join(filename)
- exists_in_cwd = filename.exists()
- exists_in_dir = potential.exists()
- if exists_in_cwd and exists_in_dir:
- log.warning(
- "A relative path for filename was specified (%s) which exists relative to cwd. "
- "Additionally, the file exists relative to the given search directory (%s). "
- "Using the export file in the given directory.",
- filename,
- potential,
- )
- # Given they specified an explicit directory and that
- # directory has the export file in it, assume that that
- # is what was meant despite the file in cwd.
- filename = potential
- elif exists_in_dir:
- filename = potential
- elif not exists_in_cwd and not exists_in_dir:
- # Raise early.
- raise FileNotFoundError(
- f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
- )
- BackendClass: type[RepoImportBackend] = get_class_of(
- self._config["repo_transfer_formats"][format]["import"]
- )
-
- def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
- backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
- backend.register()
- with self.transaction():
- backend.load(
- self._datastore,
- directory=directory,
- transfer=transfer,
- skip_dimensions=skip_dimensions,
- )
-
- if isinstance(filename, ResourcePath):
- # We can not use open() here at the moment because of
- # DM-38589 since yaml does stream.read(8192) in a loop.
- stream = io.StringIO(filename.read().decode())
- doImport(stream)
- else:
- doImport(filename) # type: ignore
+ raise NotImplementedError()
+
+ @abstractmethod
def transfer_from(
self,
source_butler: LimitedButler,
@@ -2305,7 +1100,7 @@ def transfer_from(
skip_missing: bool = True,
register_dataset_types: bool = False,
transfer_dimensions: bool = False,
- ) -> collections.abc.Collection[DatasetRef]:
+ ) -> Collection[DatasetRef]:
"""Transfer datasets to this Butler from a run in another Butler.
Parameters
@@ -2348,171 +1143,9 @@ def transfer_from(
This means that it is possible for a dataset type to be registered
even though transfer has failed.
"""
- if not self.isWriteable():
- raise TypeError("Butler is read-only.")
- progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
-
- # Will iterate through the refs multiple times so need to convert
- # to a list if this isn't a collection.
- if not isinstance(source_refs, collections.abc.Collection):
- source_refs = list(source_refs)
-
- original_count = len(source_refs)
- log.info("Transferring %d datasets into %s", original_count, str(self))
-
- # In some situations the datastore artifact may be missing
- # and we do not want that registry entry to be imported.
- # Asking datastore is not sufficient, the records may have been
- # purged, we have to ask for the (predicted) URI and check
- # existence explicitly. Execution butler is set up exactly like
- # this with no datastore records.
- artifact_existence: dict[ResourcePath, bool] = {}
- if skip_missing:
- dataset_existence = source_butler._datastore.mexists(
- source_refs, artifact_existence=artifact_existence
- )
- source_refs = [ref for ref, exists in dataset_existence.items() if exists]
- filtered_count = len(source_refs)
- n_missing = original_count - filtered_count
- log.verbose(
- "%d dataset%s removed because the artifact does not exist. Now have %d.",
- n_missing,
- "" if n_missing == 1 else "s",
- filtered_count,
- )
-
- # Importing requires that we group the refs by dataset type and run
- # before doing the import.
- source_dataset_types = set()
- grouped_refs = defaultdict(list)
- for ref in source_refs:
- grouped_refs[ref.datasetType, ref.run].append(ref)
- source_dataset_types.add(ref.datasetType)
-
- # Check to see if the dataset type in the source butler has
- # the same definition in the target butler and register missing
- # ones if requested. Registration must happen outside a transaction.
- newly_registered_dataset_types = set()
- for datasetType in source_dataset_types:
- if register_dataset_types:
- # Let this raise immediately if inconsistent. Continuing
- # on to find additional inconsistent dataset types
- # might result in additional unwanted dataset types being
- # registered.
- if self._registry.registerDatasetType(datasetType):
- newly_registered_dataset_types.add(datasetType)
- else:
- # If the dataset type is missing, let it fail immediately.
- target_dataset_type = self._registry.getDatasetType(datasetType.name)
- if target_dataset_type != datasetType:
- raise ConflictingDefinitionError(
- "Source butler dataset type differs from definition"
- f" in target butler: {datasetType} !="
- f" {target_dataset_type}"
- )
- if newly_registered_dataset_types:
- # We may have registered some even if there were inconsistencies
- # but should let people know (or else remove them again).
- log.log(
- VERBOSE,
- "Registered the following dataset types in the target Butler: %s",
- ", ".join(d.name for d in newly_registered_dataset_types),
- )
- else:
- log.log(VERBOSE, "All required dataset types are known to the target Butler")
-
- dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
- if transfer_dimensions:
- # Collect all the dimension records for these refs.
- # All dimensions are to be copied but the list of valid dimensions
- # come from this butler's universe.
- elements = frozenset(
- element
- for element in self.dimensions.getStaticElements()
- if element.hasTable() and element.viewOf is None
- )
- dataIds = {ref.dataId for ref in source_refs}
- # This logic comes from saveDataIds.
- for dataId in dataIds:
- # Need an expanded record, if not expanded that we need a full
- # butler with registry (allow mocks with registry too).
- if not dataId.hasRecords():
- if registry := getattr(source_butler, "registry", None):
- dataId = registry.expandDataId(dataId)
- else:
- raise TypeError("Input butler needs to be a full butler to expand DataId.")
- # If this butler doesn't know about a dimension in the source
- # butler things will break later.
- for record in dataId.records.values():
- if record is not None and record.definition in elements:
- dimension_records[record.definition].setdefault(record.dataId, record)
-
- handled_collections: set[str] = set()
-
- # Do all the importing in a single transaction.
- with self.transaction():
- if dimension_records:
- log.verbose("Ensuring that dimension records exist for transferred datasets.")
- for element, r in dimension_records.items():
- records = [r[dataId] for dataId in r]
- # Assume that if the record is already present that we can
- # use it without having to check that the record metadata
- # is consistent.
- self._registry.insertDimensionData(element, *records, skip_existing=True)
-
- n_imported = 0
- for (datasetType, run), refs_to_import in progress.iter_item_chunks(
- grouped_refs.items(), desc="Importing to registry by run and dataset type"
- ):
- if run not in handled_collections:
- # May need to create output collection. If source butler
- # has a registry, ask for documentation string.
- run_doc = None
- if registry := getattr(source_butler, "registry", None):
- run_doc = registry.getCollectionDocumentation(run)
- registered = self._registry.registerRun(run, doc=run_doc)
- handled_collections.add(run)
- if registered:
- log.log(VERBOSE, "Creating output run %s", run)
-
- n_refs = len(refs_to_import)
- log.verbose(
- "Importing %d ref%s of dataset type %s into run %s",
- n_refs,
- "" if n_refs == 1 else "s",
- datasetType.name,
- run,
- )
-
- # Assume we are using UUIDs and the source refs will match
- # those imported.
- imported_refs = self._registry._importDatasets(refs_to_import, expand=False)
- assert set(imported_refs) == set(refs_to_import)
- n_imported += len(imported_refs)
-
- assert len(source_refs) == n_imported
- log.verbose("Imported %d datasets into destination butler", n_imported)
-
- # Ask the datastore to transfer. The datastore has to check that
- # the source datastore is compatible with the target datastore.
- accepted, rejected = self._datastore.transfer_from(
- source_butler._datastore,
- source_refs,
- transfer=transfer,
- artifact_existence=artifact_existence,
- )
- if rejected:
- # For now, accept the registry entries but not the files.
- log.warning(
- "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
- len(rejected),
- len(accepted),
- datasetType,
- run,
- )
-
- return source_refs
+ raise NotImplementedError()
+
+ @abstractmethod
def validateConfiguration(
self,
logFailures: bool = False,
@@ -2543,141 +1176,26 @@ def validateConfiguration(
Raised if there is some inconsistency with how this Butler
is configured.
"""
- if datasetTypeNames:
- datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames]
- else:
- datasetTypes = list(self._registry.queryDatasetTypes())
-
- # filter out anything from the ignore list
- if ignore:
- ignore = set(ignore)
- datasetTypes = [
- e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
- ]
- else:
- ignore = set()
-
- # For each datasetType that has an instrument dimension, create
- # a DatasetRef for each defined instrument
- datasetRefs = []
-
- # Find all the registered instruments (if "instrument" is in the
- # universe).
- if "instrument" in self.dimensions:
- instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
-
- for datasetType in datasetTypes:
- if "instrument" in datasetType.dimensions:
- # In order to create a conforming dataset ref, create
- # fake DataCoordinate values for the non-instrument
- # dimensions. The type of the value does not matter here.
- dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"}
-
- for instrument in instruments:
- datasetRef = DatasetRef(
- datasetType,
- DataCoordinate.standardize(
- dataId, instrument=instrument, graph=datasetType.dimensions
- ),
- run="validate",
- )
- datasetRefs.append(datasetRef)
-
- entities: list[DatasetType | DatasetRef] = []
- entities.extend(datasetTypes)
- entities.extend(datasetRefs)
-
- datastoreErrorStr = None
- try:
- self._datastore.validateConfiguration(entities, logFailures=logFailures)
- except ValidationError as e:
- datastoreErrorStr = str(e)
-
- # Also check that the LookupKeys used by the datastores match
- # registry and storage class definitions
- keys = self._datastore.getLookupKeys()
-
- failedNames = set()
- failedDataId = set()
- for key in keys:
- if key.name is not None:
- if key.name in ignore:
- continue
-
- # skip if specific datasetType names were requested and this
- # name does not match
- if datasetTypeNames and key.name not in datasetTypeNames:
- continue
-
- # See if it is a StorageClass or a DatasetType
- if key.name in self.storageClasses:
- pass
- else:
- try:
- self._registry.getDatasetType(key.name)
- except KeyError:
- if logFailures:
- log.critical("Key '%s' does not correspond to a DatasetType or StorageClass", key)
- failedNames.add(key)
- else:
- # Dimensions are checked for consistency when the Butler
- # is created and rendezvoused with a universe.
- pass
-
- # Check that the instrument is a valid instrument
- # Currently only support instrument so check for that
- if key.dataId:
- dataIdKeys = set(key.dataId)
- if {"instrument"} != dataIdKeys:
- if logFailures:
- log.critical("Key '%s' has unsupported DataId override", key)
- failedDataId.add(key)
- elif key.dataId["instrument"] not in instruments:
- if logFailures:
- log.critical("Key '%s' has unknown instrument", key)
- failedDataId.add(key)
-
- messages = []
-
- if datastoreErrorStr:
- messages.append(datastoreErrorStr)
-
- for failed, msg in (
- (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
- (failedDataId, "Keys with bad DataId entries: "),
- ):
- if failed:
- msg += ", ".join(str(k) for k in failed)
- messages.append(msg)
-
- if messages:
- raise ValidationError(";\n".join(messages))
+ raise NotImplementedError()
@property
+ @abstractmethod
def collections(self) -> Sequence[str]:
"""The collections to search by default, in order
(`~collections.abc.Sequence` [ `str` ]).
-
- This is an alias for ``self.registry.defaults.collections``. It cannot
- be set directly in isolation, but all defaults may be changed together
- by assigning a new `RegistryDefaults` instance to
- ``self.registry.defaults``.
"""
- return self._registry.defaults.collections
+ raise NotImplementedError()
@property
+ @abstractmethod
def run(self) -> str | None:
"""Name of the run this butler writes outputs to by default (`str` or
`None`).
-
- This is an alias for ``self.registry.defaults.run``. It cannot be set
- directly in isolation, but all defaults may be changed together by
- assigning a new `RegistryDefaults` instance to
- ``self.registry.defaults``.
"""
- return self._registry.defaults.run
+ raise NotImplementedError()
@property
+ @abstractmethod
def registry(self) -> Registry:
"""The object that manages dataset metadata and relationships
(`Registry`).
@@ -2686,30 +1204,4 @@ def registry(self) -> Registry:
are accessible only via `Registry` methods. Eventually these methods
will be replaced by equivalent `Butler` methods.
"""
- return self._registry_shim
-
- @property
- def dimensions(self) -> DimensionUniverse:
- # Docstring inherited.
- return self._registry.dimensions
-
- _registry: _ButlerRegistry
- """The object that manages dataset metadata and relationships
- (`_ButlerRegistry`).
-
- Most operations that don't involve reading or writing butler datasets are
- accessible only via `Registry` methods.
- """
-
- datastore: Datastore
- """The object that manages actual dataset storage (`Datastore`).
-
- Direct user access to the datastore should rarely be necessary; the primary
- exception is the case where a `Datastore` implementation provides extra
- functionality beyond what the base class defines.
- """
-
- storageClasses: StorageClassFactory
- """An object that maps known storage class names to objects that fully
- describe them (`StorageClassFactory`).
- """
+ raise NotImplementedError()
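The hunks above turn the remaining concrete `Butler` methods into abstract declarations: the signatures and docstrings stay on the base class, each body becomes `raise NotImplementedError()`, and an `@abstractmethod` decorator is added. A minimal, self-contained sketch of that pattern, with purely illustrative names that are not part of this patch:

from abc import ABC, abstractmethod
from collections.abc import Iterable


class AbstractInterface(ABC):
    """Stand-in for the abstract base class."""

    @abstractmethod
    def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
        """Declared here; a concrete subclass supplies the behaviour."""
        raise NotImplementedError()


class ConcreteImpl(AbstractInterface):
    """Stand-in for the concrete implementation (cf. DirectButler below)."""

    def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
        print(f"removing runs {list(names)} (unstore={unstore})")


ConcreteImpl().removeRuns(["u/example/run"])  # fine: the abstract method is overridden
# AbstractInterface()  # would raise TypeError: can't instantiate an abstract class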
diff --git a/python/lsst/daf/butler/_quantum_backed.py b/python/lsst/daf/butler/_quantum_backed.py
index 5d3edb797d..fd33107e23 100644
--- a/python/lsst/daf/butler/_quantum_backed.py
+++ b/python/lsst/daf/butler/_quantum_backed.py
@@ -661,7 +661,7 @@ class QuantumProvenanceData(_BaseModelCompat):
def collect_and_transfer(
butler: Butler, quanta: Iterable[Quantum], provenance: Iterable[QuantumProvenanceData]
) -> None:
- """Transfer output datasets from multiple quanta to a more permantent
+ """Transfer output datasets from multiple quanta to a more permanent
`Butler` repository.
Parameters
diff --git a/python/lsst/daf/butler/_registry_shim.py b/python/lsst/daf/butler/_registry_shim.py
index 4ba989e829..67f50a16e1 100644
--- a/python/lsst/daf/butler/_registry_shim.py
+++ b/python/lsst/daf/butler/_registry_shim.py
@@ -54,7 +54,7 @@
from .registry.queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
if TYPE_CHECKING:
- from ._butler import Butler
+ from .direct_butler import DirectButler
from .registry._registry import CollectionArgType
from .registry.interfaces import ObsCoreTableManager
@@ -64,7 +64,7 @@ class RegistryShim(Registry):
Parameters
----------
- butler : `Butler`
+ butler : `DirectButler`
Data butler instance.
Notes
@@ -75,7 +75,7 @@ class RegistryShim(Registry):
while we perform re-structuring of Registry and Butler implementations.
"""
- def __init__(self, butler: Butler):
+ def __init__(self, butler: DirectButler):
self._butler = butler
self._registry = butler._registry
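Here the shim only changes its type annotation to `DirectButler`, importing it under `TYPE_CHECKING` so the annotation does not pull the concrete butler into the runtime import graph and create a circular import. A rough sketch of that idiom, with a hypothetical class name, assuming postponed annotation evaluation:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime, so the
    # module below does not have to be importable when this file loads.
    from .direct_butler import DirectButler


class ShimSketch:
    """Hypothetical stand-in for a class annotated against DirectButler."""

    def __init__(self, butler: DirectButler):
        # The annotation stays a string at runtime because of
        # `from __future__ import annotations`.
        self._butler = butler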
diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py
new file mode 100644
index 0000000000..68619848fd
--- /dev/null
+++ b/python/lsst/daf/butler/direct_butler.py
@@ -0,0 +1,2167 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+"""Butler top level classes.
+"""
+from __future__ import annotations
+
+__all__ = (
+ "DirectButler",
+ "ButlerValidationError",
+)
+
+import collections.abc
+import contextlib
+import io
+import logging
+import numbers
+import os
+import warnings
+from collections import Counter, defaultdict
+from collections.abc import Iterable, Iterator, MutableMapping, Sequence
+from typing import TYPE_CHECKING, Any, ClassVar, TextIO
+
+from deprecated.sphinx import deprecated
+from lsst.resources import ResourcePath, ResourcePathExpression
+from lsst.utils.introspection import get_class_of
+from lsst.utils.logging import VERBOSE, getLogger
+from sqlalchemy.exc import IntegrityError
+
+from ._butler import Butler
+from ._butler_config import ButlerConfig
+from ._config import Config
+from ._dataset_existence import DatasetExistence
+from ._dataset_ref import DatasetIdGenEnum, DatasetRef
+from ._dataset_type import DatasetType
+from ._deferredDatasetHandle import DeferredDatasetHandle
+from ._exceptions import ValidationError
+from ._file_dataset import FileDataset
+from ._limited_butler import LimitedButler
+from ._registry_shim import RegistryShim
+from ._storage_class import StorageClass, StorageClassFactory
+from ._timespan import Timespan
+from .datastore import DatasetRefURIs, Datastore, NullDatastore
+from .dimensions import (
+ DataCoordinate,
+ DataId,
+ DataIdValue,
+ Dimension,
+ DimensionElement,
+ DimensionRecord,
+ DimensionUniverse,
+)
+from .progress import Progress
+from .registry import (
+ CollectionType,
+ ConflictingDefinitionError,
+ DataIdError,
+ MissingDatasetTypeError,
+ NoDefaultCollectionError,
+ Registry,
+ RegistryDefaults,
+ _ButlerRegistry,
+ _RegistryFactory,
+)
+from .transfers import RepoExportContext
+from .utils import transactional
+
+if TYPE_CHECKING:
+ from lsst.resources import ResourceHandleProtocol
+
+ from .transfers import RepoImportBackend
+
+_LOG = getLogger(__name__)
+
+
+class ButlerValidationError(ValidationError):
+ """There is a problem with the Butler configuration."""
+
+ pass
+
+
+class DirectButler(Butler):
+ """Main entry point for the data access system.
+
+ Parameters
+ ----------
+ config : `ButlerConfig`, `Config` or `str`, optional
+ Configuration. Anything acceptable to the
+ `ButlerConfig` constructor. If a directory path
+ is given the configuration will be read from a ``butler.yaml`` file in
+ that location. If `None` is given default values will be used.
+ butler : `DirectButler`, optional
+ If provided, construct a new Butler that uses the same registry and
+ datastore as the given one, but with the given collection and run.
+ Incompatible with the ``config``, ``searchPaths``, and ``writeable``
+ arguments.
+ collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
+ An expression specifying the collections to be searched (in order) when
+ reading datasets.
+ This may be a `str` collection name or an iterable thereof.
+ See :ref:`daf_butler_collection_expressions` for more information.
+ These collections are not registered automatically and must be
+ manually registered before they are used by any method, but they may be
+ manually registered after the `Butler` is initialized.
+ run : `str`, optional
+ Name of the `~CollectionType.RUN` collection new datasets should be
+ inserted into. If ``collections`` is `None` and ``run`` is not `None`,
+ ``collections`` will be set to ``[run]``. If not `None`, this
+ collection will automatically be registered. If this is not set (and
+ ``writeable`` is not set either), a read-only butler will be created.
+ searchPaths : `list` of `str`, optional
+ Directory paths to search when calculating the full Butler
+ configuration. Not used if the supplied config is already a
+ `ButlerConfig`.
+ writeable : `bool`, optional
+ Explicitly sets whether the butler supports write operations. If not
+ provided, a read-write butler is created if any of ``run``, ``tags``,
+ or ``chains`` is non-empty.
+ inferDefaults : `bool`, optional
+ If `True` (default) infer default data ID values from the values
+ present in the datasets in ``collections``: if all collections have the
+ same value (or no value) for a governor dimension, that value will be
+ the default for that dimension. Nonexistent collections are ignored.
+ If a default value is provided explicitly for a governor dimension via
+ ``**kwargs``, no default will be inferred for that dimension.
+ without_datastore : `bool`, optional
+ If `True` do not attach a datastore to this butler. Any attempts
+ to use a datastore will fail.
+ **kwargs : `str`
+ Default data ID key-value pairs. These may only identify "governor"
+ dimensions like ``instrument`` and ``skymap``.
+ """
+
+ def __init__(
+ self,
+ config: Config | ResourcePathExpression | None = None,
+ *,
+ butler: DirectButler | None = None,
+ collections: Any = None,
+ run: str | None = None,
+ searchPaths: Sequence[ResourcePathExpression] | None = None,
+ writeable: bool | None = None,
+ inferDefaults: bool = True,
+ without_datastore: bool = False,
+ **kwargs: str,
+ ):
+ defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
+ # Load registry, datastore, etc. from config or existing butler.
+ if butler is not None:
+ if config is not None or searchPaths is not None or writeable is not None:
+ raise TypeError(
+ "Cannot pass 'config', 'searchPaths', or 'writeable' arguments with 'butler' argument."
+ )
+ self._registry = butler._registry.copy(defaults)
+ self._datastore = butler._datastore
+ self.storageClasses = butler.storageClasses
+ self._config: ButlerConfig = butler._config
+ else:
+ self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
+ try:
+ butlerRoot = self._config.get("root", self._config.configDir)
+ if writeable is None:
+ writeable = run is not None
+ self._registry = _RegistryFactory(self._config).from_config(
+ butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
+ )
+ if without_datastore:
+ self._datastore = NullDatastore(None, None)
+ else:
+ self._datastore = Datastore.fromConfig(
+ self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
+ )
+ # TODO: Once datastore drops dependency on registry we can
+ # construct datastore first and pass opaque tables to registry
+ # constructor.
+ self._registry.make_datastore_tables(self._datastore.get_opaque_table_definitions())
+ self.storageClasses = StorageClassFactory()
+ self.storageClasses.addFromConfig(self._config)
+ except Exception:
+ # Failures here usually mean that the configuration is incomplete;
+ # just issue an error message that includes the config file URI.
+ _LOG.error(f"Failed to instantiate Butler from config {self._config.configFile}.")
+ raise
+
+ # For an execution butler the datastore needs a special
+ # dependency-inversion trick. This is not used by a regular butler,
+ # but we have no way to distinguish a regular butler from an
+ # execution butler.
+ self._datastore.set_retrieve_dataset_type_method(self._retrieve_dataset_type)
+
+ if "run" in self._config or "collection" in self._config:
+ raise ValueError("Passing a run or collection via configuration is no longer supported.")
+
+ self._registry_shim = RegistryShim(self)
+
+ GENERATION: ClassVar[int] = 3
+ """This is a Generation 3 Butler.
+
+ This attribute may be removed in the future, once the Generation 2 Butler
+ interface has been fully retired; it should only be used in transitional
+ code.
+ """
+
+ def _retrieve_dataset_type(self, name: str) -> DatasetType | None:
+ """Return DatasetType defined in registry given dataset type name."""
+ try:
+ return self._registry.getDatasetType(name)
+ except MissingDatasetTypeError:
+ return None
+
+ @classmethod
+ def _unpickle(
+ cls,
+ config: ButlerConfig,
+ collections: tuple[str, ...] | None,
+ run: str | None,
+ defaultDataId: dict[str, str],
+ writeable: bool,
+ ) -> DirectButler:
+ """Callable used to unpickle a Butler.
+
+ We prefer not to use ``Butler.__init__`` directly so we can force some
+ of its many arguments to be keyword-only (note that ``__reduce__``
+ can only invoke callables with positional arguments).
+
+ Parameters
+ ----------
+ config : `ButlerConfig`
+ Butler configuration, already coerced into a true `ButlerConfig`
+ instance (and hence after any search paths for overrides have been
+ utilized).
+ collections : `tuple` [ `str` ]
+ Names of the default collections to read from.
+ run : `str`, optional
+ Name of the default `~CollectionType.RUN` collection to write to.
+ defaultDataId : `dict` [ `str`, `str` ]
+ Default data ID values.
+ writeable : `bool`
+ Whether the Butler should support write operations.
+
+ Returns
+ -------
+ butler : `Butler`
+ A new `Butler` instance.
+ """
+ # MyPy doesn't recognize that the kwargs below are totally valid; it
+ # seems to think '**defaultDataId' is a _positional_ argument!
+ return cls(
+ config=config,
+ collections=collections,
+ run=run,
+ writeable=writeable,
+ **defaultDataId, # type: ignore
+ )
+
+ def __reduce__(self) -> tuple:
+ """Support pickling."""
+ return (
+ DirectButler._unpickle,
+ (
+ self._config,
+ self.collections,
+ self.run,
+ self._registry.defaults.dataId.byName(),
+ self._registry.isWriteable(),
+ ),
+ )
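# Editor's sketch (not part of the patch): __reduce__ above plus _unpickle let a
# butler survive a pickle round trip by re-creating it from its config, default
# collections, run, data ID defaults and writeability, roughly:
#
#     import pickle
#     clone = pickle.loads(pickle.dumps(butler))  # a new, equivalent DirectButler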
+
+ def __str__(self) -> str:
+ return "Butler(collections={}, run={}, datastore='{}', registry='{}')".format(
+ self.collections, self.run, self._datastore, self._registry
+ )
+
+ def isWriteable(self) -> bool:
+ # Docstring inherited.
+ return self._registry.isWriteable()
+
+ @contextlib.contextmanager
+ def transaction(self) -> Iterator[None]:
+ """Context manager supporting `Butler` transactions.
+
+ Transactions can be nested.
+ """
+ with self._registry.transaction(), self._datastore.transaction():
+ yield
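# Editor's sketch (not part of the patch): the context manager above stacks a
# registry transaction on a datastore transaction, and such transactions can be
# nested, roughly:
#
#     with butler.transaction():          # outer transaction
#         butler.put(obj_a, ref_a)        # obj_a/ref_a are placeholders
#         with butler.transaction():      # nested transaction
#             butler.put(obj_b, ref_b)
#     # An exception anywhere inside rolls the registry changes back together.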
+
+ def _standardizeArgs(
+ self,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ dataId: DataId | None = None,
+ for_put: bool = True,
+ **kwargs: Any,
+ ) -> tuple[DatasetType, DataId | None]:
+ """Standardize the arguments passed to several Butler APIs.
+
+ Parameters
+ ----------
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` the `dataId` should be `None`.
+ Otherwise the `DatasetType` or name thereof.
+ dataId : `dict` or `DataCoordinate`
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the second argument.
+ for_put : `bool`, optional
+ If `True` this call is invoked as part of a `Butler.put()`.
+ Otherwise it is assumed to be part of a `Butler.get()`. This
+ parameter is only relevant if there is dataset type
+ inconsistency.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataCoordinate`. See `DataCoordinate.standardize`
+ parameters.
+
+ Returns
+ -------
+ datasetType : `DatasetType`
+ A `DatasetType` instance extracted from ``datasetRefOrType``.
+ dataId : `dict` or `DataId`, optional
+ Argument that can be used (along with ``kwargs``) to construct a
+ `DataId`.
+
+ Notes
+ -----
+ Butler APIs that conceptually need a DatasetRef also allow passing a
+ `DatasetType` (or the name of one) and a `DataId` (or a dict and
+ keyword arguments that can be used to construct one) separately. This
+ method accepts those arguments and always returns a true `DatasetType`
+ and a `DataId` or `dict`.
+
+ Standardization of `dict` vs `DataId` is best handled by passing the
+ returned ``dataId`` (and ``kwargs``) to `Registry` APIs, which are
+ generally similarly flexible.
+ """
+ externalDatasetType: DatasetType | None = None
+ internalDatasetType: DatasetType | None = None
+ if isinstance(datasetRefOrType, DatasetRef):
+ if dataId is not None or kwargs:
+ raise ValueError("DatasetRef given, cannot use dataId as well")
+ externalDatasetType = datasetRefOrType.datasetType
+ dataId = datasetRefOrType.dataId
+ else:
+ # Don't check whether DataId is provided, because Registry APIs
+ # can usually construct a better error message when it wasn't.
+ if isinstance(datasetRefOrType, DatasetType):
+ externalDatasetType = datasetRefOrType
+ else:
+ internalDatasetType = self._registry.getDatasetType(datasetRefOrType)
+
+ # Check that they are self-consistent
+ if externalDatasetType is not None:
+ internalDatasetType = self._registry.getDatasetType(externalDatasetType.name)
+ if externalDatasetType != internalDatasetType:
+ # We can allow differences if they are compatible, depending
+ # on whether this is a get or a put. A get requires that
+ # the python type associated with the datastore can be
+ # converted to the user type. A put requires that the user
+ # supplied python type can be converted to the internal
+ # type expected by registry.
+ relevantDatasetType = internalDatasetType
+ if for_put:
+ is_compatible = internalDatasetType.is_compatible_with(externalDatasetType)
+ else:
+ is_compatible = externalDatasetType.is_compatible_with(internalDatasetType)
+ relevantDatasetType = externalDatasetType
+ if not is_compatible:
+ raise ValueError(
+ f"Supplied dataset type ({externalDatasetType}) inconsistent with "
+ f"registry definition ({internalDatasetType})"
+ )
+ # Override the internal definition.
+ internalDatasetType = relevantDatasetType
+
+ assert internalDatasetType is not None
+ return internalDatasetType, dataId
+
+ def _rewrite_data_id(
+ self, dataId: DataId | None, datasetType: DatasetType, **kwargs: Any
+ ) -> tuple[DataId | None, dict[str, Any]]:
+ """Rewrite a data ID taking into account dimension records.
+
+ Take a Data ID and keyword args and rewrite it if necessary to
+ allow the user to specify dimension records rather than dimension
+ primary values.
+
+ This allows a user to include a dataId dict with keys of
+ ``exposure.day_obs`` and ``exposure.seq_num`` instead of giving
+ the integer exposure ID. It also allows a string to be given
+ for a dimension value rather than the integer ID if that is more
+ convenient. For example, rather than having to specify the
+ detector with ``detector.full_name``, a string given for ``detector``
+ will be interpreted as the full name and converted to the integer
+ value.
+
+ Keyword arguments can also use strings for dimensions like detector
+ and exposure, but Python does not allow them to include ``.``, and
+ so the ``exposure.day_obs`` syntax cannot be used in a keyword
+ argument.
+
+ Parameters
+ ----------
+ dataId : `dict` or `DataCoordinate`
+ A `dict` of `Dimension` link name, value pairs that will label the
+ `DatasetRef` within a Collection.
+ datasetType : `DatasetType`
+ The dataset type associated with this dataId. Required to
+ determine the relevant dimensions.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataId`. See `DataId` parameters.
+
+ Returns
+ -------
+ dataId : `dict` or `DataCoordinate`
+ The possibly-rewritten dataId. If given a `DataCoordinate` and
+ no keyword arguments, the original dataId will be returned
+ unchanged.
+ **kwargs : `dict`
+ Any unused keyword arguments (would normally be empty dict).
+ """
+ # Do nothing if we have a standalone DataCoordinate.
+ if isinstance(dataId, DataCoordinate) and not kwargs:
+ return dataId, kwargs
+
+ # Process dimension records that are using record information
+ # rather than ids
+ newDataId: dict[str, DataIdValue] = {}
+ byRecord: dict[str, dict[str, Any]] = defaultdict(dict)
+
+ # If all of the dataId comes from keyword parameters we do not need
+ # to do anything here, because they cannot be of the form
+ # exposure.obs_id: a "." is not allowed in a keyword parameter.
+ if dataId:
+ for k, v in dataId.items():
+ # If we have a Dimension we do not need to do anything
+ # because it cannot be a compound key.
+ if isinstance(k, str) and "." in k:
+ # Someone is using a more human-readable dataId
+ dimensionName, record = k.split(".", 1)
+ byRecord[dimensionName][record] = v
+ elif isinstance(k, Dimension):
+ newDataId[k.name] = v
+ else:
+ newDataId[k] = v
+
+ # Go through the updated dataId and check the type in case someone is
+ # using an alternate key. We have already filtered out the
+ # compound-key dimension.record format.
+ not_dimensions = {}
+
+ # Will need to look in the dataId and the keyword arguments
+ # and will remove them if they need to be fixed or are unrecognized.
+ for dataIdDict in (newDataId, kwargs):
+ # Use a list so we can adjust the dict safely in the loop
+ for dimensionName in list(dataIdDict):
+ value = dataIdDict[dimensionName]
+ try:
+ dimension = self.dimensions.getStaticDimensions()[dimensionName]
+ except KeyError:
+ # This is not a real dimension
+ not_dimensions[dimensionName] = value
+ del dataIdDict[dimensionName]
+ continue
+
+ # Convert an integral type to an explicit int to simplify
+ # comparisons here
+ if isinstance(value, numbers.Integral):
+ value = int(value)
+
+ if not isinstance(value, dimension.primaryKey.getPythonType()):
+ for alternate in dimension.alternateKeys:
+ if isinstance(value, alternate.getPythonType()):
+ byRecord[dimensionName][alternate.name] = value
+ del dataIdDict[dimensionName]
+ _LOG.debug(
+ "Converting dimension %s to %s.%s=%s",
+ dimensionName,
+ dimensionName,
+ alternate.name,
+ value,
+ )
+ break
+ else:
+ _LOG.warning(
+ "Type mismatch found for value '%r' provided for dimension %s. "
+ "Could not find matching alternative (primary key has type %s) "
+ "so attempting to use as-is.",
+ value,
+ dimensionName,
+ dimension.primaryKey.getPythonType(),
+ )
+
+ # By this point kwargs and newDataId should only include valid
+ # dimensions. Merge kwargs into the new dataId and log if there
+ # are dimensions in both (rather than calling update).
+ for k, v in kwargs.items():
+ if k in newDataId and newDataId[k] != v:
+ _LOG.debug(
+ "Keyword arg %s overriding explicit value in dataId of %s with %s", k, newDataId[k], v
+ )
+ newDataId[k] = v
+ # No need to retain any values in kwargs now.
+ kwargs = {}
+
+ # If we have some unrecognized dimensions we have to try to connect
+ # them to records in other dimensions. This is made more complicated
+ # by some dimensions having records with clashing names. A mitigation
+ # is that we can tell by this point which dimensions are missing
+ # for the DatasetType but this does not work for calibrations
+ # where additional dimensions can be used to constrain the temporal
+ # axis.
+ if not_dimensions:
+ # Search for all dimensions even if we have been given a value
+ # explicitly. In some cases records are given as well as the
+ # actual dimension and this should not be an error if they
+ # match.
+ mandatoryDimensions = datasetType.dimensions.names # - provided
+
+ candidateDimensions: set[str] = set()
+ candidateDimensions.update(mandatoryDimensions)
+
+ # For calibrations we may well need temporal dimensions
+ # so rather than always including all dimensions in the scan
+ # restrict things a little. It is still possible for there
+ # to be confusion over day_obs in visit vs exposure for example.
+ # If we are not searching calibration collections things may
+ # fail, but they are going to fail anyway because of the
+ # ambiguity of the dataId...
+ if datasetType.isCalibration():
+ for dim in self.dimensions.getStaticDimensions():
+ if dim.temporal:
+ candidateDimensions.add(str(dim))
+
+ # Look up table for the first association with a dimension
+ guessedAssociation: dict[str, dict[str, Any]] = defaultdict(dict)
+
+ # Keep track of whether an item is associated with multiple
+ # dimensions.
+ counter: Counter[str] = Counter()
+ assigned: dict[str, set[str]] = defaultdict(set)
+
+ # Go through the missing dimensions and associate the
+ # given names with records within those dimensions
+ matched_dims = set()
+ for dimensionName in candidateDimensions:
+ dimension = self.dimensions.getStaticDimensions()[dimensionName]
+ fields = dimension.metadata.names | dimension.uniqueKeys.names
+ for field in not_dimensions:
+ if field in fields:
+ guessedAssociation[dimensionName][field] = not_dimensions[field]
+ counter[dimensionName] += 1
+ assigned[field].add(dimensionName)
+ matched_dims.add(field)
+
+ # Calculate the fields that matched nothing.
+ never_found = set(not_dimensions) - matched_dims
+
+ if never_found:
+ raise ValueError(f"Unrecognized keyword args given: {never_found}")
+
+ # There is a chance we have allocated a single dataId item
+ # to multiple dimensions. Need to decide which should be retained.
+ # For now assume that the most popular alternative wins.
+ # This means that day_obs with seq_num will result in
+ # exposure.day_obs and not visit.day_obs
+ # Also prefer an explicitly missing dimension over an inferred
+ # temporal dimension.
+ for fieldName, assignedDimensions in assigned.items():
+ if len(assignedDimensions) > 1:
+ # Pick the most popular (preferring mandatory dimensions)
+ requiredButMissing = assignedDimensions.intersection(mandatoryDimensions)
+ if requiredButMissing:
+ candidateDimensions = requiredButMissing
+ else:
+ candidateDimensions = assignedDimensions
+
+ # If this is a choice between visit and exposure and
+ # neither was a required part of the dataset type,
+ # (hence in this branch) always prefer exposure over
+ # visit since exposures are always defined and visits
+ # are defined from exposures.
+ if candidateDimensions == {"exposure", "visit"}:
+ candidateDimensions = {"exposure"}
+
+ # Select the relevant items and get a new restricted
+ # counter.
+ theseCounts = {k: v for k, v in counter.items() if k in candidateDimensions}
+ duplicatesCounter: Counter[str] = Counter()
+ duplicatesCounter.update(theseCounts)
+
+ # Choose the most common. If they are equally common
+ # we will pick the one that was found first.
+ # (most_common returns a list of (item, count) tuples).
+ selected = duplicatesCounter.most_common(1)[0][0]
+
+ _LOG.debug(
+ "Ambiguous dataId entry '%s' associated with multiple dimensions: %s."
+ " Removed ambiguity by choosing dimension %s.",
+ fieldName,
+ ", ".join(assignedDimensions),
+ selected,
+ )
+
+ for candidateDimension in assignedDimensions:
+ if candidateDimension != selected:
+ del guessedAssociation[candidateDimension][fieldName]
+
+ # Update the record look up dict with the new associations
+ for dimensionName, values in guessedAssociation.items():
+ if values: # A dict might now be empty
+ _LOG.debug(
+ "Assigned non-dimension dataId keys to dimension %s: %s", dimensionName, values
+ )
+ byRecord[dimensionName].update(values)
+
+ if byRecord:
+ # Some record specifiers were found so we need to convert
+ # them to the Id form
+ for dimensionName, values in byRecord.items():
+ if dimensionName in newDataId:
+ _LOG.debug(
+ "DataId specified explicit %s dimension value of %s in addition to"
+ " general record specifiers for it of %s. Ignoring record information.",
+ dimensionName,
+ newDataId[dimensionName],
+ str(values),
+ )
+ # Get the actual record and compare with these values.
+ try:
+ recs = list(self._registry.queryDimensionRecords(dimensionName, dataId=newDataId))
+ except DataIdError:
+ raise ValueError(
+ f"Could not find dimension '{dimensionName}'"
+ f" with dataId {newDataId} as part of comparing with"
+ f" record values {byRecord[dimensionName]}"
+ ) from None
+ if len(recs) == 1:
+ errmsg: list[str] = []
+ for k, v in values.items():
+ if (recval := getattr(recs[0], k)) != v:
+ errmsg.append(f"{k}({recval} != {v})")
+ if errmsg:
+ raise ValueError(
+ f"Dimension {dimensionName} in dataId has explicit value"
+ " inconsistent with records: " + ", ".join(errmsg)
+ )
+ else:
+ # Multiple matches for an explicit dimension
+ # should never happen but let downstream complain.
+ pass
+ continue
+
+ # Build up a WHERE expression
+ bind = dict(values.items())
+ where = " AND ".join(f"{dimensionName}.{k} = {k}" for k in bind)
+
+ # Hopefully we get a single record that matches
+ records = set(
+ self._registry.queryDimensionRecords(
+ dimensionName, dataId=newDataId, where=where, bind=bind, **kwargs
+ )
+ )
+
+ if len(records) != 1:
+ if len(records) > 1:
+ # visit can have an ambiguous answer without involving
+ # visit_system. The default visit_system is defined
+ # by the instrument.
+ if (
+ dimensionName == "visit"
+ and "visit_system_membership" in self.dimensions
+ and "visit_system" in self.dimensions["instrument"].metadata
+ ):
+ instrument_records = list(
+ self._registry.queryDimensionRecords(
+ "instrument",
+ dataId=newDataId,
+ **kwargs,
+ )
+ )
+ if len(instrument_records) == 1:
+ visit_system = instrument_records[0].visit_system
+ if visit_system is None:
+ # Set to a value that will never match.
+ visit_system = -1
+
+ # Look up each visit in the
+ # visit_system_membership records.
+ for rec in records:
+ membership = list(
+ self._registry.queryDimensionRecords(
+ # Use bind to allow zero results.
+ # This is a fully-specified query.
+ "visit_system_membership",
+ where="instrument = inst AND visit_system = system AND visit = v",
+ bind=dict(
+ inst=instrument_records[0].name, system=visit_system, v=rec.id
+ ),
+ )
+ )
+ if membership:
+ # This record is the right answer.
+ records = {rec}
+ break
+
+ # The ambiguity may have been resolved so check again.
+ if len(records) > 1:
+ _LOG.debug(
+ "Received %d records from constraints of %s", len(records), str(values)
+ )
+ for r in records:
+ _LOG.debug("- %s", str(r))
+ raise ValueError(
+ f"DataId specification for dimension {dimensionName} is not"
+ f" uniquely constrained to a single dataset by {values}."
+ f" Got {len(records)} results."
+ )
+ else:
+ raise ValueError(
+ f"DataId specification for dimension {dimensionName} matched no"
+ f" records when constrained by {values}"
+ )
+
+ # Get the primary key from the real dimension object
+ dimension = self.dimensions.getStaticDimensions()[dimensionName]
+ if not isinstance(dimension, Dimension):
+ raise RuntimeError(
+ f"{dimension.name} is not a true dimension, and cannot be used in data IDs."
+ )
+ newDataId[dimensionName] = getattr(records.pop(), dimension.primaryKey.name)
+
+ return newDataId, kwargs
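# Editor's sketch (not part of the patch): the rewriting above is what lets a
# caller label a dataset with dimension record values instead of primary keys,
# roughly (the instrument, detector name and day_obs/seq_num values here are
# placeholders):
#
#     butler.get(
#         "raw",
#         {"exposure.day_obs": 20240101, "exposure.seq_num": 42},
#         instrument="SomeCam",
#         detector="R22_S11",
#     )
#
# The string detector value is matched against an alternate key (full_name) and
# the exposure.* record values are resolved to the integer exposure ID.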
+
+ def _findDatasetRef(
+ self,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ dataId: DataId | None = None,
+ *,
+ collections: Any = None,
+ predict: bool = False,
+ run: str | None = None,
+ datastore_records: bool = False,
+ **kwargs: Any,
+ ) -> DatasetRef:
+ """Shared logic for methods that start with a search for a dataset in
+ the registry.
+
+ Parameters
+ ----------
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` the `dataId` should be `None`.
+ Otherwise the `DatasetType` or name thereof.
+ dataId : `dict` or `DataCoordinate`, optional
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the first argument.
+ collections : Any, optional
+ Collections to be searched, overriding ``self.collections``.
+ Can be any of the types supported by the ``collections`` argument
+ to butler construction.
+ predict : `bool`, optional
+ If `True`, return a newly created `DatasetRef` with a unique
+ dataset ID if finding a reference in the `Registry` fails.
+ Defaults to `False`.
+ run : `str`, optional
+ Run collection name to use for creating `DatasetRef` for predicted
+ datasets. Only used if ``predict`` is `True`.
+ datastore_records : `bool`, optional
+ If `True` add datastore records to returned `DatasetRef`.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataId`. See `DataId` parameters.
+
+ Returns
+ -------
+ ref : `DatasetRef`
+ A reference to the dataset identified by the given arguments.
+ This can be the same dataset reference as given if it was
+ resolved.
+
+ Raises
+ ------
+ LookupError
+ Raised if no matching dataset exists in the `Registry` (and
+ ``predict`` is `False`).
+ ValueError
+ Raised if a resolved `DatasetRef` was passed as an input, but it
+ differs from the one found in the registry.
+ TypeError
+ Raised if no collections were provided.
+ """
+ datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, for_put=False, **kwargs)
+ if isinstance(datasetRefOrType, DatasetRef):
+ if collections is not None:
+ warnings.warn("Collections should not be specified with DatasetRef", stacklevel=3)
+ # May need to retrieve datastore records if requested.
+ if datastore_records and datasetRefOrType._datastore_records is None:
+ datasetRefOrType = self._registry.get_datastore_records(datasetRefOrType)
+ return datasetRefOrType
+ timespan: Timespan | None = None
+
+ dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
+
+ if datasetType.isCalibration():
+ # Because this is a calibration dataset, first try to
+ # standardize the data ID without restricting the dimensions to
+ # those of the dataset type requested, because there may be extra
+ # dimensions that provide temporal information for a validity-range
+ # lookup.
+ dataId = DataCoordinate.standardize(
+ dataId, universe=self.dimensions, defaults=self._registry.defaults.dataId, **kwargs
+ )
+ if dataId.graph.temporal:
+ dataId = self._registry.expandDataId(dataId)
+ timespan = dataId.timespan
+ else:
+ # Standardize the data ID to just the dimensions of the dataset
+ # type instead of letting registry.findDataset do it, so we get the
+ # result even if no dataset is found.
+ dataId = DataCoordinate.standardize(
+ dataId, graph=datasetType.dimensions, defaults=self._registry.defaults.dataId, **kwargs
+ )
+ # Always lookup the DatasetRef, even if one is given, to ensure it is
+ # present in the current collection.
+ ref = self._registry.findDataset(
+ datasetType,
+ dataId,
+ collections=collections,
+ timespan=timespan,
+ datastore_records=datastore_records,
+ )
+ if ref is None:
+ if predict:
+ if run is None:
+ run = self.run
+ if run is None:
+ raise TypeError("Cannot predict dataset ID/location with run=None.")
+ return DatasetRef(datasetType, dataId, run=run)
+ else:
+ if collections is None:
+ collections = self._registry.defaults.collections
+ raise LookupError(
+ f"Dataset {datasetType.name} with data ID {dataId} "
+ f"could not be found in collections {collections}."
+ )
+ if datasetType != ref.datasetType:
+ # If they differ it is because the user explicitly specified
+ # a compatible dataset type to this call rather than using the
+ # registry definition. The DatasetRef must therefore be recreated
+ # using the user definition such that the expected type is
+ # returned.
+ ref = DatasetRef(
+ datasetType, ref.dataId, run=ref.run, id=ref.id, datastore_records=ref._datastore_records
+ )
+
+ return ref
+
+ # TODO: remove on DM-40067.
+ @transactional
+ @deprecated(
+ reason="Butler.put() now behaves like Butler.putDirect() when given a DatasetRef."
+ " Please use Butler.put(). Be aware that you may need to adjust your usage if you"
+ " were relying on the run parameter to determine the run."
+ " Will be removed after v26.0.",
+ version="v26.0",
+ category=FutureWarning,
+ )
+ def putDirect(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
+ # Docstring inherited.
+ return self.put(obj, ref)
+
+ @transactional
+ def put(
+ self,
+ obj: Any,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ /,
+ dataId: DataId | None = None,
+ *,
+ run: str | None = None,
+ **kwargs: Any,
+ ) -> DatasetRef:
+ """Store and register a dataset.
+
+ Parameters
+ ----------
+ obj : `object`
+ The dataset.
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` is provided, ``dataId`` should be `None`.
+ Otherwise the `DatasetType` or name thereof. If a fully resolved
+            `DatasetRef` is given, the run and ID are used directly.
+ dataId : `dict` or `DataCoordinate`
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the second argument.
+ run : `str`, optional
+ The name of the run the dataset should be added to, overriding
+ ``self.run``. Not used if a resolved `DatasetRef` is provided.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataCoordinate`. See `DataCoordinate.standardize`
+            parameters. Not used if a resolved `DatasetRef` is provided.
+
+ Returns
+ -------
+ ref : `DatasetRef`
+ A reference to the stored dataset, updated with the correct id if
+ given.
+
+ Raises
+ ------
+ TypeError
+ Raised if the butler is read-only or if no run has been provided.
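+
+        Examples
+        --------
+        A minimal sketch of a typical put, where ``exposure`` is the
+        in-memory dataset being stored; the repository path, run name,
+        dataset type and data ID values are illustrative and assume a
+        repository where they are already defined::
+
+            butler = Butler.from_config(
+                "repo", writeable=True, run="u/example/run"
+            )
+            ref = butler.put(
+                exposure, "calexp", instrument="HSC", detector=10, visit=903334
+            )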
+ """
+ if isinstance(datasetRefOrType, DatasetRef):
+ # This is a direct put of predefined DatasetRef.
+ _LOG.debug("Butler put direct: %s", datasetRefOrType)
+ if run is not None:
+ warnings.warn("Run collection is not used for DatasetRef", stacklevel=3)
+ # If registry already has a dataset with the same dataset ID,
+ # dataset type and DataId, then _importDatasets will do nothing and
+            # just return the original ref. We still have to raise in that
+            # case; the datastore check below takes care of it.
+ self._registry._importDatasets([datasetRefOrType], expand=True)
+ # Before trying to write to the datastore check that it does not
+ # know this dataset. This is prone to races, of course.
+ if self._datastore.knows(datasetRefOrType):
+ raise ConflictingDefinitionError(f"Datastore already contains dataset: {datasetRefOrType}")
+            # Try to write the dataset to the datastore; if it fails due to a
+            # race with another write, the content of the stored data may be
+            # unpredictable.
+ try:
+ self._datastore.put(obj, datasetRefOrType)
+ except IntegrityError as e:
+ raise ConflictingDefinitionError(f"Datastore already contains dataset: {e}") from e
+ return datasetRefOrType
+
+ _LOG.debug("Butler put: %s, dataId=%s, run=%s", datasetRefOrType, dataId, run)
+ if not self.isWriteable():
+ raise TypeError("Butler is read-only.")
+ datasetType, dataId = self._standardizeArgs(datasetRefOrType, dataId, **kwargs)
+
+ # Handle dimension records in dataId
+ dataId, kwargs = self._rewrite_data_id(dataId, datasetType, **kwargs)
+
+ # Add Registry Dataset entry.
+ dataId = self._registry.expandDataId(dataId, graph=datasetType.dimensions, **kwargs)
+ (ref,) = self._registry.insertDatasets(datasetType, run=run, dataIds=[dataId])
+ self._datastore.put(obj, ref)
+
+ return ref
+
+ # TODO: remove on DM-40067.
+ @deprecated(
+ reason="Butler.get() now behaves like Butler.getDirect() when given a DatasetRef."
+ " Please use Butler.get(). Will be removed after v26.0.",
+ version="v26.0",
+ category=FutureWarning,
+ )
+ def getDirect(
+ self,
+ ref: DatasetRef,
+ *,
+ parameters: dict[str, Any] | None = None,
+ storageClass: StorageClass | str | None = None,
+ ) -> Any:
+ """Retrieve a stored dataset.
+
+ Parameters
+ ----------
+ ref : `DatasetRef`
+ Resolved reference to an already stored dataset.
+ parameters : `dict`
+ Additional StorageClass-defined options to control reading,
+ typically used to efficiently read only a subset of the dataset.
+ storageClass : `StorageClass` or `str`, optional
+ The storage class to be used to override the Python type
+ returned by this method. By default the returned type matches
+ the dataset type definition for this dataset. Specifying a
+ read `StorageClass` can force a different type to be returned.
+ This type must be compatible with the original type.
+
+ Returns
+ -------
+ obj : `object`
+ The dataset.
+ """
+ return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
+
+ # TODO: remove on DM-40067.
+ @deprecated(
+ reason="Butler.getDeferred() now behaves like getDirectDeferred() when given a DatasetRef. "
+ "Please use Butler.getDeferred(). Will be removed after v26.0.",
+ version="v26.0",
+ category=FutureWarning,
+ )
+ def getDirectDeferred(
+ self,
+ ref: DatasetRef,
+ *,
+ parameters: dict[str, Any] | None = None,
+ storageClass: str | StorageClass | None = None,
+ ) -> DeferredDatasetHandle:
+ """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
+ from a resolved `DatasetRef`.
+
+ Parameters
+ ----------
+ ref : `DatasetRef`
+ Resolved reference to an already stored dataset.
+ parameters : `dict`
+ Additional StorageClass-defined options to control reading,
+ typically used to efficiently read only a subset of the dataset.
+ storageClass : `StorageClass` or `str`, optional
+ The storage class to be used to override the Python type
+ returned by this method. By default the returned type matches
+ the dataset type definition for this dataset. Specifying a
+ read `StorageClass` can force a different type to be returned.
+ This type must be compatible with the original type.
+
+ Returns
+ -------
+ obj : `DeferredDatasetHandle`
+ A handle which can be used to retrieve a dataset at a later time.
+
+ Raises
+ ------
+ LookupError
+ Raised if no matching dataset exists in the `Registry`.
+ """
+ # Check that dataset is known to the datastore.
+ if not self._datastore.knows(ref):
+ raise LookupError(f"Dataset reference {ref} is not known to datastore.")
+ return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
+
+ def getDeferred(
+ self,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ /,
+ dataId: DataId | None = None,
+ *,
+ parameters: dict | None = None,
+ collections: Any = None,
+ storageClass: str | StorageClass | None = None,
+ **kwargs: Any,
+ ) -> DeferredDatasetHandle:
+ """Create a `DeferredDatasetHandle` which can later retrieve a dataset,
+ after an immediate registry lookup.
+
+ Parameters
+ ----------
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` the `dataId` should be `None`.
+ Otherwise the `DatasetType` or name thereof.
+ dataId : `dict` or `DataCoordinate`, optional
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the first argument.
+ parameters : `dict`
+ Additional StorageClass-defined options to control reading,
+ typically used to efficiently read only a subset of the dataset.
+ collections : Any, optional
+ Collections to be searched, overriding ``self.collections``.
+ Can be any of the types supported by the ``collections`` argument
+ to butler construction.
+ storageClass : `StorageClass` or `str`, optional
+ The storage class to be used to override the Python type
+ returned by this method. By default the returned type matches
+ the dataset type definition for this dataset. Specifying a
+ read `StorageClass` can force a different type to be returned.
+ This type must be compatible with the original type.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataId`. See `DataId` parameters.
+
+ Returns
+ -------
+ obj : `DeferredDatasetHandle`
+ A handle which can be used to retrieve a dataset at a later time.
+
+ Raises
+ ------
+ LookupError
+ Raised if no matching dataset exists in the `Registry` or
+ datastore.
+ ValueError
+ Raised if a resolved `DatasetRef` was passed as an input, but it
+ differs from the one found in the registry.
+ TypeError
+ Raised if no collections were provided.
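+
+        Examples
+        --------
+        A minimal sketch; the dataset type and data ID values are
+        illustrative::
+
+            handle = butler.getDeferred(
+                "calexp", instrument="HSC", detector=10, visit=903334
+            )
+            # The dataset itself is only read when the handle is used.
+            exposure = handle.get()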
+ """
+ if isinstance(datasetRefOrType, DatasetRef):
+ # Do the quick check first and if that fails, check for artifact
+ # existence. This is necessary for datastores that are configured
+ # in trust mode where there won't be a record but there will be
+ # a file.
+ if self._datastore.knows(datasetRefOrType) or self._datastore.exists(datasetRefOrType):
+ ref = datasetRefOrType
+ else:
+ raise LookupError(f"Dataset reference {datasetRefOrType} does not exist.")
+ else:
+ ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
+ return DeferredDatasetHandle(butler=self, ref=ref, parameters=parameters, storageClass=storageClass)
+
+ def get(
+ self,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ /,
+ dataId: DataId | None = None,
+ *,
+ parameters: dict[str, Any] | None = None,
+ collections: Any = None,
+ storageClass: StorageClass | str | None = None,
+ **kwargs: Any,
+ ) -> Any:
+ """Retrieve a stored dataset.
+
+ Parameters
+ ----------
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` the `dataId` should be `None`.
+ Otherwise the `DatasetType` or name thereof.
+ If a resolved `DatasetRef`, the associated dataset
+ is returned directly without additional querying.
+ dataId : `dict` or `DataCoordinate`
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the first argument.
+ parameters : `dict`
+ Additional StorageClass-defined options to control reading,
+ typically used to efficiently read only a subset of the dataset.
+ collections : Any, optional
+ Collections to be searched, overriding ``self.collections``.
+ Can be any of the types supported by the ``collections`` argument
+ to butler construction.
+ storageClass : `StorageClass` or `str`, optional
+ The storage class to be used to override the Python type
+ returned by this method. By default the returned type matches
+ the dataset type definition for this dataset. Specifying a
+ read `StorageClass` can force a different type to be returned.
+ This type must be compatible with the original type.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataCoordinate`. See `DataCoordinate.standardize`
+ parameters.
+
+ Returns
+ -------
+ obj : `object`
+ The dataset.
+
+ Raises
+ ------
+ LookupError
+ Raised if no matching dataset exists in the `Registry`.
+ TypeError
+ Raised if no collections were provided.
+
+ Notes
+ -----
+ When looking up datasets in a `~CollectionType.CALIBRATION` collection,
+ this method requires that the given data ID include temporal dimensions
+ beyond the dimensions of the dataset type itself, in order to find the
+ dataset with the appropriate validity range. For example, a "bias"
+ dataset with native dimensions ``{instrument, detector}`` could be
+ fetched with a ``{instrument, detector, exposure}`` data ID, because
+ ``exposure`` is a temporal dimension.
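+
+        Examples
+        --------
+        A minimal sketch; the dataset type, data ID values and collection
+        name are illustrative::
+
+            exposure = butler.get(
+                "calexp", instrument="HSC", detector=10, visit=903334
+            )
+            # The same lookup restricted to an explicit collection.
+            exposure = butler.get(
+                "calexp",
+                instrument="HSC",
+                detector=10,
+                visit=903334,
+                collections="HSC/runs/RC2",
+            )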
+ """
+ _LOG.debug("Butler get: %s, dataId=%s, parameters=%s", datasetRefOrType, dataId, parameters)
+ ref = self._findDatasetRef(
+ datasetRefOrType, dataId, collections=collections, datastore_records=True, **kwargs
+ )
+ return self._datastore.get(ref, parameters=parameters, storageClass=storageClass)
+
+ def getURIs(
+ self,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ /,
+ dataId: DataId | None = None,
+ *,
+ predict: bool = False,
+ collections: Any = None,
+ run: str | None = None,
+ **kwargs: Any,
+ ) -> DatasetRefURIs:
+ """Return the URIs associated with the dataset.
+
+ Parameters
+ ----------
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` the `dataId` should be `None`.
+ Otherwise the `DatasetType` or name thereof.
+ dataId : `dict` or `DataCoordinate`
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the first argument.
+ predict : `bool`
+            If `True`, allow URIs to be returned for datasets that have not
+            yet been written.
+ collections : Any, optional
+ Collections to be searched, overriding ``self.collections``.
+ Can be any of the types supported by the ``collections`` argument
+ to butler construction.
+ run : `str`, optional
+ Run to use for predictions, overriding ``self.run``.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataCoordinate`. See `DataCoordinate.standardize`
+ parameters.
+
+ Returns
+ -------
+ uris : `DatasetRefURIs`
+            The URI to the primary artifact associated with this dataset
+            (this may be `None` if the dataset was disassembled within the
+            datastore), and the URIs of any components associated with the
+            dataset artifact (which can be empty if there are no
+            components).
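+
+        Examples
+        --------
+        A minimal sketch; the dataset type and data ID values are
+        illustrative. The returned `DatasetRefURIs` can be unpacked into
+        the primary URI and the component URIs::
+
+            primary, components = butler.getURIs(
+                "calexp", instrument="HSC", detector=10, visit=903334
+            )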
+ """
+ ref = self._findDatasetRef(
+ datasetRefOrType, dataId, predict=predict, run=run, collections=collections, **kwargs
+ )
+ return self._datastore.getURIs(ref, predict)
+
+ def getURI(
+ self,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ /,
+ dataId: DataId | None = None,
+ *,
+ predict: bool = False,
+ collections: Any = None,
+ run: str | None = None,
+ **kwargs: Any,
+ ) -> ResourcePath:
+ """Return the URI to the Dataset.
+
+ Parameters
+ ----------
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` the `dataId` should be `None`.
+ Otherwise the `DatasetType` or name thereof.
+ dataId : `dict` or `DataCoordinate`
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the first argument.
+ predict : `bool`
+            If `True`, allow URIs to be returned for datasets that have not
+            yet been written.
+ collections : Any, optional
+ Collections to be searched, overriding ``self.collections``.
+ Can be any of the types supported by the ``collections`` argument
+ to butler construction.
+ run : `str`, optional
+ Run to use for predictions, overriding ``self.run``.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataCoordinate`. See `DataCoordinate.standardize`
+ parameters.
+
+ Returns
+ -------
+ uri : `lsst.resources.ResourcePath`
+ URI pointing to the Dataset within the datastore. If the
+ Dataset does not exist in the datastore, and if ``predict`` is
+ `True`, the URI will be a prediction and will include a URI
+ fragment "#predicted".
+ If the datastore does not have entities that relate well
+            to the concept of a URI, the returned URI string will be
+ descriptive. The returned URI is not guaranteed to be obtainable.
+
+ Raises
+ ------
+ LookupError
+            Raised if a URI has been requested for a dataset that does not
+            exist and ``predict`` is `False`.
+ ValueError
+ Raised if a resolved `DatasetRef` was passed as an input, but it
+ differs from the one found in the registry.
+ TypeError
+ Raised if no collections were provided.
+ RuntimeError
+ Raised if a URI is requested for a dataset that consists of
+ multiple artifacts.
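+
+        Examples
+        --------
+        A minimal sketch using prediction for a dataset that may not yet
+        have been written; the dataset type, data ID values and run name
+        are illustrative::
+
+            uri = butler.getURI(
+                "calexp",
+                instrument="HSC",
+                detector=10,
+                visit=903334,
+                predict=True,
+                run="u/example/run",
+            )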
+ """
+ primary, components = self.getURIs(
+ datasetRefOrType, dataId=dataId, predict=predict, collections=collections, run=run, **kwargs
+ )
+
+ if primary is None or components:
+ raise RuntimeError(
+ f"Dataset ({datasetRefOrType}) includes distinct URIs for components. "
+ "Use Butler.getURIs() instead."
+ )
+ return primary
+
+ def retrieveArtifacts(
+ self,
+ refs: Iterable[DatasetRef],
+ destination: ResourcePathExpression,
+ transfer: str = "auto",
+ preserve_path: bool = True,
+ overwrite: bool = False,
+ ) -> list[ResourcePath]:
+ # Docstring inherited.
+ return self._datastore.retrieveArtifacts(
+ refs,
+ ResourcePath(destination),
+ transfer=transfer,
+ preserve_path=preserve_path,
+ overwrite=overwrite,
+ )
+
+ def exists(
+ self,
+ dataset_ref_or_type: DatasetRef | DatasetType | str,
+ /,
+ data_id: DataId | None = None,
+ *,
+ full_check: bool = True,
+ collections: Any = None,
+ **kwargs: Any,
+ ) -> DatasetExistence:
+ # Docstring inherited.
+ existence = DatasetExistence.UNRECOGNIZED
+
+ if isinstance(dataset_ref_or_type, DatasetRef):
+ if collections is not None:
+ warnings.warn("Collections should not be specified with DatasetRef", stacklevel=2)
+ if data_id is not None:
+ warnings.warn("A DataID should not be specified with DatasetRef", stacklevel=2)
+ ref = dataset_ref_or_type
+ registry_ref = self._registry.getDataset(dataset_ref_or_type.id)
+ if registry_ref is not None:
+ existence |= DatasetExistence.RECORDED
+
+ if dataset_ref_or_type != registry_ref:
+ # This could mean that storage classes differ, so we should
+ # check for that but use the registry ref for the rest of
+ # the method.
+ if registry_ref.is_compatible_with(dataset_ref_or_type):
+ # Use the registry version from now on.
+ ref = registry_ref
+ else:
+ raise ValueError(
+ f"The ref given to exists() ({ref}) has the same dataset ID as one "
+ f"in registry but has different incompatible values ({registry_ref})."
+ )
+ else:
+ try:
+ ref = self._findDatasetRef(dataset_ref_or_type, data_id, collections=collections, **kwargs)
+ except (LookupError, TypeError, NoDefaultCollectionError):
+ return existence
+ existence |= DatasetExistence.RECORDED
+
+ if self._datastore.knows(ref):
+ existence |= DatasetExistence.DATASTORE
+
+ if full_check:
+ if self._datastore.exists(ref):
+ existence |= DatasetExistence._ARTIFACT
+ elif existence.value != DatasetExistence.UNRECOGNIZED.value:
+ # Do not add this flag if we have no other idea about a dataset.
+ existence |= DatasetExistence(DatasetExistence._ASSUMED)
+
+ return existence
+
+ def _exists_many(
+ self,
+ refs: Iterable[DatasetRef],
+ /,
+ *,
+ full_check: bool = True,
+ ) -> dict[DatasetRef, DatasetExistence]:
+ # Docstring inherited.
+ existence = {ref: DatasetExistence.UNRECOGNIZED for ref in refs}
+
+ # Registry does not have a bulk API to check for a ref.
+ for ref in refs:
+ registry_ref = self._registry.getDataset(ref.id)
+ if registry_ref is not None:
+ # It is possible, albeit unlikely, that the given ref does
+ # not match the one in registry even though the UUID matches.
+ # When checking a single ref we raise, but it's impolite to
+ # do that when potentially hundreds of refs are being checked.
+ # We could change the API to only accept UUIDs and that would
+ # remove the ability to even check and remove the worry
+ # about differing storage classes. Given the ongoing discussion
+ # on refs vs UUIDs and whether to raise or have a new
+ # private flag, treat this as a private API for now.
+ existence[ref] |= DatasetExistence.RECORDED
+
+ # Ask datastore if it knows about these refs.
+ knows = self._datastore.knows_these(refs)
+ for ref, known in knows.items():
+ if known:
+ existence[ref] |= DatasetExistence.DATASTORE
+
+ if full_check:
+ mexists = self._datastore.mexists(refs)
+ for ref, exists in mexists.items():
+ if exists:
+ existence[ref] |= DatasetExistence._ARTIFACT
+ else:
+ # Do not set this flag if nothing is known about the dataset.
+ for ref in existence:
+ if existence[ref] != DatasetExistence.UNRECOGNIZED:
+ existence[ref] |= DatasetExistence._ASSUMED
+
+ return existence
+
+ # TODO: remove on DM-40079.
+ @deprecated(
+ reason="Butler.datasetExists() has been replaced by Butler.exists(). Will be removed after v26.0.",
+ version="v26.0",
+ category=FutureWarning,
+ )
+ def datasetExists(
+ self,
+ datasetRefOrType: DatasetRef | DatasetType | str,
+ dataId: DataId | None = None,
+ *,
+ collections: Any = None,
+ **kwargs: Any,
+ ) -> bool:
+ """Return True if the Dataset is actually present in the Datastore.
+
+ Parameters
+ ----------
+ datasetRefOrType : `DatasetRef`, `DatasetType`, or `str`
+ When `DatasetRef` the `dataId` should be `None`.
+ Otherwise the `DatasetType` or name thereof.
+ dataId : `dict` or `DataCoordinate`
+ A `dict` of `Dimension` link name, value pairs that label the
+ `DatasetRef` within a Collection. When `None`, a `DatasetRef`
+ should be provided as the first argument.
+ collections : Any, optional
+ Collections to be searched, overriding ``self.collections``.
+ Can be any of the types supported by the ``collections`` argument
+ to butler construction.
+ **kwargs
+ Additional keyword arguments used to augment or construct a
+ `DataCoordinate`. See `DataCoordinate.standardize`
+ parameters.
+
+ Raises
+ ------
+ LookupError
+ Raised if the dataset is not even present in the Registry.
+ ValueError
+ Raised if a resolved `DatasetRef` was passed as an input, but it
+ differs from the one found in the registry.
+ NoDefaultCollectionError
+ Raised if no collections were provided.
+ """
+ # A resolved ref may be given that is not known to this butler.
+ if isinstance(datasetRefOrType, DatasetRef):
+ ref = self._registry.getDataset(datasetRefOrType.id)
+ if ref is None:
+ raise LookupError(
+ f"Resolved DatasetRef with id {datasetRefOrType.id} is not known to registry."
+ )
+ else:
+ ref = self._findDatasetRef(datasetRefOrType, dataId, collections=collections, **kwargs)
+ return self._datastore.exists(ref)
+
+ def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None:
+ # Docstring inherited.
+ if not self.isWriteable():
+ raise TypeError("Butler is read-only.")
+ names = list(names)
+ refs: list[DatasetRef] = []
+ for name in names:
+ collectionType = self._registry.getCollectionType(name)
+ if collectionType is not CollectionType.RUN:
+ raise TypeError(f"The collection type of '{name}' is {collectionType.name}, not RUN.")
+ refs.extend(self._registry.queryDatasets(..., collections=name, findFirst=True))
+ with self._datastore.transaction(), self._registry.transaction():
+ if unstore:
+ self._datastore.trash(refs)
+ else:
+ self._datastore.forget(refs)
+ for name in names:
+ self._registry.removeCollection(name)
+ if unstore:
+ # Point of no return for removing artifacts
+ self._datastore.emptyTrash()
+
+ def pruneDatasets(
+ self,
+ refs: Iterable[DatasetRef],
+ *,
+ disassociate: bool = True,
+ unstore: bool = False,
+ tags: Iterable[str] = (),
+ purge: bool = False,
+ ) -> None:
+ # docstring inherited from LimitedButler
+
+ if not self.isWriteable():
+ raise TypeError("Butler is read-only.")
+ if purge:
+ if not disassociate:
+ raise TypeError("Cannot pass purge=True without disassociate=True.")
+ if not unstore:
+ raise TypeError("Cannot pass purge=True without unstore=True.")
+ elif disassociate:
+ tags = tuple(tags)
+ if not tags:
+ raise TypeError("No tags provided but disassociate=True.")
+ for tag in tags:
+ collectionType = self._registry.getCollectionType(tag)
+ if collectionType is not CollectionType.TAGGED:
+ raise TypeError(
+ f"Cannot disassociate from collection '{tag}' "
+ f"of non-TAGGED type {collectionType.name}."
+ )
+ # Transform possibly-single-pass iterable into something we can iterate
+ # over multiple times.
+ refs = list(refs)
+ # Pruning a component of a DatasetRef makes no sense since registry
+ # doesn't know about components and datastore might not store
+        # components in a separate file.
+ for ref in refs:
+ if ref.datasetType.component():
+ raise ValueError(f"Can not prune a component of a dataset (ref={ref})")
+ # We don't need an unreliable Datastore transaction for this, because
+ # we've been extra careful to ensure that Datastore.trash only involves
+ # mutating the Registry (it can _look_ at Datastore-specific things,
+ # but shouldn't change them), and hence all operations here are
+ # Registry operations.
+ with self._datastore.transaction(), self._registry.transaction():
+ if unstore:
+ self._datastore.trash(refs)
+ if purge:
+ self._registry.removeDatasets(refs)
+ elif disassociate:
+ assert tags, "Guaranteed by earlier logic in this function."
+ for tag in tags:
+ self._registry.disassociate(tag, refs)
+ # We've exited the Registry transaction, and apparently committed.
+ # (if there was an exception, everything rolled back, and it's as if
+ # nothing happened - and we never get here).
+ # Datastore artifacts are not yet gone, but they're clearly marked
+ # as trash, so if we fail to delete now because of (e.g.) filesystem
+ # problems we can try again later, and if manual administrative
+ # intervention is required, it's pretty clear what that should entail:
+ # deleting everything on disk and in private Datastore tables that is
+ # in the dataset_location_trash table.
+ if unstore:
+ # Point of no return for removing artifacts
+ self._datastore.emptyTrash()
+
+ @transactional
+ def ingest(
+ self,
+ *datasets: FileDataset,
+ transfer: str | None = "auto",
+ run: str | None = None,
+ idGenerationMode: DatasetIdGenEnum | None = None,
+ record_validation_info: bool = True,
+ ) -> None:
+ # Docstring inherited.
+ if not self.isWriteable():
+ raise TypeError("Butler is read-only.")
+
+ _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s")
+ if not datasets:
+ return
+
+ if idGenerationMode is not None:
+ warnings.warn(
+ "The idGenerationMode parameter is no longer used and is ignored. "
+ " Will be removed after v26.0",
+ FutureWarning,
+ stacklevel=2,
+ )
+
+ progress = Progress("lsst.daf.butler.Butler.ingest", level=logging.DEBUG)
+
+ # We need to reorganize all the inputs so that they are grouped
+ # by dataset type and run. Multiple refs in a single FileDataset
+ # are required to share the run and dataset type.
+ GroupedData = MutableMapping[tuple[DatasetType, str], list[FileDataset]]
+ groupedData: GroupedData = defaultdict(list)
+
+ # Track DataIDs that are being ingested so we can spot issues early
+ # with duplication. Retain previous FileDataset so we can report it.
+ groupedDataIds: MutableMapping[
+ tuple[DatasetType, str], dict[DataCoordinate, FileDataset]
+ ] = defaultdict(dict)
+
+ used_run = False
+
+ # And the nested loop that populates it:
+ for dataset in progress.wrap(datasets, desc="Grouping by dataset type"):
+ # Somewhere to store pre-existing refs if we have an
+ # execution butler.
+ existingRefs: list[DatasetRef] = []
+
+ for ref in dataset.refs:
+ assert ref.run is not None # For mypy
+ group_key = (ref.datasetType, ref.run)
+
+ if ref.dataId in groupedDataIds[group_key]:
+ raise ConflictingDefinitionError(
+ f"Ingest conflict. Dataset {dataset.path} has same"
+ " DataId as other ingest dataset"
+ f" {groupedDataIds[group_key][ref.dataId].path} "
+ f" ({ref.dataId})"
+ )
+
+ groupedDataIds[group_key][ref.dataId] = dataset
+
+ if existingRefs:
+ if len(dataset.refs) != len(existingRefs):
+ # Keeping track of partially pre-existing datasets is hard
+ # and should generally never happen. For now don't allow
+ # it.
+ raise ConflictingDefinitionError(
+ f"For dataset {dataset.path} some dataIds already exist"
+ " in registry but others do not. This is not supported."
+ )
+
+ # Store expanded form in the original FileDataset.
+ dataset.refs = existingRefs
+ else:
+ groupedData[group_key].append(dataset)
+
+ if not used_run and run is not None:
+ warnings.warn(
+ "All DatasetRefs to be ingested had resolved dataset IDs. The value given to the "
+ f"'run' parameter ({run!r}) was not used and the parameter will be removed in the future.",
+ category=FutureWarning,
+ stacklevel=3, # Take into account the @transactional decorator.
+ )
+
+ # Now we can bulk-insert into Registry for each DatasetType.
+ for (datasetType, this_run), grouped_datasets in progress.iter_item_chunks(
+ groupedData.items(), desc="Bulk-inserting datasets by type"
+ ):
+ refs_to_import = []
+ for dataset in grouped_datasets:
+ refs_to_import.extend(dataset.refs)
+
+ n_refs = len(refs_to_import)
+ _LOG.verbose(
+ "Importing %d ref%s of dataset type %r into run %r",
+ n_refs,
+ "" if n_refs == 1 else "s",
+ datasetType.name,
+ this_run,
+ )
+
+ # Import the refs and expand the DataCoordinates since we can't
+ # guarantee that they are expanded and Datastore will need
+ # the records.
+ imported_refs = self._registry._importDatasets(refs_to_import, expand=True)
+ assert set(imported_refs) == set(refs_to_import)
+
+ # Replace all the refs in the FileDataset with expanded versions.
+ # Pull them off in the order we put them on the list.
+ for dataset in grouped_datasets:
+ n_dataset_refs = len(dataset.refs)
+ dataset.refs = imported_refs[:n_dataset_refs]
+ del imported_refs[:n_dataset_refs]
+
+ # Bulk-insert everything into Datastore.
+ # We do not know if any of the registry entries already existed
+ # (_importDatasets only complains if they exist but differ) so
+ # we have to catch IntegrityError explicitly.
+ try:
+ self._datastore.ingest(
+ *datasets, transfer=transfer, record_validation_info=record_validation_info
+ )
+ except IntegrityError as e:
+ raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e
+
+ @contextlib.contextmanager
+ def export(
+ self,
+ *,
+ directory: str | None = None,
+ filename: str | None = None,
+ format: str | None = None,
+ transfer: str | None = None,
+ ) -> Iterator[RepoExportContext]:
+ # Docstring inherited.
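+        # Usage sketch (the export file name, dataset type and collection
+        # name are illustrative):
+        #
+        #     with butler.export(filename="exports.yaml") as export:
+        #         export.saveDatasets(
+        #             butler.registry.queryDatasets(
+        #                 "flat", collections="HSC/calib"
+        #             )
+        #         )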
+ if directory is None and transfer is not None:
+ raise TypeError("Cannot transfer without providing a directory.")
+ if transfer == "move":
+ raise TypeError("Transfer may not be 'move': export is read-only")
+ if format is None:
+ if filename is None:
+ raise TypeError("At least one of 'filename' or 'format' must be provided.")
+ else:
+ _, format = os.path.splitext(filename)
+ if not format:
+ raise ValueError("Please specify a file extension to determine export format.")
+                format = format[1:]  # Strip leading "."
+ elif filename is None:
+ filename = f"export.{format}"
+ if directory is not None:
+ filename = os.path.join(directory, filename)
+ formats = self._config["repo_transfer_formats"]
+ if format not in formats:
+ raise ValueError(f"Unknown export format {format!r}, allowed: {','.join(formats.keys())}")
+ BackendClass = get_class_of(formats[format, "export"])
+ with open(filename, "w") as stream:
+ backend = BackendClass(stream, universe=self.dimensions)
+ try:
+ helper = RepoExportContext(
+ self._registry, self._datastore, backend=backend, directory=directory, transfer=transfer
+ )
+ yield helper
+ except BaseException:
+ raise
+ else:
+ helper._finish()
+
+ def import_(
+ self,
+ *,
+ directory: ResourcePathExpression | None = None,
+ filename: ResourcePathExpression | TextIO | None = None,
+ format: str | None = None,
+ transfer: str | None = None,
+ skip_dimensions: set | None = None,
+ ) -> None:
+ # Docstring inherited.
+ if not self.isWriteable():
+ raise TypeError("Butler is read-only.")
+ if format is None:
+ if filename is None:
+ raise TypeError("At least one of 'filename' or 'format' must be provided.")
+ else:
+ _, format = os.path.splitext(filename) # type: ignore
+ elif filename is None:
+ filename = ResourcePath(f"export.{format}", forceAbsolute=False)
+ if directory is not None:
+ directory = ResourcePath(directory, forceDirectory=True)
+ # mypy doesn't think this will work but it does in python >= 3.10.
+ if isinstance(filename, ResourcePathExpression): # type: ignore
+ filename = ResourcePath(filename, forceAbsolute=False) # type: ignore
+ if not filename.isabs() and directory is not None:
+ potential = directory.join(filename)
+ exists_in_cwd = filename.exists()
+ exists_in_dir = potential.exists()
+ if exists_in_cwd and exists_in_dir:
+ _LOG.warning(
+ "A relative path for filename was specified (%s) which exists relative to cwd. "
+ "Additionally, the file exists relative to the given search directory (%s). "
+ "Using the export file in the given directory.",
+ filename,
+ potential,
+ )
+ # Given they specified an explicit directory and that
+ # directory has the export file in it, assume that that
+ # is what was meant despite the file in cwd.
+ filename = potential
+ elif exists_in_dir:
+ filename = potential
+ elif not exists_in_cwd and not exists_in_dir:
+ # Raise early.
+ raise FileNotFoundError(
+ f"Export file could not be found in {filename.abspath()} or {potential.abspath()}."
+ )
+ BackendClass: type[RepoImportBackend] = get_class_of(
+ self._config["repo_transfer_formats"][format]["import"]
+ )
+
+ def doImport(importStream: TextIO | ResourceHandleProtocol) -> None:
+ backend = BackendClass(importStream, self._registry) # type: ignore[call-arg]
+ backend.register()
+ with self.transaction():
+ backend.load(
+ self._datastore,
+ directory=directory,
+ transfer=transfer,
+ skip_dimensions=skip_dimensions,
+ )
+
+ if isinstance(filename, ResourcePath):
+            # We cannot use open() here at the moment because of
+ # DM-38589 since yaml does stream.read(8192) in a loop.
+ stream = io.StringIO(filename.read().decode())
+ doImport(stream)
+ else:
+ doImport(filename) # type: ignore
+
+ def transfer_from(
+ self,
+ source_butler: LimitedButler,
+ source_refs: Iterable[DatasetRef],
+ transfer: str = "auto",
+ skip_missing: bool = True,
+ register_dataset_types: bool = False,
+ transfer_dimensions: bool = False,
+ ) -> collections.abc.Collection[DatasetRef]:
+ # Docstring inherited.
+ if not self.isWriteable():
+ raise TypeError("Butler is read-only.")
+ progress = Progress("lsst.daf.butler.Butler.transfer_from", level=VERBOSE)
+
+ # Will iterate through the refs multiple times so need to convert
+ # to a list if this isn't a collection.
+ if not isinstance(source_refs, collections.abc.Collection):
+ source_refs = list(source_refs)
+
+ original_count = len(source_refs)
+ _LOG.info("Transferring %d datasets into %s", original_count, str(self))
+
+ # In some situations the datastore artifact may be missing
+ # and we do not want that registry entry to be imported.
+        # Asking the datastore is not sufficient: the records may have been
+        # purged, so we have to ask for the (predicted) URI and check
+ # existence explicitly. Execution butler is set up exactly like
+ # this with no datastore records.
+ artifact_existence: dict[ResourcePath, bool] = {}
+ if skip_missing:
+ dataset_existence = source_butler._datastore.mexists(
+ source_refs, artifact_existence=artifact_existence
+ )
+ source_refs = [ref for ref, exists in dataset_existence.items() if exists]
+ filtered_count = len(source_refs)
+ n_missing = original_count - filtered_count
+ _LOG.verbose(
+ "%d dataset%s removed because the artifact does not exist. Now have %d.",
+ n_missing,
+ "" if n_missing == 1 else "s",
+ filtered_count,
+ )
+
+ # Importing requires that we group the refs by dataset type and run
+ # before doing the import.
+ source_dataset_types = set()
+ grouped_refs = defaultdict(list)
+ for ref in source_refs:
+ grouped_refs[ref.datasetType, ref.run].append(ref)
+ source_dataset_types.add(ref.datasetType)
+
+ # Check to see if the dataset type in the source butler has
+ # the same definition in the target butler and register missing
+ # ones if requested. Registration must happen outside a transaction.
+ newly_registered_dataset_types = set()
+ for datasetType in source_dataset_types:
+ if register_dataset_types:
+ # Let this raise immediately if inconsistent. Continuing
+ # on to find additional inconsistent dataset types
+ # might result in additional unwanted dataset types being
+ # registered.
+ if self._registry.registerDatasetType(datasetType):
+ newly_registered_dataset_types.add(datasetType)
+ else:
+ # If the dataset type is missing, let it fail immediately.
+ target_dataset_type = self._registry.getDatasetType(datasetType.name)
+ if target_dataset_type != datasetType:
+ raise ConflictingDefinitionError(
+ "Source butler dataset type differs from definition"
+ f" in target butler: {datasetType} !="
+ f" {target_dataset_type}"
+ )
+ if newly_registered_dataset_types:
+ # We may have registered some even if there were inconsistencies
+ # but should let people know (or else remove them again).
+ _LOG.verbose(
+ "Registered the following dataset types in the target Butler: %s",
+ ", ".join(d.name for d in newly_registered_dataset_types),
+ )
+ else:
+ _LOG.verbose("All required dataset types are known to the target Butler")
+
+ dimension_records: dict[DimensionElement, dict[DataCoordinate, DimensionRecord]] = defaultdict(dict)
+ if transfer_dimensions:
+ # Collect all the dimension records for these refs.
+ # All dimensions are to be copied but the list of valid dimensions
+            # comes from this butler's universe.
+ elements = frozenset(
+ element
+ for element in self.dimensions.getStaticElements()
+ if element.hasTable() and element.viewOf is None
+ )
+ dataIds = {ref.dataId for ref in source_refs}
+ # This logic comes from saveDataIds.
+ for dataId in dataIds:
+                # We need an expanded record; if it is not already expanded we
+                # need a full butler with a registry (allow mocks with a
+                # registry too).
+ if not dataId.hasRecords():
+ if registry := getattr(source_butler, "registry", None):
+ dataId = registry.expandDataId(dataId)
+ else:
+ raise TypeError("Input butler needs to be a full butler to expand DataId.")
+ # If this butler doesn't know about a dimension in the source
+ # butler things will break later.
+ for record in dataId.records.values():
+ if record is not None and record.definition in elements:
+ dimension_records[record.definition].setdefault(record.dataId, record)
+
+ handled_collections: set[str] = set()
+
+ # Do all the importing in a single transaction.
+ with self.transaction():
+ if dimension_records:
+ _LOG.verbose("Ensuring that dimension records exist for transferred datasets.")
+ for element, r in dimension_records.items():
+ records = [r[dataId] for dataId in r]
+                # Assume that if the record is already present we can
+ # use it without having to check that the record metadata
+ # is consistent.
+ self._registry.insertDimensionData(element, *records, skip_existing=True)
+
+ n_imported = 0
+ for (datasetType, run), refs_to_import in progress.iter_item_chunks(
+ grouped_refs.items(), desc="Importing to registry by run and dataset type"
+ ):
+ if run not in handled_collections:
+ # May need to create output collection. If source butler
+ # has a registry, ask for documentation string.
+ run_doc = None
+ if registry := getattr(source_butler, "registry", None):
+ run_doc = registry.getCollectionDocumentation(run)
+ registered = self._registry.registerRun(run, doc=run_doc)
+ handled_collections.add(run)
+ if registered:
+ _LOG.verbose("Creating output run %s", run)
+
+ n_refs = len(refs_to_import)
+ _LOG.verbose(
+ "Importing %d ref%s of dataset type %s into run %s",
+ n_refs,
+ "" if n_refs == 1 else "s",
+ datasetType.name,
+ run,
+ )
+
+ # Assume we are using UUIDs and the source refs will match
+ # those imported.
+ imported_refs = self._registry._importDatasets(refs_to_import, expand=False)
+ assert set(imported_refs) == set(refs_to_import)
+ n_imported += len(imported_refs)
+
+ assert len(source_refs) == n_imported
+ _LOG.verbose("Imported %d datasets into destination butler", n_imported)
+
+ # Ask the datastore to transfer. The datastore has to check that
+ # the source datastore is compatible with the target datastore.
+ accepted, rejected = self._datastore.transfer_from(
+ source_butler._datastore,
+ source_refs,
+ transfer=transfer,
+ artifact_existence=artifact_existence,
+ )
+ if rejected:
+ # For now, accept the registry entries but not the files.
+ _LOG.warning(
+ "%d datasets were rejected and %d accepted for dataset type %s in run %r.",
+ len(rejected),
+ len(accepted),
+ datasetType,
+ run,
+ )
+
+ return source_refs
+
+ def validateConfiguration(
+ self,
+ logFailures: bool = False,
+ datasetTypeNames: Iterable[str] | None = None,
+ ignore: Iterable[str] | None = None,
+ ) -> None:
+ # Docstring inherited.
+ if datasetTypeNames:
+ datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames]
+ else:
+ datasetTypes = list(self._registry.queryDatasetTypes())
+
+ # filter out anything from the ignore list
+ if ignore:
+ ignore = set(ignore)
+ datasetTypes = [
+ e for e in datasetTypes if e.name not in ignore and e.nameAndComponent()[0] not in ignore
+ ]
+ else:
+ ignore = set()
+
+ # For each datasetType that has an instrument dimension, create
+ # a DatasetRef for each defined instrument
+ datasetRefs = []
+
+ # Find all the registered instruments (if "instrument" is in the
+ # universe).
+ if "instrument" in self.dimensions:
+ instruments = {record.name for record in self._registry.queryDimensionRecords("instrument")}
+
+ for datasetType in datasetTypes:
+ if "instrument" in datasetType.dimensions:
+ # In order to create a conforming dataset ref, create
+ # fake DataCoordinate values for the non-instrument
+ # dimensions. The type of the value does not matter here.
+ dataId = {dim.name: 1 for dim in datasetType.dimensions if dim.name != "instrument"}
+
+ for instrument in instruments:
+ datasetRef = DatasetRef(
+ datasetType,
+ DataCoordinate.standardize(
+ dataId, instrument=instrument, graph=datasetType.dimensions
+ ),
+ run="validate",
+ )
+ datasetRefs.append(datasetRef)
+
+ entities: list[DatasetType | DatasetRef] = []
+ entities.extend(datasetTypes)
+ entities.extend(datasetRefs)
+
+ datastoreErrorStr = None
+ try:
+ self._datastore.validateConfiguration(entities, logFailures=logFailures)
+ except ValidationError as e:
+ datastoreErrorStr = str(e)
+
+ # Also check that the LookupKeys used by the datastores match
+ # registry and storage class definitions
+ keys = self._datastore.getLookupKeys()
+
+ failedNames = set()
+ failedDataId = set()
+ for key in keys:
+ if key.name is not None:
+ if key.name in ignore:
+ continue
+
+ # skip if specific datasetType names were requested and this
+ # name does not match
+ if datasetTypeNames and key.name not in datasetTypeNames:
+ continue
+
+ # See if it is a StorageClass or a DatasetType
+ if key.name in self.storageClasses:
+ pass
+ else:
+ try:
+ self._registry.getDatasetType(key.name)
+ except KeyError:
+ if logFailures:
+ _LOG.critical(
+ "Key '%s' does not correspond to a DatasetType or StorageClass", key
+ )
+ failedNames.add(key)
+ else:
+ # Dimensions are checked for consistency when the Butler
+ # is created and rendezvoused with a universe.
+ pass
+
+            # Check that the instrument is a valid instrument.
+            # Currently only the instrument dimension is supported, so check
+            # for that.
+ if key.dataId:
+ dataIdKeys = set(key.dataId)
+ if {"instrument"} != dataIdKeys:
+ if logFailures:
+ _LOG.critical("Key '%s' has unsupported DataId override", key)
+ failedDataId.add(key)
+ elif key.dataId["instrument"] not in instruments:
+ if logFailures:
+ _LOG.critical("Key '%s' has unknown instrument", key)
+ failedDataId.add(key)
+
+ messages = []
+
+ if datastoreErrorStr:
+ messages.append(datastoreErrorStr)
+
+ for failed, msg in (
+ (failedNames, "Keys without corresponding DatasetType or StorageClass entry: "),
+ (failedDataId, "Keys with bad DataId entries: "),
+ ):
+ if failed:
+ msg += ", ".join(str(k) for k in failed)
+ messages.append(msg)
+
+ if messages:
+ raise ValidationError(";\n".join(messages))
+
+ @property
+ def collections(self) -> Sequence[str]:
+ """The collections to search by default, in order
+ (`~collections.abc.Sequence` [ `str` ]).
+
+ This is an alias for ``self.registry.defaults.collections``. It cannot
+ be set directly in isolation, but all defaults may be changed together
+ by assigning a new `RegistryDefaults` instance to
+ ``self.registry.defaults``.
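+
+        For example, to change the defaults (a sketch; collection and run
+        names are illustrative)::
+
+            from lsst.daf.butler.registry import RegistryDefaults
+
+            butler.registry.defaults = RegistryDefaults(
+                collections=["HSC/defaults"], run="u/example/run"
+            )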
+ """
+ return self._registry.defaults.collections
+
+ @property
+ def run(self) -> str | None:
+ """Name of the run this butler writes outputs to by default (`str` or
+ `None`).
+
+ This is an alias for ``self.registry.defaults.run``. It cannot be set
+ directly in isolation, but all defaults may be changed together by
+ assigning a new `RegistryDefaults` instance to
+ ``self.registry.defaults``.
+ """
+ return self._registry.defaults.run
+
+ @property
+ def registry(self) -> Registry:
+ """The object that manages dataset metadata and relationships
+ (`Registry`).
+
+ Many operations that don't involve reading or writing butler datasets
+ are accessible only via `Registry` methods. Eventually these methods
+ will be replaced by equivalent `Butler` methods.
+ """
+ return self._registry_shim
+
+ @property
+ def dimensions(self) -> DimensionUniverse:
+ # Docstring inherited.
+ return self._registry.dimensions
+
+ _registry: _ButlerRegistry
+ """The object that manages dataset metadata and relationships
+ (`_ButlerRegistry`).
+
+ Most operations that don't involve reading or writing butler datasets are
+ accessible only via `Registry` methods.
+ """
+
+ datastore: Datastore
+ """The object that manages actual dataset storage (`Datastore`).
+
+ Direct user access to the datastore should rarely be necessary; the primary
+ exception is the case where a `Datastore` implementation provides extra
+ functionality beyond what the base class defines.
+ """
+
+ storageClasses: StorageClassFactory
+ """An object that maps known storage class names to objects that fully
+ describe them (`StorageClassFactory`).
+ """
diff --git a/python/lsst/daf/butler/script/_associate.py b/python/lsst/daf/butler/script/_associate.py
index ef6ceb878d..5e1943b981 100644
--- a/python/lsst/daf/butler/script/_associate.py
+++ b/python/lsst/daf/butler/script/_associate.py
@@ -42,7 +42,7 @@ def associate(
find_first: bool,
) -> None:
"""Add existing datasets to a CHAINED collection."""
- butler = Butler(repo, writeable=True)
+ butler = Butler.from_config(repo, writeable=True)
butler.registry.registerCollection(collection, CollectionType.TAGGED)
diff --git a/python/lsst/daf/butler/script/_pruneDatasets.py b/python/lsst/daf/butler/script/_pruneDatasets.py
index 9b1c318af2..17f27d9bc7 100644
--- a/python/lsst/daf/butler/script/_pruneDatasets.py
+++ b/python/lsst/daf/butler/script/_pruneDatasets.py
@@ -218,7 +218,7 @@ def pruneDatasets(
if not collections:
return PruneDatasetsResult(state=PruneDatasetsResult.State.ERR_NO_COLLECTION_RESTRICTION)
- butler = Butler(repo)
+ butler = Butler.from_config(repo)
# If purging, verify that the collection to purge is RUN type collection.
if purge_run:
@@ -253,7 +253,7 @@ def pruneDatasets(
return result
def doPruneDatasets() -> PruneDatasetsResult:
- butler = Butler(repo, writeable=True)
+ butler = Butler.from_config(repo, writeable=True)
butler.pruneDatasets(
refs=datasets_found.getDatasets(),
disassociate=disassociate,
diff --git a/python/lsst/daf/butler/script/butlerImport.py b/python/lsst/daf/butler/script/butlerImport.py
index 37aba0662c..a4af72b85a 100644
--- a/python/lsst/daf/butler/script/butlerImport.py
+++ b/python/lsst/daf/butler/script/butlerImport.py
@@ -59,7 +59,7 @@ def butlerImport(
skip_dimensions : `list`, or `None`
Dimensions that should be skipped.
"""
- butler = Butler(repo, writeable=True)
+ butler = Butler.from_config(repo, writeable=True)
if skip_dimensions is not None:
skip_dimensions = set(skip_dimensions)
diff --git a/python/lsst/daf/butler/script/certifyCalibrations.py b/python/lsst/daf/butler/script/certifyCalibrations.py
index 6f99f0fc06..42bdb53458 100644
--- a/python/lsst/daf/butler/script/certifyCalibrations.py
+++ b/python/lsst/daf/butler/script/certifyCalibrations.py
@@ -69,7 +69,7 @@ def certifyCalibrations(
Search all children of the inputCollection if it is a CHAINED
collection, instead of just the most recent one.
"""
- butler = Butler(repo, writeable=True, without_datastore=True)
+ butler = Butler.from_config(repo, writeable=True, without_datastore=True)
registry = butler.registry
timespan = Timespan(
begin=astropy.time.Time(begin_date, scale="tai") if begin_date is not None else None,
diff --git a/python/lsst/daf/butler/script/collectionChain.py b/python/lsst/daf/butler/script/collectionChain.py
index ba6d53ecd5..888baede11 100644
--- a/python/lsst/daf/butler/script/collectionChain.py
+++ b/python/lsst/daf/butler/script/collectionChain.py
@@ -71,7 +71,7 @@ def collectionChain(
chain : `tuple` of `str`
The collections in the chain following this command.
"""
- butler = Butler(repo, writeable=True, without_datastore=True)
+ butler = Butler.from_config(repo, writeable=True, without_datastore=True)
# Every mode needs children except pop.
if not children and mode != "pop":
diff --git a/python/lsst/daf/butler/script/configValidate.py b/python/lsst/daf/butler/script/configValidate.py
index 83b6c0b921..2f71319302 100644
--- a/python/lsst/daf/butler/script/configValidate.py
+++ b/python/lsst/daf/butler/script/configValidate.py
@@ -52,7 +52,7 @@ def configValidate(repo: str, quiet: bool, dataset_type: list[str], ignore: list
error.
"""
logFailures = not quiet
- butler = Butler(config=repo)
+ butler = Butler.from_config(config=repo)
is_good = True
try:
butler.validateConfiguration(logFailures=logFailures, datasetTypeNames=dataset_type, ignore=ignore)
diff --git a/python/lsst/daf/butler/script/exportCalibs.py b/python/lsst/daf/butler/script/exportCalibs.py
index ae28e8b35d..1406f0a132 100644
--- a/python/lsst/daf/butler/script/exportCalibs.py
+++ b/python/lsst/daf/butler/script/exportCalibs.py
@@ -122,7 +122,7 @@ def exportCalibs(
RuntimeError :
Raised if the output directory already exists.
"""
- butler = Butler(repo, writeable=False)
+ butler = Butler.from_config(repo, writeable=False)
dataset_type_query = dataset_type or ...
collections_query = collections or ...
diff --git a/python/lsst/daf/butler/script/ingest_files.py b/python/lsst/daf/butler/script/ingest_files.py
index df51f25da6..e4e645229b 100644
--- a/python/lsst/daf/butler/script/ingest_files.py
+++ b/python/lsst/daf/butler/script/ingest_files.py
@@ -105,7 +105,7 @@ def ingest_files(
id_gen_mode = DatasetIdGenEnum.__members__[id_generation_mode]
# Create the butler with the relevant run attached.
- butler = Butler(repo, run=run)
+ butler = Butler.from_config(repo, run=run)
datasetType = butler.registry.getDatasetType(dataset_type)
diff --git a/python/lsst/daf/butler/script/queryCollections.py b/python/lsst/daf/butler/script/queryCollections.py
index a0977d1d97..4358d23c7d 100644
--- a/python/lsst/daf/butler/script/queryCollections.py
+++ b/python/lsst/daf/butler/script/queryCollections.py
@@ -68,7 +68,7 @@ def _getTable(
names=("Name", typeCol, descriptionCol),
dtype=(str, str, str),
)
- butler = Butler(repo)
+ butler = Butler.from_config(repo)
names = sorted(
butler.registry.queryCollections(collectionTypes=frozenset(collection_type), expression=glob or ...)
)
@@ -140,7 +140,7 @@ def _getTree(
names=("Name", "Type"),
dtype=(str, str),
)
- butler = Butler(repo, without_datastore=True)
+ butler = Butler.from_config(repo, without_datastore=True)
def addCollection(name: str, level: int = 0) -> None:
collectionType = butler.registry.getCollectionType(name)
@@ -168,7 +168,7 @@ def _getFlatten(
glob: Iterable[str],
collection_type: Iterable[CollectionType],
) -> Table:
- butler = Butler(repo)
+ butler = Butler.from_config(repo)
collectionNames = list(
butler.registry.queryCollections(
collectionTypes=frozenset(collection_type), flattenChains=True, expression=glob or ...
diff --git a/python/lsst/daf/butler/script/queryDataIds.py b/python/lsst/daf/butler/script/queryDataIds.py
index cb70f114d3..415d2652d8 100644
--- a/python/lsst/daf/butler/script/queryDataIds.py
+++ b/python/lsst/daf/butler/script/queryDataIds.py
@@ -34,8 +34,9 @@
import numpy as np
from astropy.table import Table as AstropyTable
-from .._butler import Butler, DataCoordinate
+from .._butler import Butler
from ..cli.utils import sortAstropyTable
+from ..dimensions import DataCoordinate
if TYPE_CHECKING:
from lsst.daf.butler import DimensionGraph
@@ -109,7 +110,7 @@ def queryDataIds(
Docstring for supported parameters is the same as
`~lsst.daf.butler.Registry.queryDataIds`.
"""
- butler = Butler(repo, without_datastore=True)
+ butler = Butler.from_config(repo, without_datastore=True)
if datasets and collections and not dimensions:
# Determine the dimensions relevant to all given dataset types.
diff --git a/python/lsst/daf/butler/script/queryDatasetTypes.py b/python/lsst/daf/butler/script/queryDatasetTypes.py
index 4c1eafd5e2..efe9aeaeb0 100644
--- a/python/lsst/daf/butler/script/queryDatasetTypes.py
+++ b/python/lsst/daf/butler/script/queryDatasetTypes.py
@@ -61,7 +61,7 @@ def queryDatasetTypes(repo: str, verbose: bool, glob: Iterable[str], components:
A dict whose key is "datasetTypes" and whose value is a list of
collection names.
"""
- butler = Butler(repo, without_datastore=True)
+ butler = Butler.from_config(repo, without_datastore=True)
expression = glob or ...
datasetTypes = butler.registry.queryDatasetTypes(components=components, expression=expression)
if verbose:
diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py
index e6b17a79ca..4a7cac38f3 100644
--- a/python/lsst/daf/butler/script/queryDatasets.py
+++ b/python/lsst/daf/butler/script/queryDatasets.py
@@ -175,7 +175,7 @@ def __init__(
raise RuntimeError("One of repo and butler must be provided and the other must be None.")
# show_uri requires a datastore.
without_datastore = not show_uri
- self.butler = butler or Butler(repo, without_datastore=without_datastore)
+ self.butler = butler or Butler.from_config(repo, without_datastore=without_datastore)
self._getDatasets(glob, collections, where, find_first)
self.showUri = show_uri
diff --git a/python/lsst/daf/butler/script/queryDimensionRecords.py b/python/lsst/daf/butler/script/queryDimensionRecords.py
index 8f26af86be..88197cf2bf 100644
--- a/python/lsst/daf/butler/script/queryDimensionRecords.py
+++ b/python/lsst/daf/butler/script/queryDimensionRecords.py
@@ -54,7 +54,7 @@ def queryDimensionRecords(
`~lsst.daf.butler.Registry.queryDimensionRecords` except for ``no_check``,
which is the inverse of ``check``.
"""
- butler = Butler(repo, without_datastore=True)
+ butler = Butler.from_config(repo, without_datastore=True)
query_collections: Iterable[str] | EllipsisType | None = None
if datasets:
diff --git a/python/lsst/daf/butler/script/register_dataset_type.py b/python/lsst/daf/butler/script/register_dataset_type.py
index 4de6f31a6d..f46fda8817 100644
--- a/python/lsst/daf/butler/script/register_dataset_type.py
+++ b/python/lsst/daf/butler/script/register_dataset_type.py
@@ -69,7 +69,7 @@ def register_dataset_type(
be created by this command. They are always derived from the composite
dataset type.
"""
- butler = Butler(repo, writeable=True, without_datastore=True)
+ butler = Butler.from_config(repo, writeable=True, without_datastore=True)
composite, component = DatasetType.splitDatasetTypeName(dataset_type)
if component:
diff --git a/python/lsst/daf/butler/script/removeCollections.py b/python/lsst/daf/butler/script/removeCollections.py
index e0ee80e21d..8dc49015ed 100644
--- a/python/lsst/daf/butler/script/removeCollections.py
+++ b/python/lsst/daf/butler/script/removeCollections.py
@@ -82,7 +82,7 @@ def _getCollectionInfo(
collectionInfo : `CollectionInfo`
Contains tables with run and non-run collection info.
"""
- butler = Butler(repo, without_datastore=True)
+ butler = Butler.from_config(repo, without_datastore=True)
try:
names = sorted(
butler.registry.queryCollections(
@@ -135,7 +135,7 @@ def removeCollections(
def doRemove(collections: Table) -> None:
"""Perform the prune collection step."""
- butler = Butler(repo, writeable=True, without_datastore=True)
+ butler = Butler.from_config(repo, writeable=True, without_datastore=True)
for name in collections["Collection"]:
butler.registry.removeCollection(name)
diff --git a/python/lsst/daf/butler/script/removeDatasetType.py b/python/lsst/daf/butler/script/removeDatasetType.py
index 3279a6cc6e..4fe9e020b3 100644
--- a/python/lsst/daf/butler/script/removeDatasetType.py
+++ b/python/lsst/daf/butler/script/removeDatasetType.py
@@ -43,5 +43,5 @@ def removeDatasetType(repo: str, dataset_type_name: tuple[str, ...]) -> None:
datasetTypeName : `str`
The name of the dataset type to be removed.
"""
- butler = Butler(repo, writeable=True, without_datastore=True)
+ butler = Butler.from_config(repo, writeable=True, without_datastore=True)
butler.registry.removeDatasetType(dataset_type_name)
diff --git a/python/lsst/daf/butler/script/removeRuns.py b/python/lsst/daf/butler/script/removeRuns.py
index 8259f9984e..1186e53b05 100644
--- a/python/lsst/daf/butler/script/removeRuns.py
+++ b/python/lsst/daf/butler/script/removeRuns.py
@@ -85,7 +85,7 @@ def _getCollectionInfo(
datasets : `dict` [`str`, `int`]
        The dataset types and how many will be removed.
"""
- butler = Butler(repo)
+ butler = Butler.from_config(repo)
try:
collectionNames = list(
butler.registry.queryCollections(
@@ -132,7 +132,7 @@ def removeRuns(
def doRemove(runs: Sequence[RemoveRun]) -> None:
"""Perform the remove step."""
- butler = Butler(repo, writeable=True)
+ butler = Butler.from_config(repo, writeable=True)
with butler.transaction():
for run in runs:
for parent in run.parents:
diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py
index 10edf446ac..01a4d4a11f 100644
--- a/python/lsst/daf/butler/script/retrieveArtifacts.py
+++ b/python/lsst/daf/butler/script/retrieveArtifacts.py
@@ -86,7 +86,7 @@ def retrieveArtifacts(
query_types = dataset_type or ...
query_collections: tuple[str, ...] | EllipsisType = collections or ...
- butler = Butler(repo, writeable=False)
+ butler = Butler.from_config(repo, writeable=False)
# Need to store in list so we can count the number to give some feedback
# to caller.
diff --git a/python/lsst/daf/butler/script/transferDatasets.py b/python/lsst/daf/butler/script/transferDatasets.py
index c63835e109..845f37b87d 100644
--- a/python/lsst/daf/butler/script/transferDatasets.py
+++ b/python/lsst/daf/butler/script/transferDatasets.py
@@ -74,8 +74,8 @@ def transferDatasets(
datasets. It can be more efficient to disable this if it is known
that all dimensions exist.
"""
- source_butler = Butler(source, writeable=False)
- dest_butler = Butler(dest, writeable=True)
+ source_butler = Butler.from_config(source, writeable=False)
+ dest_butler = Butler.from_config(dest, writeable=True)
dataset_type_expr = dataset_type or ...
collections_expr: tuple[str, ...] | EllipsisType = collections or ...
diff --git a/python/lsst/daf/butler/server.py b/python/lsst/daf/butler/server.py
index 7ee3a387f5..1839838954 100644
--- a/python/lsst/daf/butler/server.py
+++ b/python/lsst/daf/butler/server.py
@@ -84,21 +84,21 @@ def _generate_next_value_(name, start, count, last_values) -> str: # type: igno
def _make_global_butler() -> None:
global GLOBAL_READONLY_BUTLER, GLOBAL_READWRITE_BUTLER
if GLOBAL_READONLY_BUTLER is None:
- GLOBAL_READONLY_BUTLER = Butler(BUTLER_ROOT, writeable=False)
+ GLOBAL_READONLY_BUTLER = Butler.from_config(BUTLER_ROOT, writeable=False)
if GLOBAL_READWRITE_BUTLER is None:
- GLOBAL_READWRITE_BUTLER = Butler(BUTLER_ROOT, writeable=True)
+ GLOBAL_READWRITE_BUTLER = Butler.from_config(BUTLER_ROOT, writeable=True)
def butler_readonly_dependency() -> Butler:
"""Return global read-only butler."""
_make_global_butler()
- return Butler(butler=GLOBAL_READONLY_BUTLER)
+ return Butler.from_config(butler=GLOBAL_READONLY_BUTLER)
def butler_readwrite_dependency() -> Butler:
"""Return read-write butler."""
_make_global_butler()
- return Butler(butler=GLOBAL_READWRITE_BUTLER)
+ return Butler.from_config(butler=GLOBAL_READWRITE_BUTLER)
def unpack_dataId(butler: Butler, data_id: SerializedDataCoordinate | None) -> DataCoordinate | None:
diff --git a/python/lsst/daf/butler/tests/_testRepo.py b/python/lsst/daf/butler/tests/_testRepo.py
index af121db9e6..eba08df974 100644
--- a/python/lsst/daf/butler/tests/_testRepo.py
+++ b/python/lsst/daf/butler/tests/_testRepo.py
@@ -116,7 +116,7 @@ def makeTestRepo(
# not be ignored.
# newConfig guards against location-related keywords like outfile
newConfig = Butler.makeRepo(root, config=defaults, forceConfigRoot=False, **kwargs)
- butler = Butler(newConfig, writeable=True)
+ butler = Butler.from_config(newConfig, writeable=True)
dimensionRecords = _makeRecords(dataIds, butler.dimensions)
for dimension, records in dimensionRecords.items():
if butler.dimensions[dimension].viewOf is None:
@@ -154,7 +154,7 @@ def makeTestCollection(repo: Butler, uniqueId: str | None = None) -> Butler:
# Speed matters more than cryptographic guarantees
uniqueId = str(random.randrange(1_000_000_000))
collection = "test_" + uniqueId
- return Butler(butler=repo, run=collection)
+ return Butler.from_config(butler=repo, run=collection)
def _makeRecords(dataIds: Mapping[str, Iterable], universe: DimensionUniverse) -> Mapping[str, Iterable]:
diff --git a/python/lsst/daf/butler/tests/utils.py b/python/lsst/daf/butler/tests/utils.py
index 802498a0d0..fe1ccc7965 100644
--- a/python/lsst/daf/butler/tests/utils.py
+++ b/python/lsst/daf/butler/tests/utils.py
@@ -243,7 +243,7 @@ def __init__(self, root: str, configFile: str) -> None:
# tag when looking up datasets.
run = "ingest/run"
tag = "ingest"
- self.butler = Butler(butlerConfigFile, run=run, collections=[tag])
+ self.butler = Butler.from_config(butlerConfigFile, run=run, collections=[tag])
self.butler.registry.registerCollection(tag, CollectionType.TAGGED)
# Create and register a DatasetType
diff --git a/tests/test_butler.py b/tests/test_butler.py
index 9d185221ce..0dda39a51f 100644
--- a/tests/test_butler.py
+++ b/tests/test_butler.py
@@ -87,6 +87,7 @@ def mock_s3(*args: Any, **kwargs: Any) -> Any: # type: ignore[no-untyped-def]
from lsst.daf.butler.datastore import NullDatastore
from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError
from lsst.daf.butler.datastores.fileDatastore import FileDatastore
+from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registries.sql import SqlRegistry
from lsst.daf.butler.registry import (
CollectionError,
@@ -210,8 +211,9 @@ def tearDown(self) -> None:
def create_butler(
self, run: str, storageClass: StorageClass | str, datasetTypeName: str
- ) -> tuple[Butler, DatasetType]:
- butler = Butler(self.tmpConfigFile, run=run)
+ ) -> tuple[DirectButler, DatasetType]:
+ butler = Butler.from_config(self.tmpConfigFile, run=run)
+ assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
collections = set(butler.registry.queryCollections())
self.assertEqual(collections, {run})
@@ -258,7 +260,7 @@ def create_butler(
)
return butler, datasetType
- def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Butler:
+ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> DirectButler:
# New datasets will be added to run and tag, but we will only look in
# tag when looking up datasets.
run = self.default_run
@@ -512,7 +514,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> But
def testDeferredCollectionPassing(self) -> None:
# Construct a butler with no run or collection, but make it writeable.
- butler = Butler(self.tmpConfigFile, writeable=True)
+ butler = Butler.from_config(self.tmpConfigFile, writeable=True)
# Create and register a DatasetType
dimensions = butler.dimensions.extract(["instrument", "visit"])
datasetType = self.addDatasetType(
@@ -576,17 +578,17 @@ def setUp(self) -> None:
def testConstructor(self) -> None:
"""Independent test of constructor."""
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
self.assertIsInstance(butler, Butler)
# Check that butler.yaml is added automatically.
if self.tmpConfigFile.endswith(end := "/butler.yaml"):
config_dir = self.tmpConfigFile[: -len(end)]
- butler = Butler(config_dir, run=self.default_run)
+ butler = Butler.from_config(config_dir, run=self.default_run)
self.assertIsInstance(butler, Butler)
# Even with a ResourcePath.
- butler = Butler(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
+ butler = Butler.from_config(ResourcePath(config_dir, forceDirectory=True), run=self.default_run)
self.assertIsInstance(butler, Butler)
collections = set(butler.registry.queryCollections())
@@ -594,11 +596,11 @@ def testConstructor(self) -> None:
# Check that some special characters can be included in run name.
special_run = "u@b.c-A"
- butler_special = Butler(butler=butler, run=special_run)
+ butler_special = Butler.from_config(butler=butler, run=special_run)
collections = set(butler_special.registry.queryCollections("*@*"))
self.assertEqual(collections, {special_run})
- butler2 = Butler(butler=butler, collections=["other"])
+ butler2 = Butler.from_config(butler=butler, collections=["other"])
self.assertEqual(butler2.collections, ("other",))
self.assertIsNone(butler2.run)
self.assertIs(butler._datastore, butler2._datastore)
@@ -619,17 +621,17 @@ def testConstructor(self) -> None:
uri = Butler.get_repo_uri("bad_label")
self.assertEqual(uri, ResourcePath(bad_label))
uri = Butler.get_repo_uri("label")
- butler = Butler(uri, writeable=False)
+ butler = Butler.from_config(uri, writeable=False)
self.assertIsInstance(butler, Butler)
- butler = Butler("label", writeable=False)
+ butler = Butler.from_config("label", writeable=False)
self.assertIsInstance(butler, Butler)
with self.assertRaisesRegex(FileNotFoundError, "aliases:.*bad_label"):
- Butler("not_there", writeable=False)
+ Butler.from_config("not_there", writeable=False)
with self.assertRaisesRegex(FileNotFoundError, "resolved from alias 'bad_label'"):
- Butler("bad_label")
+ Butler.from_config("bad_label")
with self.assertRaises(FileNotFoundError):
# Should ignore aliases.
- Butler(ResourcePath("label", forceAbsolute=False))
+ Butler.from_config(ResourcePath("label", forceAbsolute=False))
with self.assertRaises(KeyError) as cm:
Butler.get_repo_uri("missing")
self.assertEqual(
@@ -644,24 +646,24 @@ def testConstructor(self) -> None:
butler_index.dumpToUri(temp_file)
with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
with self.assertRaisesRegex(FileNotFoundError, "(no known aliases)"):
- Butler("label")
+ Butler.from_config("label")
with ResourcePath.temporary_uri(suffix=suffix) as temp_file:
# Now with bad contents.
with open(temp_file.ospath, "w") as fh:
print("'", file=fh)
with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": str(temp_file)}):
with self.assertRaisesRegex(FileNotFoundError, "(no known aliases:.*could not be read)"):
- Butler("label")
+ Butler.from_config("label")
with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORY_INDEX": "file://not_found/x.yaml"}):
with self.assertRaises(FileNotFoundError):
Butler.get_repo_uri("label")
self.assertEqual(Butler.get_known_repos(), set())
with self.assertRaisesRegex(FileNotFoundError, "index file not found"):
- Butler("label")
+ Butler.from_config("label")
# Check that we can create Butler when the alias file is not found.
- butler = Butler(self.tmpConfigFile, writeable=False)
+ butler = Butler.from_config(self.tmpConfigFile, writeable=False)
self.assertIsInstance(butler, Butler)
with self.assertRaises(KeyError) as cm:
# No environment variable set.
@@ -670,7 +672,7 @@ def testConstructor(self) -> None:
self.assertIn("No repository index defined", str(cm.exception))
with self.assertRaisesRegex(FileNotFoundError, "no known aliases.*No repository index"):
# No aliases registered.
- Butler("not_there")
+ Butler.from_config("not_there")
self.assertEqual(Butler.get_known_repos(), set())
def testBasicPutGet(self) -> None:
@@ -842,7 +844,7 @@ def testPytypePutCoercion(self) -> None:
self.assertEqual(get_full_type_name(test_dict3), "dict")
def testIngest(self) -> None:
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
# Create and register a DatasetType
dimensions = butler.dimensions.extract(["instrument", "visit", "detector"])
@@ -994,7 +996,8 @@ def testIngest(self) -> None:
def testPickle(self) -> None:
"""Test pickle support."""
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
+ assert isinstance(butler, DirectButler), "Expect DirectButler in configuration"
butlerOut = pickle.loads(pickle.dumps(butler))
self.assertIsInstance(butlerOut, Butler)
self.assertEqual(butlerOut._config, butler._config)
@@ -1002,7 +1005,7 @@ def testPickle(self) -> None:
self.assertEqual(butlerOut.run, butler.run)
def testGetDatasetTypes(self) -> None:
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
dimensions = butler.dimensions.extract(["instrument", "visit", "physical_filter"])
dimensionEntries: list[tuple[str, list[Mapping[str, Any]]]] = [
(
@@ -1076,7 +1079,7 @@ def testGetDatasetTypes(self) -> None:
)
def testTransaction(self) -> None:
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
datasetTypeName = "test_metric"
dimensions = butler.dimensions.extract(["instrument", "visit"])
dimensionEntries: tuple[tuple[str, Mapping[str, Any]], ...] = (
@@ -1133,10 +1136,12 @@ def testMakeRepo(self) -> None:
butlerConfig = Butler.makeRepo(root1, config=Config(self.configFile))
limited = Config(self.configFile)
- butler1 = Butler(butlerConfig)
+ butler1 = Butler.from_config(butlerConfig)
+ assert isinstance(butler1, DirectButler), "Expect DirectButler in configuration"
butlerConfig = Butler.makeRepo(root2, standalone=True, config=Config(self.configFile))
full = Config(self.tmpConfigFile)
- butler2 = Butler(butlerConfig)
+ butler2 = Butler.from_config(butlerConfig)
+ assert isinstance(butler2, DirectButler), "Expect DirectButler in configuration"
# Butlers should have the same configuration regardless of whether
# defaults were expanded.
self.assertEqual(butler1._config, butler2._config)
@@ -1156,13 +1161,13 @@ def testMakeRepo(self) -> None:
# work properly with relocatable Butler repo
butlerConfig.configFile = None
with self.assertRaises(ValueError):
- Butler(butlerConfig)
+ Butler.from_config(butlerConfig)
with self.assertRaises(FileExistsError):
Butler.makeRepo(self.root, standalone=True, config=Config(self.configFile), overwrite=False)
def testStringification(self) -> None:
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
butlerStr = str(butler)
if self.datastoreStr is not None:
@@ -1178,7 +1183,7 @@ def testStringification(self) -> None:
def testButlerRewriteDataId(self) -> None:
"""Test that dataIds can be rewritten based on dimension records."""
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
datasetTypeName = "random_data"
@@ -1244,7 +1249,7 @@ def checkFileExists(self, root: str | ResourcePath, relpath: str | ResourcePath)
def testPutTemplates(self) -> None:
storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
# Add needed Dimensions
butler.registry.insertDimensionData("instrument", {"name": "DummyCamComp"})
@@ -1380,7 +1385,7 @@ def runImportExportTest(self, storageClass: StorageClass) -> None:
transfer="auto",
skip_dimensions=None,
)
- importButler = Butler(importDir, run=self.default_run)
+ importButler = Butler.from_config(importDir, run=self.default_run)
for ref in datasets:
with self.subTest(ref=ref):
# Test for existence by passing in the DatasetType and
@@ -1393,7 +1398,7 @@ def runImportExportTest(self, storageClass: StorageClass) -> None:
def testRemoveRuns(self) -> None:
storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
- butler = Butler(self.tmpConfigFile, writeable=True)
+ butler = Butler.from_config(self.tmpConfigFile, writeable=True)
# Load registry data with dimensions to hang datasets off of.
registryDataDir = os.path.normpath(os.path.join(os.path.dirname(__file__), "data", "registry"))
butler.import_(filename=os.path.join(registryDataDir, "base.yaml"))
@@ -1453,12 +1458,12 @@ class PosixDatastoreButlerTestCase(FileDatastoreButlerTests, unittest.TestCase):
def testPathConstructor(self) -> None:
"""Independent test of constructor using PathLike."""
- butler = Butler(self.tmpConfigFile, run=self.default_run)
+ butler = Butler.from_config(self.tmpConfigFile, run=self.default_run)
self.assertIsInstance(butler, Butler)
# And again with a Path object with the butler yaml
path = pathlib.Path(self.tmpConfigFile)
- butler = Butler(path, writeable=False)
+ butler = Butler.from_config(path, writeable=False)
self.assertIsInstance(butler, Butler)
# And again with a Path object without the butler yaml
@@ -1466,7 +1471,7 @@ def testPathConstructor(self) -> None:
# in butler.yaml -- which is the case for a subclass)
if self.tmpConfigFile.endswith("butler.yaml"):
path = pathlib.Path(os.path.dirname(self.tmpConfigFile))
- butler = Butler(path, writeable=False)
+ butler = Butler.from_config(path, writeable=False)
self.assertIsInstance(butler, Butler)
def testExportTransferCopy(self) -> None:
@@ -1500,7 +1505,7 @@ def testExportTransferCopy(self) -> None:
def testPruneDatasets(self) -> None:
storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents")
- butler = Butler(self.tmpConfigFile, writeable=True)
+ butler = Butler.from_config(self.tmpConfigFile, writeable=True)
assert isinstance(butler._datastore, FileDatastore)
# Load registry data with dimensions to hang datasets off of.
registryDataDir = os.path.normpath(os.path.join(TESTDIR, "data", "registry"))
@@ -2064,7 +2069,9 @@ def tearDown(self) -> None:
def create_butler(self, manager: str, label: str) -> Butler:
config = Config(self.configFile)
config["registry", "managers", "datasets"] = manager
- return Butler(Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True)
+ return Butler.from_config(
+ Butler.makeRepo(f"{self.root}/butler{label}", config=config), writeable=True
+ )
def create_butlers(self, manager1: str | None = None, manager2: str | None = None) -> None:
default = "lsst.daf.butler.registry.datasets.byDimensions.ByDimensionsDatasetRecordStorageManagerUUID"
@@ -2192,7 +2199,7 @@ def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "St
# we are rewriting integer dataset ids in the target if necessary.
# Will not be relevant for UUID.
run = "distraction"
- butler = Butler(butler=self.source_butler, run=run)
+ butler = Butler.from_config(butler=self.source_butler, run=run)
butler.put(
makeExampleMetrics(),
datasetTypeName,
@@ -2202,7 +2209,7 @@ def assertButlerTransfers(self, purge: bool = False, storageClassName: str = "St
)
# Write some example metrics to the source
- butler = Butler(butler=self.source_butler)
+ butler = Butler.from_config(butler=self.source_butler)
# Set of DatasetRefs that should be in the list of refs to transfer
# but which will not be transferred.
@@ -2383,9 +2390,9 @@ def test_fallback(self) -> None:
bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore"
with self.assertRaises(RuntimeError):
- Butler(bad_config)
+ Butler.from_config(bad_config)
- butler = Butler(bad_config, writeable=True, without_datastore=True)
+ butler = Butler.from_config(bad_config, writeable=True, without_datastore=True)
self.assertIsInstance(butler._datastore, NullDatastore)
# Check that registry is working.
diff --git a/tests/test_cliCmdIngestFiles.py b/tests/test_cliCmdIngestFiles.py
index fbb48f7ef0..29b9730297 100644
--- a/tests/test_cliCmdIngestFiles.py
+++ b/tests/test_cliCmdIngestFiles.py
@@ -104,7 +104,7 @@ def assertIngest(self, table, options):
)
self.assertEqual(result.exit_code, 0, clickResultMsg(result))
- butler = Butler(self.root)
+ butler = Butler.from_config(self.root)
refs = list(butler.registry.queryDatasets("test_metric_comp", collections=run))
self.assertEqual(len(refs), 2)
diff --git a/tests/test_cliCmdPruneDatasets.py b/tests/test_cliCmdPruneDatasets.py
index 7d4c5901a9..e77961994d 100644
--- a/tests/test_cliCmdPruneDatasets.py
+++ b/tests/test_cliCmdPruneDatasets.py
@@ -35,7 +35,6 @@
import lsst.daf.butler.registries.sql
import lsst.daf.butler.script
from astropy.table import Table
-from lsst.daf.butler import Butler
from lsst.daf.butler.cli.butler import cli as butlerCli
from lsst.daf.butler.cli.cmd.commands import (
pruneDatasets_askContinueMsg,
@@ -54,6 +53,7 @@
pruneDatasets_wouldRemoveMsg,
)
from lsst.daf.butler.cli.utils import LogCliRunner, astropyTablesToStr, clickResultMsg
+from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import CollectionType
from lsst.daf.butler.script import QueryDatasets
@@ -118,7 +118,7 @@ def makePruneDatasetsArgs(**kwargs):
@patch.object(lsst.daf.butler.script._pruneDatasets, "QueryDatasets", side_effect=makeQueryDatasets)
# Mock the pruneDatasets butler command so we can test for expected calls
# to it, without dealing with setting up a full repo with data for it.
- @patch.object(Butler, "pruneDatasets")
+ @patch.object(DirectButler, "pruneDatasets")
def run_test(
self,
mockPruneDatasets,
diff --git a/tests/test_cliCmdQueryCollections.py b/tests/test_cliCmdQueryCollections.py
index 1d88b40e1d..47eeb16cfa 100644
--- a/tests/test_cliCmdQueryCollections.py
+++ b/tests/test_cliCmdQueryCollections.py
@@ -98,7 +98,7 @@ def testGetCollections(self):
with self.runner.isolated_filesystem():
butlerCfg = Butler.makeRepo("here")
# the purpose of this call is to create some collections
- butler = Butler(butlerCfg, run=run, collections=[tag], writeable=True)
+ butler = Butler.from_config(butlerCfg, run=run, collections=[tag], writeable=True)
butler.registry.registerCollection(tag, CollectionType.TAGGED)
# Verify collections that were created are found by
@@ -140,7 +140,7 @@ def testChained(self):
# Create a butler and add some chained collections:
butlerCfg = Butler.makeRepo("here")
- butler1 = Butler(butlerCfg, writeable=True)
+ butler1 = Butler.from_config(butlerCfg, writeable=True)
# Replace datastore functions with mocks:
DatastoreMock.apply(butler1)
diff --git a/tests/test_cliCmdQueryDataIds.py b/tests/test_cliCmdQueryDataIds.py
index f0535ab2ac..56cfa69e49 100644
--- a/tests/test_cliCmdQueryDataIds.py
+++ b/tests/test_cliCmdQueryDataIds.py
@@ -70,7 +70,7 @@ def loadData(self, *filenames: str) -> Butler:
"""Load registry test data from ``TESTDIR/data/registry/``,
which should be a YAML import/export file.
"""
- butler = Butler(self.repo, writeable=True)
+ butler = Butler.from_config(self.repo, writeable=True)
for filename in filenames:
with open(os.path.join(TESTDIR, "data", "registry", filename)) as stream:
# Go behind the back of the import code a bit to deal with
diff --git a/tests/test_cliCmdQueryDimensionRecords.py b/tests/test_cliCmdQueryDimensionRecords.py
index 3f982f5789..876a77453d 100644
--- a/tests/test_cliCmdQueryDimensionRecords.py
+++ b/tests/test_cliCmdQueryDimensionRecords.py
@@ -166,7 +166,7 @@ def testWhere(self):
self.assertAstropyTablesEqual(readTable(result.output), expected)
def testCollection(self):
- butler = Butler(self.root, run="foo")
+ butler = Butler.from_config(self.root, run="foo")
# try replacing the testRepo's butler with the one with the "foo" run.
self.testRepo.butler = butler
@@ -273,7 +273,7 @@ def testCollection(self):
self.assertAstropyTablesEqual(readTable(result.output), expected)
def testSkymap(self):
- butler = Butler(self.root, run="foo")
+ butler = Butler.from_config(self.root, run="foo")
# try replacing the testRepo's butler with the one with the "foo" run.
self.testRepo.butler = butler
diff --git a/tests/test_cliCmdRemoveCollections.py b/tests/test_cliCmdRemoveCollections.py
index 080e78816e..ec20e316f5 100644
--- a/tests/test_cliCmdRemoveCollections.py
+++ b/tests/test_cliCmdRemoveCollections.py
@@ -220,7 +220,7 @@ def testRemoveCmd(self):
# verify chained-run-1 was removed:
- butler = Butler(self.root)
+ butler = Butler.from_config(self.root)
collections = butler.registry.queryCollections(
collectionTypes=frozenset(
(
diff --git a/tests/test_logFormatter.py b/tests/test_logFormatter.py
index 8f3d0a4d1d..a166ebadfc 100644
--- a/tests/test_logFormatter.py
+++ b/tests/test_logFormatter.py
@@ -49,7 +49,7 @@ def setUp(self):
Butler.makeRepo(self.root)
self.run = "testrun"
- self.butler = Butler(self.root, run=self.run)
+ self.butler = Butler.from_config(self.root, run=self.run)
self.datasetType = DatasetType("test_logs", [], "ButlerLogRecords", universe=self.butler.dimensions)
self.butler.registry.registerDatasetType(self.datasetType)
diff --git a/tests/test_matplotlibFormatter.py b/tests/test_matplotlibFormatter.py
index 8851d095f9..78b5f887d8 100644
--- a/tests/test_matplotlibFormatter.py
+++ b/tests/test_matplotlibFormatter.py
@@ -65,7 +65,7 @@ def tearDown(self):
removeTestTempDir(self.root)
def testMatplotlibFormatter(self):
- butler = Butler(self.root, run="testrun")
+ butler = Butler.from_config(self.root, run="testrun")
datasetType = DatasetType("test_plot", [], "Plot", universe=butler.dimensions)
butler.registry.registerDatasetType(datasetType)
# Does not have to be a random image
diff --git a/tests/test_packages.py b/tests/test_packages.py
index 16b395c93f..1f602304ad 100644
--- a/tests/test_packages.py
+++ b/tests/test_packages.py
@@ -45,7 +45,7 @@ def setUp(self):
"""Create a new butler root for each test."""
self.root = makeTestTempDir(TESTDIR)
Butler.makeRepo(self.root)
- self.butler = Butler(self.root, run="test_run")
+ self.butler = Butler.from_config(self.root, run="test_run")
# No dimensions in dataset type so we don't have to worry about
# inserting dimension data or defining data IDs.
self.datasetType = DatasetType(
diff --git a/tests/test_parquet.py b/tests/test_parquet.py
index 93753cc1c8..b39a0af407 100644
--- a/tests/test_parquet.py
+++ b/tests/test_parquet.py
@@ -306,7 +306,9 @@ def setUp(self):
self.root = makeTestTempDir(TESTDIR)
config = Config(self.configFile)
self.run = "test_run"
- self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
+ self.butler = Butler.from_config(
+ Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
+ )
# No dimensions in dataset type so we don't have to worry about
# inserting dimension data or defining data IDs.
self.datasetType = DatasetType(
@@ -726,7 +728,9 @@ def setUp(self):
self.root = makeTestTempDir(TESTDIR)
config = Config(self.configFile)
self.run = "test_run"
- self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run=self.run)
+ self.butler = Butler.from_config(
+ Butler.makeRepo(self.root, config=config), writeable=True, run=self.run
+ )
# No dimensions in dataset type so we don't have to worry about
# inserting dimension data or defining data IDs.
self.datasetType = DatasetType(
@@ -1053,7 +1057,9 @@ def setUp(self):
"""Create a new butler root for each test."""
self.root = makeTestTempDir(TESTDIR)
config = Config(self.configFile)
- self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+ self.butler = Butler.from_config(
+ Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+ )
# No dimensions in dataset type so we don't have to worry about
# inserting dimension data or defining data IDs.
self.datasetType = DatasetType(
@@ -1313,7 +1319,9 @@ def setUp(self):
"""Create a new butler root for each test."""
self.root = makeTestTempDir(TESTDIR)
config = Config(self.configFile)
- self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+ self.butler = Butler.from_config(
+ Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+ )
# No dimensions in dataset type so we don't have to worry about
# inserting dimension data or defining data IDs.
self.datasetType = DatasetType(
@@ -1634,7 +1642,9 @@ def setUp(self):
"""Create a new butler root for each test."""
self.root = makeTestTempDir(TESTDIR)
config = Config(self.configFile)
- self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+ self.butler = Butler.from_config(
+ Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+ )
# No dimensions in dataset type so we don't have to worry about
# inserting dimension data or defining data IDs.
self.datasetType = DatasetType(
@@ -1787,7 +1797,9 @@ def setUp(self):
"""Create a new butler root for each test."""
self.root = makeTestTempDir(TESTDIR)
config = Config(self.configFile)
- self.butler = Butler(Butler.makeRepo(self.root, config=config), writeable=True, run="test_run")
+ self.butler = Butler.from_config(
+ Butler.makeRepo(self.root, config=config), writeable=True, run="test_run"
+ )
# No dimensions in dataset type so we don't have to worry about
# inserting dimension data or defining data IDs.
self.datasetType = DatasetType(
diff --git a/tests/test_quantumBackedButler.py b/tests/test_quantumBackedButler.py
index 1cf801fdf9..423ee7e083 100644
--- a/tests/test_quantumBackedButler.py
+++ b/tests/test_quantumBackedButler.py
@@ -43,6 +43,7 @@
RegistryConfig,
StorageClass,
)
+from lsst.daf.butler.direct_butler import DirectButler
from lsst.daf.butler.registry import _RegistryFactory
from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir
from lsst.resources import ResourcePath
@@ -62,7 +63,9 @@ def setUp(self) -> None:
# Make a butler and import dimension definitions.
registryConfig = RegistryConfig(self.config.get("registry"))
_RegistryFactory(registryConfig).create_from_config(butlerRoot=self.root)
- self.butler = Butler(self.config, writeable=True, run="RUN")
+ butler = Butler.from_config(self.config, writeable=True, run="RUN")
+ assert isinstance(butler, DirectButler)
+ self.butler = butler
self.butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
# make all dataset types
diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py
index a564e4b08b..483d7d03ef 100644
--- a/tests/test_simpleButler.py
+++ b/tests/test_simpleButler.py
@@ -79,7 +79,7 @@ def makeButler(self, **kwargs: Any) -> Butler:
registryConfig = RegistryConfig(config.get("registry"))
_RegistryFactory(registryConfig).create_from_config()
- butler = Butler(config, **kwargs)
+ butler = Butler.from_config(config, **kwargs)
DatastoreMock.apply(butler)
return butler
@@ -549,13 +549,13 @@ def testRegistryDefaults(self):
# Initialize a new butler with `imported_g` as its default run.
# This should not have a default instrument, because there are two.
# Pass run instead of collections; this should set both.
- butler2 = Butler(butler=butler, run="imported_g")
+ butler2 = Butler.from_config(butler=butler, run="imported_g")
self.assertEqual(list(butler2.registry.defaults.collections), ["imported_g"])
self.assertEqual(butler2.registry.defaults.run, "imported_g")
self.assertFalse(butler2.registry.defaults.dataId)
# Initialize a new butler with an instrument default explicitly given.
# Set collections instead of run, which should then be None.
- butler3 = Butler(butler=butler, collections=["imported_g"], instrument="Cam2")
+ butler3 = Butler.from_config(butler=butler, collections=["imported_g"], instrument="Cam2")
self.assertEqual(list(butler3.registry.defaults.collections), ["imported_g"])
self.assertIsNone(butler3.registry.defaults.run, None)
self.assertEqual(butler3.registry.defaults.dataId.byName(), {"instrument": "Cam2"})
diff --git a/tests/test_testRepo.py b/tests/test_testRepo.py
index faf9518291..71f40e7e6f 100644
--- a/tests/test_testRepo.py
+++ b/tests/test_testRepo.py
@@ -211,7 +211,7 @@ def testRegisterMetricsExampleChained(self):
]
repo = lsst.daf.butler.Butler.makeRepo(temp, config=config)
- butler = lsst.daf.butler.Butler(repo, run="chainedExample")
+ butler = lsst.daf.butler.Butler.from_config(repo, run="chainedExample")
registerMetricsExample(butler)
addDatasetType(butler, "DummyType", {}, "StructuredDataNoComponents")