DM-40120: Add without_datastore flag to Butler #867

Merged (8 commits, Jul 21, 2023)

Changes from all commits
2 changes: 2 additions & 0 deletions doc/changes/DM-40120.api.rst
@@ -0,0 +1,2 @@
Added new parameter ``without_datastore`` to the ``Butler`` and ``ButlerConfig`` constructors to allow a butler to be created that cannot access a datastore.
This can be helpful if you want to query the registry without the overhead of creating a datastore.
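As a quick illustration of the intent behind this change, a minimal usage sketch (the repository path is a placeholder and assumes an existing butler repo):

```python
from lsst.daf.butler import Butler

# Registry-only butler: no datastore is constructed, so registry queries
# work but any datastore access (get, put, getURI, ...) will fail.
butler = Butler("/path/to/repo", without_datastore=True)

for name in butler.registry.queryCollections():
    print(name)
```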
16 changes: 12 additions & 4 deletions python/lsst/daf/butler/_butler.py
@@ -69,6 +69,7 @@
DimensionRecord,
DimensionUniverse,
FileDataset,
NullDatastore,
Progress,
StorageClass,
StorageClassFactory,
@@ -149,6 +150,9 @@ class Butler(LimitedButler):
the default for that dimension. Nonexistent collections are ignored.
If a default value is provided explicitly for a governor dimension via
``**kwargs``, no default will be inferred for that dimension.
without_datastore : `bool`, optional
If `True`, do not attach a datastore to this butler. Any attempt
to use a datastore will fail.
**kwargs : `str`
Default data ID key-value pairs. These may only identify "governor"
dimensions like ``instrument`` and ``skymap``.
@@ -203,6 +207,7 @@ def __init__(
searchPaths: Sequence[ResourcePathExpression] | None = None,
writeable: bool | None = None,
inferDefaults: bool = True,
without_datastore: bool = False,
**kwargs: str,
):
defaults = RegistryDefaults(collections=collections, run=run, infer=inferDefaults, **kwargs)
@@ -217,7 +222,7 @@ def __init__(
self.storageClasses = butler.storageClasses
self._config: ButlerConfig = butler._config
else:
self._config = ButlerConfig(config, searchPaths=searchPaths)
self._config = ButlerConfig(config, searchPaths=searchPaths, without_datastore=without_datastore)
try:
if "root" in self._config:
butlerRoot = self._config["root"]
@@ -228,9 +233,12 @@
self._registry = _RegistryFactory(self._config).from_config(
butlerRoot=butlerRoot, writeable=writeable, defaults=defaults
)
self._datastore = Datastore.fromConfig(
self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
)
if without_datastore:
self._datastore = NullDatastore(None, None)
else:
self._datastore = Datastore.fromConfig(
self._config, self._registry.getDatastoreBridgeManager(), butlerRoot=butlerRoot
)
self.storageClasses = StorageClassFactory()
self.storageClasses.addFromConfig(self._config)
except Exception:
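The hunk above is a null-object substitution: when ``without_datastore`` is set, an API-compatible stub is attached instead of a real datastore, so downstream code never has to check for ``None``. A self-contained sketch of the same pattern, with hypothetical names (not the daf_butler classes):

```python
from abc import ABC, abstractmethod
from typing import Any


class Store(ABC):
    """Stand-in for a datastore-like interface (hypothetical)."""

    @abstractmethod
    def get(self, key: str) -> Any:
        ...


class DictStore(Store):
    """A trivial working store, for contrast (hypothetical)."""

    def __init__(self) -> None:
        self._data: dict[str, Any] = {"example": 42}

    def get(self, key: str) -> Any:
        return self._data[key]


class NullStore(Store):
    """Implements the interface but refuses every request."""

    def get(self, key: str) -> Any:
        raise FileNotFoundError("no-op store: cannot access real storage")


def make_store(without_store: bool) -> Store:
    # Mirrors the constructor logic above: attach a null object rather than
    # None, so callers never need an existence check.
    return NullStore() if without_store else DictStore()
```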
11 changes: 10 additions & 1 deletion python/lsst/daf/butler/_butlerConfig.py
@@ -61,12 +61,15 @@ class ButlerConfig(Config):
than those read from the environment in
`ConfigSubset.defaultSearchPaths()`. They are only read if ``other``
refers to a configuration file or directory.
without_datastore : `bool`, optional
If `True`, remove the datastore configuration.
"""

def __init__(
self,
other: ResourcePathExpression | Config | None = None,
searchPaths: Sequence[ResourcePathExpression] | None = None,
without_datastore: bool = False,
):
self.configDir: ResourcePath | None = None

@@ -155,6 +158,13 @@ def __init__(
# configuration classes. We ask each of them to apply defaults to
# the values we have been supplied by the user.
for configClass in CONFIG_COMPONENT_CLASSES:
assert configClass.component is not None, "Config class component cannot be None"

if without_datastore and configClass is DatastoreConfig:
if configClass.component in butlerConfig:
del butlerConfig[configClass.component]
continue

# Only send the parent config if the child
# config component is present (otherwise it assumes that the
# keys from other components are part of the child)
@@ -163,7 +173,6 @@ def __init__(
localOverrides = butlerConfig
config = configClass(localOverrides, searchPaths=searchPaths)
# Re-attach it using the global namespace
assert configClass.component is not None, "Config class component cannot be None"
self.update({configClass.component: config})
# Remove the key from the butlerConfig since we have already
# merged that information.
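A hedged sketch of the resulting configuration behaviour (the repository path is a placeholder and assumes a butler repo already exists there):

```python
from lsst.daf.butler import ButlerConfig

config = ButlerConfig("/path/to/repo", without_datastore=True)

# The datastore component was deleted before the per-component defaults
# were merged, so the assembled configuration should not contain it.
assert "datastore" not in config
```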
124 changes: 123 additions & 1 deletion python/lsst/daf/butler/core/datastore.py
@@ -23,11 +23,12 @@

from __future__ import annotations

__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs")
__all__ = ("DatastoreConfig", "Datastore", "DatastoreValidationError", "DatasetRefURIs", "NullDatastore")

import contextlib
import dataclasses
import logging
import time
from abc import ABCMeta, abstractmethod
from collections import abc, defaultdict
from collections.abc import Callable, Iterable, Iterator, Mapping
@@ -50,6 +51,8 @@
from .datastoreRecordData import DatastoreRecordData
from .storageClass import StorageClass

_LOG = logging.getLogger(__name__)


class DatastoreConfig(ConfigSubset):
"""Configuration for Datastores."""
@@ -1205,3 +1208,122 @@
guess dataset location based on its stored dataset type.
"""
pass


class NullDatastore(Datastore):
"""A datastore that implements the `Datastore` API but always fails when
it accepts any request.
"""

@classmethod
def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None:
# Nothing to do. This is not a real Datastore.
pass

[Codecov warning: added line python/lsst/daf/butler/core/datastore.py#L1221 was not covered by tests]

def __init__(
self,
config: Config | ResourcePathExpression | None,
bridgeManager: DatastoreRegistryBridgeManager | None,
butlerRoot: ResourcePathExpression | None = None,
):
# Name ourselves with the timestamp the datastore
# was created.
self.name = f"{type(self).__name__}@{time.time()}"
[Review comment, Member]
Why do we need the timestamp? Two NullDatastore instances should be identical (it could even be a singleton, though I don't think it needs to be), and I'd hope that means we don't need to care about conflicts.

[Reply, Member Author]
Good point, although since this datastore should never result in a dataset being stored anywhere, there's no requirement for it to be a fixed name. The time is really there to let you know when you created that butler, and it will appear in the stringified output of Butler. Whether anyone cares is a question.

_LOG.debug("Creating datastore %s", self.name)

return

def knows(self, ref: DatasetRef) -> bool:
return False

def exists(self, datasetRef: DatasetRef) -> bool:
return False

def get(
self,
datasetRef: DatasetRef,
parameters: Mapping[str, Any] | None = None,
storageClass: StorageClass | str | None = None,
) -> Any:
raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")

def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def ingest(
self, *datasets: FileDataset, transfer: str | None = None, record_validation_info: bool = True
) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def transfer_from(
self,
source_datastore: Datastore,
refs: Iterable[DatasetRef],
transfer: str = "auto",
artifact_existence: dict[ResourcePath, bool] | None = None,
) -> tuple[set[DatasetRef], set[DatasetRef]]:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefURIs:
raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")

def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath:
raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")

def retrieveArtifacts(
self,
refs: Iterable[DatasetRef],
destination: ResourcePath,
transfer: str = "auto",
preserve_path: bool = True,
overwrite: bool = False,
) -> list[ResourcePath]:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def remove(self, datasetRef: DatasetRef) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def forget(self, refs: Iterable[DatasetRef]) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def trash(self, ref: DatasetRef | Iterable[DatasetRef], ignore_errors: bool = True) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def emptyTrash(self, ignore_errors: bool = True) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def transfer(self, inputDatastore: Datastore, datasetRef: DatasetRef) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def export(
self,
refs: Iterable[DatasetRef],
*,
directory: ResourcePathExpression | None = None,
transfer: str | None = "auto",
) -> Iterable[FileDataset]:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def validateConfiguration(
self, entities: Iterable[DatasetRef | DatasetType | StorageClass], logFailures: bool = False
) -> None:
# No configuration so always validates.
pass

def validateKey(self, lookupKey: LookupKey, entity: DatasetRef | DatasetType | StorageClass) -> None:
pass

[Codecov warning: added line python/lsst/daf/butler/core/datastore.py#L1314 was not covered by tests]

def getLookupKeys(self) -> set[LookupKey]:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def import_records(
self,
data: Mapping[str, DatastoreRecordData],
) -> None:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

def export_records(
self,
refs: Iterable[DatasetIdRef],
) -> Mapping[str, DatastoreRecordData]:
raise NotImplementedError("This is a no-op datastore that can not access a real datastore")
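A small check of the new class in isolation, matching the methods above (no repository is needed to instantiate it):

```python
from lsst.daf.butler import NullDatastore

store = NullDatastore(None, None)
print(store.name)  # NullDatastore@<creation timestamp>

# Mutating and lookup calls raise NotImplementedError; read-style calls
# such as get() and getURI() raise FileNotFoundError instead.
try:
    store.getLookupKeys()
except NotImplementedError as err:
    print(f"expected: {err}")
```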
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/certifyCalibrations.py
@@ -63,7 +63,7 @@
Search all children of the inputCollection if it is a CHAINED
collection, instead of just the most recent one.
"""
butler = Butler(repo, writeable=True)
butler = Butler(repo, writeable=True, without_datastore=True)

[Codecov warning: added line python/lsst/daf/butler/script/certifyCalibrations.py#L66 was not covered by tests]
registry = butler.registry
timespan = Timespan(
begin=astropy.time.Time(begin_date, scale="tai") if begin_date is not None else None,
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/collectionChain.py
@@ -65,7 +65,7 @@ def collectionChain(
chain : `tuple` of `str`
The collections in the chain following this command.
"""
butler = Butler(repo, writeable=True)
butler = Butler(repo, writeable=True, without_datastore=True)

# Every mode needs children except pop.
if not children and mode != "pop":
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/queryCollections.py
@@ -134,7 +134,7 @@ def _getTree(
names=("Name", "Type"),
dtype=(str, str),
)
butler = Butler(repo)
butler = Butler(repo, without_datastore=True)

def addCollection(name: str, level: int = 0) -> None:
collectionType = butler.registry.getCollectionType(name)
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/queryDataIds.py
@@ -103,7 +103,7 @@ def queryDataIds(
Docstring for supported parameters is the same as
`~lsst.daf.butler.Registry.queryDataIds`.
"""
butler = Butler(repo)
butler = Butler(repo, without_datastore=True)

if datasets and collections and not dimensions:
# Determine the dimensions relevant to all given dataset types.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/queryDatasetTypes.py
@@ -55,7 +55,7 @@ def queryDatasetTypes(repo: str, verbose: bool, glob: Iterable[str], components:
A dict whose key is "datasetTypes" and whose value is a list of
collection names.
"""
butler = Butler(repo)
butler = Butler(repo, without_datastore=True)
expression = glob if glob else ...
datasetTypes = butler.registry.queryDatasetTypes(components=components, expression=expression)
if verbose:
4 changes: 3 additions & 1 deletion python/lsst/daf/butler/script/queryDatasets.py
@@ -167,7 +167,9 @@ def __init__(
):
if (repo and butler) or (not repo and not butler):
raise RuntimeError("One of repo and butler must be provided and the other must be None.")
self.butler = butler or Butler(repo)
# show_uri requires a datastore.
without_datastore = not show_uri
self.butler = butler or Butler(repo, without_datastore=without_datastore)
self._getDatasets(glob, collections, where, find_first)
self.showUri = show_uri

2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/queryDimensionRecords.py
@@ -48,7 +48,7 @@ def queryDimensionRecords(
`~lsst.daf.butler.Registry.queryDimensionRecords` except for ``no_check``,
which is the inverse of ``check``.
"""
butler = Butler(repo)
butler = Butler(repo, without_datastore=True)

query_collections: Iterable[str] | EllipsisType | None = None
if datasets:
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/register_dataset_type.py
@@ -63,7 +63,7 @@ def register_dataset_type(
be created by this command. They are always derived from the composite
dataset type.
"""
butler = Butler(repo, writeable=True)
butler = Butler(repo, writeable=True, without_datastore=True)

composite, component = DatasetType.splitDatasetTypeName(dataset_type)
if component:
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/script/removeDatasetType.py
@@ -37,5 +37,5 @@ def removeDatasetType(repo: str, dataset_type_name: tuple[str, ...]) -> None:
datasetTypeName : `str`
The name of the dataset type to be removed.
"""
butler = Butler(repo, writeable=True)
butler = Butler(repo, writeable=True, without_datastore=True)
butler.registry.removeDatasetType(dataset_type_name)
51 changes: 51 additions & 0 deletions tests/test_butler.py
@@ -76,6 +76,7 @@ def mock_s3(*args: Any, **kwargs: Any) -> Any: # type: ignore[no-untyped-def]
FileDataset,
FileTemplate,
FileTemplateValidationError,
NullDatastore,
StorageClassFactory,
ValidationError,
script,
@@ -2332,6 +2333,56 @@ class ChainedDatastoreTransfers(PosixDatastoreTransfers):
configFile = os.path.join(TESTDIR, "config/basic/butler-chained.yaml")


class NullDatastoreTestCase(unittest.TestCase):
"""Test that we can fall back to a null datastore."""

# Need a good config to create the repo.
configFile = os.path.join(TESTDIR, "config/basic/butler.yaml")

@classmethod
def setUpClass(cls) -> None:
cls.storageClassFactory = StorageClassFactory()
cls.storageClassFactory.addFromConfig(cls.configFile)

def setUp(self) -> None:
"""Create a new butler root for each test."""
self.root = makeTestTempDir(TESTDIR)
Butler.makeRepo(self.root, config=Config(self.configFile))

def tearDown(self) -> None:
removeTestTempDir(self.root)

def test_fallback(self) -> None:
# Read the butler config and mess with the datastore section.
bad_config = Config(os.path.join(self.root, "butler.yaml"))
bad_config["datastore", "cls"] = "lsst.not.a.datastore.Datastore"

with self.assertRaises(RuntimeError):
Butler(bad_config)

butler = Butler(bad_config, writeable=True, without_datastore=True)
self.assertIsInstance(butler._datastore, NullDatastore)

# Check that registry is working.
butler.registry.registerRun("MYRUN")
collections = butler.registry.queryCollections(...)
self.assertIn("MYRUN", set(collections))

# Create a ref.
dimensions = butler.dimensions.extract([])
storageClass = self.storageClassFactory.getStorageClass("StructuredDataDict")
datasetTypeName = "metric"
datasetType = DatasetType(datasetTypeName, dimensions, storageClass)
butler.registry.registerDatasetType(datasetType)
ref = DatasetRef(datasetType, {}, run="MYRUN")

# Check that datastore will complain.
with self.assertRaises(FileNotFoundError):
butler.get(ref)
with self.assertRaises(FileNotFoundError):
butler.getURI(ref)


def setup_module(module: types.ModuleType) -> None:
"""Set up the module for pytest."""
clean_environment()