Add DirectButler.find_dataset

timj committed Oct 27, 2023
1 parent d2e3489 commit 90aae4a

Showing 7 changed files with 136 additions and 14 deletions.
11 changes: 10 additions & 1 deletion python/lsst/daf/butler/__init__.py
@@ -79,7 +79,16 @@

# Do not import or lift symbols from 'server' or 'server_models'.
# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, MissingDatasetTypeError, Registry, RegistryConfig
from .registry import (
CollectionArgType,
CollectionSearch,
CollectionType,
MissingCollectionError,
MissingDatasetTypeError,
NoDefaultCollectionError,
Registry,
RegistryConfig,
)
from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend
from .version import *

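With these symbols lifted to the package root, callers can catch the lookup errors the new find_dataset method documents without importing from the registry subpackage. A minimal sketch, assuming a hypothetical repository path and the "imported_g" collection used in the tests below:

from lsst.daf.butler import Butler, MissingCollectionError, NoDefaultCollectionError

butler = Butler("/path/to/repo")  # hypothetical repository root
try:
    ref = butler.find_dataset("flat", collections="imported_g", detector=2)
except NoDefaultCollectionError:
    ...  # no collections were given and the Butler has no defaults configured
except MissingCollectionError:
    ...  # one of the requested collections does not exist in the registry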
80 changes: 79 additions & 1 deletion python/lsst/daf/butler/_butler.py
@@ -48,9 +48,10 @@
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
from .registry import Registry, RegistryConfig, _RegistryFactory
from .registry import CollectionArgType, Registry, RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .transfers import RepoExportContext

@@ -798,6 +799,83 @@ def get_dataset_type(self, name: str) -> DatasetType:
"""
raise NotImplementedError()

@abstractmethod
def find_dataset(
self,
datasetType: DatasetType | str,
dataId: DataId | None = None,
*,
collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
"""Find a dataset given its `DatasetType` and data ID.
This can be used to obtain a `DatasetRef` that permits the dataset to
be read from a `Datastore`. If the dataset is a component and can not
be found using the provided dataset type, a dataset ref for the parent
will be returned instead but with the correct dataset type.
Parameters
----------
datasetType : `DatasetType` or `str`
A `DatasetType` or the name of one. If this is a `DatasetType`
instance, its storage class will be respected and propagated to
the output, even if it differs from the dataset type definition
in the registry, as long as the storage classes are convertible.
dataId : `dict` or `DataCoordinate`, optional
A `dict`-like object containing the `Dimension` links that identify
the dataset within a collection.
collections : collection expression, optional
An expression that fully or partially identifies the collections to
search for the dataset; see
:ref:`daf_butler_collection_expressions` for more information.
Defaults to ``self.defaults.collections``.
timespan : `Timespan`, optional
A timespan that the validity range of the dataset must overlap.
If not provided, any `~CollectionType.CALIBRATION` collections
matched by the ``collections`` argument will not be searched.
**kwargs
Additional keyword arguments passed to
`DataCoordinate.standardize` to convert ``dataId`` to a true
`DataCoordinate` or augment an existing one.
Returns
-------
ref : `DatasetRef`
A reference to the dataset, or `None` if no matching Dataset
was found.
Raises
------
lsst.daf.butler.NoDefaultCollectionError
Raised if ``collections`` is `None` and
``self.collections`` is `None`.
LookupError
Raised if one or more data ID keys are missing.
lsst.daf.butler.registry.MissingDatasetTypeError
Raised if the dataset type does not exist.
lsst.daf.butler.MissingCollectionError
Raised if any of ``collections`` does not exist in the registry.
Notes
-----
This method simply returns `None` and does not raise an exception even
when the set of collections searched is intrinsically incompatible with
the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
only `~CollectionType.CALIBRATION` collections are being searched.
This may make it harder to debug some lookup failures, but the behavior
is intentional; we consider it more important that failed searches are
reported consistently, regardless of the reason, and that adding
additional collections that do not contain a match to the search path
never changes the behavior.
This method handles component dataset types automatically, though most
other registry operations do not.
"""
raise NotImplementedError()

@abstractmethod
def retrieveArtifacts(
self,
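The abstract signature above admits several call styles. A usage sketch, using the "flat" dataset type and "imported_g" collection from the tests in this commit; the calibration lookup at the end assumes a hypothetical "calib" collection and "bias" dataset type:

from astropy.time import Time
from lsst.daf.butler import Butler, Timespan

butler = Butler("/path/to/repo")  # hypothetical repository root

# Full data ID as a dict, collections given explicitly.
ref = butler.find_dataset(
    "flat",
    {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"},
    collections="imported_g",
)
if ref is None:
    print("no match in the searched collections")

# Keyword arguments are forwarded to DataCoordinate.standardize, so the
# data ID can also be supplied piecewise.
ref = butler.find_dataset(
    "flat", instrument="Cam1", detector=2, physical_filter="Cam1-G",
    collections="imported_g",
)

# A timespan constrains lookups in CALIBRATION collections.
ts = Timespan(Time("2023-10-01", scale="tai"), Time("2023-10-02", scale="tai"))
ref = butler.find_dataset("bias", detector=2, collections="calib", timespan=ts)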
22 changes: 21 additions & 1 deletion python/lsst/daf/butler/direct_butler.py
@@ -76,6 +76,7 @@
)
from .progress import Progress
from .registry import (
CollectionArgType,
CollectionType,
ConflictingDefinitionError,
DataIdError,
@@ -846,7 +847,7 @@ def _findDatasetRef(
)
# Always lookup the DatasetRef, even if one is given, to ensure it is
# present in the current collection.
ref = self._registry.findDataset(
ref = self.find_dataset(
datasetType,
dataId,
collections=collections,
@@ -1321,6 +1322,25 @@ def getURI(
def get_dataset_type(self, name: str) -> DatasetType:
return self._registry.getDatasetType(name)

def find_dataset(
self,
datasetType: DatasetType | str,
dataId: DataId | None = None,
*,
collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
return self._registry.findDataset(
datasetType,
dataId,
collections=collections,
timespan=timespan,
datastore_records=datastore_records,
**kwargs,
)

def retrieveArtifacts(
self,
refs: Iterable[DatasetRef],
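Because DirectButler.find_dataset is a thin wrapper over Registry.findDataset, existing registry-level lookups migrate mechanically, which is exactly the change applied to the tests below (data_id stands in for any dict-like data ID):

# Before this commit: reach through the registry.
ref = butler.registry.findDataset("flat", data_id, collections="imported_g")

# After: the equivalent Butler-level call.
ref = butler.find_dataset("flat", data_id, collections="imported_g")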
8 changes: 5 additions & 3 deletions python/lsst/daf/butler/registry/_registry.py
@@ -27,15 +27,15 @@

from __future__ import annotations

__all__ = ("Registry",)
__all__ = ("Registry", "CollectionArgType")

import contextlib
import logging
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Sequence
from types import EllipsisType
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, TypeAlias

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
@@ -64,7 +64,9 @@
_LOG = logging.getLogger(__name__)

# Type alias for `collections` arguments.
CollectionArgType = str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
CollectionArgType: TypeAlias = (
str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
)


class Registry(ABC):
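Exporting CollectionArgType lets client code annotate collection expressions without restating the union. A sketch with a hypothetical helper, showing the argument forms the alias covers:

import re

from lsst.daf.butler.registry import CollectionArgType

def summarize(collections: CollectionArgType) -> str:
    # Hypothetical helper; every call below is valid under the alias.
    return repr(collections)

summarize("imported_g")                    # a single collection name
summarize(re.compile(r"imported_.*"))      # a regular-expression pattern
summarize(["imported_g", "imported_r"])    # an iterable of names/patterns
summarize(...)                             # Ellipsis, meaning all collections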
15 changes: 14 additions & 1 deletion python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -46,9 +46,10 @@
from .._file_dataset import FileDataset
from .._limited_butler import LimitedButler
from .._storage_class import StorageClass
from .._timespan import Timespan
from ..datastore import DatasetRefURIs
from ..dimensions import DataId, DimensionConfig, DimensionUniverse
from ..registry import Registry, RegistryDefaults
from ..registry import CollectionArgType, Registry, RegistryDefaults
from ..transfers import RepoExportContext
from ._config import RemoteButlerConfigModel

@@ -187,6 +188,18 @@ def get_dataset_type(self, name: str) -> DatasetType:
response.raise_for_status()
return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions)

def find_dataset(
self,
datasetType: DatasetType | str,
dataId: DataId | None = None,
*,
collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
raise NotImplementedError()

def retrieveArtifacts(
self,
refs: Iterable[DatasetRef],
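RemoteButler only stubs the method for now, so code that might receive either Butler flavor can guard on the NotImplementedError the stub raises; a defensive sketch, not project policy:

try:
    ref = butler.find_dataset("flat", detector=2, collections="imported_g")
except NotImplementedError:
    ref = None  # server-backed butlers do not implement find_dataset yet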
8 changes: 4 additions & 4 deletions tests/test_butler.py
@@ -442,7 +442,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Dir
)
self.assertEqual(count, stop)

compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections)
assert compRef is not None
summary = butler.get(compRef)
self.assertEqual(summary, metric.summary)
@@ -928,7 +928,7 @@ def testIngest(self) -> None:
datasets[0].refs = [
cast(
DatasetRef,
butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
butler.find_dataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
)
for ref in datasets[0].refs
]
@@ -938,7 +938,7 @@ def testIngest(self) -> None:
for ref in dataset.refs:
# Create a dict from the dataId to drop the records.
new_data_id = {str(k): v for k, v in ref.dataId.items()}
new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run)
new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run)
assert new_ref is not None
self.assertFalse(new_ref.dataId.hasRecords())
refs.append(new_ref)
@@ -1115,7 +1115,7 @@ def testTransaction(self) -> None:
with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
butler.get(datasetTypeName, dataId)
# Also check explicitly if Dataset entry is missing
self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections))
# Direct retrieval should not find the file in the Datastore
with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
butler.get(ref)
6 changes: 3 additions & 3 deletions tests/test_simpleButler.py
@@ -277,7 +277,7 @@ def testButlerGet(self):

# Find the DatasetRef for a flat
coll = "imported_g"
flat2g = butler.registry.findDataset(
flat2g = butler.find_dataset(
"flat", instrument="Cam1", detector=2, physical_filter="Cam1-G", collections=coll
)

@@ -512,7 +512,7 @@ def testRegistryDefaults(self):
# input collections.
butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
# Use find_dataset without collections or instrument.
ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G")
ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
# Do the same with Butler.get; this should ultimately invoke a lot of
# the same code, so it's a bit circular, but mostly we're checking that
# it works at all.
@@ -583,7 +583,7 @@ def testJson(self):
# input collections.
butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
# Use find_dataset without collections or instrument.
ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G")
ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")

# Transform the ref and dataset type to and from JSON
# and check that it can be reconstructed properly