Add DirectButler.find_dataset

timj committed Oct 27, 2023
1 parent d2e3489 commit 90aae4a

Showing 7 changed files with 136 additions and 14 deletions.
11 changes: 10 additions & 1 deletion python/lsst/daf/butler/__init__.py
@@ -79,7 +79,16 @@

# Do not import or lift symbols from 'server' or 'server_models'.
# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, MissingDatasetTypeError, Registry, RegistryConfig
from .registry import (
CollectionArgType,
CollectionSearch,
CollectionType,
MissingCollectionError,
MissingDatasetTypeError,
NoDefaultCollectionError,
Registry,
RegistryConfig,
)
from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend
from .version import *

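With these symbols lifted to the package root, callers can catch the lookup errors the new find_dataset method documents without importing from the registry subpackage. A minimal sketch, assuming a hypothetical repository path and the "imported_g" collection used in the tests below:

from lsst.daf.butler import Butler, MissingCollectionError, NoDefaultCollectionError

butler = Butler("/path/to/repo")  # hypothetical repository root
try:
    ref = butler.find_dataset("flat", collections="imported_g", detector=2)
except NoDefaultCollectionError:
    ...  # no collections were given and the Butler has no defaults configured
except MissingCollectionError:
    ...  # one of the requested collections does not exist in the registry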
80 changes: 79 additions & 1 deletion python/lsst/daf/butler/_butler.py
@@ -48,9 +48,10 @@
from ._file_dataset import FileDataset
from ._limited_butler import LimitedButler
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
from .registry import Registry, RegistryConfig, _RegistryFactory
from .registry import CollectionArgType, Registry, RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .transfers import RepoExportContext

@@ -798,6 +799,83 @@ def get_dataset_type(self, name: str) -> DatasetType:
"""
raise NotImplementedError()

@abstractmethod
def find_dataset(
self,
datasetType: DatasetType | str,
dataId: DataId | None = None,
*,
collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
"""Find a dataset given its `DatasetType` and data ID.
This can be used to obtain a `DatasetRef` that permits the dataset to
be read from a `Datastore`. If the dataset is a component and can not
be found using the provided dataset type, a dataset ref for the parent
will be returned instead but with the correct dataset type.
Parameters
----------
datasetType : `DatasetType` or `str`
A `DatasetType` or the name of one. If this is a `DatasetType`
instance, its storage class will be respected and propagated to
the output, even if it differs from the dataset type definition
in the registry, as long as the storage classes are convertible.
dataId : `dict` or `DataCoordinate`, optional
A `dict`-like object containing the `Dimension` links that identify
the dataset within a collection.
collections : collection expression, optional
An expression that fully or partially identifies the collections to
search for the dataset; see
:ref:`daf_butler_collection_expressions` for more information.
Defaults to ``self.defaults.collections``.
timespan : `Timespan`, optional
A timespan that the validity range of the dataset must overlap.
If not provided, any `~CollectionType.CALIBRATION` collections
matched by the ``collections`` argument will not be searched.
**kwargs
Additional keyword arguments passed to
`DataCoordinate.standardize` to convert ``dataId`` to a true
`DataCoordinate` or augment an existing one.
Returns
-------
ref : `DatasetRef`
A reference to the dataset, or `None` if no matching Dataset
was found.
Raises
------
lsst.daf.butler.NoDefaultCollectionError
Raised if ``collections`` is `None` and
``self.collections`` is `None`.
LookupError
Raised if one or more data ID keys are missing.
lsst.daf.butler.registry.MissingDatasetTypeError
Raised if the dataset type does not exist.
lsst.daf.butler.MissingCollectionError
Raised if any of ``collections`` does not exist in the registry.
Notes
-----
This method simply returns `None` and does not raise an exception even
when the set of collections searched is intrinsically incompatible with
the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
only `~CollectionType.CALIBRATION` collections are being searched.
This may make it harder to debug some lookup failures, but the behavior
is intentional; we consider it more important that failed searches are
reported consistently, regardless of the reason, and that adding
additional collections that do not contain a match to the search path
never changes the behavior.
This method handles component dataset types automatically, though most
other registry operations do not.
"""
raise NotImplementedError()

@abstractmethod
def retrieveArtifacts(
self,
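The abstract signature above admits several call styles. A usage sketch, using the "flat" dataset type and "imported_g" collection from the tests in this commit; the calibration lookup at the end assumes a hypothetical "calib" collection and "bias" dataset type:

from astropy.time import Time
from lsst.daf.butler import Butler, Timespan

butler = Butler("/path/to/repo")  # hypothetical repository root

# Full data ID as a dict, collections given explicitly.
ref = butler.find_dataset(
    "flat",
    {"instrument": "Cam1", "detector": 2, "physical_filter": "Cam1-G"},
    collections="imported_g",
)
if ref is None:
    print("no match in the searched collections")

# Keyword arguments are forwarded to DataCoordinate.standardize, so the
# data ID can also be supplied piecewise.
ref = butler.find_dataset(
    "flat", instrument="Cam1", detector=2, physical_filter="Cam1-G",
    collections="imported_g",
)

# A timespan constrains lookups in CALIBRATION collections.
ts = Timespan(Time("2023-10-01", scale="tai"), Time("2023-10-02", scale="tai"))
ref = butler.find_dataset("bias", detector=2, collections="calib", timespan=ts)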
22 changes: 21 additions & 1 deletion python/lsst/daf/butler/direct_butler.py
@@ -76,6 +76,7 @@
)
from .progress import Progress
from .registry import (
CollectionArgType,
CollectionType,
ConflictingDefinitionError,
DataIdError,
@@ -846,7 +847,7 @@ def _findDatasetRef(
)
# Always lookup the DatasetRef, even if one is given, to ensure it is
# present in the current collection.
ref = self._registry.findDataset(
ref = self.find_dataset(
datasetType,
dataId,
collections=collections,
@@ -1321,6 +1322,25 @@ def getURI(
def get_dataset_type(self, name: str) -> DatasetType:
return self._registry.getDatasetType(name)

def find_dataset(
self,
datasetType: DatasetType | str,
dataId: DataId | None = None,
*,
collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
return self._registry.findDataset(
datasetType,
dataId,
collections=collections,
timespan=timespan,
datastore_records=datastore_records,
**kwargs,
)

def retrieveArtifacts(
self,
refs: Iterable[DatasetRef],
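Because DirectButler.find_dataset is a thin wrapper over Registry.findDataset, existing registry-level lookups migrate mechanically, which is exactly the change applied to the tests below (data_id stands in for any dict-like data ID):

# Before this commit: reach through the registry.
ref = butler.registry.findDataset("flat", data_id, collections="imported_g")

# After: the equivalent Butler-level call.
ref = butler.find_dataset("flat", data_id, collections="imported_g")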
8 changes: 5 additions & 3 deletions python/lsst/daf/butler/registry/_registry.py
@@ -27,15 +27,15 @@

from __future__ import annotations

__all__ = ("Registry",)
__all__ = ("Registry", "CollectionArgType")

import contextlib
import logging
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable, Iterator, Mapping, Sequence
from types import EllipsisType
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, TypeAlias

from .._dataset_association import DatasetAssociation
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
@@ -64,7 +64,9 @@
_LOG = logging.getLogger(__name__)

# Type alias for `collections` arguments.
CollectionArgType = str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
CollectionArgType: TypeAlias = (
str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
)


class Registry(ABC):
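Exporting CollectionArgType lets client code annotate collection expressions without restating the union. A sketch with a hypothetical helper, showing the argument forms the alias covers:

import re

from lsst.daf.butler.registry import CollectionArgType

def summarize(collections: CollectionArgType) -> str:
    # Hypothetical helper; every call below is valid under the alias.
    return repr(collections)

summarize("imported_g")                    # a single collection name
summarize(re.compile(r"imported_.*"))      # a regular-expression pattern
summarize(["imported_g", "imported_r"])    # an iterable of names/patterns
summarize(...)                             # Ellipsis, meaning all collections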
15 changes: 14 additions & 1 deletion python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -46,9 +46,10 @@
from .._file_dataset import FileDataset
from .._limited_butler import LimitedButler
from .._storage_class import StorageClass
from .._timespan import Timespan
from ..datastore import DatasetRefURIs
from ..dimensions import DataId, DimensionConfig, DimensionUniverse
from ..registry import Registry, RegistryDefaults
from ..registry import CollectionArgType, Registry, RegistryDefaults
from ..transfers import RepoExportContext
from ._config import RemoteButlerConfigModel

@@ -187,6 +188,18 @@ def get_dataset_type(self, name: str) -> DatasetType:
response.raise_for_status()
return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions)

def find_dataset(
self,
datasetType: DatasetType | str,
dataId: DataId | None = None,
*,
collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
raise NotImplementedError()

def retrieveArtifacts(
self,
refs: Iterable[DatasetRef],
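RemoteButler only stubs the method for now, so code that might receive either Butler flavor can guard on the NotImplementedError the stub raises; a defensive sketch, not project policy:

try:
    ref = butler.find_dataset("flat", detector=2, collections="imported_g")
except NotImplementedError:
    ref = None  # server-backed butlers do not implement find_dataset yet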
8 changes: 4 additions & 4 deletions tests/test_butler.py
@@ -442,7 +442,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Dir
)
self.assertEqual(count, stop)

compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections)
compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections)
assert compRef is not None
summary = butler.get(compRef)
self.assertEqual(summary, metric.summary)
@@ -928,7 +928,7 @@ def testIngest(self) -> None:
datasets[0].refs = [
cast(
DatasetRef,
butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
butler.find_dataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
)
for ref in datasets[0].refs
]
@@ -938,7 +938,7 @@ def testIngest(self) -> None:
for ref in dataset.refs:
# Create a dict from the dataId to drop the records.
new_data_id = {str(k): v for k, v in ref.dataId.items()}
new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run)
new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run)
assert new_ref is not None
self.assertFalse(new_ref.dataId.hasRecords())
refs.append(new_ref)
@@ -1115,7 +1115,7 @@ def testTransaction(self) -> None:
with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"):
butler.get(datasetTypeName, dataId)
# Also check explicitly if Dataset entry is missing
self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections))
self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections))
# Direct retrieval should not find the file in the Datastore
with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"):
butler.get(ref)
6 changes: 3 additions & 3 deletions tests/test_simpleButler.py
@@ -277,7 +277,7 @@ def testButlerGet(self):

# Find the DatasetRef for a flat
coll = "imported_g"
flat2g = butler.registry.findDataset(
flat2g = butler.find_dataset(
"flat", instrument="Cam1", detector=2, physical_filter="Cam1-G", collections=coll
)

@@ -512,7 +512,7 @@ def testRegistryDefaults(self):
# input collections.
butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
# Use find_dataset without collections or instrument.
ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G")
ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
# Do the same with Butler.get; this should ultimately invoke a lot of
# the same code, so it's a bit circular, but mostly we're checking that
# it works at all.
@@ -583,7 +583,7 @@ def testJson(self):
# input collections.
butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
# Use find_dataset without collections or instrument.
ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G")
ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")

# Transform the ref and dataset type to and from JSON
# and check that it can be reconstructed properly