Skip to content

Commit

Permalink
Merge pull request #871 from lsst/tickets/DM-40184
Browse files Browse the repository at this point in the history
DM-40184: fix rare bug in QG generation with warps as an overall-input
  • Loading branch information
TallJimbo authored Jul 31, 2023
2 parents a997e9e + 8879207 commit 4927b47
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 1 deletion.
3 changes: 3 additions & 0 deletions doc/changes/DM-40184.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix a rare bug in follow-up dataset queries involving relation commutators.

This occurred when building QuantumGraphs where a "warp" dataset type was an overall input to the pipeline and present in more than one input RUN collection.
23 changes: 22 additions & 1 deletion python/lsst/daf/butler/registry/queries/find_first_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from collections.abc import Sequence, Set
from typing import final

from lsst.daf.relation import ColumnTag, Relation, RowFilter
from lsst.daf.relation import ColumnTag, Relation, RowFilter, UnaryCommutator, UnaryOperationRelation
from lsst.utils.classes import cached_getter

from ...core import DatasetColumnTag, DimensionKeyColumnTag
Expand Down Expand Up @@ -73,3 +73,24 @@ def __str__(self) -> str:
def applied_min_rows(self, target: Relation) -> int:
    # Docstring inherited.
    # A find-first search yields at most one row per group, so the floor
    # is one row when the target is guaranteed non-empty, else zero.
    if target.min_rows:
        return 1
    return 0

def commute(self, current: UnaryOperationRelation) -> UnaryCommutator:
    # Docstring inherited.
    missing_columns = set(self.columns_required - current.target.columns)
    if missing_columns:
        # The inner operation's target lacks columns this find-first
        # needs, so we cannot move inside it; report why and leave the
        # current operation where it is.
        return UnaryCommutator(
            first=None,
            second=current.operation,
            done=False,
            messages=(f"{current.target} is missing columns {missing_columns}",),
        )
    if current.operation.is_count_dependent:
        # Reordering past a count-dependent operation could change the
        # number of rows it sees, so refuse to commute.
        return UnaryCommutator(
            first=None,
            second=current.operation,
            done=False,
            messages=(f"{current.operation} is count-dependent",),
        )
    # Safe to swap: apply this find-first before the current operation.
    return UnaryCommutator(first=self, second=current.operation)
46 changes: 46 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -3428,6 +3428,52 @@ def pop_transfer(tree: Relation) -> Relation:
sql.Engine,
)

def test_query_find_datasets_drop_postprocessing(self) -> None:
    """Test that ``DataCoordinateQueryResults.findDatasets`` does not trip
    over commutator limitations of the FindFirstDataset relation
    operation.
    """
    # Setup: load visit/tract/patch records, register a dataset type with
    # dimensions {visit, patch}, and insert one dataset with the same data
    # ID into each of two RUN collections.
    registry = self.makeRegistry()
    self.loadData(registry, "base.yaml")
    self.loadData(registry, "spatial.yaml")
    warp_storage_class = StorageClass("Warpy")
    registry.storageClasses.registerStorageClass(warp_storage_class)
    warp_dataset_type = DatasetType(
        "warp", {"visit", "patch"}, storageClass=warp_storage_class, universe=registry.dimensions
    )
    registry.registerDatasetType(warp_dataset_type)
    (data_id,) = registry.queryDataIds(["visit", "patch"]).limit(1)
    registry.registerRun("run1")
    registry.registerRun("run2")
    (ref1,) = registry.insertDatasets(warp_dataset_type, [data_id], run="run1")
    (ref2,) = registry.insertDatasets(warp_dataset_type, [data_id], run="run2")
    # Search just one of the two collections via
    # queryDataIds(...).findDatasets(...).  This should work even though
    # the relation returned by queryDataIds ends with iteration-engine
    # region-filtering: with only a single collection to search we can
    # tell up front that the (default) findFirst=True is irrelevant, and
    # joining in the dataset query commutes past the iteration-engine
    # postprocessing.
    single_collection_query = registry.queryDataIds(
        {"visit", "patch"}, visit=data_id["visit"], instrument=data_id["instrument"]
    )
    self.assertEqual(
        set(single_collection_query.findDatasets(warp_dataset_type.name, collections=["run1"])),
        {ref1},
    )
    # Now search both collections.  This can only work if the
    # FindFirstDataset operation can be commuted past the iteration-engine
    # operations into SQL.
    both_collections_query = registry.queryDataIds(
        {"visit", "patch"}, visit=data_id["visit"], instrument=data_id["instrument"]
    )
    self.assertEqual(
        set(both_collections_query.findDatasets(warp_dataset_type.name, collections=["run2", "run1"])),
        {ref2},
    )

def test_query_empty_collections(self) -> None:
"""Test for registry query methods with empty collections. The methods
should return empty result set (or None when applicable) and provide
Expand Down

0 comments on commit 4927b47

Please sign in to comment.