From dc9b74c30ef7ab2e90ebcb2e49e0f3b18817bf12 Mon Sep 17 00:00:00 2001
From: Tim Jenness <tjenness@lsst.org>
Date: Thu, 5 Sep 2024 11:59:08 -0700
Subject: [PATCH] Fix handling of find_first=True with collection wildcards

---
 python/lsst/daf/butler/_butler.py             | 19 +++++++++++++++----
 .../lsst/daf/butler/script/queryDatasets.py   |  5 ++++-
 .../lsst/daf/butler/tests/butler_queries.py   |  5 ++++-
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index 166e2280ee..e109dbe64b 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -1588,14 +1588,16 @@ def query_datasets(
             Dataset type object or name to search for.
         collections : collection expression, optional
             A collection name or iterable of collection names to search. If not
-            provided, the default collections are used. Can be a wildcard. See
-            :ref:`daf_butler_collection_expressions` for more information.
+            provided, the default collections are used. Can be a wildcard if
+            ``find_first`` is `False` (if find first is requested the order
+            of collections matters and wildcards make the order indeterminate).
+            See :ref:`daf_butler_collection_expressions` for more information.
         find_first : `bool`, optional
             If `True` (default), for each result data ID, only yield one
             `DatasetRef` of each `DatasetType`, from the first collection in
             which a dataset of that dataset type appears (according to the
             order of ``collections`` passed in).  If `True`, ``collections``
-            must not contain regular expressions and may not be ``...``.
+            must not contain wildcards.
         data_id : `dict` or `DataCoordinate`, optional
             A data ID whose key-value pairs are used as equality constraints in
             the query.
@@ -1667,7 +1669,16 @@ def query_datasets(
         if order_by is None:
             order_by = []
         if collections:
-            collections = self.collections.query(collections)
+            # Wildcards need to be expanded but can only be allowed if
+            # find_first=False because expanding wildcards does not return
+            # a guaranteed ordering.
+            expanded_collections = self.collections.query(collections)
+            if find_first and set(expanded_collections) != set(ensure_iterable(collections)):
+                raise RuntimeError(
+                    "Can not use wildcards in collections when find_first=True "
+                    f" (given {collections} which expanded to {expanded_collections})"
+                )
+            collections = expanded_collections
         with self.query() as query:
             result = (
                 query.where(data_id, where, bind=bind, **kwargs)
diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py
index 5d092cda69..7343c56ca1 100644
--- a/python/lsst/daf/butler/script/queryDatasets.py
+++ b/python/lsst/daf/butler/script/queryDatasets.py
@@ -242,7 +242,10 @@ def getDatasets(self) -> Iterator[DatasetRef]:
 
         # Expand the collections query and include summary information.
         query_collections_info = self.butler.collections.query_info(query_collections, include_summary=True)
-        query_collections = [c.name for c in query_collections_info]
+        expanded_query_collections = [c.name for c in query_collections_info]
+        if self._find_first and set(query_collections) != set(expanded_query_collections):
+            raise RuntimeError("Can not use wildcards in collections when find_first=True")
+        query_collections = expanded_query_collections
 
         # Only iterate over dataset types that are relevant for the query.
         dataset_types = set(
diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py
index f893e64827..f6b11f2470 100644
--- a/python/lsst/daf/butler/tests/butler_queries.py
+++ b/python/lsst/daf/butler/tests/butler_queries.py
@@ -257,8 +257,11 @@ def test_simple_dataset_query(self) -> None:
         self.assertEqual(refs_q[0].id, UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22"))
         self.assertEqual(refs_q[1].id, UUID("51352db4-a47a-447c-b12d-a50b206b17cd"))
 
+        with self.assertRaises(RuntimeError) as cm:
+            butler.query_datasets("bias", "*", detector=100, instrument="Unknown", find_first=True)
+        self.assertIn("Can not use wildcards", str(cm.exception))
         with self.assertRaises(EmptyQueryResultError) as cm:
-            butler.query_datasets("bias", "*", detector=100, instrument="Unknown")
+            butler.query_datasets("bias", "*", detector=100, instrument="Unknown", find_first=False)
         self.assertIn("doomed", str(cm.exception))
 
     def test_general_query(self) -> None: