From 1d5ca5aa3f28c8f6cf5c6bea7715d0e06bf9dc16 Mon Sep 17 00:00:00 2001
From: "David H. Irving"
Date: Fri, 13 Sep 2024 15:43:12 -0700
Subject: [PATCH] Fix issue with "collection" dataset field

Fix an issue similar to the one in the previous commit, where
query_datasets would fail on Postgres 16 with the error
'psycopg2.errors.DatatypeMismatch: could not determine polymorphic type
because input has type unknown'. This occurred when a single collection
was specified for the dataset search, causing a literal "collection"
dataset field value to be included in an any_value aggregate function.
---
 .../datasets/byDimensions/_storage.py        |  8 +++++++-
 .../lsst/daf/butler/tests/butler_queries.py  | 19 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
index 9c49b9ee3f..fe38ed42e9 100644
--- a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
+++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
@@ -674,7 +674,13 @@ def _finish_query_builder(
             only_collection_record = collections[0]
             sql_projection.joiner.where(collection_col == only_collection_record.key)
             if "collection" in fields:
-                fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name)
+                fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name).cast(
+                    # This cast is necessary to ensure that Postgres knows the
+                    # type of this column if it is used in an aggregate
+                    # function.
+                    sqlalchemy.String
+                )
+
         elif not collections:
             sql_projection.joiner.where(sqlalchemy.literal(False))
             if "collection" in fields:
diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py
index 8bb6acba25..36335094b9 100644
--- a/python/lsst/daf/butler/tests/butler_queries.py
+++ b/python/lsst/daf/butler/tests/butler_queries.py
@@ -1832,7 +1832,8 @@ def test_dataset_queries(self) -> None:
 
         # Tests for a regression of DM-46340, where invalid SQL would be
         # generated when the list of collections is a single run collection and
-        # there is region-postprocessing logic involved.
+        # there is region-postprocessing logic involved. This was due to
+        # missing type information associated with the "run" dataset field.
         result = butler.query_datasets(
             "dt",
             "run",
@@ -1841,6 +1842,22 @@
         )
         self.assertEqual(result[0].dataId, {"instrument": "Cam1", "visit": 1, "detector": 1})
 
+        # A similar issue to the "run" issue above was occurring with the
+        # 'collection' dataset field.
+        with butler.query() as query:
+            rows = list(
+                query.join_dataset_search("dt", "run")
+                .where("instrument='Cam1' and skymap='SkyMap1' and visit=1 and tract=0")
+                .general(
+                    dimensions=["visit", "detector"],
+                    dataset_fields={"dt": set(["collection"])},
+                    find_first=True,
+                )
+            )
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["visit"], 1)
+        self.assertEqual(rows[0]["dt.collection"], "run")
+
 
 def _get_exposure_ids_from_dimension_records(dimension_records: Iterable[DimensionRecord]) -> list[int]:
     output = []
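
For illustration only, not part of the patch: a minimal, self-contained SQLAlchemy sketch of the casting
pattern the fix relies on. The collection name and the bare any_value() call below are assumed stand-ins,
not taken from the daf_butler query machinery; compiling against the Postgres dialect just shows how the
cast attaches an explicit type to the literal.

    import sqlalchemy
    from sqlalchemy.dialects import postgresql

    collection_name = "HSC/runs/RC2"  # hypothetical collection name, for illustration

    # Without a cast the literal reaches the server with no SQL-level type
    # annotation, which Postgres 16 may be unable to resolve inside a
    # polymorphic aggregate such as any_value().
    without_cast = sqlalchemy.func.any_value(sqlalchemy.literal(collection_name))

    # With the cast (the pattern applied in _storage.py above) the rendered SQL
    # carries an explicit VARCHAR cast, so the aggregate's argument type is known.
    with_cast = sqlalchemy.func.any_value(
        sqlalchemy.literal(collection_name).cast(sqlalchemy.String)
    )

    for expr in (without_cast, with_cast):
        print(
            sqlalchemy.select(expr).compile(
                dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}
            )
        )
    # Prints roughly:
    #   SELECT any_value('HSC/runs/RC2') AS any_value_1
    #   SELECT any_value(CAST('HSC/runs/RC2' AS VARCHAR)) AS any_value_1

This mirrors what the change in _storage.py does: wrapping the literal collection name in a cast to
sqlalchemy.String so the server can resolve the aggregate's argument type.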