From 1d5ca5aa3f28c8f6cf5c6bea7715d0e06bf9dc16 Mon Sep 17 00:00:00 2001
From: "David H. Irving"
Date: Fri, 13 Sep 2024 15:43:12 -0700
Subject: [PATCH] Fix issue with "collection" dataset field

Fix an issue similar to the one in the previous commit, where
query_datasets would fail on Postgres 16 with the error
'psycopg2.errors.DatatypeMismatch: could not determine polymorphic type
because input has type unknown'. This occurred when a single collection
was specified for the dataset search, causing a literal "collection"
dataset field value to be included in an any_value aggregate function.
---
 .../datasets/byDimensions/_storage.py        |  8 +++++++-
 .../lsst/daf/butler/tests/butler_queries.py  | 19 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
index 9c49b9ee3f..fe38ed42e9 100644
--- a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
+++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
@@ -674,7 +674,13 @@ def _finish_query_builder(
             only_collection_record = collections[0]
             sql_projection.joiner.where(collection_col == only_collection_record.key)
             if "collection" in fields:
-                fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name)
+                fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name).cast(
+                    # This cast is necessary to ensure that Postgres knows the
+                    # type of this column if it is used in an aggregate
+                    # function.
+                    sqlalchemy.String
+                )
+
         elif not collections:
             sql_projection.joiner.where(sqlalchemy.literal(False))
             if "collection" in fields:
diff --git a/python/lsst/daf/butler/tests/butler_queries.py b/python/lsst/daf/butler/tests/butler_queries.py
index 8bb6acba25..36335094b9 100644
--- a/python/lsst/daf/butler/tests/butler_queries.py
+++ b/python/lsst/daf/butler/tests/butler_queries.py
@@ -1832,7 +1832,8 @@ def test_dataset_queries(self) -> None:
 
         # Tests for a regression of DM-46340, where invalid SQL would be
         # generated when the list of collections is a single run collection and
-        # there is region-postprocessing logic involved.
+        # there is region-postprocessing logic involved. This was due to
+        # missing type information associated with the "run" dataset field.
         result = butler.query_datasets(
             "dt",
             "run",
@@ -1841,6 +1842,22 @@
         )
         self.assertEqual(result[0].dataId, {"instrument": "Cam1", "visit": 1, "detector": 1})
 
+        # A similar issue to the "run" issue above was occurring with the
+        # 'collection' dataset field.
+        with butler.query() as query:
+            rows = list(
+                query.join_dataset_search("dt", "run")
+                .where("instrument='Cam1' and skymap='SkyMap1' and visit=1 and tract=0")
+                .general(
+                    dimensions=["visit", "detector"],
+                    dataset_fields={"dt": set(["collection"])},
+                    find_first=True,
+                )
+            )
+        self.assertEqual(len(rows), 1)
+        self.assertEqual(rows[0]["visit"], 1)
+        self.assertEqual(rows[0]["dt.collection"], "run")
+
 
 def _get_exposure_ids_from_dimension_records(dimension_records: Iterable[DimensionRecord]) -> list[int]:
     output = []
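
For illustration only, not part of the patch: a minimal, self-contained SQLAlchemy sketch of the casting
pattern the fix relies on. The collection name and the bare any_value() call below are assumed stand-ins,
not taken from the daf_butler query machinery; compiling against the Postgres dialect just shows how the
cast attaches an explicit type to the literal.

    import sqlalchemy
    from sqlalchemy.dialects import postgresql

    collection_name = "HSC/runs/RC2"  # hypothetical collection name, for illustration

    # Without a cast the literal reaches the server with no SQL-level type
    # annotation, which Postgres 16 may be unable to resolve inside a
    # polymorphic aggregate such as any_value().
    without_cast = sqlalchemy.func.any_value(sqlalchemy.literal(collection_name))

    # With the cast (the pattern applied in _storage.py above) the rendered SQL
    # carries an explicit VARCHAR cast, so the aggregate's argument type is known.
    with_cast = sqlalchemy.func.any_value(
        sqlalchemy.literal(collection_name).cast(sqlalchemy.String)
    )

    for expr in (without_cast, with_cast):
        print(
            sqlalchemy.select(expr).compile(
                dialect=postgresql.dialect(), compile_kwargs={"literal_binds": True}
            )
        )
    # Prints roughly:
    #   SELECT any_value('HSC/runs/RC2') AS any_value_1
    #   SELECT any_value(CAST('HSC/runs/RC2' AS VARCHAR)) AS any_value_1

This mirrors what the change in _storage.py does: wrapping the literal collection name in a cast to
sqlalchemy.String so the server can resolve the aggregate's argument type.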