diff --git a/doc/changes/DM-40381.bugfix.md b/doc/changes/DM-40381.bugfix.md new file mode 100644 index 0000000000..e11417f01f --- /dev/null +++ b/doc/changes/DM-40381.bugfix.md @@ -0,0 +1 @@ +Ensure Datastore record exports (as used in quantum-backed butler) are deduplicated when necessary. diff --git a/python/lsst/daf/butler/core/datastore.py b/python/lsst/daf/butler/core/datastore.py index 2123e672ec..abb05423b7 100644 --- a/python/lsst/daf/butler/core/datastore.py +++ b/python/lsst/daf/butler/core/datastore.py @@ -1166,6 +1166,9 @@ def import_records( Implementations are responsible for calling `DatastoreRegistryBridge.insert` on all datasets in ``data.locations`` where the key is in `names`, as well as loading any opaque table data. + + Implementations may assume that datasets are either fully present or + not at all (single-component exports are not permitted). """ raise NotImplementedError() @@ -1181,7 +1184,8 @@ def export_records( ---------- refs : `~collections.abc.Iterable` [ `DatasetIdRef` ] Datasets to save. This may include datasets not known to this - datastore, which should be ignored. + datastore, which should be ignored. May not include component + datasets. Returns ------- diff --git a/python/lsst/daf/butler/core/datastoreRecordData.py b/python/lsst/daf/butler/core/datastoreRecordData.py index 93ae3667b2..a9b6421dfc 100644 --- a/python/lsst/daf/butler/core/datastoreRecordData.py +++ b/python/lsst/daf/butler/core/datastoreRecordData.py @@ -114,16 +114,25 @@ def update(self, other: DatastoreRecordData) -> None: Parameters ---------- other : `DatastoreRecordData` - Records tho merge into this instance. + Records to merge into this instance. Notes ----- - Merged instances can not have identical records. + If a ``(dataset_id, table_name)`` combination has any records in + ``self``, it is assumed that all records for that combination are + already present. This allows duplicates of the same dataset to be + handled gracefully. """ for dataset_id, table_records in other.records.items(): this_table_records = self.records.setdefault(dataset_id, {}) for table_name, records in table_records.items(): - this_table_records.setdefault(table_name, []).extend(records) + # If this (dataset_id, table_name) combination already has + # records in `self`, we assume that means all of the records + # for that combination; we require other code to ensure entire + # (parent) datasets are exported to these data structures + # (never components). + if not (this_records := this_table_records.setdefault(table_name, [])): + this_records.extend(records) def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None: """Extract a subset of the records that match given dataset IDs.