Merge pull request #879 from lsst/tickets/DM-40381
DM-40381: Deduplicate when merging DatastoreRecordData and document preconditions.
TallJimbo committed Aug 18, 2023
2 parents (41038f7 + 363042d), commit 87d628b
Showing 3 changed files with 18 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-40381.bugfix.md
@@ -0,0 +1 @@
+Ensure Datastore record exports (as used in quantum-backed butler) are deduplicated when necessary.
6 changes: 5 additions & 1 deletion python/lsst/daf/butler/core/datastore.py
@@ -1166,6 +1166,9 @@ def import_records(
         Implementations are responsible for calling
         `DatastoreRegistryBridge.insert` on all datasets in ``data.locations``
         where the key is in `names`, as well as loading any opaque table data.
+
+        Implementations may assume that datasets are either fully present or
+        not at all (single-component exports are not permitted).
         """
         raise NotImplementedError()

@@ -1181,7 +1184,8 @@ def export_records(
         ----------
         refs : `~collections.abc.Iterable` [ `DatasetIdRef` ]
             Datasets to save. This may include datasets not known to this
-            datastore, which should be ignored.
+            datastore, which should be ignored. May not include component
+            datasets.
 
         Returns
         -------
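Both docstring additions encode the same caller-side precondition: only whole (parent) datasets may flow through export_records and import_records. A caller that might hold component refs would therefore need to resolve them to their composites first. The sketch below illustrates one way a caller could do that, assuming the DatasetRef.isComponent() and DatasetRef.makeCompositeRef() methods from daf_butler; refs_for_export is a hypothetical helper, not part of the API.

    from collections.abc import Iterable, Iterator

    from lsst.daf.butler import DatasetRef

    def refs_for_export(refs: Iterable[DatasetRef]) -> Iterator[DatasetRef]:
        # Hypothetical helper: replace each component ref with its parent
        # (composite) ref and deduplicate by dataset ID, so that only whole
        # datasets ever reach Datastore.export_records().
        seen = set()
        for ref in refs:
            parent = ref.makeCompositeRef() if ref.isComponent() else ref
            if parent.id not in seen:
                seen.add(parent.id)
                yield parent

    # Usage: record_data = datastore.export_records(refs_for_export(refs))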
15 changes: 12 additions & 3 deletions python/lsst/daf/butler/core/datastoreRecordData.py
@@ -114,16 +114,25 @@ def update(self, other: DatastoreRecordData) -> None:
         Parameters
         ----------
         other : `DatastoreRecordData`
-            Records tho merge into this instance.
+            Records to merge into this instance.
 
         Notes
         -----
-        Merged instances can not have identical records.
+        If a ``(dataset_id, table_name)`` combination has any records in
+        ``self``, it is assumed that all records for that combination are
+        already present.  This allows duplicates of the same dataset to be
+        handled gracefully.
         """
         for dataset_id, table_records in other.records.items():
             this_table_records = self.records.setdefault(dataset_id, {})
             for table_name, records in table_records.items():
-                this_table_records.setdefault(table_name, []).extend(records)
+                # If this (dataset_id, table_name) combination already has
+                # records in `self`, we assume that means all of the records
+                # for that combination; we require other code to ensure entire
+                # (parent) datasets are exported to these data structures
+                # (never components).
+                if not (this_records := this_table_records.setdefault(table_name, [])):
+                    this_records.extend(records)
 
     def subset(self, dataset_ids: set[DatasetId]) -> DatastoreRecordData | None:
         """Extract a subset of the records that match given dataset IDs.
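The behavioral change in update() is easiest to see in isolation. Below is a minimal, self-contained sketch of the new merge rule, using plain dicts as stand-ins for the real record structures; the table name and record contents are illustrative only.

    # Toy model of DatastoreRecordData.records:
    #   {dataset_id: {table_name: [record, ...]}}

    def merge(dst: dict, src: dict) -> None:
        # New rule: if a (dataset_id, table_name) combination already has
        # records in dst, assume they are complete and skip src's copy.
        for dataset_id, table_records in src.items():
            dst_tables = dst.setdefault(dataset_id, {})
            for table_name, records in table_records.items():
                if not (dst_records := dst_tables.setdefault(table_name, [])):
                    dst_records.extend(records)

    a = {"d1": {"opaque_table": [{"path": "a.fits"}]}}
    b = {
        "d1": {"opaque_table": [{"path": "a.fits"}]},  # duplicate export of d1
        "d2": {"opaque_table": [{"path": "b.fits"}]},
    }
    merge(a, b)
    # The old unconditional extend() would have left two identical records
    # for d1; the new rule keeps exactly one and still picks up d2:
    assert a["d1"]["opaque_table"] == [{"path": "a.fits"}]
    assert a["d2"]["opaque_table"] == [{"path": "b.fits"}]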
