Merge pull request #279 from Deltares/278-prevent-inconsistent-chunks-error-in-xumerge_partitions

avoid inconsistent chunks error in `merge_partitions()`
Huite authored Aug 14, 2024
2 parents dc5f113 + 34454b4 commit 63a5802
Showing 3 changed files with 88 additions and 16 deletions.
23 changes: 15 additions & 8 deletions docs/changelog.rst
@@ -32,15 +32,22 @@ Fixed
  all values are NaN or when all weights (overlaps) are zero, and all methods
  give the same answer irrespective of the order in which the values are
  encountered.

- :meth:`xugrid.merge_partitions` will now raise a ValueError if zero
  partitions are provided.
- :meth:`xugrid.merge_partitions` will no longer error when chunks are
  inconsistent across variables in a dataset; it now returns a merged dataset
  while keeping the chunking per variable. (Note that if chunks are
  inconsistent for a variable **across partitions**, they are still unified
  for that variable.)
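
A minimal sketch of the behavior described in this entry, mirroring the new tests in this PR. It assumes the bundled ``xu.data.disk()`` sample dataset (with ``node_z``, ``edge_z``, and ``face_z`` variables); the chunk sizes are purely illustrative:

    import numpy as np
    import xarray as xr
    import xugrid as xu

    # Assumed sample data: a UgridDataset with node_z, edge_z, and face_z.
    uds = xu.data.disk() * xr.DataArray(np.ones(3), dims=("time",))

    # Chunk each variable differently along "time", i.e. chunks are
    # inconsistent across variables.
    uds["node_z"] = uds["node_z"].chunk({"time": (3,)})
    uds["edge_z"] = uds["edge_z"].chunk({"time": (2, 1)})
    uds["face_z"] = uds["face_z"].chunk({"time": (1, 2)})

    # Partition and merge again: previously this tripped xarray's
    # "inconsistent chunks" ValueError; now it returns a merged dataset
    # and keeps the chunking per variable, while each UGRID dimension
    # becomes a single chunk.
    part1, part2 = uds.ugrid.partition(n_part=2)
    merged = xu.merge_partitions([part1, part2])
    print(merged["face_z"].chunks)  # the (1, 2) time chunks are preserved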

Added
~~~~~

- Percentiles (5, 10, 25, 50, 75, 90, 95) have been added to the
  :class:`xugrid.OverlapRegridder` as standard available reduction methods
  (available as ``"p5", "p10"``, etc.). Custom percentile values (e.g. 2.5, 42)
  can be set up using :meth:`xugrid.OverlapRegridder.create_percentile_method`.
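
The percentile methods above can be sketched briefly as well; this is an illustration rather than documented usage: it reuses the assumed ``xu.data.disk()`` sample data, regrids onto its own grid just to have a valid target, and assumes ``create_percentile_method`` takes a percentile value and returns an object that can be passed as ``method``:

    import xugrid as xu

    uds = xu.data.disk()     # assumed sample dataset with a "face_z" variable
    source = uds["face_z"]
    target = uds.ugrid.grid  # same grid as target, purely for illustration

    # Standard percentile reductions are available by name.
    median_regridder = xu.OverlapRegridder(source, target, method="p50")
    face_z_median = median_regridder.regrid(source)

    # Custom percentile value (assumed signature).
    p42 = xu.OverlapRegridder.create_percentile_method(42)
    p42_regridder = xu.OverlapRegridder(source, target, method=p42)
    face_z_p42 = p42_regridder.regrid(source)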

Changed
~~~~~~~

@@ -77,7 +84,7 @@ Fixed
Added
~~~~~

- Included ``edge_node_connectivity`` in :meth:`xugrid.Ugrid2d.from_meshkernel`,
  so the ordering of edges is consistent with ``meshkernel``.
- Added :meth:`xugrid.Ugrid1d.create_data_array`,
  :meth:`xugrid.Ugrid2d.create_data_array`, and
@@ -86,7 +93,7 @@ Added
- Added :func:`xugrid.create_snap_to_grid_dataframe` to provide
  more versatile snapping, e.g. with custom reductions to assign
  aggregated properties to grid edges.

Changed
~~~~~~~

@@ -104,10 +111,10 @@ Fixed
  :class:`xugrid.CentroidLocatorRegridder`, :class:`xugrid.OverlapRegridder`,
  :class:`xugrid.RelativeOverlapRegridder`, which gave the method the tendency
  to repeat the first value in the source grid across the target grid.

Added
~~~~~

- :func:`xugrid.earcut_triangulate_polygons` and
  :meth:`xugrid.Ugrid2d.earcut_triangulate_polygons` have been added to break
  down polygon geodataframes into a triangular mesh for further processing.
@@ -245,7 +252,7 @@ Fixed
  original grid being returned, rather than a new grid with the faces or edges
  shuffled. This breaks the link between the topology and data when using
  ``.isel`` on a UgridDataset or UgridDataArray. This has been fixed: both data
  and the topology are now shuffled accordingly.

[0.6.5] 2023-09-30
------------------
@@ -270,7 +277,7 @@ Changed
- Xugrid now contains a partial copy of the xarray plot utils module, and its
  tests. The latest xarray release broke xugrid (on import), since (private)
  parts of xarray were used which no longer existed.

Fixed
~~~~~

46 changes: 46 additions & 0 deletions tests/test_partitioning.py
@@ -39,6 +39,27 @@ def test_labels_to_indices():
    assert np.array_equal(indices[2], [3, 4])


def test_single_ugrid_chunk():
    grid = generate_mesh_2d(3, 3)
    ugrid_dims = set(grid.dimensions)
    da = xr.DataArray(np.ones(grid.n_face), dims=(grid.face_dimension,))
    assert pt.single_ugrid_chunk(da, ugrid_dims) is da

    da = da.chunk({grid.face_dimension: (3, 3, 3)})
    single = pt.single_ugrid_chunk(da, ugrid_dims)
    assert single.chunks == ((9,),)

    # Don't touch other dims
    da_time = (
        xr.DataArray(data=np.ones(3), dims=("time",)).chunk({"time": (1, 1, 1)}) * da
    )
    single = pt.single_ugrid_chunk(da_time, ugrid_dims)
    assert single.chunks == (
        (1, 1, 1),
        (9,),
    )


class TestGridPartitioning:
    @pytest.fixture(autouse=True)
    def setup(self):
@@ -178,6 +199,31 @@ def test_merge_partitions_no_duplicates(self):
        merged = pt.merge_partitions([part1, part2])
        assert np.bincount(merged["face_z"] == 1).all()

    def test_merge_inconsistent_chunks_across_partitions(self):
        part1, part2 = self.uds.ugrid.partition(n_part=2)
        time = xr.DataArray(data=np.ones(3), dims=("time",))
        part1 = (part1 * time).chunk({"time": (1, 1, 1)})
        part2 = (part2 * time).chunk({"time": (1, 2)})
        merged = pt.merge_partitions([part1, part2])
        assert isinstance(merged, xu.UgridDataset)
        assert merged.chunks["time"] == (1, 1, 1)

    def test_merge_inconsistent_chunks_across_variables(self):
        uds = self.uds * xr.DataArray(data=np.ones(3), dims=("time",))
        # Make them inconsistent across the variables
        uds["node_z"] = uds["node_z"].chunk({"time": (3,)})
        uds["edge_z"] = uds["edge_z"].chunk({"time": (2, 1)})
        uds["face_z"] = uds["face_z"].chunk({"time": (1, 2)})
        part1, part2 = uds.ugrid.partition(n_part=2)
        merged = pt.merge_partitions([part1, part2])
        # Test that it runs without encountering the xarray "inconsistent
        # chunks" ValueError.
        assert isinstance(merged, xu.UgridDataset)
        # Make sure they remain inconsistent after merging.
        assert merged["node_z"].chunks == ((self.grid.n_node,), (3,))
        assert merged["edge_z"].chunks == ((self.grid.n_edge,), (2, 1))
        assert merged["face_z"].chunks == ((self.grid.n_face,), (1, 2))


class TestMultiTopology2DMergePartitions:
    @pytest.fixture(autouse=True)
35 changes: 27 additions & 8 deletions xugrid/ugrid/partitioning.py
@@ -306,6 +306,26 @@ def merge_data_along_dim(
    return xr.concat(to_merge, dim=merge_dim)


def single_ugrid_chunk(da: xr.DataArray, ugrid_dims: set[str]) -> xr.DataArray:
    """
    Ensure that along a UGRID dimension only a single chunk is defined.
    Preserving chunks on the merged result may generate a very sub-optimal,
    complex dask graph.
    """
    if not da.chunks:
        return da

    chunks = {}
    for dim, sizes in zip(da.dims, da.chunks):
        # Define a single chunk for each UGRID dimension.
        if dim in ugrid_dims:
            chunks[dim] = (da.sizes[dim],)
        else:
            chunks[dim] = sizes
    return da.chunk(chunks)


def merge_partitions(partitions, merge_ugrid_chunks: bool = True):
"""
Merge topology and data, partitioned along UGRID dimensions, into a single
@@ -382,13 +402,12 @@ def merge_partitions(partitions, merge_ugrid_chunks: bool = True):
        )
        merged.update(merged_selection)

-    # Merge chunks along the UGRID dimensions.
-    if merged.chunks and merge_ugrid_chunks:
-        chunks = dict(merged.chunks)
-        for dim in chunks:
-            # Define a single chunk for each UGRID dimension.
-            if dim in ugrid_dims:
-                chunks[dim] = (merged.sizes[dim],)
-        merged = merged.chunk(chunks)
+    # Merge chunks along the UGRID dimensions
+    if merge_ugrid_chunks:
+        # Running `merged[varname] = da` will cause xarray to auto-align. This
+        # is quite expensive, especially when there are many variables. Setting
+        # it via `._variables` sidesteps the alignment.
+        for varname, da in merged._variables.items():
+            merged._variables[varname] = single_ugrid_chunk(da, ugrid_dims)

    return UgridDataset(merged, merged_grids)
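
For reference, a small self-contained sketch, using plain xarray plus dask and an illustrative UGRID dimension name, of what the chunk unification in ``single_ugrid_chunk`` amounts to: UGRID dimensions are collapsed to a single chunk, all other dimensions keep their chunk sizes:

    import numpy as np
    import xarray as xr

    ugrid_dims = {"mesh2d_nFaces"}  # illustrative UGRID dimension name
    da = xr.DataArray(
        np.ones((9, 3)),
        dims=("mesh2d_nFaces", "time"),
    ).chunk({"mesh2d_nFaces": (3, 3, 3), "time": (1, 1, 1)})

    # Collapse UGRID dimensions to one chunk, keep the rest as-is.
    chunks = {
        dim: (da.sizes[dim],) if dim in ugrid_dims else sizes
        for dim, sizes in zip(da.dims, da.chunks)
    }
    print(da.chunk(chunks).chunks)  # ((9,), (1, 1, 1))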
