Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support by(max_n) and by(min_n) #1229

Merged
merged 1 commit into from
Jun 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion datashader/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ def make_append(bases, cols, calls, glyph, categorical, antialias):
# Categorical aggregate arrays need to be unpacked
if categorical:
col_index = '' if isinstance(cols[0], category_codes) else '[0]'
cat_var = 'cat = int({0}[{1}]{2})'.format(signature[-1], subscript, col_index)
signature_index = -2 if any_uses_cuda_mutex else -1
cat_var = 'cat = int({0}[{1}]{2})'.format(signature[signature_index], subscript, col_index)
aggs = ['{0} = {0}[:, :, cat]'.format(s) for s in signature[:len(calls)]]
body = [cat_var] + aggs + body

Expand Down
29 changes: 22 additions & 7 deletions datashader/reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,9 @@ def out_dshape(self, input_dshape, antialias, cuda, partitioned):
def inputs(self):
    # This reduction's sole input is its column-preprocessing step.
    return (self.preprocess, )

def uses_cuda_mutex(self):
    # Delegate to the wrapped reduction: this wrapper needs a CUDA mutex
    # exactly when its underlying reduction does (e.g. max_n/min_n).
    return self.reduction.uses_cuda_mutex()

def _antialias_requires_2_stages(self):
    # Delegate to the wrapped reduction's antialiasing requirements.
    return self.reduction._antialias_requires_2_stages()

Expand Down Expand Up @@ -684,7 +687,7 @@ def _build_finalize(self, dshape):
def finalize(bases, cuda=False, **kwargs):
kwargs['dims'] += [self.cat_column]
kwargs['coords'][self.cat_column] = cats
return self.reduction._finalize(bases, cuda=cuda, **kwargs)
return self.reduction._build_finalize(dshape)(bases, cuda=cuda, **kwargs)

return finalize

Expand Down Expand Up @@ -1474,16 +1477,22 @@ def _build_combine(self, dshape, antialias, cuda, partitioned):
@staticmethod
def _combine(aggs):
    """Combine per-partition max-n aggregates in place and return the first.

    The scraped diff retained both the pre-change call
    (``nanmax_n_in_place(ret, ...)``) and the post-change call
    (``nanmax_n_in_place(aggs[0], ...)``); only the latter is kept, since
    ``nanmax_n_in_place`` operates on 4D (ny, nx, ncat, n) arrays.
    """
    ret = aggs[0]
    if ret.ndim == 3:  # ndim is either 3 (ny, nx, n) or 4 (ny, nx, ncat, n)
        # 4d view of each agg: insert a length-1 category axis. The views
        # share memory with the originals, so in-place updates to aggs[0]
        # are visible through ret, which keeps the caller's 3D shape.
        aggs = [np.expand_dims(agg, 2) for agg in aggs]
    for i in range(1, len(aggs)):
        nanmax_n_in_place(aggs[0], aggs[i])
    return ret

@staticmethod
def _combine_cuda(aggs):
    """CUDA combine of per-partition max-n aggregates; returns the first.

    The scraped diff retained both the pre-change 2D kernel launch and the
    post-change 3D launch; only the latter is kept, since
    ``cuda_nanmax_n_in_place`` expects 4D (ny, nx, ncat, n) arrays and a
    3D grid.
    """
    ret = aggs[0]
    if ret.ndim == 3:  # ndim is either 3 (ny, nx, n) or 4 (ny, nx, ncat, n)
        # 4d view of each agg (length-1 category axis); views share memory
        # with the originals so ret reflects the in-place updates.
        aggs = [cp.expand_dims(agg, 2) for agg in aggs]
    # Grid covers (ny, nx, ncat); the n axis is walked inside the kernel.
    kernel_args = cuda_args(aggs[0].shape[:3])
    for i in range(1, len(aggs)):
        cuda_nanmax_n_in_place[kernel_args](aggs[0], aggs[i])
    return ret


Expand Down Expand Up @@ -1540,16 +1549,22 @@ def _build_combine(self, dshape, antialias, cuda, partitioned):
@staticmethod
def _combine(aggs):
    """Combine per-partition min-n aggregates in place and return the first.

    The scraped diff retained both the pre-change call
    (``nanmin_n_in_place(ret, ...)``) and the post-change call
    (``nanmin_n_in_place(aggs[0], ...)``); only the latter is kept, since
    ``nanmin_n_in_place`` operates on 4D (ny, nx, ncat, n) arrays.
    """
    ret = aggs[0]
    if ret.ndim == 3:  # ndim is either 3 (ny, nx, n) or 4 (ny, nx, ncat, n)
        # 4d view of each agg: insert a length-1 category axis. The views
        # share memory with the originals, so in-place updates to aggs[0]
        # are visible through ret, which keeps the caller's 3D shape.
        aggs = [np.expand_dims(agg, 2) for agg in aggs]
    for i in range(1, len(aggs)):
        nanmin_n_in_place(aggs[0], aggs[i])
    return ret

@staticmethod
def _combine_cuda(aggs):
    """CUDA combine of per-partition min-n aggregates; returns the first.

    The scraped diff retained both the pre-change 2D kernel launch and the
    post-change 3D launch; only the latter is kept, since
    ``cuda_nanmin_n_in_place`` expects 4D (ny, nx, ncat, n) arrays and a
    3D grid.
    """
    ret = aggs[0]
    if ret.ndim == 3:  # ndim is either 3 (ny, nx, n) or 4 (ny, nx, ncat, n)
        # 4d view of each agg (length-1 category axis); views share memory
        # with the originals so ret reflects the in-place updates.
        aggs = [cp.expand_dims(agg, 2) for agg in aggs]
    # Grid covers (ny, nx, ncat); the n axis is walked inside the kernel.
    kernel_args = cuda_args(aggs[0].shape[:3])
    for i in range(1, len(aggs)):
        cuda_nanmin_n_in_place[kernel_args](aggs[0], aggs[i])
    return ret


Expand Down
62 changes: 62 additions & 0 deletions datashader/tests/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@
'plusminus': np.arange(20, dtype='f8')*([1, -1]*10),
'empty_bin': np.array([0.] * 15 + [np.nan] * 5),
'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5,
'cat2': ['a', 'b', 'c', 'd']*5,
'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)})
df_pd.cat = df_pd.cat.astype('category')
df_pd.cat2 = df_pd.cat2.astype('category')
df_pd.at[2,'f32'] = nan
df_pd.at[2,'f64'] = nan
df_pd.at[2,'plusminus'] = nan
Expand Down Expand Up @@ -327,6 +329,66 @@ def test_last_n(ddf, npartitions):
assert_eq_ndarray(agg[:, :, 0].data, c.points(ddf, 'x', 'y', ds.last('plusminus')).data)


@pytest.mark.parametrize('ddf', ddfs)
@pytest.mark.parametrize('npartitions', [1, 2, 3, 4])
def test_categorical_min(ddf, npartitions):
    # Per-category min: aggregate has a trailing category axis. All
    # partition counts are tested to exercise the dask combine path.
    ddf = ddf.repartition(npartitions)
    assert ddf.npartitions == npartitions
    # Solutions are float64 even for integer columns.
    sol_int = np.array([[[0, 1, 2, 3], [12, 13, 10, 11]], [[8, 5, 6, 7], [16, 17, 18, 15]]], dtype=np.float64)
    sol_float = np.array([[[0, 1, nan, 3], [12, 13, 10, 11]], [[8, 5, 6, 7], [16, 17, 18, 15]]])
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.min('i32'))).data, sol_int)
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.min('i64'))).data, sol_int)
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.min('f32'))).data, sol_float)
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.min('f64'))).data, sol_float)


@pytest.mark.parametrize('ddf', ddfs)
@pytest.mark.parametrize('npartitions', [1, 2, 3, 4])
def test_categorical_max(ddf, npartitions):
    # Per-category max: aggregate has a trailing category axis. All
    # partition counts are tested to exercise the dask combine path.
    ddf = ddf.repartition(npartitions)
    assert ddf.npartitions == npartitions
    # Solutions are float64 even for integer columns.
    sol_int = np.array([[[4, 1, 2, 3], [12, 13, 14, 11]], [[8, 9, 6, 7], [16, 17, 18, 19]]], dtype=np.float64)
    sol_float = np.array([[[4, 1, nan, 3], [12, 13, 14, 11]], [[8, 9, 6, 7], [16, 17, 18, 19]]])
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.max('i32'))).data, sol_int)
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.max('i64'))).data, sol_int)
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.max('f32'))).data, sol_float)
    assert_eq_ndarray(c.points(ddf, 'x', 'y', ds.by('cat2', ds.max('f64'))).data, sol_float)


@pytest.mark.parametrize('ddf', ddfs)
@pytest.mark.parametrize('npartitions', [1, 2, 3, 4])
def test_categorical_min_n(ddf, npartitions):
    # Per-category min_n: last axis holds up to 3 smallest values per
    # pixel/category, nan-padded when fewer exist.
    ddf = ddf.repartition(npartitions)
    assert ddf.npartitions == npartitions
    solution = np.array([[[[0, 4, nan], [1, nan, nan], [nan, nan, nan], [3, nan, nan]],
                          [[12, nan, nan], [13, nan, nan], [10, 14, nan], [11, nan, nan]]],
                         [[[8, nan, nan], [5, 9, nan], [6, nan, nan], [7, nan, nan]],
                          [[16, nan, nan], [17, nan, nan], [18, nan, nan], [15, 19, nan]]]])
    for n in range(1, 3):
        agg = c.points(ddf, 'x', 'y', ds.by('cat2', ds.min_n('f32', n=n)))
        out = solution[:, :, :, :n]
        assert_eq_ndarray(agg.data, out)
        if n == 1:
            # min_n with n=1 must agree with plain min.
            assert_eq_ndarray(agg[..., 0].data, c.points(ddf, 'x', 'y', ds.by('cat2', ds.min('f32'))).data)


@pytest.mark.parametrize('ddf', ddfs)
@pytest.mark.parametrize('npartitions', [1, 2, 3, 4])
def test_categorical_max_n(ddf, npartitions):
    # Per-category max_n: last axis holds up to 3 largest values per
    # pixel/category, nan-padded when fewer exist.
    ddf = ddf.repartition(npartitions)
    assert ddf.npartitions == npartitions
    solution = np.array([[[[4, 0, nan], [1, nan, nan], [nan, nan, nan], [3, nan, nan]],
                          [[12, nan, nan], [13, nan, nan], [14, 10, nan], [11, nan, nan]]],
                         [[[8, nan, nan], [9, 5, nan], [6, nan, nan], [7, nan, nan]],
                          [[16, nan, nan], [17, nan, nan], [18, nan, nan], [19, 15, nan]]]])
    for n in range(1, 3):
        agg = c.points(ddf, 'x', 'y', ds.by('cat2', ds.max_n('f32', n=n)))
        out = solution[:, :, :, :n]
        assert_eq_ndarray(agg.data, out)
        if n == 1:
            # max_n with n=1 must agree with plain max.
            assert_eq_ndarray(agg[..., 0].data, c.points(ddf, 'x', 'y', ds.by('cat2', ds.max('f32'))).data)


@pytest.mark.parametrize('ddf', ddfs)
@pytest.mark.parametrize('npartitions', [1, 2, 3, 4])
def test_where_max(ddf, npartitions):
Expand Down
58 changes: 57 additions & 1 deletion datashader/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,19 @@
'plusminus': np.arange(20, dtype='f8')*([1, -1]*10),
'empty_bin': np.array([0.] * 15 + [np.nan] * 5),
'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5,
'cat2': ['a', 'b', 'c', 'd']*5,
'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)})
df_pd.cat = df_pd.cat.astype('category')
df_pd.cat2 = df_pd.cat2.astype('category')
df_pd.at[2,'f32'] = nan
df_pd.at[2,'f64'] = nan
df_pd.at[2,'plusminus'] = nan
# x 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find myself creating this manual table whenever I need to check new tests, so here including it to make it easier to check new tests in future.

# y 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1
# i32 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# f32 0 1 nan 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# plusminus 0 -1 nan -3 4 -5 6 -7 8 -9 10 -11 12 -13 14 -15 16 -17 18 -19
# cat2 a b c d a b c d a b c d a b c d a b c d

test_gpu = bool(int(os.getenv("DATASHADER_TEST_GPU", 0)))

Expand Down Expand Up @@ -237,6 +245,54 @@ def test_max_n(df):
assert_eq_ndarray(agg[:, :, 0].data, c.points(df, 'x', 'y', ds.max('plusminus')).data)


@pytest.mark.parametrize('df', dfs)
def test_categorical_min(df):
    # Per-category min: aggregate has a trailing category axis.
    # Solutions are float64 even for integer columns.
    sol_int = np.array([[[0, 1, 2, 3], [12, 13, 10, 11]], [[8, 5, 6, 7], [16, 17, 18, 15]]], dtype=np.float64)
    sol_float = np.array([[[0, 1, nan, 3], [12, 13, 10, 11]], [[8, 5, 6, 7], [16, 17, 18, 15]]])
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.min('i32'))).data, sol_int)
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.min('i64'))).data, sol_int)
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.min('f32'))).data, sol_float)
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.min('f64'))).data, sol_float)


@pytest.mark.parametrize('df', dfs)
def test_categorical_max(df):
    # Per-category max: aggregate has a trailing category axis.
    # Solutions are float64 even for integer columns.
    sol_int = np.array([[[4, 1, 2, 3], [12, 13, 14, 11]], [[8, 9, 6, 7], [16, 17, 18, 19]]], dtype=np.float64)
    sol_float = np.array([[[4, 1, nan, 3], [12, 13, 14, 11]], [[8, 9, 6, 7], [16, 17, 18, 19]]])
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.max('i32'))).data, sol_int)
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.max('i64'))).data, sol_int)
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.max('f32'))).data, sol_float)
    assert_eq_ndarray(c.points(df, 'x', 'y', ds.by('cat2', ds.max('f64'))).data, sol_float)


@pytest.mark.parametrize('df', dfs)
def test_categorical_min_n(df):
    # Per-category min_n: last axis holds up to 3 smallest values per
    # pixel/category, nan-padded when fewer exist.
    solution = np.array([[[[0, 4, nan], [1, nan, nan], [nan, nan, nan], [3, nan, nan]],
                          [[12, nan, nan], [13, nan, nan], [10, 14, nan], [11, nan, nan]]],
                         [[[8, nan, nan], [5, 9, nan], [6, nan, nan], [7, nan, nan]],
                          [[16, nan, nan], [17, nan, nan], [18, nan, nan], [15, 19, nan]]]])
    for n in range(1, 3):
        agg = c.points(df, 'x', 'y', ds.by('cat2', ds.min_n('f32', n=n)))
        out = solution[:, :, :, :n]
        assert_eq_ndarray(agg.data, out)
        if n == 1:
            # min_n with n=1 must agree with plain min.
            assert_eq_ndarray(agg[..., 0].data, c.points(df, 'x', 'y', ds.by('cat2', ds.min('f32'))).data)


@pytest.mark.parametrize('df', dfs)
def test_categorical_max_n(df):
    # Per-category max_n: last axis holds up to 3 largest values per
    # pixel/category, nan-padded when fewer exist.
    solution = np.array([[[[4, 0, nan], [1, nan, nan], [nan, nan, nan], [3, nan, nan]],
                          [[12, nan, nan], [13, nan, nan], [14, 10, nan], [11, nan, nan]]],
                         [[[8, nan, nan], [9, 5, nan], [6, nan, nan], [7, nan, nan]],
                          [[16, nan, nan], [17, nan, nan], [18, nan, nan], [19, 15, nan]]]])
    for n in range(1, 3):
        agg = c.points(df, 'x', 'y', ds.by('cat2', ds.max_n('f32', n=n)))
        out = solution[:, :, :, :n]
        assert_eq_ndarray(agg.data, out)
        if n == 1:
            # max_n with n=1 must agree with plain max.
            assert_eq_ndarray(agg[..., 0].data, c.points(df, 'x', 'y', ds.by('cat2', ds.max('f32'))).data)


@pytest.mark.parametrize('df', dfs)
def test_where_min_row_index(df):
out = xr.DataArray([[0, 10], [-5, -15]], coords=coords, dims=dims)
Expand Down Expand Up @@ -652,7 +708,7 @@ def test_categorical_sum_binning(df):


@pytest.mark.parametrize('df', dfs)
def test_categorical_max(df):
def test_categorical_max2(df):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keeping the existing categorical max test but renaming it so it doesn't overwrite the new one which is above it in this file.

sol = np.array([[[ 4, nan, nan, nan],
[nan, nan, 14, nan]],
[[nan, 9, nan, nan],
Expand Down
20 changes: 10 additions & 10 deletions datashader/transfer_functions/_cuda_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,11 @@ def cuda_mutex_unlock(mutex, index):
def cuda_nanmax_n_in_place(ret, other):
"""CUDA equivalent of nanmax_n_in_place.
"""
ny, nx, n = ret.shape
x, y = cuda.grid(2)
if x < nx and y < ny:
ret_pixel = ret[y, x] # 1D array of n values for single pixel
other_pixel = other[y, x] # ditto
ny, nx, ncat, n = ret.shape
x, y, cat = cuda.grid(3)
if x < nx and y < ny and cat < ncat:
ret_pixel = ret[y, x, cat] # 1D array of n values for single pixel
other_pixel = other[y, x, cat] # ditto
# Walk along other_pixel array a value at a time, find insertion
# index in ret_pixel and bump values along to insert. Next
# other_pixel value is inserted at a higher index, so this walks
Expand All @@ -230,11 +230,11 @@ def cuda_nanmax_n_in_place(ret, other):
def cuda_nanmin_n_in_place(ret, other):
"""CUDA equivalent of nanmin_n_in_place.
"""
ny, nx, n = ret.shape
x, y = cuda.grid(2)
if x < nx and y < ny:
ret_pixel = ret[y, x] # 1D array of n values for single pixel
other_pixel = other[y, x] # ditto
ny, nx, ncat, n = ret.shape
x, y, cat = cuda.grid(3)
if x < nx and y < ny and cat < ncat:
ret_pixel = ret[y, x, cat] # 1D array of n values for single pixel
other_pixel = other[y, x, cat] # ditto
# Walk along other_pixel array a value at a time, find insertion
# index in ret_pixel and bump values along to insert. Next
# other_pixel value is inserted at a higher index, so this walks
Expand Down
98 changes: 52 additions & 46 deletions datashader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,64 +650,70 @@ def nanmin_in_place(ret, other):

@ngjit_parallel
def nanmax_n_in_place(ret, other):
    """Combine two max-n arrays, taking nans into account.

    Max-n arrays are 4D with shape (ny, nx, ncat, n) where ny and nx are the
    number of pixels, ncat the number of categories (will be 1 if not using a
    categorical reduction) and the last axis containing n values in descending
    order. If there are fewer than n values it is padded with nans.
    Return the first array.

    Note: the scraped diff interleaved the old 3D implementation with the new
    4D one; only the 4D (post-change) implementation is kept here.
    """
    ny, nx, ncat, n = ret.shape
    for y in nb.prange(ny):  # parallel over rows
        for x in range(nx):
            for cat in range(ncat):
                ret_pixel = ret[y, x, cat]  # 1D array of n values for single pixel
                other_pixel = other[y, x, cat]  # ditto
                # Walk along other_pixel array a value at a time, find insertion
                # index in ret_pixel and bump values along to insert. Next
                # other_pixel value is inserted at a higher index, so this walks
                # the two pixel arrays just once each.
                istart = 0
                for other_value in other_pixel:
                    if isnull(other_value):
                        # nan padding marks the end of valid values.
                        break
                    else:
                        for i in range(istart, n):
                            if isnull(ret_pixel[i]) or other_value > ret_pixel[i]:
                                # Bump values along then insert.
                                for j in range(n-1, i, -1):
                                    ret_pixel[j] = ret_pixel[j-1]
                                ret_pixel[i] = other_value
                                istart = i+1
                                break


@ngjit_parallel
def nanmin_n_in_place(ret, other):
    """Combine two min-n arrays, taking nans into account.

    Min-n arrays are 4D with shape (ny, nx, ncat, n) where ny and nx are the
    number of pixels, ncat the number of categories (will be 1 if not using a
    categorical reduction) and the last axis containing n values in ascending
    order. If there are fewer than n values it is padded with nans.
    Return the first array.

    Note: the scraped diff interleaved the old 3D implementation with the new
    4D one; only the 4D (post-change) implementation is kept here.
    """
    ny, nx, ncat, n = ret.shape
    for y in nb.prange(ny):  # parallel over rows
        for x in range(nx):
            for cat in range(ncat):
                ret_pixel = ret[y, x, cat]  # 1D array of n values for single pixel
                other_pixel = other[y, x, cat]  # ditto
                # Walk along other_pixel array a value at a time, find insertion
                # index in ret_pixel and bump values along to insert. Next
                # other_pixel value is inserted at a higher index, so this walks
                # the two pixel arrays just once each.
                istart = 0
                for other_value in other_pixel:
                    if isnull(other_value):
                        # nan padding marks the end of valid values.
                        break
                    else:
                        for i in range(istart, n):
                            if isnull(ret_pixel[i]) or other_value < ret_pixel[i]:
                                # Bump values along then insert.
                                for j in range(n-1, i, -1):
                                    ret_pixel[j] = ret_pixel[j-1]
                                ret_pixel[i] = other_value
                                istart = i+1
                                break


@ngjit_parallel
Expand Down