ENH: Add bincount, like np.bincount but accepts an axis parameter.

WarrenWeckesser · Sep 21, 2024 · 57052ed · 57052ed
1 parent 590561b
commit 57052ed
Show file tree

Hide file tree

Showing 7 changed files with 305 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -143,6 +143,7 @@ function to provide a capability that is not possible with a gufunc.
 
 | Function                                        | Description                                            |
 | --------                                        | -----------                                            |
+| [`bincount`](#bincount)                         | Like `np.bincount`, but gufunc-based                   |
 | [`convert_to_base`](#convert_to_base)           | Convert an integer to a given base.                    |
 | [`nextn_greater`](#nextn_greater)               | Next n values greater than the given x.                |
 | [`nextn_less`](#nextn_less)                     | Next n values greater than the given x.                |
@@ -1870,6 +1871,66 @@ Compare to
 
 ```
 
+#### `bincount`
+
+`bincount(x, m=None, out=None, axis=-1)` is a Python function that wraps a
+gufunc with shape signature `(n)->(m)`.  The function is like `np.bincount`,
+but it accepts n-dimensional arrays.  When the input has more than one
+dimension, the operation is applied along the given axis.
+
+```
+>>> import numpy as np
+>>> from ufunclab import bincount
+
+Create an array to work with.  `x` is an array with shape `(3, 12)`.
+
+>>> rng = np.random.default_rng(121263137472525314065)
+>>> x = rng.integers(0, 8, size=(3, 12))
+>>> x
+array([[7, 0, 5, 0, 2, 7, 7, 3, 0, 3, 4, 5],
+       [2, 6, 7, 1, 3, 0, 6, 1, 2, 0, 0, 6],
+       [0, 6, 1, 5, 2, 1, 4, 2, 6, 4, 2, 6]])
+
+By default, `bincount` operates along the last axis.  The default
+value of `m` is one more than maximum value in `x`, so in this case
+the output length of the counts will be 8.  That is, the output
+array will have shape `(3, 8)`.
+
+>>> bincount(x)
+array([[3, 0, 1, 2, 1, 2, 0, 3],
+       [3, 2, 2, 1, 0, 0, 3, 1],
+       [1, 2, 3, 0, 2, 1, 3, 0]], dtype=uint64)
+
+If we given a value for `m` that is larger than 8, the final values
+will be 0.
+
+>>> bincount(x, 10)
+array([[3, 0, 1, 2, 1, 2, 0, 3, 0, 0],
+       [3, 2, 2, 1, 0, 0, 3, 1, 0, 0],
+       [1, 2, 3, 0, 2, 1, 3, 0, 0, 0]], dtype=uint64)
+
+If the given value of `m` is smaller than `np.max(x) + 1`, the values
+greater than or equal to `m` are ignored.
+
+>>> bincount(x, 4)
+array([[3, 0, 1, 2],
+       [3, 2, 2, 1],
+       [1, 2, 3, 0]], dtype=uint64)
+
+The `axis` parameter selects the axis of `x` along which `bincount`
+is applied.  In the following example, since `x` has shape `(3, 12)`,
+the output has shape `(8, 12)` when `axis=0` is given.
+
+>>> bincount(x, axis=0)
+array([[1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
+       [0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0],
+       [1, 0, 0, 0, 2, 0, 0, 1, 1, 0, 1, 0],
+       [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
+       [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0],
+       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
+       [0, 2, 0, 0, 0, 0, 1, 0, 1, 0, 0, 2],
+       [1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]], dtype=uint64)
+```
 #### `convert_to_base`
 
 `convert_to_base(k, base, ndigits, out=None, axis=-1)` is a Python function

diff --git a/src/bincount/bincount_gufunc.h b/src/bincount/bincount_gufunc.h
@@ -0,0 +1,40 @@
+
+#ifndef BINCOUNT_GUFUNC_H
+#define BINCOUNT_GUFUNC_H
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "numpy/ndarraytypes.h"
+
+#include "../src/util/strided.hpp"
+
+
+//
+// `bincount_core_calc` is the C++ core function
+// for the gufunc `bincount` with signature '(n)->(m)'
+//
+template<typename T, typename U>
+static void
+bincount_core_calc(
+    npy_intp n,         // core dimension n
+    npy_intp m,         // core dimension m
+    T *p_x,             // pointer to x
+    npy_intp x_stride,
+    U *p_out,           // pointer to out, a strided 1-d array
+    npy_intp out_stride
+)
+{
+    for (npy_intp i = 0; i < m; ++i) {
+        set(p_out, out_stride, i, static_cast<U>(0));
+    }
+    for (npy_intp i = 0; i < n; ++i) {
+        T k = get(p_x, x_stride, i);
+        if (k >= 0 && k < m) {        
+            set(p_out, out_stride, k, get(p_out, out_stride, k) + 1);
+        }
+    }
+}
+
+#endif
diff --git a/src/bincount/define_cxx_gufunc_extmod.py b/src/bincount/define_cxx_gufunc_extmod.py
@@ -0,0 +1,42 @@
+
+from ufunc_config_types import UFuncExtMod, UFunc, UFuncSource
+
+
+BINCOUNT_DOCSTRING = """\
+bincount(x, /, ...)
+
+Count integer in `x`, a 1-d array of length n.
+The `out` array *must* be given, and it must be a 1-d.
+"""
+
+bincount_src = UFuncSource(
+    funcname='bincount_core_calc',
+    typesignatures=[
+        'b->P',
+        'B->P',
+        'h->P',
+        'H->P',
+        'i->P',
+        'I->P',
+        'l->P',
+        'L->P',
+        'q->P',
+        'Q->P',
+        'p->P',
+        'P->P',
+    ]
+)
+
+bincount = UFunc(
+    name='bincount',
+    header='bincount_gufunc.h',
+    docstring=BINCOUNT_DOCSTRING,
+    signature='(n)->(m)',
+    sources=[bincount_src],
+)
+
+extmod = UFuncExtMod(
+    module='_bincount',
+    docstring=("This extension module defines the gufunc 'bincount'."),
+    ufuncs=[bincount],
+)
diff --git a/ufunclab/__init__.py b/ufunclab/__init__.py
@@ -17,6 +17,7 @@
 
 # The keys of this dict are in modules that are lazy-loaded.
 _name_to_module = {
+    'bincount': '._wrapped',
     'convert_to_base': '._wrapped',
     'nextn_greater': '._wrapped',
     'nextn_less' : '._wrapped',

diff --git a/ufunclab/_wrapped.py b/ufunclab/_wrapped.py
@@ -8,12 +8,115 @@
     from numpy.exceptions import AxisError
 except ImportError:
     from numpy import AxisError
+from ufunclab._bincount import bincount as _bincount
 from ufunclab._convert_to_base import convert_to_base as _convert_to_base
 from ufunclab._nextn import (nextn_less as _nextn_less,
                              nextn_greater as _nextn_greater)
 from ufunclab._one_hot import one_hot as _one_hot
 
 
+
+def bincount(x, m=None, out=None, axis=-1):
+    """
+    Count the number of occurrences of the positive integers in the 1-d
+    array `x` that are less than `m`.  If `m` is not given, the default
+    value is `max(np.max(x) + 1, 0)`.  If `x` is an n-dimensional array,
+    `axis` selects which axis of `x` the operation is applied to.
+
+    Notes
+    -----
+    This function is a Python wrapper of a gufunc with shape signature
+    `(n)->(m)`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from ufunclab import bincount
+
+    Create an array to work with.  `x` is an array with shape `(3, 12)`.
+
+    >>> rng = np.random.default_rng(121263137472525314065)
+    >>> x = rng.integers(0, 8, size=(3, 12))
+    >>> x
+    array([[7, 0, 5, 0, 2, 7, 7, 3, 0, 3, 4, 5],
+           [2, 6, 7, 1, 3, 0, 6, 1, 2, 0, 0, 6],
+           [0, 6, 1, 5, 2, 1, 4, 2, 6, 4, 2, 6]])
+
+    By default, `bincount` operates along the last axis.  The default
+    value of `m` is one more than maximum value in `x`, so in this case
+    the output length of the counts will be 8.  That is, the output
+    array will have shape `(3, 8)`.
+
+    >>> bincount(x)
+    array([[3, 0, 1, 2, 1, 2, 0, 3],
+           [3, 2, 2, 1, 0, 0, 3, 1],
+           [1, 2, 3, 0, 2, 1, 3, 0]], dtype=uint64)
+
+    If we given a value for `m` that is larger than 8, the final values
+    will be 0.
+
+    >>> bincount(x, 10)
+    array([[3, 0, 1, 2, 1, 2, 0, 3, 0, 0],
+           [3, 2, 2, 1, 0, 0, 3, 1, 0, 0],
+           [1, 2, 3, 0, 2, 1, 3, 0, 0, 0]], dtype=uint64)
+
+    If the given value of `m` is smaller than `np.max(x) + 1`, the values
+    greater than or equal to `m` are ignored.
+
+    >>> bincount(x, 4)
+    array([[3, 0, 1, 2],
+           [3, 2, 2, 1],
+           [1, 2, 3, 0]], dtype=uint64)
+
+    The `axis` parameter selects the axis of `x` along which `bincount`
+    is applied.  In the following example, since `x` has shape `(3, 12)`,
+    the output has shape `(8, 12)` when `axis=0` is given.
+
+    >>> bincount(x, axis=0)
+    array([[1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
+           [0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0],
+           [1, 0, 0, 0, 2, 0, 0, 1, 1, 0, 1, 0],
+           [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
+           [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0],
+           [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
+           [0, 2, 0, 0, 0, 0, 1, 0, 1, 0, 0, 2],
+           [1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]], dtype=uint64)
+
+    """
+    x = np.asarray(x)
+    if x.dtype.char not in np.typecodes['AllInteger']:
+        raise ValueError('x must be an integer array.')
+    if m is None:
+        m = max(np.max(x) + 1, 0)
+    else:
+        try:
+            m = operator.index(m)
+        except TypeError:
+            raise ValueError(f'm must be a nonnegative integer; got {m!r}')
+        if m < 0:
+            raise ValueError(f'm must be a nonnegative integer; got {m!r}')
+
+    x_shape = x.shape
+    adjusted_axis = axis
+    if adjusted_axis < 0:
+        adjusted_axis += len(x_shape)
+    if adjusted_axis < 0 or adjusted_axis > len(x_shape):
+        raise AxisError(f'invalid axis {axis}')
+    out_shape = list(x_shape)
+    out_shape[adjusted_axis] = m
+    out_shape = tuple(out_shape)
+    if out is not None:
+        if out.shape != out_shape:
+            raise ValueError(f'out.shape must be {out_shape}; '
+                             f'got {out.shape}.')
+    else:
+        out = np.empty(out_shape, dtype=np.uintp)
+    return _bincount(x, out=out, axes=[axis, axis])
+
+
+bincount.gufunc = _bincount
+
+
 # XXX Except for `out` and `axis`, this function does not expose any of the
 # gufunc keyword parameters.
 #

diff --git a/ufunclab/meson.build b/ufunclab/meson.build
@@ -47,6 +47,7 @@ npymath_lib = cc.find_library('npymath', dirs: npymath_path)
 
 gufunc_cxx_src_dirs = [
     'all_same',
+    'bincount',
     'backlash',
     'convert_to_base',
     'corr',

diff --git a/ufunclab/tests/test_bincount.py b/ufunclab/tests/test_bincount.py
@@ -0,0 +1,57 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_equal
+from ufunclab import bincount
+
+
+def test_1d_default_m():
+    x = np.array([0, 0, 3, 4, 3, 4, 0, 3, 0])
+    y = bincount(x)
+    assert_equal(y, [4, 0, 0, 3, 2])
+
+
+def test_1d_given_m():
+    x = np.array([0, 0, 3, 4, 3, 4, 0, 3, 0])
+    y = bincount(x, 8)
+    assert_equal(y, [4, 0, 0, 3, 2, 0, 0, 0])
+
+
+@pytest.mark.parametrize('m', [None, 2, 5, 8])
+def test_nd_default_m(m):
+    x = np.array([[0, 4, 4, 3, 2, 1],
+                  [1, 1, 2, 2, 3, 3],
+                  [4, 4, 4, 4, 4, 2]])
+    ref = np.array([[1, 1, 1, 1, 2],
+                    [0, 2, 2, 2, 0],
+                    [0, 0, 1, 0, 5]])
+    refm = max(np.max(x) + 1, 0)
+    y = bincount(x, m=m)
+    if (m is None):
+        expected = ref
+    elif m <= refm:
+        expected = ref[:, :m]
+    else:
+        expected = np.pad(ref, [(0, 0), (0, m - refm)])
+    assert_equal(y, expected)
+
+
+@pytest.mark.parametrize('m', [None, 2, 5, 8])
+def test_axis(m):
+    x = np.array([[0, 4, 4],
+                  [1, 1, 2],
+                  [4, 4, 4],
+                  [0, 0, 2]])
+    ref = np.array([[2, 1, 0],
+                    [1, 1, 0],
+                    [0, 0, 2],
+                    [0, 0, 0],
+                    [1, 2, 2]])
+    refm = max(np.max(x) + 1, 0)
+    y = bincount(x, m=m, axis=0)
+    if (m is None):
+        expected = ref
+    elif m <= refm:
+        expected = ref[:m, :]
+    else:
+        expected = np.pad(ref, [(0, m - refm), (0, 0)])
+    assert_equal(y, expected)