From 19b1d7e553a99d57dee5a992ecfe9f17b7b20f59 Mon Sep 17 00:00:00 2001 From: Jianfeng Mao <4297243+jmao-denver@users.noreply.github.com> Date: Fri, 1 Dec 2023 13:25:21 -0700 Subject: [PATCH] auto convert Java values (arrays/scalars) to NumPy ones and convert DH nulls based on the annotations of the params of a Py UDF (#4502) * A bit of a milestone * made test suite pass * Refactor the new code * Add more tests * More refactoring and code cleanup * Fix a bug that fails vectorization * More code cleanup and clarification * More pathological test cases * Fix String/Instant array conversion issue * Fix test failures and refactor code * Trivial renaming * Respond to review comments * Apply suggestions from code review Co-authored-by: Chip Kent <5250374+chipkent@users.noreply.github.com> * Refactor the code and minor fixes * Improve the test cases * Clearly distinguish between params and return * Clarify some code with comments * More clarifying comments --------- Co-authored-by: Chip Kent <5250374+chipkent@users.noreply.github.com> --- .../engine/util/PyCallableWrapperJpyImpl.java | 15 +- py/server/deephaven/_udf.py | 420 ++++++++++++++++++ py/server/deephaven/dtypes.py | 63 ++- py/server/deephaven/jcompat.py | 123 ++++- py/server/deephaven/numpy.py | 29 +- py/server/deephaven/pandas.py | 49 +- py/server/deephaven/table.py | 193 +------- py/server/tests/test_numba_guvectorize.py | 12 +- py/server/tests/test_udf_numpy_args.py | 397 +++++++++++++++++ ...lues.py => test_udf_return_java_values.py} | 19 +- py/server/tests/test_vectorization.py | 56 +-- 11 files changed, 1061 insertions(+), 315 deletions(-) create mode 100644 py/server/deephaven/_udf.py create mode 100644 py/server/tests/test_udf_numpy_args.py rename py/server/tests/{test_pyfunc_return_java_values.py => test_udf_return_java_values.py} (96%) diff --git a/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java b/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java index 18262f8e7f0..006bae5be5c 100644 --- a/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java +++ b/engine/table/src/main/java/io/deephaven/engine/util/PyCallableWrapperJpyImpl.java @@ -24,17 +24,18 @@ public class PyCallableWrapperJpyImpl implements PyCallableWrapper { private static final PyObject NUMBA_VECTORIZED_FUNC_TYPE = getNumbaVectorizedFuncType(); private static final PyObject NUMBA_GUVECTORIZED_FUNC_TYPE = getNumbaGUVectorizedFuncType(); - private static final PyModule dh_table_module = PyModule.importModule("deephaven.table"); + private static final PyModule dh_udf_module = PyModule.importModule("deephaven._udf"); private static final Map<Character, Class<?>> numpyType2JavaClass = new HashMap<>(); static { + numpyType2JavaClass.put('b', byte.class); + numpyType2JavaClass.put('h', short.class); + numpyType2JavaClass.put('H', char.class); numpyType2JavaClass.put('i', int.class); numpyType2JavaClass.put('l', long.class); - numpyType2JavaClass.put('h', short.class); numpyType2JavaClass.put('f', float.class); numpyType2JavaClass.put('d', double.class); - numpyType2JavaClass.put('b', byte.class); numpyType2JavaClass.put('?', boolean.class); numpyType2JavaClass.put('U', String.class); numpyType2JavaClass.put('M', Instant.class); @@ -133,23 +134,21 @@ private void prepareSignature() { pyCallable + " has multiple signatures; this is not currently supported for numba vectorized/guvectorized functions"); } - signature = params.get(0).getStringValue(); unwrapped = pyCallable; // since vectorization doesn't
support array type parameters, don't flag numba guvectorized as vectorized numbaVectorized = isNumbaVectorized; vectorized = isNumbaVectorized; } else if (pyCallable.hasAttribute("dh_vectorized")) { - signature = pyCallable.getAttribute("signature").toString(); unwrapped = pyCallable.getAttribute("callable"); numbaVectorized = false; vectorized = true; } else { - signature = dh_table_module.call("_encode_signature", pyCallable).toString(); unwrapped = pyCallable; numbaVectorized = false; vectorized = false; } - pyUdfDecoratedCallable = dh_table_module.call("_py_udf", unwrapped); + pyUdfDecoratedCallable = dh_udf_module.call("_py_udf", unwrapped); + signature = pyUdfDecoratedCallable.getAttribute("signature").toString(); } @Override @@ -199,7 +198,7 @@ public PyObject vectorizedCallable() { if (numbaVectorized || vectorized) { return pyCallable; } else { - return dh_table_module.call("dh_vectorize", unwrapped); + return dh_udf_module.call("_dh_vectorize", unwrapped); } } diff --git a/py/server/deephaven/_udf.py b/py/server/deephaven/_udf.py new file mode 100644 index 00000000000..fba76b3472a --- /dev/null +++ b/py/server/deephaven/_udf.py @@ -0,0 +1,420 @@ +# +# Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending +# + +from __future__ import annotations + +import inspect +import re +from dataclasses import dataclass, field +from functools import wraps +from typing import Callable, List, Any, Union, Tuple, _GenericAlias + +import numba +import numpy +import numpy as np + +from deephaven import DHError, dtypes +from deephaven.dtypes import _np_ndarray_component_type, _np_dtype_char, _NUMPY_INT_TYPE_CODES, \ _NUMPY_FLOATING_TYPE_CODES, _component_np_dtype_char, _J_ARRAY_NP_TYPE_MAP, _PRIMITIVE_DTYPE_NULL_MAP, _scalar, \ _BUILDABLE_ARRAY_DTYPE_MAP +from deephaven.jcompat import _j_array_to_numpy_array +from deephaven.time import to_np_datetime64 + +# For unittest vectorization +test_vectorization = False +vectorized_count = 0 + + +_SUPPORTED_NP_TYPE_CODES = {"b", "h", "H", "i", "l", "f", "d", "?", "U", "M", "O"} + + +@dataclass +class _ParsedParamAnnotation: + orig_types: set[type] = field(default_factory=set) + encoded_types: set[str] = field(default_factory=set) + none_allowed: bool = False + has_array: bool = False + int_char: str = None + floating_char: str = None + + +@dataclass +class _ParsedReturnAnnotation: + orig_type: type = None + encoded_type: str = None + none_allowed: bool = False + has_array: bool = False + + +@dataclass +class _ParsedSignature: + fn: Callable = None + params: List[_ParsedParamAnnotation] = field(default_factory=list) + ret_annotation: _ParsedReturnAnnotation = None + + @property + def encoded(self) -> str: + """Encode the signature of a Python function by mapping the annotations of the parameter types and the return + type to numpy dtype chars (b,h,H,i,l,f,d,?,U,M,O), '[' for array, and 'N' for NoneType, and pack them into a + string with parameter type chars first, in their original order, followed by the delimiter string '->', + then the return type char. If a parameter or the return of the function is not annotated, + the default 'O' (object) type will be used. + """ + param_str = ",".join(["".join(p.encoded_types) for p in self.params]) + # ret_annotation has only one parsed annotation, and it might be Optional which means it contains 'N' in the + # encoded type. We need to remove it.
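# --- Editor's illustration (not part of the patch): a sketch of what `encoded`
# produces for a typical annotated UDF, assuming a server-side session where
# deephaven._udf is importable; the chars for a Union parameter come from a set,
# so their order may vary.
from typing import Optional
import numpy as np
from deephaven._udf import _parse_signature

def example_udf(x: np.int32, y: Optional[float]) -> bool:
    return bool(x) and y is not None

# 'i' for np.int32; 'd' plus 'N' for Optional[float]; '?' for bool, with any
# 'N' stripped from the return side per the comment above
print(_parse_signature(example_udf).encoded)  # e.g. "i,dN->?"
# --- end illustration ---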
+ return_type_code = re.sub(r"[N]", "", self.ret_annotation.encoded_type) + return param_str + "->" + return_type_code + + +def _encode_param_type(t: type) -> str: + """Returns the numpy-based char code for the given type. + If the type is a numpy ndarray, prefix the numpy dtype char with '[' following the Java array type convention. + If the type is NoneType (as in Optional, or None in a Union), return 'N'. + """ + if t is type(None): + return "N" + + # find the component type if it is numpy ndarray + component_type = _np_ndarray_component_type(t) + if component_type: + t = component_type + + tc = _np_dtype_char(t) + tc = tc if tc in _SUPPORTED_NP_TYPE_CODES else "O" + + if component_type: + tc = "[" + tc + return tc + + +def _parse_param_annotation(annotation: Any) -> _ParsedParamAnnotation: + """ Parse a parameter annotation in a function's signature """ + p_annotation = _ParsedParamAnnotation() + + if annotation is inspect._empty: + p_annotation.encoded_types.add("O") + p_annotation.none_allowed = True + elif isinstance(annotation, _GenericAlias) and annotation.__origin__ == Union: + for t in annotation.__args__: + _parse_type_no_nested(annotation, p_annotation, t) + else: + _parse_type_no_nested(annotation, p_annotation, annotation) + return p_annotation + + +def _parse_type_no_nested(annotation: Any, p_annotation: _ParsedParamAnnotation, t: type) -> None: + """ Parse a specific type (top level or nested in a top-level Union annotation) without handling nested types + (e.g. a nested Union). The result is stored in the given _ParsedParamAnnotation object. + """ + p_annotation.orig_types.add(t) + tc = _encode_param_type(t) + if "[" in tc: + p_annotation.has_array = True + if tc in {"N", "O"}: + p_annotation.none_allowed = True + if tc in _NUMPY_INT_TYPE_CODES: + if p_annotation.int_char and p_annotation.int_char != tc: + raise DHError(message=f"multiple integer types in annotation: {annotation}, " + f"types: {p_annotation.int_char}, {tc}. this is not supported because it is not " + f"clear which Deephaven null value to use when checking for nulls in the argument") + p_annotation.int_char = tc + if tc in _NUMPY_FLOATING_TYPE_CODES: + if p_annotation.floating_char and p_annotation.floating_char != tc: + raise DHError(message=f"multiple floating types in annotation: {annotation}, " + f"types: {p_annotation.floating_char}, {tc}. this is not supported because it is not " + f"clear which Deephaven null value to use when checking for nulls in the argument") + p_annotation.floating_char = tc + p_annotation.encoded_types.add(tc) + + +def _parse_return_annotation(annotation: Any) -> _ParsedReturnAnnotation: + """ Parse a function's return annotation + + The return annotation is treated differently from the parameter annotations. We don't apply the same checks and are + only interested in getting the array-like type right. Any nonsensical annotation will be treated as object type. + This definitely can be improved in the future.
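    Example (editor's illustration, not part of the patch; assumes Python 3.9+
    for np.ndarray[...] subscription and a server-side session where
    deephaven._udf is importable):

        from typing import Optional
        import numpy as np
        from deephaven._udf import _parse_return_annotation

        # a two-member Optional/Union is unwrapped to its non-None member,
        # and the array component type drives the encoding
        pra = _parse_return_annotation(Optional[np.ndarray[np.int64]])
        print(pra.encoded_type, pra.has_array)  # e.g. "[l" True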
+ """ + + pra = _ParsedReturnAnnotation() + + t = annotation + pra.orig_type = t + if isinstance(annotation, _GenericAlias) and annotation.__origin__ == Union and len(annotation.__args__) == 2: + # if the annotation is a Union of two types, we'll use the non-None type + if annotation.__args__[1] == type(None): # noqa: E721 + t = annotation.__args__[0] + elif annotation.__args__[0] == type(None): # noqa: E721 + t = annotation.__args__[1] + component_char = _component_np_dtype_char(t) + if component_char: + pra.encoded_type = "[" + component_char + pra.has_array = True + else: + pra.encoded_type = _np_dtype_char(t) + return pra + + +def _parse_numba_signature(fn: Union[numba.np.ufunc.gufunc.GUFunc, numba.np.ufunc.dufunc.DUFunc]) -> _ParsedSignature: + """ Parse a numba function's signature""" + sigs = fn.types # in the format of ll->l, ff->f,dd->d,OO->O, etc. + if sigs: + p_sig = _ParsedSignature(fn) + + # for now, we only support one signature for a numba function because the query engine is not ready to handle + # multiple signatures for vectorization https://github.com/deephaven/deephaven-core/issues/4762 + sig = sigs[0] + params, rt_char = sig.split("->") + + p_sig.params = [] + p_sig.ret_annotation = _ParsedReturnAnnotation() + p_sig.ret_annotation.encoded_type = rt_char + + if isinstance(fn, numba.np.ufunc.dufunc.DUFunc): + for p in params: + pa = _ParsedParamAnnotation() + pa.encoded_types.add(p) + if p in _NUMPY_INT_TYPE_CODES: + pa.int_char = p + if p in _NUMPY_FLOATING_TYPE_CODES: + pa.floating_char = p + p_sig.params.append(pa) + else: # GUFunc + # An example: @guvectorize([(int64[:], int64[:], int64[:])], "(m),(n)->(n)" + input_output_decl = fn.signature # "(m),(n)->(n)" in the above example + input_decl, output_decl = input_output_decl.split("->") + # remove the parentheses so that empty string indicates no array, non-empty string indicates array + input_decl = re.sub("[()]", "", input_decl).split(",") + output_decl = re.sub("[()]", "", output_decl) + + for p, d in zip(params, input_decl): + pa = _ParsedParamAnnotation() + if d: + pa.encoded_types.add("[" + p) + pa.has_array = True + else: + pa.encoded_types.add(p) + if p in _NUMPY_INT_TYPE_CODES: + pa.int_char = p + if p in _NUMPY_FLOATING_TYPE_CODES: + pa.floating_char = p + p_sig.params.append(pa) + + if output_decl: + p_sig.ret_annotation.has_array = True + return p_sig + else: + raise DHError(message=f"numba decorated functions must have an explicitly defined signature: {fn}") + + +def _parse_np_ufunc_signature(fn: numpy.ufunc) -> _ParsedSignature: + """ Parse the signature of a numpy ufunc """ + + # numpy ufuncs actually have signature encoded in their 'types' attribute, we want to better support + # them in the future (https://github.com/deephaven/deephaven-core/issues/4762) + p_sig = _ParsedSignature(fn) + if fn.nin > 0: + pa = _ParsedParamAnnotation() + pa.encoded_types.add("O") + p_sig.params = [pa] * fn.nin + p_sig.ret_annotation = _ParsedReturnAnnotation() + p_sig.ret_annotation.encoded_type = "O" + return p_sig + + +def _parse_signature(fn: Callable) -> _ParsedSignature: + """ Parse the signature of a function """ + + if isinstance(fn, (numba.np.ufunc.gufunc.GUFunc, numba.np.ufunc.dufunc.DUFunc)): + return _parse_numba_signature(fn) + elif isinstance(fn, numpy.ufunc): + return _parse_np_ufunc_signature(fn) + else: + p_sig = _ParsedSignature(fn=fn) + sig = inspect.signature(fn) + for n, p in sig.parameters.items(): + p_sig.params.append(_parse_param_annotation(p.annotation)) + + p_sig.ret_annotation = 
_parse_return_annotation(sig.return_annotation) + return p_sig + + +def _convert_arg(param: _ParsedParamAnnotation, arg: Any) -> Any: + """ Convert a single argument to the type specified by the annotation """ + if arg is None: + if not param.none_allowed: + raise TypeError(f"Argument {arg} is not compatible with annotation {param.orig_types}") + else: + return None + + # if the arg is a Java array + if np_dtype := _J_ARRAY_NP_TYPE_MAP.get(type(arg)): + encoded_type = "[" + np_dtype.char + # if it matches one of the encoded types, convert it + if encoded_type in param.encoded_types: + dtype = dtypes.from_np_dtype(np_dtype) + return _j_array_to_numpy_array(dtype, arg, conv_null=True, type_promotion=False) + # if the annotation is missing, or it is a generic object type, return the arg + elif "O" in param.encoded_types: + return arg + else: + raise TypeError(f"Argument {arg} is not compatible with annotation {param.encoded_types}") + else: # if the arg is not a Java array + specific_types = param.encoded_types - {"N", "O"} # remove NoneType and object type + if specific_types: + for t in specific_types: + if t.startswith("["): + if isinstance(arg, np.ndarray) and arg.dtype.char == t[1]: + return arg + continue + + dtype = dtypes.from_np_dtype(np.dtype(t)) + dh_null = _PRIMITIVE_DTYPE_NULL_MAP.get(dtype) + + if param.int_char and isinstance(arg, int): + if arg == dh_null: + if param.none_allowed: + return None + else: + raise DHError(f"Argument {arg} is not compatible with annotation {param.orig_types}") + else: + return np.dtype(param.int_char).type(arg) + elif param.floating_char and isinstance(arg, float): + if isinstance(arg, float): + if arg == dh_null: + return np.nan if "N" not in param.encoded_types else None + else: + return np.dtype(param.floating_char).type(arg) + elif t == "?" and isinstance(arg, bool): + return arg + elif t == "M": + try: + return to_np_datetime64(arg) + except Exception as e: + # don't raise an error, if this is the only annotation, the else block of the for loop will + # catch it and raise a TypeError + pass + elif t == "U" and isinstance(arg, str): + return arg + else: # didn't return from inside the for loop + if "O" in param.encoded_types: + return arg + else: + raise TypeError(f"Argument {arg} is not compatible with annotation {param.orig_types}") + else: # if no annotation or generic object, return arg + return arg + + +def _convert_args(p_sig: _ParsedSignature, args: Tuple[Any, ...]) -> List[Any]: + """ Convert all arguments to the types specified by the annotations. + Given that the number of arguments and the number of parameters may not match (in the presence of keyword, + var-positional, or var-keyword parameters), we have the following rules: + If the number of arguments is less than the number of parameters, the remaining parameters are left as is. + If the number of arguments is greater than the number of parameters, the extra arguments are left as is. + + Python's function call mechanism will raise an exception if it can't resolve the parameters with the arguments. + """ + converted_args = [_convert_arg(param, arg) for param, arg in zip(p_sig.params, args)] + converted_args.extend(args[len(converted_args):]) + return converted_args + + +def _py_udf(fn: Callable): + """A decorator that acts as a transparent translator for Python UDFs used in Deephaven query formulas between + Python and Java. This decorator is intended for use by the Deephaven query engine and should not be used by + users. + + It carries out two conversions: + 1. 
convert Python function return values to Java values. + For properly annotated functions, including numba vectorized and guvectorized ones, this decorator inspects the + signature of the function and determines its return type, including supported primitive types and arrays of + the supported primitive types. It then converts the return value of the function to the corresponding Java value + of the same type. For unsupported types, the decorator returns the original Python value which appears as + org.jpy.PyObject in Java. + 2. convert Java function arguments to Python values based on the signature of the function. + """ + if hasattr(fn, "return_type"): + return fn + p_sig = _parse_signature(fn) + # build a signature string for vectorization by removing NoneType, array char '[', and comma from the encoded types + # since vectorization only supports UDFs with a single signature and enforces an exact match, any non-compliant + # signature (e.g. Union with more than 1 non-NoneType) will be rejected by the vectorizer. + sig_str_vectorization = re.sub(r"[\[N,]", "", p_sig.encoded) + return_array = p_sig.ret_annotation.has_array + ret_dtype = dtypes.from_np_dtype(np.dtype(p_sig.ret_annotation.encoded_type[-1])) + + @wraps(fn) + def wrapper(*args, **kwargs): + converted_args = _convert_args(p_sig, args) + # kwargs are not converted because they are not used in the UDFs + ret = fn(*converted_args, **kwargs) + if return_array: + return dtypes.array(ret_dtype, ret) + elif ret_dtype == dtypes.PyObject: + return ret + else: + return _scalar(ret, ret_dtype) + + wrapper.j_name = ret_dtype.j_name + real_ret_dtype = _BUILDABLE_ARRAY_DTYPE_MAP.get(ret_dtype, dtypes.PyObject) if return_array else ret_dtype + + if hasattr(ret_dtype.j_type, 'jclass'): + j_class = real_ret_dtype.j_type.jclass + else: + j_class = real_ret_dtype.qst_type.clazz() + + wrapper.return_type = j_class + wrapper.signature = sig_str_vectorization + + return wrapper + + +def _dh_vectorize(fn): + """A decorator to vectorize a Python function used in Deephaven query formulas and invoked on a row basis. + + If this annotation is not used on a query function, the Deephaven query engine will make an effort to vectorize + the function. If vectorization is not possible, the query engine will use the original, non-vectorized function. + If this annotation is used on a function, the Deephaven query engine will use the vectorized function in a query, + or an error will result if the function can not be vectorized. + + When this decorator is used on a function, the number and type of input and output arguments are changed. + These changes are only intended for use by the Deephaven query engine. Users are discouraged from using + vectorized functions in non-query code, since the function signature may change in future versions. + + The current vectorized function signature includes (1) the size of the input arrays, (2) the output array, + and (3) the input arrays. + """ + p_sig = _parse_signature(fn) + ret_dtype = dtypes.from_np_dtype(np.dtype(p_sig.ret_annotation.encoded_type[-1])) + + @wraps(fn) + def wrapper(*args): + if len(args) != len(p_sig.params) + 2: + raise ValueError( + f"The number of arguments doesn't match the function signature. {len(args) - 2}, {p_sig.encoded}") + if args[0] <= 0: + raise ValueError(f"The chunk size argument must be a positive integer. 
{args[0]}") + + chunk_size = args[0] + chunk_result = args[1] + if args[2:]: + vectorized_args = zip(*args[2:]) + for i in range(chunk_size): + scalar_args = next(vectorized_args) + converted_args = _convert_args(p_sig, scalar_args) + chunk_result[i] = _scalar(fn(*converted_args), ret_dtype) + else: + for i in range(chunk_size): + chunk_result[i] = _scalar(fn(), ret_dtype) + + return chunk_result + + wrapper.callable = fn + wrapper.dh_vectorized = True + + if test_vectorization: + global vectorized_count + vectorized_count += 1 + + return wrapper \ No newline at end of file diff --git a/py/server/deephaven/dtypes.py b/py/server/deephaven/dtypes.py index 5f5857ffdbe..56d2f25ca0d 100644 --- a/py/server/deephaven/dtypes.py +++ b/py/server/deephaven/dtypes.py @@ -102,6 +102,8 @@ def __call__(self, *args, **kwargs): """Double-precision floating-point number type""" string = DType(j_name="java.lang.String", qst_type=_JQstType.stringType(), np_type=np.str_) """String type""" +Character = DType(j_name="java.lang.Character") +"""Character type""" BigDecimal = DType(j_name="java.math.BigDecimal") """Java BigDecimal type""" StringSet = DType(j_name="io.deephaven.stringset.StringSet") @@ -188,6 +190,20 @@ def __call__(self, *args, **kwargs): } +_J_ARRAY_NP_TYPE_MAP = { + boolean_array.j_type: np.dtype("?"), + byte_array.j_type: np.dtype("b"), + char_array.j_type: np.dtype("uint16"), + short_array.j_type: np.dtype("h"), + int32_array.j_type: np.dtype("i"), + long_array.j_type: np.dtype("l"), + float32_array.j_type: np.dtype("f"), + double_array.j_type: np.dtype("d"), + string_array.j_type: np.dtype("U"), + instant_array.j_type: np.dtype("datetime64[ns]"), +} + + def null_remap(dtype: DType) -> Callable[[Any], Any]: """ Creates a null value remap function for the provided DType. @@ -325,8 +341,19 @@ def from_np_dtype(np_dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]) - return PyObject -_NUMPY_INT_TYPE_CODES = ["i", "l", "h", "b"] -_NUMPY_FLOATING_TYPE_CODES = ["f", "d"] +_NUMPY_INT_TYPE_CODES = {"b", "h", "H", "i", "l"} +_NUMPY_FLOATING_TYPE_CODES = {"f", "d"} + + +def _is_py_null(x: Any) -> bool: + """Checks if the value is a Python null value, i.e. None or NaN, or Pandas.NA.""" + if x is None: + return True + + try: + return bool(pd.isna(x)) + except (TypeError, ValueError): + return False def _scalar(x: Any, dtype: DType) -> Any: @@ -336,12 +363,14 @@ def _scalar(x: Any, dtype: DType) -> Any: # NULL_BOOL will appear in Java as a byte value which causes a cast error. We just let JPY converts it to Java null # and the engine has casting logic to handle it. 
- if x is None and dtype != bool_ and _PRIMITIVE_DTYPE_NULL_MAP.get(dtype): - return _PRIMITIVE_DTYPE_NULL_MAP[dtype] + if (dt := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype)) and _is_py_null(x) and dtype not in (bool_, char): + return dt try: if hasattr(x, "dtype"): - if x.dtype.char in _NUMPY_INT_TYPE_CODES: + if x.dtype.char == 'H': # np.uint16 maps to Java char + return Character(int(x)) + elif x.dtype.char in _NUMPY_INT_TYPE_CODES: return int(x) elif x.dtype.char in _NUMPY_FLOATING_TYPE_CODES: return float(x) @@ -382,20 +411,32 @@ def _component_np_dtype_char(t: type) -> Optional[str]: if isinstance(t, _GenericAlias) and issubclass(t.__origin__, Sequence): component_type = t.__args__[0] + if not component_type: + component_type = _np_ndarray_component_type(t) + + if component_type: + return _np_dtype_char(component_type) + else: + return None + + +def _np_ndarray_component_type(t: type) -> Optional[type]: + """Returns the numpy ndarray component type if the type is a numpy ndarray, otherwise return None.""" + # Py3.8: npt.NDArray can be used in Py 3.8 as a generic alias, but a specific alias (e.g. npt.NDArray[np.int64]) # is an instance of a private class of np, yet we don't have a choice but to use it. And when npt.NDArray is used, # the 1st argument is typing.Any, the 2nd argument is another generic alias of which the 1st argument is the # component type - if not component_type and sys.version_info.minor == 8: + component_type = None + if sys.version_info.major == 3 and sys.version_info.minor == 8: if isinstance(t, np._typing._generic_alias._GenericAlias) and t.__origin__ == np.ndarray: component_type = t.__args__[1].__args__[0] - # Py3.9+, np.ndarray as a generic alias is only supported in Python 3.9+, also npt.NDArray is still available but a # specific alias (e.g. npt.NDArray[np.int64]) now is an instance of typing.GenericAlias. # when npt.NDArray is used, the 1st argument is typing.Any, the 2nd argument is another generic alias of which # the 1st argument is the component type # when np.ndarray is used, the 1st argument is the component type - if not component_type and sys.version_info.minor > 8: + if not component_type and sys.version_info.major == 3 and sys.version_info.minor > 8: import types if isinstance(t, types.GenericAlias) and (issubclass(t.__origin__, Sequence) or t.__origin__ == np.ndarray): nargs = len(t.__args__) @@ -406,8 +447,4 @@ def _component_np_dtype_char(t: type) -> Optional[str]: a1 = t.__args__[1] if a0 == typing.Any and isinstance(a1, types.GenericAlias): component_type = a1.__args__[0] - - if component_type: - return _np_dtype_char(component_type) - else: - return None + return component_type diff --git a/py/server/deephaven/jcompat.py b/py/server/deephaven/jcompat.py index d12f0d01f64..c1d54a2f443 100644 --- a/py/server/deephaven/jcompat.py +++ b/py/server/deephaven/jcompat.py @@ -5,12 +5,29 @@ """ This module provides Java compatibility support including convenience functions to create some widely used Java data structures from corresponding Python ones in order to be able to call Java methods. 
""" -from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union +from typing import Any, Callable, Dict, Iterable, List, Sequence, Set, TypeVar, Union, Tuple, Literal import jpy +import numpy as np +import pandas as pd +from deephaven import dtypes, DHError from deephaven._wrapper import unwrap, wrap_j_object -from deephaven.dtypes import DType +from deephaven.dtypes import DType, _PRIMITIVE_DTYPE_NULL_MAP, _J_ARRAY_NP_TYPE_MAP + +_NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE +_JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility") + +_DH_PANDAS_NULLABLE_TYPE_MAP: Dict[DType, pd.api.extensions.ExtensionDtype] = { + dtypes.bool_: pd.BooleanDtype, + dtypes.byte: pd.Int8Dtype, + dtypes.short: pd.Int16Dtype, + dtypes.char: pd.UInt16Dtype, + dtypes.int32: pd.Int32Dtype, + dtypes.int64: pd.Int64Dtype, + dtypes.float32: pd.Float32Dtype, + dtypes.float64: pd.Float64Dtype, +} def is_java_type(obj: Any) -> bool: @@ -181,11 +198,109 @@ def to_sequence(v: Union[T, Sequence[T]] = None, wrapped: bool = False) -> Seque return () if wrapped: if not isinstance(v, Sequence) or isinstance(v, str): - return (v, ) + return (v,) else: return tuple(v) if not isinstance(v, Sequence) or isinstance(v, str): - return (unwrap(v), ) + return (unwrap(v),) else: return tuple((unwrap(o) for o in v)) + + +def _j_array_to_numpy_array(dtype: DType, j_array: jpy.JType, conv_null: bool, type_promotion: bool = False) -> \ + np.ndarray: + """ Produces a numpy array from the DType and given Java array. + + Args: + dtype (DType): The dtype of the Java array + j_array (jpy.JType): The Java array to convert + conv_null (bool): If True, convert nulls to the null value for the dtype + type_promotion (bool): Ignored when conv_null is False. When type_promotion is False, (1) input Java integer, + boolean, or character arrays containing Deephaven nulls yield an exception, (2) input Java float or double + arrays containing Deephaven nulls have null values converted to np.nan, and (3) input Java arrays without + Deephaven nulls are converted to the target type. When type_promotion is True, (1) input Java integer, + boolean, or character arrays containing Deephaven nulls are converted to np.float64 arrays and Deephaven + null values are converted to np.nan, (2) input Java float or double arrays containing Deephaven nulls have + null values converted to np.nan, and (3) input Java arrays without Deephaven nulls are converted to the + target type. Defaults to False. 
+ + Returns: + np.ndarray: The numpy array + + Raises: + DHError + """ + if dtype.is_primitive: + np_array = np.frombuffer(j_array, dtype.np_type) + elif dtype == dtypes.Instant: + longs = _JPrimitiveArrayConversionUtility.translateArrayInstantToLong(j_array) + np_long_array = np.frombuffer(longs, np.int64) + np_array = np_long_array.view(dtype.np_type) + elif dtype == dtypes.bool_: + # dh nulls will be preserved and show up as True b/c the underlying byte array isn't modified + bytes_ = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(j_array) + np_array = np.frombuffer(bytes_, dtype.np_type) + elif dtype == dtypes.string: + np_array = np.array(j_array, dtypes.string.np_type) + elif dtype.np_type is not np.object_: + try: + np_array = np.frombuffer(j_array, dtype.np_type) + except: + np_array = np.array(j_array, np.object_) + else: + np_array = np.array(j_array, np.object_) + + if conv_null: + if dh_null := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype): + if dtype in (dtypes.float32, dtypes.float64): + np_array = np.copy(np_array) + np_array[np_array == dh_null] = np.nan + else: + if dtype is dtypes.bool_: # needs to change its type to byte for dh null detection + np_array = np.frombuffer(np_array, np.byte) + + if any(np_array[np_array == dh_null]): + if not type_promotion: + raise DHError(f"Problem creating numpy array. Java {dtype} array contains Deephaven null values, but numpy {np_array.dtype} array does not support null values") + np_array = np_array.astype(np.float64) + np_array[np_array == dh_null] = np.nan + else: + if dtype is dtypes.bool_: # needs to change its type back to bool + np_array = np.frombuffer(np_array, np.bool_) + return np_array + + return np_array + + +def _j_array_to_series(dtype: DType, j_array: jpy.JType, conv_null: bool) -> pd.Series: + """Produce a copy of the specified Java array as a pandas.Series object. + + Args: + dtype (DType): the dtype of the Java array + j_array (jpy.JType): the Java array + conv_null (bool): whether to check for Deephaven nulls in the data and automatically replace them with + pd.NA. 
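    Example (editor's illustration, not part of the patch; assumes a
    server-side session -- DH nulls surface as pd.NA under the nullable
    extension dtype mapped for the DType):

        from deephaven import dtypes
        from deephaven.constants import NULL_INT
        from deephaven.jcompat import _j_array_to_series

        s = _j_array_to_series(dtypes.int32, dtypes.array(dtypes.int32, [1, NULL_INT]), conv_null=True)
        print(s.dtype)            # Int32 (pandas nullable extension dtype)
        print(s.isna().tolist())  # [False, True]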
+ + Returns: + a pandas Series + + Raises: + DHError + """ + if conv_null and dtype == dtypes.bool_: + j_array = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(j_array) + np_array = np.frombuffer(j_array, dtype=np.byte) + s = pd.Series(data=np_array, dtype=pd.Int8Dtype(), copy=False) + s.mask(s == _NULL_BOOLEAN_AS_BYTE, inplace=True) + return s.astype(pd.BooleanDtype(), copy=False) + + np_array = _j_array_to_numpy_array(dtype, j_array, conv_null=False) + if conv_null and (nv := _PRIMITIVE_DTYPE_NULL_MAP.get(dtype)) is not None: + pd_ex_dtype = _DH_PANDAS_NULLABLE_TYPE_MAP.get(dtype) + s = pd.Series(data=np_array, dtype=pd_ex_dtype(), copy=False) + s.mask(s == nv, inplace=True) + else: + s = pd.Series(data=np_array, copy=False) + + return s diff --git a/py/server/deephaven/numpy.py b/py/server/deephaven/numpy.py index 412b6e8b5ac..3cc898271b3 100644 --- a/py/server/deephaven/numpy.py +++ b/py/server/deephaven/numpy.py @@ -8,13 +8,13 @@ import jpy import numpy as np -from deephaven.dtypes import DType -from deephaven import DHError, dtypes, empty_table, new_table +from deephaven import DHError, dtypes, new_table from deephaven.column import Column, InputColumn +from deephaven.dtypes import DType +from deephaven.jcompat import _j_array_to_numpy_array from deephaven.table import Table -_JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility") _JDataAccessHelpers = jpy.get_type("io.deephaven.engine.table.impl.DataAccessHelpers") @@ -25,28 +25,9 @@ def _to_column_name(name: str) -> str: def column_to_numpy_array(col_def: Column, j_array: jpy.JType) -> np.ndarray: - """ Produces a numpy array from the given Java array and the Table column definition. """ + """ Produces a numpy array from the given Java array and the Table column definition.""" try: - if col_def.data_type.is_primitive: - np_array = np.frombuffer(j_array, col_def.data_type.np_type) - elif col_def.data_type == dtypes.Instant: - longs = _JPrimitiveArrayConversionUtility.translateArrayInstantToLong(j_array) - np_long_array = np.frombuffer(longs, np.int64) - np_array = np_long_array.view(col_def.data_type.np_type) - elif col_def.data_type == dtypes.bool_: - bytes_ = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(j_array) - np_array = np.frombuffer(bytes_, col_def.data_type.np_type) - elif col_def.data_type == dtypes.string: - np_array = np.array([s for s in j_array], dtypes.string.np_type) - elif col_def.data_type.np_type is not np.object_: - try: - np_array = np.frombuffer(j_array, col_def.data_type.np_type) - except: - np_array = np.array(j_array, np.object_) - else: - np_array = np.array(j_array, np.object_) - - return np_array + return _j_array_to_numpy_array(col_def.data_type, j_array, conv_null=False, type_promotion=False) except DHError: raise except Exception as e: diff --git a/py/server/deephaven/pandas.py b/py/server/deephaven/pandas.py index 883622ce27b..8626b999e11 100644 --- a/py/server/deephaven/pandas.py +++ b/py/server/deephaven/pandas.py @@ -3,7 +3,7 @@ # """ This module supports the conversion between Deephaven tables and pandas DataFrames. 
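    Example (editor's illustration, not part of the patch; a minimal round trip
    through this module's converters, assuming a server-side session):

        from deephaven import empty_table
        from deephaven.pandas import to_pandas, to_table

        df = to_pandas(empty_table(3).update("X = i"))
        t = to_table(df)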
""" -from typing import List, Dict, Tuple, Literal +from typing import List, Literal import jpy import numpy as np @@ -13,26 +13,14 @@ from deephaven import DHError, new_table, dtypes, arrow from deephaven.column import Column from deephaven.constants import NULL_BYTE, NULL_SHORT, NULL_INT, NULL_LONG, NULL_FLOAT, NULL_DOUBLE, NULL_CHAR -from deephaven.dtypes import DType -from deephaven.numpy import column_to_numpy_array, _make_input_column +from deephaven.jcompat import _j_array_to_series +from deephaven.numpy import _make_input_column from deephaven.table import Table _NULL_BOOLEAN_AS_BYTE = jpy.get_type("io.deephaven.util.BooleanUtils").NULL_BOOLEAN_AS_BYTE -_JPrimitiveArrayConversionUtility = jpy.get_type("io.deephaven.integrations.common.PrimitiveArrayConversionUtility") _JDataAccessHelpers = jpy.get_type("io.deephaven.engine.table.impl.DataAccessHelpers") _is_dtype_backend_supported = pd.__version__ >= "2.0.0" -_DTYPE_NULL_MAPPING: Dict[DType, Tuple] = { - dtypes.bool_: (_NULL_BOOLEAN_AS_BYTE, pd.BooleanDtype), - dtypes.byte: (NULL_BYTE, pd.Int8Dtype), - dtypes.short: (NULL_SHORT, pd.Int16Dtype), - dtypes.char: (NULL_CHAR, pd.UInt16Dtype), - dtypes.int32: (NULL_INT, pd.Int32Dtype), - dtypes.int64: (NULL_LONG, pd.Int64Dtype), - dtypes.float32: (NULL_FLOAT, pd.Float32Dtype), - dtypes.float64: (NULL_DOUBLE, pd.Float64Dtype), -} - def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Series: """Produce a copy of the specified column as a pandas.Series object. @@ -51,29 +39,15 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Seri """ try: data_col = _JDataAccessHelpers.getColumn(table.j_table, col_def.name) - if conv_null and col_def.data_type == dtypes.bool_: - j_array = _JPrimitiveArrayConversionUtility.translateArrayBooleanToByte(data_col.getDirect()) - np_array = np.frombuffer(j_array, dtype=np.byte) - s = pd.Series(data=np_array, dtype=pd.Int8Dtype(), copy=False) - s.mask(s == _NULL_BOOLEAN_AS_BYTE, inplace=True) - return s.astype(pd.BooleanDtype(), copy=False) - - np_array = column_to_numpy_array(col_def, data_col.getDirect()) - if conv_null and (null_pair := _DTYPE_NULL_MAPPING.get(col_def.data_type)) is not None: - nv = null_pair[0] - pd_ex_dtype = null_pair[1] - s = pd.Series(data=np_array, dtype=pd_ex_dtype(), copy=False) - s.mask(s == nv, inplace=True) - else: - s = pd.Series(data=np_array, copy=False) - return s + j_array = data_col.getDirect() + return _j_array_to_series(col_def.data_type, j_array, conv_null) except DHError: raise except Exception as e: raise DHError(e, message="failed to create a pandas Series for {col}") from e -_DTYPE_MAPPING_PYARROW = { +_PANDAS_ARROW_TYPE_MAP = { pa.int8(): pd.ArrowDtype(pa.int8()), pa.int16(): pd.ArrowDtype(pa.int16()), pa.int32(): pd.ArrowDtype(pa.int32()), @@ -90,7 +64,7 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Seri pa.timestamp('ns', tz='UTC'): pd.ArrowDtype(pa.timestamp('ns', tz='UTC')), } -_DTYPE_MAPPING_NUMPY_NULLABLE = { +_PANDAS_NULLABLE_TYPE_MAP = { pa.int8(): pd.Int8Dtype(), pa.int16(): pd.Int16Dtype(), pa.uint16(): pd.UInt16Dtype(), @@ -107,8 +81,8 @@ def _column_to_series(table: Table, col_def: Column, conv_null: bool) -> pd.Seri } _PYARROW_TO_PANDAS_TYPE_MAPPERS = { - "pyarrow": _DTYPE_MAPPING_PYARROW.get, - "numpy_nullable": _DTYPE_MAPPING_NUMPY_NULLABLE.get, + "pyarrow": _PANDAS_ARROW_TYPE_MAP.get, + "numpy_nullable": _PANDAS_NULLABLE_TYPE_MAP.get, } @@ -180,7 +154,7 @@ def to_pandas(table: Table, cols: List[str] = None, raise 
DHError(e, "failed to create a pandas DataFrame from table.") from e -_EX_DTYPE_NULL_MAP = { +_PANDAS_EXTYPE_DH_NULL_MAP = { # This reflects the fact that in the server we use NULL_BOOLEAN_AS_BYTE - the byte encoding of null boolean to # translate boxed Boolean to/from primitive bytes pd.BooleanDtype: _NULL_BOOLEAN_AS_BYTE, @@ -209,7 +183,7 @@ def _map_na(array: [np.ndarray, pd.api.extensions.ExtensionArray]): if not isinstance(pd_dtype, pd.api.extensions.ExtensionDtype): return array - dh_null = _EX_DTYPE_NULL_MAP.get(type(pd_dtype)) or _EX_DTYPE_NULL_MAP.get(pd_dtype) + dh_null = _PANDAS_EXTYPE_DH_NULL_MAP.get(type(pd_dtype)) or _PANDAS_EXTYPE_DH_NULL_MAP.get(pd_dtype) # To preserve NaNs in floating point arrays, Pandas doesn't distinguish NaN/Null as far as NA testing is # concerned, thus its fillna() method will replace both NaN/Null in the data. if isinstance(pd_dtype, (pd.Float32Dtype, pd.Float64Dtype)) and isinstance(getattr(array, "_data"), np.ndarray): @@ -276,3 +250,4 @@ def to_table(df: pd.DataFrame, cols: List[str] = None) -> Table: raise except Exception as e: raise DHError(e, "failed to create a Deephaven Table from a pandas DataFrame.") from e + diff --git a/py/server/deephaven/table.py b/py/server/deephaven/table.py index 89fa8df9c19..922e6b3dcd1 100644 --- a/py/server/deephaven/table.py +++ b/py/server/deephaven/table.py @@ -11,13 +11,10 @@ import inspect from enum import Enum from enum import auto -from functools import wraps -from typing import Any, Optional, Callable, Dict, _GenericAlias +from typing import Any, Optional, Callable, Dict from typing import Sequence, List, Union, Protocol import jpy -import numba -import numpy as np from deephaven import DHError from deephaven import dtypes @@ -31,8 +28,6 @@ from deephaven.jcompat import to_sequence, j_array_list from deephaven.update_graph import auto_locking_ctx, UpdateGraph from deephaven.updateby import UpdateByOperation -from deephaven.dtypes import _BUILDABLE_ARRAY_DTYPE_MAP, _scalar, _np_dtype_char, \ - _component_np_dtype_char # Table _J_Table = jpy.get_type("io.deephaven.engine.table.Table") @@ -80,10 +75,6 @@ _JMultiJoinTable = jpy.get_type("io.deephaven.engine.table.MultiJoinTable") _JMultiJoinFactory = jpy.get_type("io.deephaven.engine.table.MultiJoinFactory") -# For unittest vectorization -_test_vectorization = False -_vectorized_count = 0 - class NodeType(Enum): """An enum of node types for RollupTable""" @@ -363,178 +354,6 @@ def _j_py_script_session() -> _JPythonScriptSession: return None -_SUPPORTED_NP_TYPE_CODES = ["i", "l", "h", "f", "d", "b", "?", "U", "M", "O"] - - -def _parse_annotation(annotation: Any) -> Any: - """Parse a Python annotation, for now mostly to extract the non-None type from an Optional(Union) annotation, - otherwise return the original annotation. """ - if isinstance(annotation, _GenericAlias) and annotation.__origin__ == Union and len(annotation.__args__) == 2: - if annotation.__args__[1] == type(None): # noqa: E721 - return annotation.__args__[0] - elif annotation.__args__[0] == type(None): # noqa: E721 - return annotation.__args__[1] - else: - return annotation - else: - return annotation - - -def _encode_signature(fn: Callable) -> str: - """Encode the signature of a Python function by mapping the annotations of the parameter types and the return - type to numpy dtype chars (i,l,h,f,d,b,?,U,M,O), and pack them into a string with parameter type chars first, - in their original order, followed by the delimiter string '->', then the return type_char. 
- - If a parameter or the return of the function is not annotated, the default 'O' - object type, will be used. - """ - try: - sig = inspect.signature(fn) - except: - # in case inspect.signature() fails, we'll just use the default 'O' - object type. - # numpy ufuncs actually have signature encoded in their 'types' attribute, we want to better support - # them in the future (https://github.com/deephaven/deephaven-core/issues/4762) - if type(fn) == np.ufunc: - return "O"*fn.nin + "->" + "O" - return "->O" - - np_type_codes = [] - for n, p in sig.parameters.items(): - p_annotation = _parse_annotation(p.annotation) - np_type_codes.append(_np_dtype_char(p_annotation)) - - return_annotation = _parse_annotation(sig.return_annotation) - return_type_code = _np_dtype_char(return_annotation) - np_type_codes = [c if c in _SUPPORTED_NP_TYPE_CODES else "O" for c in np_type_codes] - return_type_code = return_type_code if return_type_code in _SUPPORTED_NP_TYPE_CODES else "O" - - np_type_codes.extend(["-", ">", return_type_code]) - return "".join(np_type_codes) - - -def _udf_return_dtype(fn): - if isinstance(fn, (numba.np.ufunc.dufunc.DUFunc, numba.np.ufunc.gufunc.GUFunc)) and hasattr(fn, "types"): - return dtypes.from_np_dtype(np.dtype(fn.types[0][-1])) - else: - return dtypes.from_np_dtype(np.dtype(_encode_signature(fn)[-1])) - - -def _py_udf(fn: Callable): - """A decorator that acts as a transparent translator for Python UDFs used in Deephaven query formulas between - Python and Java. This decorator is intended for use by the Deephaven query engine and should not be used by - users. - - For now, this decorator is only capable of converting Python function return values to Java values. It - does not yet convert Java values in arguments to usable Python object (e.g. numpy arrays) or properly translate - Deephaven primitive null values. - - For properly annotated functions, including numba vectorized and guvectorized ones, this decorator inspects the - signature of the function and determines its return type, including supported primitive types and arrays of - the supported primitive types. It then converts the return value of the function to the corresponding Java value - of the same type. For unsupported types, the decorator returns the original Python value which appears as - org.jpy.PyObject in Java. - """ - - if hasattr(fn, "return_type"): - return fn - ret_dtype = _udf_return_dtype(fn) - - return_array = False - # If the function is a numba guvectorized function, examine the signature of the function to determine if it - # returns an array. - if isinstance(fn, numba.np.ufunc.gufunc.GUFunc): - sig = fn.signature - rtype = sig.split("->")[-1].strip("()") - if rtype: - return_array = True - else: - try: - return_annotation = _parse_annotation(inspect.signature(fn).return_annotation) - except ValueError: - # the function has no return annotation, and since we can't know what the exact type is, the return type - # defaults to the generic object type therefore it is not an array of a specific type, - # but see (https://github.com/deephaven/deephaven-core/issues/4762) for future imporvement to better support - # numpy ufuncs. 
- pass - else: - component_type = _component_np_dtype_char(return_annotation) - if component_type: - ret_dtype = dtypes.from_np_dtype(np.dtype(component_type)) - if ret_dtype in _BUILDABLE_ARRAY_DTYPE_MAP: - return_array = True - - @wraps(fn) - def wrapper(*args, **kwargs): - ret = fn(*args, **kwargs) - if return_array: - return dtypes.array(ret_dtype, ret) - elif ret_dtype == dtypes.PyObject: - return ret - else: - return _scalar(ret, ret_dtype) - - wrapper.j_name = ret_dtype.j_name - real_ret_dtype = _BUILDABLE_ARRAY_DTYPE_MAP.get(ret_dtype) if return_array else ret_dtype - - if hasattr(ret_dtype.j_type, 'jclass'): - j_class = real_ret_dtype.j_type.jclass - else: - j_class = real_ret_dtype.qst_type.clazz() - - wrapper.return_type = j_class - - return wrapper - - -def dh_vectorize(fn): - """A decorator to vectorize a Python function used in Deephaven query formulas and invoked on a row basis. - - If this annotation is not used on a query function, the Deephaven query engine will make an effort to vectorize - the function. If vectorization is not possible, the query engine will use the original, non-vectorized function. - If this annotation is used on a function, the Deephaven query engine will use the vectorized function in a query, - or an error will result if the function can not be vectorized. - - When this decorator is used on a function, the number and type of input and output arguments are changed. - These changes are only intended for use by the Deephaven query engine. Users are discouraged from using - vectorized functions in non-query code, since the function signature may change in future versions. - - The current vectorized function signature includes (1) the size of the input arrays, (2) the output array, - and (3) the input arrays. - """ - signature = _encode_signature(fn) - ret_dtype = _udf_return_dtype(fn) - - @wraps(fn) - def wrapper(*args): - if len(args) != len(signature) - len("->?") + 2: - raise ValueError( - f"The number of arguments doesn't match the function signature. {len(args) - 2}, {signature}") - if args[0] <= 0: - raise ValueError(f"The chunk size argument must be a positive integer. {args[0]}") - - chunk_size = args[0] - chunk_result = args[1] - if args[2:]: - vectorized_args = zip(*args[2:]) - for i in range(chunk_size): - scalar_args = next(vectorized_args) - chunk_result[i] = _scalar(fn(*scalar_args), ret_dtype) - else: - for i in range(chunk_size): - chunk_result[i] = _scalar(fn(), ret_dtype) - - return chunk_result - - wrapper.callable = fn - wrapper.signature = signature - wrapper.dh_vectorized = True - - if _test_vectorization: - global _vectorized_count - _vectorized_count += 1 - - return wrapper - - @contextlib.contextmanager def _query_scope_ctx(): """A context manager to set/unset query scope based on the scope of the most immediate caller code that invokes @@ -3712,6 +3531,7 @@ def update_by(self, ops: Union[UpdateByOperation, List[UpdateByOperation]], except Exception as e: raise DHError(e, "update-by operation on the PartitionedTableProxy failed.") from e + class MultiJoinInput(JObjectWrapper): """A MultiJoinInput represents the input tables, key columns and additional columns to be used in the multi-table natural join. 
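    Example (editor's illustration, not part of the patch; basic multi_join
    usage per the docstrings below, assuming a server-side session):

        from deephaven import empty_table
        from deephaven.table import multi_join

        t1 = empty_table(5).update(["Key = i", "A = i * 2"])
        t2 = empty_table(5).update(["Key = i", "B = i * 3"])
        # returns a MultiJoinTable; access the result via its table() per the docstring below
        mjt = multi_join([t1, t2], on="Key")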
""" @@ -3779,7 +3599,8 @@ def __init__(self, input: Union[Table, Sequence[Table], MultiJoinInput, Sequence with auto_locking_ctx(*tables): j_tables = to_sequence(input) self.j_multijointable = _JMultiJoinFactory.of(on, *j_tables) - elif isinstance(input, MultiJoinInput) or (isinstance(input, Sequence) and all(isinstance(ji, MultiJoinInput) for ji in input)): + elif isinstance(input, MultiJoinInput) or ( + isinstance(input, Sequence) and all(isinstance(ji, MultiJoinInput) for ji in input)): if on is not None: raise DHError(message="on parameter is not permitted when MultiJoinInput objects are provided.") wrapped_input = to_sequence(input, wrapped=True) @@ -3788,13 +3609,13 @@ def __init__(self, input: Union[Table, Sequence[Table], MultiJoinInput, Sequence input = to_sequence(input) self.j_multijointable = _JMultiJoinFactory.of(*input) else: - raise DHError(message="input must be a Table, a sequence of Tables, a MultiJoinInput, or a sequence of MultiJoinInputs.") + raise DHError( + message="input must be a Table, a sequence of Tables, a MultiJoinInput, or a sequence of MultiJoinInputs.") except Exception as e: raise DHError(e, "failed to build a MultiJoinTable object.") from e - def multi_join(input: Union[Table, Sequence[Table], MultiJoinInput, Sequence[MultiJoinInput]], on: Union[str, Sequence[str]] = None) -> MultiJoinTable: """ The multi_join method creates a new table by performing a multi-table natural join on the input tables. The result @@ -3812,4 +3633,4 @@ def multi_join(input: Union[Table, Sequence[Table], MultiJoinInput, Sequence[Mul MultiJoinTable: the result of the multi-table natural join operation. To access the underlying Table, use the table() method. """ - return MultiJoinTable(input, on) \ No newline at end of file + return MultiJoinTable(input, on) diff --git a/py/server/tests/test_numba_guvectorize.py b/py/server/tests/test_numba_guvectorize.py index c82b92296e3..79d9f87241f 100644 --- a/py/server/tests/test_numba_guvectorize.py +++ b/py/server/tests/test_numba_guvectorize.py @@ -5,7 +5,7 @@ import unittest import numpy as np -from numba import guvectorize, int64 +from numba import guvectorize, int64, int32 from deephaven import empty_table, dtypes from tests.testbase import BaseTestCase @@ -22,13 +22,13 @@ def g(x, res): for xi in x: res[0] += xi - t = empty_table(10).update(["X=i%3", "Y=i"]).group_by("X").update("Z=g(Y)") + t = empty_table(10).update(["X=i%3", "Y=ii"]).group_by("X").update("Z=g(Y)") m = t.meta_table self.assertEqual(t.columns[2].data_type, dtypes.int64) def test_vector_return(self): # vector and scalar input to vector ouput function - @guvectorize([(int64[:], int64, int64[:])], "(m),()->(m)", nopython=True) + @guvectorize([(int32[:], int32, int64[:])], "(m),()->(m)", nopython=True) def g(x, y, res): for i in range(len(x)): res[i] = x[i] + y @@ -61,7 +61,7 @@ def test_fixed_length_vector_return(self): dummy = np.array([0, 0], dtype=np.int64) # vector input to fixed-length vector ouput function -- second arg is a dummy just to get a fixed size output - @guvectorize([(int64[:], int64[:], int64[:])], "(m),(n)->(n)", nopython=True) + @guvectorize([(int32[:], int64[:], int64[:])], "(m),(n)->(n)", nopython=True) def g(x, dummy, res): res[0] = min(x) res[1] = max(x) @@ -78,7 +78,7 @@ def g(x, dummy, res): res[0] = np.min(x) res[1] = np.max(x) - t = empty_table(10).update(["X=i%3", "Y=i"]).group_by("X").update("Z=g(Y,dummy)") + t = empty_table(10).update(["X=i%3", "Y=ii"]).group_by("X").update("Z=g(Y,dummy)") self.assertEqual(t.columns[2].data_type, 
dtypes.long_array) def test_np_on_java_array2(self): @@ -86,7 +86,7 @@ def test_np_on_java_array2(self): def g(x, res): res[:] = x + 5 - t = empty_table(10).update(["X=i%3", "Y=i"]).group_by("X").update("Z=g(Y)") + t = empty_table(10).update(["X=i%3", "Y=ii"]).group_by("X").update("Z=g(Y)") self.assertEqual(t.columns[2].data_type, dtypes.long_array) diff --git a/py/server/tests/test_udf_numpy_args.py b/py/server/tests/test_udf_numpy_args.py new file mode 100644 index 00000000000..ba698a4b21c --- /dev/null +++ b/py/server/tests/test_udf_numpy_args.py @@ -0,0 +1,397 @@ +# +# Copyright (c) 2016-2023 Deephaven Data Labs and Patent Pending +# +import typing +from typing import Optional, Union, Any +import unittest + +import numpy as np +import numpy.typing as npt + +from deephaven import empty_table, DHError, dtypes +from deephaven.dtypes import double_array, int32_array, long_array, int16_array, char_array, int8_array, \ + float32_array +from tests.testbase import BaseTestCase + +_J_TYPE_NULL_MAP = { + "byte": "NULL_BYTE", + "short": "NULL_SHORT", + "char": "NULL_CHAR", + "int": "NULL_INT", + "long": "NULL_LONG", + "float": "NULL_FLOAT", + "double": "NULL_DOUBLE", +} + +_J_TYPE_NP_DTYPE_MAP = { + "byte": "np.int8", + "short": "np.int16", + "char": "np.uint16", + "int": "np.int32", + "long": "np.int64", + "float": "np.float32", + "double": "np.float64", +} + +_J_TYPE_J_ARRAY_TYPE_MAP = { + "byte": int8_array, + "short": int16_array, + "char": char_array, + "int": int32_array, + "long": long_array, + "float": float32_array, + "double": double_array, +} + + +class UDFNumpyTest(BaseTestCase): + def test_j_to_py_no_annotation_no_null(self): + col1_formula = "Col1 = i % 10" + for j_dtype, np_dtype in _J_TYPE_NP_DTYPE_MAP.items(): + col2_formula = f"Col2 = ({j_dtype})i" + with self.subTest(j_dtype): + tbl = empty_table(100).update([col1_formula, col2_formula]).group_by("Col1") + + func_str = f""" +def test_udf(col1, col2) -> bool: + j_array_type = _J_TYPE_J_ARRAY_TYPE_MAP[{j_dtype!r}].j_type + return isinstance(col1, int) and isinstance(col2, j_array_type) + """ + exec(func_str, globals()) + res = tbl.update("Col3 = test_udf(Col1, Col2)") + self.assertEqual(10, res.to_string().count("true")) + + def test_j_to_py_no_annotation_null(self): + col1_formula = "Col1 = i % 10" + for j_dtype, null_name in _J_TYPE_NULL_MAP.items(): + col2_formula = f"Col2 = i % 3 == 0? 
{null_name} : ({j_dtype})i" + with self.subTest(j_dtype): + tbl = empty_table(100).update([col1_formula, col2_formula]).group_by("Col1") + + func_str = f""" +def test_udf(col1, col2) -> bool: + j_array_type = _J_TYPE_J_ARRAY_TYPE_MAP[{j_dtype!r}].j_type + return (isinstance(col1, int) and isinstance(col2, j_array_type) and np.any(np.array(col2) == {null_name})) + """ + exec(f"from deephaven.constants import {null_name}", globals()) + exec(func_str, globals()) + res = tbl.update("Col3 = test_udf(Col1, Col2)") + self.assertEqual(10, res.to_string().count("true")) + exec(f"del {null_name}", globals()) + + def test_jarray_to_np_array_no_null(self): + col1_formula = "Col1 = i % 10" + for j_dtype, np_dtype in _J_TYPE_NP_DTYPE_MAP.items(): + col2_formula = f"Col2 = ({j_dtype})i" + with self.subTest(j_dtype): + tbl = empty_table(100).update([col1_formula, col2_formula]).group_by("Col1") + + func_str = f""" +def test_udf(col1, col2: np.ndarray[{np_dtype}]) -> bool: + return (isinstance(col1, int) and isinstance(col2, np.ndarray) and col2.dtype.type == {np_dtype} and np.nanmean( + col2) == np.mean( col2)) + """ + exec(func_str, globals()) + res = tbl.update("Col3 = test_udf(Col1, Col2)") + self.assertEqual(10, res.to_string().count("true")) + + def test_jarray_to_np_array_null(self): + col1_formula = "Col1 = i % 10" + for j_dtype, null_name in _J_TYPE_NULL_MAP.items(): + col2_formula = f"Col2 = i % 3 == 0? {null_name} : ({j_dtype})i" + with self.subTest(j_dtype): + tbl = empty_table(100).update([col1_formula, col2_formula]).group_by("Col1") + + func_str = f""" +def test_udf(col1, col2: np.ndarray[{_J_TYPE_NP_DTYPE_MAP[j_dtype]}]) -> bool: + return (isinstance(col1, int) and isinstance(col2, np.ndarray) and col2.dtype.type == + {_J_TYPE_NP_DTYPE_MAP[j_dtype]} and np.nanmean(col2) != np.mean( col2)) + """ + exec(func_str, globals()) + + # for floating point types, DH nulls are auto converted to np.nan + # for integer types, DH nulls in the array raise exceptions + if j_dtype in ("float", "double"): + res = tbl.update("Col3 = test_udf(Col1, Col2)") + self.assertEqual(10, res.to_string().count("true")) + else: + with self.assertRaises(DHError) as cm: + tbl.update("Col3 = test_udf(Col1, Col2)") + self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values, but numpy .* " + "array does not support ") + + def test_j_scalar_to_py_no_null(self): + col1_formula = "Col1 = i % 10" + for j_dtype, null_name in _J_TYPE_NULL_MAP.items(): + col2_formula = f"Col2 = ({j_dtype})i" + with self.subTest(j_dtype): + np_type = _J_TYPE_NP_DTYPE_MAP[j_dtype] + func = f""" +def test_udf(col: {np_type}) -> bool: + if not isinstance(col, {np_type}): + return False + if np.isnan(col): + return False + else: + return True + """ + exec(func, globals()) + with self.subTest(j_dtype): + tbl = empty_table(100).update([col1_formula, col2_formula]) + res = tbl.update("Col3 = test_udf(Col2)") + self.assertEqual(10, res.to_string().count("true")) + + func = f""" +def test_udf(col: Optional[{np_type}]) -> bool: + if not isinstance(col, {np_type}): + return False + if col is None: + return False + else: + return True + """ + exec(func, globals()) + with self.subTest(j_dtype): + tbl = empty_table(100).update([col1_formula, col2_formula]) + res = tbl.update("Col3 = test_udf(Col2)") + self.assertEqual(10, res.to_string().count("true")) + + def test_j_scalar_to_py_null(self): + col1_formula = "Col1 = i % 10" + for data_type, null_name in _J_TYPE_NULL_MAP.items(): + col2_formula = f"Col2 = i % 3 == 0? 
{null_name} : ({data_type})i" + with self.subTest(data_type): + np_type = _J_TYPE_NP_DTYPE_MAP[data_type] + func = f""" +def test_udf(col: {np_type}) -> bool: + if np.isnan(col): + return True + else: + if not isinstance(col, {np_type}): + return True + return False +""" + exec(func, globals()) + with self.subTest(data_type): + tbl = empty_table(100).update([col1_formula, col2_formula]) + # for floating point types, DH nulls are auto converted to np.nan + # for integer types, DH nulls in the array raise exceptions + if data_type in ("float", "double"): + res = tbl.update("Col3 = test_udf(Col2)") + self.assertEqual(4, res.to_string().count("true")) + else: + with self.assertRaises(DHError) as cm: + res = tbl.update("Col3 = test_udf(Col2)") + self.assertRegex(str(cm.exception), "Argument .* is not compatible with annotation*") + + func = f""" +def test_udf(col: Optional[{np_type}]) -> bool: + if col is None: + return True + else: + if not isinstance(col, {np_type}): + return True + return False +""" + exec(func, globals()) + with self.subTest(data_type): + tbl = empty_table(100).update([col1_formula, col2_formula]) + res = tbl.update("Col3 = test_udf(Col2)") + self.assertEqual(4, res.to_string().count("true")) + + def test_weird_cases(self): + def f(p1: Union[np.ndarray[typing.Any], None]) -> bool: + return bool(p1) + + with self.assertRaises(DHError) as cm: + t = empty_table(10).update(["X1 = f(i)"]) + + def f1(p1: Union[np.int16, np.int32]) -> bool: + return bool(p1) + + with self.assertRaises(DHError) as cm: + t = empty_table(10).update(["X1 = f1(i)"]) + + def f11(p1: Union[float, np.float32]) -> bool: + return bool(p1) + + with self.assertRaises(DHError) as cm: + t = empty_table(10).update(["X1 = f11(i)"]) + + def f2(p1: Union[np.int16, np.float64]) -> Union[Optional[bool]]: + return bool(p1) + + t = empty_table(10).update(["X1 = f2(i)"]) + self.assertEqual(t.columns[0].data_type, dtypes.bool_) + self.assertEqual(9, t.to_string().count("true")) + + def f21(p1: Union[np.int16, np.float64]) -> Union[Optional[bool], int]: + return bool(p1) + + with self.assertRaises(DHError) as cm: + t = empty_table(10).update(["X1 = f21(i)"]) + + def f3(p1: Union[np.int16, np.float64], p2=None) -> bool: + return bool(p1) + + t = empty_table(10).update(["X1 = f3(i)"]) + self.assertEqual(t.columns[0].data_type, dtypes.bool_) + + def f4(p1: Union[np.int16, np.float64], p2=None) -> bool: + return bool(p1) + + t = empty_table(10).update(["X1 = f4((double)i)"]) + self.assertEqual(t.columns[0].data_type, dtypes.bool_) + with self.assertRaises(DHError) as cm: + t = empty_table(10).update(["X1 = f4(now())"]) + self.assertRegex(str(cm.exception), "Argument .* is not compatible with annotation*") + + def f41(p1: Union[np.int16, np.float64, Union[Any]], p2=None) -> bool: + return bool(p1) + + t = empty_table(10).update(["X1 = f41(now())"]) + self.assertEqual(t.columns[0].data_type, dtypes.bool_) + + def f42(p1: Union[np.int16, np.float64, np.datetime64], p2=None) -> bool: + return p1.dtype.char == "M" + + t = empty_table(10).update(["X1 = f42(now())"]) + self.assertEqual(t.columns[0].data_type, dtypes.bool_) + self.assertEqual(10, t.to_string().count("true")) + + def f5(col1, col2: np.ndarray[np.int32]) -> bool: + return np.nanmean(col2) == np.mean(col2) + + t = empty_table(10).update(["X = i % 3", "Y = i"]).group_by("X") + t = t.update(["X1 = f5(X, Y)"]) + with self.assertRaises(DHError) as cm: + t = t.update(["X1 = f5(X, null)"]) + self.assertRegex(str(cm.exception), "Argument .* is not compatible with annotation*") 
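# --- Editor's illustration (not part of the patch): the user-visible behavior
# these tests exercise -- with an Optional annotation, a Deephaven null argument
# arrives as Python None instead of raising; assumes a server-side session.
from typing import Optional
import numpy as np
from deephaven import empty_table

def add_one(x: Optional[np.int32]) -> np.int32:
    return np.int32(-1) if x is None else x + np.int32(1)

t = empty_table(5).update(["X = i % 2 == 0 ? NULL_INT : i", "Y = add_one(X)"])
# --- end illustration ---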
+
+    def test_str_bool_datetime_array(self):
+        with self.subTest("str"):
+            def f1(p1: np.ndarray[str], p2=None) -> bool:
+                return bool(len(p1))
+
+            t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? `deephaven`: null"]).group_by("X")
+            t1 = t.update(["X1 = f1(Y)"])
+            self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
+            with self.assertRaises(DHError) as cm:
+                t2 = t.update(["X1 = f1(null, Y )"])
+            self.assertRegex(str(cm.exception), "Argument .* is not compatible with annotation")
+
+            def f11(p1: Union[np.ndarray[str], None], p2=None) -> bool:
+                return bool(len(p1)) if p1 is not None else False
+
+            t2 = t.update(["X1 = f11(null, Y)"])
+            self.assertEqual(3, t2.to_string().count("false"))
+
+        with self.subTest("datetime"):
+            def f2(p1: np.ndarray[np.datetime64], p2=None) -> bool:
+                return bool(len(p1))
+
+            t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? now() : null"]).group_by("X")
+            t1 = t.update(["X1 = f2(Y)"])
+            self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
+            with self.assertRaises(DHError) as cm:
+                t2 = t.update(["X1 = f2(null, Y )"])
+            self.assertRegex(str(cm.exception), "Argument .* is not compatible with annotation")
+
+            def f21(p1: Union[np.ndarray[np.datetime64], None], p2=None) -> bool:
+                return bool(len(p1)) if p1 is not None else False
+
+            t2 = t.update(["X1 = f21(null, Y)"])
+            self.assertEqual(3, t2.to_string().count("false"))
+
+        with self.subTest("boolean"):
+            def f3(p1: np.ndarray[np.bool_], p2=None) -> bool:
+                return bool(len(p1))
+
+            t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : null"]).group_by("X")
+            with self.assertRaises(DHError) as cm:
+                t1 = t.update(["X1 = f3(Y)"])
+            self.assertRegex(str(cm.exception), "Java .* array contains Deephaven null values, but numpy .* "
+                                                "array does not support ")
+
+            t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : false"]).group_by("X")
+            t1 = t.update(["X1 = f3(Y)"])
+            self.assertEqual(t1.columns[2].data_type, dtypes.bool_)
+            with self.assertRaises(DHError) as cm:
+                t2 = t.update(["X1 = f3(null, Y )"])
+            self.assertRegex(str(cm.exception), "Argument None is not compatible with annotation")
+
+            def f31(p1: Optional[np.ndarray[bool]], p2=None) -> bool:
+                return bool(len(p1)) if p1 is not None else False
+
+            t2 = t.update(["X1 = f31(null, Y)"])
+            self.assertEqual(3, t2.to_string("X1").count("false"))
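
The array cases reduce to the same pattern: after a group_by, a column cell is a Java array, and an np.ndarray[...] parameter annotation makes the engine hand it to the UDF as a numpy array (with DH nulls converted only where the numpy dtype can represent them). A short sketch under those assumptions; row_mean and the column names are illustrative:

import numpy as np
from deephaven import empty_table


def row_mean(values: np.ndarray[np.int32]) -> float:
    # values is a numpy int32 array converted from the Java array cell
    return float(np.mean(values))


t = empty_table(10).update(["Bucket = i % 3", "V = i"]).group_by("Bucket")
t = t.update("AvgV = row_mean(V)")
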
`deephaven`: null"]) + with self.assertRaises(DHError) as cm: + t1 = t.update(["X1 = f1(Y)"]) + self.assertRegex(str(cm.exception), "Argument None is not compatible with annotation") + + def f11(p1: Union[str, None], p2=None) -> bool: + return p1 is None + t2 = t.update(["X1 = f11(Y)"]) + self.assertEqual(5, t2.to_string().count("false")) + + with self.subTest("datetime"): + def f2(p1: np.datetime64, p2=None) -> bool: + return p1 is None + + t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? now() : null"]) + with self.assertRaises(DHError) as cm: + t1 = t.update(["X1 = f2(Y)"]) + self.assertRegex(str(cm.exception), "Argument None is not compatible with annotation") + + def f21(p1: Union[np.datetime64, None], p2=None) -> bool: + return p1 is None + t2 = t.update(["X1 = f21(Y)"]) + self.assertEqual(5, t2.to_string().count("false")) + + with self.subTest("boolean"): + def f3(p1: np.bool_, p2=None) -> bool: + return p1 is None + + t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : null"]) + with self.assertRaises(DHError) as cm: + t1 = t.update(["X1 = f3(Y)"]) + self.assertRegex(str(cm.exception), "Argument None is not compatible with annotation") + + t = empty_table(10).update(["X = i % 3", "Y = i % 2 == 0? true : false"]) + t1 = t.update(["X1 = f3(Y)"]) + self.assertEqual(t1.columns[2].data_type, dtypes.bool_) + self.assertEqual(0, t1.to_string("X1").count("true")) + + def f31(p1: Optional[np.bool_], p2=None) -> bool: + return p1 is None + t2 = t.update(["X1 = f31(null, Y)"]) + self.assertEqual(10, t2.to_string("X1").count("true")) + + +if __name__ == "__main__": + unittest.main() diff --git a/py/server/tests/test_pyfunc_return_java_values.py b/py/server/tests/test_udf_return_java_values.py similarity index 96% rename from py/server/tests/test_pyfunc_return_java_values.py rename to py/server/tests/test_udf_return_java_values.py index aef0d44cb93..1a7c78e3aa9 100644 --- a/py/server/tests/test_pyfunc_return_java_values.py +++ b/py/server/tests/test_udf_return_java_values.py @@ -23,7 +23,7 @@ dtypes.byte: "np.int8", dtypes.bool_: "np.bool_", dtypes.string: "np.str_", - # dtypes.char: "np.uint16", + dtypes.char: "np.uint16", } @@ -52,7 +52,7 @@ def test_array_return(self): "np.float64": dtypes.double_array, "bool": dtypes.boolean_array, "np.str_": dtypes.string_array, - # "np.uint16": dtypes.char_array, + "np.uint16": dtypes.char_array, } container_types = ["List", "Tuple", "list", "tuple", "Sequence", "np.ndarray"] for component_type, dh_dtype in component_types.items(): @@ -189,7 +189,7 @@ def f4557_1(x, y) -> np.ndarray[np.int64]: return np.array(x) + y # Testing https://github.com/deephaven/deephaven-core/issues/4562 - @nb.guvectorize([(nb.int64[:], nb.int64, nb.int64[:])], "(m),()->(m)", nopython=True) + @nb.guvectorize([(nb.int32[:], nb.int32, nb.int32[:])], "(m),()->(m)", nopython=True) def f4562_1(x, y, res): res[:] = x + y @@ -198,11 +198,11 @@ def f4562_1(x, y, res): "Y = f4562_1(B,3)" ]) self.assertEqual(t2.columns[2].data_type, dtypes.long_array) - self.assertEqual(t2.columns[3].data_type, dtypes.long_array) + self.assertEqual(t2.columns[3].data_type, dtypes.int32_array) t3 = t2.ungroup() self.assertEqual(t3.columns[2].data_type, dtypes.int64) - self.assertEqual(t3.columns[3].data_type, dtypes.int64) + self.assertEqual(t3.columns[3].data_type, dtypes.int32) def test_ndim_nparray_return_type(self): def f() -> np.ndarray[np.int64]: @@ -222,28 +222,29 @@ def f() -> npt.NDArray[np.int64]: def test_ndarray_weird_cases(self): def f() -> np.ndarray[typing.Any]: return 
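
The renamed return-value tests below also enable the previously commented-out char mappings, so a UDF annotated to return np.uint16 now lands as a Deephaven char column rather than a PyObject one. A hedged sketch of what the un-commented mapping permits (to_char is illustrative, not part of this patch):

import numpy as np
from deephaven import empty_table


def to_char(x: int) -> np.uint16:
    # np.uint16 return values now map to the Deephaven char type
    return np.uint16(ord("A") + x % 26)


t = empty_table(10).update("C = to_char(i)")  # C is a char column
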
diff --git a/py/server/tests/test_pyfunc_return_java_values.py b/py/server/tests/test_udf_return_java_values.py
similarity index 96%
rename from py/server/tests/test_pyfunc_return_java_values.py
rename to py/server/tests/test_udf_return_java_values.py
index aef0d44cb93..1a7c78e3aa9 100644
--- a/py/server/tests/test_pyfunc_return_java_values.py
+++ b/py/server/tests/test_udf_return_java_values.py
@@ -23,7 +23,7 @@
     dtypes.byte: "np.int8",
     dtypes.bool_: "np.bool_",
     dtypes.string: "np.str_",
-    # dtypes.char: "np.uint16",
+    dtypes.char: "np.uint16",
 }
 
 
@@ -52,7 +52,7 @@ def test_array_return(self):
             "np.float64": dtypes.double_array,
             "bool": dtypes.boolean_array,
             "np.str_": dtypes.string_array,
-            # "np.uint16": dtypes.char_array,
+            "np.uint16": dtypes.char_array,
         }
         container_types = ["List", "Tuple", "list", "tuple", "Sequence", "np.ndarray"]
         for component_type, dh_dtype in component_types.items():
@@ -189,7 +189,7 @@ def f4557_1(x, y) -> np.ndarray[np.int64]:
         return np.array(x) + y
 
     # Testing https://github.com/deephaven/deephaven-core/issues/4562
-    @nb.guvectorize([(nb.int64[:], nb.int64, nb.int64[:])], "(m),()->(m)", nopython=True)
+    @nb.guvectorize([(nb.int32[:], nb.int32, nb.int32[:])], "(m),()->(m)", nopython=True)
     def f4562_1(x, y, res):
         res[:] = x + y
 
@@ -198,11 +198,11 @@ def f4562_1(x, y, res):
             "Y = f4562_1(B,3)"
         ])
         self.assertEqual(t2.columns[2].data_type, dtypes.long_array)
-        self.assertEqual(t2.columns[3].data_type, dtypes.long_array)
+        self.assertEqual(t2.columns[3].data_type, dtypes.int32_array)
 
         t3 = t2.ungroup()
        self.assertEqual(t3.columns[2].data_type, dtypes.int64)
-        self.assertEqual(t3.columns[3].data_type, dtypes.int64)
+        self.assertEqual(t3.columns[3].data_type, dtypes.int32)
 
     def test_ndim_nparray_return_type(self):
         def f() -> np.ndarray[np.int64]:
             return np.array([1, 2, 3])
@@ -222,28 +222,29 @@ def f() -> npt.NDArray[np.int64]:
     def test_ndarray_weird_cases(self):
         def f() -> np.ndarray[typing.Any]:
             return np.array([1, 2], dtype=np.int64)
-
         t = empty_table(10).update(["X1 = f()"])
         self.assertEqual(t.columns[0].data_type, dtypes.PyObject)
 
         def f1() -> npt.NDArray[typing.Any]:
             return np.array([1, 2], dtype=np.int64)
-
         t = empty_table(10).update(["X1 = f1()"])
         self.assertEqual(t.columns[0].data_type, dtypes.PyObject)
 
         def f2() -> np.ndarray[typing.Any, np.int64]:
             return np.array([1, 2], dtype=np.int64)
-
         t = empty_table(10).update(["X1 = f2()"])
         self.assertEqual(t.columns[0].data_type, dtypes.PyObject)
 
         def f3() -> Union[None, None]:
             return np.array([1, 2], dtype=np.int64)
-
         t = empty_table(10).update(["X1 = f3()"])
         self.assertEqual(t.columns[0].data_type, dtypes.PyObject)
 
+        def f4() -> None:
+            return np.array([1, 2], dtype=np.int64)
+        t = empty_table(10).update(["X1 = f4()"])
+        self.assertEqual(t.columns[0].data_type, dtypes.PyObject)
+
     def test_optional_scalar_return(self):
         for dh_dtype, np_dtype in _J_TYPE_NP_DTYPE_MAP.items():
             with self.subTest(dh_dtype=dh_dtype, np_dtype=np_dtype):
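
One detail worth calling out from the guvectorize hunk above: the declared numba signature, not the input column type, fixes the component type of the returned Java array, which is why the expectation moves from long_array to int32_array. A minimal sketch of that behavior, assuming a Deephaven session (add_k and the column names are illustrative):

import numba as nb
from deephaven import empty_table


# the int32 output slot in the signature determines the result column's
# component type, regardless of the (long) input column's type
@nb.guvectorize([(nb.int32[:], nb.int32, nb.int32[:])], "(m),()->(m)", nopython=True)
def add_k(x, k, res):
    res[:] = x + k


t = empty_table(10).update(["A = i % 3", "B = ii"]).group_by("A")
t = t.update("Y = add_k(B, 3)")  # Y is an int32 array column
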
diff --git a/py/server/tests/test_vectorization.py b/py/server/tests/test_vectorization.py
index 82b9dccbe2c..8eb28e65cda 100644
--- a/py/server/tests/test_vectorization.py
+++ b/py/server/tests/test_vectorization.py
@@ -7,24 +7,24 @@
 from typing import Optional
 
 import numpy as np
-import deephaven
 from deephaven import DHError, empty_table, dtypes
 from deephaven import new_table
 from deephaven.column import int_col
 from deephaven.filters import Filter, and_
-from deephaven.table import dh_vectorize
+import deephaven._udf as _udf
+from deephaven._udf import _dh_vectorize as dh_vectorize
 from tests.testbase import BaseTestCase
 
 
 class VectorizationTestCase(BaseTestCase):
     def setUp(self):
         super().setUp()
-        deephaven.table._test_vectorization = True
-        deephaven.table._vectorized_count = 0
+        _udf.test_vectorization = True
+        _udf.vectorized_count = 0
 
     def tearDown(self) -> None:
-        deephaven.table._test_vectorization = False
-        deephaven.table._vectorized_count = 0
+        _udf.test_vectorization = False
+        _udf.vectorized_count = 0
         super().tearDown()
 
     def test_vectorization_exceptions(self):
@@ -66,7 +66,7 @@ def py_plus(p1, p2) -> int:
 
         t = empty_table(1).update("X = py_plus(ii, ii)")
 
-        self.assertEqual(deephaven.table._vectorized_count, 1)
+        self.assertEqual(_udf.vectorized_count, 1)
 
     def test_vectorized_no_arg(self):
         def py_random() -> int:
@@ -74,7 +74,7 @@ def py_random() -> int:
 
         t = empty_table(1).update("X = py_random()")
 
-        self.assertEqual(deephaven.table._vectorized_count, 1)
+        self.assertEqual(_udf.vectorized_count, 1)
 
     def test_vectorized_const_arg(self):
         def py_const(seed) -> int:
@@ -84,27 +84,27 @@ def py_const(seed) -> int:
         expected_count = 0
         t = empty_table(10).update("X = py_const(3)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         seed = 10
         t = empty_table(10).update("X = py_const(seed)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         t = empty_table(10).update("X = py_const(30*1024*1024*1024)")
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         t = empty_table(10).update("X = py_const(30000000000L)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         t = empty_table(10).update("X = py_const(100.01)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         t = empty_table(10).update("X = py_const(100.01f)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         with self.assertRaises(DHError) as cm:
             t = empty_table(1).update("X = py_const(NULL_INT)")
@@ -115,26 +115,26 @@ def py_const_str(s) -> str:
 
         t = empty_table(10).update("X = py_const_str(`Deephaven`)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         t = empty_table(10).update("X = py_const_str(null)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         t = empty_table(10).update("X = py_const_str(true)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
         t = t.update("Y = py_const_str(X)")
         expected_count += 1
-        self.assertEqual(deephaven.table._vectorized_count, expected_count)
+        self.assertEqual(_udf.vectorized_count, expected_count)
 
     def test_multiple_formulas(self):
         def pyfunc(p1, p2, p3) -> int:
             return p1 + p2 + p3
 
         t = empty_table(1).update("X = i").update(["Y = pyfunc(X, i, 33)", "Z = pyfunc(X, ii, 66)"])
-        self.assertEqual(deephaven.table._vectorized_count, 2)
+        self.assertEqual(_udf.vectorized_count, 2)
 
         self.assertIn("33", t.to_string(cols=["Y"]))
         self.assertIn("66", t.to_string(cols=["Z"]))
@@ -144,7 +144,7 @@ def pyfunc(p1, p2, p3) -> int:
             return p1 + p2 + p3
 
         t = empty_table(1).update("X = i").update(["Y = pyfunc(X, i, 33)", "Z = pyfunc(X, ii, 66)"])
-        self.assertEqual(deephaven.table._vectorized_count, 1)
+        self.assertEqual(_udf.vectorized_count, 1)
 
         self.assertIn("33", t.to_string(cols=["Y"]))
         self.assertIn("66", t.to_string(cols=["Z"]))
@@ -157,11 +157,11 @@ def pyfunc_bool(p1, p2, p3) -> bool:
 
         with self.assertRaises(DHError) as cm:
             t = empty_table(10).view(formulas=["I=ii", "J=(ii * 2)"]).where("pyfunc_int(I, 3, J)")
-        self.assertEqual(deephaven.table._vectorized_count, 0)
+        self.assertEqual(_udf.vectorized_count, 0)
         self.assertIn("boolean required", str(cm.exception))
 
         t = empty_table(10).view(formulas=["I=ii", "J=(ii * 2)"]).where("pyfunc_bool(I, 3, J)")
-        self.assertEqual(deephaven.table._vectorized_count, 1)
+        self.assertEqual(_udf.vectorized_count, 1)
         self.assertGreater(t.size, 1)
 
     def test_multiple_filters(self):
@@ -171,11 +171,11 @@ def pyfunc_bool(p1, p2, p3) -> bool:
         conditions = ["pyfunc_bool(I, 3, J)", "pyfunc_bool(i, 10, ii)"]
         filters = Filter.from_(conditions)
         t = empty_table(10).view(formulas=["I=ii", "J=(ii * 2)"]).where(filters)
-        self.assertEqual(2, deephaven.table._vectorized_count)
+        self.assertEqual(2, _udf.vectorized_count)
 
         filter_and = and_(filters)
         t1 = empty_table(10).view(formulas=["I=ii", "J=(ii * 2)"]).where(filter_and)
-        self.assertEqual(4, deephaven.table._vectorized_count)
+        self.assertEqual(4, _udf.vectorized_count)
 
         self.assertEqual(t1.size, t.size)
         self.assertEqual(9, t.size)
@@ -187,11 +187,11 @@ def pyfunc_bool(p1, p2, p3) -> bool:
         conditions = ["pyfunc_bool(I, 3, J)", "pyfunc_bool(i, 10, ii)"]
         filters = Filter.from_(conditions)
         t = empty_table(10).view(formulas=["I=ii", "J=(ii * 2)"]).where(filters)
-        self.assertEqual(1, deephaven.table._vectorized_count)
+        self.assertEqual(1, _udf.vectorized_count)
 
         filter_and = and_(filters)
         t1 = empty_table(10).view(formulas=["I=ii", "J=(ii * 2)"]).where(filter_and)
-        self.assertEqual(1, deephaven.table._vectorized_count)
+        self.assertEqual(1, _udf.vectorized_count)
 
         self.assertEqual(t1.size, t.size)
         self.assertEqual(9, t.size)
@@ -258,7 +258,7 @@ def sinc(x) -> np.double:
 
         t = empty_table(100).update(["X = 0.1 * i", "SincXS=((sinc(X)))"])
         self.assertEqual(t.columns[1].data_type, dtypes.double)
-        self.assertEqual(deephaven.table._vectorized_count, 1)
+        self.assertEqual(_udf.vectorized_count, 1)
 
         def sinc2(x):
             return np.sinc(x)
@@ -272,7 +272,7 @@ def pyfunc(p1: np.int32, p2: np.int32, p3: Optional[np.int32]) -> Optional[int]:
             return None if total % 3 == 0 else total
 
         t = empty_table(10).update("X = i").update(["Y = pyfunc(X, i, 13)", "Z = pyfunc(X, ii, 66)"])
-        self.assertEqual(deephaven.table._vectorized_count, 2)
+        self.assertEqual(_udf.vectorized_count, 2)
         self.assertIn("13", t.to_string(cols=["Y"]))
         self.assertIn("null", t.to_string())
         self.assertEqual(t.columns[1].data_type, dtypes.long)
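
For completeness, a sketch of the test hooks these hunks migrate: the vectorization counters now live in the private deephaven._udf module rather than deephaven.table (a private API, so subject to change; mirrors the py_plus test above):

import deephaven._udf as _udf
from deephaven import empty_table

_udf.test_vectorization = True   # opt the engine into counting vectorizations
_udf.vectorized_count = 0


def py_plus(p1, p2) -> int:
    return p1 + p2


t = empty_table(1).update("X = py_plus(ii, ii)")
assert _udf.vectorized_count == 1  # py_plus was auto-vectorized exactly once
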