Skip to content

Commit

Permalink
make NA behavior for cast to string and cast to unicode match
Browse files Browse the repository at this point in the history
  • Loading branch information
ngoldbaum committed Sep 25, 2023
1 parent 3240d93 commit 591e1e1
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 4 deletions.
33 changes: 29 additions & 4 deletions stringdtype/stringdtype/src/casts.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,32 @@ string_to_string_resolve_descriptors(PyObject *NPY_UNUSED(self),
Py_INCREF(given_descrs[0]);
loop_descrs[0] = given_descrs[0];

StringDTypeObject *descr0 = (StringDTypeObject *)loop_descrs[0];
StringDTypeObject *descr1 = (StringDTypeObject *)loop_descrs[1];

if ((descr0->na_object != NULL) && (descr1->na_object == NULL)) {
// cast from a dtype with an NA to one without, so it's a lossy
// unsafe cast
return NPY_UNSAFE_CASTING;
}

*view_offset = 0;

return NPY_NO_CASTING;
}

static int
string_to_string(PyArrayMethod_Context *NPY_UNUSED(context),
char *const data[], npy_intp const dimensions[],
npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata))
string_to_string(PyArrayMethod_Context *context, char *const data[],
npy_intp const dimensions[], npy_intp const strides[],
NpyAuxData *NPY_UNUSED(auxdata))
{
StringDTypeObject *in_descr =
((StringDTypeObject *)context->descriptors[0]);
StringDTypeObject *out_descr =
((StringDTypeObject *)context->descriptors[1]);
int in_hasnull = in_descr->na_object != NULL;
int out_hasnull = out_descr->na_object != NULL;
const npy_static_string *in_na_name = &in_descr->na_name;
npy_intp N = dimensions[0];
char *in = data[0];
char *out = data[1];
Expand All @@ -74,7 +90,16 @@ string_to_string(PyArrayMethod_Context *NPY_UNUSED(context),
npy_packed_static_string *os = (npy_packed_static_string *)out;
if (in != out) {
npy_string_free(os);
if (npy_string_dup(s, os) < 0) {
if (in_hasnull && !out_hasnull && npy_string_isnull(s)) {
// lossy but this is an unsafe cast so this is OK
if (npy_string_newsize(in_na_name->buf, in_na_name->size, os) <
0) {
gil_error(PyExc_MemoryError,
"Failed to allocate string in string to string "
"cast.");
}
}
else if (npy_string_dup(s, os) < 0) {
gil_error(PyExc_MemoryError, "npy_string_dup failed");
return -1;
}
Expand Down
38 changes: 38 additions & 0 deletions stringdtype/tests/test_stringdtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,16 @@ def dtype(na_object, coerce):
return StringDType(coerce=coerce)


# Independent second dtype fixture: requesting both ``dtype`` and ``dtype2``
# in a cast test yields the cartesian product over dtype configurations.
@pytest.fixture()
def dtype2(na_object, coerce):
    """Build a StringDType from the ``na_object`` and ``coerce`` fixtures.

    ``na_object`` is forwarded to the constructor unless it is the string
    ``"unset"``, in which case the dtype is created without that argument.
    """
    ctor_kwargs = {"coerce": coerce}
    # Identity check comes first: ``pd_NA != "unset"`` evaluates to pd_NA
    # rather than a bool, so pd_NA must be handled before the comparison.
    if na_object is pd_NA or na_object != "unset":
        ctor_kwargs["na_object"] = na_object
    return StringDType(**ctor_kwargs)


def test_dtype_creation():
hashes = set()
dt = StringDType()
Expand Down Expand Up @@ -136,6 +146,34 @@ def test_scalars_string_conversion(data, dtype):
np.array(data, dtype=dtype)


@pytest.mark.parametrize(
    ("strings"),
    [
        ["this", "is", "an", "array"],
        ["€", "", "😊"],
        ["A¢☃€ 😊", " A☃€¢😊", "☃€😊 A¢", "😊☃A¢ €"],
    ],
)
def test_self_casts(dtype, dtype2, strings):
    """Check casts between two StringDType instances.

    ``dtype`` is the source dtype and ``dtype2`` the destination. When the
    source dtype has an ``na_object``, that object is appended to the input
    so the NA handling of the cast is exercised:

    * source has NA, destination does not: the NA entry must come out as
      ``str(dtype.na_object)``, a ``casting="safe"`` cast must raise
      ``TypeError``, and a ``casting="unsafe"`` cast must succeed.
    * both have NA: the NA entry must come out as ``dtype2.na_object`` and a
      ``casting="safe"`` cast must succeed.
    * otherwise: a ``casting="safe"`` cast must succeed.

    In every case all non-NA entries must compare equal after the cast.
    """
    src_has_na = hasattr(dtype, "na_object")
    dst_has_na = hasattr(dtype2, "na_object")

    if src_has_na:
        strings = strings + [dtype.na_object]
    arr = np.array(strings, dtype=dtype)
    newarr = arr.astype(dtype2)

    if src_has_na and not dst_has_na:
        assert newarr[-1] == str(dtype.na_object)
        with pytest.raises(TypeError):
            arr.astype(dtype2, casting="safe")
        # Must sit outside the ``raises`` block: anything after the raising
        # call inside it would never execute.
        arr.astype(dtype2, casting="unsafe")
    elif src_has_na and dst_has_na:
        assert newarr[-1] is dtype2.na_object
        arr.astype(dtype2, casting="safe")
    else:
        arr.astype(dtype2, casting="safe")

    # Only the trailing NA entry (present iff the source dtype has an
    # na_object) is excluded; without one, every string is compared instead
    # of silently dropping the last real element.
    n_regular = len(strings) - 1 if src_has_na else len(strings)
    np.testing.assert_array_equal(arr[:n_regular], newarr[:n_regular])


@pytest.mark.parametrize(
("strings"),
[
Expand Down

0 comments on commit 591e1e1

Please sign in to comment.