Skip to content

Commit

Permalink
Merge pull request #427 from tvdboom/handle_pd_na
Browse files Browse the repository at this point in the history
convert pd.NA to np.nan
  • Loading branch information
PaulWestenthanner authored Oct 10, 2023
2 parents eaf35b6 + fd8a6e1 commit b8a1901
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 11 deletions.
6 changes: 3 additions & 3 deletions category_encoders/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand
column = switch.get('col')
col_mapping = switch['mapping']

# Treat None as np.nan
X[column] = pd.Series([el if el is not None else np.NaN for el in X[column]], index=X[column].index)
X[column] = X[column].map(col_mapping)
# Convert to object dtype so the column can hold np.nan (the pandas "string" dtype cannot)
# fillna(np.nan) then replaces both None and pd.NA with np.nan
X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
if util.is_category(X[column].dtype):
nan_identity = col_mapping.loc[col_mapping.index.isna()].array[0]
X[column] = X[column].cat.add_categories(nan_identity)
Expand Down
24 changes: 16 additions & 8 deletions tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def test_handle_unknown_error(self):
def test_handle_missing_error(self):
non_null = pd.DataFrame({'city': ['chicago', 'los angeles'], 'color': ['red', np.nan]}) # only 'city' column is going to be transformed
has_null = pd.DataFrame({'city': ['chicago', np.nan], 'color': ['red', np.nan]})
has_null_pd = pd.DataFrame({'city': ['chicago', pd.NA], 'color': ['red', pd.NA]}, dtype="string")
y = pd.Series([1, 0])

for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded
Expand All @@ -158,6 +159,9 @@ def test_handle_missing_error(self):
with self.assertRaises(ValueError):
enc.fit(has_null, y)

with self.assertRaises(ValueError):
enc.fit(has_null_pd, y)

enc.fit(non_null, y) # we raise an error only if a missing value is in one of the transformed columns

with self.assertRaises(ValueError):
Expand Down Expand Up @@ -199,13 +203,15 @@ def test_handle_unknown_return_nan(self):
self.assertTrue(result[1:].isna().all())

def test_handle_missing_return_nan_train(self):
X = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
y = pd.Series([1, 0, 1])

for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded
with self.subTest(encoder_name=encoder_name):
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
result = enc.fit_transform(X, y).iloc[2, :]
for X in (X_np, X_pd):
with self.subTest(encoder_name=encoder_name):
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
result = enc.fit_transform(X, y).iloc[2, :]

if len(result) == 1:
self.assertTrue(result.isna().all())
Expand All @@ -214,13 +220,15 @@ def test_handle_missing_return_nan_train(self):

def test_handle_missing_return_nan_test(self):
X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']})
X_t = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
X_np = pd.DataFrame({'city': ['chicago', 'los angeles', np.NaN]})
X_pd = pd.DataFrame({'city': ['chicago', 'los angeles', pd.NA]}, dtype="string")
y = pd.Series([1, 0, 1])

for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded
with self.subTest(encoder_name=encoder_name):
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
result = enc.fit(X, y).transform(X_t).iloc[2, :]
for X_na in (X_np, X_pd):
with self.subTest(encoder_name=encoder_name):
enc = getattr(encoders, encoder_name)(handle_missing='return_nan')
result = enc.fit(X, y).transform(X_na).iloc[2, :]

if len(result) == 1:
self.assertTrue(result.isna().all())
Expand Down

0 comments on commit b8a1901

Please sign in to comment.