Skip to content

Commit

Permalink
prepare release 2.6
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulWestenthanner committed Jan 14, 2023
1 parent c189587 commit b045143
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 26 deletions.
18 changes: 17 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,23 @@
unreleased
==========
* added gray encoder

v2.6.0
======
* added gray encoder
* added thermometer / rank-hot encoder
* introduce compatibility with sklearn 1.2
* compatibility with `feature_names_out_`
* remove boston housing dataset
* drop support for dataframes with non-homogeneous data types in column names (i.e. having both string and integer column names)
* improve performance of hashing encoder
* improve catboost documentation
* fix inverse transform in baseN with special character column names (issue 392)
* fix inverse transform of ordinal encoder with custom mapping (issue 202)
* fix re-fittable polynomial wrapper (issue 313)
* fix numerical stability for target encoding (issue 377)
* change default parameters of target encoding (issue 327)
* drop support for sklearn 0.x

v2.5.1.post0
============
* fix pypi sdist
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ __Unsupervised:__
* Helmert Contrast [2][3]
* Ordinal [2][3]
* One-Hot [2][3]
* Rank Hot [15]
* Polynomial Contrast [2][3]
* Sum Contrast [2][3]

Expand Down Expand Up @@ -149,3 +150,4 @@ References
12. Andrew Gelman and Jennifer Hill (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models. From https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf
13. Carlos Mougan, David Masip, Jordi Nin and Oriol Pujol (2021). Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems. https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14
14. Gray Encoding. From https://en.wikipedia.org/wiki/Gray_code
15. Jacob Buckman, Aurko Roy, Colin Raffel and Ian Goodfellow (2018). Thermometer Encoding: One Hot Way To Resist Adversarial Examples. From https://openreview.net/forum?id=S18Su--CW
2 changes: 1 addition & 1 deletion category_encoders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from category_encoders.glmm import GLMMEncoder
from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder

__version__ = '2.5.1.post0'
__version__ = '2.6.0'

__author__ = "willmcginnis", "cmougan", "paulwestenthanner"

Expand Down
18 changes: 4 additions & 14 deletions category_encoders/target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from scipy.special import expit
from category_encoders.ordinal import OrdinalEncoder
import category_encoders.utils as util
import warnings

__author__ = 'chappers'

Expand Down Expand Up @@ -44,10 +43,10 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
The value must be strictly bigger than 0. Higher values mean a flatter S-curve (see min_samples_leaf).
hierarchy: dict or dataframe
A dictionary or a dataframe to define the hierarchy for mapping.
If a dictionary, this contains a dict of columns to map into hierarchies. Dictionary key(s) should be the column name from X
which requires mapping. For multiple hierarchical maps, this should be a dictionary of dictionaries.
If dataframe: a dataframe defining columns to be used for the hierarchies. Column names must take the form:
HIER_colA_1, ... HIER_colA_N, HIER_colB_1, ... HIER_colB_M, ...
where [colA, colB, ...] are given columns in cols list.
Expand Down Expand Up @@ -111,20 +110,12 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin):
encoding_relation = util.EncodingRelation.ONE_TO_ONE

def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value',
handle_unknown='value', min_samples_leaf=1, smoothing=1.0, hierarchy=None):
handle_unknown='value', min_samples_leaf=20, smoothing=10, hierarchy=None):
super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df,
handle_unknown=handle_unknown, handle_missing=handle_missing)
self.ordinal_encoder = None
self.min_samples_leaf = min_samples_leaf
if min_samples_leaf == 1:
warnings.warn("Default parameter min_samples_leaf will change in version 2.6."
"See https://github.com/scikit-learn-contrib/category_encoders/issues/327",
category=FutureWarning)
self.smoothing = smoothing
if smoothing == 1.0:
warnings.warn("Default parameter smoothing will change in version 2.6."
"See https://github.com/scikit-learn-contrib/category_encoders/issues/327",
category=FutureWarning)
self.mapping = None
self._mean = None
if isinstance(hierarchy, (dict, pd.DataFrame)) and cols is None:
Expand Down Expand Up @@ -203,7 +194,7 @@ def fit_target_encoding(self, X, y):
col = switch.get('col')
if 'HIER_' not in str(col):
values = switch.get('mapping')

scalar = prior
if (isinstance(self.hierarchy, dict) and col in self.hierarchy) or \
(isinstance(self.hierarchy, pd.DataFrame)):
Expand All @@ -222,7 +213,6 @@ def fit_target_encoding(self, X, y):
smoove = self._weighting(stats['count'])

smoothing = scalar * (1 - smoove) + stats['mean'] * smoove
smoothing[stats['count'] == 1] = scalar

if self.handle_unknown == 'return_nan':
smoothing.loc[-1] = np.nan
Expand Down
2 changes: 1 addition & 1 deletion tests/test_encoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ def test_preserve_names(self):
def test_unique_column_is_not_predictive(self):
# @ToDo not sure how useful this test is. TargetEncoders set the value to the default if there is only
# one category but they probably should not. See discussion in issue 327
test_encoders = ['LeaveOneOutEncoder', 'TargetEncoder', 'WOEEncoder', 'MEstimateEncoder',
test_encoders = ['LeaveOneOutEncoder', 'WOEEncoder', 'MEstimateEncoder',
'JamesSteinEncoder', 'CatBoostEncoder', 'GLMMEncoder']
for encoder_name in test_encoders:
enc = getattr(encoders, encoder_name)()
Expand Down
13 changes: 4 additions & 9 deletions tests/test_target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,6 @@ def test_target_encoder_fit_transform_HaveNanValue_ExpectCorrectValueInResult(se
self.assertAlmostEqual(0.4125, values[2], delta=1e-4)
self.assertEqual(0.5, values[3])

def test_target_encoder_noncontiguous_index(self):
data = pd.DataFrame({'x': ['a', 'b', np.nan, 'd', 'e'], 'y': range(5)}).dropna()
result = encoders.TargetEncoder(cols=['x']).fit_transform(data[['x']], data['y'])
self.assertTrue(np.allclose(result, 2.0))

def test_HandleMissingIsValueAndNanInTest_ExpectMean(self):
df = pd.DataFrame({
'color': ["a", "a", "a", "b", "b", "b"],
Expand Down Expand Up @@ -175,7 +170,7 @@ def test_hierarchical_smoothing_multi(self):
self.assertAlmostEqual(0.3248, values[5], delta=1e-4)
self.assertAlmostEqual(0.6190, values[11], delta=1e-4)
self.assertAlmostEqual(0.1309, values[13], delta=1e-4)
self.assertAlmostEqual(0.7381, values[15], delta=1e-4)
self.assertAlmostEqual(0.8370, values[15], delta=1e-4)

def test_hierarchical_part_named_cols(self):

Expand Down Expand Up @@ -299,10 +294,10 @@ def test_hierarchy_multi_level(self):
values = result['Animal'].values
self.assertAlmostEqual(0.6261, values[0], delta=1e-4)
self.assertAlmostEqual(0.9065, values[2], delta=1e-4)
self.assertAlmostEqual(0.4107, values[5], delta=1e-4)
self.assertAlmostEqual(0.2556, values[5], delta=1e-4)
self.assertAlmostEqual(0.3680, values[8], delta=1e-4)
self.assertAlmostEqual(0.4626, values[11], delta=1e-4)
self.assertAlmostEqual(0.2466, values[13], delta=1e-4)
self.assertAlmostEqual(0.1535, values[13], delta=1e-4)
self.assertAlmostEqual(0.4741, values[14], delta=1e-4)

def test_hierarchy_columnwise_compass(self):
Expand Down Expand Up @@ -330,7 +325,7 @@ def test_hierarchy_columnwise_postcodes(self):
result = enc.fit_transform(X[cols], y)

values = result['postcode'].values
self.assertAlmostEqual(0.7506, values[0], delta=1e-4)
self.assertAlmostEqual(0.8448, values[0], delta=1e-4)


def test_hierarchy_columnwise_missing_level(self):
Expand Down

0 comments on commit b045143

Please sign in to comment.