From b04514346ec633aa9c62ba408b715a5fffd1eaae Mon Sep 17 00:00:00 2001 From: paul Date: Sat, 14 Jan 2023 22:15:04 +0100 Subject: [PATCH] prepare release 2.6 --- CHANGELOG.md | 18 +++++++++++++++++- README.md | 2 ++ category_encoders/__init__.py | 2 +- category_encoders/target_encoder.py | 18 ++++-------------- tests/test_encoders.py | 2 +- tests/test_target_encoder.py | 13 ++++--------- 6 files changed, 29 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e5a96b4..4541220a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,23 @@ unreleased ========== -* added gray encoder +v2.6.0 +====== +* added gray encoder +* added thermometer / rank-hot encoder +* introduce compatibility with sklearn 1.2 + * compatibility with `feature_names_out_` + * remove boston housing dataset + * drop support for dataframes with non-homogenous data types in column names (i.e. having both string and integer column names) +* improve performance of hashing encoder +* improve catboost documentation +* fix inverse transform in baseN with special character column names (issue 392) +* fix inverse transform of ordinal encoder with custom mapping (issue 202) +* fix re-fittable polynomial wrapper (issue 313) +* fix numerical stability for target encoding (issue 377) +* change default parameters of target encoding (issue 327) +* drop support for sklearn 0.x + v2.5.1.post0 ============ * fix pypi sdist diff --git a/README.md b/README.md index 7733fb97..f653fcc6 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ __Unsupervised:__ * Helmert Contrast [2][3] * Ordinal [2][3] * One-Hot [2][3] + * Rank Hot [15] * Polynomial Contrast [2][3] * Sum Contrast [2][3] @@ -149,3 +150,4 @@ References 12. Andrew Gelman and Jennifer Hill (2006). Data Analysis Using Regression and Multilevel/Hierarchical Models. From https://faculty.psau.edu.sa/filedownload/doc-12-pdf-a1997d0d31f84d13c1cdc44ac39a8f2c-original.pdf 13. Carlos Mougan, David Masip, Jordi Nin and Oriol Pujol (2021). Quantile Encoder: Tackling High Cardinality Categorical Features in Regression Problems. https://link.springer.com/chapter/10.1007%2F978-3-030-85529-1_14 14. Gray Encoding. From https://en.wikipedia.org/wiki/Gray_code + 15. Jacob Buckman, Aurko Roy, Colin Raffel, Ian Goodfellow: Thermometer Encoding: One Hot Way To Resist Adversarial Examples. From https://openreview.net/forum?id=S18Su--CW diff --git a/category_encoders/__init__.py b/category_encoders/__init__.py index ec3c78b4..d1c4ec85 100644 --- a/category_encoders/__init__.py +++ b/category_encoders/__init__.py @@ -27,7 +27,7 @@ from category_encoders.glmm import GLMMEncoder from category_encoders.quantile_encoder import QuantileEncoder, SummaryEncoder -__version__ = '2.5.1.post0' +__version__ = '2.6.0' __author__ = "willmcginnis", "cmougan", "paulwestenthanner" diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 3b37afe4..64de8350 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -4,7 +4,6 @@ from scipy.special import expit from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util -import warnings __author__ = 'chappers' @@ -44,10 +43,10 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): The value must be strictly bigger than 0. Higher values mean a flatter S-curve (see min_samples_leaf). hierarchy: dict or dataframe A dictionary or a dataframe to define the hierarchy for mapping. - + If a dictionary, this contains a dict of columns to map into hierarchies. Dictionary key(s) should be the column name from X which requires mapping. For multiple hierarchical maps, this should be a dictionary of dictionaries. - + If dataframe: a dataframe defining columns to be used for the hierarchies. Column names must take the form: HIER_colA_1, ... HIER_colA_N, HIER_colB_1, ... HIER_colB_M, ... where [colA, colB, ...] are given columns in cols list. @@ -111,20 +110,12 @@ class TargetEncoder(util.BaseEncoder, util.SupervisedTransformerMixin): encoding_relation = util.EncodingRelation.ONE_TO_ONE def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', - handle_unknown='value', min_samples_leaf=1, smoothing=1.0, hierarchy=None): + handle_unknown='value', min_samples_leaf=20, smoothing=10, hierarchy=None): super().__init__(verbose=verbose, cols=cols, drop_invariant=drop_invariant, return_df=return_df, handle_unknown=handle_unknown, handle_missing=handle_missing) self.ordinal_encoder = None self.min_samples_leaf = min_samples_leaf - if min_samples_leaf == 1: - warnings.warn("Default parameter min_samples_leaf will change in version 2.6." - "See https://github.com/scikit-learn-contrib/category_encoders/issues/327", - category=FutureWarning) self.smoothing = smoothing - if smoothing == 1.0: - warnings.warn("Default parameter smoothing will change in version 2.6." - "See https://github.com/scikit-learn-contrib/category_encoders/issues/327", - category=FutureWarning) self.mapping = None self._mean = None if isinstance(hierarchy, (dict, pd.DataFrame)) and cols is None: @@ -203,7 +194,7 @@ def fit_target_encoding(self, X, y): col = switch.get('col') if 'HIER_' not in str(col): values = switch.get('mapping') - + scalar = prior if (isinstance(self.hierarchy, dict) and col in self.hierarchy) or \ (isinstance(self.hierarchy, pd.DataFrame)): @@ -222,7 +213,6 @@ def fit_target_encoding(self, X, y): smoove = self._weighting(stats['count']) smoothing = scalar * (1 - smoove) + stats['mean'] * smoove - smoothing[stats['count'] == 1] = scalar if self.handle_unknown == 'return_nan': smoothing.loc[-1] = np.nan diff --git a/tests/test_encoders.py b/tests/test_encoders.py index 442f3088..b93d3f10 100644 --- a/tests/test_encoders.py +++ b/tests/test_encoders.py @@ -381,7 +381,7 @@ def test_preserve_names(self): def test_unique_column_is_not_predictive(self): # @ToDo not sure how useful this test is. TargetEncoders set the value to the default if there is only # one category but they probably should not. See discussion in issue 327 - test_encoders = ['LeaveOneOutEncoder', 'TargetEncoder', 'WOEEncoder', 'MEstimateEncoder', + test_encoders = ['LeaveOneOutEncoder', 'WOEEncoder', 'MEstimateEncoder', 'JamesSteinEncoder', 'CatBoostEncoder', 'GLMMEncoder'] for encoder_name in test_encoders: enc = getattr(encoders, encoder_name)() diff --git a/tests/test_target_encoder.py b/tests/test_target_encoder.py index d3a75c6c..924c79d4 100644 --- a/tests/test_target_encoder.py +++ b/tests/test_target_encoder.py @@ -109,11 +109,6 @@ def test_target_encoder_fit_transform_HaveNanValue_ExpectCorrectValueInResult(se self.assertAlmostEqual(0.4125, values[2], delta=1e-4) self.assertEqual(0.5, values[3]) - def test_target_encoder_noncontiguous_index(self): - data = pd.DataFrame({'x': ['a', 'b', np.nan, 'd', 'e'], 'y': range(5)}).dropna() - result = encoders.TargetEncoder(cols=['x']).fit_transform(data[['x']], data['y']) - self.assertTrue(np.allclose(result, 2.0)) - def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): df = pd.DataFrame({ 'color': ["a", "a", "a", "b", "b", "b"], @@ -175,7 +170,7 @@ def test_hierarchical_smoothing_multi(self): self.assertAlmostEqual(0.3248, values[5], delta=1e-4) self.assertAlmostEqual(0.6190, values[11], delta=1e-4) self.assertAlmostEqual(0.1309, values[13], delta=1e-4) - self.assertAlmostEqual(0.7381, values[15], delta=1e-4) + self.assertAlmostEqual(0.8370, values[15], delta=1e-4) def test_hierarchical_part_named_cols(self): @@ -299,10 +294,10 @@ def test_hierarchy_multi_level(self): values = result['Animal'].values self.assertAlmostEqual(0.6261, values[0], delta=1e-4) self.assertAlmostEqual(0.9065, values[2], delta=1e-4) - self.assertAlmostEqual(0.4107, values[5], delta=1e-4) + self.assertAlmostEqual(0.2556, values[5], delta=1e-4) self.assertAlmostEqual(0.3680, values[8], delta=1e-4) self.assertAlmostEqual(0.4626, values[11], delta=1e-4) - self.assertAlmostEqual(0.2466, values[13], delta=1e-4) + self.assertAlmostEqual(0.1535, values[13], delta=1e-4) self.assertAlmostEqual(0.4741, values[14], delta=1e-4) def test_hierarchy_columnwise_compass(self): @@ -330,7 +325,7 @@ def test_hierarchy_columnwise_postcodes(self): result = enc.fit_transform(X[cols], y) values = result['postcode'].values - self.assertAlmostEqual(0.7506, values[0], delta=1e-4) + self.assertAlmostEqual(0.8448, values[0], delta=1e-4) def test_hierarchy_columnwise_missing_level(self):