Skip to content

Commit

Permalink
Reformatted the code and updated the examples
Browse files Browse the repository at this point in the history
  • Loading branch information
janmotl committed Apr 27, 2019
1 parent 374875b commit 5e9e803
Show file tree
Hide file tree
Showing 36 changed files with 343 additions and 270 deletions.
12 changes: 6 additions & 6 deletions category_encoders/backward_difference.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
----------
verbose: int
integer indicating verbosity of output. 0 for none.
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
Expand Down Expand Up @@ -75,13 +75,12 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
References
----------
.. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
.. [1] Contrast Coding Systems for Categorical Variables, from
https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
.. [2] Gregory Carey (2003). Coding Categorical Variables, from
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
"""

def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
Expand Down Expand Up @@ -282,11 +281,12 @@ def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns:
--------
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""

if not isinstance(self.feature_names, list):
Expand Down
16 changes: 10 additions & 6 deletions category_encoders/basen.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
----------
verbose: int
integer indicating verbosity of output. 0 for none.
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
Expand Down Expand Up @@ -310,9 +310,11 @@ def basen_encode(self, X_in, cols=None):
X_in: DataFrame
cols: list-like, default None
Column names in the DataFrame to be encoded
Returns
-------
dummies : DataFrame
"""

X = X_in.copy(deep=True)
Expand Down Expand Up @@ -348,6 +350,7 @@ def basen_to_integer(self, X, cols, base):
Returns
-------
numerical: DataFrame
"""
out_cols = X.columns.values.tolist()

Expand All @@ -360,7 +363,7 @@ def basen_to_integer(self, X, cols, base):
else:
len0 = len(col_list)
value_array = np.array([base ** (len0 - 1 - i) for i in range(len0)])
X.insert(insert_at,col,np.dot(X[col_list].values, value_array.T))
X.insert(insert_at, col, np.dot(X[col_list].values, value_array.T))
X.drop(col_list, axis=1, inplace=True)
out_cols = X.columns.values.tolist()

Expand All @@ -374,14 +377,14 @@ def col_transform(self, col, digits):
if col is None or float(col) < 0.0:
return None
else:
col = self.numberToBase(int(col), self.base, digits)
col = self.number_to_base(int(col), self.base, digits)
if len(col) == digits:
return col
else:
return [0 for _ in range(digits - len(col))] + col

@staticmethod
def numberToBase(n, b, limit):
def number_to_base(n, b, limit):
if b == 1:
return [0 if n != _ else 1 for _ in range(limit)]

Expand All @@ -399,11 +402,12 @@ def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns:
--------
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""

if not isinstance(self.feature_names, list):
Expand Down
7 changes: 4 additions & 3 deletions category_encoders/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class BinaryEncoder(BaseEstimator, TransformerMixin):
----------
verbose: int
integer indicating verbosity of output. 0 for none.
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
Expand Down Expand Up @@ -144,11 +144,12 @@ def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns:
--------
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""

return self.base_n_encoder.get_feature_names()
23 changes: 14 additions & 9 deletions category_encoders/cat_boost.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""CatBoost coding"""

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
Expand All @@ -15,19 +16,21 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
values "on-the-fly". Consequently, the values naturally vary
during the training phase and it is not necessary to add random noise.
Beware, the training data have to be randomly permutated. E.g.:
Beware, the training data have to be randomly permuted. E.g.::
# Random permutation
perm = np.random.permutation(len(X))
X = X.iloc[perm].reset_index(drop=True)
y = y.iloc[perm].reset_index(drop=True)
This is necessary because some datasets are sorted based on the target
value and this encoder encodes the features on-the-fly in a single pass.
Parameters
----------
verbose: int
integer indicating verbosity of output. 0 for none.
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
Expand All @@ -48,7 +51,7 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
>>> bunch = load_boston()
>>> y = bunch.target
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
>>> enc = LeaveOneOutEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> enc = CatBoostEncoder(cols=['CHAS', 'RAD']).fit(X, y)
>>> numeric_dataset = enc.transform(X)
>>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
Expand All @@ -74,8 +77,9 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
References
----------
.. [1] Transforming categorical features to numerical features. from
https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/.
.. [1] Transforming categorical features to numerical features, from
https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
"""

def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
Expand All @@ -84,7 +88,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
self.drop_invariant = drop_invariant
self.drop_cols = []
self.verbose = verbose
self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X
self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X
self.cols = cols
self._dim = None
self.mapping = None
Expand Down Expand Up @@ -280,7 +284,7 @@ def transform_leave_one_out(self, X_in, y, mapping=None):
level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
X[col] = X[col].map(level_means)
else:
## Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
# Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
# The nice thing about this is that it helps to prevent overfitting. The bad thing
# is that CatBoost uses many iterations over the data. But we run just one iteration.
# Still, it works better than leave-one-out without any noise.
Expand Down Expand Up @@ -308,11 +312,12 @@ def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns:
--------
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""

if not isinstance(self.feature_names, list):
Expand Down
11 changes: 6 additions & 5 deletions category_encoders/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class HashingEncoder(BaseEstimator, TransformerMixin):
----------
verbose: int
integer indicating verbosity of output. 0 for none.
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
Expand Down Expand Up @@ -69,8 +69,8 @@ class HashingEncoder(BaseEstimator, TransformerMixin):
References
----------
.. [1] Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; Josh Attenberg (2009). Feature Hashing for
Large Scale Multitask Learning. Proc. ICML.
.. [1] Feature Hashing for Large Scale Multitask Learning, from
https://alex.smola.org/papers/2009/Weinbergeretal09.pdf
"""

Expand Down Expand Up @@ -258,11 +258,12 @@ def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns:
--------
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""

if not isinstance(self.feature_names, list):
Expand Down
13 changes: 7 additions & 6 deletions category_encoders/helmert.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class HelmertEncoder(BaseEstimator, TransformerMixin):
----------
verbose: int
integer indicating verbosity of output. 0 for none.
integer indicating verbosity of the output. 0 for none.
cols: list
a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
Expand Down Expand Up @@ -76,14 +76,14 @@ class HelmertEncoder(BaseEstimator, TransformerMixin):
References
----------
.. [1] Contrast Coding Systems for categorical variables. UCLA: Statistical Consulting Group. from
https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
.. [1] Contrast Coding Systems for Categorical Variables, from
https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
.. [2] Gregory Carey (2003). Coding Categorical Variables, from
http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
"""

def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
handle_unknown='indicator', handle_missing='indicator'):
self.return_df = return_df
Expand Down Expand Up @@ -279,11 +279,12 @@ def get_feature_names(self):
"""
Returns the names of all transformed / added columns.
Returns:
--------
Returns
-------
feature_names: list
A list with all feature names transformed or added.
Note: potentially dropped features are not included!
"""

if not isinstance(self.feature_names, list):
Expand Down
Loading

0 comments on commit 5e9e803

Please sign in to comment.