Reformatted the code and updated the examples

scikit-learn-contrib · Apr 27, 2019 · 5e9e803 · 5e9e803
1 parent 374875b
commit 5e9e803
Show file tree

Hide file tree

Showing 36 changed files with 343 additions and 270 deletions.
diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py
@@ -17,7 +17,7 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -75,13 +75,12 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
     References
     ----------
 
-    .. [1] Contrast Coding Systems for categorical variables.  UCLA: Statistical Consulting Group. from
-    https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
+    .. [1] Contrast Coding Systems for Categorical Variables, from
+    https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
 
     .. [2] Gregory Carey (2003). Coding Categorical Variables, from
     http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
 
-
     """
 
     def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
@@ -282,11 +281,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):

diff --git a/category_encoders/basen.py b/category_encoders/basen.py
@@ -20,7 +20,7 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -310,9 +310,11 @@ def basen_encode(self, X_in, cols=None):
         X_in: DataFrame
         cols: list-like, default None
             Column names in the DataFrame to be encoded
+
         Returns
         -------
         dummies : DataFrame
+
         """
 
         X = X_in.copy(deep=True)
@@ -348,6 +350,7 @@ def basen_to_integer(self, X, cols, base):
         Returns
         -------
         numerical: DataFrame
+
         """
         out_cols = X.columns.values.tolist()
 
@@ -360,7 +363,7 @@ def basen_to_integer(self, X, cols, base):
             else:
                 len0 = len(col_list)
                 value_array = np.array([base ** (len0 - 1 - i) for i in range(len0)])
-            X.insert(insert_at,col,np.dot(X[col_list].values, value_array.T))
+            X.insert(insert_at, col, np.dot(X[col_list].values, value_array.T))
             X.drop(col_list, axis=1, inplace=True)
             out_cols = X.columns.values.tolist()
 
@@ -374,14 +377,14 @@ def col_transform(self, col, digits):
         if col is None or float(col) < 0.0:
             return None
         else:
-            col = self.numberToBase(int(col), self.base, digits)
+            col = self.number_to_base(int(col), self.base, digits)
             if len(col) == digits:
                 return col
             else:
                 return [0 for _ in range(digits - len(col))] + col
 
     @staticmethod
-    def numberToBase(n, b, limit):
+    def number_to_base(n, b, limit):
         if b == 1:
             return [0 if n != _ else 1 for _ in range(limit)]
 
@@ -399,11 +402,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):

diff --git a/category_encoders/binary.py b/category_encoders/binary.py
@@ -15,7 +15,7 @@ class BinaryEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -144,11 +144,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         return self.base_n_encoder.get_feature_names()
diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py
@@ -1,4 +1,5 @@
 """CatBoost coding"""
+
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -15,19 +16,21 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     values "on-the-fly". Consequently, the values naturally vary
     during the training phase and it is not necessary to add random noise.
 
-    Beware, the training data have to be randomly permutated. E.g.:
+    Beware, the training data have to be randomly permuted. E.g.::
+
         # Random permutation
         perm = np.random.permutation(len(X))
         X = X.iloc[perm].reset_index(drop=True)
         y = y.iloc[perm].reset_index(drop=True)
+
     This is necessary because some datasets are sorted based on the target
     value and this coder encodes the features on-the-fly in a single pass.
 
     Parameters
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -48,7 +51,7 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     >>> bunch = load_boston()
     >>> y = bunch.target
     >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
-    >>> enc = LeaveOneOutEncoder(cols=['CHAS', 'RAD']).fit(X, y)
+    >>> enc = CatBoostEncoder(cols=['CHAS', 'RAD']).fit(X, y)
     >>> numeric_dataset = enc.transform(X)
     >>> print(numeric_dataset.info())
     <class 'pandas.core.frame.DataFrame'>
@@ -74,8 +77,9 @@ class CatBoostEncoder(BaseEstimator, TransformerMixin):
     References
     ----------
 
-    .. [1] Transforming categorical features to numerical features. from
-    https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/.
+    .. [1] Transforming categorical features to numerical features, from
+    https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/
+
     """
 
     def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
@@ -84,7 +88,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True,
         self.drop_invariant = drop_invariant
         self.drop_cols = []
         self.verbose = verbose
-        self.use_default_cols = cols is None # if True, even a repeated call of fit() will select string columns from X
+        self.use_default_cols = cols is None  # if True, even a repeated call of fit() will select string columns from X
         self.cols = cols
         self._dim = None
         self.mapping = None
@@ -280,7 +284,7 @@ def transform_leave_one_out(self, X_in, y, mapping=None):
                 level_means = ((colmap['sum'] + self._mean) / (colmap['count'] + 1)).where(level_notunique, self._mean)
                 X[col] = X[col].map(level_means)
             else:
-                ## Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
+                # Simulation of CatBoost implementation, which calculates leave-one-out on the fly.
                 # The nice thing about this is that it helps to prevent overfitting. The bad thing
                 # is that CatBoost uses many iterations over the data. But we run just one iteration.
                 # Still, it works better than leave-one-out without any noise.
@@ -308,11 +312,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):

diff --git a/category_encoders/hashing.py b/category_encoders/hashing.py
@@ -20,7 +20,7 @@ class HashingEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -69,8 +69,8 @@ class HashingEncoder(BaseEstimator, TransformerMixin):
 
     References
     ----------
-    .. [1] Kilian Weinberger; Anirban Dasgupta; John Langford; Alex Smola; Josh Attenberg (2009). Feature Hashing for
-    Large Scale Multitask Learning. Proc. ICML.
+    .. [1] Feature Hashing for Large Scale Multitask Learning, from
+    https://alex.smola.org/papers/2009/Weinbergeretal09.pdf
 
     """
 
@@ -258,11 +258,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):

diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py
@@ -18,7 +18,7 @@ class HelmertEncoder(BaseEstimator, TransformerMixin):
     ----------
 
     verbose: int
-        integer indicating verbosity of output. 0 for none.
+        integer indicating verbosity of the output. 0 for none.
     cols: list
         a list of columns to encode, if None, all string columns will be encoded.
     drop_invariant: bool
@@ -76,14 +76,14 @@ class HelmertEncoder(BaseEstimator, TransformerMixin):
     References
     ----------
 
-    .. [1] Contrast Coding Systems for categorical variables.  UCLA: Statistical Consulting Group. from
-    https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/.
+    .. [1] Contrast Coding Systems for Categorical Variables, from
+    https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
 
     .. [2] Gregory Carey (2003). Coding Categorical Variables, from
     http://psych.colorado.edu/~carey/Courses/PSYC5741/handouts/Coding%20Categorical%20Variables%202006-03-03.pdf
 
-
     """
+
     def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
                  handle_unknown='indicator', handle_missing='indicator'):
         self.return_df = return_df
@@ -279,11 +279,12 @@ def get_feature_names(self):
         """
         Returns the names of all transformed / added columns.
 
-        Returns:
-        --------
+        Returns
+        -------
         feature_names: list
             A list with all feature names transformed or added.
             Note: potentially dropped features are not included!
+
         """
 
         if not isinstance(self.feature_names, list):