Skip to content

Commit

Permalink
Merge pull request #371 from ZJUEarthData/dev/jmNormal722
Browse files Browse the repository at this point in the history
feat: add Custom scaling MeanNormalization
  • Loading branch information
SanyHe authored Jul 23, 2024
2 parents 1826178 + f282355 commit f5bf809
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 1 deletion.
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@

IMPUTING_STRATEGY = ["Mean Value", "Median Value", "Most Frequent Value", "Constant(Specified Value)"]

FEATURE_SCALING_STRATEGY = ["Min-max Scaling", "Standardization"]
FEATURE_SCALING_STRATEGY = ["Min-max Scaling", "Standardization", "MeanNormalization"]

SAMPLE_BALANCE_STRATEGY = ["Over Sampling", "Under Sampling", "Oversampling and Undersampling"]

Expand Down
2 changes: 2 additions & 0 deletions geochemistrypi/data_mining/data/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from ..constants import MLFLOW_ARTIFACT_DATA_PATH
from ..utils.base import save_data, save_model, save_text
from .data_readiness import np2pd
from .preprocessing import MeanNormalScaler


class PipelineConstrutor:
Expand All @@ -27,6 +28,7 @@ def transformer_dict(self) -> Dict:
"SimpleImputer": SimpleImputer,
"MinMaxScaler": MinMaxScaler,
"StandardScaler": StandardScaler,
"MeanNormalScaler": MeanNormalScaler,
"PolynomialFeatures": PolynomialFeatures,
"RandomOverSampler": RandomOverSampler,
"RandomUnderSampler": RandomUnderSampler,
Expand Down
58 changes: 58 additions & 0 deletions geochemistrypi/data_mining/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,68 @@
import numpy as np
import pandas as pd
from rich import print
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest, f_classif, f_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from .data_readiness import show_data_columns


class MeanNormalScaler(BaseEstimator, TransformerMixin):
    """Custom Scikit-learn transformer for mean normalization.

    Mean normalization subtracts the per-feature mean from every value and
    divides the result by the per-feature range (maximum minus minimum):

        X_scaled = (X - X.mean()) / (X.max() - X.min())

    Parameters
    ----------
    copy : bool, default=True
        Whether ``transform`` / ``inverse_transform`` work on a copy of the
        input by default.

    Attributes
    ----------
    mean_ : per-feature mean computed by ``fit`` (``None`` before fitting).
    scale_ : per-feature range (max - min) computed by ``fit``, with zero
        ranges replaced by 1 (``None`` before fitting).
    """

    def __init__(self, copy=True):
        self.copy = copy
        self.mean_ = None
        self.scale_ = None

    def fit(self, X: pd.DataFrame, y=None):
        """Compute the mean and range (max - min) of each feature.

        Parameters
        ----------
        X : pd.DataFrame
            The input data where each column represents a feature.
        y : ignored
            Present only for Scikit-learn API compatibility.

        Returns
        -------
        self
            The fitted transformer.
        """
        self.mean_ = np.mean(X, axis=0)
        data_range = np.max(X, axis=0) - np.min(X, axis=0)
        # A constant feature has zero range; adding the boolean mask turns
        # those zeros into ones so the feature maps to 0 instead of NaN/inf.
        # Written as an addition so it works for Series, ndarray and scalars.
        self.scale_ = data_range + (data_range == 0)
        return self

    def transform(self, X, y=None, copy=None):
        """Apply mean normalization using the statistics learned in ``fit``.

        Parameters
        ----------
        X : pd.DataFrame
            The input data where each column represents a feature.
        y : ignored
            Present only for Scikit-learn API compatibility.
        copy : bool, optional (default: None)
            Copy the input X or not. ``None`` falls back to ``self.copy``.

        Returns
        -------
        The normalized data, in the same container type the arithmetic on
        ``X`` produces (a DataFrame for DataFrame input).
        """
        copy = self.copy if copy is None else copy
        if copy:
            X = X.copy()
        return (X - self.mean_) / self.scale_

    def inverse_transform(self, X, copy=None):
        """Undo mean normalization, mapping scaled data back to original units.

        Parameters
        ----------
        X : pd.DataFrame
            Data previously produced by ``transform``.
        copy : bool, optional (default: None)
            Copy the input X or not. ``None`` falls back to ``self.copy``.

        Returns
        -------
        The data rescaled to the original feature space.
        """
        copy = self.copy if copy is None else copy
        if copy:
            X = X.copy()
        return X * self.scale_ + self.mean_


def feature_scaler(X: pd.DataFrame, method: List[str], method_idx: int) -> tuple[dict, np.ndarray]:
"""Apply feature scaling methods.
Expand All @@ -36,6 +92,8 @@ def feature_scaler(X: pd.DataFrame, method: List[str], method_idx: int) -> tuple
scaler = MinMaxScaler()
elif method[method_idx] == "Standardization":
scaler = StandardScaler()
elif method[method_idx] == "MeanNormalization":
scaler = MeanNormalScaler()
try:
X_scaled = scaler.fit_transform(X)
except ValueError:
Expand Down

0 comments on commit f5bf809

Please sign in to comment.