Skip to content

Commit

Permalink
Merge pull request ZJUEarthData#254 from ZJUEarthData/dev/Mengqi
Browse files Browse the repository at this point in the history
feat: add feature selection function.
  • Loading branch information
SanyHe authored Sep 23, 2023
2 parents c272ea4 + c609147 commit a0141c9
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 6 deletions.
24 changes: 21 additions & 3 deletions geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
CLUSTERING_MODELS,
DECOMPOSITION_MODELS,
FEATURE_SCALING_STRATEGY,
FEATURE_SELECTION_STRATEGY,
IMPUTING_STRATEGY,
MLFLOW_ARTIFACT_DATA_PATH,
MODE_OPTION,
Expand All @@ -28,7 +29,7 @@
from .data.feature_engineering import FeatureConstructor
from .data.imputation import imputer
from .data.inference import build_transform_pipeline, model_inference
from .data.preprocessing import feature_scaler
from .data.preprocessing import feature_scaler, feature_selector
from .data.statistic import monte_carlo_simulator
from .plot.map_plot import process_world_map
from .plot.statistic_plot import basic_statistic, correlation_plot, distribution_plot, is_imputed, is_null_value, log_distribution_plot, probability_plot, ratio_null_vs_filled
Expand Down Expand Up @@ -322,6 +323,22 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# <--- Feature Selection --->
print("-*-*- Feature Selection -*-*-")
num2option(OPTION)
is_feature_selection = limit_num_input(OPTION, SECTION[1], num_input)
if is_feature_selection == 1:
print("Which strategy do you want to apply?")
num2option(FEATURE_SELECTION_STRATEGY)
feature_selection_num = limit_num_input(FEATURE_SELECTION_STRATEGY, SECTION[1], num_input)
feature_selection_config, X = feature_selector(X, y, mode_num, FEATURE_SELECTION_STRATEGY, feature_selection_num - 1)
print("--Selected Features-")
show_data_columns(X.columns)
save_data(X, "X After feature selection", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
else:
feature_selection_config = {}
clear_output()

# create training data and testing data
print("-*-*- Data Split - Train Set and Test Set -*-*-")
print("Notice: Normally, set 20% of the dataset aside as test set, such as 0.2")
Expand All @@ -341,6 +358,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N
else:
# unsupervised learning
feature_scaling_config = {}
feature_selection_config = {}
X = data_selected_imputed_fe
X_train = data_selected_imputed_fe
y, X_test, y_train, y_test = None, None, None, None
Expand Down Expand Up @@ -416,7 +434,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Transform Pipeline --->
logger.debug("Transform Pipeline")
transformer_config, transform_pipeline = build_transform_pipeline(imputation_config, feature_scaling_config, run, X_train)
transformer_config, transform_pipeline = build_transform_pipeline(imputation_config, feature_scaling_config, feature_selection_config, run, X_train, y_train)
clear_output()

# <--- Model Inference --->
Expand All @@ -443,7 +461,7 @@ def cli_pipeline(training_data_path: str, inference_data_path: Optional[str] = N

# <--- Transform Pipeline --->
logger.debug("Transform Pipeline")
transformer_config, transform_pipeline = build_transform_pipeline(imputation_config, feature_scaling_config, run, X_train)
transformer_config, transform_pipeline = build_transform_pipeline(imputation_config, feature_scaling_config, feature_selection_config, run, X_train, y_train)

# <--- Model Inference --->
logger.debug("Model Inference")
Expand Down
2 changes: 2 additions & 0 deletions geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,5 @@
SAMPLE_BALANCE_STRATEGY = ["Over Sampling", "Under Sampling", "Oversampling and Undersampling"]

CUSTOMIZE_LABEL_STRATEGY = ["Automatic Coding", "Custom Numeric Labels", "Custom Non-numeric Labels"]

FEATURE_SELECTION_STRATEGY = ["GenericUnivariateSelect", "SelectKBest"]
19 changes: 16 additions & 3 deletions geochemistrypi/data_mining/data/inference.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import json
import os
from typing import Dict, Optional, Tuple
Expand All @@ -6,6 +7,7 @@
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
Expand All @@ -28,6 +30,8 @@ def transformer_dict(self) -> Dict:
"PolynomialFeatures": PolynomialFeatures,
"RandomOverSampler": RandomOverSampler,
"RandomUnderSampler": RandomUnderSampler,
"GenericUnivariateSelect": GenericUnivariateSelect,
"SelectKBest": SelectKBest,
}

def chain(self, transformer_config: Dict) -> object:
Expand All @@ -49,7 +53,7 @@ def chain(self, transformer_config: Dict) -> object:
return make_pipeline(*transformers)


def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Dict, run: object, X_train: pd.DataFrame) -> Tuple[Dict, object]:
def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Dict, feature_selection_config: Dict, run: object, X_train: pd.DataFrame, y_train: pd.DataFrame) -> Tuple[Dict, object]:
"""Build the transform pipeline.
Parameters
Expand All @@ -60,6 +64,9 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
feature_scaling_config : Dict
The feature scaling configuration.
feature_selection_config : Dict
The feature selection configuration.
run : object
The model selection object.
Expand All @@ -77,16 +84,22 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
transformer_config = {}
transformer_config.update(imputation_config)
transformer_config.update(feature_scaling_config)
transformer_config.update(feature_selection_config)
transformer_config.update(run.transformer_config)
transformer_config_str = json.dumps(transformer_config, indent=4)
transformer_config_str = copy.deepcopy(transformer_config)
for key, value in transformer_config_str.items():
for k, v in value.items():
if callable(v):
transformer_config_str[key][k] = v.__name__
transformer_config_str = json.dumps(transformer_config_str, indent=4)
GEOPI_OUTPUT_ARTIFACTS_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_PATH")
save_text(transformer_config_str, "Transform Pipeline Configuration", GEOPI_OUTPUT_ARTIFACTS_PATH, "root")
# If transformer_config is not {}, then create the transform pipeline.
if transformer_config:
# Create the transform pipeline.
transform_pipeline = PipelineConstrutor().chain(transformer_config)
# Fit the transform pipeline with the training data.
transform_pipeline.fit(X_train)
transform_pipeline.fit(X_train, y_train)
# Save the transform pipeline.
GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH")
save_model(transform_pipeline, "Transform Pipeline", X_train.iloc[[0]], GEOPI_OUTPUT_ARTIFACTS_MODEL_PATH)
Expand Down
61 changes: 61 additions & 0 deletions geochemistrypi/data_mining/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@

import numpy as np
import pandas as pd
from rich import print
from sklearn.feature_selection import GenericUnivariateSelect, SelectKBest, f_classif, f_regression
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from .data_readiness import show_data_columns


def feature_scaler(X: pd.DataFrame, method: List[str], method_idx: int) -> tuple[dict, np.ndarray]:
"""Apply feature scaling methods.
Expand Down Expand Up @@ -39,3 +43,60 @@ def feature_scaler(X: pd.DataFrame, method: List[str], method_idx: int) -> tuple
print("Please check the dataset to find the reason.")
feature_scaling_config = {type(scaler).__name__: scaler.get_params()}
return feature_scaling_config, X_scaled


def feature_selector(X: pd.DataFrame, y: pd.DataFrame, feature_selection_task: int, method: List[str], method_idx: int) -> tuple[dict, pd.DataFrame]:
    """Apply feature selection methods.

    Parameters
    ----------
    X : pd.DataFrame
        The feature dataset.

    y : pd.DataFrame
        The label dataset.

    feature_selection_task : int
        Feature selection for regression (1) or classification (2) tasks.

    method : List[str]
        The feature selection methods.

    method_idx : int
        The index of methods.

    Returns
    -------
    feature_selection_config : dict
        The feature selection configuration.

    X_selected : pd.DataFrame
        The feature dataset after selecting.

    Raises
    ------
    ValueError
        If ``feature_selection_task`` or ``method[method_idx]`` is not a
        recognized option.
    """
    print("--Original Features-")
    show_data_columns(X.columns)

    features_num = len(X.columns)
    print(f"The original number of features is {features_num}, and your input must be less than {features_num}.")
    features_retain_num = int(input("Please enter the number of features to retain.\n" "@input: "))

    # Pick the univariate scoring function that matches the learning task.
    # Fail fast with a clear message instead of hitting a NameError below.
    if feature_selection_task == 1:
        score_func = f_regression
    elif feature_selection_task == 2:
        score_func = f_classif
    else:
        raise ValueError(f"Invalid feature selection task: {feature_selection_task}. Expected 1 (regression) or 2 (classification).")

    if method[method_idx] == "GenericUnivariateSelect":
        selector = GenericUnivariateSelect(score_func=score_func, mode="k_best", param=features_retain_num)
    elif method[method_idx] == "SelectKBest":
        selector = SelectKBest(score_func=score_func, k=features_retain_num)
    else:
        raise ValueError(f"Invalid feature selection method: {method[method_idx]}.")

    try:
        # y arrives as a single-column DataFrame; ravel it to the 1-D shape
        # sklearn expects so fit() does not emit a data-conversion warning.
        selector.fit(X, np.ravel(y))
        features_selected = selector.get_feature_names_out()
        X = X[features_selected]
    except ValueError:
        # Best-effort: keep the original features when selection cannot be
        # applied (e.g. the retain count exceeds the feature count).
        print("The selected feature selection method is not applicable to the dataset!")
        print("Please check the dataset to find the reason.")

    feature_selection_config = {type(selector).__name__: selector.get_params()}
    return feature_selection_config, X

0 comments on commit a0141c9

Please sign in to comment.