Skip to content

Commit

Permalink
Merge pull request #385 from ZJUEarthData/dev/Yongkang
Browse files Browse the repository at this point in the history
perf: Add a name to the data
  • Loading branch information
SanyHe committed Sep 15, 2024
2 parents a70829e + b040a17 commit f81fc36
Show file tree
Hide file tree
Showing 18 changed files with 552 additions and 180 deletions.
162 changes: 117 additions & 45 deletions geochemistrypi/data_mining/cli_pipeline.py

Large diffs are not rendered by default.

70 changes: 67 additions & 3 deletions geochemistrypi/data_mining/data/data_readiness.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from rich import print
from sklearn.model_selection import train_test_split

from ..constants import BUILT_IN_DATASET_PATH
from ..constants import BUILT_IN_DATASET_PATH, SECTION

# from utils.exceptions import InvalidFileError

Expand Down Expand Up @@ -153,6 +153,33 @@ def select_columns(columns_range: Optional[str] = None) -> List[int]:
return columns_selected


def select_column_name(data: pd.DataFrame) -> str:
    """Ask the user to pick one column of the dataframe and return that column's name.

    The chosen column serves as the unique identifier for each row in the
    output data files.

    Parameters
    ----------
    data : pd.DataFrame
        The data set whose identifier column is to be chosen.

    Returns
    -------
    str
        The name of the selected column.
    """
    prompt = (
        "You need to choose the number of the column above as the output data identifier column.\n"
        "The data identifier column helps identify uniquely each row of data point in the output data.\n"
        "** For example, when using built-in dataset, you can choose the column ‘SAMPLE NAME’.**\n"
        "Once finishing the whole run, in the output data file, all data point will have the value in the column ‘SAMPLE NAME’ as its unique identifier.\n"
        "Enter the number of the output data identifier column."
    )
    print(prompt)
    total_columns = data.shape[1]
    while True:
        try:
            chosen_index = int_input(column=2, prefix=SECTION[1], slogan="@Number: ")
        except ValueError:
            print("Invalid input, please enter an integer.")
            continue
        if 1 <= chosen_index <= total_columns:
            # Convert the 1-based user choice to the 0-based column position.
            return data.columns[chosen_index - 1]
        print(f"The entered number is out of range! Please enter a number between 1 and {data.shape[1]}.")


def create_sub_data_set(data: pd.DataFrame, allow_empty_columns: bool = False) -> pd.DataFrame:
"""Create a sub data set.
Expand Down Expand Up @@ -287,7 +314,7 @@ def create_sub_data_set(data: pd.DataFrame, allow_empty_columns: bool = False) -
return sub_data_set


def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> Dict:
def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], names: pd.DataFrame, test_size: float = 0.2) -> Dict:
"""Split arrays or matrices into random train and test subsets.
Parameters
Expand All @@ -298,6 +325,9 @@ def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: fl
y : pd.DataFrame or pd.Series
The target variable to be split.
name : pd.DataFrame
The name of data.
test_size : float, default=0.2
Represents the proportion of the dataset to include in the test split.
Expand All @@ -307,7 +337,8 @@ def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: fl
A dictionary containing the split data.
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test}
name_train, name_test = train_test_split(names, test_size=test_size, random_state=42)
return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test, "Name Train": name_train, "Name Test": name_test}


def num2option(items: List[str]) -> None:
Expand Down Expand Up @@ -430,6 +461,39 @@ def float_input(default: float, prefix: Optional[str] = None, slogan: Optional[s
return option


def int_input(column: int, prefix: Optional[str] = None, slogan: Optional[str] = "@Number: ") -> int:
    """Get the number of the desired option.

    Keeps prompting until the user types a non-negative integer, or presses
    Enter on an empty line (which falls back to `column`).

    Parameters
    ----------
    column : int
        If the user does not enter anything, it is assigned to option.

    prefix : str, default=None
        It indicates which section the user currently is in on the UML, which is shown on the command-line console.

    slogan : str, default="@Number: "
        It acts like the first parameter of input function in Python, which output the hint.

    Returns
    -------
    option : int
        An option number.
    """
    while True:
        option = input(f"({prefix}) ➜ {slogan}").strip()
        if option.isdigit():
            # Accept any string of digits; range checking is the caller's job.
            option = int(option)
            break
        elif len(option) == 0:
            # Empty input: fall back to the provided default column number.
            option = column

            break
        else:
            print("Caution: The input is not a positive integer number. Please input the right number again!")
    return option


def str_input(option_list: List[str], prefix: Optional[str] = None) -> str:
"""Get the string of the desired option.
Expand Down
7 changes: 4 additions & 3 deletions geochemistrypi/data_mining/data/feature_engineering.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class FeatureConstructor(object):
# parenthesis = ['(', ')']
cal_words = ["pow", "sin", "cos", "tan", "pi", "mean", "std", "var", "log"]

def __init__(self, data: pd.DataFrame) -> None:
def __init__(self, data: pd.DataFrame, name_all: str) -> None:
self.feature_name = None
self.data = data
self.alphabet = string.ascii_lowercase
Expand All @@ -29,6 +29,7 @@ def __init__(self, data: pd.DataFrame) -> None:
self.map_dict = {}
self._result = None
self.config = {}
self.name_all = name_all

def index2name(self) -> None:
"""Show the index of columns in the data set. The display pattern is [letter : column name], e.g. a : 1st column name; b : 2nd column name."""
Expand Down Expand Up @@ -171,12 +172,12 @@ def build(self) -> None:
clear_output()
continue
else:
save_data(self.data, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(self.data, self.name_all, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
print("Exit Feature Engineering Mode.")
clear_output()
break
else:
save_data(self.data, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(self.data, self.name_all, "Data Selected Dropped-Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()
break
return self.data
Expand Down
7 changes: 5 additions & 2 deletions geochemistrypi/data_mining/data/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,17 @@ def build_transform_pipeline(imputation_config: Dict, feature_scaling_config: Di
return transformer_config, transform_pipeline


def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
def model_inference(inference_data: pd.DataFrame, inference_name_column: str, is_inference: bool, run: object, transformer_config: Dict, transform_pipeline: Optional[object] = None):
"""Run the model inference.
Parameters
----------
inference_data : pd.DataFrame
The inference data.
inference_name_column: str
The name of inference_data
is_inference : bool
Whether to run the model inference.
Expand All @@ -141,4 +144,4 @@ def model_inference(inference_data: pd.DataFrame, is_inference: bool, run: objec
inference_data_predicted_np = loaded_model.predict(inference_data_transformed)
inference_data_predicted = np2pd(inference_data_predicted_np, ["Predicted Value"])
GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
save_data(inference_data_predicted, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(inference_data_predicted, inference_name_column, "Application Data Predicted", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
40 changes: 26 additions & 14 deletions geochemistrypi/data_mining/model/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ def data_upload(
X_test: Optional[pd.DataFrame] = None,
y_train: Optional[pd.DataFrame] = None,
y_test: Optional[pd.DataFrame] = None,
name_train: Optional[pd.Series] = None,
name_test: Optional[pd.Series] = None,
name_all: Optional[pd.Series] = None,
y_train_predict: Optional[pd.DataFrame] = None,
y_test_predict: Optional[pd.DataFrame] = None,
) -> None:
Expand All @@ -218,20 +221,28 @@ def data_upload(
WorkflowBase.y_train = y_train
if y_test is not None:
WorkflowBase.y_test = y_test
if name_train is not None:
WorkflowBase.name_train = name_train
if name_test is not None:
WorkflowBase.name_test = name_test
if name_all is not None:
WorkflowBase.name_all = name_all
if y_test_predict is not None:
WorkflowBase.y_test_predict = y_test_predict
if y_train_predict is not None:
WorkflowBase.y_train_predict = y_train_predict

@staticmethod
def data_save(df: pd.DataFrame, df_name: str, local_path: str, mlflow_path: str, slogan: str) -> None:
def data_save(df: pd.DataFrame, name: str, df_name: str, local_path: str, mlflow_path: str, slogan: str) -> None:
"""This method saves the data into the local path and the mlflow path.
Parameters
----------
df : pd.DataFrame
The data to be saved.
name: str
The name.
df_name : str
The name of the data.
Expand All @@ -246,7 +257,7 @@ def data_save(df: pd.DataFrame, df_name: str, local_path: str, mlflow_path: str,
"""
print(f"-----* {slogan} *-----")
print(df)
save_data(df, df_name, local_path, mlflow_path)
save_data(df, name, df_name, local_path, mlflow_path)

@staticmethod
def save_hyper_parameters(hyper_parameters_dict: Dict, model_name: str, local_path: str) -> None:
Expand Down Expand Up @@ -285,6 +296,7 @@ def model_save(self, is_automl: bool) -> None:
def _plot_permutation_importance(
X_test: pd.DataFrame,
y_test: pd.DataFrame,
name_column: str,
trained_model: object,
image_config: dict,
algorithm_name: str,
Expand All @@ -295,8 +307,8 @@ def _plot_permutation_importance(
print("-----* Permutation Importance Diagram *-----")
importances_mean, importances_std, importances = plot_permutation_importance(X_test, y_test, trained_model, image_config)
save_fig(f"Permutation Importance - {algorithm_name}", local_path, mlflow_path)
save_data(X_test, "Permutation Importance - X Test", local_path, mlflow_path)
save_data(y_test, "Permutation Importance - Y Test", local_path, mlflow_path)
save_data(X_test, name_column, "Permutation Importance - X Test", local_path, mlflow_path)
save_data(y_test, name_column, "Permutation Importance - Y Test", local_path, mlflow_path)
data_dict = {"importances_mean": importances_mean.tolist(), "importances_std": importances_std.tolist(), "importances": importances.tolist()}
data_str = json.dumps(data_dict, indent=4)
save_text(data_str, f"Permutation Importance - {algorithm_name}", local_path, mlflow_path)
Expand All @@ -306,14 +318,14 @@ class TreeWorkflowMixin:
"""Mixin class for tree models."""

@staticmethod
def _plot_feature_importance(X_train: pd.DataFrame, name_column: str, trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the feature-importance bar diagram of a trained tree model.

    Both the rendered figure and its underlying data are saved to the
    local path and to MLflow.
    """
    print("-----* Feature Importance Diagram *-----")
    importance_values = trained_model.feature_importances_
    feature_names = X_train.columns
    data = plot_feature_importance(feature_names, importance_values, image_config)
    save_fig(f"Feature Importance - {algorithm_name}", local_path, mlflow_path)
    save_data(data, name_column, f"Feature Importance - {algorithm_name}", local_path, mlflow_path, True)

@staticmethod
def _plot_tree(trained_model: object, image_config: dict, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
Expand All @@ -337,40 +349,40 @@ def _show_formula(
save_text(formula_str, f"{algorithm_name} Formula", local_path, mlflow_path)

@staticmethod
def _plot_2d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the 2D scatter plot of the linear regression model.

    The figure and the combined feature/target data are persisted locally
    and to MLflow.
    """
    print("-----* 2D Scatter Diagram *-----")
    plot_2d_scatter_diagram(feature_data, target_data)
    diagram_title = f"2D Scatter Diagram - {algorithm_name}"
    save_fig(diagram_title, local_path, mlflow_path)
    combined = pd.concat([feature_data, target_data], axis=1)
    save_data(combined, data_name, diagram_title, local_path, mlflow_path)

@staticmethod
def _plot_2d_line_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the 2D line plot of the linear regression model.

    The figure and the combined feature/target/prediction data are
    persisted locally and to MLflow.
    """
    print("-----* 2D Line Diagram *-----")
    plot_2d_line_diagram(feature_data, target_data, y_test_predict)
    diagram_title = f"2D Line Diagram - {algorithm_name}"
    save_fig(diagram_title, local_path, mlflow_path)
    combined = pd.concat([feature_data, target_data, y_test_predict], axis=1)
    save_data(combined, data_name, diagram_title, local_path, mlflow_path)

@staticmethod
def _plot_3d_scatter_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the 3D scatter plot of the linear regression model.

    The figure and the combined feature/target data are persisted locally
    and to MLflow.
    """
    print("-----* 3D Scatter Diagram *-----")
    plot_3d_scatter_diagram(feature_data, target_data)
    diagram_title = f"3D Scatter Diagram - {algorithm_name}"
    save_fig(diagram_title, local_path, mlflow_path)
    combined = pd.concat([feature_data, target_data], axis=1)
    save_data(combined, data_name, diagram_title, local_path, mlflow_path)

@staticmethod
def _plot_3d_surface_diagram(feature_data: pd.DataFrame, target_data: pd.DataFrame, y_test_predict: pd.DataFrame, data_name: str, algorithm_name: str, local_path: str, mlflow_path: str) -> None:
    """Draw the 3D surface plot of the linear regression model.

    The figure and the combined feature/target/prediction data are
    persisted locally and to MLflow.
    """
    print("-----* 3D Surface Diagram *-----")
    plot_3d_surface_diagram(feature_data, target_data, y_test_predict)
    diagram_title = f"3D Surface Diagram - {algorithm_name}"
    save_fig(diagram_title, local_path, mlflow_path)
    combined = pd.concat([feature_data, target_data, y_test_predict], axis=1)
    save_data(combined, data_name, diagram_title, local_path, mlflow_path)


class ClusteringMetricsMixin:
Expand Down
Loading

0 comments on commit f81fc36

Please sign in to comment.