Merge pull request #188 from ZJUEarthData/web
feat: use environment variable to formalize the output path.
SanyHe authored Jul 24, 2023
2 parents 0bf714f + db12aa0 commit eaeb76e
Showing 14 changed files with 190 additions and 106 deletions.
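
The heart of this PR: instead of having every module import fixed path constants (DATASET_OUTPUT_PATH, MODEL_PATH) that are computed at import time, the pipeline now builds a run-scoped output directory once per MLflow run and publishes its subpaths through environment variables; downstream code reads them back with os.getenv. A minimal sketch of the pattern, where the directory layout and the helper body are assumptions (only the call create_geopi_output_dir(experiment.name, run_name) and the variable name GEOPI_OUTPUT_ARTIFACTS_DATA_PATH appear verbatim in this diff):

import os

def create_geopi_output_dir(experiment_name: str, run_name: str) -> None:
    # Hypothetical sketch: build a per-run output tree under ./geopi_output
    # and export its data directory so any module in the same process can
    # find it without importing a constant fixed at import time.
    artifacts_data_path = os.path.join(os.getcwd(), "geopi_output", experiment_name, run_name, "artifacts", "data")
    os.makedirs(artifacts_data_path, exist_ok=True)
    os.environ["GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"] = artifacts_data_path

# Consumer side, as cli_pipeline.py and feature_engineering.py do below:
# GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")

One benefit of this design: os.getenv is evaluated at call time, so the output location can depend on the experiment and run names chosen interactively, which module-level constants cannot.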
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@
 
 # all produced data set under the directory output
 **/output
+**/geopi_output
 
 # all saved trained model under the directory trained model
 **/trained_models
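
Note on the pattern: the **/ prefix makes Git ignore a geopi_output directory at any depth, not only at the repository root. For example, both of these hypothetical paths would now be excluded:

geopi_output/FirstExperiment/FirstRun/artifacts/data/Data_Selected.csv
notebooks/geopi_output/FirstExperiment/FirstRun/artifacts/data/Data_Selected.csv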
85 changes: 43 additions & 42 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -10,7 +10,6 @@
 from .constants import (
     CLASSIFICATION_MODELS,
     CLUSTERING_MODELS,
-    DATASET_OUTPUT_PATH,
     DECOMPOSITION_MODELS,
     FEATURE_SCALING_STRATEGY,
     GEO_IMAGE_PATH,
@@ -19,7 +18,6 @@
     MLFLOW_ARTIFACT_DATA_PATH,
     MODE_OPTION,
     MODEL_OUTPUT_IMAGE_PATH,
-    MODEL_PATH,
     NON_AUTOML_MODELS,
     OPTION,
     OUTPUT_PATH,
@@ -40,16 +38,14 @@
 from .process.cluster import ClusteringModelSelection
 from .process.decompose import DecompositionModelSelection
 from .process.regress import RegressionModelSelection
-from .utils.base import clear_output, log, save_data, show_warning
+from .utils.base import clear_output, create_geopi_output_dir, log, save_data, show_warning
 from .utils.mlflow_utils import retrieve_previous_experiment_id
 
 # create the directories if they didn't exist yet
 os.makedirs(MODEL_OUTPUT_IMAGE_PATH, exist_ok=True)
 os.makedirs(STATISTIC_IMAGE_PATH, exist_ok=True)
-os.makedirs(DATASET_OUTPUT_PATH, exist_ok=True)
 os.makedirs(MAP_IMAGE_PATH, exist_ok=True)
 os.makedirs(GEO_IMAGE_PATH, exist_ok=True)
-os.makedirs(MODEL_PATH, exist_ok=True)
 
 
 def cli_pipeline(file_name: str) -> None:
@@ -110,6 +106,7 @@ def cli_pipeline(file_name: str) -> None:
     run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
     run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
     mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})
+    create_geopi_output_dir(experiment.name, run_name)
     clear_output()
 
     # <--- Built-in Data Loading --->
@@ -140,44 +137,46 @@ def cli_pipeline(file_name: str) -> None:
     logger.debug("Data Selection")
     print("-*-*- Data Selection -*-*-")
     show_data_columns(data.columns)
-    data_processed = create_sub_data_set(data)
+    data_selected = create_sub_data_set(data)
     clear_output()
     print("The Selected Data Set:")
-    print(data_processed)
+    print(data_selected)
     clear_output()
     print("Basic Statistical Information: ")
-    basic_info(data_processed)
-    basic_statistic(data_processed)
-    correlation_plot(data_processed.columns, data_processed)
-    distribution_plot(data_processed.columns, data_processed)
-    logged_distribution_plot(data_processed.columns, data_processed)
-    save_data(data_processed, "Data Selected", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+    basic_info(data_selected)
+    basic_statistic(data_selected)
+    correlation_plot(data_selected.columns, data_selected)
+    distribution_plot(data_selected.columns, data_selected)
+    logged_distribution_plot(data_selected.columns, data_selected)
+    GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
+    save_data(data, "Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+    save_data(data_selected, "Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
     clear_output()
 
     # <--- Imputation --->
     logger.debug("Imputation")
     print("-*-*- Imputation -*-*-")
-    is_null_value(data_processed)
-    ratio_null_vs_filled(data_processed)
-    imputed_flag = is_imputed(data_processed)
+    is_null_value(data_selected)
+    ratio_null_vs_filled(data_selected)
+    imputed_flag = is_imputed(data_selected)
     clear_output()
     if imputed_flag:
         print("-*-*- Strategy for Missing Values -*-*-")
         num2option(IMPUTING_STRATEGY)
         print("Which strategy do you want to apply?")
         strategy_num = limit_num_input(IMPUTING_STRATEGY, SECTION[1], num_input)
-        data_processed_imputed_np = imputer(data_processed, IMPUTING_STRATEGY[strategy_num - 1])
-        data_processed_imputed = np2pd(data_processed_imputed_np, data_processed.columns)
-        del data_processed_imputed_np
+        data_selected_imputed_np = imputer(data_selected, IMPUTING_STRATEGY[strategy_num - 1])
+        data_selected_imputed = np2pd(data_selected_imputed_np, data_selected.columns)
+        del data_selected_imputed_np
         clear_output()
         print("-*-*- Hypothesis Testing on Imputation Method -*-*-")
         print("Null Hypothesis: The distributions of the data set before and after imputing remain the same.")
         print("Thoughts: Check which column rejects null hypothesis.")
         print("Statistics Test Method: Wilcoxon Test")
         monte_carlo_simulator(
-            data_processed,
-            data_processed_imputed,
-            sample_size=data_processed_imputed.shape[0] // 2,
+            data_selected,
+            data_selected_imputed,
+            sample_size=data_selected_imputed.shape[0] // 2,
             iteration=100,
             test="wilcoxon",
             confidence=0.05,
@@ -186,20 +185,22 @@ def cli_pipeline(file_name: str) -> None:
         # print("The statistics test method: Kruskal Wallis Test")
         # monte_carlo_simulator(data_processed, data_processed_imputed, sample_size=50,
         #                       iteration=100, test='kruskal', confidence=0.05)
-        probability_plot(data_processed.columns, data_processed, data_processed_imputed)
-        basic_info(data_processed_imputed)
-        basic_statistic(data_processed_imputed)
-        del data_processed
+        probability_plot(data_selected.columns, data_selected, data_selected_imputed)
+        basic_info(data_selected_imputed)
+        basic_statistic(data_selected_imputed)
+        save_data(data_selected_imputed, "Data Selected Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        del data_selected
         clear_output()
     else:
         # if the selected data set doesn't need imputation, which means there are no missing values.
-        data_processed_imputed = data_processed
+        data_selected_imputed = data_selected
 
     # <--- Feature Engineering --->
     logger.debug("Feature Engineering")
-    feature_built = FeatureConstructor(data_processed_imputed)
+    feature_built = FeatureConstructor(data_selected_imputed)
     feature_built.process_feature_engineering()
-    data_processed_imputed = feature_built.data
+    data_selected_imputed_fe = feature_built.data
+    del data_selected_imputed
 
     # <--- Mode Selection --->
     logger.debug("Mode Selection")
@@ -216,15 +217,15 @@ def cli_pipeline(file_name: str) -> None:
         print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
         # create X data set
         print("Selected sub data set to create X data set:")
-        show_data_columns(data_processed_imputed.columns)
+        show_data_columns(data_selected_imputed_fe.columns)
         print("The selected X data set:")
-        X = create_sub_data_set(data_processed_imputed)
+        X = create_sub_data_set(data_selected_imputed_fe)
         print("Successfully create X data set.")
         print("The Selected Data Set:")
         print(X)
         print("Basic Statistical Information: ")
         basic_statistic(X)
-        save_data(X, "X Without Scaling", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        save_data(X, "X Without Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
         clear_output()
 
         # <--- Feature Scaling --->
@@ -242,23 +243,23 @@ def cli_pipeline(file_name: str) -> None:
         print(X)
         print("Basic Statistical Information: ")
         basic_statistic(X)
-        save_data(X, "X With Scaling", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
         clear_output()
 
         # create Y data set
         print("-*-*- Data Split - X Set and Y Set-*-*-")
         print("Selected sub data set to create Y data set:")
-        show_data_columns(data_processed_imputed.columns)
+        show_data_columns(data_selected_imputed_fe.columns)
         print("The selected Y data set:")
         print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
         print("Notice: For classification model training, please choose the label column which has distinctive integers.")
-        y = create_sub_data_set(data_processed_imputed)
+        y = create_sub_data_set(data_selected_imputed_fe)
         print("Successfully create Y data set.")
         print("The Selected Data Set:")
         print(y)
         print("Basic Statistical Information: ")
         basic_statistic(y)
-        save_data(y, "y", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
         clear_output()
 
         # create training data and testing data
@@ -272,15 +273,15 @@ def cli_pipeline(file_name: str) -> None:
             print(value)
             print(f"Basic Statistical Information: {key}")
             basic_statistic(value)
-            save_data(value, key, DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
-        X_train, X_test = train_test_data["X train"], train_test_data["X test"]
-        y_train, y_test = train_test_data["y train"], train_test_data["y test"]
-        del data_processed_imputed
+            save_data(value, key, GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+        X_train, X_test = train_test_data["X Train"], train_test_data["X Test"]
+        y_train, y_test = train_test_data["Y Train"], train_test_data["Y Test"]
+        del data_selected_imputed_fe
         clear_output()
     else:
         # unsupervised learning
-        X = data_processed_imputed
-        X_train = data_processed_imputed
+        X = data_selected_imputed_fe
+        X_train = data_selected_imputed_fe
         y, X_test, y_train, y_test = None, None, None, None
 
     # <--- Model Selection --->
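
Every save_data call above now receives the run-scoped local directory (read back from the environment) plus an MLflow artifact path. save_data's body is not part of this diff; a plausible sketch of its contract, with the CSV naming scheme and the mlflow call as assumptions:

import os

import mlflow
import pandas as pd

def save_data(df: pd.DataFrame, name: str, local_path: str, mlflow_artifact_path: str) -> None:
    # Assumed behavior: write the DataFrame into the run-scoped directory,
    # then log the same file as an artifact of the active MLflow run.
    file_path = os.path.join(local_path, f"{name}.csv")  # hypothetical naming scheme
    df.to_csv(file_path, index=False)
    mlflow.log_artifact(file_path, artifact_path=mlflow_artifact_path)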
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
@@ -13,7 +13,7 @@
 BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")
 
 # the root directory where all the output stays
-OUTPUT_PATH = os.path.join(WORKING_PATH, "output")
+OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
 
 # the directory where the data set produced stays
 DATASET_OUTPUT_PATH = os.path.join(OUTPUT_PATH, "data")
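
Because the remaining output constants in constants.py are derived from OUTPUT_PATH (as DATASET_OUTPUT_PATH is above), renaming the root folder relocates the whole legacy tree in one line. A quick sketch of the resulting resolution (defining WORKING_PATH as the current working directory is an assumption here):

import os

WORKING_PATH = os.getcwd()  # assumption: how WORKING_PATH is defined upstream
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
DATASET_OUTPUT_PATH = os.path.join(OUTPUT_PATH, "data")
print(DATASET_OUTPUT_PATH)  # resolves to <cwd>/geopi_output/data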
25 changes: 21 additions & 4 deletions geochemistrypi/data_mining/data/data_readiness.py
@@ -1,6 +1,6 @@
 import os
 import sys
-from typing import List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import openpyxl.utils.exceptions
 import pandas as pd
@@ -154,10 +154,27 @@ def create_sub_data_set(data: pd.DataFrame) -> pd.DataFrame:
     return sub_data_set
 
 
-def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> dict:
-    """Split arrays or matrices into random train and test subsets."""
+def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> Dict:
+    """Split arrays or matrices into random train and test subsets.
+
+    Parameters
+    ----------
+    X : pd.DataFrame
+        The data to be split.
+
+    y : pd.DataFrame or pd.Series
+        The target variable to be split.
+
+    test_size : float, default=0.2
+        Represents the proportion of the dataset to include in the test split.
+
+    Returns
+    -------
+    dict
+        A dictionary containing the split data.
+    """
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
-    return {"X train": X_train, "X test": X_test, "y train": y_train, "y test": y_test}
+    return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test}
 
 
 def num2option(items: List[str]) -> None:
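
A quick usage sketch of data_split as defined above. Note that callers must now use the capitalized keys ("X Train", "Y Test"), which is exactly what the cli_pipeline.py hunk switches to; the toy data below is illustrative only:

import pandas as pd

from geochemistrypi.data_mining.data.data_readiness import data_split

# Toy data; the column names are hypothetical.
X = pd.DataFrame({"SiO2": [50.1, 63.2, 71.5, 48.9, 55.3], "MgO": [7.8, 2.1, 0.9, 8.4, 5.6]})
y = pd.Series([0, 1, 1, 0, 0], name="label")

train_test_data = data_split(X, y, test_size=0.2)
X_train, X_test = train_test_data["X Train"], train_test_data["X Test"]
y_train, y_test = train_test_data["Y Train"], train_test_data["Y Test"]
print(X_train.shape, X_test.shape)  # (4, 2) (1, 2)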
39 changes: 19 additions & 20 deletions geochemistrypi/data_mining/data/feature_engineering.py
@@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
+import os
 import string
 
 import numpy as np
 import pandas as pd
 from rich import print
 
-from ..constants import DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH, OPTION, SECTION
+from ..constants import MLFLOW_ARTIFACT_DATA_PATH, OPTION, SECTION
 from ..plot.statistic_plot import basic_statistic
 from ..utils.base import clear_output, save_data
 from .data_readiness import basic_info, limit_num_input, num2option, num_input, show_data_columns
@@ -19,37 +20,34 @@ class FeatureConstructor(object):
     alphabet = string.ascii_letters
     cal_words = ["pow", "sin", "cos", "tan", "pi", "mean", "std", "var", "log"]
 
-    def __init__(self, data):
+    def __init__(self, data: pd.DataFrame) -> None:
         self.feature_name = None
         self.data = data
         self._infix_expr = []
         self._postfix_expr = []
         self.map_dict = {}
         self._result = None
 
-    def index2name(self):
-        """Pattern: [letter : column name], e.g. a : 1st column name; b : 2nd column name
-
-        :return: index : column name, dict
-        """
+    def index2name(self) -> None:
+        """Pattern: [letter : column name], e.g. a : 1st column name; b : 2nd column name."""
         columns_name = self.data.columns
         print("Selected data set:")
         for i in range(len(columns_name)):
             print(FeatureConstructor.alphabet[i] + " - " + columns_name[i])
             self.map_dict[FeatureConstructor.alphabet[i]] = columns_name[i]
 
-    def _get_column(self, index):
+    def _get_column(self, index: str) -> str:
         return self.map_dict[index]
 
-    def name_feature(self):
+    def name_feature(self) -> None:
         while True:
             self.feature_name = input("Name the constructed feature (column name), like 'NEW-COMPOUND': \n" "@input: ")
             if len(self.feature_name) == 0:
                 print("Sorry!You haven't named it yet!")
             else:
                 break
 
-    def input_expression(self):
+    def input_expression(self) -> None:
         expression = input(
             "Build up new feature with the combination of 4 basic arithmatic operator,"
             " including '+', '-', '*', '/', '()'.\n"
@@ -103,7 +101,8 @@ def input_expression(self):
             else:
                 break
 
-    def evaluate(self):
+    def evaluate(self) -> None:
+        """Evaluate the expression."""
         self.letter_map()
         np.array(["dummy"])  # dummy array to skip the flake8 warning - F401 'numpy as np' imported but unused'
         self._infix_expr = self._infix_expr.replace("sin", "np.sin")
@@ -127,7 +126,8 @@ def evaluate(self):
         except ZeroDivisionError:
             print("The expression contains a division by zero.")
 
-    def letter_map(self):
+    def letter_map(self) -> None:
+        """Map the letter to the column name."""
         new_text = ""
         test_text = "".join(ch for ch in self._infix_expr if ch not in set(" "))
         for words in FeatureConstructor.cal_words:
@@ -147,12 +147,14 @@ def letter_map(self):
             else:
                 self._infix_expr += ww
 
-    def process_feature_engineering(self):
+    def process_feature_engineering(self) -> None:
+        """Process the feature engineering."""
         print("-*-*- Feature Engineering -*-*-")
         print("The Selected Data Set:")
         show_data_columns(self.data.columns)
         fe_flag = 0
         is_feature_engineering = 0
+        GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
         while True:
             if fe_flag != 1:
                 print("Feature Engineering Option:")
@@ -179,20 +181,17 @@ def process_feature_engineering(self):
                     clear_output()
                     continue
                 else:
-                    save_data(self.data, "Data Before Splitting", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+                    save_data(self.data, "Data Selected Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
                     print("Exit Feature Engineering Mode.")
                     clear_output()
                     break
             else:
-                save_data(self.data, "Data Before Splitting", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+                save_data(self.data, "Data Selected Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
                 clear_output()
                 break
 
-    def create_data_set(self):
+    def create_data_set(self) -> pd.DataFrame:
+        """Create a new data set with the new feature."""
         print(f'Successfully construct a new feature "{self.feature_name}".')
         print(self._result)
         return pd.concat([self.data, self._result], axis=1)
-
-    # TODO: Is the scope of input right?
-    def check_data_scope(self):
-        pass
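
The FeatureConstructor workflow above boils down to: assign each column a letter, let the user type an arithmetic expression over those letters, translate the letters back into column references, and evaluate the result with numpy available. A condensed, standalone sketch of that idea (simplified; the real class also validates input, supports function names like sin and log, and catches division by zero, none of which this sketch handles):

import string

import numpy as np
import pandas as pd

def construct_feature(data: pd.DataFrame, expr: str, name: str) -> pd.DataFrame:
    # Map letters to columns: a -> 1st column, b -> 2nd column, ...
    letter_to_col = {string.ascii_letters[i]: col for i, col in enumerate(data.columns)}
    # Rewrite each letter into a column lookup, e.g. "a" -> data["SiO2"].
    # Only single letters, digits, and operators survive this naive pass.
    rewritten = "".join(
        f'data["{letter_to_col[ch]}"]' if ch in letter_to_col else ch for ch in expr
    )
    # Evaluate with numpy in scope, mirroring FeatureConstructor.evaluate().
    result = eval(rewritten, {"np": np, "data": data})
    return pd.concat([data, result.rename(name)], axis=1)

# Example: new feature = a + b * 2 over toy, hypothetical data
df = pd.DataFrame({"SiO2": [50.1, 63.2], "MgO": [7.8, 2.1]})
print(construct_feature(df, "a + b * 2", "NEW-COMPOUND"))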
(Diffs for the remaining 9 changed files did not load and are not shown.)
