feat: use environment variable to formalize the output path. #188

Merged · 1 commit · Jul 24, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@

# all produced data set under the directory output
**/output
+ **/geopi_output

# all saved trained model under the directory trained model
**/trained_models
85 changes: 43 additions & 42 deletions geochemistrypi/data_mining/cli_pipeline.py
@@ -10,7 +10,6 @@
from .constants import (
CLASSIFICATION_MODELS,
CLUSTERING_MODELS,
- DATASET_OUTPUT_PATH,
DECOMPOSITION_MODELS,
FEATURE_SCALING_STRATEGY,
GEO_IMAGE_PATH,
@@ -19,7 +18,6 @@
MLFLOW_ARTIFACT_DATA_PATH,
MODE_OPTION,
MODEL_OUTPUT_IMAGE_PATH,
- MODEL_PATH,
NON_AUTOML_MODELS,
OPTION,
OUTPUT_PATH,
@@ -40,16 +38,14 @@
from .process.cluster import ClusteringModelSelection
from .process.decompose import DecompositionModelSelection
from .process.regress import RegressionModelSelection
- from .utils.base import clear_output, log, save_data, show_warning
+ from .utils.base import clear_output, create_geopi_output_dir, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id

# create the directories if they didn't exist yet
os.makedirs(MODEL_OUTPUT_IMAGE_PATH, exist_ok=True)
os.makedirs(STATISTIC_IMAGE_PATH, exist_ok=True)
- os.makedirs(DATASET_OUTPUT_PATH, exist_ok=True)
os.makedirs(MAP_IMAGE_PATH, exist_ok=True)
os.makedirs(GEO_IMAGE_PATH, exist_ok=True)
- os.makedirs(MODEL_PATH, exist_ok=True)


def cli_pipeline(file_name: str) -> None:
@@ -110,6 +106,7 @@ def cli_pipeline(file_name: str) -> None:
run_tag = Prompt.ask("✨ Run Tag Version", default="R - v1.0.0")
run_description = Prompt.ask("✨ Run Description", default="Use xgboost for GeoPi classification.")
mlflow.start_run(run_name=run_name, experiment_id=experiment.experiment_id, tags={"version": run_tag, "description": run_description})
+ create_geopi_output_dir(experiment.name, run_name)
clear_output()

# <--- Built-in Data Loading --->
@@ -140,44 +137,46 @@ def cli_pipeline(file_name: str) -> None:
logger.debug("Data Selection")
print("-*-*- Data Selection -*-*-")
show_data_columns(data.columns)
- data_processed = create_sub_data_set(data)
+ data_selected = create_sub_data_set(data)
clear_output()
print("The Selected Data Set:")
- print(data_processed)
+ print(data_selected)
clear_output()
print("Basic Statistical Information: ")
- basic_info(data_processed)
- basic_statistic(data_processed)
- correlation_plot(data_processed.columns, data_processed)
- distribution_plot(data_processed.columns, data_processed)
- logged_distribution_plot(data_processed.columns, data_processed)
- save_data(data_processed, "Data Selected", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ basic_info(data_selected)
+ basic_statistic(data_selected)
+ correlation_plot(data_selected.columns, data_selected)
+ distribution_plot(data_selected.columns, data_selected)
+ logged_distribution_plot(data_selected.columns, data_selected)
+ GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
+ save_data(data, "Data Original", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ save_data(data_selected, "Data Selected", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# <--- Imputation --->
logger.debug("Imputation")
print("-*-*- Imputation -*-*-")
- is_null_value(data_processed)
- ratio_null_vs_filled(data_processed)
- imputed_flag = is_imputed(data_processed)
+ is_null_value(data_selected)
+ ratio_null_vs_filled(data_selected)
+ imputed_flag = is_imputed(data_selected)
clear_output()
if imputed_flag:
print("-*-*- Strategy for Missing Values -*-*-")
num2option(IMPUTING_STRATEGY)
print("Which strategy do you want to apply?")
strategy_num = limit_num_input(IMPUTING_STRATEGY, SECTION[1], num_input)
- data_processed_imputed_np = imputer(data_processed, IMPUTING_STRATEGY[strategy_num - 1])
- data_processed_imputed = np2pd(data_processed_imputed_np, data_processed.columns)
- del data_processed_imputed_np
+ data_selected_imputed_np = imputer(data_selected, IMPUTING_STRATEGY[strategy_num - 1])
+ data_selected_imputed = np2pd(data_selected_imputed_np, data_selected.columns)
+ del data_selected_imputed_np
clear_output()
print("-*-*- Hypothesis Testing on Imputation Method -*-*-")
print("Null Hypothesis: The distributions of the data set before and after imputing remain the same.")
print("Thoughts: Check which column rejects null hypothesis.")
print("Statistics Test Method: Wilcoxon Test")
monte_carlo_simulator(
- data_processed,
- data_processed_imputed,
- sample_size=data_processed_imputed.shape[0] // 2,
+ data_selected,
+ data_selected_imputed,
+ sample_size=data_selected_imputed.shape[0] // 2,
iteration=100,
test="wilcoxon",
confidence=0.05,
@@ -186,20 +185,22 @@ def cli_pipeline(file_name: str) -> None:
# print("The statistics test method: Kruskal Wallis Test")
# monte_carlo_simulator(data_processed, data_processed_imputed, sample_size=50,
# iteration=100, test='kruskal', confidence=0.05)
- probability_plot(data_processed.columns, data_processed, data_processed_imputed)
- basic_info(data_processed_imputed)
- basic_statistic(data_processed_imputed)
- del data_processed
+ probability_plot(data_selected.columns, data_selected, data_selected_imputed)
+ basic_info(data_selected_imputed)
+ basic_statistic(data_selected_imputed)
+ save_data(data_selected_imputed, "Data Selected Imputed", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ del data_selected
clear_output()
else:
# if the selected data set doesn't need imputation, which means there are no missing values.
- data_processed_imputed = data_processed
+ data_selected_imputed = data_selected

# <--- Feature Engineering --->
logger.debug("Feature Engineering")
- feature_built = FeatureConstructor(data_processed_imputed)
+ feature_built = FeatureConstructor(data_selected_imputed)
feature_built.process_feature_engineering()
- data_processed_imputed = feature_built.data
+ data_selected_imputed_fe = feature_built.data
+ del data_selected_imputed

# <--- Mode Selection --->
logger.debug("Mode Selection")
@@ -216,15 +217,15 @@ def cli_pipeline(file_name: str) -> None:
print("Divide the processing data set into X (feature value) and Y (target value) respectively.")
# create X data set
print("Selected sub data set to create X data set:")
- show_data_columns(data_processed_imputed.columns)
+ show_data_columns(data_selected_imputed_fe.columns)
print("The selected X data set:")
- X = create_sub_data_set(data_processed_imputed)
+ X = create_sub_data_set(data_selected_imputed_fe)
print("Successfully create X data set.")
print("The Selected Data Set:")
print(X)
print("Basic Statistical Information: ")
basic_statistic(X)
save_data(X, "X Without Scaling", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(X, "X Without Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# <--- Feature Scaling --->
@@ -242,23 +243,23 @@ def cli_pipeline(file_name: str) -> None:
print(X)
print("Basic Statistical Information: ")
basic_statistic(X)
save_data(X, "X With Scaling", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(X, "X With Scaling", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# create Y data set
print("-*-*- Data Split - X Set and Y Set-*-*-")
print("Selected sub data set to create Y data set:")
- show_data_columns(data_processed_imputed.columns)
+ show_data_columns(data_selected_imputed_fe.columns)
print("The selected Y data set:")
print("Notice: Normally, please choose only one column to be tag column Y, not multiple columns.")
print("Notice: For classification model training, please choose the label column which has distinctive integers.")
- y = create_sub_data_set(data_processed_imputed)
+ y = create_sub_data_set(data_selected_imputed_fe)
print("Successfully create Y data set.")
print("The Selected Data Set:")
print(y)
print("Basic Statistical Information: ")
basic_statistic(y)
save_data(y, "y", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(y, "Y", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()

# create training data and testing data
@@ -272,15 +273,15 @@ def cli_pipeline(file_name: str) -> None:
print(value)
print(f"Basic Statistical Information: {key}")
basic_statistic(value)
- save_data(value, key, DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
- X_train, X_test = train_test_data["X train"], train_test_data["X test"]
- y_train, y_test = train_test_data["y train"], train_test_data["y test"]
- del data_processed_imputed
+ save_data(value, key, GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
+ X_train, X_test = train_test_data["X Train"], train_test_data["X Test"]
+ y_train, y_test = train_test_data["Y Train"], train_test_data["Y Test"]
+ del data_selected_imputed_fe
clear_output()
else:
# unsupervised learning
- X = data_processed_imputed
- X_train = data_processed_imputed
+ X = data_selected_imputed_fe
+ X_train = data_selected_imputed_fe
y, X_test, y_train, y_test = None, None, None, None

# <--- Model Selection --->
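Note: `create_geopi_output_dir`, newly imported from `.utils.base` above, is defined outside this diff. A minimal sketch of what such a helper might look like, assuming it builds a per-run directory tree under `geopi_output` and publishes the artifacts path through the same `GEOPI_OUTPUT_ARTIFACTS_DATA_PATH` environment variable that `cli_pipeline.py` later reads back with `os.getenv` (the directory layout below is an assumption, not the actual implementation):

```python
import os


def create_geopi_output_dir(experiment_name: str, run_name: str) -> None:
    """Sketch: create per-run output directories and export their paths.

    The real helper lives in geochemistrypi/data_mining/utils/base.py and is
    not shown in this diff; the layout here is illustrative only.
    """
    # Root of all output for this run: <cwd>/geopi_output/<experiment>/<run>
    output_path = os.path.join(os.getcwd(), "geopi_output", experiment_name, run_name)
    # Where save_data() drops intermediate data sets for this run
    artifacts_data_path = os.path.join(output_path, "artifacts", "data")
    os.makedirs(artifacts_data_path, exist_ok=True)
    # Publish the path through the process environment so any module can
    # fetch it with os.getenv() instead of importing a hard-coded constant.
    os.environ["GEOPI_OUTPUT_ARTIFACTS_DATA_PATH"] = artifacts_data_path
```

Setting the path after `mlflow.start_run()` lets it embed the experiment and run names, which the removed import-time constants (`DATASET_OUTPUT_PATH`, `MODEL_PATH`) could not.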
2 changes: 1 addition & 1 deletion geochemistrypi/data_mining/constants.py
@@ -13,7 +13,7 @@
BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")

# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "output")
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")

# the directory where the data set produced stays
DATASET_OUTPUT_PATH = os.path.join(OUTPUT_PATH, "data")
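Together with the `.gitignore` entry above, all generated output now lands under a `geopi_output` folder in the working directory. A quick sketch of how the constant resolves, assuming `WORKING_PATH` is derived from the current working directory earlier in `constants.py`:

```python
import os

WORKING_PATH = os.getcwd()  # assumption: defined near the top of constants.py
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
# e.g. /home/user/my-project/geopi_output
```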
25 changes: 21 additions & 4 deletions geochemistrypi/data_mining/data/data_readiness.py
@@ -1,6 +1,6 @@
import os
import sys
- from typing import List, Optional, Tuple, Union
+ from typing import Dict, List, Optional, Tuple, Union

import openpyxl.utils.exceptions
import pandas as pd
@@ -154,10 +154,27 @@ def create_sub_data_set(data: pd.DataFrame) -> pd.DataFrame:
return sub_data_set


- def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> dict:
- """Split arrays or matrices into random train and test subsets."""
+ def data_split(X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], test_size: float = 0.2) -> Dict:
+ """Split arrays or matrices into random train and test subsets.
+
+ Parameters
+ ----------
+ X : pd.DataFrame
+ The data to be split.
+
+ y : pd.DataFrame or pd.Series
+ The target variable to be split.
+
+ test_size : float, default=0.2
+ Represents the proportion of the dataset to include in the test split.
+
+ Returns
+ -------
+ dict
+ A dictionary containing the split data.
+ """
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
return {"X train": X_train, "X test": X_test, "y train": y_train, "y test": y_test}
return {"X Train": X_train, "X Test": X_test, "Y Train": y_train, "Y Test": y_test}


def num2option(items: List[str]) -> None:
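Because the returned keys are now Title Case, callers must unpack with the new names, as the `cli_pipeline.py` hunk above already does. A small usage sketch with toy data (the import path assumes an installed `geochemistrypi` package):

```python
import pandas as pd

from geochemistrypi.data_mining.data.data_readiness import data_split

# Toy feature matrix and target variable.
X = pd.DataFrame({"SiO2": [50.1, 47.3, 61.0, 55.2], "MgO": [7.2, 8.9, 3.1, 5.5]})
y = pd.Series([0, 1, 0, 1], name="label")

train_test_data = data_split(X, y, test_size=0.25)
# The old lowercase keys ("X train", "y train", ...) would now raise KeyError.
X_train, X_test = train_test_data["X Train"], train_test_data["X Test"]
y_train, y_test = train_test_data["Y Train"], train_test_data["Y Test"]
```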
39 changes: 19 additions & 20 deletions geochemistrypi/data_mining/data/feature_engineering.py
@@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
+ import os
import string

import numpy as np
import pandas as pd
from rich import print

- from ..constants import DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH, OPTION, SECTION
+ from ..constants import MLFLOW_ARTIFACT_DATA_PATH, OPTION, SECTION
from ..plot.statistic_plot import basic_statistic
from ..utils.base import clear_output, save_data
from .data_readiness import basic_info, limit_num_input, num2option, num_input, show_data_columns
@@ -19,37 +20,34 @@ class FeatureConstructor(object):
alphabet = string.ascii_letters
cal_words = ["pow", "sin", "cos", "tan", "pi", "mean", "std", "var", "log"]

- def __init__(self, data):
+ def __init__(self, data: pd.DataFrame) -> None:
self.feature_name = None
self.data = data
self._infix_expr = []
self._postfix_expr = []
self.map_dict = {}
self._result = None

- def index2name(self):
- """Pattern: [letter : column name], e.g. a : 1st column name; b : 2nd column name
-
- :return: index : column name, dict
- """
+ def index2name(self) -> None:
+ """Pattern: [letter : column name], e.g. a : 1st column name; b : 2nd column name."""
columns_name = self.data.columns
print("Selected data set:")
for i in range(len(columns_name)):
print(FeatureConstructor.alphabet[i] + " - " + columns_name[i])
self.map_dict[FeatureConstructor.alphabet[i]] = columns_name[i]

- def _get_column(self, index):
+ def _get_column(self, index: str) -> str:
return self.map_dict[index]

- def name_feature(self):
+ def name_feature(self) -> None:
while True:
self.feature_name = input("Name the constructed feature (column name), like 'NEW-COMPOUND': \n" "@input: ")
if len(self.feature_name) == 0:
print("Sorry!You haven't named it yet!")
else:
break

- def input_expression(self):
+ def input_expression(self) -> None:
expression = input(
"Build up new feature with the combination of 4 basic arithmatic operator,"
" including '+', '-', '*', '/', '()'.\n"
@@ -103,7 +101,8 @@ def input_expression(self):
else:
break

- def evaluate(self):
+ def evaluate(self) -> None:
+ """Evaluate the expression."""
self.letter_map()
np.array(["dummy"]) # dummy array to skip the flake8 warning - F401 'numpy as np' imported but unused'
self._infix_expr = self._infix_expr.replace("sin", "np.sin")
@@ -127,7 +126,8 @@ def evaluate(self):
except ZeroDivisionError:
print("The expression contains a division by zero.")

- def letter_map(self):
+ def letter_map(self) -> None:
+ """Map the letter to the column name."""
new_text = ""
test_text = "".join(ch for ch in self._infix_expr if ch not in set(" "))
for words in FeatureConstructor.cal_words:
@@ -147,12 +147,14 @@ def letter_map(self) -> None:
else:
self._infix_expr += ww

- def process_feature_engineering(self):
+ def process_feature_engineering(self) -> None:
+ """Process the feature engineering."""
print("-*-*- Feature Engineering -*-*-")
print("The Selected Data Set:")
show_data_columns(self.data.columns)
fe_flag = 0
- is_feature_engineering = 0
+ GEOPI_OUTPUT_ARTIFACTS_DATA_PATH = os.getenv("GEOPI_OUTPUT_ARTIFACTS_DATA_PATH")
while True:
if fe_flag != 1:
print("Feature Engineering Option:")
@@ -179,20 +181,17 @@ def process_feature_engineering(self):
clear_output()
continue
else:
save_data(self.data, "Data Before Splitting", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(self.data, "Data Selected Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
print("Exit Feature Engineering Mode.")
clear_output()
break
else:
save_data(self.data, "Data Before Splitting", DATASET_OUTPUT_PATH, MLFLOW_ARTIFACT_DATA_PATH)
save_data(self.data, "Data Selected Imputed Feature-Engineering", GEOPI_OUTPUT_ARTIFACTS_DATA_PATH, MLFLOW_ARTIFACT_DATA_PATH)
clear_output()
break

- def create_data_set(self):
+ def create_data_set(self) -> pd.DataFrame:
+ """Create a new data set with the new feature."""
print(f'Successfully construct a new feature "{self.feature_name}".')
print(self._result)
return pd.concat([self.data, self._result], axis=1)

- # TODO: Is the scope of input right?
- def check_data_scope(self):
- pass
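For context on what `letter_map` and `evaluate` do at runtime: letters stand in for column names, the user's infix expression is rewritten into real column accesses, math words are routed through NumPy (hence the `sin` to `np.sin` substitutions and the `ZeroDivisionError` handler above), and the rewritten string appears to be evaluated with `eval`. A self-contained sketch of that idea, not the class's exact code:

```python
import re

import numpy as np
import pandas as pd

data = pd.DataFrame({"SiO2": [50.1, 47.3], "Al2O3": [15.0, 16.2]})
map_dict = {"a": "SiO2", "b": "Al2O3"}  # a: 1st column, b: 2nd column

# The user builds a feature over letters, e.g. "log(a) + b / 2".
user_expr = "log(a) + b / 2"
# evaluate()-style rewrite: route math words through NumPy.
infix_expr = re.sub(r"\blog\b", "np.log", user_expr)
# letter_map-style rewrite: swap each letter for a column access.
for letter, column in map_dict.items():
    infix_expr = re.sub(rf"\b{letter}\b", f"data['{column}']", infix_expr)

new_feature = eval(infix_expr)  # assumption: evaluate() relies on eval()
print(new_feature)  # a pandas Series, ready to concat as the new column
```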