Skip to content

Commit

Permalink
Merge pull request #383 from ZJUEarthData/dev/Jin
Browse files Browse the repository at this point in the history
feat:Added the function of switching between APP mode and standard us…
  • Loading branch information
SanyHe committed Sep 15, 2024
2 parents 817f843 + 8d15ef4 commit 15f18b4
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 2 deletions.
16 changes: 15 additions & 1 deletion geochemistrypi/data_mining/cli_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@
REGRESSION_MODELS_WITH_MISSING_VALUES,
SECTION,
TEST_DATA_OPTION,
TOGGLE_ADDRESS_STATUS,
WORKING_PATH,
)
from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns
from .data.data_readiness import basic_info, create_sub_data_set, data_split, float_input, limit_num_input, np2pd, num2option, num_input, read_data, show_data_columns, show_excel_columns
from .data.feature_engineering import FeatureConstructor
from .data.imputation import imputer
from .data.inference import build_transform_pipeline, model_inference
Expand All @@ -47,6 +48,7 @@
from .process.regress import RegressionModelSelection
from .utils.base import check_package, clear_output, copy_files, create_geopi_output_dir, get_os, install_package, log, save_data, show_warning
from .utils.mlflow_utils import retrieve_previous_experiment_id
from .utils.toggle_address_status import toggle_address_status


def cli_pipeline(training_data_path: str, application_data_path: Optional[str] = None) -> None:
Expand Down Expand Up @@ -85,6 +87,18 @@ def cli_pipeline(training_data_path: str, application_data_path: Optional[str] =
# <-- User Training Data Loading -->
with console.status("[bold green]Training Data Loading...[/bold green]", spinner="dots"):
sleep(0.75)

# Call toggle_address_status and pass status and training_data_path as parameters to obtain the address of the training data
training_data_path = toggle_address_status(status=TOGGLE_ADDRESS_STATUS, training_data_path=training_data_path)[0]

# Check if the length of training_data_path is greater than 1
if len(training_data_path) > 1:
# Display the columns of the Excel file located at training_data_path
show_excel_columns(training_data_path)
print("Please select only one file that you want to process:")
# Limit the user input to a number within the range of available files and assign the result to training_data_path
training_data_path = training_data_path[limit_num_input(range(1, len(training_data_path) + 1), SECTION[0], num_input) - 1]

if training_data_path:
# If the user provides file name, then load the training data from the file.
data = read_data(file_path=training_data_path, is_own_data=1)
Expand Down
7 changes: 6 additions & 1 deletion geochemistrypi/data_mining/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import os

from .utils.toggle_address_status import toggle_address_status

# Selects where the project reads its data from and writes its output to: 1 = standard (command-line) mode, 2 = APP mode.
TOGGLE_ADDRESS_STATUS = 1

# The number of uploading dataset per user is limited to 5.
MAX_UPLOADS_PER_USER = 5

Expand All @@ -10,7 +15,7 @@
BUILT_IN_DATASET_PATH = os.path.join(PACKAGEDIR, "data", "dataset")

# current working directory in which the user activates the application
WORKING_PATH = os.getcwd()
WORKING_PATH = toggle_address_status(status=TOGGLE_ADDRESS_STATUS)[1]

# the root directory where all the output stays
OUTPUT_PATH = os.path.join(WORKING_PATH, "geopi_output")
Expand Down
22 changes: 22 additions & 0 deletions geochemistrypi/data_mining/data/data_readiness.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,28 @@ def basic_info(data: pd.DataFrame) -> None:
print(data.info())


def show_excel_columns(excel_list: Optional[List] = None) -> None:
    """Print a 1-based numbered menu of the given data file entries.

    Despite the name, the entries are not column names: each item of
    ``excel_list`` is printed on its own line as ``<index> - <item>``.

    Parameters
    ----------
    excel_list : Optional[List], optional
        The entries to list. Defaults to None, which is treated as an empty
        list so that only the header is printed.

    Returns
    -------
    None
    """
    # The declared default is None; without this guard len()/iteration would
    # raise TypeError on a plain ``show_excel_columns()`` call.
    if excel_list is None:
        excel_list = []

    print("-" * 20)
    print("Index - Excel Name")

    # Indices start at 1 because they are shown to the user as menu choices.
    for index, name in enumerate(excel_list, start=1):
        print(index, "-", name)


def show_data_columns(columns_name: pd.Index, columns_index: Optional[List] = None) -> None:
"""Show the column names of the data set.
Expand Down
58 changes: 58 additions & 0 deletions geochemistrypi/data_mining/utils/toggle_address_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os


def list_excel_files(directory: str) -> list:
    """Recursively list the spreadsheet-like files under a directory.

    Parameters
    ----------
    directory : str
        The path to the directory to search.

    Returns
    -------
    excel_files : list
        Paths (``directory`` joined with the relative location) of every file
        whose name ends in .xlsx, .xls or .csv. The list is empty when nothing
        matches or when ``directory`` does not exist.

    Notes
    -----
    (1) The function uses `os.walk` to traverse the directory and its subdirectories.
    (2) The extension check ignores case, so e.g. ``DATA.XLSX`` is also found.
    (3) Files and subdirectories are visited in sorted order, so the result is
        stable between runs; callers present it as a numbered menu.
    """
    suffixes = (".xlsx", ".xls", ".csv")
    excel_files = []
    for root, dirs, files in os.walk(directory):
        # Sorting ``dirs`` in place makes os.walk descend in a deterministic order.
        dirs.sort()
        excel_files.extend(os.path.join(root, name) for name in sorted(files) if name.lower().endswith(suffixes))
    return excel_files


def toggle_address_status(status: str = None, training_data_path: str = None) -> list:
    """Resolve the training data path and the working path for the given mode.

    Parameters
    ----------
    status : str, optional
        The mode selector; an int or a numeric string is accepted.
        - 1: Command line mode. ``training_data_path`` is passed through
          unchanged and the working path is the current working directory.
        - 2: APP mode. The training data path becomes the list of data files
          found under the "data" folder on the user's desktop, and the working
          path is the desktop itself.
    training_data_path : str, optional
        The path to the training data. Only used when ``status`` is 1.

    Returns
    -------
    paths : list
        ``[training_data_path, working_path]``. In mode 2 the first element is
        a list of file paths rather than a single path.

    Raises
    ------
    ValueError
        If ``status`` is missing, not numeric, or not 1 or 2.
    """
    invalid_status_message = "Invalid status value. It should be '1' or '2'."

    # Convert once, and map None / non-numeric input onto the same error as an
    # out-of-range number instead of leaking TypeError or int()'s own message.
    try:
        mode = int(status)
    except (TypeError, ValueError):
        raise ValueError(invalid_status_message) from None

    if mode == 1:
        # The working path is the directory the user launched from, not its
        # parent: that is what the replaced WORKING_PATH constant pointed at.
        working_path = os.getcwd()
    elif mode == 2:
        desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
        training_data_path = list_excel_files(os.path.join(desktop_path, "data"))
        working_path = desktop_path
    else:
        raise ValueError(invalid_status_message)

    return [training_data_path, working_path]

0 comments on commit 15f18b4

Please sign in to comment.