From 1e5b815b4abf4f483a8eec4be3da6a1cb12f4a66 Mon Sep 17 00:00:00 2001
From: sanyhe
Date: Sun, 25 Jun 2023 13:58:24 +0800
Subject: [PATCH] docs: add commands to operate the software on Jupyter
 Notebook and Google Colab.

---
 .gitignore                                 |  3 ++
 README.md                                  | 30 +++++++++++++++----
 geochemistrypi/data_mining/cli_pipeline.py | 35 ++++++++++++----------
 3 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/.gitignore b/.gitignore
index 752a62cf..f273b883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,3 +59,6 @@ docs/build
 
 # env
 .env
+
+# mlflow
+mlruns
diff --git a/README.md b/README.md
index 23dbd08d..d4caebc6 100644
--- a/README.md
+++ b/README.md
@@ -29,35 +29,53 @@ Latest Update: follow up by clicking `Starred` and `Watch` on our [GitHub repos
 
 ## Quick Installation
 
-One instruction to download on command line, such as Terminal on macOS, CMD on Windows.
+One instruction to download on the **command line**, such as Terminal on macOS, CMD on Windows.
 ```
 pip install geochemistrypi
 ```
+One instruction to download on **Jupyter Notebook** or **Google Colab**.
+```
+!pip install geochemistrypi
+```
 **Note**: The beta version runs on MacOS, Windows or Linux.
 
 ## Quick Update
 
-One instruction to update the software to the latest version on command line, such as Terminal on macOS, CMD on Windows.
+One instruction to update the software to the latest version on the **command line**, such as Terminal on macOS, CMD on Windows.
 ```
 pip install --upgrade geochemistrypi
 ```
+One instruction to update the software to the latest version on **Jupyter Notebook** or **Google Colab**.
+```
+!pip install --upgrade geochemistrypi
+```
 
 ## Example
 
-**How to run:** After successfully downloading, run this instruction on command line whatever directory it is.
+**How to run:** After successfully downloading, run this instruction on **command line / Jupyter Notebook / Google Colab** from any directory.
 
 ### Case 1: Run with built-in data set for testing
+On command line:
 ```
 geochemistrypi data-mining
 ```
+On Jupyter Notebook / Google Colab:
+```
+!geochemistrypi data-mining
+```
 **Note**: There are four built-in data sets corresponding to four kinds of model pattern.
 
 ### Case 2: Run with your own data set
+On command line:
 ```
 geochemistrypi data-mining --data your_own_data_set.xlsx
 ```
-**Note**: Currently, only `.xlsx` file is supported. Please specify the path your data file exists.
+On Jupyter Notebook / Google Colab:
+```
+!geochemistrypi data-mining --data your_own_data_set.xlsx
+```
+**Note**: Currently, only `.xlsx` files are supported. Please specify the path where your data file is located. For Google Colab, don't forget to upload your dataset first.
-For more details: Please refer to
+For more details, please refer to:
 + [Manual v1.1.0 for Geochemistry π - Beta (International - Google drive)](https://drive.google.com/file/d/1yryykCyWKM-Sj88fOYbOba6QkB_fu2ws/view?usp=sharing)
 + [Manual v1.1.0 for Geochemistry π - Beta (China - Tencent Docs)](https://docs.qq.com/pdf/DQ0l5d2xVd2VwcnVW?&u=6868f96d4a384b309036e04e637e367a)
 + [Geochemistry π - Download and Run the Beta Version (International - Youtube)](https://www.youtube.com/watch?v=EeVaJ3H7_AU&list=PLy8hNsI55lvh1UHjhVhqNUj3xPdV9sEiM&index=9)
@@ -113,7 +131,7 @@ The whole package is under construction and the documentation is progressively e
 + Bailun Jiang (EPSI / Lille University, France)
 + Yucheng Yan (Andy, University of Sydney)
 + Keran Li (Kirk, Chengdu University of Technology)
-+ Mengying Ye (Jilin University, Changchun, China)
++ Mengying Ye (Mary, Jilin University, Changchun, China)
 
 ## Join Us :)
 
diff --git a/geochemistrypi/data_mining/cli_pipeline.py b/geochemistrypi/data_mining/cli_pipeline.py
index 27e88459..3919845c 100644
--- a/geochemistrypi/data_mining/cli_pipeline.py
+++ b/geochemistrypi/data_mining/cli_pipeline.py
@@ -51,16 +51,18 @@ def cli_pipeline(file_name: str) -> None:
     logger = log(OUTPUT_PATH, "inner_test.log")
     logger.info("Geochemistry Py v.1.0.0 - beta version")
 
-    # If the argument is False, hide all Python level warnings.
+    # If the argument is False, hide all Python level warnings. Developers can turn them on by setting the argument to True.
     show_warning(False)
 
-    # Read the data
-    logger.debug("Data Uploaded")
-    print("-*-*- Data Loading -*-*-")
+    # Data Loading
+    logger.debug("User Data Uploaded")
+    print("-*-*- User Data Loading -*-*-")
     if file_name:
+        # If the user provides the file name, then load the data from the file.
         data = read_data(file_name=file_name, is_own_data=1)
         print(f"Successfully load the data set '{file_name}'.")
     else:
+        # If the user doesn't provide the file name, then load the built-in data set.
         print("Built-in Data Option:")
         num2option(TEST_DATA_OPTION)
         test_data_num = limit_num_input(TEST_DATA_OPTION, SECTION[0], num_input)
@@ -77,11 +79,12 @@ def cli_pipeline(file_name: str) -> None:
     show_data_columns(data.columns)
     clear_output()
 
-    # World map projection for a specific element
+    # World Map Projection (Optional)
     logger.debug("World Map Projection")
-    print("-*-*- World Map -*-*-")
+    print("-*-*- World Map Projection -*-*-")
     map_flag = 0
     is_map_projection = 0
+    # TODO: Abstract the following code of checking the existence of the longitude and latitude columns into a function.
     detection_index = 0
     lon = ["LONGITUDE", "Longitude (°E)", "longitude", "Longitude", "经度 (°N)", "经度"]
     lat = ["LATITUDE", "Latitude (°N)", "latitude", "Latitude", "纬度 (°E)", "纬度"]
@@ -100,14 +103,16 @@ def cli_pipeline(file_name: str) -> None:
     if detection_index != 3:
         print("Hence, world map projection functionality will be skipped!")
         clear_output()
+    # If the data set contains both longitude and latitude data, then the user can choose to project the data on the world map.
     while detection_index == 3:
         if map_flag != 1:
-            # option selection
+            # Check if the user wants to project the data on the world map.
             print("World Map Projection for A Specific Element Option:")
             num2option(OPTION)
             is_map_projection = limit_num_input(OPTION, SECTION[3], num_input)
             clear_output()
         if is_map_projection == 1:
+            # If the user chooses to project the data on the world map, then they can select the element to be projected.
print("-*-*- Distribution in World Map -*-*-") print("Select one of the elements below to be projected in the World Map: ") show_data_columns(data.columns) @@ -131,7 +136,7 @@ def cli_pipeline(file_name: str) -> None: elif is_map_projection == 2: break - # Create the processing data set + # Data Selection for Preprocessing logger.debug("Data Selection") print("-*-*- Data Selection -*-*-") show_data_columns(data.columns) @@ -148,7 +153,7 @@ def cli_pipeline(file_name: str) -> None: logged_distribution_plot(data_processed.columns, data_processed) clear_output() - # Imputing + # Imputing Missing Values logger.debug("Imputation") print("-*-*- Imputation -*-*-") is_null_value(data_processed) @@ -189,7 +194,7 @@ def cli_pipeline(file_name: str) -> None: # if the selected data set doesn't need imputation, which means there are no missing values. data_processed_imputed = data_processed - # Feature engineering + # Feature Engineering # FIXME(hecan sanyhew1097618435@163.com): fix the logic logger.debug("Feature Engineering") print("-*-*- Feature Engineering -*-*-") @@ -233,16 +238,16 @@ def cli_pipeline(file_name: str) -> None: clear_output() break - # Mode selection + # Mode Selection logger.debug("Mode Selection") - print("-*-*- Mode Options -*-*-") + print("-*-*- Mode Selection -*-*-") num2option(MODE_OPTION) mode_num = limit_num_input(MODE_OPTION, SECTION[2], num_input) clear_output() # divide X and y data set when it is supervised learning logger.debug("Data Split") if mode_num == 1 or mode_num == 2: - print("-*-*- Data Split - X Set and Y Set-*-*-") + print("-*-*- Data Split - X Set and Y Set -*-*-") print("Divide the processing data set into X (feature value) and Y (target value) respectively.") # create X data set print("Selected sub data set to create X data set:") @@ -257,7 +262,7 @@ def cli_pipeline(file_name: str) -> None: save_data(X, "X Without Scaling", DATASET_OUTPUT_PATH) clear_output() - # feature scaling + # Feature Scaling print("-*-*- Feature Scaling on X Set -*-*-") num2option(OPTION) is_feature_scaling = limit_num_input(OPTION, SECTION[1], num_input) @@ -313,7 +318,7 @@ def cli_pipeline(file_name: str) -> None: X_train = data_processed_imputed y, X_test, y_train, y_test = None, None, None, None - # Model option for users + # Model Selection logger.debug("Model Selection") print("-*-*- Model Selection -*-*-:") Modes2Models = {1: REGRESSION_MODELS, 2: CLASSIFICATION_MODELS, 3: CLUSTERING_MODELS, 4: DECOMPOSITION_MODELS}