From b05f396ed139b70eb8687792d3683869ca5e021e Mon Sep 17 00:00:00 2001 From: Nick Strugnell Date: Thu, 13 Jun 2024 14:38:27 +0000 Subject: [PATCH] more changes --- 1-prep_and_gather_data.ipynb | 822 ++++++++++++++++++++++++ 1-prep_and_gather_data_instructor.ipynb | 771 ++++++++++++++++++++++ 2 files changed, 1593 insertions(+) create mode 100644 1-prep_and_gather_data.ipynb create mode 100644 1-prep_and_gather_data_instructor.ipynb diff --git a/1-prep_and_gather_data.ipynb b/1-prep_and_gather_data.ipynb new file mode 100644 index 0000000..8a017a6 --- /dev/null +++ b/1-prep_and_gather_data.ipynb @@ -0,0 +1,822 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "73878580-3986-4fc3-81b6-f598e0783b4f", + "metadata": {}, + "source": [ + "# Demo project - Wine quality prediction" + ] + }, + { + "cell_type": "markdown", + "id": "53536e98-387e-4457-9d03-5a1d435837a6", + "metadata": {}, + "source": [ + "## Contents:\n", + "* [Import packages](#first-bullet)\n", + "* [Load Data](#second-bullet)\n", + "* [Exploratory data analysis](#third-bullet)\n", + "* [Prepare dataset for training model](#forth-bullet)\n", + "* [Build a baseline model](#fifth-bullet)\n", + "* [Experiment with a new model](#sixth-bullet)\n", + "* [Predict](#seventh-bullet)" + ] + }, + { + "cell_type": "markdown", + "id": "ff902f5d-7bc1-49c9-8994-d4c3e1cff7e8", + "metadata": {}, + "source": [ + "## Import packages " + ] + }, + { + "cell_type": "markdown", + "id": "d73dd05b-5b58-4587-a1df-02a9c96d1996", + "metadata": {}, + "source": [ + "We will need to install and import packages as we develop our notebook. We've created a couple of starter cells for you but you will need to add more as you work through the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3744f2b-4d07-4f10-84f0-1daa05cb8573", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install s3fs\n", + "# Install more modules that you need here\n", + "!pip install seaborn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa3a0921-36d0-4b26-8378-cd5adab57fb8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas\n", + "# Import more modules and classes that you need here - REMEMBER TO RERUN THE CELL AFTER MODIFYING!\n", + "import os\n", + "import seaborn" + ] + }, + { + "cell_type": "markdown", + "id": "c5ac7837-ae31-4b73-9fd0-e8d8dddbafb6", + "metadata": { + "tags": [] + }, + "source": [ + "## Load Data " + ] + }, + { + "cell_type": "markdown", + "id": "7e0d066b-1b2f-47ac-a9a3-f01d62f22969", + "metadata": {}, + "source": [ + "You have access to a Minio-based S3 storage where your datasets are available, and where you will eventually push models. This storage is defined using a 'Data Connection' in your Data Science Project. You can access this data connection using environment variables. Run the following shell block to determine the environment variable names:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "780f041d-f811-4927-b039-13eed4dd151e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!env | grep AWS" + ] + }, + { + "cell_type": "markdown", + "id": "edbe5e0f-9470-4228-88be-43fb588ede9d", + "metadata": {}, + "source": [ + "You will need to assign these to Python variables to be able to use them in code blocks. We've started you off with some code below, but you'll also need variables set for the endpoint and bucket. Remember to import modules as needed in the import block at the top of the Notebook and re-run the cell below again after importing any modules." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2aa0f94-5b37-4873-9467-44e6703af9c8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']\n", + "AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']\n", + "# Add variable assignments for AWS_S3_ENDPOINT and AWS_S3_Bucket below.\n", + "AWS_S3_ENDPOINT = os.environ['AWS_S3_ENDPOINT']\n", + "AWS_S3_BUCKET = os.environ['AWS_S3_BUCKET']\n" + ] + }, + { + "cell_type": "markdown", + "id": "d0fdc153-bb03-4e42-8df2-01de01b4f47d", + "metadata": {}, + "source": [ + "Check that your variables have been correctly set:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dfbd5d1-2f81-4432-be43-fc6101d97ea3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"AWS_ACCESS_KEY is \" + AWS_ACCESS_KEY_ID)\n", + "print(\"AWS_SECRET_ACCESS_KEY is \" + AWS_SECRET_ACCESS_KEY)\n", + "print(\"AWS_S3_ENDPOINT is \" + AWS_S3_ENDPOINT)\n", + "print(\"AWS_S3_BUCKET is \" + AWS_S3_BUCKET)\n" + ] + }, + { + "cell_type": "markdown", + "id": "6e94b018-a5c8-4814-8619-0eece4e5d246", + "metadata": {}, + "source": [ + "## Exploratory data analysis \n", + "Have a look in the Minio UI and you will see that you have two datafiles in your bucket, called winequality-red.csv and winequality-white.csv. Let's set up some code to pull these from the storage into memory so that we can start some statistical exploration and visualisation. We will use the Pandas module to do this." + ] + }, + { + "cell_type": "markdown", + "id": "0da01133-3435-48e2-b67e-184d25a3270f", + "metadata": {}, + "source": [ + "First we define a function, read_data() which uses a pandas method to read CSVs directly from S3 storage. Note how we pass our S3 credentials to the method. Because this is a function definition it won't actually do anything when you execute the code cell. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3131849f-2412-4468-93b2-b21e73f91aa7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def read_data(datasrc):\n", + " data = pandas.read_csv(\n", + " \"s3://\" + AWS_S3_BUCKET + \"/\" + datasrc, sep=';',\n", + " storage_options={\n", + " \"key\": AWS_ACCESS_KEY_ID,\n", + " \"secret\": AWS_SECRET_ACCESS_KEY,\n", + " \"endpoint_url\": AWS_S3_ENDPOINT,\n", + " }\n", + " )\n", + " return data" + ] + }, + { + "cell_type": "markdown", + "id": "d96f7e23-18f1-4749-987c-1b0a1ee1f84e", + "metadata": {}, + "source": [ + "Let's try reading our two CSV files into memory now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ab7588-f839-4325-a4cb-22486498884d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "white_wine = read_data('winequality-white.csv')\n", + "red_wine = read_data('winequality-red.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "46ce3744-1bcd-4571-9613-d1b9a0ba024a", + "metadata": {}, + "source": [ + "Let's have a look at our white wine data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00ee462f-ff94-44ba-b68a-53d2211e8939", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "white_wine.head(5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dac8273-016a-4771-9026-0b8b77f44bf7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# add a command in this cell to inspect our red wine data\n", + "red_wine.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "9197769d-d945-491a-abdc-3939a8b3a3a5", + "metadata": {}, + "source": [ + "We would like to run analysis on both our red and white wine datasets simultaneously, so it makes sense to merge these two datasets into one. But how will we then tell the difference between our red and white wines? Well, we simply add another feature - the feature is calles 'is_red' and is essentially a Boolean indicating whether the wine is red, or 'not red' i.e. white.\n", + "\n", + "(Extra credit for anyone who can point out what might be problematic about this approach!)" + ] + }, + { + "cell_type": "markdown", + "id": "bbeed573-c132-4e8b-b1bb-e261812faa71", + "metadata": {}, + "source": [ + "Let's define a function to definte our additional feature in each dataset, and then concatenate them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6af49215-b766-4ca4-a140-6e21f8d7ecb7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def transformdata(red_wine,white_wine):\n", + " red_wine['is_red'] = 1\n", + " white_wine['is_red'] = 0\n", + " data = pandas.concat([red_wine, white_wine], axis=0)\n", + " # lets get rid of those annoying spaces in our column names\n", + " data.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27a5d750-0307-43ca-abf5-1562fc8191f2", + "metadata": {}, + "outputs": [], + "source": [ + "Now, invoke your method and show the first 5 lines of the merged data below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa3e4777-76c2-4687-a8da-6e10c18fe536", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Write your code here\n", + "# data = insert your method call here\n", + "data = transformdata(red_wine,white_wine)\n", + "data.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "bd056fae-7cf1-4e54-99d9-0f19c014534b", + "metadata": {}, + "source": [ + "SLIDES TO DISCUSS EXPLORATORY STATS COVERING:\n", + "- visualisation basics\n", + "- mean, median, deviation, skew\n", + "- quartiles and outliers\n", + "- correlation\n", + "\n", + "\n", + "Let's visualise our data using the seaborn module. Remember you may to install and/or import the module in the block at the beginning of the notebook (and re-run). Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.You can read more about it here: https://seaborn.pydata.org/" + ] + }, + { + "cell_type": "markdown", + "id": "4effa559-432c-4130-b41d-2e3f11ec0465", + "metadata": {}, + "source": [ + "This will plot a histogram of the quality of our wine. Experiement with plotting different features of the dataset, e.g. alcohol content, pH etc." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4e1d074-f5fb-41b1-afc7-1150550db01f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "seaborn.displot(data.quality, kde=False)" + ] + }, + { + "cell_type": "markdown", + "id": "494dfe1e-15a4-4a41-adda-f4700bac5835", + "metadata": {}, + "source": [ + "Let's simplify things by converting quality from a 1-10 scale to a simple boolean. A wine is either of high quality, or it is not. This quality feature will be our output feature when we run an inference model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c06e500-1871-4c71-816a-b898cb8633d1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def settarget(data):\n", + " high_quality = (data.quality >= 7).astype(int)\n", + " data.quality = high_quality\n", + " return data\n", + "\n", + "data = settarget(data)\n", + "data.tail(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "661daf61-7b01-4dcd-9f26-30a0dfa3c51a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "sns.displot(data.quality, kde=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4fec16b-9490-46a1-9f98-fe917796f219", + "metadata": {}, + "outputs": [], + "source": [ + "## median, upper and lower quartile, IQR\n", + "## histogram for distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f886882-1fd1-4633-af8b-602dc90d369a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dims = (3, 4)\n", + "\n", + "f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 15))\n", + "axis_i, axis_j = 0, 0\n", + "for col in data.columns:\n", + " if col == 'is_red' or col == 'quality':\n", + " continue # Box plots cannot be used on indicator variables\n", + " sns.boxplot(x=data['quality'], y=data[col], ax=axes[axis_i, axis_j])\n", + " axis_j += 1\n", + " if axis_j == dims[1]:\n", + " axis_i += 1\n", + " axis_j = 0" + ] + }, + { + "cell_type": "markdown", + "id": "206b8310-cd76-4012-ba6f-f2f621cd3fde", + "metadata": {}, + "source": [ + "Check missing value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dee4b83a-a069-4ed8-a9c5-935e45539cd3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "## scenarios for missing data - decision for the missing data\n", + "## if alcohol is not an indicator, delete that record\n", + "\n", + "## what are we going to do with the outliers? are they real outliers?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31857042-ba20-42b5-9739-6a017f6b1951", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data.isna().any()" + ] + }, + { + "cell_type": "markdown", + "id": "6b89fa04-ee75-4782-ac57-b2f5f2f45239", + "metadata": {}, + "source": [ + "## Prepare dataset for training model \n", + "Split the input data into 3 sets:\n", + "\n", + "- Train (60% of the dataset used to train the model)\n", + "- Validation (20% of the dataset used to tune the hyperparameters)\n", + "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "757d77d2-d54e-4a2b-8b8d-1eba01ba61ce", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def get_trainingdata(data):\n", + " X = data.drop([\"quality\"], axis=1)\n", + " y = data.quality\n", + "\n", + " # Split out the training data\n", + " X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", + "\n", + " # Split the remaining data equally into validation and test\n", + " X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)\n", + " return (X_train,X_val,X_test,y_train,y_val,y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61d6a7b2-974c-4c6c-a31f-3df48760c805", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "(X_train,X_val,X_test,y_train,y_val,y_test) = get_trainingdata(data)" + ] + }, + { + "cell_type": "markdown", + "id": "cfe73322-99ac-4e6b-8ed6-c475d418e108", + "metadata": { + "tags": [] + }, + "source": [ + "## Build a baseline model (random forest classifier) \n", + "Build a simple classifier using scikit-learn. Use MLflow to keep track of the model accuracy. You can read about Classification - ROC and AUC here if you want \n", + "https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc" + ] + }, + { + "cell_type": "markdown", + "id": "4ea89452-6ec8-43cc-923c-de443450ff35", + "metadata": {}, + "source": [ + "Enable MLflow autologging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b53a514-9eab-491d-9237-80448e4cea20", + "metadata": {}, + "outputs": [], + "source": [ + "experiment_name = \"WineQuality\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6628b383-14a4-493e-9add-e6076adf6ad5", + "metadata": {}, + "outputs": [], + "source": [ + "# check if experiment name already exists\n", + "mlflow.set_tracking_uri(\"http://mlflow:5500\")\n", + "mlflow.set_experiment(experiment_name)\n", + "\n", + "# enable autologging\n", + "mlflow.sklearn.autolog(log_input_examples=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f9e8cf7-0458-49d1-9dfa-a3061bbc00d4", + "metadata": {}, + "outputs": [], + "source": [ + "def log_featureimportance(model):\n", + " tmpdir = tempfile.mkdtemp()\n", + " filepath = os.path.join(tmpdir, 'feature_importance.json')\n", + " feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", + " feature_importances.sort_values('importance', ascending=False).to_json(filepath)\n", + " mlflow.log_artifact(filepath)\n", + " return" + ] + }, + { + "cell_type": "markdown", + "id": "01bdefde-fc6a-4e7d-8c68-649528667fd4", + "metadata": {}, + "source": [ + "Train random forest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f2832df-baa7-4125-8ef3-681517dbe8b0", + "metadata": {}, + "outputs": [], + "source": [ + "class SklearnModelWrapper(mlflow.pyfunc.PythonModel):\n", + " def __init__(self, model):\n", + " self.model = model\n", + "\n", + " def predict(self, context, model_input):\n", + " return self.model.predict_proba(model_input)[:,1]\n", + "\n", + "def train_randomforest(X_train,y_train,X_test,y_test):\n", + "\n", + " with mlflow.start_run(run_name='untuned_random_forest'):\n", + " n_estimators = 10\n", + " model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))\n", + " model.fit(X_train, y_train)\n", + "\n", + " predictions_test = model.predict_proba(X_test)[:,1]\n", + " auc_score = roc_auc_score(y_test, predictions_test)\n", + " mlflow.log_param('n_estimators', n_estimators) #specify the interested parameter/metric\n", + " mlflow.log_metric('auc', auc_score)\n", + " wrappedModel = SklearnModelWrapper(model)\n", + "\n", + " signature = infer_signature(X_train, wrappedModel.predict(None, X_train))\n", + "\n", + " conda_env = _mlflow_conda_env(\n", + " additional_conda_deps=None,\n", + " additional_pip_deps=[\"cloudpickle=={}\".format(cloudpickle.__version__), \"scikit-learn=={}\".format(sklearn.__version__)],\n", + " additional_conda_channels=None,\n", + " )\n", + " mlflow.pyfunc.log_model(\"random_forest_model\", python_model=wrappedModel, conda_env=conda_env, signature=signature)\n", + " log_featureimportance(model)\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8957943-d0dc-4803-aab5-6c6ceb8ba34d", + "metadata": {}, + "outputs": [], + "source": [ + "model = train_randomforest(X_train,y_train,X_test,y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c8cdc21-a1df-429b-8be3-3e4d1ea5ff2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity-check: This should match the AUC logged by MLflow\n", + "print(f'AUC: {roc_auc_score(y_test, model.predict_proba(X_test)[:,1])}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eba2f19-7755-412e-bc67-2e586817582c", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity-check: This should match the feature importance logged by MLflow\n", + "feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", + "feature_importances.sort_values('importance', ascending=False)" + ] + }, + { + "cell_type": "markdown", + "id": "b4206c08-5021-43dd-a11d-e7e7f687ebef", + "metadata": {}, + "source": [ + "## Experiment with a new model (xgboost) \n", + "Use the xgboost library to train a more accurate model. Run hyperparameter tuning to train multiple models. As before, the code tracks the performance of each parameter configuration with MLflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84cf68d6-86a4-4daa-927b-b4dc53f8cc9d", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "search_space = {\n", + " 'max_depth': scope.int(hp.quniform('max_depth', 50, 100, 10)),\n", + " 'learning_rate': hp.loguniform('learning_rate', -3, 0),\n", + " 'reg_alpha': hp.loguniform('reg_alpha', -5, -1),\n", + " 'reg_lambda': hp.loguniform('reg_lambda', -6, -1),\n", + " 'min_child_weight': hp.loguniform('min_child_weight', -1, 3),\n", + " 'objective': 'binary:logistic',\n", + " 'seed': 123, # Set a seed for deterministic training\n", + "}\n", + "\n", + "def train_model(params):\n", + "\n", + " mlflow.xgboost.autolog()\n", + " with mlflow.start_run(nested=True):\n", + " train = xgb.DMatrix(data=X_train, label=y_train)\n", + " validation = xgb.DMatrix(data=X_val, label=y_val)\n", + "\n", + " booster = xgb.train(params=params, dtrain=train, num_boost_round=100,\\\n", + " evals=[(validation, \"validation\")], early_stopping_rounds=50)\n", + " validation_predictions = booster.predict(validation)\n", + " auc_score = roc_auc_score(y_val, validation_predictions)\n", + " mlflow.log_metric('auc', auc_score) #specify the interested parameter/metric\n", + "\n", + " signature = infer_signature(X_train, booster.predict(train))\n", + " mlflow.xgboost.log_model(booster, \"model\", signature=signature)\n", + "\n", + " return {'status': STATUS_OK, 'loss': -1*auc_score, 'booster': booster.attributes()}\n", + "\n", + "with mlflow.start_run(run_name='xgboost_models'):\n", + " best_params = fmin(\n", + " fn=train_model,\n", + " space=search_space,\n", + " algo=tpe.suggest,\n", + " max_evals=10,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12fb65af-5545-4563-a70a-ad62d11e6615", + "metadata": {}, + "outputs": [], + "source": [ + "best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", + "best_run_id = best_run[\"run_id\"]\n", + "print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2df971e0-1748-489c-b36d-dd481c211a0d", + "metadata": {}, + "outputs": [], + "source": [ + "best_run_id" + ] + }, + { + "cell_type": "markdown", + "id": "1f57f0bc-ec52-463c-9241-0bf897465b1b", + "metadata": { + "tags": [] + }, + "source": [ + "## Predict " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db412034-daab-4942-b3ae-0c5410a3e5a5", + "metadata": {}, + "outputs": [], + "source": [ + "# model = mlflow.pyfunc.load_model(f\"models:/TestModelD/production\")\n", + "model = mlflow.pyfunc.load_model(\"runs:/\" + best_run_id + \"/model\")\n", + "\n", + "test_predictions = model.predict(X_test)\n", + "print(f'AUC: {roc_auc_score(y_test, test_predictions)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5593581-c563-4c1a-aa80-f10d77f53209", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report\n", + "\n", + "class_labels = ['white wine', 'red wine']\n", + "test_predictions = np.where(test_predictions>0.5, 1, 0)\n", + "print(classification_report(y_test, test_predictions, target_names=class_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ebe7ec-a7e2-49bc-88bb-2b5ea79f3807", + "metadata": {}, + "outputs": [], + "source": [ + "cm = confusion_matrix(y_test, test_predictions)\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm)\n", + "\n", + "disp.plot()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9f40cc8-9a79-4521-9783-6a3aa0b0127a", + "metadata": {}, + "outputs": [], + "source": [ + "# register the best model\n", + "new_model_version = mlflow.register_model(f\"runs:/{best_run_id}/model\", \"WineQuality\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6d78d2a-563d-41de-a2a9-1dbddbdcb3cc", + "metadata": {}, + "outputs": [], + "source": [ + "# # Promote the new model version to Production\n", + "# client.transition_model_version_stage(\n", + "# name=\"TestModelD\",\n", + "# version=new_model_version.version,\n", + "# stage=\"Production\"\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d112aed5-fe65-47b3-aff3-899a96010bdf", + "metadata": {}, + "outputs": [], + "source": [ + "# # clean up models\n", + "# from mlflow.tracking import MlflowClient\n", + "# client = MlflowClient()\n", + "# client.delete_registered_model(name=\"winequality\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/1-prep_and_gather_data_instructor.ipynb b/1-prep_and_gather_data_instructor.ipynb new file mode 100644 index 0000000..f72a9e5 --- /dev/null +++ b/1-prep_and_gather_data_instructor.ipynb @@ -0,0 +1,771 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "73878580-3986-4fc3-81b6-f598e0783b4f", + "metadata": {}, + "source": [ + "# Demo project - Wine quality prediction" + ] + }, + { + "cell_type": "markdown", + "id": "53536e98-387e-4457-9d03-5a1d435837a6", + "metadata": {}, + "source": [ + "## Contents:\n", + "* [Import packages](#first-bullet)\n", + "* [Load Data](#second-bullet)\n", + "* [Exploratory data analysis](#third-bullet)\n", + "* [Prepare dataset for training model](#forth-bullet)\n", + "* [Build a baseline model](#fifth-bullet)\n", + "* [Experiment with a new model](#sixth-bullet)\n", + "* [Predict](#seventh-bullet)" + ] + }, + { + "cell_type": "markdown", + "id": "ff902f5d-7bc1-49c9-8994-d4c3e1cff7e8", + "metadata": {}, + "source": [ + "## Import packages " + ] + }, + { + "cell_type": "markdown", + "id": "d73dd05b-5b58-4587-a1df-02a9c96d1996", + "metadata": {}, + "source": [ + "In addition to the s3fs package, we will need to import hyperopt, cloudpickle, mlflow, and xgboost
\n", + "Modify the following cell to make this happen
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3744f2b-4d07-4f10-84f0-1daa05cb8573", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install s3fs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa3a0921-36d0-4b26-8378-cd5adab57fb8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import boto3\n", + "import pandas\n" + ] + }, + { + "cell_type": "markdown", + "id": "c5ac7837-ae31-4b73-9fd0-e8d8dddbafb6", + "metadata": { + "tags": [] + }, + "source": [ + "## Load Data " + ] + }, + { + "cell_type": "markdown", + "id": "f0fb050a-b46a-4de4-8fdd-0fe8e3b61d3e", + "metadata": {}, + "source": [ + "Assumption: the bucket is already created and \"winequality-red.csv\" & \"winequality-white.csv\" are uploaded into the bucket
\n", + "Read data from object store
\n", + "Connect to object store and instantiate a client object using boto3 session:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2aa0f94-5b37-4873-9467-44e6703af9c8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']\n", + "AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']\n", + "AWS_S3_ENDPOINT = os.environ['AWS_S3_ENDPOINT']\n", + "AWS_S3_BUCKET = os.environ['AWS_S3_BUCKET']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47ae3876-523d-47f1-b895-e26bfb65977f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "s3conn = boto3.Session(aws_access_key_id=S3ACCESS_KEY,\n", + " aws_secret_access_key=S3SECRET_KEY)\n", + "s3_client = s3conn.client('s3',endpoint_url = S3ENDPOINT, verify=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d5bb280-9473-46a0-b575-dfa64f9872cf", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "s3_client.list_objects(Bucket='data')" + ] + }, + { + "cell_type": "markdown", + "id": "7cd65395-3135-45a0-8d7a-227ff2a05840", + "metadata": {}, + "source": [ + "Using the s3_client, retrieve data from objective store:
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9207c9a6-bcc4-42fc-abf1-ff84c431dbbd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "objectname = \"winequality-red.csv\"\n", + "file_addr = \"data/winequality-red.csv\"\n", + "response = s3_client.download_file(bucket_name, objectname, file_addr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e472d265-819b-4881-9fd4-b2fda3933179", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "objectname = \"winequality-white.csv\"\n", + "file_addr = \"data/winequality-white.csv\"\n", + "response = s3_client.download_file(bucket_name, objectname, file_addr)" + ] + }, + { + "cell_type": "markdown", + "id": "6e94b018-a5c8-4814-8619-0eece4e5d246", + "metadata": {}, + "source": [ + "## Exploratory data analysis " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3131849f-2412-4468-93b2-b21e73f91aa7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import s3fs\n", + "def read_data(datasrc):\n", + " data = pd.read_csv(\n", + " \"s3://\" + AWS_S3_BUCKET + \"/\" + datasrc, sep=';',\n", + " storage_options={\n", + " \"key\": AWS_ACCESS_KEY_ID,\n", + " \"secret\": AWS_SECRET_ACCESS_KEY,\n", + " \"endpoint_url\": AWS_S3_ENDPOINT,\n", + " }\n", + " )\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ab7588-f839-4325-a4cb-22486498884d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "## after concatanation, setting the value of is_red for which is a red wine, which is a white wine - feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6af49215-b766-4ca4-a140-6e21f8d7ecb7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def transformdata(red_wine,white_wine):\n", + " red_wine['is_red'] = 1\n", + " white_wine['is_red'] = 0\n", + " data = pd.concat([red_wine, white_wine], axis=0)\n", + " data.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)\n", + " return data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa3e4777-76c2-4687-a8da-6e10c18fe536", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "white_wine = read_data('winequality-white.csv')\n", + "red_wine = read_data('winequality-red.csv')\n", + "data = transformdata(red_wine, white_wine)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8620947-ab99-43cb-9a56-128f9ff03fc5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "bd056fae-7cf1-4e54-99d9-0f19c014534b", + "metadata": {}, + "source": [ + "Visualize data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cad8a920-4a11-4a5f-b60e-6eaf9d882bb2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install seaborn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4e1d074-f5fb-41b1-afc7-1150550db01f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "sns.displot(data.quality, kde=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee8a4a2e-3f5d-4699-a278-21b2c7c8a17f", + "metadata": {}, + "outputs": [], + "source": [ + "## set type boolean" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c06e500-1871-4c71-816a-b898cb8633d1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def settarget(data):\n", + " high_quality = (data.quality >= 7).astype(int)\n", + " data.quality = high_quality\n", + " return data\n", + "\n", + "data = settarget(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "661daf61-7b01-4dcd-9f26-30a0dfa3c51a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "sns.displot(data.quality, kde=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4fec16b-9490-46a1-9f98-fe917796f219", + "metadata": {}, + "outputs": [], + "source": [ + "## median, upper and lower quartile, IQR\n", + "## histogram for distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f886882-1fd1-4633-af8b-602dc90d369a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dims = (3, 4)\n", + "\n", + "f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 15))\n", + "axis_i, axis_j = 0, 0\n", + "for col in data.columns:\n", + " if col == 'is_red' or col == 'quality':\n", + " continue # Box plots cannot be used on indicator variables\n", + " sns.boxplot(x=data['quality'], y=data[col], ax=axes[axis_i, axis_j])\n", + " axis_j += 1\n", + " if axis_j == dims[1]:\n", + " axis_i += 1\n", + " axis_j = 0" + ] + }, + { + "cell_type": "markdown", + "id": "206b8310-cd76-4012-ba6f-f2f621cd3fde", + "metadata": {}, + "source": [ + "Check missing value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dee4b83a-a069-4ed8-a9c5-935e45539cd3", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "## scenarios for missing data - decision for the missing data\n", + "## if alcohol is not an indicator, delete that record\n", + "\n", + "## what are we going to do with the outliers? are they real outliers?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31857042-ba20-42b5-9739-6a017f6b1951", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data.isna().any()" + ] + }, + { + "cell_type": "markdown", + "id": "6b89fa04-ee75-4782-ac57-b2f5f2f45239", + "metadata": {}, + "source": [ + "## Prepare dataset for training model \n", + "Split the input data into 3 sets:\n", + "\n", + "- Train (60% of the dataset used to train the model)\n", + "- Validation (20% of the dataset used to tune the hyperparameters)\n", + "- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "757d77d2-d54e-4a2b-8b8d-1eba01ba61ce", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def get_trainingdata(data):\n", + " X = data.drop([\"quality\"], axis=1)\n", + " y = data.quality\n", + "\n", + " # Split out the training data\n", + " X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=123)\n", + "\n", + " # Split the remaining data equally into validation and test\n", + " X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=123)\n", + " return (X_train,X_val,X_test,y_train,y_val,y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61d6a7b2-974c-4c6c-a31f-3df48760c805", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "(X_train,X_val,X_test,y_train,y_val,y_test) = get_trainingdata(data)" + ] + }, + { + "cell_type": "markdown", + "id": "cfe73322-99ac-4e6b-8ed6-c475d418e108", + "metadata": { + "tags": [] + }, + "source": [ + "## Build a baseline model (random forest classifier) \n", + "Build a simple classifier using scikit-learn. Use MLflow to keep track of the model accuracy. You can read about Classification - ROC and AUC here if you want \n", + "https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc" + ] + }, + { + "cell_type": "markdown", + "id": "4ea89452-6ec8-43cc-923c-de443450ff35", + "metadata": {}, + "source": [ + "Enable MLflow autologging" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b53a514-9eab-491d-9237-80448e4cea20", + "metadata": {}, + "outputs": [], + "source": [ + "experiment_name = \"WineQuality\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6628b383-14a4-493e-9add-e6076adf6ad5", + "metadata": {}, + "outputs": [], + "source": [ + "# check if experiment name already exists\n", + "mlflow.set_tracking_uri(\"http://mlflow:5500\")\n", + "mlflow.set_experiment(experiment_name)\n", + "\n", + "# enable autologging\n", + "mlflow.sklearn.autolog(log_input_examples=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f9e8cf7-0458-49d1-9dfa-a3061bbc00d4", + "metadata": {}, + "outputs": [], + "source": [ + "def log_featureimportance(model):\n", + " tmpdir = tempfile.mkdtemp()\n", + " filepath = os.path.join(tmpdir, 'feature_importance.json')\n", + " feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", + " feature_importances.sort_values('importance', ascending=False).to_json(filepath)\n", + " mlflow.log_artifact(filepath)\n", + " return" + ] + }, + { + "cell_type": "markdown", + "id": "01bdefde-fc6a-4e7d-8c68-649528667fd4", + "metadata": {}, + "source": [ + "Train random forest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f2832df-baa7-4125-8ef3-681517dbe8b0", + "metadata": {}, + "outputs": [], + "source": [ + "class SklearnModelWrapper(mlflow.pyfunc.PythonModel):\n", + " def __init__(self, model):\n", + " self.model = model\n", + "\n", + " def predict(self, context, model_input):\n", + " return self.model.predict_proba(model_input)[:,1]\n", + "\n", + "def train_randomforest(X_train,y_train,X_test,y_test):\n", + "\n", + " with mlflow.start_run(run_name='untuned_random_forest'):\n", + " n_estimators = 10\n", + " model = RandomForestClassifier(n_estimators=n_estimators, random_state=np.random.RandomState(123))\n", + " model.fit(X_train, y_train)\n", + "\n", + " predictions_test = model.predict_proba(X_test)[:,1]\n", + " auc_score = roc_auc_score(y_test, predictions_test)\n", + " mlflow.log_param('n_estimators', n_estimators) #specify the interested parameter/metric\n", + " mlflow.log_metric('auc', auc_score)\n", + " wrappedModel = SklearnModelWrapper(model)\n", + "\n", + " signature = infer_signature(X_train, wrappedModel.predict(None, X_train))\n", + "\n", + " conda_env = _mlflow_conda_env(\n", + " additional_conda_deps=None,\n", + " additional_pip_deps=[\"cloudpickle=={}\".format(cloudpickle.__version__), \"scikit-learn=={}\".format(sklearn.__version__)],\n", + " additional_conda_channels=None,\n", + " )\n", + " mlflow.pyfunc.log_model(\"random_forest_model\", python_model=wrappedModel, conda_env=conda_env, signature=signature)\n", + " log_featureimportance(model)\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8957943-d0dc-4803-aab5-6c6ceb8ba34d", + "metadata": {}, + "outputs": [], + "source": [ + "model = train_randomforest(X_train,y_train,X_test,y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c8cdc21-a1df-429b-8be3-3e4d1ea5ff2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity-check: This should match the AUC logged by MLflow\n", + "print(f'AUC: {roc_auc_score(y_test, model.predict_proba(X_test)[:,1])}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eba2f19-7755-412e-bc67-2e586817582c", + "metadata": {}, + "outputs": [], + "source": [ + "# Sanity-check: This should match the feature importance logged by MLflow\n", + "feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])\n", + "feature_importances.sort_values('importance', ascending=False)" + ] + }, + { + "cell_type": "markdown", + "id": "b4206c08-5021-43dd-a11d-e7e7f687ebef", + "metadata": {}, + "source": [ + "## Experiment with a new model (xgboost) \n", + "Use the xgboost library to train a more accurate model. Run hyperparameter tuning to train multiple models. As before, the code tracks the performance of each parameter configuration with MLflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84cf68d6-86a4-4daa-927b-b4dc53f8cc9d", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "search_space = {\n", + " 'max_depth': scope.int(hp.quniform('max_depth', 50, 100, 10)),\n", + " 'learning_rate': hp.loguniform('learning_rate', -3, 0),\n", + " 'reg_alpha': hp.loguniform('reg_alpha', -5, -1),\n", + " 'reg_lambda': hp.loguniform('reg_lambda', -6, -1),\n", + " 'min_child_weight': hp.loguniform('min_child_weight', -1, 3),\n", + " 'objective': 'binary:logistic',\n", + " 'seed': 123, # Set a seed for deterministic training\n", + "}\n", + "\n", + "def train_model(params):\n", + "\n", + " mlflow.xgboost.autolog()\n", + " with mlflow.start_run(nested=True):\n", + " train = xgb.DMatrix(data=X_train, label=y_train)\n", + " validation = xgb.DMatrix(data=X_val, label=y_val)\n", + "\n", + " booster = xgb.train(params=params, dtrain=train, num_boost_round=100,\\\n", + " evals=[(validation, \"validation\")], early_stopping_rounds=50)\n", + " validation_predictions = booster.predict(validation)\n", + " auc_score = roc_auc_score(y_val, validation_predictions)\n", + " mlflow.log_metric('auc', auc_score) #specify the interested parameter/metric\n", + "\n", + " signature = infer_signature(X_train, booster.predict(train))\n", + " mlflow.xgboost.log_model(booster, \"model\", signature=signature)\n", + "\n", + " return {'status': STATUS_OK, 'loss': -1*auc_score, 'booster': booster.attributes()}\n", + "\n", + "with mlflow.start_run(run_name='xgboost_models'):\n", + " best_params = fmin(\n", + " fn=train_model,\n", + " space=search_space,\n", + " algo=tpe.suggest,\n", + " max_evals=10,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12fb65af-5545-4563-a70a-ad62d11e6615", + "metadata": {}, + "outputs": [], + "source": [ + "best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]\n", + "best_run_id = best_run[\"run_id\"]\n", + "print(f'AUC of Best Run: {best_run[\"metrics.auc\"]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2df971e0-1748-489c-b36d-dd481c211a0d", + "metadata": {}, + "outputs": [], + "source": [ + "best_run_id" + ] + }, + { + "cell_type": "markdown", + "id": "1f57f0bc-ec52-463c-9241-0bf897465b1b", + "metadata": { + "tags": [] + }, + "source": [ + "## Predict " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db412034-daab-4942-b3ae-0c5410a3e5a5", + "metadata": {}, + "outputs": [], + "source": [ + "# model = mlflow.pyfunc.load_model(f\"models:/TestModelD/production\")\n", + "model = mlflow.pyfunc.load_model(\"runs:/\" + best_run_id + \"/model\")\n", + "\n", + "test_predictions = model.predict(X_test)\n", + "print(f'AUC: {roc_auc_score(y_test, test_predictions)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5593581-c563-4c1a-aa80-f10d77f53209", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import classification_report\n", + "\n", + "class_labels = ['white wine', 'red wine']\n", + "test_predictions = np.where(test_predictions>0.5, 1, 0)\n", + "print(classification_report(y_test, test_predictions, target_names=class_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ebe7ec-a7e2-49bc-88bb-2b5ea79f3807", + "metadata": {}, + "outputs": [], + "source": [ + "cm = confusion_matrix(y_test, test_predictions)\n", + "disp = ConfusionMatrixDisplay(confusion_matrix=cm)\n", + "\n", + "disp.plot()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9f40cc8-9a79-4521-9783-6a3aa0b0127a", + "metadata": {}, + "outputs": [], + "source": [ + "# register the best model\n", + "new_model_version = mlflow.register_model(f\"runs:/{best_run_id}/model\", \"WineQuality\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6d78d2a-563d-41de-a2a9-1dbddbdcb3cc", + "metadata": {}, + "outputs": [], + "source": [ + "# # Promote the new model version to Production\n", + "# client.transition_model_version_stage(\n", + "# name=\"TestModelD\",\n", + "# version=new_model_version.version,\n", + "# stage=\"Production\"\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d112aed5-fe65-47b3-aff3-899a96010bdf", + "metadata": {}, + "outputs": [], + "source": [ + "# # clean up models\n", + "# from mlflow.tracking import MlflowClient\n", + "# client = MlflowClient()\n", + "# client.delete_registered_model(name=\"winequality\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}