diff --git a/week1_intro/crossentropy_method.ipynb b/week1_intro/crossentropy_method.ipynb new file mode 100644 index 000000000..691c1f912 --- /dev/null +++ b/week1_intro/crossentropy_method.ipynb @@ -0,0 +1,484 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + }, + "colab": { + "name": "crossentropy_method.ipynb", + "provenance": [], + "collapsed_sections": [] + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Tgw0BZflWFHi" + }, + "source": [ + "# Crossentropy method\n", + "\n", + "This notebook will teach you to solve reinforcement learning problems with crossentropy method. After that we'll scale everything up using neural network policy." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "58SAV2yrWFHp" + }, + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/coursera/grading.py -O ../grading.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/coursera/week1_intro/submit.py\n", + "\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display for drawing game images on.\n", + "# It won't have any effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "yjWWG4ojWFHr" + }, + "source": [ + "import gym\n", + "import numpy as np\n", + "\n", + "env = gym.make(\"Taxi-v3\")\n", + "env.reset()\n", + "env.render()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", 
+ "metadata": { + "id": "6bkcCW0DWFHs" + }, + "source": [ + "n_states = env.observation_space.n\n", + "n_actions = env.action_space.n\n", + "\n", + "print(\"n_states=%i, n_actions=%i\" % (n_states, n_actions))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dWUbRVMwWFHt" + }, + "source": [ + "# Create stochastic policy\n", + "\n", + "This time our policy should be a probability distribution.\n", + "\n", + "```policy[s,a] = P(take action a | in state s)```\n", + "\n", + "Since we still use integer state and action representations, you can use a 2-dimensional array to represent the policy.\n", + "\n", + "Please initialize the policy __uniformly__, that is, the probabilities of all actions should be equal." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ABbdevILWFHt" + }, + "source": [ + "def initialize_policy(n_states, n_actions):\n", + " \n", + " \n", + " return policy\n", + "\n", + "policy = initialize_policy(n_states, n_actions)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "l_petDW6WFHu" + }, + "source": [ + "assert type(policy) in (np.ndarray, np.matrix)\n", + "assert np.allclose(policy, 1./n_actions)\n", + "assert np.allclose(np.sum(policy, axis=1), 1)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y2y9sadSWFHv" + }, + "source": [ + "# Play the game\n", + "\n", + "Just like before, but we also record all states and actions we took." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lkEYeB3MWFHv" + }, + "source": [ + "def generate_session(env, policy, t_max=10**4):\n", + " \"\"\"\n", + " Play the game until the end or for t_max ticks.\n", + " :param policy: an array of shape [n_states,n_actions] with the action probabilities\n", + " :returns: list of states, list of actions and the sum of rewards\n", + " \"\"\"\n", + " states, actions = [], []\n", + " total_reward = 0.\n", + "\n", + " s = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " # Hint: you can use np.random.choice for sampling action\n", + " # https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html\n", + " a = \n", + "\n", + " new_s, r, done, info = env.step(a)\n", + "\n", + " # Record information we just got from the environment.\n", + " states.append(s)\n", + " actions.append(a)\n", + " total_reward += r\n", + "\n", + " s = new_s\n", + " if done:\n", + " break\n", + "\n", + " return states, actions, total_reward" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SV5ykl0DWFHw" + }, + "source": [ + "s, a, r = generate_session(env, policy)\n", + "assert type(s) == type(a) == list\n", + "assert len(s) == len(a)\n", + "assert type(r) in [float, np.float]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8_lQnXmZWFHw" + }, + "source": [ + "# let's see the initial reward distribution\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "sample_rewards = [generate_session(env, policy, t_max=1000)[-1] for _ in range(200)]\n", + "\n", + "plt.hist(sample_rewards, bins=20)\n", + "plt.vlines([np.percentile(sample_rewards, 50)], [0], [100], label=\"50'th percentile\", color='green')\n", + "plt.vlines([np.percentile(sample_rewards, 90)], [0], [100], label=\"90'th percentile\", color='red')\n", + "plt.legend()" + ], + "execution_count": null, + "outputs": [] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "z7WRpzqCWFHw" + }, + "source": [ + "### Crossentropy method steps" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IytipjvLWFHx" + }, + "source": [ + "def select_elites(states_batch, actions_batch, rewards_batch, percentile):\n", + " \"\"\"\n", + " Select states and actions from games that have rewards >= percentile\n", + " :param states_batch: list of lists of states, states_batch[session_i][t]\n", + " :param actions_batch: list of lists of actions, actions_batch[session_i][t]\n", + " :param rewards_batch: list of rewards, rewards_batch[session_i]\n", + "\n", + " :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions\n", + "\n", + " Please return elite states and actions in their original order \n", + " [i.e. sorted by session number and timestep within session]\n", + "\n", + " If you are confused, see examples below. Please don't assume that states are integers\n", + " (their type will change later).\n", + " \"\"\"\n", + "\n", + " reward_threshold = \n", + "\n", + " elite_states = \n", + " elite_actions = \n", + "\n", + " return elite_states, elite_actions" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "J4W9Ptu3WFHx" + }, + "source": [ + "states_batch = [\n", + " [1, 2, 3], # game1\n", + " [4, 2, 0, 2], # game2\n", + " [3, 1], # game3\n", + "]\n", + "\n", + "actions_batch = [\n", + " [0, 2, 4], # game1\n", + " [3, 2, 0, 1], # game2\n", + " [3, 3], # game3\n", + "]\n", + "rewards_batch = [\n", + " 3, # game1\n", + " 4, # game2\n", + " 5, # game3\n", + "]\n", + "\n", + "test_result_0 = select_elites(states_batch, actions_batch, rewards_batch, percentile=0)\n", + "test_result_30 = select_elites(states_batch, actions_batch, rewards_batch, percentile=30)\n", + "test_result_90 = select_elites(states_batch, actions_batch, rewards_batch, percentile=90)\n", + "test_result_100 = 
select_elites(states_batch, actions_batch, rewards_batch, percentile=100)\n", + "\n", + "assert np.all(test_result_0[0] == [1, 2, 3, 4, 2, 0, 2, 3, 1]) \\\n", + " and np.all(test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]), \\\n", + " \"For percentile 0 you should return all states and actions in chronological order\"\n", + "assert np.all(test_result_30[0] == [4, 2, 0, 2, 3, 1]) and \\\n", + " np.all(test_result_30[1] == [3, 2, 0, 1, 3, 3]), \\\n", + " \"For percentile 30 you should only select states/actions from two first\"\n", + "assert np.all(test_result_90[0] == [3, 1]) and \\\n", + " np.all(test_result_90[1] == [3, 3]), \\\n", + " \"For percentile 90 you should only select states/actions from one game\"\n", + "assert np.all(test_result_100[0] == [3, 1]) and\\\n", + " np.all(test_result_100[1] == [3, 3]), \\\n", + " \"Please make sure you use >=, not >. Also double-check how you compute percentile.\"\n", + "\n", + "print(\"Ok!\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "uzl8MCioWFHy" + }, + "source": [ + "def get_new_policy(elite_states, elite_actions):\n", + " \"\"\"\n", + " Given a list of elite states/actions from select_elites,\n", + " return a new policy where each action probability is proportional to\n", + "\n", + " policy[s_i,a_i] ~ #[occurrences of s_i and a_i in elite states/actions]\n", + "\n", + " Don't forget to normalize the policy to get valid probabilities and handle the 0/0 case.\n", + " For states, that you never visited, use a uniform distribution (1/n_actions for all states).\n", + "\n", + " :param elite_states: 1D list of states from elite sessions\n", + " :param elite_actions: 1D list of actions from elite sessions\n", + "\n", + " \"\"\"\n", + "\n", + " new_policy = np.zeros([n_states, n_actions])\n", + "\n", + " \n", + " # Don't forget to set 1/n_actions for all actions in unvisited states.\n", + "\n", + " return new_policy" + ], + "execution_count": null, + "outputs": [] + 
}, + { + "cell_type": "code", + "metadata": { + "id": "Kbd9S0-kWFH0" + }, + "source": [ + "elite_states = [1, 2, 3, 4, 2, 0, 2, 3, 1]\n", + "elite_actions = [0, 2, 4, 3, 2, 0, 1, 3, 3]\n", + "\n", + "new_policy = get_new_policy(elite_states, elite_actions)\n", + "\n", + "assert np.isfinite(new_policy).all(), \\\n", + " \"Your new policy contains NaNs or +-inf. Make sure you don't divide by zero.\"\n", + "assert np.all(new_policy >= 0), \\\n", + " \"Your new policy can't have negative action probabilities\"\n", + "assert np.allclose(new_policy.sum(axis=-1), 1), \\\n", + " \"Your new policy should be a valid probability distribution over actions\"\n", + "\n", + "reference_answer = np.array([\n", + " [1., 0., 0., 0., 0.],\n", + " [0.5, 0., 0., 0.5, 0.],\n", + " [0., 0.33333333, 0.66666667, 0., 0.],\n", + " [0., 0., 0., 0.5, 0.5]])\n", + "assert np.allclose(new_policy[:4, :5], reference_answer)\n", + "\n", + "print(\"Ok!\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c1OuPogqWFH0" + }, + "source": [ + "# Training loop\n", + "Generate sessions, select N best and fit to those." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EnUX8LZuWFH1" + }, + "source": [ + "from IPython.display import clear_output\n", + "\n", + "def show_progress(rewards_batch, log, percentile, reward_range=[-990, +10]):\n", + " \"\"\"\n", + " A convenience function that displays training progress. 
\n", + " No cool math here, just charts.\n", + " \"\"\"\n", + "\n", + " mean_reward = np.mean(rewards_batch)\n", + " threshold = np.percentile(rewards_batch, percentile)\n", + " log.append([mean_reward, threshold])\n", + " \n", + " plt.figure(figsize=[8, 4])\n", + " plt.subplot(1, 2, 1)\n", + " plt.plot(list(zip(*log))[0], label='Mean rewards')\n", + " plt.plot(list(zip(*log))[1], label='Reward thresholds')\n", + " plt.legend()\n", + " plt.grid()\n", + "\n", + " plt.subplot(1, 2, 2)\n", + " plt.hist(rewards_batch, range=reward_range)\n", + " plt.vlines([np.percentile(rewards_batch, percentile)],\n", + " [0], [100], label=\"percentile\", color='red')\n", + " plt.legend()\n", + " plt.grid()\n", + " clear_output(True)\n", + " print(\"mean reward = %.3f, threshold=%.3f\" % (mean_reward, threshold))\n", + " plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rjWzwcnkWFH1" + }, + "source": [ + "# reset policy just in case\n", + "policy = initialize_policy(n_states, n_actions)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "rO4I4c_RWFH1" + }, + "source": [ + "n_sessions = 250 # sample this many sessions\n", + "percentile = 50 # take this percent of session with highest rewards\n", + "learning_rate = 0.5 # how quickly the policy is updated, on a scale from 0 to 1\n", + "\n", + "log = []\n", + "\n", + "for i in range(100):\n", + " %time sessions = [ ]\n", + "\n", + " states_batch, actions_batch, rewards_batch = zip(*sessions)\n", + "\n", + " elite_states, elite_actions = \n", + "\n", + " new_policy = \n", + "\n", + " policy = learning_rate * new_policy + (1 - learning_rate) * policy\n", + "\n", + " # display results on the chart\n", + " show_progress(rewards_batch, log, percentile)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DFbElLtLWFH2" + }, + "source": [ + "### Reflecting on the 
results\n", + "\n", + "You may have noticed that the taxi problem quickly converges from less than -1000 to a near-optimal score and then descends back to -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n", + "\n", + "In case CEM fails to learn how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the \"elites\".\n", + "\n", + "To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate the strategy (theoretically correct way). For each starting state, you can sample an action randomly, and then evaluate this action by running _several_ games starting from it and averaging the total reward. Choosing elite sessions with this kind of sampling (where each session reward is counted as the average of the rewards of all sessions with the same starting state and action) should improve the performance of your policy." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M1yaM2oyWFH3" + }, + "source": [ + "### Submit to coursera" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8-gzJQyzWFH3" + }, + "source": [ + "from submit import submit_taxi\n", + "submit_taxi(generate_session, policy, 'your.email@example.com', 'YourAssignmentToken')" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/week1_intro/deep_crossentropy_method.ipynb b/week1_intro/deep_crossentropy_method.ipynb new file mode 100644 index 000000000..e41dccc69 --- /dev/null +++ b/week1_intro/deep_crossentropy_method.ipynb @@ -0,0 +1,468 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + }, + "colab": { + "name": "deep_crossentropy_method.ipynb", + "provenance": [], + "collapsed_sections": [] + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "3DQr095WiWhN" + }, + "source": [ + "# Deep Crossentropy method\n", + "\n", + "In this section we'll extend your CEM implementation with neural networks! You will train a multi-layer neural network to solve simple continuous state space games. 
__Please make sure you're done with tabular crossentropy method from the previous notebook.__\n", + "\n", + "![img](https://tip.duke.edu/independent_learning/greek/lesson/digging_deeper_final.jpg)\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4OMgbRtniWhV" + }, + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/coursera/grading.py -O ../grading.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/coursera/week1_intro/submit.py\n", + "\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display for drawing game images on.\n", + "# It won't have any effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "3JfSCwrdiWhW" + }, + "source": [ + "import gym\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# if you see \" has no attribute .env\", remove .env or update gym\n", + "env = gym.make(\"CartPole-v0\").env\n", + "\n", + "env.reset()\n", + "n_actions = env.action_space.n\n", + "state_dim = env.observation_space.shape[0]\n", + "\n", + "plt.imshow(env.render(\"rgb_array\"))\n", + "print(\"state vector dim =\", state_dim)\n", + "print(\"n_actions =\", n_actions)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AVH_vr6NiWhW" + }, + "source": [ + "# Neural Network Policy\n", + "\n", + "For this assignment we'll utilize the simplified neural network implementation from 
__[Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)__. Here's what you'll need:\n", + "\n", + "* `agent.partial_fit(states, actions)` - makes a single training pass over the data. Maximizes the probability of :actions: from :states:\n", + "* `agent.predict_proba(states)` - predicts probabilities of all actions, a matrix of shape __[len(states), n_actions]__\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "klwcW0L8iWhX" + }, + "source": [ + "from sklearn.neural_network import MLPClassifier\n", + "\n", + "agent = MLPClassifier(\n", + " hidden_layer_sizes=(20, 20),\n", + " activation='tanh',\n", + ")\n", + "\n", + "# initialize agent to the dimension of state space and a number of actions\n", + "agent.partial_fit([env.reset()] * n_actions, range(n_actions), range(n_actions))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "tHl1Z4XhiWhX" + }, + "source": [ + "def generate_session(env, agent, t_max=1000):\n", + " \"\"\"\n", + " Play a single game using agent neural network.\n", + " Terminate when game finishes or after :t_max: steps\n", + " \"\"\"\n", + " states, actions = [], []\n", + " total_reward = 0\n", + "\n", + " s = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " \n", + " # use agent to predict a vector of action probabilities for state :s:\n", + " probs = \n", + "\n", + " assert probs.shape == (env.action_space.n,), \"make sure that the probabilities are a vector (hint: np.reshape)\"\n", + " \n", + " # use the probabilities you predicted to pick an action\n", + " # sample proportionally to the probabilities, don't just take the most likely action\n", + " a = \n", + " # ^-- hint: try np.random.choice\n", + "\n", + " new_s, r, done, info = env.step(a)\n", + "\n", + " # record sessions like you did before\n", + " states.append(s)\n", + " actions.append(a)\n", + " total_reward += r\n", + "\n", + " s = new_s\n", + " if 
done:\n", + " break\n", + " return states, actions, total_reward" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "mBhJn6GuiWhZ" + }, + "source": [ + "dummy_states, dummy_actions, dummy_reward = generate_session(env, agent, t_max=5)\n", + "print(\"states:\", np.stack(dummy_states))\n", + "print(\"actions:\", dummy_actions)\n", + "print(\"reward:\", dummy_reward)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4nY61ynwiWhZ" + }, + "source": [ + "### CEM steps\n", + "Deep CEM uses exactly the same strategy as the regular CEM, so you can copy your function code from previous notebook.\n", + "\n", + "The only difference is that now each observation is not a number but a `float32` vector." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "irzlLE8-iWha" + }, + "source": [ + "def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):\n", + " \"\"\"\n", + " Select states and actions from games that have rewards >= percentile\n", + " :param states_batch: list of lists of states, states_batch[session_i][t]\n", + " :param actions_batch: list of lists of actions, actions_batch[session_i][t]\n", + " :param rewards_batch: list of rewards, rewards_batch[session_i]\n", + "\n", + " :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions\n", + "\n", + " Please return elite states and actions in their original order \n", + " [i.e. sorted by session number and timestep within session]\n", + "\n", + " If you are confused, see examples below. 
Please don't assume that states are integers\n", + " (their type will change later).\n", + " \"\"\"\n", + "\n", + " \n", + " \n", + " return elite_states, elite_actions" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xRxVyyv8iWha" + }, + "source": [ + "# Training loop\n", + "Generate sessions, select N best and fit to those." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BKII0ncFiWhb" + }, + "source": [ + "from IPython.display import clear_output\n", + "\n", + "def show_progress(rewards_batch, log, percentile, reward_range=[-990, +10]):\n", + " \"\"\"\n", + " A convenience function that displays training progress. \n", + " No cool math here, just charts.\n", + " \"\"\"\n", + "\n", + " mean_reward = np.mean(rewards_batch)\n", + " threshold = np.percentile(rewards_batch, percentile)\n", + " log.append([mean_reward, threshold])\n", + "\n", + " clear_output(True)\n", + " print(\"mean reward = %.3f, threshold=%.3f\" % (mean_reward, threshold))\n", + " plt.figure(figsize=[8, 4])\n", + " plt.subplot(1, 2, 1)\n", + " plt.plot(list(zip(*log))[0], label='Mean rewards')\n", + " plt.plot(list(zip(*log))[1], label='Reward thresholds')\n", + " plt.legend()\n", + " plt.grid()\n", + "\n", + " plt.subplot(1, 2, 2)\n", + " plt.hist(rewards_batch, range=reward_range)\n", + " plt.vlines([np.percentile(rewards_batch, percentile)],\n", + " [0], [100], label=\"percentile\", color='red')\n", + " plt.legend()\n", + " plt.grid()\n", + "\n", + " plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jejOIXVliWhb" + }, + "source": [ + "n_sessions = 100\n", + "percentile = 70\n", + "log = []\n", + "\n", + "for i in range(100):\n", + " # generate new sessions\n", + " sessions = [ ]\n", + "\n", + " states_batch, actions_batch, rewards_batch = map(np.array, zip(*sessions))\n", + "\n", + " elite_states, elite_actions = \n", + "\n", + " \n", + "\n", + " 
show_progress(rewards_batch, log, percentile, reward_range=[0, np.max(rewards_batch)])\n", + "\n", + " if np.mean(rewards_batch) > 190:\n", + " print(\"You Win! You may stop training now via KeyboardInterrupt.\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "svMc8z9viWhc" + }, + "source": [ + "# Results" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YkjWv7qKiWhc" + }, + "source": [ + "# Record sessions\n", + "\n", + "import gym.wrappers\n", + "\n", + "with gym.wrappers.Monitor(gym.make(\"CartPole-v0\"), directory=\"videos\", force=True) as env_monitor:\n", + " sessions = [generate_session(env_monitor, agent) for _ in range(100)]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "mStHKVLqiWhd" + }, + "source": [ + "# Show video. In some setups this may not work. If it doesn't\n", + "# work for you, you can download the videos and view them locally.\n", + "\n", + "from pathlib import Path\n", + "from IPython.display import HTML\n", + "\n", + "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(video_names[-1])) # You can also try other indices" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GnY7Pd1oiWhd" + }, + "source": [ + "## Assignment: MountainCar\n", + "\n", + "By this moment you should have got enough score on [CartPole-v0](https://gym.openai.com/envs/CartPole-v0) to consider it solved (see the link). 
It's time to try something harder.\n", + "\n", + "_if you have any trouble with CartPole-v0 and feel stuck, take a look at the forums_\n", + "\n", + "Your assignment is to obtain an average reward of __at least -150__ on `MountainCar-v0`.\n", + "\n", + "See the tips section below, it's kinda important.\n", + " \n", + "* Bonus quest: Devise a way to speed up training against the default version\n", + " * Obvious improvement: use [joblib](https://www.google.com/search?client=ubuntu&channel=fs&q=joblib&ie=utf-8&oe=utf-8)\n", + " * Try re-using samples from 3-5 last iterations when computing threshold and during training\n", + " * Experiment with an amount of training iterations and the learning rate of the neural network (see params)\n", + " \n", + " \n", + "### Tips\n", + "* Gym page: [MountainCar](https://gym.openai.com/envs/MountainCar-v0)\n", + "* Sessions for MountainCar may last for 10k+ ticks. Make sure ```t_max``` param is at least 10k.\n", + " * Also it may be a good idea to cut rewards via \">\" and not \">=\". If 90% of your sessions get reward of -10k and 10% are better, then if you use percentile 20% as the threshold, R >= threshold __fails to cut off bad sessions__ while R > threshold works alright.\n", + "* _issue with gym_: Some versions of gym limit game time to 200 ticks. This will prevent the training in most cases. Make sure your agent is able to play for the specified __t_max__, and if it isn't, try `env = gym.make(\"MountainCar-v0\").env` or otherwise get rid of the TimeLimit wrapper.\n", + "* If it won't train it's a good idea to plot reward distribution and record sessions: they may give you some clue. 
If they don't, call course staff :)\n", + "* 20-neuron network is probably not enough, feel free to experiment.\n", + "\n", + "You may find the following snippet useful:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "H9A0Z1_aiWhe", + "outputId": "e7d5cc4f-2191-422e-b884-06babeb6b9a4" + }, + "source": [ + "def visualize_mountain_car(env, agent):\n", + " # Compute policy for all possible x and v (with discretization)\n", + " xs = np.linspace(env.min_position, env.max_position, 100)\n", + " vs = np.linspace(-env.max_speed, env.max_speed, 100)\n", + " \n", + " grid = np.dstack(np.meshgrid(xs, vs[::-1])).transpose(1, 0, 2)\n", + " grid_flat = grid.reshape(len(xs) * len(vs), 2)\n", + " probs = agent.predict_proba(grid_flat).reshape(len(xs), len(vs), 3).transpose(1, 0, 2)\n", + "\n", + " # # The above code is equivalent to the following:\n", + " # probs = np.empty((len(vs), len(xs), 3))\n", + " # for i, v in enumerate(vs[::-1]):\n", + " # for j, x in enumerate(xs):\n", + " # probs[i, j, :] = agent.predict_proba([[x, v]])[0]\n", + "\n", + " # Draw policy\n", + " f, ax = plt.subplots(figsize=(7, 7))\n", + " ax.imshow(probs, extent=(env.min_position, env.max_position, -env.max_speed, env.max_speed), aspect='auto')\n", + " ax.set_title('Learned policy: red=left, green=nothing, blue=right')\n", + " ax.set_xlabel('position (x)')\n", + " ax.set_ylabel('velocity (v)')\n", + " \n", + " # Sample the trajectory and draw it\n", + " states, actions, _ = generate_session(env, agent)\n", + " states = np.array(states)\n", + " ax.plot(states[:, 0], states[:, 1], color='white')\n", + " \n", + " # Draw every 3rd action from the trajectory\n", + " for (x, v), a in zip(states[::3], actions[::3]):\n", + " if a == 0:\n", + " plt.arrow(x, v, -0.1, 0, color='white', head_length=0.02)\n", + " elif a == 2:\n", + " plt.arrow(x, v, 0.1, 0, color='white', head_length=0.02)\n", + "\n", + "with gym.make('MountainCar-v0').env as env:\n", + " visualize_mountain_car(env, agent_mountain_car)" 
+ ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BLbFIhgyiWhg" + }, + "source": [ + "# Implement generate_session_mountain_car(), training loop, etc.\n", + "\n", + "def generate_session_mountain_car(env, agent, t_max=10000):\n", + " \n", + " \n", + " return states, actions, total_reward\n", + "\n", + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T6MnwXZ5iWhg" + }, + "source": [ + "### Submit to Coursera" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZAmKPGLOiWhg" + }, + "source": [ + "from submit import submit_mountain_car\n", + "submit_mountain_car(generate_session_mountain_car, agent, 'your.email@example.com', 'YourAssignmentToken')" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/week1_intro/gym_interface.ipynb b/week1_intro/gym_interface.ipynb new file mode 100644 index 000000000..4656f3ca8 --- /dev/null +++ b/week1_intro/gym_interface.ipynb @@ -0,0 +1,249 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + }, + "colab": { + "name": "gym_interface.ipynb", + "provenance": [], + "collapsed_sections": [] + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "XCiI86ScJcep" + }, + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/coursera/grading.py -O ../grading.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/coursera/week1_intro/submit.py\n", + "\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display for drawing game images.\n", + "# It has no 
effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ggYvjt2RJcey" + }, + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TY5b59AkJcez" + }, + "source": [ + "### OpenAI Gym\n", + "\n", + "We're gonna spend several next weeks learning algorithms that solve decision processes. So we need a few interesting decision problems to test our algorithms.\n", + "\n", + "That's where OpenAI Gym comes into play. It's a Python library that wraps many classical decision problems, including robot control, videogames and board games.\n", + "\n", + "So here's how it works:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-DnbE2wYJcez" + }, + "source": [ + "import gym\n", + "\n", + "env = gym.make(\"MountainCar-v0\")\n", + "env.reset()\n", + "\n", + "plt.imshow(env.render('rgb_array'))\n", + "print(\"Observation space:\", env.observation_space)\n", + "print(\"Action space:\", env.action_space)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rbq7DyYYJce0" + }, + "source": [ + "Note: if you're running this on your local machine, you'll see a window popping up with the image above. Don't close it, just alt-tab away." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2EShsU_PJce0" + }, + "source": [ + "### Gym interface\n", + "\n", + "The three main methods of this environment are:\n", + "* `reset()`: resets an environment to the initial state, _return first observation_\n", + "* `render()`: shows the current environment state (a more colorful version :) )\n", + "* `step(a)`: commits an action `a` and returns `(new_observation, reward, is_done, info)`\n", + " * `new_observation`: an observation right after committing the action `a`\n", + " * `reward`: a number which represents your reward for committing action `a`\n", + " * `is_done`: True if the MDP has just finished, False if it is still in progress\n", + " * `info`: some auxiliary stuff about what just happened. For now, ignore it." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "CzeQku6LJce0" + }, + "source": [ + "obs0 = env.reset()\n", + "print(\"initial observation code:\", obs0)\n", + "\n", + "# Note: in MountainCar, an observation is just two numbers: car position and velocity" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "1IZVB7MQJce1" + }, + "source": [ + "print(\"taking action 2 (right)\")\n", + "new_obs, reward, is_done, _ = env.step(2)\n", + "\n", + "print(\"new observation code:\", new_obs)\n", + "print(\"reward:\", reward)\n", + "print(\"is game over?:\", is_done)\n", + "\n", + "# Note: as you can see, the car has moved slightly to the right (around 0.0005)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BxHtJWTlJce1" + }, + "source": [ + "### Play with it\n", + "\n", + "Below is the code that drives the car to the right. However, if you simply use the default policy, the car won't reach the flag at the far right due to the gravity.\n", + "\n", + "__Your task__ is to fix it. Find a strategy that reaches the flag. 
\n", + "\n", + "You are not required to build any sophisticated algorithms for now, and you definitely don't need to know any reinforcement learning for this. Feel free to hard-code :)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "NKG5uQ4lJce1" + }, + "source": [ + "from IPython import display\n", + "\n", + "# Create env manually to set time limit. Please don't change this.\n", + "TIME_LIMIT = 250\n", + "env = gym.wrappers.TimeLimit(\n", + " gym.envs.classic_control.MountainCarEnv(),\n", + " max_episode_steps=TIME_LIMIT + 1,\n", + ")\n", + "actions = {'left': 0, 'stop': 1, 'right': 2}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "deNJs8IeJce2" + }, + "source": [ + "def policy(obs, t):\n", + " # Write the code for your policy here. You can use the observation\n", + " # (a tuple of the position and the velocity), the current time step, or both,\n", + " # if you want.\n", + " position, velocity = obs\n", + " \n", + " # This is an example policy. You can try running it, but it won't work.\n", + " # Your goal is to fix that. You don't need anything sophisticated here,\n", + " # and you can hard-code any policy that seems to work.\n", + " # Hint: think how you would make a swing go faster and faster.\n", + " return actions['right']" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "bomTLtJIJce2" + }, + "source": [ + "plt.figure(figsize=(4, 3))\n", + "display.clear_output(wait=True)\n", + "\n", + "obs = env.reset()\n", + "for t in range(TIME_LIMIT):\n", + " plt.gca().clear()\n", + " \n", + " action = policy(obs, t) # Call your policy\n", + " obs, reward, done, _ = env.step(action) # Pass the action chosen by the policy to the environment\n", + " \n", + " # We won't do anything with reward here because MountainCar is a very simple environment,\n", + " # and reward is a constant -1. 
Therefore, your goal is to end the episode as quickly as possible.\n", + "\n", + " # Draw game image on display.\n", + " plt.imshow(env.render('rgb_array'))\n", + " \n", + " display.display(plt.gcf())\n", + " display.clear_output(wait=True)\n", + "\n", + " if done:\n", + " print(\"Well done!\")\n", + " break\n", + "else:\n", + " print(\"Time limit exceeded. Try again.\")\n", + "\n", + "display.clear_output(wait=True)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "o6sDccFIJce3" + }, + "source": [ + "from submit import submit_interface\n", + "submit_interface(policy, 'your.email@example.com', 'YourAssignmentToken')" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file