diff --git a/.travis.yml b/.travis.yml
index 44884a8..b6bc993 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,11 +5,28 @@ python:
 notifications:
   email: false
 
+env:
+  global:
+  - DOCKER_IMAGE=araffin/rl-baselines-zoo-cpu:v2.8.0
+
 services:
   - docker
 
 install:
-  - docker pull araffin/rl-baselines-zoo-cpu
+  - docker pull ${DOCKER_IMAGE}
 
 script:
-  - docker run -it --rm --network host --ipc=host --mount src=$(pwd),target=/root/code/stable-baselines,type=bind araffin/rl-baselines-zoo-cpu bash -c "cd /root/code/stable-baselines/ && pip install --upgrade git+https://github.com/pfnet/optuna.git && python -m pytest --cov-config .coveragerc --cov-report term --cov=. -v tests/"
+  - ./scripts/run_tests_travis.sh "${TEST_GLOB}"
+
+jobs:
+  include:
+    # Split test suite to avoid exceeding travis limit
+    - stage: Test
+      name: "Unit Tests Train"
+      env: TEST_GLOB="train.py"
+
+    - name: "Unit Tests Enjoy"
+      env: TEST_GLOB="enjoy.py"
+
+    - name: "Unit Tests Hyperparams opt"
+      env: TEST_GLOB="hyperparams_opt.py"
diff --git a/README.md b/README.md
index 6fe9706..42bb212 100644
--- a/README.md
+++ b/README.md
@@ -62,14 +62,14 @@ mpirun -n 16 python train.py --algo trpo --env BreakoutNoFrameskip-v4
 
 We use [Optuna](https://optuna.org/) for optimizing the hyperparameters.
 
-Note: hyperparameters search is only implemented for PPO2/A2C/SAC/TRPO/DDPG for now.
+Note: hyperparameter search is not implemented for ACER and DQN for now.
 
 when using SuccessiveHalvingPruner ("halving"), you must specify `--n-jobs > 1`
 
 Budget of 1000 trials with a maximum of 50000 steps:
 
 ```
 python train.py --algo ppo2 --env MountainCar-v0 -n 50000 -optimize --n-trials 1000 --n-jobs 2 \
-  --sampler random --pruner median
+  --sampler tpe --pruner median
 ```
@@ -116,7 +116,7 @@ Additional Atari Games (to be completed):
 |----------|--------------|----------------|------------|--------------|--------------------------|
 | A2C | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | ACER | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | N/A | N/A |
-| ACKTR | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | N/A | N/A |
+| ACKTR | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | PPO2 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | DQN | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | N/A | N/A |
 | DDPG | N/A | N/A | N/A | :heavy_check_mark: | :heavy_check_mark: |
@@ -129,15 +129,15 @@
 | RL Algo | BipedalWalker-v2 | LunarLander-v2 | LunarLanderContinuous-v2 | BipedalWalkerHardcore-v2 | CarRacing-v0 |
 |----------|--------------|----------------|------------|--------------|--------------------------|
-| A2C | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
+| A2C | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
 | ACER | N/A | :heavy_check_mark: | N/A | N/A | N/A |
-| ACKTR | N/A | :heavy_check_mark: | N/A | N/A | N/A |
-| PPO2 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
+| ACKTR | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
+| PPO2 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | |
 | DQN | N/A | :heavy_check_mark: | N/A | N/A | N/A |
-| DDPG | :heavy_check_mark: | N/A | :heavy_check_mark: | | |
+| DDPG | :heavy_check_mark: | N/A | :heavy_check_mark: | | |
 | SAC | :heavy_check_mark: | N/A | :heavy_check_mark: | :heavy_check_mark: | |
-| TD3 | | N/A | :heavy_check_mark: | | |
-| TRPO | | :heavy_check_mark: | :heavy_check_mark: | | |
+| TD3 | :heavy_check_mark: | N/A | :heavy_check_mark: | | |
+| TRPO | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
 
 ### PyBullet Environments
 
@@ -149,6 +149,7 @@ Note: those environments are derived from [Roboschool](https://github.com/openai
 | RL Algo | Walker2D | HalfCheetah | Ant | Reacher | Hopper | Humanoid |
 |----------|-----------|-------------|-----|---------|---------|----------|
 | A2C | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | |
+| ACKTR | | :heavy_check_mark: | | | | |
 | PPO2 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | DDPG | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | |
 | SAC | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
@@ -160,6 +161,7 @@ PyBullet Envs (Continued)
 | RL Algo | Minitaur | MinitaurDuck | InvertedDoublePendulum | InvertedPendulumSwingup |
 |----------|-----------|-------------|-----|---------|
 | A2C | | | | |
+| ACKTR | | | | |
 | PPO2 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | DDPG | | | | |
 | SAC | | | :heavy_check_mark: | :heavy_check_mark: |
@@ -209,11 +211,11 @@ You can train agents online using [colab notebook](https://colab.research.google
 
 ### Stable-Baselines PyPi Package
 
-Min version: stable-baselines >= 2.7.0
+Min version: stable-baselines[mpi] >= 2.8.0
 
 ```
 apt-get install swig cmake libopenmpi-dev zlib1g-dev ffmpeg
-pip install stable-baselines box2d box2d-kengz pyyaml pybullet optuna pytablewriter scikit-optimize
+pip install stable-baselines[mpi] box2d box2d-kengz pyyaml pybullet optuna pytablewriter scikit-optimize
 ```
 
 Please see [Stable Baselines README](https://github.com/hill-a/stable-baselines) for alternatives.
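Note on the new README defaults above: `--sampler tpe --pruner median` maps onto Optuna's TPE sampler combined with median pruning. A minimal sketch of that pairing using Optuna's public API (the toy objective below is a stand-in for illustration only; the zoo builds its real objective around agent training in `utils/hyperparams_opt.py`):

```python
# Sketch of the sampler/pruner pair now used by default (TPE + median pruning).
# Only the Optuna classes are taken from the library; the objective is hypothetical.
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner


def toy_objective(trial):
    # The zoo's real objective trains an agent and reports intermediate
    # evaluation rewards so that MedianPruner can stop unpromising trials early.
    lr = trial.suggest_loguniform('lr', 1e-5, 1)
    return (lr - 0.01) ** 2


study = optuna.create_study(sampler=TPESampler(), pruner=MedianPruner())
study.optimize(toy_objective, n_trials=10)
print(study.best_params)
```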
diff --git a/benchmark.md b/benchmark.md
index 8fdf98d..4edc701 100644
--- a/benchmark.md
+++ b/benchmark.md
@@ -34,12 +34,18 @@
 |acer |SpaceInvadersNoFrameskip-v4 | 542.556| 172.332| 150374| 133|
 |acktr|Acrobot-v1 | -91.284| 32.515| 149959| 1625|
 |acktr|BeamRiderNoFrameskip-v4 | 3760.976| 1826.059| 147414| 41|
+|acktr|BipedalWalker-v2 | 292.419| 54.373| 149881| 216|
+|acktr|BipedalWalkerHardcore-v2 | 44.796| 113.898| 149216| 129|
 |acktr|BreakoutNoFrameskip-v4 | 448.514| 88.882| 143118| 37|
 |acktr|CartPole-v1 | 487.573| 63.866| 149685| 307|
 |acktr|EnduroNoFrameskip-v4 | 0.000| 0.000| 149574| 45|
+|acktr|HalfCheetahBulletEnv-v0 | 2535.255| 110.368| 150000| 150|
 |acktr|LunarLander-v2 | 96.822| 64.020| 149905| 176|
+|acktr|LunarLanderContinuous-v2 | 239.953| 58.406| 149825| 480|
 |acktr|MountainCar-v0 | -111.917| 21.422| 149969| 1340|
+|acktr|MountainCarContinuous-v0 | 93.779| 0.115| 149993| 2265|
 |acktr|MsPacmanNoFrameskip-v4 | 1598.776| 264.338| 149588| 147|
+|acktr|Pendulum-v0 | -213.831| 137.857| 150000| 750|
 |acktr|PongNoFrameskip-v4 | 19.224| 3.697| 147753| 67|
 |acktr|QbertNoFrameskip-v4 | 9569.575| 3980.468| 150896| 106|
 |acktr|SeaquestNoFrameskip-v4 | 1672.239| 105.092| 149148| 67|
@@ -104,6 +110,7 @@
 |sac |ReacherBulletEnv-v0 | 17.529| 9.860| 150000| 1000|
 |sac |Walker2DBulletEnv-v0 | 2052.646| 13.631| 150000| 150|
 |td3 |AntBulletEnv-v0 | 3269.021| 60.697| 150000| 150|
+|td3 |BipedalWalker-v2 | 308.793| 23.750| 149713| 228|
 |td3 |HalfCheetahBulletEnv-v0 | 3160.318| 15.284| 150000| 150|
 |td3 |HopperBulletEnv-v0 | 2743.910| 20.159| 150000| 150|
 |td3 |HumanoidBulletEnv-v0 | 1638.081| 801.594| 149453| 182|
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index fdcf11d..620cb96 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -22,7 +22,7 @@ RUN \
     pip install pytest-cov && \
     pip install pyyaml && \
     pip install box2d-py==2.3.5 && \
-    pip install stable-baselines && \
+    pip install stable-baselines[mpi]==2.8.0 && \
     pip install pybullet && \
     pip install gym-minigrid && \
     pip install scikit-optimize && \
diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu
index cbed120..47fd9b3 100644
--- a/docker/Dockerfile.gpu
+++ b/docker/Dockerfile.gpu
@@ -22,7 +22,7 @@ RUN \
     pip install pyyaml && \
     pip install box2d-py==2.3.5 && \
    pip install tensorflow-gpu==1.8.0 && \
-    pip install stable-baselines && \
+    pip install stable-baselines[mpi]==2.8.0 && \
     pip install pybullet && \
     pip install gym-minigrid && \
     pip install scikit-optimize && \
diff --git a/enjoy.py b/enjoy.py
index 7b8e421..d0d4062 100644
--- a/enjoy.py
+++ b/enjoy.py
@@ -73,10 +73,18 @@ def main():
     else:
         log_path = os.path.join(folder, algo)
 
-    model_path = "{}/{}.pkl".format(log_path, env_id)
 
     assert os.path.isdir(log_path), "The {} folder was not found".format(log_path)
-    assert os.path.isfile(model_path), "No model found for {} on {}, path: {}".format(algo, env_id, model_path)
+
+    found = False
+    for ext in ['pkl', 'zip']:
+        model_path = "{}/{}.{}".format(log_path, env_id, ext)
+        found = os.path.isfile(model_path)
+        if found:
+            break
+
+    if not found:
+        raise ValueError("No model found for {} on {}, path: {}".format(algo, env_id, model_path))
 
     if algo in ['dqn', 'ddpg', 'sac', 'td3']:
         args.n_envs = 1
diff --git a/hyperparams/acktr.yml b/hyperparams/acktr.yml
index f22435e..3e605af 100644
--- a/hyperparams/acktr.yml
+++ b/hyperparams/acktr.yml
@@ -32,3 +32,105 @@ Acrobot-v1:
   n_timesteps: !!float 5e5
   policy: 'MlpPolicy'
   ent_coef: 0.0
+
+Pendulum-v0:
+  n_envs: 4
+  n_timesteps: !!float 2e6
+  policy: 'MlpPolicy'
+  ent_coef: 0.0
+  gamma: 0.99
+  n_steps: 16
+  learning_rate: 0.06
+  lr_schedule: 'constant'
+
+LunarLanderContinuous-v2:
+  normalize: true
+  n_envs: 8
+  n_timesteps: !!float 5e6
+  policy: 'MlpPolicy'
+  gamma: 0.99
+  n_steps: 16
+  ent_coef: 0.0
+  learning_rate: 0.06
+  lr_schedule: 'constant'
+
+MountainCarContinuous-v0:
+  normalize: true
+  n_envs: 16
+  n_timesteps: !!float 3e5
+  policy: 'MlpPolicy'
+  ent_coef: 0.0
+
+# Tuned
+HalfCheetahBulletEnv-v0:
+  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  normalize: True
+  n_envs: 1
+  n_timesteps: !!float 2e6
+  policy: 'MlpPolicy'
+  ent_coef: 0.0
+  lr_schedule: 'constant'
+  learning_rate: 0.0217
+  n_steps: 128
+  nprocs: 4
+  max_grad_norm: 0.5
+  gamma: 0.98
+  vf_coef: 0.946
+
+# TO BE tuned
+Walker2DBulletEnv-v0:
+  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  normalize: True
+  n_envs: 1
+  n_timesteps: !!float 2e6
+  policy: 'MlpPolicy'
+  ent_coef: 0.0
+  # lr_schedule: 'constant'
+  # learning_rate: 0.0217
+  n_steps: 128
+  nprocs: 4
+  gamma: 0.99
+  vf_coef: 0.946
+
+
+HalfCheetah-v2:
+  env_wrapper: utils.wrappers.TimeFeatureWrapper
+  normalize: True
+  n_envs: 1
+  n_timesteps: !!float 1e6
+  policy: 'MlpPolicy'
+  ent_coef: 0.0
+  lr_schedule: 'constant'
+  learning_rate: 0.2
+  n_steps: 2048
+  nprocs: 4
+  max_grad_norm: 10
+  gamma: 0.99
+  vf_coef: 0.5
+  policy_kwargs: "dict(net_arch=[256, 256])"
+
+# Tuned
+BipedalWalkerHardcore-v2:
+  normalize: true
+  n_envs: 8
+  n_timesteps: !!float 10e7
+  policy: 'MlpPolicy'
+  ent_coef: 0.000125
+  lr_schedule: 'constant'
+  learning_rate: 0.0675
+  n_steps: 16
+  gamma: 0.9999
+  vf_coef: 0.51
+
+# Tuned
+BipedalWalker-v2:
+  normalize: true
+  n_envs: 8
+  n_timesteps: !!float 5e6
+  policy: 'MlpPolicy'
+  ent_coef: 0.0
+  lr_schedule: 'constant'
+  learning_rate: 0.298
+  n_steps: 32
+  gamma: 0.98
+  vf_coef: 0.38
diff --git a/hyperparams/td3.yml b/hyperparams/td3.yml
index f877ab2..77ec58b 100644
--- a/hyperparams/td3.yml
+++ b/hyperparams/td3.yml
@@ -49,6 +49,20 @@ HalfCheetahBulletEnv-v0:
   gradient_steps: 1000
   policy_kwargs: "dict(layers=[400, 300])"
 
+BipedalWalker-v2:
+  n_timesteps: !!float 2e6
+  policy: 'MlpPolicy'
+  gamma: 0.99
+  buffer_size: 1000000
+  noise_type: 'normal'
+  noise_std: 0.1
+  learning_starts: 10000
+  batch_size: 100
+  learning_rate: !!float 1e-3
+  train_freq: 1000
+  gradient_steps: 1000
+  policy_kwargs: "dict(layers=[400, 300])"
+
 # To be tuned
 BipedalWalkerHardcore-v2:
   n_timesteps: !!float 5e7
@@ -59,7 +73,7 @@ BipedalWalkerHardcore-v2:
   noise_std: 0.2
   learning_starts: 10000
   batch_size: 100
-  learning_rate: 1e-3
+  learning_rate: !!float 1e-3
   train_freq: 1000
   gradient_steps: 1000
   policy_kwargs: "dict(layers=[400, 300])"
diff --git a/run_docker_cpu.sh b/run_docker_cpu.sh
index 902eecd..4001969 100755
--- a/run_docker_cpu.sh
+++ b/run_docker_cpu.sh
@@ -8,5 +8,5 @@
 echo $cmd_line
 
 docker run -it --rm --network host --ipc=host \
-  --mount src=$(pwd),target=/root/code/stable-baselines,type=bind araffin/rl-baselines-zoo-cpu\
+  --mount src=$(pwd),target=/root/code/stable-baselines,type=bind araffin/rl-baselines-zoo-cpu:v2.8.0\
   bash -c "cd /root/code/stable-baselines/ && $cmd_line"
diff --git a/run_docker_gpu.sh b/run_docker_gpu.sh
old mode 100644
new mode 100755
index ec6a1bb..f288224
--- a/run_docker_gpu.sh
+++ b/run_docker_gpu.sh
@@ -8,5 +8,5 @@
 echo $cmd_line
 
 docker run -it --runtime=nvidia --rm --network host --ipc=host \
-  --mount src=$(pwd),target=/root/code/stable-baselines,type=bind araffin/rl-baselines-zoo\
+  --mount src=$(pwd),target=/root/code/stable-baselines,type=bind araffin/rl-baselines-zoo:v2.8.0\
   bash -c "cd /root/code/stable-baselines/ && $cmd_line"
diff --git a/scripts/run_tests_travis.sh b/scripts/run_tests_travis.sh
new file mode 100755
index 0000000..4f01313
--- /dev/null
+++ b/scripts/run_tests_travis.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+DOCKER_CMD="docker run -it --rm --network host --ipc=host --mount src=$(pwd),target=/root/code/stable-baselines,type=bind"
+BASH_CMD="cd /root/code/stable-baselines/"
+
+if [[ $# -ne 1 ]]; then
+  echo "usage: $0 "
+  exit 1
+fi
+
+if [[ ${DOCKER_IMAGE} = "" ]]; then
+  echo "Need DOCKER_IMAGE environment variable to be set."
+  exit 1
+fi
+
+TEST_GLOB=$1
+
+set -e  # exit immediately on any error
+
+
+${DOCKER_CMD} ${DOCKER_IMAGE} \
+  bash -c "${BASH_CMD} && \
+    python -m pytest --cov-config .coveragerc --cov-report term --cov=. -v tests/test_${TEST_GLOB}"
diff --git a/tests/test_hyperparams_opt.py b/tests/test_hyperparams_opt.py
index 096c1a5..d251901 100644
--- a/tests/test_hyperparams_opt.py
+++ b/tests/test_hyperparams_opt.py
@@ -13,9 +13,9 @@ def _assert_eq(left, right):
 N_TRIALS = 2
 N_JOBS = 1
 
-ALGOS = ('ppo2', 'a2c', 'trpo')
+ALGOS = ('ppo2', 'a2c', 'trpo', 'acktr')
 # Not yet supported:
-# ALGOS = ('acer', 'acktr', 'dqn')
+# ALGOS = ('acer', 'dqn')
 ENV_IDS = ('CartPole-v1',)
 LOG_FOLDER = 'logs/tests_optimize/'
@@ -29,6 +29,8 @@ def _assert_eq(left, right):
 experiments['ddpg-MountainCarContinuous-v0'] = ('ddpg', 'MountainCarContinuous-v0')
 # Test for SAC
 experiments['sac-Pendulum-v0'] = ('sac', 'Pendulum-v0')
+# Test for TD3
+experiments['td3-Pendulum-v0'] = ('td3', 'Pendulum-v0')
 
 # Clean up
 if os.path.isdir(LOG_FOLDER):
diff --git a/train.py b/train.py
index 1fd7d34..27b3187 100644
--- a/train.py
+++ b/train.py
@@ -51,9 +51,9 @@
                         help='Run hyperparameters search')
     parser.add_argument('--n-jobs', help='Number of parallel jobs when optimizing hyperparameters', type=int, default=1)
     parser.add_argument('--sampler', help='Sampler to use when optimizing hyperparameters', type=str,
-                        default='skopt', choices=['random', 'tpe', 'skopt'])
+                        default='tpe', choices=['random', 'tpe', 'skopt'])
     parser.add_argument('--pruner', help='Pruner to use when optimizing hyperparameters', type=str,
-                        default='none', choices=['halving', 'median', 'none'])
+                        default='median', choices=['halving', 'median', 'none'])
     parser.add_argument('--verbose', help='Verbose mode (0: no output, 1: INFO)', default=1, type=int)
     parser.add_argument('--gym-packages', type=str, nargs='+', default=[], help='Additional external Gym environemnt package modules to import (e.g. gym_minigrid)')
diff --git a/trained_agents/acktr/BipedalWalker-v2.zip b/trained_agents/acktr/BipedalWalker-v2.zip
new file mode 100644
index 0000000..1cee58f
Binary files /dev/null and b/trained_agents/acktr/BipedalWalker-v2.zip differ
diff --git a/trained_agents/acktr/BipedalWalker-v2/config.yml b/trained_agents/acktr/BipedalWalker-v2/config.yml
new file mode 100644
index 0000000..22cb148
--- /dev/null
+++ b/trained_agents/acktr/BipedalWalker-v2/config.yml
@@ -0,0 +1,11 @@
+!!python/object/apply:collections.OrderedDict
+- - [ent_coef, 0.0]
+  - [gamma, 0.98]
+  - [learning_rate, 0.298]
+  - [lr_schedule, constant]
+  - [n_envs, 8]
+  - [n_steps, 32]
+  - [n_timesteps, 5000000.0]
+  - [normalize, true]
+  - [policy, MlpPolicy]
+  - [vf_coef, 0.38]
diff --git a/trained_agents/acktr/BipedalWalker-v2/obs_rms.pkl b/trained_agents/acktr/BipedalWalker-v2/obs_rms.pkl
new file mode 100644
index 0000000..9cb6bc1
Binary files /dev/null and b/trained_agents/acktr/BipedalWalker-v2/obs_rms.pkl differ
diff --git a/trained_agents/acktr/BipedalWalker-v2/ret_rms.pkl b/trained_agents/acktr/BipedalWalker-v2/ret_rms.pkl
new file mode 100644
index 0000000..611c4e0
Binary files /dev/null and b/trained_agents/acktr/BipedalWalker-v2/ret_rms.pkl differ
diff --git a/trained_agents/acktr/BipedalWalkerHardcore-v2.zip b/trained_agents/acktr/BipedalWalkerHardcore-v2.zip
new file mode 100644
index 0000000..057e2b7
Binary files /dev/null and b/trained_agents/acktr/BipedalWalkerHardcore-v2.zip differ
diff --git a/trained_agents/acktr/BipedalWalkerHardcore-v2/config.yml b/trained_agents/acktr/BipedalWalkerHardcore-v2/config.yml
new file mode 100644
index 0000000..c83e60f
--- /dev/null
+++ b/trained_agents/acktr/BipedalWalkerHardcore-v2/config.yml
@@ -0,0 +1,11 @@
+!!python/object/apply:collections.OrderedDict
+- - [ent_coef, 0.000125]
+  - [gamma, 0.9999]
+  - [learning_rate, 0.0675]
+  - [lr_schedule, constant]
+  - [n_envs, 8]
+  - [n_steps, 16]
+  - [n_timesteps, 100000000.0]
+  - [normalize, true]
+  - [policy, MlpPolicy]
+  - [vf_coef, 0.51]
diff --git a/trained_agents/acktr/BipedalWalkerHardcore-v2/obs_rms.pkl b/trained_agents/acktr/BipedalWalkerHardcore-v2/obs_rms.pkl
new file mode 100644
index 0000000..e5064df
Binary files /dev/null and b/trained_agents/acktr/BipedalWalkerHardcore-v2/obs_rms.pkl differ
diff --git a/trained_agents/acktr/BipedalWalkerHardcore-v2/ret_rms.pkl b/trained_agents/acktr/BipedalWalkerHardcore-v2/ret_rms.pkl
new file mode 100644
index 0000000..1210735
Binary files /dev/null and b/trained_agents/acktr/BipedalWalkerHardcore-v2/ret_rms.pkl differ
diff --git a/trained_agents/acktr/HalfCheetahBulletEnv-v0.zip b/trained_agents/acktr/HalfCheetahBulletEnv-v0.zip
new file mode 100644
index 0000000..f2f0b34
Binary files /dev/null and b/trained_agents/acktr/HalfCheetahBulletEnv-v0.zip differ
diff --git a/trained_agents/acktr/HalfCheetahBulletEnv-v0/config.yml b/trained_agents/acktr/HalfCheetahBulletEnv-v0/config.yml
new file mode 100644
index 0000000..ea8d97c
--- /dev/null
+++ b/trained_agents/acktr/HalfCheetahBulletEnv-v0/config.yml
@@ -0,0 +1,14 @@
+!!python/object/apply:collections.OrderedDict
+- - [ent_coef, 0.0]
+  - [env_wrapper, utils.wrappers.TimeFeatureWrapper]
+  - [gamma, 0.98]
+  - [learning_rate, 0.0217]
+  - [lr_schedule, constant]
+  - [max_grad_norm, 0.5]
+  - [n_envs, 1]
+  - [n_steps, 128]
+  - [n_timesteps, 2000000.0]
+  - [normalize, true]
+  - [nprocs, 4]
+  - [policy, MlpPolicy]
+  - [vf_coef, 0.946]
diff --git a/trained_agents/acktr/HalfCheetahBulletEnv-v0/obs_rms.pkl b/trained_agents/acktr/HalfCheetahBulletEnv-v0/obs_rms.pkl
new file mode 100644
index 0000000..030680a
Binary files /dev/null and b/trained_agents/acktr/HalfCheetahBulletEnv-v0/obs_rms.pkl differ
diff --git a/trained_agents/acktr/HalfCheetahBulletEnv-v0/ret_rms.pkl b/trained_agents/acktr/HalfCheetahBulletEnv-v0/ret_rms.pkl
new file mode 100644
index 0000000..d499aaf
Binary files /dev/null and b/trained_agents/acktr/HalfCheetahBulletEnv-v0/ret_rms.pkl differ
diff --git a/trained_agents/acktr/LunarLanderContinuous-v2.zip b/trained_agents/acktr/LunarLanderContinuous-v2.zip
new file mode 100644
index 0000000..8a81eaf
Binary files /dev/null and b/trained_agents/acktr/LunarLanderContinuous-v2.zip differ
diff --git a/trained_agents/acktr/LunarLanderContinuous-v2/config.yml b/trained_agents/acktr/LunarLanderContinuous-v2/config.yml
new file mode 100644
index 0000000..ea412d2
--- /dev/null
+++ b/trained_agents/acktr/LunarLanderContinuous-v2/config.yml
@@ -0,0 +1,10 @@
+!!python/object/apply:collections.OrderedDict
+- - [ent_coef, 0.0]
+  - [gamma, 0.99]
+  - [learning_rate, 0.06]
+  - [lr_schedule, constant]
+  - [n_envs, 8]
+  - [n_steps, 16]
+  - [n_timesteps, 5000000.0]
+  - [normalize, true]
+  - [policy, MlpPolicy]
diff --git a/trained_agents/acktr/LunarLanderContinuous-v2/obs_rms.pkl b/trained_agents/acktr/LunarLanderContinuous-v2/obs_rms.pkl
new file mode 100644
index 0000000..69efc9c
Binary files /dev/null and b/trained_agents/acktr/LunarLanderContinuous-v2/obs_rms.pkl differ
diff --git a/trained_agents/acktr/LunarLanderContinuous-v2/ret_rms.pkl b/trained_agents/acktr/LunarLanderContinuous-v2/ret_rms.pkl
new file mode 100644
index 0000000..33301f7
Binary files /dev/null and b/trained_agents/acktr/LunarLanderContinuous-v2/ret_rms.pkl differ
diff --git a/trained_agents/acktr/MountainCarContinuous-v0.zip b/trained_agents/acktr/MountainCarContinuous-v0.zip
new file mode 100644
index 0000000..8443d85
Binary files /dev/null and b/trained_agents/acktr/MountainCarContinuous-v0.zip differ
diff --git a/trained_agents/acktr/MountainCarContinuous-v0/config.yml b/trained_agents/acktr/MountainCarContinuous-v0/config.yml
new file mode 100644
index 0000000..705df27
--- /dev/null
+++ b/trained_agents/acktr/MountainCarContinuous-v0/config.yml
@@ -0,0 +1,6 @@
+!!python/object/apply:collections.OrderedDict
+- - [ent_coef, 0.0]
+  - [n_envs, 16]
+  - [n_timesteps, 1000000.0]
+  - [normalize, true]
+  - [policy, MlpPolicy]
diff --git a/trained_agents/acktr/MountainCarContinuous-v0/obs_rms.pkl b/trained_agents/acktr/MountainCarContinuous-v0/obs_rms.pkl
new file mode 100644
index 0000000..a67bd39
Binary files /dev/null and b/trained_agents/acktr/MountainCarContinuous-v0/obs_rms.pkl differ
diff --git a/trained_agents/acktr/MountainCarContinuous-v0/ret_rms.pkl b/trained_agents/acktr/MountainCarContinuous-v0/ret_rms.pkl
new file mode 100644
index 0000000..a37c739
Binary files /dev/null and b/trained_agents/acktr/MountainCarContinuous-v0/ret_rms.pkl differ
diff --git a/trained_agents/acktr/Pendulum-v0.zip b/trained_agents/acktr/Pendulum-v0.zip
new file mode 100644
index 0000000..1698a51
Binary files /dev/null and b/trained_agents/acktr/Pendulum-v0.zip differ
diff --git a/trained_agents/acktr/Pendulum-v0/config.yml b/trained_agents/acktr/Pendulum-v0/config.yml
new file mode 100644
index 0000000..6b40814
--- /dev/null
+++ b/trained_agents/acktr/Pendulum-v0/config.yml
@@ -0,0 +1,9 @@
+!!python/object/apply:collections.OrderedDict
+- - [ent_coef, 0.0]
+  - [gamma, 0.99]
+  - [learning_rate, 0.06]
+  - [lr_schedule, constant]
+  - [n_envs, 4]
+  - [n_steps, 16]
+  - [n_timesteps, 2000000.0]
+  - [policy, MlpPolicy]
diff --git a/trained_agents/td3/BipedalWalker-v2.zip b/trained_agents/td3/BipedalWalker-v2.zip
new file mode 100644
index 0000000..3cca4d6
Binary files /dev/null and b/trained_agents/td3/BipedalWalker-v2.zip differ
diff --git a/trained_agents/td3/BipedalWalker-v2/config.yml b/trained_agents/td3/BipedalWalker-v2/config.yml
new file mode 100644
index 0000000..418e84b
--- /dev/null
+++ b/trained_agents/td3/BipedalWalker-v2/config.yml
@@ -0,0 +1,13 @@
+!!python/object/apply:collections.OrderedDict
+- - [batch_size, 100]
+  - [buffer_size, 1000000]
+  - [gamma, 0.99]
+  - [gradient_steps, 1000]
+  - [learning_rate, 0.001]
+  - [learning_starts, 10000]
+  - [n_timesteps, 2000000.0]
+  - [noise_std, 0.1]
+  - [noise_type, normal]
+  - [policy, MlpPolicy]
+  - [policy_kwargs, 'dict(layers=[400, 300])']
+  - [train_freq, 1000]
diff --git a/utils/benchmark.py b/utils/benchmark.py
index f1f4b2c..4603f07 100644
--- a/utils/benchmark.py
+++ b/utils/benchmark.py
@@ -45,7 +45,7 @@
     algo, env_id = trained_models[trained_model]
     n_envs = args.n_envs
     n_timesteps = args.n_timesteps
 
-    if algo in ['dqn', 'ddpg', 'sac']:
+    if algo in ['dqn', 'ddpg', 'sac', 'td3']:
         n_envs = 1
         n_timesteps *= args.n_envs
diff --git a/utils/hyperparams_opt.py b/utils/hyperparams_opt.py
index 386c40a..758ca71 100644
--- a/utils/hyperparams_opt.py
+++ b/utils/hyperparams_opt.py
@@ -5,7 +5,7 @@
 from optuna.pruners import SuccessiveHalvingPruner, MedianPruner
 from optuna.samplers import RandomSampler, TPESampler
 from optuna.integration.skopt import SkoptSampler
-from stable_baselines import SAC, DDPG
+from stable_baselines import SAC, DDPG, TD3
 from stable_baselines.ddpg import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise
 from stable_baselines.common.vec_env import VecNormalize, VecEnv
 from stable_baselines.her import HERGoalEnvWrapper
@@ -77,8 +77,8 @@ def objective(trial):
         if algo == 'her':
             trial.model_class = hyperparams['model_class']
 
-        # Hack to use DDPG sampler
-        if algo == 'ddpg' or trial.model_class == 'ddpg':
+        # Hack to use DDPG/TD3 noise sampler
+        if algo in ['ddpg', 'td3'] or trial.model_class in ['ddpg', 'td3']:
             trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
         kwargs.update(algo_sampler(trial))
@@ -237,7 +237,7 @@ def sample_a2c_params(trial):
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
-    n_steps = trial.suggest_categorical('n_steps', [5, 16, 32, 64, 128, 256])
+    n_steps = trial.suggest_categorical('n_steps', [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    ent_coef = trial.suggest_loguniform('ent_coef', 0.00000001, 0.1)
@@ -254,6 +254,29 @@
        'vf_coef': vf_coef
    }
 
+def sample_acktr_params(trial):
+    """
+    Sampler for ACKTR hyperparams.
+
+    :param trial: (optuna.trial)
+    :return: (dict)
+    """
+    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+    n_steps = trial.suggest_categorical('n_steps', [16, 32, 64, 128, 256, 512, 1024, 2048])
+    lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
+    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
+    ent_coef = trial.suggest_loguniform('ent_coef', 0.00000001, 0.1)
+    vf_coef = trial.suggest_uniform('vf_coef', 0, 1)
+
+    return {
+        'n_steps': n_steps,
+        'gamma': gamma,
+        'learning_rate': learning_rate,
+        'lr_schedule': lr_schedule,
+        'ent_coef': ent_coef,
+        'vf_coef': vf_coef
+    }
+
 
 def sample_sac_params(trial):
     """
@@ -270,8 +293,7 @@
     train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 300])
     # gradient_steps takes too much time
     # gradient_steps = trial.suggest_categorical('gradient_steps', [1, 100, 300])
-    # gradient_steps = 1
-    gradient_steps = trial.suggest_categorical('gradient_steps', [1, 2, 5])
+    gradient_steps = train_freq
     ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.5, 0.1, 0.05, 0.01, 0.0001])
     target_entropy = 'auto'
@@ -290,6 +312,39 @@
         'target_entropy': target_entropy
     }
 
+def sample_td3_params(trial):
+    """
+    Sampler for TD3 hyperparams.
+
+    :param trial: (optuna.trial)
+    :return: (dict)
+    """
+    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
+    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
+    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
+    buffer_size = trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)])
+    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 1000, 2000])
+    gradient_steps = train_freq
+    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
+    noise_std = trial.suggest_uniform('noise_std', 0, 1)
+
+    hyperparams = {
+        'gamma': gamma,
+        'learning_rate': learning_rate,
+        'batch_size': batch_size,
+        'buffer_size': buffer_size,
+        'train_freq': train_freq,
+        'gradient_steps': gradient_steps,
+    }
+
+    if noise_type == 'normal':
+        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
+                                                        sigma=noise_std * np.ones(trial.n_actions))
+    elif noise_type == 'ornstein-uhlenbeck':
+        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
+                                                                   sigma=noise_std * np.ones(trial.n_actions))
+
+    return hyperparams
 
 def sample_trpo_params(trial):
     """
@@ -375,6 +430,8 @@ def sample_her_params(trial):
         hyperparams = sample_sac_params(trial)
     elif trial.model_class == DDPG:
         hyperparams = sample_ddpg_params(trial)
+    elif trial.model_class == TD3:
+        hyperparams = sample_td3_params(trial)
 
     hyperparams['random_exploration'] = trial.suggest_uniform('random_exploration', 0, 1)
     hyperparams['n_sampled_goal'] = trial.suggest_categorical('n_sampled_goal', [1, 2, 4, 6, 8])
@@ -388,5 +445,7 @@
     'a2c': sample_a2c_params,
     'trpo': sample_trpo_params,
     'ddpg': sample_ddpg_params,
-    'her': sample_her_params
+    'her': sample_her_params,
+    'acktr': sample_acktr_params,
+    'td3': sample_td3_params
 }
diff --git a/utils/utils.py b/utils/utils.py
index e193eed..419ea18 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -251,10 +251,11 @@ def get_trained_models(log_folder):
     algos = os.listdir(log_folder)
     trained_models = {}
     for algo in algos:
-        for env_id in glob.glob('{}/{}/*.pkl'.format(log_folder, algo)):
-            # Retrieve env name
-            env_id = env_id.split('/')[-1].split('.pkl')[0]
-            trained_models['{}-{}'.format(algo, env_id)] = (algo, env_id)
+        for ext in ['zip', 'pkl']:
+            for env_id in glob.glob('{}/{}/*.{}'.format(log_folder, algo, ext)):
+                # Retrieve env name
+                env_id = env_id.split('/')[-1].split('.{}'.format(ext))[0]
+                trained_models['{}-{}'.format(algo, env_id)] = (algo, env_id)
     return trained_models
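The `enjoy.py` and `utils/utils.py` hunks implement the same idea: look for the `.zip` save format introduced in stable-baselines 2.8.0 first, then fall back to the legacy `.pkl`. A self-contained sketch of that lookup (hypothetical helper, not part of the zoo):

```python
import os


def find_saved_model(log_path, env_id):
    """Return the path of a saved agent, preferring the .zip format
    used by stable-baselines >= 2.8.0 over the legacy .pkl format."""
    for ext in ('zip', 'pkl'):
        candidate = os.path.join(log_path, '{}.{}'.format(env_id, ext))
        if os.path.isfile(candidate):
            return candidate
    raise ValueError('No model found for {} in {}'.format(env_id, log_path))


# Example: find_saved_model('trained_agents/acktr', 'BipedalWalker-v2')
```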
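Each new pretrained agent ships as a `.zip` model plus `obs_rms.pkl`/`ret_rms.pkl` statistics for `VecNormalize`. Loading one by hand looks roughly like the sketch below (normally `enjoy.py --algo acktr --env BipedalWalker-v2` does this for you; the stable-baselines 2.8.0 calls are assumptions based on its documented API, not taken from this diff):

```python
import gym
from stable_baselines import ACKTR
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# Wrap the env the same way it was wrapped during training (normalize: true).
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])
env = VecNormalize(env, training=False, norm_reward=False)
env.load_running_average('trained_agents/acktr/BipedalWalker-v2')  # reads obs_rms.pkl / ret_rms.pkl

model = ACKTR.load('trained_agents/acktr/BipedalWalker-v2.zip')

obs = env.reset()
action, _states = model.predict(obs, deterministic=True)
obs, reward, done, info = env.step(action)
```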