
Commit

Merge branch 'tune-bullets-env' of github.com:araffin/rl-baselines-zoo into tune-bullets-env
araffin committed Jan 17, 2019
2 parents 2874d24 + 5de5b28 commit 3b71b78
Showing 7 changed files with 24 additions and 20 deletions.
benchmark.md: 2 changes (1 addition & 1 deletion)
@@ -85,7 +85,7 @@
 |sac |HumanoidBulletEnv-v0               | 2048.187| 829.776| 149886|  172|
 |sac |InvertedDoublePendulumBulletEnv-v0 | 9357.406|   0.504| 150000|  150|
 |sac |InvertedPendulumSwingupBulletEnv-v0|  891.508|   0.963| 150000|  150|
-|sac |LunarLanderContinuous-v2           |  194.191| 100.631| 149699|  304|
+|sac |LunarLanderContinuous-v2           |  269.783|  57.077| 149852|  709|
 |sac |Pendulum-v0                        | -159.669|  86.665| 150000|  750|
 |sac |ReacherBulletEnv-v0                |   17.529|   9.860| 150000| 1000|
 |sac |Walker2DBulletEnv-v0               | 2052.646|  13.631| 150000|  150|
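
(Per benchmark.md's header, which lies outside this hunk, the columns are the algorithm, the environment id, the mean and standard deviation of the episode reward, and the number of evaluation timesteps and episodes.) A minimal sketch of how such summary statistics are computed from per-episode returns; the `episode_rewards` values below are made up for illustration:

import numpy as np

# Hypothetical per-episode returns from an evaluation run.
episode_rewards = [260.1, 301.4, 247.9]

# The benchmark reports the mean and standard deviation across episodes.
mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
print('{:.3f} +/- {:.3f} over {} episodes'.format(
    mean_reward, std_reward, len(episode_rewards)))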
enjoy.py: 5 changes (4 additions & 1 deletion)
@@ -71,7 +71,10 @@
                       should_render=not args.no_render,
                       hyperparams=hyperparams)

-model = ALGOS[algo].load(model_path)
+# ACER raises errors because the environment passed must have
+# the same number of environments as the model was trained on.
+load_env = None if algo == 'acer' else env
+model = ALGOS[algo].load(model_path, env=load_env)

 obs = env.reset()

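For context, the loaded model is then run in the environment. A minimal sketch of the loop that follows this hunk, assuming the standard stable-baselines API (the step count and the `deterministic` flag are illustrative, not the script's exact values):

# `model` and `env` come from the code above; the rest is a sketch.
obs = env.reset()
for _ in range(1000):
    # predict() returns the action and the recurrent policy state
    # (None for feed-forward policies).
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, infos = env.step(action)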
hyperparams/ddpg.yml: 3 changes (3 additions & 0 deletions)
@@ -19,6 +19,7 @@ Pendulum-v0:
   noise_std: 0.1
   memory_limit: 50000

+# To be tuned
 BipedalWalker-v2:
   n_timesteps: !!float 5e6
   policy: 'LnMlpPolicy'
@@ -27,6 +28,7 @@ BipedalWalker-v2:
   noise_std: 0.2
   memory_limit: 50000

+# To be tuned
 Walker2DBulletEnv-v0:
   n_timesteps: !!float 2e6
   policy: 'LnMlpPolicy'
@@ -36,6 +38,7 @@ Walker2DBulletEnv-v0:
   batch_size: 64
   normalize_observations: True

+# To be tuned
 HalfCheetahBulletEnv-v0:
   n_timesteps: !!float 2e6
   policy: 'LnMlpPolicy'
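These YAML files are read by the zoo's training script rather than passed verbatim to the algorithm constructors. A rough sketch of loading one entry; popping `n_timesteps` mirrors what a training script would plausibly do and is an assumption, as is the exact handling of the noise keys:

import yaml

# Sketch: parse one entry of hyperparams/ddpg.yml. Keys such as
# noise_type and noise_std are turned into action-noise objects by the
# zoo's train.py; they are not raw DDPG constructor arguments.
with open('hyperparams/ddpg.yml') as f:
    all_hyperparams = yaml.safe_load(f)

params = all_hyperparams['BipedalWalker-v2']
n_timesteps = int(params.pop('n_timesteps'))  # parsed from '!!float 5e6'
print(params['policy'], n_timesteps)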
hyperparams/ppo2.yml: 1 change (1 addition & 0 deletions)
@@ -220,6 +220,7 @@ MinitaurBulletDuckEnv-v0:
   learning_rate: 2.5e-4
   cliprange: 0.2

+# To be tuned
 HumanoidBulletEnv-v0:
   normalize: true
   n_envs: 8
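In the zoo, `normalize: true` corresponds to wrapping the vectorized environment with stable-baselines' VecNormalize. A minimal sketch of what this entry roughly expands to; the exact keyword arguments the zoo passes are an assumption:

import gym
import pybullet_envs  # noqa: F401 -- registers the Bullet envs with gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# Sketch: 8 parallel envs (n_envs: 8) with observation/reward
# normalization, as implied by `normalize: true`.
env = DummyVecEnv([lambda: gym.make('HumanoidBulletEnv-v0') for _ in range(8)])
env = VecNormalize(env, norm_obs=True, norm_reward=True)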
hyperparams/sac.yml: 26 changes (13 additions & 13 deletions)
@@ -5,24 +5,20 @@ MountainCarContinuous-v0:
   learning_rate: lin_3e-4
   buffer_size: 1000000
   batch_size: 64
-  ent_coef: 0.01
+  ent_coef: 'auto'
   train_freq: 1
   gradient_steps: 1
   learning_starts: 10000

 Pendulum-v0:
   n_timesteps: !!float 60000
   policy: 'MlpPolicy'
-  ent_coef: 0.2
   learning_starts: 1000

 LunarLanderContinuous-v2:
-  n_timesteps: !!float 1e5
+  n_timesteps: !!float 5e5
   policy: 'MlpPolicy'
-  learning_rate: !!float 3e-3
-  buffer_size: 50000
-  batch_size: 32
-  ent_coef: 0.2
+  batch_size: 256
   learning_starts: 1000

 BipedalWalker-v2:
@@ -53,7 +49,7 @@ HalfCheetahBulletEnv-v0:
   learning_rate: lin_3e-4
   buffer_size: 1000000
   batch_size: 64
-  ent_coef: 0.01
+  ent_coef: 'auto'
   train_freq: 1
   gradient_steps: 1
   learning_starts: 10000
@@ -103,12 +99,13 @@ ReacherBulletEnv-v0:
   learning_starts: 1000

 HumanoidBulletEnv-v0:
+  normalize: "{'norm_obs': True, 'norm_reward': False}"
   n_timesteps: !!float 2e7
   policy: 'CustomSACPolicy'
   learning_rate: lin_3e-4
   buffer_size: 1000000
   batch_size: 64
-  ent_coef: 0.01
+  ent_coef: 'auto'
   train_freq: 1
   gradient_steps: 1
   learning_starts: 1000
@@ -135,26 +132,29 @@ InvertedPendulumSwingupBulletEnv-v0:
   gradient_steps: 1
   learning_starts: 1000

+# To be tuned
 MinitaurBulletEnv-v0:
   normalize: "{'norm_obs': True, 'norm_reward': False}"
   n_timesteps: !!float 1e6
   policy: 'CustomSACPolicy'
   learning_rate: lin_3e-4
   buffer_size: 1000000
-  batch_size: 256
-  ent_coef: 0.05
+  batch_size: 64
+  ent_coef: 'auto'
+  # ent_coef: 0.0003
   train_freq: 1
   gradient_steps: 1
   learning_starts: 1000

+# To be tuned
 MinitaurBulletDuckEnv-v0:
-  normalize: "{'norm_obs': True, 'norm_reward': False}"
+  # normalize: "{'norm_obs': True, 'norm_reward': False}"
   n_timesteps: !!float 1e6
   policy: 'CustomSACPolicy'
   learning_rate: lin_3e-4
   buffer_size: 1000000
   batch_size: 256
-  ent_coef: 0.05
+  ent_coef: 'auto'
   train_freq: 1
   gradient_steps: 1
   learning_starts: 1000
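
The recurring change in this file replaces fixed entropy coefficients (0.01, 0.2, 0.05) with 'auto', which makes SAC learn the entropy coefficient during training; `lin_3e-4` is the zoo's shorthand for a learning rate linearly annealed from 3e-4. A minimal sketch of the resulting call for the LunarLander entry, trained outside the zoo's scripts, so the environment setup is illustrative:

import gym
from stable_baselines import SAC

# Sketch: ent_coef='auto' lets SAC tune the entropy coefficient itself
# instead of using a fixed value such as 0.01 or 0.2.
env = gym.make('LunarLanderContinuous-v2')
model = SAC('MlpPolicy', env, ent_coef='auto', batch_size=256,
            learning_starts=1000, verbose=1)
model.learn(total_timesteps=500000)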
Binary file modified trained_agents/sac/LunarLanderContinuous-v2.pkl
trained_agents/sac/LunarLanderContinuous-v2/config.yml: 7 changes (2 additions & 5 deletions)
@@ -1,8 +1,5 @@
 !!python/object/apply:collections.OrderedDict
-- - [batch_size, 32]
-  - [buffer_size, 50000]
-  - [ent_coef, 0.2]
-  - [learning_rate, 0.003]
+- - [batch_size, 256]
   - [learning_starts, 1000]
-  - [n_timesteps, 100000.0]
+  - [n_timesteps, 500000.0]
   - [policy, MlpPolicy]
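
This config is serialized with a Python-specific tag, so `yaml.safe_load` will refuse it. A minimal sketch of reading it back; the loader choice is an assumption:

import yaml

# Sketch: config.yml stores the hyperparameters as an OrderedDict via
# the !!python/object/apply tag, which requires the unsafe Loader.
path = 'trained_agents/sac/LunarLanderContinuous-v2/config.yml'
with open(path) as f:
    config = yaml.load(f, Loader=yaml.Loader)
print(dict(config))  # e.g. {'batch_size': 256, ..., 'n_timesteps': 500000.0}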
