ucl-dark · Aidandos · Oct 24, 2023 · Oct 20, 2023 · Oct 23, 2023 · Oct 24, 2023
diff --git a/docs/getting-started/runners.md b/docs/getting-started/runners.md
@@ -1,9 +1,28 @@
-# Runner 
+# Runners 
+
+## Evo Runner
+
+The Evo Runner optimizes the first agent using evolutionary learning. 
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/cg/mfos.yaml) for an example of how to configure it.
+
+## Evo Runner N-Roles
+
+This runner extends the evo runner to `N > 2` agents by letting the first and second agent assume multiple roles that can be configured via `agent1_roles` and `agent2_roles` in the experiment configuration.
+Both agents receive different sets of memories for each role that they assume but share the weights.
+
+- For heterogeneous games, roles can be shuffled for each rollout using the `shuffle_players` flag.
+- Using the `self_play_anneal` flag one can anneal the self-play probability from 0 to 1 over the course of the experiment.
+
+See [this experiment](https://github.com/akbir/pax/blob/bb0e69ef71fd01ec9c85753814ffba3c5cb77935/pax/conf/experiment/rice/shaper_v_ppo.yaml) for an example of how to configure it.
+
+## Weight sharing Runner
+
+A simple baseline for MARL experiments is having one agent assume multiple roles and share the weights between them (but not the memory).
+In order for this approach to work, the observation vector needs to include one entry that indicates the role of the agent (see [Terry et al.](https://arxiv.org/abs/2005.13625v7)).
+
+See [this experiment](https://github.com/akbir/pax/blob/9d3fa62e34279a338c07cffcbf208edc8a95e7ba/pax/conf/experiment/rice/weight_sharing.yaml) for an example of how to configure it.
 
-## Runner 1
 
-Lorem ipsum.
 
-## Runner 2
 
-Lorem ipsum.
diff --git a/pax/agents/ppo/ppo.py b/pax/agents/ppo/ppo.py
@@ -16,7 +16,10 @@
     make_cournot_network,
     make_fishery_network,
     make_rice_sarl_network,
+    make_ipd_network,
 )
+from pax.envs.iterated_matrix_game import IteratedMatrixGame
+from pax.envs.iterated_tensor_game_n_player import IteratedTensorGameNPlayer
 from pax.envs.rice.c_rice import ClubRice
 from pax.envs.rice.rice import Rice
 from pax.envs.rice.sarl_rice import SarlRice
@@ -515,6 +518,11 @@ def make_agent(
         network = make_rice_sarl_network(action_spec, agent_args.hidden_size)
     elif args.runner == "sarl":
         network = make_sarl_network(action_spec)
+    elif args.env_id in [
+        IteratedMatrixGame.env_id,
+        IteratedTensorGameNPlayer.env_id,
+    ]:
+        network = make_ipd_network(action_spec, True, agent_args.hidden_size)
     else:
         raise NotImplementedError(
             f"No ppo network implemented for env {args.env_id}"

diff --git a/pax/conf/config.yaml b/pax/conf/config.yaml
@@ -30,6 +30,11 @@ agent1_roles: 1
 agent2_roles: 1 # Make agent 2 assume multiple roles in an n-player game
 agent2_reset_interval: 1 # Reset agent 2 every rollout
 
+# When True: runner_evo will replace the opponent with the agent itself
+# with a linearly increasing probability during training
+self_play_anneal: False
+
+
 # Logging setup
 wandb:
   entity: "ucl-dark"

diff --git a/pax/conf/experiment/c_rice/debug.yaml b/pax/conf/experiment/c_rice/debug.yaml
@@ -10,7 +10,7 @@ env_type: meta
 num_players: 6
 has_mediator: True
 config_folder: pax/envs/rice/5_regions
-runner: evo
+runner: evo_nroles
 
 # Training
 top_k: 5

diff --git a/pax/conf/experiment/c_rice/eval_mediator_gs_ppo.yaml b/pax/conf/experiment/c_rice/eval_mediator_gs_ppo.yaml
@@ -28,15 +28,32 @@ num_devices: 1
 num_steps: 10
 
 # Train to convergence
-agent2_reset_interval: 1000
-# Regular mediator
+agent2_reset_interval: 2000
+# Reward objective
 #run_path: chrismatix/c-rice/runs/3w7d59ug
 #model_path: exp/mediator/c_rice-mediator-gs-ppo-interval10_seed0/2023-10-09_17.00.59.872280/generation_1499
 
 # Climate objective
 run_path: chrismatix/c-rice/runs/ovss1ahd
 model_path: exp/mediator/c-rice-mediator-GS-PPO_memory-seed-0-climate-obj/2023-10-14_17.23.35.878225/generation_1499
 
+# 0.9 climate 0.1 reward
+#run_path: chrismatix/c-rice/runs/mmtc40ja
+#model_path: exp/mediator/c-rice-mediator-GS-PPO_memory-seed-0-c.9-u.1/2023-10-17_17.03.26.660387/generation_1499
+
+
+# 0.7 climate 0.3 reward
+#run_path: chrismatix/c-rice/runs/sdpc3s71
+#model_path: exp/mediator/c-rice-mediator-GS-PPO_memory-seed-0-c.7-u.3/2023-10-20_17.12.09.658666/generation_1499
+
+# 0.5 climate 0.5 reward
+#run_path: chrismatix/c-rice/runs/6wpuz6i2
+#model_path: exp/mediator/c-rice-mediator-GS-PPO_memory-seed-0-c.5-u.5/2023-10-20_15.48.04.605509/generation_1499
+
+# high reward
+#run_path: chrismatix/c-rice/runs/l4enoiku
+#model_path: exp/mediator/c-rice-mediator-GS-PPO_memory-seed-0/2023-10-02_18.01.15.434206/generation_1499
+
 # PPO agent parameters
 ppo_default:
   num_minibatches: 4

diff --git a/pax/conf/experiment/c_rice/marl_baseline.yaml b/pax/conf/experiment/c_rice/marl_baseline.yaml
@@ -9,7 +9,7 @@ env_type: meta
 num_players: 6
 has_mediator: True
 config_folder: pax/envs/rice/5_regions
-runner: evo
+runner: evo_nroles
 rice_v2_network: True
 
 # Training

diff --git a/pax/conf/experiment/c_rice/mediator_gs_ppo.yaml b/pax/conf/experiment/c_rice/mediator_gs_ppo.yaml
@@ -12,7 +12,7 @@ env_type: meta
 num_players: 6
 has_mediator: True
 config_folder: pax/envs/rice/5_regions
-runner: evo
+runner: evo_nroles
 rice_v2_network: True
 agent2_reset_interval: 10
 

diff --git a/pax/conf/experiment/c_rice/shaper_v_ppo.yaml b/pax/conf/experiment/c_rice/shaper_v_ppo.yaml
@@ -14,7 +14,7 @@ num_players: 5
 has_mediator: False
 shuffle_players: False
 config_folder: pax/envs/rice/5_regions
-runner: evo
+runner: evo_nroles
 rice_v2_network: True
 
 default_club_mitigation_rate: 0.1

diff --git a/pax/conf/experiment/cg/mfos.yaml b/pax/conf/experiment/cg/mfos.yaml
@@ -1,6 +1,6 @@
 # @package _global_
 
-# Agents  
+# Agents
 agent1: 'MFOS'
 agent2: 'PPO_memory'
 
@@ -11,24 +11,27 @@ egocentric: True
 env_discount: 0.96
 payoff: [[1, 1, -2], [1, 1, -2]]
 
-# Runner 
+# Runner
 runner: evo
 
+top_k: 4
+popsize: 1000 #512
 # env_batch_size = num_envs * num_opponents
 num_envs: 250
 num_opps: 1
 num_outer_steps: 600
-num_inner_steps: 16 
-save_interval: 100 
+num_inner_steps: 16
+save_interval: 100
+num_steps: '${num_inner_steps}'
 
-# Evaluation 
+# Evaluation
 run_path: ucl-dark/cg/12auc9um
 model_path: exp/sanity-PPO-vs-PPO-parity/run-seed-0/2022-09-08_20.04.17.155963/iteration_500
 
 # PPO agent parameters
-ppo:
+ppo1:
   num_minibatches: 8
-  num_epochs: 2 
+  num_epochs: 2
   gamma: 0.96
   gae_lambda: 0.95
   ppo_clipping_epsilon: 0.2
@@ -49,6 +52,52 @@ ppo:
   separate: True # only works with CNN
   hidden_size: 16 #50
 
+ppo2:
+  num_minibatches: 8
+  num_epochs: 2
+  gamma: 0.96
+  gae_lambda: 0.95
+  ppo_clipping_epsilon: 0.2
+  value_coeff: 0.5
+  clip_value: True
+  max_gradient_norm: 0.5
+  anneal_entropy: False
+  entropy_coeff_start: 0.1
+  entropy_coeff_horizon: 0.6e8
+  entropy_coeff_end: 0.005
+  lr_scheduling: False
+  learning_rate: 0.01 #0.05
+  adam_epsilon: 1e-5
+  with_memory: True
+  with_cnn: False
+  output_channels: 16
+  kernel_shape: [3, 3]
+  separate: True # only works with CNN
+  hidden_size: 16 #50
+
+# ES parameters
+es:
+  algo: OpenES        # [OpenES, CMA_ES]
+  sigma_init: 0.04    # Initial scale of isotropic Gaussian noise
+  sigma_decay: 0.999  # Multiplicative decay factor
+  sigma_limit: 0.01   # Smallest possible scale
+  init_min: 0.0       # Range of parameter mean initialization - Min
+  init_max: 0.0       # Range of parameter mean initialization - Max
+  clip_min: -1e10     # Range of parameter proposals - Min
+  clip_max: 1e10      # Range of parameter proposals - Max
+  lrate_init: 0.01    # Initial learning rate
+  lrate_decay: 0.9999 # Multiplicative decay factor
+  lrate_limit: 0.001  # Smallest possible lrate
+  beta_1: 0.99        # Adam - beta_1
+  beta_2: 0.999       # Adam - beta_2
+  eps: 1e-8           # eps constant
+  centered_rank: False # Fitness centered_rank
+  w_decay: 0           # Decay old elite fitness
+  maximise: True       # Maximise fitness
+  z_score: False       # Normalise fitness
+  mean_reduce: True    # Remove mean
+
+
 # Logging setup
 wandb:
   entity: "ucl-dark"

diff --git a/pax/conf/experiment/cg/tabular.yaml b/pax/conf/experiment/cg/tabular.yaml
@@ -1,6 +1,6 @@
 # @package _global_
 
-# Agents  
+# Agents
 agent1: 'Tabular'
 agent2: 'Random'
 
@@ -25,9 +25,32 @@ num_iters: 10000
 # train_batch_size = num_envs * num_opponents * num_steps
 
 # PPO agent parameters
-ppo:
+ppo1:
   num_minibatches: 8
-  num_epochs: 2 
+  num_epochs: 2
+  gamma: 0.96
+  gae_lambda: 0.95
+  ppo_clipping_epsilon: 0.2
+  value_coeff: 0.5
+  clip_value: True
+  max_gradient_norm: 0.5
+  anneal_entropy: True
+  entropy_coeff_start: 0.1
+  entropy_coeff_horizon: 0.6e8
+  entropy_coeff_end: 0.005
+  lr_scheduling: True
+  learning_rate: 0.01 #0.05
+  adam_epsilon: 1e-5
+  with_memory: True
+  with_cnn: False
+  output_channels: 16
+  kernel_shape: [3, 3]
+  separate: True # only works with CNN
+  hidden_size: 16 #50
+
+ppo2:
+  num_minibatches: 8
+  num_epochs: 2
   gamma: 0.96
   gae_lambda: 0.95
   ppo_clipping_epsilon: 0.2

diff --git a/pax/conf/experiment/cournot/eval_shaper_v_ppo.yaml b/pax/conf/experiment/cournot/eval_shaper_v_ppo.yaml
@@ -0,0 +1,80 @@
+# @package _global_
+
+# Agents
+agent1: 'PPO_memory'
+agent_default: 'PPO'
+
+# Environment
+env_id: Cournot
+env_type: meta
+a: 100
+b: 1
+marginal_cost: 10
+
+# Runner
+runner: evo_nroles
+
+# Training
+top_k: 5
+popsize: 1000
+num_envs: 4
+num_opps: 1
+num_outer_steps: 300
+num_inner_steps: 1 # One-shot game
+num_iters: 1000
+num_devices: 1
+num_steps: '${num_inner_steps}'
+
+
+# PPO agent parameters
+ppo_default:
+  num_minibatches: 4
+  num_epochs: 2
+  gamma: 0.96
+  gae_lambda: 0.95
+  ppo_clipping_epsilon: 0.2
+  value_coeff: 0.5
+  clip_value: True
+  max_gradient_norm: 0.5
+  anneal_entropy: False
+  entropy_coeff_start: 0.02
+  entropy_coeff_horizon: 2000000
+  entropy_coeff_end: 0.001
+  lr_scheduling: False
+  learning_rate: 1
+  adam_epsilon: 1e-5
+  with_memory: True
+  with_cnn: False
+  hidden_size: 16
+
+
+# ES parameters
+es:
+  algo: OpenES        # [OpenES, CMA_ES]
+  sigma_init: 0.04    # Initial scale of isotropic Gaussian noise
+  sigma_decay: 0.999  # Multiplicative decay factor
+  sigma_limit: 0.01   # Smallest possible scale
+  init_min: 0.0       # Range of parameter mean initialization - Min
+  init_max: 0.0       # Range of parameter mean initialization - Max
+  clip_min: -1e10     # Range of parameter proposals - Min
+  clip_max: 1e10      # Range of parameter proposals - Max
+  lrate_init: 0.01    # Initial learning rate
+  lrate_decay: 0.9999 # Multiplicative decay factor
+  lrate_limit: 0.001  # Smallest possible lrate
+  beta_1: 0.99        # Adam - beta_1
+  beta_2: 0.999       # Adam - beta_2
+  eps: 1e-8           # eps constant
+  centered_rank: False # Fitness centered_rank
+  w_decay: 0           # Decay old elite fitness
+  maximise: True       # Maximise fitness
+  z_score: False       # Normalise fitness
+  mean_reduce: True    # Remove mean
+
+# Logging setup
+wandb:
+  project: cournot
+  group: 'shaper'
+  name: 'cournot-SHAPER-${num_players}p-seed-${seed}'
+  log: True
+
+