Add lola #11

Open · wants to merge 36 commits into base: main
Commits (36)
2d5ed96
begin adding centralized learning
newtonkwan Jun 27, 2022
ec3ce6a
first commit. begin adding centralized training for LOLA
newtonkwan Jul 5, 2022
2664eea
add base lola
newtonkwan Jul 5, 2022
461f772
Merge branch 'main' into add_lola
newtonkwan Jul 5, 2022
bcb4833
add centralized learner
newtonkwan Jul 5, 2022
39de443
resolve merge conflict. add centralized learning
newtonkwan Jul 5, 2022
6c17d02
add lola machinery to experiments.py
newtonkwan Jul 5, 2022
8fab167
fix entropy annealing
newtonkwan Jul 6, 2022
21dc4fd
fix done condition in additional rollout step in PPO
newtonkwan Jul 7, 2022
6fe1d02
minor changes to lola
newtonkwan Jul 8, 2022
ac3a7f1
merge main with add_lola
newtonkwan Jul 12, 2022
b409692
minor bug fix
newtonkwan Jul 12, 2022
8f42170
add changes to buffer
newtonkwan Jul 13, 2022
acc514a
merge recent main updates Merge branch 'main' into add_lola
newtonkwan Jul 13, 2022
fce83a4
update confs
newtonkwan Jul 15, 2022
d1be0c5
add naive learner
newtonkwan Jul 19, 2022
dbe82c3
pull changes from main
newtonkwan Jul 19, 2022
a855c4b
lazy commit. committing to add naive learner PR
newtonkwan Jul 22, 2022
f11dce8
merge main
newtonkwan Jul 22, 2022
bb7b03b
add logic for lola (still debugging)
newtonkwan Jul 27, 2022
c169c75
add lola (doesn't quite work yet)
newtonkwan Jul 27, 2022
1a00280
compiling lola...
newtonkwan Jul 28, 2022
be33bc8
working lola
newtonkwan Jul 28, 2022
3cedf4e
update configs
newtonkwan Jul 28, 2022
b37aa91
tidy up
newtonkwan Jul 28, 2022
a2eb9e2
pull in main
newtonkwan Jul 29, 2022
99c7906
add working lola with new runner using lax.scan
newtonkwan Jul 29, 2022
dbb3937
tidy up watchers, fix naive learner, LOLA getting exploited hard ....
newtonkwan Jul 30, 2022
11b98c6
tidy up watchers, fix naive learner, LOLA getting exploited hard ....
newtonkwan Jul 30, 2022
aeb6426
lola compiles, move TrainingState to utils
newtonkwan Aug 1, 2022
c9cd40c
latest lola
newtonkwan Aug 1, 2022
dd29be0
fix axis
newtonkwan Aug 1, 2022
78c8196
fix axis
newtonkwan Aug 1, 2022
c4b2e72
similar lola
newtonkwan Aug 1, 2022
2f30284
half working lola
newtonkwan Aug 1, 2022
4ad1b76
temporary lola
newtonkwan Aug 2, 2022
64 changes: 64 additions & 0 deletions pax/centralized_learners.py
@@ -0,0 +1,64 @@
from typing import Callable, List

from dm_env import TimeStep
import jax.numpy as jnp


class CentralizedLearners:
"""Interface for a set of batched agents to work with environment
Performs centralized training"""

def __init__(self, agents: list):
self.num_agents: int = len(agents)
self.agents: list = agents

def select_action(self, timesteps: List[TimeStep]) -> List[jnp.ndarray]:
assert len(timesteps) == self.num_agents
return [
agent.select_action(t) for agent, t in zip(self.agents, timesteps)
]

def in_lookahead(self, env):
"""Simulates a rollout and gradient update"""
counter = 0
for agent in self.agents:
# All other agents in a list
# i.e. if i am agent2, then other_agents=[agent1, agent3, agent4 ...]
other_agents = self.agents[:counter] + self.agents[counter + 1 :]
agent.in_lookahead(env, other_agents)
counter += 1

def out_lookahead(self, env):
"""Performs a real rollout and update"""
counter = 0
for agent in self.agents:
# All other agents in a list
# i.e. if i am agent2, then other_agents=[agent1, agent3, agent4 ...]
other_agents = self.agents[:counter] + self.agents[counter + 1 :]
agent.out_lookahead(env, other_agents)
counter += 1

    # TODO: Obsolete at the moment. This can be folded into the LOLA agent.
def update(
self,
old_timesteps: List[TimeStep],
actions: List[jnp.ndarray],
timesteps: List[TimeStep],
) -> None:
counter = 0
for agent, t, action, t_1 in zip(
self.agents, old_timesteps, actions, timesteps
):
# All other agents in a list
# i.e. if i am agent2, then other_agents=[agent1, agent3, agent4 ...]
other_agents = self.agents[:counter] + self.agents[counter + 1 :]
agent.update(t, action, t_1, other_agents)
counter += 1

def log(self, metrics: List[Callable]) -> None:
for metric, agent in zip(metrics, self.agents):
metric(agent)

def eval(self, set_flag: bool) -> None:
for agent in self.agents:
agent.eval = set_flag
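
For context, here is a minimal sketch of how the `CentralizedLearners` wrapper is driven. `DummyAgent` and `DummyEnv` are stand-ins invented for this example; the real LOLA agent and environment constructors live elsewhere in the repo.

# Illustrative sketch only; DummyAgent/DummyEnv are hypothetical stand-ins
# for the repo's actual agent and environment classes.
from pax.centralized_learners import CentralizedLearners


class DummyAgent:
    """Minimal agent exposing the interface CentralizedLearners expects."""

    def in_lookahead(self, env, other_agents):
        pass  # simulate the opponents' updates (inner step)

    def out_lookahead(self, env, other_agents):
        pass  # update own parameters for real (outer step)


class DummyEnv:
    pass


env = DummyEnv()
agents = CentralizedLearners([DummyAgent(), DummyAgent()])

for _ in range(10):  # training episodes
    agents.in_lookahead(env)   # each agent differentiates through the
    agents.out_lookahead(env)  # opponents' simulated updates, then steps

agents.eval(True)  # put every wrapped agent into evaluation mode
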
79 changes: 47 additions & 32 deletions pax/conf/config.yaml
@@ -9,77 +9,92 @@ hydra:
level: INFO

# Global variables
seed: 0
seed: 25
save_dir: "./exp/${wandb.group}/${wandb.name}"
debug: False

# Agents
agent1: 'Hyper'
agent2: 'NaiveLearnerEx'
agent1: 'LOLA'
agent2: 'LOLA'

# Environment
env_id: ipd
game: ipd
env_type: infinite
env_type: finite
env_discount: 0.96
payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]]
payoff: [[-1,-1], [-3,0], [0,-3], [-2,-2]]
centralized: True

# Training hyperparameters
num_envs: 4000
num_steps: 100 # number of steps per episode
total_timesteps: 1.6e9
eval_every: 0.4e9 # timesteps for update
num_envs: 128
num_steps: 150 # number of steps per episode
total_timesteps: 1_000_000
eval_every: 100_000 # eval every n episodes, not timesteps


# Useful information
# num_episodes = total_timesteps / (num_steps * num_envs)
# num_updates = num_episodes / eval_every
# batch_size = num_envs * num_steps

# DQN agent parameters
dqn:
batch_size: 256
discount: 0.99
learning_rate: 1e-2
epsilon: 0.5
replay_capacity: 100000
min_replay_size: 1000
sgd_period: 1
target_update_period: 4

# PPO agent parameters
ppo:
num_minibatches: 1
num_minibatches: 10
num_epochs: 4
gamma: 0.96
gae_lambda: 0.99
gae_lambda: 0.95
ppo_clipping_epsilon: 0.2
value_coeff: 0.5
clip_value: True
max_gradient_norm: 0.5
anneal_entropy: False
entropy_coeff_start: 0.01
entropy_coeff_horizon: 0.8e9
entropy_coeff_end: 0.001
lr_scheduling: False
learning_rate: 4e-3
anneal_entropy: True
entropy_coeff_start: 0.2
entropy_coeff_horizon: 5_000_000
  # to anneal to the end value halfway through training, the horizon should be (1/2) * (total_timesteps / num_envs)
entropy_coeff_end: 0.01
lr_scheduling: True
learning_rate: 2.5e-3
adam_epsilon: 1e-5
adam_eps_root: 0.
with_memory: False

# Naive Learner parameters
naive:
lr: 1.0
num_minibatches: 1
num_epochs: 1
gamma: 0.96
gae_lambda: 0.95
max_gradient_norm: 1
lr_scheduling: False
learning_rate: 1
adam_epsilon: 1e-5

# LOLA agent parameters
# lola:
# ...
lola:
use_baseline: True
adam_epsilon: 1e-5
lr_in: 0.3
lr_out: 0.2
gamma: 0.96
num_lookaheads: 1

# Logging setup
wandb:
entity: "ucl-dark"
project: ipd
group: 'MFOS-${agent1}-vs-${agent2}-${game}'
group: 'LOLA-vs-${agent2}-${game}'
name: run-seed-${seed}
log: True
log: False


# DQN agent parameters
dqn:
batch_size: 256
discount: 0.99
learning_rate: 1e-2
epsilon: 0.5
replay_capacity: 100000
min_replay_size: 1000
sgd_period: 1
target_update_period: 4
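
As a quick sanity check of the "Useful information" comments above, here is a minimal sketch of the derived quantities, assuming the new values introduced in this diff:

# Back-of-the-envelope check of the bookkeeping comments in config.yaml,
# using the values introduced in this diff.
num_envs = 128
num_steps = 150          # steps per episode
total_timesteps = 1_000_000

batch_size = num_envs * num_steps                        # 19_200 transitions per episode
num_episodes = total_timesteps / (num_steps * num_envs)  # ~52 episodes

print(batch_size, num_episodes)
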
2 changes: 1 addition & 1 deletion pax/conf/experiment/debug.yaml
@@ -4,5 +4,5 @@ debug: true

wandb:
group: debug
log: true
log: False

43 changes: 43 additions & 0 deletions pax/conf/experiment/lola.yaml
@@ -0,0 +1,43 @@
# @package _global_

# Agents
agent1: 'LOLA'
agent2: 'LOLA'
centralized: True

# Environment
env_id: ipd
game: ipd
env_type: finite
env_discount: 0.96
payoff: [[-1,-1], [-3,0], [0,-3], [-2,-2]]


# Training hyperparameters
num_envs: 128
num_steps: 150 # number of steps per episode
total_timesteps: 4_000_000
eval_every: 4_000_000 # timesteps

# Useful information
# num_episodes = total_timesteps / num_steps
# num_updates = num_episodes / eval_every
# batch_size = num_envs * num_steps

# LOLA agent parameters
lola:
use_baseline: False
adam_epsilon: 1e-5
lr_in: 0.3
lr_out: 0.2
lr_value: 0.1
gamma: 0.96
num_lookaheads: 0

# Logging setup
wandb:
entity: "ucl-dark"
project: ipd
group: 'LOLA-vs-${agent2}-${game}'
name: run-seed-${seed}-${lola.num_lookaheads}-lookaheads
log: True
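
To make the roles of `lr_in`, `lr_out`, and `num_lookaheads` above concrete, the following is a minimal JAX sketch of a LOLA-style lookahead update on a toy differentiable game. It illustrates the general technique, not this repo's agent: the toy losses and `lola_step` function are invented for the example, and `num_lookaheads` is set to 1 here for illustration (the experiment config above uses 0, which would skip the inner step).

# Illustrative LOLA-style lookahead on a toy two-player differentiable game.
import jax
import jax.numpy as jnp

lr_in, lr_out = 0.3, 0.2  # values from lola.yaml
num_lookaheads = 1        # illustrative; lola.yaml uses 0


def loss1(theta1, theta2):
    # Toy smooth game standing in for the IPD policy objective.
    return jnp.sum(theta1 * theta2) + 0.5 * jnp.sum(theta1 ** 2)


def loss2(theta1, theta2):
    return -jnp.sum(theta1 * theta2) + 0.5 * jnp.sum(theta2 ** 2)


def lola_step(theta1, theta2):
    def outer_loss(t1):
        # Inner lookahead: simulate the opponent's naive gradient step(s)
        # with learning rate lr_in, keeping the dependence on t1.
        t2 = theta2
        for _ in range(num_lookaheads):
            t2 = t2 - lr_in * jax.grad(loss2, argnums=1)(t1, t2)
        return loss1(t1, t2)

    # Outer update: differentiate through the opponent's simulated update.
    return theta1 - lr_out * jax.grad(outer_loss)(theta1)


theta1 = jnp.array([0.5, -0.2])
theta2 = jnp.array([0.1, 0.3])
print(lola_step(theta1, theta2))
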
2 changes: 2 additions & 0 deletions pax/env.py
@@ -53,6 +53,8 @@ def step(
return self.reset()
action_1, action_2 = actions
self._num_steps += 1
# print("action_1.shape", action_1.shape)
# print("action_1", action_1)
assert action_1.shape == action_2.shape
assert action_1.shape == (self.num_envs,)
