
Add lola #11

Open · wants to merge 36 commits into main

Changes from 1 commit · 36 commits
2d5ed96
begin adding centralized learning
newtonkwan Jun 27, 2022
ec3ce6a
first commit. begin adding centralized training for LOLA
newtonkwan Jul 5, 2022
2664eea
add base lola
newtonkwan Jul 5, 2022
461f772
Merge branch 'main' into add_lola
newtonkwan Jul 5, 2022
bcb4833
add centralized learner
newtonkwan Jul 5, 2022
39de443
resolve merge conflict. add centralized learning
newtonkwan Jul 5, 2022
6c17d02
add lola machinery to experiments.py
newtonkwan Jul 5, 2022
8fab167
fix entropy annealing
newtonkwan Jul 6, 2022
21dc4fd
fix done condition in additional rollout step in PPO
newtonkwan Jul 7, 2022
6fe1d02
minor changes to lola
newtonkwan Jul 8, 2022
ac3a7f1
merge main with add_lola
newtonkwan Jul 12, 2022
b409692
minor bug fix
newtonkwan Jul 12, 2022
8f42170
add changes to buffer
newtonkwan Jul 13, 2022
acc514a
merge recent main updates Merge branch 'main' into add_lola
newtonkwan Jul 13, 2022
fce83a4
update confs
newtonkwan Jul 15, 2022
d1be0c5
add naive learner
newtonkwan Jul 19, 2022
dbe82c3
pull changes from main
newtonkwan Jul 19, 2022
a855c4b
lazy commit. committing to add naive learner PR
newtonkwan Jul 22, 2022
f11dce8
merge main
newtonkwan Jul 22, 2022
bb7b03b
add logic for lola (still debugging)
newtonkwan Jul 27, 2022
c169c75
add lola (doesn't quite work yet)
newtonkwan Jul 27, 2022
1a00280
compiling lola...
newtonkwan Jul 28, 2022
be33bc8
working lola
newtonkwan Jul 28, 2022
3cedf4e
update configs
newtonkwan Jul 28, 2022
b37aa91
tidy up
newtonkwan Jul 28, 2022
a2eb9e2
pull in main
newtonkwan Jul 29, 2022
99c7906
add working lola with new runner using lax.scan
newtonkwan Jul 29, 2022
dbb3937
tidy up watchers, fix naive learner, LOLA getting exploited hard ....
newtonkwan Jul 30, 2022
11b98c6
tidy up watchers, fix naive learner, LOLA getting exploited hard ....
newtonkwan Jul 30, 2022
aeb6426
lola compiles, move TrainingState to utils
newtonkwan Aug 1, 2022
c9cd40c
latest lola
newtonkwan Aug 1, 2022
dd29be0
fix axis
newtonkwan Aug 1, 2022
78c8196
fix axis
newtonkwan Aug 1, 2022
c4b2e72
similar lola
newtonkwan Aug 1, 2022
2f30284
half working lola
newtonkwan Aug 1, 2022
4ad1b76
temporary lola
newtonkwan Aug 2, 2022
11 changes: 6 additions & 5 deletions pax/conf/config.yaml
@@ -13,13 +13,14 @@ seed: 0
save_dir: "./exp/${wandb.group}/${wandb.name}"

# Agents
agent1: 'PPO'
agent2: 'TitForTat'
agent1: 'LOLA'
agent2: 'PPO'

# Environment
env_id: ipd
game: ipd
payoff:
centralized: True

# Training hyperparameters
num_envs: 100
@@ -54,8 +55,8 @@ ppo:
clip_value: True
max_gradient_norm: 0.5
anneal_entropy: True
entropy_coeff_start: 0.1
entropy_coeff_horizon: 200_000_000
entropy_coeff_start: 0.2
entropy_coeff_horizon: 500_000
entropy_coeff_end: 0.01
lr_scheduling: True
learning_rate: 2.5e-2
@@ -70,6 +71,6 @@ ppo:
wandb:
entity: "ucl-dark"
project: ipd
group: '${agent1}-vs-${agent2}-${game}-with-memory=${ppo.with_memory}-final'
group: '${agent1}-vs-${agent2}-${game}-with-memory=${ppo.with_memory}-v3'
name: run-seed-${seed}
log: True
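
The config now pits LOLA (agent1) against PPO (agent2), flags the run as centralized, and bumps the wandb group suffix; the group name itself is assembled by interpolation. As an illustrative sketch only, assuming OmegaConf/Hydra-style ${...} resolution (which the config syntax suggests) and a placeholder with_memory value, the group string resolves like this:

# Illustrative sketch: how the interpolated wandb group resolves, assuming
# OmegaConf-style ${...} interpolation. The with_memory value is a placeholder.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "agent1": "LOLA",
    "agent2": "PPO",
    "game": "ipd",
    "ppo": {"with_memory": False},
    "wandb": {"group": "${agent1}-vs-${agent2}-${game}-with-memory=${ppo.with_memory}-v3"},
})
print(cfg.wandb.group)  # LOLA-vs-PPO-ipd-with-memory=False-v3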
1 change: 0 additions & 1 deletion pax/experiment.py
@@ -23,7 +23,6 @@
Random,
Human,
GrimTrigger,
# ZDExtortion,
)
from pax.utils import Section
from pax.watchers import (
13 changes: 6 additions & 7 deletions pax/lola/lola.py
@@ -59,22 +59,21 @@ def update(
t: TimeStep,
actions: jnp.ndarray,
t_prime: TimeStep,
other_agents=None,
other_agents: list = None,
):
"""Update agent"""
# an sgd step requires the parameters of the other agent.
# currently, the runner file doesn't have access to the other agent's gradients
# we could put the parameters of the agent inside the timestep
# for agent in other_agents:
# other_agent_obs = agent._trajectory_buffer.observations
pass


def make_lola(seed: int) -> LOLA:
""" "Instantiate LOLA"""
"""Instantiate LOLA"""
random_key = jax.random.PRNGKey(seed)

def forward(inputs):
"""Forward pass for LOLA exact"""
values = hk.Linear(1, with_bias=False)
"""Forward pass for LOLA"""
values = hk.Linear(2, with_bias=False)
return values(inputs)

network = hk.without_apply_rng(hk.transform(forward))
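
The comment left in update() points at the core requirement here: a LOLA step has to differentiate through the opponent's own learning update, which is why the agent needs access to the other agent's parameters (or gradients). A minimal sketch of that idea in JAX, with hypothetical names and simple differentiable losses rather than anything from this PR:

# Hypothetical sketch, not code from this PR: agent 1 imagines the opponent
# taking one naive gradient step on its own loss, then differentiates its own
# loss through that imagined step (the LOLA opponent-shaping term).
import jax
import jax.numpy as jnp

def lola_update(theta1, theta2, loss1, loss2, inner_lr=1.0, outer_lr=0.1):
    def shaped_loss(t1):
        # Opponent's anticipated naive-learner step on its own loss.
        t2_lookahead = theta2 - inner_lr * jax.grad(loss2, argnums=1)(t1, theta2)
        # Agent 1 is evaluated against the updated opponent.
        return loss1(t1, t2_lookahead)
    return theta1 - outer_lr * jax.grad(shaped_loss)(theta1)

# Toy check with scalar "policies".
loss_a = lambda a, b: (a - b) ** 2
loss_b = lambda a, b: (a + b) ** 2
print(lola_update(jnp.array(1.0), jnp.array(0.5), loss_a, loss_b))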
2 changes: 2 additions & 0 deletions pax/ppo/buffer.py
@@ -153,6 +153,8 @@ def reset(self):
(self._num_envs, self._num_steps, self.gru_dim)
)

self.parameters = jnp.zeros((self._num_envs, self._num_steps))


if __name__ == "__main__":
pass
6 changes: 4 additions & 2 deletions pax/ppo/networks.py
@@ -19,12 +19,14 @@ def __init__(
super().__init__(name=name)
self._logit_layer = hk.Linear(
num_values,
w_init=hk.initializers.Orthogonal(0.01), # baseline
# w_init=hk.initializers.Orthogonal(0.01), # baseline
w_init=hk.initializers.Constant(0.5),
with_bias=False,
)
self._value_layer = hk.Linear(
1,
w_init=hk.initializers.Orthogonal(1.0), # baseline
# w_init=hk.initializers.Orthogonal(1.0), # baseline
w_init=hk.initializers.Constant(0.5),
with_bias=False,
)

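
One side effect of switching the logit layer from an orthogonal to a constant initializer with no bias is that every action logit starts out identical, so the initial policy is uniform, which makes early LOLA-vs-PPO runs easier to read. A small illustrative check (the module and names here are assumptions, not the PR's network):

# Illustrative only: constant weight init with no bias yields equal logits at
# initialization, i.e. a uniform starting policy.
import haiku as hk
import jax
import jax.numpy as jnp

def logits_fn(obs):
    layer = hk.Linear(2, w_init=hk.initializers.Constant(0.5), with_bias=False)
    return layer(obs)

net = hk.without_apply_rng(hk.transform(logits_fn))
params = net.init(jax.random.PRNGKey(0), jnp.ones((1, 5)))
print(net.apply(params, jnp.ones((1, 5))))  # [[2.5, 2.5]] -> uniform policy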
10 changes: 6 additions & 4 deletions pax/ppo/ppo.py
@@ -184,9 +184,10 @@ def loss(
fraction * entropy_coeff_start
+ (1 - fraction) * entropy_coeff_end
)

# Constant Entropy term
else:
entropy_cost = entropy_coeff_start
# else:
# entropy_cost = entropy_coeff_start
entropy_loss = -jnp.mean(entropy)

# Total loss: Minimize policy and value loss; maximize entropy
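
For reference, the expression above interpolates linearly between the start and end coefficients as fraction decays. With the values now in config.yaml (start 0.2, end 0.01, horizon 500_000) it behaves like the sketch below; the exact definition of fraction is assumed here, since it sits outside the diff:

# Hedged sketch of the annealed entropy coefficient, assuming `fraction`
# falls from 1 to 0 over entropy_coeff_horizon timesteps.
import jax.numpy as jnp

def entropy_cost(timesteps, start=0.2, end=0.01, horizon=500_000):
    fraction = jnp.clip(1.0 - timesteps / horizon, 0.0, 1.0)
    return fraction * start + (1.0 - fraction) * end

# entropy_cost(0) -> 0.2, entropy_cost(250_000) -> 0.105, entropy_cost(500_000) -> 0.01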
@@ -201,6 +202,7 @@ def loss(
"loss_policy": policy_loss,
"loss_value": value_loss,
"loss_entropy": entropy_loss,
"entropy_cost": entropy_cost,
}

@jax.jit
@@ -371,8 +373,6 @@ def make_initial_state(key: Any, obs_spec: Tuple) -> TrainingState:
dummy_obs = utils.add_batch_dim(dummy_obs)
initial_params = network.init(subkey, dummy_obs)
initial_opt_state = optimizer.init(initial_params)
# for dict_key in initial_params.keys():
# print(initial_params[dict_key])
return TrainingState(
params=initial_params,
opt_state=initial_opt_state,
@@ -401,6 +401,7 @@ def make_initial_state(key: Any, obs_spec: Tuple) -> TrainingState:
"loss_policy": 0,
"loss_value": 0,
"loss_entropy": 0,
"entropy_cost": entropy_coeff_start,
}

# Initialize functions
@@ -480,6 +481,7 @@ def update(
self._logger.metrics["loss_policy"] = results["loss_policy"]
self._logger.metrics["loss_value"] = results["loss_value"]
self._logger.metrics["loss_entropy"] = results["loss_entropy"]
self._logger.metrics["entropy_cost"] = results["entropy_cost"]


# TODO: seed, and player_id not used in CartPole
3 changes: 3 additions & 0 deletions pax/ppo/ppo_gru.py
@@ -209,6 +209,7 @@ def loss(
"loss_policy": policy_loss,
"loss_value": value_loss,
"loss_entropy": entropy_loss,
"entropy_cost": entropy_cost,
}
# }, new_rnn_unroll_state

@@ -429,6 +430,7 @@ def make_initial_state(
"loss_policy": 0,
"loss_value": 0,
"loss_entropy": 0,
"entropy_cost": entropy_coeff_start,
}

# Initialize functions
@@ -503,6 +505,7 @@ def update(
self._logger.metrics["loss_policy"] = results["loss_policy"]
self._logger.metrics["loss_value"] = results["loss_value"]
self._logger.metrics["loss_entropy"] = results["loss_entropy"]
self._logger.metrics["entropy_cost"] = results["entropy_cost"]


# TODO: seed, and player_id not used in CartPole
18 changes: 10 additions & 8 deletions pax/watchers.py
@@ -6,11 +6,11 @@
# five possible states
START = jnp.array([[0, 0, 0, 0, 1]])
CC = jnp.array([[1, 0, 0, 0, 0]])
CD = jnp.array([[0, 1, 0, 0, 0]])
DC = jnp.array([[0, 0, 1, 0, 0]])
DC = jnp.array([[0, 1, 0, 0, 0]])
CD = jnp.array([[0, 0, 1, 0, 0]])
DD = jnp.array([[0, 0, 0, 1, 0]])
STATE_NAMES = ["START", "CC", "CD", "DC", "DD"]
ALL_STATES = [START, CC, CD, DC, DD]
STATE_NAMES = ["START", "CC", "DC", "CD", "DD"]
ALL_STATES = [START, CC, DC, CD, DD]


def policy_logger(agent) -> None:
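
With the reordering above, index 1 of the one-hot observation is DC and index 2 is CD, and STATE_NAMES / ALL_STATES are reordered to match. A quick illustrative check (hypothetical helper, not part of the PR):

# Illustrative helper: map a one-hot IPD observation back to its state name
# under the layout defined above (CC=0, DC=1, CD=2, DD=3, START=4).
import jax.numpy as jnp

NAMES_BY_INDEX = ["CC", "DC", "CD", "DD", "START"]

def state_from_obs(obs: jnp.ndarray) -> str:
    return NAMES_BY_INDEX[int(jnp.argmax(obs))]

print(state_from_obs(jnp.array([[0, 1, 0, 0, 0]])))  # DC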
@@ -119,11 +119,13 @@ def ppo_losses(agent) -> None:
loss_policy = agent._logger.metrics["loss_policy"]
loss_value = agent._logger.metrics["loss_value"]
loss_entropy = agent._logger.metrics["loss_entropy"]
entropy_coefficient = agent._logger.metrics["entropy_cost"]
losses = {
"sgd_steps": sgd_steps,
"losses/total": loss_total,
"losses/policy": loss_policy,
"losses/value": loss_value,
"losses/entropy": loss_entropy,
"train/total": loss_total,
"train/policy": loss_policy,
"train/value": loss_value,
"train/entropy": loss_entropy,
"train/entropy_coefficient": entropy_coefficient,
}
return losses
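
Because ppo_losses now returns its metrics under a train/ prefix along with the annealed entropy coefficient, the dict can be logged as-is. A short usage sketch, assuming a wandb run has already been initialised elsewhere in the experiment setup:

# Usage sketch (assumes wandb.init(...) was called during experiment setup).
import wandb

metrics = ppo_losses(agent)  # {"sgd_steps": ..., "train/total": ..., "train/entropy_coefficient": ...}
wandb.log(metrics)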