diff --git a/.gitignore b/.gitignore
index f8f04061..94f827de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,8 @@ experiment.log
 # Pax
 pax/version.py
+
+*.gif
+*.json
+*.png
+*.sh
diff --git a/docs/getting-started/runners.md b/docs/getting-started/runners.md
index 43dc241d..45d55770 100644
--- a/docs/getting-started/runners.md
+++ b/docs/getting-started/runners.md
@@ -23,6 +23,61 @@ In order for this approach to work the observation vector needs to include one e
 See [this experiment](https://github.com/akbir/pax/blob/9d3fa62e34279a338c07cffcbf208edc8a95e7ba/pax/conf/experiment/rice/weight_sharing.yaml) for an example of how to configure it.
+## Evo Hardstop
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+This runner stops the opponent's learning during training, corresponding to the hardstop challenge of Shaper.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Scanned
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+Here we also scan over the evolutionary steps, which makes compilation longer and training shorter, but logging stats is not possible.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed LR Runner (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+This runner randomly samples learning rates for the opponents.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed Payoff (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+The payoff matrix is randomly sampled at each rollout. Each opponent has a different payoff matrix.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed Payoff Gen (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+The payoff matrix is randomly sampled at each rollout. Each opponent has the same payoff matrix.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed IPD Payoff (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+This runner randomly samples payoffs that follow the Iterated Prisoner's Dilemma [constraints](https://en.wikipedia.org/wiki/Prisoner%27s_dilemma).
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
+
+## Evo Mixed Payoff Input (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+The payoff matrix is randomly sampled at each rollout. Each opponent has the same payoff matrix. The payoff matrix is included in the agent's observation.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
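+
+All of these Evo runner variants are launched like any other Pax experiment, via Hydra config selection and `++` overrides. A minimal sketch (the config name comes from the experiment linked above; the `wandb.log` and `seed` overrides follow the evaluation scripts in this repository):
+
+```bash
+# Minimal sketch: select the linked experiment config and override a couple of flags.
+python -m pax.experiment +experiment/ipd=shaper_att_v_tabular ++wandb.log=False ++seed=0
+```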
+
+## Evo Mixed Payoff Only Opp (experimental)
+
+The Evo Runner optimizes the first agent using evolutionary learning.
+Noise is added to the opponents' IPD-like payoff matrix at each rollout. Each opponent has the same noise added.
+
+See [this experiment](https://github.com/akbir/pax/blob/9a01bae33dcb2f812977be388751393f570957e9/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml) for an example of how to configure it.
diff --git a/hardstop_eval_bash.sh b/hardstop_eval_bash.sh
new file mode 100755
index 00000000..0d7e1db8
--- /dev/null
+++ b/hardstop_eval_bash.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+###### MFOS AVG ######
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/4ykf9oe8 ++model_path=exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/eopf93re ++model_path=exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1sqbd09n ++model_path=exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3n7l8ods ++model_path=exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/4mf1ecxq ++model_path=exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1
+
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/4ykf9oe8 ++model_path=exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/eopf93re ++model_path=exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1sqbd09n ++model_path=exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3n7l8ods ++model_path=exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100
+python -m pax.experiment -m +experiment/ipd=mfos_att_v_tabular_hardstop_eval ++wandb.log=True
++run_path=ucl-dark/ipd/4mf1ecxq ++model_path=exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 + +###### Shaper Nothing #$$$ +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/2m3wh5g7 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1jk5zly5 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1cvpiolk ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3vml0wjy ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=100 + +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/2m3wh5g7 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1jk5zly5 ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/1cvpiolk ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 +python -m pax.experiment -m +experiment/ipd=shaper_att_v_tabular_hardstop_eval ++wandb.log=True ++run_path=ucl-dark/ipd/3vml0wjy ++model_path=exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 ++seed=85768,785678,764578,89678,97869,4567456,856778,3456347,45673,83346 ++stop=1 \ No newline at end of file diff --git a/pax/agents/mfos_ppo/networks.py b/pax/agents/mfos_ppo/networks.py index cb5397a1..5b3ea405 100644 --- a/pax/agents/mfos_ppo/networks.py +++ b/pax/agents/mfos_ppo/networks.py @@ -151,6 +151,28 @@ def forward_fn( network = hk.without_apply_rng(hk.transform(forward_fn)) return network, hidden_state +def make_mfos_avg_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, 3 * 
hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, + state: Tuple[jnp.ndarray, jnp.ndarray, jnp.ndarray], + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + mfos = ActorCriticMFOS(num_actions, hidden_size) + hidden_t, hidden_a, hidden_v = jnp.split(state, 3, axis=-1) + avg_hidden_t = jnp.mean(hidden_t, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + avg_hidden_a = jnp.mean(hidden_a, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + avg_hidden_v = jnp.mean(hidden_v, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + hidden_t = 0.5*hidden_t + 0.5*avg_hidden_t + hidden_a = 0.5*hidden_a + 0.5*avg_hidden_a + hidden_v = 0.5*hidden_v + 0.5*avg_hidden_v + state = jnp.concatenate([hidden_t, hidden_a, hidden_v], axis=-1) + logits, values, state = mfos(inputs, state) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + def make_mfos_continuous_network(num_actions: int, hidden_size: int): hidden_state = jnp.zeros((1, 3 * hidden_size)) diff --git a/pax/agents/mfos_ppo/ppo_gru.py b/pax/agents/mfos_ppo/ppo_gru.py index 2e130fa7..15e08860 100644 --- a/pax/agents/mfos_ppo/ppo_gru.py +++ b/pax/agents/mfos_ppo/ppo_gru.py @@ -12,6 +12,7 @@ from pax.agents.mfos_ppo.networks import ( make_mfos_ipditm_network, make_mfos_network, + make_mfos_avg_network, make_mfos_continuous_network, ) from pax.envs.rice.rice import Rice @@ -65,7 +66,6 @@ def __init__( obs_spec: Tuple, batch_size: int = 2000, num_envs: int = 4, - num_steps: int = 500, num_minibatches: int = 16, num_epochs: int = 4, clip_value: bool = True, @@ -481,8 +481,8 @@ def prepare_batch( # Other useful hyperparameters self._num_envs = num_envs # number of environments - self._num_steps = num_steps # number of steps per environment - self._batch_size = int(num_envs * num_steps) # number in one batch + # self._num_steps = num_steps # number of steps per environment + # self._batch_size = int(num_envs * num_steps) # number in one batch self._num_minibatches = num_minibatches # number of minibatches self._num_epochs = num_epochs # number of epochs to use sample self._gru_dim = gru_dim @@ -578,6 +578,17 @@ def make_mfos_agent( agent_args.output_channels, agent_args.kernel_shape, ) + elif args.env_id == "iterated_matrix_game": + if args.att_type=='att': + raise ValueError("Attention not supported") + elif args.att_type=='avg': + network, initial_hidden_state = make_mfos_avg_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='nothing': + network, initial_hidden_state = make_mfos_network( + action_spec, agent_args.hidden_size + ) else: raise ValueError("Unsupported environment") @@ -620,7 +631,6 @@ def make_mfos_agent( obs_spec=obs_spec, batch_size=None, num_envs=args.num_envs, - num_steps=args.num_steps, num_minibatches=agent_args.num_minibatches, num_epochs=agent_args.num_epochs, clip_value=agent_args.clip_value, diff --git a/pax/agents/ppo/ppo.py b/pax/agents/ppo/ppo.py index 9a098846..459d6c01 100644 --- a/pax/agents/ppo/ppo.py +++ b/pax/agents/ppo/ppo.py @@ -506,6 +506,16 @@ def make_agent( agent_args.output_channels, agent_args.kernel_shape, ) + elif args.env_id in [ + "iterated_matrix_game", + "iterated_tensor_game", + "iterated_nplayer_tensor_game", + "third_party_punishment", + "third_party_random", + ]: + network = make_ipd_network( + action_spec, tabular, agent_args.hidden_size + ) elif args.env_id == "Cournot": network = make_cournot_network(action_spec, agent_args.hidden_size) elif args.env_id == "Fishery": @@ -534,6 
+544,7 @@ def make_agent( ) if agent_args.lr_scheduling: + scale = optax.inject_hyperparams(optax.scale)(step_size=-1.0) scheduler = optax.linear_schedule( init_value=agent_args.learning_rate, end_value=0, @@ -543,15 +554,18 @@ def make_agent( optax.clip_by_global_norm(agent_args.max_gradient_norm), optax.scale_by_adam(eps=agent_args.adam_epsilon), optax.scale_by_schedule(scheduler), - optax.scale(-1), + scale, ) + # optimizer = optax.inject_hyperparams(optimizer)(learning_rate=agent_args.learning_rate) else: + scale = optax.inject_hyperparams(optax.scale)(step_size=-agent_args.learning_rate) optimizer = optax.chain( optax.clip_by_global_norm(agent_args.max_gradient_norm), optax.scale_by_adam(eps=agent_args.adam_epsilon), - optax.scale(-agent_args.learning_rate), + scale, ) + # optimizer = optax.inject_hyperparams(optimizer)(learning_rate=agent_args.learning_rate) # Random key random_key = jax.random.PRNGKey(seed=seed) diff --git a/pax/agents/shaper_att/networks.py b/pax/agents/shaper_att/networks.py new file mode 100644 index 00000000..a8396d8a --- /dev/null +++ b/pax/agents/shaper_att/networks.py @@ -0,0 +1,697 @@ +from typing import Optional, Tuple + +import distrax +import haiku as hk +import jax +import jax.numpy as jnp + +from pax import utils + + +class CategoricalValueHead(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHead_ipd(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) 
+ + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate_ipditm(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + hidden_size: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) + + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class ContinuousValueHead(hk.Module): + """Network head that produces a continuous distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), # baseline + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), # baseline + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.MultivariateNormalDiag(loc=logits), value) + + +class Tabular(hk.Module): + def __init__(self, num_values: int): + super().__init__(name="Tabular") + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def _input_to_onehot(input: jnp.ndarray): + chunks = jnp.array([9**3, 9**2, 9, 1], dtype=jnp.int32) + idx = input.nonzero(size=4)[0] + idx = jnp.mod(idx, 9) + idx = chunks * idx + idx = jnp.sum(idx) + return jax.nn.one_hot(idx, num_classes=6561) + + self.input_to_onehot = jax.vmap(_input_to_onehot) + + def __call__(self, inputs: jnp.ndarray): + inputs = self.input_to_onehot(inputs) + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + + return (distrax.Categorical(logits=logits), value) + + +class CNN(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.conv_a_1 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.linear_a_0 = hk.Linear(output_channels) + + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + # Actor and Critic 
+ x = self.conv_a_0(inputs) + x = jax.nn.relu(x) + x = self.conv_a_1(x) + x = jax.nn.relu(x) + x = self.flatten(x) + x = self.linear_a_0(x) + x = jax.nn.relu(x) + return x + + +class CNN_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + ) + # akbir suggested fix + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor and Critic + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + return x + + +class CNNSeparate_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape, num_actions: int): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_a_0 = hk.Linear(output_channels) + self.conv_v_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_v_0 = hk.Linear(1) + self.flatten = hk.Flatten() + + def __call__(self, inputs): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + logits = self.linear_a_0(x) + + # Critic + x = self.conv_v_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + x = self.linear_v_0(x) + val = x + return (distrax.Categorical(logits=logits), jnp.squeeze(val, axis=-1)) + + +def make_GRU_ipd_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + embedding, state = gru(inputs, state) + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_avg_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + old_state = state + # jax.debug.breakpoint() + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_att_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + # print(state.shape, 'STATE shape') + gru = hk.GRU(hidden_size) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 1 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + 
w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + b_init=hk.initializers.Constant(0), + # with_bias=False, + ) + old_state = state + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state_attn + state) + state = shape_mlp(state) + state = 0.5*old_state + 0.5*state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_cartpole_network(num_actions: int): + hidden_size = 256 + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = hk.nets.MLP( + [hidden_size, hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU(hidden_size) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_coingame_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_coingame_att_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + 
) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def make_GRU_ipditm_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_att_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...] 
+ # num_opps is our true batch size + # num_envs is actually part of our featuer space + # lets use attention network to over the hidden_states + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # shaper network to obfuscated + print("state", state.shape) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_avg_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...] 
+ # num_opps is our true batch size + # num_envs is actually part of our featuer space + # lets use attention network to over the hidden_states + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + old_state = state + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # shaper network to obfuscated + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def test_GRU(): + key = jax.random.PRNGKey(seed=0) + num_actions = 2 + obs_spec = (5,) + key, subkey = jax.random.split(key) + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + network, hidden = make_GRU_ipd_network(num_actions) + print(hidden.shape) + initial_params = network.init(subkey, dummy_obs, hidden) + print("GRU w_i", initial_params["gru"]["w_i"].shape) + print("GRU w_h", initial_params["gru"]["w_h"].shape) + print( + "Policy head", + initial_params["categorical_value_head/~/linear"]["w"].shape, + ) + print( + "Value head", + initial_params["categorical_value_head/~/linear_1"]["w"].shape, + ) + observation = jnp.zeros(shape=(1, 5)) + observation = jnp.zeros(shape=(10, 5)) + (logits, values), hidden = network.apply( + initial_params, observation, hidden + ) + print(hidden.shape) + return network + + +if __name__ == "__main__": + test_GRU() diff --git a/pax/agents/shaper_att/ppo_gru.py b/pax/agents/shaper_att/ppo_gru.py new file mode 100644 index 00000000..87b0056a --- /dev/null +++ b/pax/agents/shaper_att/ppo_gru.py @@ -0,0 +1,610 @@ +# Adapted from https://github.com/deepmind/acme/blob/master/acme/agents/jax/ppo/learning.py + +from typing import Any, Dict, NamedTuple, Tuple + +import haiku as hk +import jax +import jax.numpy as jnp +import optax + +from pax import utils +from pax.agents.agent import AgentInterface +from pax.agents.shaper_att.networks import ( + make_GRU_cartpole_network, + make_GRU_coingame_att_network, + make_GRU_ipd_network, + make_GRU_ipd_avg_network, + make_GRU_ipd_att_network, + make_GRU_ipditm_att_network, + make_GRU_ipditm_avg_network, +) +from pax.utils import MemoryState, TrainingState, get_advantages + +# from dm_env import TimeStep + + +class Batch(NamedTuple): + """A batch of data; all shapes are expected to be [B, ...].""" + + observations: jnp.ndarray + actions: jnp.ndarray + advantages: jnp.ndarray + + # Target value estimate used to bootstrap the value function. + target_values: jnp.ndarray + + # Value estimate and action log-prob at behavior time. 
+ behavior_values: jnp.ndarray + behavior_log_probs: jnp.ndarray + + # GRU specific + hiddens: jnp.ndarray + + +class Logger: + metrics: dict + + +class PPO(AgentInterface): + """A simple PPO agent with memory using JAX""" + + def __init__( + self, + network: NamedTuple, + initial_hidden_state: jnp.ndarray, + optimizer: optax.GradientTransformation, + random_key: jnp.ndarray, + gru_dim: int, + obs_spec: Tuple, + num_envs: int = 4, + num_minibatches: int = 16, + num_epochs: int = 4, + clip_value: bool = True, + value_coeff: float = 0.5, + anneal_entropy: bool = False, + entropy_coeff_start: float = 0.1, + entropy_coeff_end: float = 0.01, + entropy_coeff_horizon: int = 3_000_000, + ppo_clipping_epsilon: float = 0.2, + gamma: float = 0.99, + gae_lambda: float = 0.95, + player_id: int = 0, + ): + @jax.jit + def policy( + state: TrainingState, observation: jnp.ndarray, mem: MemoryState + ): + """Agent policy to select actions and calculate agent specific information""" + key, subkey = jax.random.split(state.random_key) + (dist, values), hidden_state = network.apply( + state.params, observation, mem.hidden + ) + + actions = dist.sample(seed=subkey) + mem.extras["values"] = values + mem.extras["log_probs"] = dist.log_prob(actions) + mem = mem._replace(hidden=hidden_state, extras=mem.extras) + state = state._replace(random_key=key) + return ( + actions, + state, + mem, + ) + + @jax.jit + def gae_advantages( + rewards: jnp.ndarray, values: jnp.ndarray, dones: jnp.ndarray + ) -> jnp.ndarray: + """Calculates the gae advantages from a sequence. Note that the + arguments are of length = rollout length + 1""" + # 'Zero out' the terminated states + discounts = gamma * jnp.logical_not(dones) + reverse_batch = ( + jnp.flip(values[:-1], axis=0), + jnp.flip(rewards, axis=0), + jnp.flip(discounts, axis=0), + ) + + _, advantages = jax.lax.scan( + get_advantages, + ( + jnp.zeros_like(values[-1]), + values[-1], + jnp.ones_like(values[-1]) * gae_lambda, + ), + reverse_batch, + ) + + advantages = jnp.flip(advantages, axis=0) + target_values = values[:-1] + advantages # Q-value estimates + target_values = jax.lax.stop_gradient(target_values) + return advantages, target_values + + def loss( + params: hk.Params, + timesteps: int, + observations: jnp.ndarray, + actions: jnp.array, + behavior_log_probs: jnp.array, + target_values: jnp.array, + advantages: jnp.array, + behavior_values: jnp.array, + hiddens: jnp.ndarray, + ): + """Surrogate loss using clipped probability ratios.""" + (distribution, values), _ = network.apply( + params, observations, hiddens + ) + + log_prob = distribution.log_prob(actions) + entropy = distribution.entropy() + + # Compute importance sampling weights: current policy / behavior policy. + rhos = jnp.exp(log_prob - behavior_log_probs) + + # Policy loss: Clipping + clipped_ratios_t = jnp.clip( + rhos, 1.0 - ppo_clipping_epsilon, 1.0 + ppo_clipping_epsilon + ) + clipped_objective = jnp.fmin( + rhos * advantages, clipped_ratios_t * advantages + ) + policy_loss = -jnp.mean(clipped_objective) + + # Value loss: MSE + value_cost = value_coeff + unclipped_value_error = target_values - values + unclipped_value_loss = unclipped_value_error**2 + + # Value clipping + if clip_value: + # Clip values to reduce variablility during critic training. 
+ clipped_values = behavior_values + jnp.clip( + values - behavior_values, + -ppo_clipping_epsilon, + ppo_clipping_epsilon, + ) + clipped_value_error = target_values - clipped_values + clipped_value_loss = clipped_value_error**2 + value_loss = jnp.mean( + jnp.fmax(unclipped_value_loss, clipped_value_loss) + ) + else: + value_loss = jnp.mean(unclipped_value_loss) + + # Entropy loss: Standard entropy term + # Calculate the new value based on linear annealing formula + if anneal_entropy: + fraction = jnp.fmax(1 - timesteps / entropy_coeff_horizon, 0) + entropy_cost = ( + fraction * entropy_coeff_start + + (1 - fraction) * entropy_coeff_end + ) + # Constant Entropy term + else: + entropy_cost = entropy_coeff_start + entropy_loss = -jnp.mean(entropy) + + # Total loss: Minimize policy and value loss; maximize entropy + total_loss = ( + policy_loss + + entropy_cost * entropy_loss + + value_loss * value_cost + ) + + return total_loss, { + "loss_total": total_loss, + "loss_policy": policy_loss, + "loss_value": value_loss, + "loss_entropy": entropy_loss, + "entropy_cost": entropy_cost, + } + + @jax.jit + def sgd_step( + state: TrainingState, sample: NamedTuple + ) -> Tuple[TrainingState, Dict[str, jnp.ndarray]]: + """Performs a minibatch SGD step, returning new state and metrics.""" + # Extract data + ( + observations, + actions, + rewards, + behavior_log_probs, + behavior_values, + dones, + hiddens, + ) = ( + sample.observations, + sample.actions, + sample.rewards, + sample.behavior_log_probs, + sample.behavior_values, + sample.dones, + sample.hiddens, + ) + + # batch_gae_advantages = jax.vmap(gae_advantages, 1, (0, 0)) + advantages, target_values = gae_advantages( + rewards=rewards, values=behavior_values, dones=dones + ) + + # Exclude the last step - it was only used for bootstrapping. + # The shape is [num_steps, num_envs, ..] + behavior_values = behavior_values[:-1, :] + trajectories = Batch( + observations=observations, + actions=actions, + advantages=advantages, + behavior_log_probs=behavior_log_probs, + target_values=target_values, + behavior_values=behavior_values, + hiddens=hiddens, + ) + # Concatenate all trajectories. Reshape from [num_envs, num_steps, ..] + # to [num_envs * num_steps,..] + assert len(target_values.shape) > 1 + num_envs = target_values.shape[1] + num_steps = target_values.shape[0] + batch_size = num_envs * num_steps + assert batch_size % num_minibatches == 0, ( + "Num minibatches must divide batch size. Got batch_size={}" + " num_minibatches={}." + ).format(batch_size, num_minibatches) + + batch = jax.tree_util.tree_map( + lambda x: x.reshape((batch_size,) + x.shape[2:]), trajectories + ) + # Compute gradients. + grad_fn = jax.jit(jax.grad(loss, has_aux=True)) + + def model_update_minibatch( + carry: Tuple[hk.Params, optax.OptState, int], + minibatch: Batch, + ) -> Tuple[ + Tuple[hk.Params, optax.OptState, int], Dict[str, jnp.ndarray] + ]: + """Performs model update for a single minibatch.""" + params, opt_state, timesteps = carry + # Normalize advantages at the minibatch level before using them. 
+ advantages = ( + minibatch.advantages + - jnp.mean(minibatch.advantages, axis=0) + ) / (jnp.std(minibatch.advantages, axis=0) + 1e-8) + gradients, metrics = grad_fn( + params, + timesteps, + minibatch.observations, + minibatch.actions, + minibatch.behavior_log_probs, + minibatch.target_values, + advantages, + minibatch.behavior_values, + minibatch.hiddens, + ) + + # Apply updates + updates, opt_state = optimizer.update(gradients, opt_state) + params = optax.apply_updates(params, updates) + + metrics["norm_grad"] = optax.global_norm(gradients) + metrics["norm_updates"] = optax.global_norm(updates) + return (params, opt_state, timesteps), metrics + + def model_update_epoch( + carry: Tuple[ + jnp.ndarray, hk.Params, optax.OptState, int, Batch + ], + unused_t: Tuple[()], + ) -> Tuple[ + Tuple[jnp.ndarray, hk.Params, optax.OptState, Batch], + Dict[str, jnp.ndarray], + ]: + """Performs model updates based on one epoch of data.""" + key, params, opt_state, timesteps, batch = carry + key, subkey = jax.random.split(key) + permutation = jax.random.permutation(subkey, batch_size) + shuffled_batch = jax.tree_util.tree_map( + lambda x: jnp.take(x, permutation, axis=0), batch + ) + minibatches = jax.tree_util.tree_map( + lambda x: jnp.reshape( + x, [num_minibatches, -1] + list(x.shape[1:]) + ), + shuffled_batch, + ) + + (params, opt_state, timesteps), metrics = jax.lax.scan( + model_update_minibatch, + (params, opt_state, timesteps), + minibatches, + length=num_minibatches, + ) + return (key, params, opt_state, timesteps, batch), metrics + + params = state.params + opt_state = state.opt_state + timesteps = state.timesteps + + # Repeat training for the given number of epoch, taking a random + # permutation for every epoch. + # signature is scan(function, carry, tuple to iterate over, length) + (key, params, opt_state, timesteps, _), metrics = jax.lax.scan( + model_update_epoch, + (state.random_key, params, opt_state, timesteps, batch), + (), + length=num_epochs, + ) + + metrics = jax.tree_util.tree_map(jnp.mean, metrics) + metrics["rewards_mean"] = jnp.mean( + jnp.abs(jnp.mean(rewards, axis=(0, 1))) + ) + metrics["rewards_std"] = jnp.std(rewards, axis=(0, 1)) + + # Reset the memory + new_state = TrainingState( + params=params, + opt_state=opt_state, + random_key=key, + timesteps=timesteps + batch_size, + ) + + new_memory = MemoryState( + hidden=jnp.zeros(shape=(self._num_envs,) + (gru_dim,)), + extras={ + "log_probs": jnp.zeros(self._num_envs), + "values": jnp.zeros(self._num_envs), + }, + ) + + return new_state, new_memory, metrics + + def make_initial_state( + key: Any, initial_hidden_state: jnp.ndarray + ) -> TrainingState: + """Initialises the training state (parameters and optimiser state).""" + + # We pass through initial_hidden_state so its easy to batch memory + key, subkey = jax.random.split(key) + + if isinstance(obs_spec, dict): + dummy_obs = {} + for k, v in obs_spec.items(): + dummy_obs[k] = jnp.zeros(shape=v) + + else: + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + initial_params = network.init( + subkey, dummy_obs, initial_hidden_state + ) + initial_opt_state = optimizer.init(initial_params) + return TrainingState( + random_key=key, + params=initial_params, + opt_state=initial_opt_state, + timesteps=0, + ), MemoryState( + hidden=jnp.zeros( + (num_envs, initial_hidden_state.shape[-1]) + ), # initial_hidden_state, + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + ) + + # @jax.jit + def prepare_batch( + traj_batch: 
NamedTuple, + done: Any, + action_extras: dict, + ): + # Rollouts complete -> Training begins + # Add an additional rollout step for advantage calculation + _value = jax.lax.select( + done, + jnp.zeros_like(action_extras["values"]), + action_extras["values"], + ) + + _value = jax.lax.expand_dims(_value, [0]) + + # need to add final value here + traj_batch = traj_batch._replace( + behavior_values=jnp.concatenate( + [traj_batch.behavior_values, _value], axis=0 + ) + ) + return traj_batch + + # Initialise training state (parameters, optimiser state, extras). + self._state, self._mem = make_initial_state( + random_key, initial_hidden_state + ) + + self.make_initial_state = make_initial_state + + self._prepare_batch = prepare_batch + self._sgd_step = jax.jit(sgd_step) + + # Set up counters and logger + self._logger = Logger() + self._total_steps = 0 + self._until_sgd = 0 + self._logger.metrics = { + "total_steps": 0, + "sgd_steps": 0, + "loss_total": 0, + "loss_policy": 0, + "loss_value": 0, + "loss_entropy": 0, + "entropy_cost": entropy_coeff_start, + } + + # Initialize functions + self._policy = policy + self.forward = network.apply + self.player_id = player_id + + # Other useful hyperparameters + self._num_envs = num_envs # number of environments + self._num_minibatches = num_minibatches # number of minibatches + self._num_epochs = num_epochs # number of epochs to use sample + self._gru_dim = gru_dim + + def reset_memory(self, memory, eval=False) -> TrainingState: + num_envs = 1 if eval else self._num_envs + memory = memory._replace( + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + hidden=jnp.zeros((num_envs, self._gru_dim)), + ) + return memory + + def update( + self, + traj_batch: NamedTuple, + obs: jnp.ndarray, + state: TrainingState, + mem: MemoryState, + ): + + """Update the agent -> only called at the end of a trajectory""" + + _, _, mem = self._policy(state, obs, mem) + traj_batch = self._prepare_batch( + traj_batch, traj_batch.dones[-1, ...], mem.extras + ) + state, mem, metrics = self._sgd_step(state, traj_batch) + + # update logging + + self._logger.metrics["sgd_steps"] += ( + self._num_minibatches * self._num_epochs + ) + self._logger.metrics["loss_total"] = metrics["loss_total"] + self._logger.metrics["loss_policy"] = metrics["loss_policy"] + self._logger.metrics["loss_value"] = metrics["loss_value"] + self._logger.metrics["loss_entropy"] = metrics["loss_entropy"] + self._logger.metrics["entropy_cost"] = metrics["entropy_cost"] + return state, mem, metrics + + +# TODO: seed, and player_id not used in CartPole +def make_shaper_agent( + args, + agent_args, + obs_spec, + action_spec, + seed: int, + num_iterations: int, + player_id: int, +): + """Make PPO agent""" + # Network + if args.env_id == "CartPole-v1": + network, initial_hidden_state = make_GRU_cartpole_network(action_spec) + elif args.env_id == "coin_game": + network, initial_hidden_state = make_GRU_coingame_att_network( + action_spec, + agent_args.with_cnn, + agent_args.hidden_size, + agent_args.output_channels, + agent_args.kernel_shape, + ) + elif args.env_id == "iterated_matrix_game": + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipd_att_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipd_avg_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='nothing': + network, initial_hidden_state = make_GRU_ipd_network( + action_spec, agent_args.hidden_size + ) + + elif 
args.env_id == "InTheMatrix": + if args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipditm_avg_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipditm_att_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + + gru_dim = initial_hidden_state.shape[1] + + initial_hidden_state = jnp.zeros( + (args.num_envs, initial_hidden_state.shape[1]) + ) + + # Optimizer + transition_steps = ( + num_iterations * agent_args.num_epochs * agent_args.num_minibatches + ) + + if agent_args.lr_scheduling: + scheduler = optax.linear_schedule( + init_value=agent_args.learning_rate, + end_value=0, + transition_steps=transition_steps, + ) + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale_by_schedule(scheduler), + optax.scale(-1), + ) + + else: + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale(-agent_args.learning_rate), + ) + + # Random key + random_key = jax.random.PRNGKey(seed=seed) + + agent = PPO( + network=network, + initial_hidden_state=initial_hidden_state, + optimizer=optimizer, + random_key=random_key, + gru_dim=gru_dim, + obs_spec=obs_spec, + num_envs=args.num_envs, + num_minibatches=agent_args.num_minibatches, + num_epochs=agent_args.num_epochs, + clip_value=agent_args.clip_value, + value_coeff=agent_args.value_coeff, + anneal_entropy=agent_args.anneal_entropy, + entropy_coeff_start=agent_args.entropy_coeff_start, + entropy_coeff_end=agent_args.entropy_coeff_end, + entropy_coeff_horizon=agent_args.entropy_coeff_horizon, + ppo_clipping_epsilon=agent_args.ppo_clipping_epsilon, + gamma=agent_args.gamma, + gae_lambda=agent_args.gae_lambda, + player_id=player_id, + ) + return agent + + +if __name__ == "__main__": + pass diff --git a/pax/agents/shaper_pred/networks.py b/pax/agents/shaper_pred/networks.py new file mode 100644 index 00000000..faa6b6db --- /dev/null +++ b/pax/agents/shaper_pred/networks.py @@ -0,0 +1,696 @@ +from typing import Optional, Tuple + +import distrax +import haiku as hk +import jax +import jax.numpy as jnp + +from pax import utils + + +class CategoricalValueHead(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHead_ipd(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits 
= self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_body = hk.nets.MLP( + [64, 64], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) + + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class CategoricalValueHeadSeparate_ipditm(hk.Module): + """Network head that produces a categorical distribution and value.""" + + def __init__( + self, + num_values: int, + hidden_size: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._action_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._value_body = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + activation=jnp.tanh, + ) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), + b_init=hk.initializers.Constant(0), + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), + b_init=hk.initializers.Constant(0), + ) + + def __call__(self, inputs: jnp.ndarray): + # action_output, value_output = inputs + logits = self._action_body(inputs) + logits = self._logit_layer(logits) + + value = self._value_body(inputs) + value = jnp.squeeze(self._value_layer(value), axis=-1) + return (distrax.Categorical(logits=logits), value) + + +class ContinuousValueHead(hk.Module): + """Network head that produces a continuous distribution and value.""" + + def __init__( + self, + num_values: int, + name: Optional[str] = None, + ): + super().__init__(name=name) + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Orthogonal(0.01), # baseline + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Orthogonal(1.0), # baseline + with_bias=False, + ) + + def __call__(self, inputs: jnp.ndarray): + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + return (distrax.MultivariateNormalDiag(loc=logits), value) + + +class Tabular(hk.Module): + def __init__(self, num_values: int): + super().__init__(name="Tabular") + self._logit_layer = hk.Linear( + num_values, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + self._value_layer = hk.Linear( + 1, + w_init=hk.initializers.Constant(0.5), + with_bias=False, + ) + + def _input_to_onehot(input: jnp.ndarray): + chunks = 
jnp.array([9**3, 9**2, 9, 1], dtype=jnp.int32) + idx = input.nonzero(size=4)[0] + idx = jnp.mod(idx, 9) + idx = chunks * idx + idx = jnp.sum(idx) + return jax.nn.one_hot(idx, num_classes=6561) + + self.input_to_onehot = jax.vmap(_input_to_onehot) + + def __call__(self, inputs: jnp.ndarray): + inputs = self.input_to_onehot(inputs) + logits = self._logit_layer(inputs) + value = jnp.squeeze(self._value_layer(inputs), axis=-1) + + return (distrax.Categorical(logits=logits), value) + + +class CNN(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.conv_a_1 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + self.linear_a_0 = hk.Linear(output_channels) + + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + # Actor and Critic + x = self.conv_a_0(inputs) + x = jax.nn.relu(x) + x = self.conv_a_1(x) + x = jax.nn.relu(x) + x = self.flatten(x) + x = self.linear_a_0(x) + x = jax.nn.relu(x) + return x + + +class CNN_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + ) + # akbir suggested fix + self.flatten = hk.Flatten() + + def __call__(self, inputs: jnp.ndarray): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor and Critic + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + return x + + +class CNNSeparate_ipditm(hk.Module): + def __init__(self, output_channels, kernel_shape, num_actions: int): + super().__init__(name="CNN") + self.conv_a_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_a_0 = hk.Linear(output_channels) + self.conv_v_0 = hk.Conv2D( + output_channels=output_channels, + kernel_shape=kernel_shape, + stride=1, + padding="SAME", + ) + self.linear_v_0 = hk.Linear(1) + self.flatten = hk.Flatten() + + def __call__(self, inputs): + obs = inputs["observation"] + inventory = inputs["inventory"] + # Actor + x = self.conv_a_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + logits = self.linear_a_0(x) + + # Critic + x = self.conv_v_0(obs) + x = jax.nn.relu(x) + x = self.flatten(x) + x = jnp.concatenate([x, inventory], axis=-1) + x = self.linear_v_0(x) + val = x + return (distrax.Categorical(logits=logits), jnp.squeeze(val, axis=-1)) + + +def make_GRU_ipd_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + embedding, state = gru(inputs, state) + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_avg_network(num_actions: int, hidden_size: int): + hidden_state = 
jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + gru = hk.GRU(hidden_size) + old_state = state + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + +def make_GRU_ipd_att_network(num_actions: int, hidden_size: int): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + print(state.shape, 'STATE shape') + gru = hk.GRU(hidden_size) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 1 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(1/jnp.sqrt(hidden_size)), + # w_init=hk.initializers.Constant(0.5), + b_init=hk.initializers.Constant(0), + # with_bias=False, + ) + old_state = state + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state_attn + state) + state = shape_mlp(state) + state = 0.5*old_state + 0.5*state + embedding, state = gru(inputs, state) + + logits, values = CategoricalValueHead_ipd(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_cartpole_network(num_actions: int): + hidden_size = 256 + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = hk.nets.MLP( + [hidden_size, hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU(hidden_size) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + + return network, hidden_state + + +def make_GRU_coingame_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + 
return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_coingame_att_network( + num_actions: int, + with_cnn: bool, + hidden_size: int, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + + if with_cnn: + torso = CNN(output_channels, kernel_shape)(inputs) + + else: + torso = hk.nets.MLP( + [hidden_size], + w_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + activate_final=True, + ) + gru = hk.GRU( + hidden_size, + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(2)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + embedding = torso(inputs) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = CategoricalValueHead(num_actions)(embedding) + + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def make_GRU_ipditm_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_att_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...] 
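+        # The attention step below is what distinguishes this *_att variant: the carried hidden states are layer-normalised, mixed by multi-head self-attention (with a residual connection and a second layer norm), projected back to hidden_size by a linear layer, and only then used as the carry for the GRU update.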
+ # num_opps is our true batch size + # num_envs is actually part of our feature space + # let's use an attention network over the hidden states + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + layer_norm1 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + + num_heads = 8 + shape_attn = hk.MultiHeadAttention( + num_heads=num_heads, + key_size=hidden_size // num_heads, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + ) + + layer_norm2 = hk.LayerNorm( + axis=-2, create_scale=True, create_offset=True + ) + shape_mlp = hk.Linear( + hidden_size, + w_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # shaper attention block: mix the carried hidden states before the GRU update + print("state", state.shape) + state_attn = layer_norm1(state) + state_attn = shape_attn(state_attn, state_attn, state_attn) + state = layer_norm2(state + state_attn) + state = shape_mlp(state) + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + +def make_GRU_ipditm_avg_network( + num_actions: int, + hidden_size: int, + separate: bool, + output_channels: int, + kernel_shape: Tuple[int], +): + hidden_state = jnp.zeros((1, hidden_size)) + + def forward_fn( + inputs: jnp.ndarray, state: jnp.ndarray + ) -> Tuple[Tuple[jnp.ndarray, jnp.ndarray], jnp.ndarray]: + """forward function""" + + # input_shape = [num_opps, num_envs, obs_spec...]
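+        # Unlike the *_att variant above, this network mixes the carried hidden states with a plain mean over the leading axis (presumably the opponent axis, per the comment above), blended 50/50 with the previous carry, instead of attending over them.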
+ # num_opps is our true batch size + # num_envs is actually part of our feature space + # let's average over the hidden states instead of attending over them + + torso = CNN_ipditm(output_channels, kernel_shape) + gru = hk.GRU( + hidden_size, + w_i_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + w_h_init=hk.initializers.Orthogonal(jnp.sqrt(1)), + b_init=hk.initializers.Constant(0), + ) + old_state = state + state = jnp.mean(state, axis=0, keepdims=True).repeat(state.shape[0], axis=0) + state = 0.5*state + 0.5*old_state + + if separate: + cvh = CategoricalValueHeadSeparate_ipditm( + num_values=num_actions, hidden_size=hidden_size + ) + else: + cvh = CategoricalValueHead(num_values=num_actions) + embedding = torso(inputs) + + # the averaged carry feeds straight into the GRU update + embedding, state = gru(embedding, state) + logits, values = cvh(embedding) + return (logits, values), state + + network = hk.without_apply_rng(hk.transform(forward_fn)) + return network, hidden_state + + +def test_GRU(): + key = jax.random.PRNGKey(seed=0) + num_actions = 2 + hidden_size = 16 # arbitrary size for this smoke test; matches the IPD configs + obs_spec = (5,) + key, subkey = jax.random.split(key) + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + network, hidden = make_GRU_ipd_network(num_actions, hidden_size) + print(hidden.shape) + initial_params = network.init(subkey, dummy_obs, hidden) + print("GRU w_i", initial_params["gru"]["w_i"].shape) + print("GRU w_h", initial_params["gru"]["w_h"].shape) + print( + "Policy head", + initial_params["categorical_value_head/~/linear"]["w"].shape, + ) + print( + "Value head", + initial_params["categorical_value_head/~/linear_1"]["w"].shape, + ) + observation = jnp.zeros(shape=(10, 5)) + (logits, values), hidden = network.apply( + initial_params, observation, hidden + ) + print(hidden.shape) + return network + + +if __name__ == "__main__": + test_GRU() diff --git a/pax/agents/shaper_pred/ppo_gru.py b/pax/agents/shaper_pred/ppo_gru.py new file mode 100644 index 00000000..1de35b2d --- /dev/null +++ b/pax/agents/shaper_pred/ppo_gru.py @@ -0,0 +1,611 @@ +# Adapted from https://github.com/deepmind/acme/blob/master/acme/agents/jax/ppo/learning.py + +from typing import Any, Dict, NamedTuple, Tuple + +import haiku as hk +import jax +import jax.numpy as jnp +import optax + +from pax import utils +from pax.agents.agent import AgentInterface +from pax.agents.shaper_att.networks import ( + make_GRU_cartpole_network, + make_GRU_coingame_att_network, + make_GRU_ipd_network, + make_GRU_ipd_avg_network, + make_GRU_ipd_att_network, + make_GRU_ipditm_att_network, + make_GRU_ipditm_avg_network, +) +from pax.utils import MemoryState, TrainingState, get_advantages + +# from dm_env import TimeStep + + +class Batch(NamedTuple): + """A batch of data; all shapes are expected to be [B, ...].""" + + observations: jnp.ndarray + actions: jnp.ndarray + advantages: jnp.ndarray + + # Target value estimate used to bootstrap the value function. + target_values: jnp.ndarray + + # Value estimate and action log-prob at behavior time.
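+    # These rollout-time quantities are what the PPO loss needs: behavior_log_probs form the importance ratio exp(log_prob - behavior_log_probs), and behavior_values anchor the clipped value update.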
+ behavior_values: jnp.ndarray + behavior_log_probs: jnp.ndarray + + # GRU specific + hiddens: jnp.ndarray + + +class Logger: + metrics: dict + + +class PPO(AgentInterface): + """A simple PPO agent with memory using JAX""" + + def __init__( + self, + network: NamedTuple, + initial_hidden_state: jnp.ndarray, + optimizer: optax.GradientTransformation, + random_key: jnp.ndarray, + gru_dim: int, + obs_spec: Tuple, + num_envs: int = 4, + num_minibatches: int = 16, + num_epochs: int = 4, + clip_value: bool = True, + value_coeff: float = 0.5, + anneal_entropy: bool = False, + entropy_coeff_start: float = 0.1, + entropy_coeff_end: float = 0.01, + entropy_coeff_horizon: int = 3_000_000, + ppo_clipping_epsilon: float = 0.2, + gamma: float = 0.99, + gae_lambda: float = 0.95, + player_id: int = 0, + ): + @jax.jit + def policy( + state: TrainingState, observation: jnp.ndarray, mem: MemoryState + ): + """Agent policy to select actions and calculate agent specific information""" + key, subkey = jax.random.split(state.random_key) + (dist, values, pred), hidden_state = network.apply( + state.params, observation, mem.hidden + ) + + actions = dist.sample(seed=subkey) + mem.extras["values"] = values + mem.extras["log_probs"] = dist.log_prob(actions) + mem = mem._replace(hidden=hidden_state, extras=mem.extras) + state = state._replace(random_key=key) + return ( + actions, + pred, + state, + mem, + ) + + @jax.jit + def gae_advantages( + rewards: jnp.ndarray, values: jnp.ndarray, dones: jnp.ndarray + ) -> jnp.ndarray: + """Calculates the gae advantages from a sequence. Note that the + arguments are of length = rollout length + 1""" + # 'Zero out' the terminated states + discounts = gamma * jnp.logical_not(dones) + reverse_batch = ( + jnp.flip(values[:-1], axis=0), + jnp.flip(rewards, axis=0), + jnp.flip(discounts, axis=0), + ) + + _, advantages = jax.lax.scan( + get_advantages, + ( + jnp.zeros_like(values[-1]), + values[-1], + jnp.ones_like(values[-1]) * gae_lambda, + ), + reverse_batch, + ) + + advantages = jnp.flip(advantages, axis=0) + target_values = values[:-1] + advantages # Q-value estimates + target_values = jax.lax.stop_gradient(target_values) + return advantages, target_values + + def loss( + params: hk.Params, + timesteps: int, + observations: jnp.ndarray, + actions: jnp.array, + behavior_log_probs: jnp.array, + target_values: jnp.array, + advantages: jnp.array, + behavior_values: jnp.array, + hiddens: jnp.ndarray, + ): + """Surrogate loss using clipped probability ratios.""" + (distribution, values), _ = network.apply( + params, observations, hiddens + ) + + log_prob = distribution.log_prob(actions) + entropy = distribution.entropy() + + # Compute importance sampling weights: current policy / behavior policy. + rhos = jnp.exp(log_prob - behavior_log_probs) + + # Policy loss: Clipping + clipped_ratios_t = jnp.clip( + rhos, 1.0 - ppo_clipping_epsilon, 1.0 + ppo_clipping_epsilon + ) + clipped_objective = jnp.fmin( + rhos * advantages, clipped_ratios_t * advantages + ) + policy_loss = -jnp.mean(clipped_objective) + + # Value loss: MSE + value_cost = value_coeff + unclipped_value_error = target_values - values + unclipped_value_loss = unclipped_value_error**2 + + # Value clipping + if clip_value: + # Clip values to reduce variablility during critic training. 
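+                # Pessimistic value clipping: the clipped estimate may move at most ppo_clipping_epsilon from the rollout-time value, and the loss takes the elementwise maximum of the clipped and unclipped squared errors against the GAE value targets, i.e. value_loss = mean(max((V - V_target)^2, (V_old + clip(V - V_old, -eps, eps) - V_target)^2)).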
+ clipped_values = behavior_values + jnp.clip( + values - behavior_values, + -ppo_clipping_epsilon, + ppo_clipping_epsilon, + ) + clipped_value_error = target_values - clipped_values + clipped_value_loss = clipped_value_error**2 + value_loss = jnp.mean( + jnp.fmax(unclipped_value_loss, clipped_value_loss) + ) + else: + value_loss = jnp.mean(unclipped_value_loss) + + # Entropy loss: Standard entropy term + # Calculate the new value based on linear annealing formula + if anneal_entropy: + fraction = jnp.fmax(1 - timesteps / entropy_coeff_horizon, 0) + entropy_cost = ( + fraction * entropy_coeff_start + + (1 - fraction) * entropy_coeff_end + ) + # Constant Entropy term + else: + entropy_cost = entropy_coeff_start + entropy_loss = -jnp.mean(entropy) + + # Total loss: Minimize policy and value loss; maximize entropy + total_loss = ( + policy_loss + + entropy_cost * entropy_loss + + value_loss * value_cost + ) + + return total_loss, { + "loss_total": total_loss, + "loss_policy": policy_loss, + "loss_value": value_loss, + "loss_entropy": entropy_loss, + "entropy_cost": entropy_cost, + } + + @jax.jit + def sgd_step( + state: TrainingState, sample: NamedTuple + ) -> Tuple[TrainingState, Dict[str, jnp.ndarray]]: + """Performs a minibatch SGD step, returning new state and metrics.""" + # Extract data + ( + observations, + actions, + rewards, + behavior_log_probs, + behavior_values, + dones, + hiddens, + ) = ( + sample.observations, + sample.actions, + sample.rewards, + sample.behavior_log_probs, + sample.behavior_values, + sample.dones, + sample.hiddens, + ) + + # batch_gae_advantages = jax.vmap(gae_advantages, 1, (0, 0)) + advantages, target_values = gae_advantages( + rewards=rewards, values=behavior_values, dones=dones + ) + + # Exclude the last step - it was only used for bootstrapping. + # The shape is [num_steps, num_envs, ..] + behavior_values = behavior_values[:-1, :] + trajectories = Batch( + observations=observations, + actions=actions, + advantages=advantages, + behavior_log_probs=behavior_log_probs, + target_values=target_values, + behavior_values=behavior_values, + hiddens=hiddens, + ) + # Concatenate all trajectories. Reshape from [num_envs, num_steps, ..] + # to [num_envs * num_steps,..] + assert len(target_values.shape) > 1 + num_envs = target_values.shape[1] + num_steps = target_values.shape[0] + batch_size = num_envs * num_steps + assert batch_size % num_minibatches == 0, ( + "Num minibatches must divide batch size. Got batch_size={}" + " num_minibatches={}." + ).format(batch_size, num_minibatches) + + batch = jax.tree_util.tree_map( + lambda x: x.reshape((batch_size,) + x.shape[2:]), trajectories + ) + # Compute gradients. + grad_fn = jax.jit(jax.grad(loss, has_aux=True)) + + def model_update_minibatch( + carry: Tuple[hk.Params, optax.OptState, int], + minibatch: Batch, + ) -> Tuple[ + Tuple[hk.Params, optax.OptState, int], Dict[str, jnp.ndarray] + ]: + """Performs model update for a single minibatch.""" + params, opt_state, timesteps = carry + # Normalize advantages at the minibatch level before using them. 
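+                # Standard PPO trick: per-minibatch zero-mean / unit-variance advantages keep the scale of the clipped surrogate objective stable across epochs; the 1e-8 term below guards against a zero standard deviation.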
+ advantages = ( + minibatch.advantages + - jnp.mean(minibatch.advantages, axis=0) + ) / (jnp.std(minibatch.advantages, axis=0) + 1e-8) + gradients, metrics = grad_fn( + params, + timesteps, + minibatch.observations, + minibatch.actions, + minibatch.behavior_log_probs, + minibatch.target_values, + advantages, + minibatch.behavior_values, + minibatch.hiddens, + ) + + # Apply updates + updates, opt_state = optimizer.update(gradients, opt_state) + params = optax.apply_updates(params, updates) + + metrics["norm_grad"] = optax.global_norm(gradients) + metrics["norm_updates"] = optax.global_norm(updates) + return (params, opt_state, timesteps), metrics + + def model_update_epoch( + carry: Tuple[ + jnp.ndarray, hk.Params, optax.OptState, int, Batch + ], + unused_t: Tuple[()], + ) -> Tuple[ + Tuple[jnp.ndarray, hk.Params, optax.OptState, Batch], + Dict[str, jnp.ndarray], + ]: + """Performs model updates based on one epoch of data.""" + key, params, opt_state, timesteps, batch = carry + key, subkey = jax.random.split(key) + permutation = jax.random.permutation(subkey, batch_size) + shuffled_batch = jax.tree_util.tree_map( + lambda x: jnp.take(x, permutation, axis=0), batch + ) + minibatches = jax.tree_util.tree_map( + lambda x: jnp.reshape( + x, [num_minibatches, -1] + list(x.shape[1:]) + ), + shuffled_batch, + ) + + (params, opt_state, timesteps), metrics = jax.lax.scan( + model_update_minibatch, + (params, opt_state, timesteps), + minibatches, + length=num_minibatches, + ) + return (key, params, opt_state, timesteps, batch), metrics + + params = state.params + opt_state = state.opt_state + timesteps = state.timesteps + + # Repeat training for the given number of epoch, taking a random + # permutation for every epoch. + # signature is scan(function, carry, tuple to iterate over, length) + (key, params, opt_state, timesteps, _), metrics = jax.lax.scan( + model_update_epoch, + (state.random_key, params, opt_state, timesteps, batch), + (), + length=num_epochs, + ) + + metrics = jax.tree_util.tree_map(jnp.mean, metrics) + metrics["rewards_mean"] = jnp.mean( + jnp.abs(jnp.mean(rewards, axis=(0, 1))) + ) + metrics["rewards_std"] = jnp.std(rewards, axis=(0, 1)) + + # Reset the memory + new_state = TrainingState( + params=params, + opt_state=opt_state, + random_key=key, + timesteps=timesteps + batch_size, + ) + + new_memory = MemoryState( + hidden=jnp.zeros(shape=(self._num_envs,) + (gru_dim,)), + extras={ + "log_probs": jnp.zeros(self._num_envs), + "values": jnp.zeros(self._num_envs), + }, + ) + + return new_state, new_memory, metrics + + def make_initial_state( + key: Any, initial_hidden_state: jnp.ndarray + ) -> TrainingState: + """Initialises the training state (parameters and optimiser state).""" + + # We pass through initial_hidden_state so its easy to batch memory + key, subkey = jax.random.split(key) + + if isinstance(obs_spec, dict): + dummy_obs = {} + for k, v in obs_spec.items(): + dummy_obs[k] = jnp.zeros(shape=v) + + else: + dummy_obs = jnp.zeros(shape=obs_spec) + dummy_obs = utils.add_batch_dim(dummy_obs) + initial_params = network.init( + subkey, dummy_obs, initial_hidden_state + ) + initial_opt_state = optimizer.init(initial_params) + return TrainingState( + random_key=key, + params=initial_params, + opt_state=initial_opt_state, + timesteps=0, + ), MemoryState( + hidden=jnp.zeros( + (num_envs, initial_hidden_state.shape[-1]) + ), # initial_hidden_state, + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + ) + + # @jax.jit + def prepare_batch( + traj_batch: 
NamedTuple, + done: Any, + action_extras: dict, + ): + # Rollouts complete -> Training begins + # Add an additional rollout step for advantage calculation + _value = jax.lax.select( + done, + jnp.zeros_like(action_extras["values"]), + action_extras["values"], + ) + + _value = jax.lax.expand_dims(_value, [0]) + + # need to add final value here + traj_batch = traj_batch._replace( + behavior_values=jnp.concatenate( + [traj_batch.behavior_values, _value], axis=0 + ) + ) + return traj_batch + + # Initialise training state (parameters, optimiser state, extras). + self._state, self._mem = make_initial_state( + random_key, initial_hidden_state + ) + + self.make_initial_state = make_initial_state + + self._prepare_batch = prepare_batch + self._sgd_step = jax.jit(sgd_step) + + # Set up counters and logger + self._logger = Logger() + self._total_steps = 0 + self._until_sgd = 0 + self._logger.metrics = { + "total_steps": 0, + "sgd_steps": 0, + "loss_total": 0, + "loss_policy": 0, + "loss_value": 0, + "loss_entropy": 0, + "entropy_cost": entropy_coeff_start, + } + + # Initialize functions + self._policy = policy + self.forward = network.apply + self.player_id = player_id + + # Other useful hyperparameters + self._num_envs = num_envs # number of environments + self._num_minibatches = num_minibatches # number of minibatches + self._num_epochs = num_epochs # number of epochs to use sample + self._gru_dim = gru_dim + + def reset_memory(self, memory, eval=False) -> TrainingState: + num_envs = 1 if eval else self._num_envs + memory = memory._replace( + extras={ + "values": jnp.zeros(num_envs), + "log_probs": jnp.zeros(num_envs), + }, + hidden=jnp.zeros((num_envs, self._gru_dim)), + ) + return memory + + def update( + self, + traj_batch: NamedTuple, + obs: jnp.ndarray, + state: TrainingState, + mem: MemoryState, + ): + + """Update the agent -> only called at the end of a trajectory""" + + _, _, mem = self._policy(state, obs, mem) + traj_batch = self._prepare_batch( + traj_batch, traj_batch.dones[-1, ...], mem.extras + ) + state, mem, metrics = self._sgd_step(state, traj_batch) + + # update logging + + self._logger.metrics["sgd_steps"] += ( + self._num_minibatches * self._num_epochs + ) + self._logger.metrics["loss_total"] = metrics["loss_total"] + self._logger.metrics["loss_policy"] = metrics["loss_policy"] + self._logger.metrics["loss_value"] = metrics["loss_value"] + self._logger.metrics["loss_entropy"] = metrics["loss_entropy"] + self._logger.metrics["entropy_cost"] = metrics["entropy_cost"] + return state, mem, metrics + + +# TODO: seed, and player_id not used in CartPole +def make_shaper_agent( + args, + agent_args, + obs_spec, + action_spec, + seed: int, + num_iterations: int, + player_id: int, +): + """Make PPO agent""" + # Network + if args.env_id == "CartPole-v1": + network, initial_hidden_state = make_GRU_cartpole_network(action_spec) + elif args.env_id == "coin_game": + network, initial_hidden_state = make_GRU_coingame_att_network( + action_spec, + agent_args.with_cnn, + agent_args.hidden_size, + agent_args.output_channels, + agent_args.kernel_shape, + ) + elif args.env_id == "iterated_matrix_game": + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipd_att_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipd_avg_network( + action_spec, agent_args.hidden_size + ) + elif args.att_type=='nothing': + network, initial_hidden_state = make_GRU_ipd_network( + action_spec, agent_args.hidden_size + ) + + elif 
args.env_id == "InTheMatrix": + if args.att_type=='avg': + network, initial_hidden_state = make_GRU_ipditm_avg_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + if args.att_type=='att': + network, initial_hidden_state = make_GRU_ipditm_att_network( + action_spec, + agent_args.hidden_size, + agent_args.separate, + agent_args.output_channels, + agent_args.kernel_shape, + ) + + gru_dim = initial_hidden_state.shape[1] + + initial_hidden_state = jnp.zeros( + (args.num_envs, initial_hidden_state.shape[1]) + ) + + # Optimizer + transition_steps = ( + num_iterations * agent_args.num_epochs * agent_args.num_minibatches + ) + + if agent_args.lr_scheduling: + scheduler = optax.linear_schedule( + init_value=agent_args.learning_rate, + end_value=0, + transition_steps=transition_steps, + ) + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale_by_schedule(scheduler), + optax.scale(-1), + ) + + else: + optimizer = optax.chain( + optax.clip_by_global_norm(agent_args.max_gradient_norm), + optax.scale_by_adam(eps=agent_args.adam_epsilon), + optax.scale(-agent_args.learning_rate), + ) + + # Random key + random_key = jax.random.PRNGKey(seed=seed) + + agent = PPO( + network=network, + initial_hidden_state=initial_hidden_state, + optimizer=optimizer, + random_key=random_key, + gru_dim=gru_dim, + obs_spec=obs_spec, + num_envs=args.num_envs, + num_minibatches=agent_args.num_minibatches, + num_epochs=agent_args.num_epochs, + clip_value=agent_args.clip_value, + value_coeff=agent_args.value_coeff, + anneal_entropy=agent_args.anneal_entropy, + entropy_coeff_start=agent_args.entropy_coeff_start, + entropy_coeff_end=agent_args.entropy_coeff_end, + entropy_coeff_horizon=agent_args.entropy_coeff_horizon, + ppo_clipping_epsilon=agent_args.ppo_clipping_epsilon, + gamma=agent_args.gamma, + gae_lambda=agent_args.gae_lambda, + player_id=player_id, + ) + return agent + + +if __name__ == "__main__": + pass diff --git a/pax/conf/experiment/imp/mfos_att_v_tabular.yaml b/pax/conf/experiment/imp/mfos_att_v_tabular.yaml new file mode 100644 index 00000000..963289cd --- /dev/null +++ b/pax/conf/experiment/imp/mfos_att_v_tabular.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[1, -1], [-1, 1], [-1, 1], [1, -1]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'avg' + +# MFOS vs. Tabular trained on seed = 0 +run_path: ucl-dark/ipd/1r9txdso +model_path: exp/GS-MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_20.32.20.821162/generation_4400 +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: imp + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize} + log: False + + diff --git a/pax/conf/experiment/imp/shaper_att_v_tabular.yaml b/pax/conf/experiment/imp/shaper_att_v_tabular.yaml new file mode 100644 index 00000000..4fa219de --- /dev/null +++ b/pax/conf/experiment/imp/shaper_att_v_tabular.yaml @@ -0,0 +1,103 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[1, -1], [-1, 1], [-1, 1], [1, -1]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +att_type: 'avg' + +# Evaluation +run_path: ucl-dark/ipd/1ui7wfop +model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 
0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: imp + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps} + log: False + + diff --git a/pax/conf/experiment/imp/shaper_att_v_tabular_hardstop.yaml b/pax/conf/experiment/imp/shaper_att_v_tabular_hardstop.yaml new file mode 100644 index 00000000..410c12d5 --- /dev/null +++ b/pax/conf/experiment/imp/shaper_att_v_tabular_hardstop.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +att_type: 'avg' + +# Evaluation +AVG-model +run_path: ucl-dark/imp/1dfrc0c5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10/2023-05-11_15.30.16.570714/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 
# eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type} + log: False + + diff --git a/pax/conf/experiment/impitm/train_shaper_att.yaml b/pax/conf/experiment/impitm/train_shaper_att.yaml new file mode 100644 index 00000000..b36e7ed3 --- /dev/null +++ b/pax/conf/experiment/impitm/train_shaper_att.yaml @@ -0,0 +1,116 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'PPO_memory' + +# Environment +env_id: InTheMatrix +env_type: meta +env_discount: 0.96 +freeze: 5 +payoff: [[[1, -1], [-1, 1]], [[-1, 1], [1, -1]]] +fixed_coins: False + +# Save +save: True +save_interval: 100 +benchmark: False + +# Runner +runner: evo + +# Training +top_k: 8 +popsize: 128 #512 +# total popsize = popsize * num_devices +num_envs: 50 +num_opps: 1 +num_devices: 8 +num_outer_steps: 500 +num_inner_steps: 152 +num_iters: 5000 +att_type: avg + +# Evaluation +run_path: ucl-dark/cg/3mpgbfm2 +model_path: exp/coin_game-EARL-PPO_memory-vs-Random/run-seed-0/2022-09-08_20.41.03.643377/generation_30 + +# PPO agent parameters +ppo1: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: False # only works with CNN + hidden_size: 32 + +ppo2: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: True # only works with CNN + hidden_size: 8 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES, SimpleGA] + sigma_init: 0.075 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.05 # Initial learning rate + lrate_decay: 0.999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + elite_ratio: 0.1 + centered_rank: True # Fitness centered_rank + w_decay: 0.1 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: False # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipditm + group: 'shaping-${agent1}-vs-${agent2}' + name: run-seed-${seed} + log: True \ No newline at end of file diff --git a/pax/conf/experiment/ipd/gs_v_ppo.yaml b/pax/conf/experiment/ipd/gs_v_ppo.yaml index b2ef7f3e..051dd9ba 100644 --- 
a/pax/conf/experiment/ipd/gs_v_ppo.yaml +++ b/pax/conf/experiment/ipd/gs_v_ppo.yaml @@ -11,7 +11,7 @@ env_discount: 0.96 payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] # Runner -runner: evo +runner: evo # Training top_k: 5 @@ -44,6 +44,26 @@ ppo1: with_cnn: False hidden_size: 16 +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + # ES parameters es: algo: OpenES # [OpenES, CMA_ES] diff --git a/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml b/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml index 55995b03..6bc1de60 100644 --- a/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml +++ b/pax/conf/experiment/ipd/gs_v_ppo_mem.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 @@ -44,7 +44,25 @@ ppo1: with_cnn: False hidden_size: 16 - +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 # ES parameters es: diff --git a/pax/conf/experiment/ipd/gs_v_tabular.yaml b/pax/conf/experiment/ipd/gs_v_tabular.yaml index 639a3cf4..aac6e0f4 100644 --- a/pax/conf/experiment/ipd/gs_v_tabular.yaml +++ b/pax/conf/experiment/ipd/gs_v_tabular.yaml @@ -18,45 +18,12 @@ top_k: 5 popsize: 1000 num_envs: 2 num_opps: 1 -num_outer_steps: 1 +num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 num_devices: 1 -# Evaluation -# GS vs. Tabular trained on seed=0, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/1gg0p92x -# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2022-09-28_01.57.34.854198/generation_4900 - -# GS vs. Tabular trained on seed=1, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/scffrmfv -# model_path: exp/GS-PPO-vs-Tabular/run-seed-1-pop-size-1000/2022-09-28_05.00.56.131987/generation_4900 - -# GS vs. Tabular trained on seed=2, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/2858x8sa -# model_path: exp/GS-PPO-vs-Tabular/run-seed-2-pop-size-1000/2022-09-28_07.38.37.221049/generation_4900 - -# GS vs. Tabular trained on seed=3, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/1y9tefvj -# model_path: exp/GS-PPO-vs-Tabular/run-seed-3-pop-size-1000/2022-09-28_01.57.40.696321/generation_4900 - -# GS vs. Tabular trained on seed=4, where Naive Learners have their learning rate annealed halfway through the trial -# run_path: ucl-dark/ipd/8j6zmb6h -# model_path: exp/GS-PPO-vs-Tabular/run-seed-4-pop-size-1000/2022-09-28_05.11.49.206169/generation_4900 - -# GS vs. Tabular trained on seed = 0 -# run_path: ucl-dark/ipd/tywwxijw -# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_16.06.55.715665/generation_4900 -# GS vs. 
Tabular trained on seed = 1 -# run_path: ucl-dark/ipd/2lyn9n10 -# model_path: exp/GS-PPO-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_16.07.48.978281/generation_4900 -# GS vs. Tabular trained on seed = 2 -# run_path: ucl-dark/ipd/f2xhuhcz -# model_path: exp/GS-PPO-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_16.08.35.015944/generation_4900 -# GS vs. Tabular trained on seed = 3 -# run_path: ucl-dark/ipd/16wzxeb6 -# model_path: exp/GS-PPO-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_16.09.01.274669/generation_4900 -# GS vs. Tabular trained on seed = 4 +# Evaluation run_path: ucl-dark/ipd/3dzkof3f model_path: exp/GS-PPO-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_16.41.50.643263/generation_4900 @@ -81,6 +48,27 @@ ppo1: with_cnn: False hidden_size: 16 +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + # ES parameters es: algo: OpenES # [OpenES, CMA_ES] @@ -91,7 +79,7 @@ es: init_max: 0.0 # Range of parameter mean initialization - Max clip_min: -1e10 # Range of parameter proposals - Min clip_max: 1e10 # Range of parameter proposals - Max - lrate_init: 0.01 # Initial learning rate + lrate_init: 0.1 # Initial learning rate lrate_decay: 0.9999 # Multiplicative decay factor lrate_limit: 0.001 # Smallest possible lrate beta_1: 0.99 # Adam - beta_1 diff --git a/pax/conf/experiment/ipd/gs_v_tabular_hardstop_eval.yaml b/pax/conf/experiment/ipd/gs_v_tabular_hardstop_eval.yaml new file mode 100644 index 00000000..ace3f5aa --- /dev/null +++ b/pax/conf/experiment/ipd/gs_v_tabular_hardstop_eval.yaml @@ -0,0 +1,140 @@ +# @package _global_ + +# Agents +agent1: 'PPO' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: eval_hardstop + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +stop: 2 + +# run_path: ucl-dark/ipd/3ipiqfwz +# model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2023-05-23_13.41.36.367352/generation_900 + +run_path: ucl-dark/ipd/hl9q06ix +model_path: exp/GS-PPO-vs-Tabular/run-seed-0-pop-size-1000/2023-05-23_15.00.59.246054/generation_300 +# Evaluation +# # AVG-model 0 +# run_path: ucl-dark/ipd/1n313hkb +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_ppo.yaml b/pax/conf/experiment/ipd/mfos_att_v_ppo.yaml new file mode 100644 index 00000000..d82a379b --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_ppo.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'PPO' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'avg' + +# MFOS vs. 
Tabular trained on seed = 0 +run_path: ucl-dark/ipd/1r9txdso +model_path: exp/GS-MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_20.32.20.821162/generation_4400 + + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize} + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_tabular.yaml b/pax/conf/experiment/ipd/mfos_att_v_tabular.yaml new file mode 100644 index 00000000..e1c4900c --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_tabular.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'avg' + +# MFOS vs. 
Tabular trained on seed = 0 +run_path: ucl-dark/ipd/1r9txdso +model_path: exp/GS-MFOS-vs-Tabular/run-seed-0-pop-size-1000/2022-09-25_20.32.20.821162/generation_4400 + + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize} + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_tabular_hardstop_eval.yaml b/pax/conf/experiment/ipd/mfos_att_v_tabular_hardstop_eval.yaml new file mode 100644 index 00000000..a66b2aa9 --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_tabular_hardstop_eval.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: eval_hardstop + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +stop: 100 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_att_v_tabular_stevie.yaml b/pax/conf/experiment/ipd/mfos_att_v_tabular_stevie.yaml new file mode 100644 index 00000000..3d641a1a --- /dev/null +++ b/pax/conf/experiment/ipd/mfos_att_v_tabular_stevie.yaml @@ -0,0 +1,155 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/mfos_v_tabular.yaml b/pax/conf/experiment/ipd/mfos_v_tabular.yaml index ea55983e..f414ef45 100644 --- a/pax/conf/experiment/ipd/mfos_v_tabular.yaml +++ b/pax/conf/experiment/ipd/mfos_v_tabular.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 diff --git a/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml b/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml index 61ec02ee..5314a22b 100644 --- a/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml +++ b/pax/conf/experiment/ipd/ppo_mem_v_tft.yaml @@ -42,7 +42,27 @@ ppo1: entropy_coeff_horizon: 1e7 entropy_coeff_end: 0.001 lr_scheduling: True - learning_rate: 0.001 + learning_rate: 0.1 #0.001 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 4 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: True + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 1e7 + entropy_coeff_end: 0.001 + lr_scheduling: True + learning_rate: 0.1 adam_epsilon: 1e-5 with_memory: True with_cnn: False diff --git a/pax/conf/experiment/ipd/shaper_att_v_ppo.yaml b/pax/conf/experiment/ipd/shaper_att_v_ppo.yaml new file mode 100644 index 00000000..5f640045 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_ppo.yaml @@ -0,0 +1,103 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 
'PPO' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'nothing' + +# Evaluation +run_path: ucl-dark/ipd/1ui7wfop +model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.01 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps} + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_ppo_mem.yaml b/pax/conf/experiment/ipd/shaper_att_v_ppo_mem.yaml new file mode 100644 index 00000000..123dfb16 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_ppo_mem.yaml @@ -0,0 +1,98 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'PPO_memory' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +num_devices: 1 +att_type: 'nothing' + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 0.1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + 
num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 0.1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.01 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps} + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml b/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml new file mode 100644 index 00000000..c760e511 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_tabular.yaml @@ -0,0 +1,104 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: evo_mixed_payoff + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 10 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1000 +att_type: 'nothing' +num_devices: 1 + +# Evaluation +run_path: ucl-dark/ipd/1ui7wfop +model_path: exp/EARL-PPO_memory-vs-PPO/run-seed-25-OpenES-pop-size-1000-num-opps-1/2022-09-15_02.32.16.559924/generation_2900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of 
parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type} + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_tabular_hardstop_eval.yaml b/pax/conf/experiment/ipd/shaper_att_v_tabular_hardstop_eval.yaml new file mode 100644 index 00000000..b023a2e2 --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_tabular_hardstop_eval.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: eval_hardstop + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +stop: 2 + +# Evaluation +# # AVG-model 0 +# run_path: ucl-dark/ipd/1n313hkb +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + 
value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_att_v_tabular_stevie.yaml b/pax/conf/experiment/ipd/shaper_att_v_tabular_stevie.yaml new file mode 100644 index 00000000..5e37177c --- /dev/null +++ b/pax/conf/experiment/ipd/shaper_att_v_tabular_stevie.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/shaper_v_ppo.yaml b/pax/conf/experiment/ipd/shaper_v_ppo.yaml index 1543cb30..8ce491e7 100644 --- a/pax/conf/experiment/ipd/shaper_v_ppo.yaml +++ b/pax/conf/experiment/ipd/shaper_v_ppo.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 @@ -79,7 +79,7 @@ es: init_max: 0.0 # Range of parameter mean initialization - Max clip_min: -1e10 # Range of parameter proposals - Min clip_max: 1e10 # Range of parameter proposals - Max - lrate_init: 0.01 # Initial learning rate + lrate_init: 0.1 # Initial learning rate lrate_decay: 0.9999 # Multiplicative decay factor lrate_limit: 0.001 # Smallest possible lrate beta_1: 0.99 # Adam - beta_1 diff --git a/pax/conf/experiment/ipd/shaper_v_tabular.yaml b/pax/conf/experiment/ipd/shaper_v_tabular.yaml index 10ba189f..364a14d2 100644 --- a/pax/conf/experiment/ipd/shaper_v_tabular.yaml +++ b/pax/conf/experiment/ipd/shaper_v_tabular.yaml @@ -17,7 +17,7 @@ runner: evo top_k: 5 popsize: 1000 num_envs: 2 -num_opps: 1 +num_opps: 10 num_outer_steps: 100 num_inner_steps: 100 num_iters: 5000 diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_0.yaml new file mode 100644 index 00000000..f66ebd0f --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_0.yaml @@ -0,0 +1,156 @@ +# @package _global_ + 
+# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 23 + +# MFOS vs. Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_1.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_1.yaml new file mode 100644 index 00000000..3ca9736f --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_1.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 65 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +run_path: ucl-dark/ipd/eopf93re +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_2.yaml new file mode 100644 index 00000000..60384066 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_2.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 47 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +run_path: ucl-dark/ipd/1sqbd09n +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_3.yaml new file mode 100644 index 00000000..4042ad3d --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_3.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 8 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +run_path: ucl-dark/ipd/3n7l8ods +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_4.yaml new file mode 100644 index 00000000..e869d86e --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/ten/mfos_avg_4.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 6 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +run_path: ucl-dark/ipd/4mf1ecxq +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_0.yaml new file mode 100644 index 00000000..ddbd91c4 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_0.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_1.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_1.yaml new file mode 100644 index 00000000..4307f733 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_1.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 65 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +run_path: ucl-dark/ipd/eopf93re +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_2.yaml new file mode 100644 index 00000000..f39b4163 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_2.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 47 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +run_path: ucl-dark/ipd/1sqbd09n +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_3.yaml new file mode 100644 index 00000000..7ea81f4d --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_3.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 8 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +run_path: ucl-dark/ipd/3n7l8ods +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_4.yaml new file mode 100644 index 00000000..90533f91 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/twenty/mfos_avg_4.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 6 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +run_path: ucl-dark/ipd/4mf1ecxq +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_0.yaml new file mode 100644 index 00000000..e147b834 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_0.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +run_path: ucl-dark/ipd/4ykf9oe8 +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_1.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_1.yaml new file mode 100644 index 00000000..88552d56 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_1.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 65 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +run_path: ucl-dark/ipd/eopf93re +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_2.yaml new file mode 100644 index 00000000..1ced362c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_2.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 47 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +run_path: ucl-dark/ipd/1sqbd09n +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_3.yaml new file mode 100644 index 00000000..9484f767 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_3.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 8 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +run_path: ucl-dark/ipd/3n7l8ods +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +# run_path: ucl-dark/ipd/4mf1ecxq +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_4.yaml new file mode 100644 index 00000000..b60172de --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_avg/two/mfos_avg_4.yaml @@ -0,0 +1,156 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'avg' +seed: 6 + +# MFOS vs. 
Tabular trained on seed = 0 +# AVG model seed=23 +# run_path: ucl-dark/ipd/4ykf9oe8 +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-11_14.58.45.927266/generation_900 + +## avg model 1 seed=65 +# run_path: ucl-dark/ipd/eopf93re +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000/2023-05-11_20.31.48.530245/generation_900 + +# #avg model 2 seed=47 +# run_path: ucl-dark/ipd/1sqbd09n +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-11_17.45.03.318240/generation_900 + +# ## avg model 3 seed=8 +# run_path: ucl-dark/ipd/3n7l8ods +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-11_12.12.19.914211/generation_900 + +# ## avg model 4 seed=6 +run_path: ucl-dark/ipd/4mf1ecxq +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-11_09.25.40.656392/generation_900 + +# #nothing model, seed=23 +# run_path: ucl-dark/ipd/3i5m1agd +# model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_0.yaml new file mode 100644 index 00000000..389cbe64 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_0.yaml @@ -0,0 +1,137 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 + +# #nothing model, seed=23 +run_path: ucl-dark/ipd/3i5m1agd +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_1.yaml 
b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_1.yaml new file mode 100644 index 00000000..edf3b243 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_1.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 65 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=65 +run_path: ucl-dark/ipd/1s3kty0d +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_2.yaml new file mode 100644 index 00000000..3b021ef9 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_2.yaml @@ -0,0 +1,130 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 47 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=47 +run_path: ucl-dark/ipd/37v877f5 +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. 
Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_3.yaml new file mode 100644 index 00000000..2dd1a6f8 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_3.yaml @@ -0,0 +1,125 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 8 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=8 +run_path: ucl-dark/ipd/1wcrrl9h +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_4.yaml new file mode 100644 index 00000000..ca5deaae --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/ten/mfos_nothing_4.yaml @@ -0,0 +1,121 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 6 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=6 +run_path: ucl-dark/ipd/1vkddd7q +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_0.yaml new file mode 100644 index 00000000..7787db5c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_0.yaml @@ -0,0 +1,137 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 + +# #nothing model, seed=23 +run_path: ucl-dark/ipd/3i5m1agd +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_1.yaml 
b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_1.yaml new file mode 100644 index 00000000..f3f34090 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_1.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 65 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=65 +run_path: ucl-dark/ipd/1s3kty0d +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_2.yaml new file mode 100644 index 00000000..cb33a46c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_2.yaml @@ -0,0 +1,130 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 47 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=47 +run_path: ucl-dark/ipd/37v877f5 +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. 
Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_3.yaml new file mode 100644 index 00000000..afbdf4fd --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_3.yaml @@ -0,0 +1,125 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 8 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=8 +run_path: ucl-dark/ipd/1wcrrl9h +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_4.yaml new file mode 100644 index 00000000..a9a1f7ca --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/twenty/mfos_nothing_4.yaml @@ -0,0 +1,121 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 6 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=6 +run_path: ucl-dark/ipd/1vkddd7q +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_0.yaml new file mode 100644 index 00000000..332d7f09 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_0.yaml @@ -0,0 +1,137 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 23 + +# MFOS vs. 
Tabular trained on seed = 0 + +# #nothing model, seed=23 +run_path: ucl-dark/ipd/3i5m1agd +model_path: exp/MFOS-vs-Tabular/run-seed-23-pop-size-1000/2023-05-14_21.53.30.316487/generation_900 + +# #nothing model, seed=65 +# run_path: ucl-dark/ipd/1s3kty0d +# model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_1.yaml 
b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_1.yaml new file mode 100644 index 00000000..028a2570 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_1.yaml @@ -0,0 +1,134 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 65 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=65 +run_path: ucl-dark/ipd/1s3kty0d +model_path: exp/MFOS-vs-Tabular/run-seed-65-pop-size-1000 /2023-05-15_03.29.53.728701/generation_900 + +# #nothing model, seed=47 +# run_path: ucl-dark/ipd/37v877f5 +# model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. 
Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_2.yaml new file mode 100644 index 00000000..a78712ef --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_2.yaml @@ -0,0 +1,130 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 47 + +# MFOS vs. Tabular trained on seed = 0 + + +# #nothing model, seed=47 +run_path: ucl-dark/ipd/37v877f5 +model_path: exp/MFOS-vs-Tabular/run-seed-47-pop-size-1000/2023-05-15_00.41.45.864580/generation_900 + +# #nothing model, seed=8 +# run_path: ucl-dark/ipd/1wcrrl9h +# model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. 
Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_3.yaml new file mode 100644 index 00000000..ab859fad --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_3.yaml @@ -0,0 +1,125 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 8 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=8 +run_path: ucl-dark/ipd/1wcrrl9h +model_path: exp/MFOS-vs-Tabular/run-seed-8-pop-size-1000/2023-05-14_19.05.09.345813/generation_900 + +# #nothing model, seed=6 +# run_path: ucl-dark/ipd/1vkddd7q +# model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_4.yaml new file mode 100644 index 00000000..2fc8c00d --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/mfos_nothing/two/mfos_nothing_4.yaml @@ -0,0 +1,121 @@ +# @package _global_ + +# Agents +agent1: 'MFOS' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_outer_steps: 100 +num_inner_steps: 100 +num_steps: 10000 +num_iters: 1 +num_devices: 1 +att_type: 'nothing' +seed: 6 + +# MFOS vs. Tabular trained on seed = 0 + +# #nothing model, seed=6 +run_path: ucl-dark/ipd/1vkddd7q +model_path: exp/MFOS-vs-Tabular/run-seed-6-pop-size-1000/2023-05-14_16.16.28.014913/generation_900 + + +# MFOS vs. 
Tabular trained on seed = 1 +# run_path: ucl-dark/ipd/13srlkhp +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-1-pop-size-1000/2022-09-25_20.33.11.352762/generation_4300 +# MFOS vs. Tabular trained on seed = 2 +# run_path: ucl-dark/ipd/3pfmqrpw +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-2-pop-size-1000/2022-09-25_20.34.04.832865/generation_4400 +# MFOS vs. Tabular trained on seed = 3 +# run_path: ucl-dark/ipd/groh4iwx +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-3-pop-size-1000/2022-09-25_20.36.02.555928/generation_4400 +# MFOS vs. Tabular trained on seed = 4 +# run_path: ucl-dark/ipd/26cqaqyc +# model_path: exp/GS-MFOS-vs-Tabular/run-seed-4-pop-size-1000/2022-09-25_20.38.01.382774/generation_4400 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: False + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: True # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: '${agent1}-vs-${agent2}' + name: run-seed-${seed}-pop-size-${popsize}-stevie + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_0.yaml new file mode 100644 index 00000000..2c4d74a4 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_0.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 65 + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_1.yaml new file mode 100644 index 00000000..57455307 --- /dev/null +++ 
b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_1.yaml @@ -0,0 +1,132 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 23 + +# Evaluation + +## avg-model 1 seed23 +run_path: ucl-dark/ipd/2jtks2rd +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: 
True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_2.yaml new file mode 100644 index 00000000..013741f7 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/ten/shaper_avg_2.yaml @@ -0,0 +1,128 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 6 + +# Evaluation + +## avg-model 2 seed 6 +run_path: ucl-dark/ipd/2d4s9hl2 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + 
lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_0.yaml new file mode 100644 index 00000000..24ceeebd --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_0.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 65 + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + 
clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_1.yaml new file mode 100644 index 00000000..4c412365 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_1.yaml @@ -0,0 +1,132 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 23 + +# Evaluation + +## avg-model 1 seed23 +run_path: ucl-dark/ipd/2jtks2rd +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + 
num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_2.yaml new file mode 100644 index 00000000..5555c0e5 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/twenty/shaper_avg_2.yaml @@ -0,0 +1,128 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 6 + +# Evaluation + +## avg-model 2 seed 6 +run_path: ucl-dark/ipd/2d4s9hl2 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_0.yaml new file mode 100644 index 00000000..93f850c1 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_0.yaml @@ -0,0 +1,135 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 65 + +# Evaluation +# # AVG-model 0 +run_path: ucl-dark/ipd/1n313hkb +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_15.31.22.592492/generation_900 + +## avg-model 1 seed23 +# run_path: ucl-dark/ipd/2jtks2rd +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_1.yaml new file mode 100644 index 00000000..a276e9c2 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_1.yaml @@ -0,0 +1,132 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: 
[[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 23 + +# Evaluation + +## avg-model 1 seed23 +run_path: ucl-dark/ipd/2jtks2rd +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_12.29.16.667285/generation_900 + +## avg-model 2 seed 6 +# run_path: ucl-dark/ipd/2d4s9hl2 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: 
run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_2.yaml new file mode 100644 index 00000000..3ca42a9c --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_avg/two/shaper_avg_2.yaml @@ -0,0 +1,128 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'avg' +seed: 6 + +# Evaluation + +## avg-model 2 seed 6 +run_path: ucl-dark/ipd/2d4s9hl2 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-avg/2023-05-11_09.27.30.020298/generation_900 + + +# # nothing-model 0 +# run_path: ucl-dark/ipd/2jpssoai +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 
1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_0.yaml new file mode 100644 index 00000000..7c853ef8 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_0.yaml @@ -0,0 +1,123 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 36 + +# Evaluation + +# # nothing-model 0 +run_path: ucl-dark/ipd/2jpssoai +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 
0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_1.yaml new file mode 100644 index 00000000..9a417276 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_1.yaml @@ -0,0 +1,119 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 65 + +# Evaluation + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + 
lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_2.yaml new file mode 100644 index 00000000..4eb364ef --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_2.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 47 + +# Evaluation + +# # nothing-model 2 47 +run_path: ucl-dark/ipd/1jk5zly5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise 
fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_3.yaml new file mode 100644 index 00000000..2a0106e4 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_3.yaml @@ -0,0 +1,111 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 23 + +# Evaluation + +# # nothing-model 3 23 +run_path: ucl-dark/ipd/1cvpiolk +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_4.yaml 
b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_4.yaml new file mode 100644 index 00000000..3954f424 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/ten/shaper_nothing_4.yaml @@ -0,0 +1,107 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 10 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 6 + +# Evaluation + +# # nothing-model 4 6 +run_path: ucl-dark/ipd/3vml0wjy +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_0.yaml new file mode 100644 index 00000000..e5c993b2 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_0.yaml @@ -0,0 +1,123 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 36 + +# Evaluation + +# # 
nothing-model 0 +run_path: ucl-dark/ipd/2jpssoai +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_1.yaml new file mode 100644 index 00000000..03cb9e85 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_1.yaml @@ -0,0 +1,119 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 
+num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 65 + +# Evaluation + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_2.yaml new file mode 100644 index 00000000..b78a948b --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_2.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 
+num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 47 + +# Evaluation + +# # nothing-model 2 47 +run_path: ucl-dark/ipd/1jk5zly5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_3.yaml new file mode 100644 index 00000000..d1ed08a2 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_3.yaml @@ -0,0 +1,111 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 23 + +# Evaluation + +# # nothing-model 3 23 +run_path: ucl-dark/ipd/1cvpiolk +model_path: 
exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_4.yaml new file mode 100644 index 00000000..136903f9 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/twenty/shaper_nothing_4.yaml @@ -0,0 +1,107 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 20 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 6 + +# Evaluation + +# # nothing-model 4 6 +run_path: ucl-dark/ipd/3vml0wjy +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + 
lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_0.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_0.yaml new file mode 100644 index 00000000..5e371be3 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_0.yaml @@ -0,0 +1,123 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 36 + +# Evaluation + +# # nothing-model 0 +run_path: ucl-dark/ipd/2jpssoai +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-36-OpenES-pop-size-1000-num-opps-10/2023-04-26_03.30.19.506954/generation_900 + +# # nothing-model 1 seed 65 +# run_path: ucl-dark/ipd/2m3wh5g7 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + 
clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_1.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_1.yaml new file mode 100644 index 00000000..bcc10e07 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_1.yaml @@ -0,0 +1,119 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 65 + +# Evaluation + +# # nothing-model 1 seed 65 +run_path: ucl-dark/ipd/2m3wh5g7 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-65-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_22.17.07.592872/generation_900 + +# # nothing-model 2 47 +# run_path: ucl-dark/ipd/1jk5zly5 +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 
0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_2.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_2.yaml new file mode 100644 index 00000000..1a108e1f --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_2.yaml @@ -0,0 +1,115 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 47 + +# Evaluation + +# # nothing-model 2 47 +run_path: ucl-dark/ipd/1jk5zly5 +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-47-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_20.46.58.588813/generation_900 + +# # nothing-model 3 23 +# run_path: ucl-dark/ipd/1cvpiolk +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent 
parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_3.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_3.yaml new file mode 100644 index 00000000..7801c704 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_3.yaml @@ -0,0 +1,111 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 23 + +# Evaluation + +# # nothing-model 3 23 +run_path: ucl-dark/ipd/1cvpiolk +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-23-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_19.16.56.990716/generation_900 + +# # nothing-model 4 6 +# run_path: ucl-dark/ipd/3vml0wjy +# model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # 
[OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False # Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_4.yaml b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_4.yaml new file mode 100644 index 00000000..8c308c50 --- /dev/null +++ b/pax/conf/experiment/ipd/stevie/shaper_nothing/two/shaper_nothing_4.yaml @@ -0,0 +1,107 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'Tabular' + +# Environment +env_id: iterated_matrix_game +env_type: meta +env_discount: 0.96 +payoff: [[-1, -1], [-3, 0], [0, -3], [-2, -2]] + +# Runner +runner: stevie + +# Training +top_k: 5 +popsize: 1000 +num_envs: 2 +num_opps: 1 +num_steps: 10000 +num_outer_steps: 100 +num_inner_steps: 100 +num_iters: 1 +att_type: 'nothing' +seed: 6 + +# Evaluation + +# # nothing-model 4 6 +run_path: ucl-dark/ipd/3vml0wjy +model_path: exp/EARL-Shaper-vs-Tabular/run-seed-6-OpenES-pop-size-1000-num-opps-10-att-type-nothing/2023-05-14_16.16.19.180942/generation_900 + +# PPO agent parameters +ppo1: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + +# PPO agent parameters +ppo2: + num_minibatches: 4 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.02 + entropy_coeff_horizon: 2000000 + entropy_coeff_end: 0.001 + lr_scheduling: False + learning_rate: 1 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: False + hidden_size: 16 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES] + sigma_init: 0.04 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.1 # Initial learning rate + lrate_decay: 0.9999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + centered_rank: False 
# Fitness centered_rank + w_decay: 0 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: True # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipd + group: 'EARL-${agent1}-vs-${agent2}' + name: run-seed-${seed}-${es.algo}-pop-size-${popsize}-num-opps-${num_opps}-att-type-${att_type}-hardstop + log: False + + diff --git a/pax/conf/experiment/ipditm/train_shaper_att.yaml b/pax/conf/experiment/ipditm/train_shaper_att.yaml new file mode 100644 index 00000000..2ba52646 --- /dev/null +++ b/pax/conf/experiment/ipditm/train_shaper_att.yaml @@ -0,0 +1,116 @@ +# @package _global_ + +# Agents +agent1: 'Shaper' +agent2: 'PPO_memory' + +# Environment +env_id: InTheMatrix +env_type: meta +env_discount: 0.96 +freeze: 5 +payoff: [[[3, 0], [5, 1]], [[3, 5], [0, 1]]] +fixed_coins: True + +# Save +save: True +save_interval: 100 +benchmark: False + +# Runner +runner: evo + +# Training +top_k: 8 +popsize: 128 #512 +# total popsize = popsize * num_devices +num_envs: 50 +num_opps: 1 +num_devices: 8 +num_outer_steps: 500 +num_inner_steps: 152 +num_iters: 5000 +att_type: avg + +# Evaluation +run_path: ucl-dark/cg/3mpgbfm2 +model_path: exp/coin_game-EARL-PPO_memory-vs-Random/run-seed-0/2022-09-08_20.41.03.643377/generation_30 + +# PPO agent parameters +ppo1: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: False # only works with CNN + hidden_size: 32 + +ppo2: + num_minibatches: 8 + num_epochs: 2 + gamma: 0.96 + gae_lambda: 0.95 + ppo_clipping_epsilon: 0.2 + value_coeff: 0.5 + clip_value: True + max_gradient_norm: 0.5 + anneal_entropy: False + entropy_coeff_start: 0.1 + entropy_coeff_horizon: 0.6e8 + entropy_coeff_end: 0.005 + lr_scheduling: False + learning_rate: 0.005 + adam_epsilon: 1e-5 + with_memory: True + with_cnn: True + output_channels: 16 + kernel_shape: [3, 3] + separate: True # only works with CNN + hidden_size: 8 + + +# ES parameters +es: + algo: OpenES # [OpenES, CMA_ES, SimpleGA] + sigma_init: 0.075 # Initial scale of isotropic Gaussian noise + sigma_decay: 0.999 # Multiplicative decay factor + sigma_limit: 0.01 # Smallest possible scale + init_min: 0.0 # Range of parameter mean initialization - Min + init_max: 0.0 # Range of parameter mean initialization - Max + clip_min: -1e10 # Range of parameter proposals - Min + clip_max: 1e10 # Range of parameter proposals - Max + lrate_init: 0.05 # Initial learning rate + lrate_decay: 0.999 # Multiplicative decay factor + lrate_limit: 0.001 # Smallest possible lrate + beta_1: 0.99 # Adam - beta_1 + beta_2: 0.999 # Adam - beta_2 + eps: 1e-8 # eps constant, + elite_ratio: 0.1 + centered_rank: True # Fitness centered_rank + w_decay: 0.1 # Decay old elite fitness + maximise: True # Maximise fitness + z_score: False # Normalise fitness + mean_reduce: False # Remove mean + +# Logging setup +wandb: + entity: "ucl-dark" + project: ipditm + group: 'shaping-${agent1}-vs-${agent2}' + name: run-seed-${seed} + log: True \ No newline at end of file diff --git a/pax/experiment.py b/pax/experiment.py index 15493d08..6ca8cb90 100644 --- a/pax/experiment.py +++ b/pax/experiment.py @@ -19,6 +19,7 @@ from 
pax.agents.naive_exact import NaiveExact from pax.agents.ppo.ppo import make_agent from pax.agents.ppo.ppo_gru import make_gru_agent +from pax.agents.shaper_att.ppo_gru import make_shaper_agent from pax.agents.strategies import ( Altruistic, Defect, @@ -56,17 +57,30 @@ from pax.envs.iterated_tensor_game_n_player import ( EnvParams as IteratedTensorGameNPlayerParams, ) + +from pax.runners.runner_stevie import StevieRunner +from pax.runners.runner_eval import EvalRunner +from pax.runners.runner_eval_multishaper import MultishaperEvalRunner +from pax.runners.runner_eval_hardstop import EvalHardstopRunner +from pax.runners.runner_evo import EvoRunner +from pax.runners.runner_evo_multishaper import MultishaperEvoRunner +from pax.runners.runner_evo_hardstop import EvoHardstopRunner +from pax.runners.experimental.runner_evo_mixed_lr import EvoMixedLRRunner +from pax.runners.experimental.runner_evo_mixed_payoffs import EvoMixedPayoffRunner +from pax.runners.experimental.runner_evo_mixed_IPD_payoffs import EvoMixedIPDPayoffRunner +from pax.runners.experimental.runner_evo_mixed_payoffs_input import EvoMixedPayoffInputRunner +from pax.runners.experimental.runner_evo_mixed_payoffs_gen import EvoMixedPayoffGenRunner +from pax.runners.experimental.runner_evo_mixed_payoffs_only_opp import EvoMixedPayoffOnlyOppRunner +from pax.runners.runner_evo_scanned import EvoScannedRunner + from pax.envs.iterated_tensor_game_n_player import IteratedTensorGameNPlayer from pax.envs.rice.c_rice import ClubRice from pax.envs.rice.rice import Rice, EnvParams as RiceParams from pax.envs.rice.sarl_rice import SarlRice from pax.runners.runner_evo_nroles import EvoRunnerNRoles from pax.runners.runner_weight_sharing import WeightSharingRunner -from pax.runners.runner_eval import EvalRunner -from pax.runners.runner_eval_multishaper import MultishaperEvalRunner -from pax.runners.runner_evo import EvoRunner -from pax.runners.runner_evo_multishaper import MultishaperEvoRunner from pax.runners.runner_ipditm_eval import IPDITMEvalRunner + from pax.runners.runner_marl import RLRunner from pax.runners.runner_marl_nplayer import NplayerRLRunner from pax.runners.runner_sarl import SARLRunner @@ -275,14 +289,25 @@ def runner_setup(args, env, agents, save_dir, logger): if args.runner == "eval": logger.info("Evaluating with EvalRunner") return EvalRunner(agents, env, args) + + elif args.runner == "stevie": + logger.info("Activating Stevie Wonder Mode") + return StevieRunner(agents, env, args) + + elif args.runner == "eval_hardstop": + logger.info("Activating Eval Hardstop") + return EvalHardstopRunner(agents, env, args) + elif args.runner == "multishaper_eval": logger.info("Training with multishaper eval Runner") return MultishaperEvalRunner(agents, env, save_dir, args) + elif args.runner == "ipditm_eval": logger.info("Evaluating with ipditmEvalRunner") return IPDITMEvalRunner(agents, env, save_dir, args) - if args.runner in ["evo", "multishaper_evo", "evo_nroles"]: + if args.runner in ["evo", "evo_mixed_lr", "evo_hardstop", "evo_mixed_payoff", "evo_mixed_ipd_payoff", + "evo_mixed_payoff_gen", "evo_mixed_payoff_input", "evo_scanned", "evo_mixed_payoff_only_opp", "multishaper_evo", "evo_nroles"]: agent1 = agents[0] algo = args.es.algo strategies = {"CMA_ES", "OpenES", "PGPE", "SimpleGA"} @@ -367,16 +392,46 @@ def get_pgpe_strategy(agent): strategy, es_params, param_reshaper = get_ga_strategy(agent1) logger.info(f"Evolution Strategy: {algo}") - if args.runner == "evo": - logger.info("Training with EVO runner") + + if args.runner == 
"evo_hardstop": + return EvoHardstopRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo": return EvoRunner( - agents, - env, - strategy, - es_params, - param_reshaper, - save_dir, - args, + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_lr": + return EvoMixedLRRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff": + return EvoMixedPayoffRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_ipd_payoff": + return EvoMixedIPDPayoffRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_gen": + return EvoMixedPayoffGenRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_input": + return EvoMixedPayoffInputRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_pred": + return EvoMixedPayoffPredRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_mixed_payoff_only_opp": + return EvoMixedPayoffOnlyOppRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args + ) + elif args.runner == "evo_scanned": + return EvoScannedRunner( + agents, env, strategy, es_params, param_reshaper, save_dir, args ) elif args.runner == "evo_nroles": @@ -433,8 +488,31 @@ def agent_setup(args, env, env_params, logger): else: obs_shape = env.observation_space(env_params).shape + if args.runner in ["evo_mixed_payoff_input"]: + obs_shape_meta = env.observation_space(env_params).n + 8 + else: + obs_shape_meta = obs_shape + + # print(obs_shape, "obs_shape") + num_actions = env.num_actions + def get_Shaper_agent(seed, player_id): + player_args = args.ppo1 if player_id == 1 else args.ppo2 + num_iterations = args.num_iters + if player_id == 1 and args.env_type == "meta": + num_iterations = args.num_outer_steps + return make_shaper_agent( + args, + player_args, + obs_spec=obs_shape_meta, + action_spec=num_actions, + seed=seed, + num_iterations=num_iterations, + player_id=player_id, + ) + + def get_LOLA_agent(seed, player_id): return make_lola( args, @@ -447,6 +525,7 @@ def get_LOLA_agent(seed, player_id): env_reset=env.reset, ) + def get_PPO_memory_agent(seed, player_id): default_player_args = omegaconf.OmegaConf.select( args, "ppo_default", default=None @@ -594,6 +673,7 @@ def get_stay_agent(seed, player_id): "LOLA": get_LOLA_agent, "PPO": get_PPO_agent, "PPO_memory": get_PPO_memory_agent, + "Shaper": get_Shaper_agent, "Naive": get_naive_pg, "Tabular": get_PPO_tabular_agent, "MFOS": get_mfos_agent, @@ -733,6 +813,7 @@ def naive_pg_log(agent): "PPO": ppo_log, "LOLA": dumb_log, "PPO_memory": ppo_memory_log, + "Shaper": ppo_memory_log, "Naive": naive_pg_log, "Hyper": hyper_log, "NaiveEx": naive_logger, @@ -795,7 +876,10 @@ def main(args): print(f"Number of Training Iterations: {args.num_iters}") - if args.runner in ["evo", "evo_nroles", "multishaper_evo"]: + if args.runner in ["evo", "evo_mixed_lr", "evo_hardstop", "evo_mixed_payoff", "evo_mixed_ipd_payoff", + "evo_mixed_payoff_gen", "evo_mixed_payoff_input", "evo_scanned", "evo_mixed_payoff_only_opp", "multishaper_evo", "evo_nroles"]: + print(f"Running {args.runner}") + runner.run_loop(env_params, agent_pair, args.num_iters, watchers) elif args.runner == "rl" or args.runner == 
"tensor_rl_nplayer": # number of episodes @@ -804,13 +888,8 @@ def main(args): elif args.runner == "ipditm_eval" or args.runner == "multishaper_eval": runner.run_loop(env_params, agent_pair, watchers) - elif args.runner == "sarl": - print(f"Number of Episodes: {args.num_iters}") - runner.run_loop(env, env_params, agent_pair, args.num_iters, watchers) - elif args.runner == "weight_sharing": - print(f"Number of Episodes: {args.num_iters}") - runner.run_loop(env, env_params, agent_pair, args.num_iters, watchers) - elif args.runner == "eval": + + elif args.runner in ["eval", "stevie", "eval_hardstop", "weight_sharing", "sarl"] or args.runner == 'stevie' or args.runner == "eval_hardstop": print(f"Number of Episodes: {args.num_iters}") runner.run_loop(env, env_params, agent_pair, args.num_iters, watchers) diff --git a/pax/runners/experimental/runner_evo_mixed_IPD_payoffs.py b/pax/runners/experimental/runner_evo_mixed_IPD_payoffs.py new file mode 100644 index 00000000..8c88236e --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_IPD_payoffs.py @@ -0,0 +1,671 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedIPDPayoffRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Each opponent has a different payoff matrix that follows the IPD conditions but each member + of the evo population plays against the same payoff matrices to ensure fair comparison. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + payoffs = jnp.array([0, 0, 0, 0], dtype=jnp.int8) + def cond_fun(val): + _rng_run, payoffs = val + return 2*payoffs[1] <= (payoffs[0] + payoffs[2]) + def body_fun(val): + _rng_run, payoffs = val + _rng_run, payoff_T, payoff_R, payoff_P, payoff_S = jax.random.split(_rng_run, 5) + T = jax.random.randint(payoff_T, minval=0, maxval=2, shape=(1,), dtype=jnp.int8)[0] + R = jax.random.randint(payoff_R, minval=T, maxval=4, shape=(1,), dtype=jnp.int8)[0] + P = jax.random.randint(payoff_P, minval=R, maxval=6, shape=(1,), dtype=jnp.int8)[0] + S = jax.random.randint(payoff_S, minval=P, maxval=8, shape=(1,), dtype=jnp.int8)[0] + # payoff_matrix = -jnp.array([[R, R], [S, T], [T, S], [P, P]], dtype=jnp.int8) + payoffs = jnp.array([T, R, P, S], dtype=jnp.int8) + return (_rng_run, payoffs) + # _rng_run, payoff_T, payoff_R, payoff_P, payoff_S = 
jax.random.split(_rng_run, 5) + # T = jax.random.randint(payoff_T, minval=0, maxval=2, shape=(1,), dtype=jnp.int8)[0] + # R = jax.random.randint(payoff_R, minval=T, maxval=4, shape=(1,), dtype=jnp.int8)[0] + # P = jax.random.randint(payoff_P, minval=R, maxval=6, shape=(1,), dtype=jnp.int8)[0] + # S = jax.random.randint(payoff_S, minval=P, maxval=8, shape=(1,), dtype=jnp.int8)[0] + _rng_run, payoffs = jax.lax.while_loop(cond_fun, body_fun, (_rng_run, payoffs)) + T = payoffs[0] + R = payoffs[1] + P = payoffs[2] + S = payoffs[3] + payoff_matrix = -jnp.array([[R, R], [S, T], [T, S], [P, P]], dtype=jnp.int8) + # payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + # jax.debug.breakpoint() + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: 
Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + 
"--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_lr.py b/pax/runners/experimental/runner_evo_mixed_lr.py new file mode 100644 index 00000000..bb8942f7 --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_lr.py @@ -0,0 +1,642 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedLRRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. 
+ Each opponent has a different learning rate, but the members of the population + play against the same learning rates to ensure a fair comparison. + + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env 
first step to init. + init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + # jax.debug.print("env_params: {x}", x=env_params) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [args.num_opps] + random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(args.num_opps,)) + # # repeat the array popsize-times 
along the first dimension + learning_rates = jnp.tile(random_numbers, (args.popsize, 1)) + a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = 
param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # 
metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs.py b/pax/runners/experimental/runner_evo_mixed_payoffs.py new file mode 100644 index 00000000..00a254a2 --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs.py @@ -0,0 +1,646 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Payoff matrix is randomly sampled at each rollout. Each opponent has a different payoff matrix. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) 
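# Aside (illustrative sketch, not part of this runner): the extra device axis
# mentioned above comes from wrapping _rollout in jax.pmap further down in
# __init__, where in_axes picks which arguments are split per device and which
# are broadcast. A toy version of that pattern, with hypothetical shapes:
import jax
import jax.numpy as jnp

n_devices = jax.local_device_count()

def toy_rollout(params, shared_rng):
    # params is the per-device slice of the population; shared_rng is broadcast.
    return params.sum() + jax.random.uniform(shared_rng)

p_rollout = jax.pmap(toy_rollout, in_axes=(0, None))
params = jnp.ones((n_devices, 8))                 # leading axis == device count
out = p_rollout(params, jax.random.PRNGKey(0))    # result has shape (n_devices,)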
+ # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
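# Aside (illustrative sketch, not part of this runner): run_loop further down
# follows the evosax ask/evaluate/tell cycle (ask for a population, reshape
# the parameters, roll out the meta-game, shape the fitness, tell the strategy).
# A condensed, self-contained sketch of that cycle, assuming the same evosax
# ask/tell API this file already uses and a toy quadratic objective in place of
# the meta-game rollout:
import jax
import jax.numpy as jnp
from evosax import FitnessShaper, OpenES

rng = jax.random.PRNGKey(0)
strategy = OpenES(popsize=64, num_dims=2)
es_params = strategy.default_params
state = strategy.initialize(rng, es_params)
shaper = FitnessShaper(maximize=True, centered_rank=True)

for _ in range(20):
    rng, rng_ask = jax.random.split(rng)
    x, state = strategy.ask(rng_ask, state, es_params)   # (popsize, num_dims)
    fitness = -jnp.sum(x**2, axis=-1)                    # toy objective
    state = strategy.tell(x, shaper.apply(x, fitness), state, es_params)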
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + # jax.debug.breakpoint() + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, 
args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + 
init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, 
(overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs_gen.py b/pax/runners/experimental/runner_evo_mixed_payoffs_gen.py new file mode 100644 index 00000000..f68f9fc6 --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs_gen.py @@ -0,0 +1,645 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffGenRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Payoff matrix is randomly sampled at each rollout. Each opponent has the same payoff matrix. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + # payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, 
a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, 
agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], 
log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs_input.py b/pax/runners/experimental/runner_evo_mixed_payoffs_input.py new file mode 100644 index 00000000..9601852e --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs_input.py @@ -0,0 +1,663 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffInputRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Add payoff matrices as input to agents so they don't have to figure out payoff matrices on the go. + Either randomly sample and set a payoff matrix + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + # print("OBS1 shape: ", obs1.shape) + # print("env params shape: ", env_params.payoff_matrix.shape) + # flatten the payoff matrix and append it to the observations + # the observations have shape (500, 10, 2, 5) and the payoff matrix has shape (10, 4, 2) + # we want to append the payoff matrix to the observations so that the observations have shape (500, 10, 2, 5+8) + # we want to flatten the payoff matrix so that it has shape (10, 8) + # This is the code + payoff_matrix = env_params.payoff_matrix.reshape((self.args.num_opps, 8)) + payoff_matrix = jnp.tile(jnp.expand_dims(jnp.tile(payoff_matrix, (self.args.popsize, 1, 1)), 2), (1, 1, 2, 1)) + obs1 = jnp.concatenate((obs1, payoff_matrix), axis=3) + # obs2 = jnp.concatenate((obs2, payoff_matrix), axis=3) + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + # print("OBS2 shape: ", obs2.shape) + # payoff_matrix = env_params.payoff_matrix.reshape((10, 8)) + # payoff_matrix = jnp.tile(jnp.expand_dims(jnp.tile(payoff_matrix, (500, 1, 1)), 2), (1, 1, 2, 1)) + # obs2_update = jnp.concatenate((obs2, payoff_matrix), axis=3) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + 
[jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + # payoff_matrix = -jax.random.randint(payoff_rng, minval=0, maxval=10, shape=(4,2), dtype=jnp.int8) + payoff_matrix = jnp.array([[-1, -1], [-3, 0], [0, -3], [-2, -2]]) + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1)) + # jax.debug.breakpoint() + _env_params.payoff_matrix = payoff_matrix + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + 
print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent 
{1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/experimental/runner_evo_mixed_payoffs_only_opp.py b/pax/runners/experimental/runner_evo_mixed_payoffs_only_opp.py new file mode 100644 index 00000000..873aeefc --- /dev/null +++ b/pax/runners/experimental/runner_evo_mixed_payoffs_only_opp.py @@ -0,0 +1,657 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoMixedPayoffOnlyOppRunner: + """ + Evoluationary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against an Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + Opponent plays a noisy payoff function of the original IPD payoff matrix. + Same noise applied to all opponents. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. 
+ env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, 0), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
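# Standalone sketch (illustrative, not part of this patch) of the payoff
# construction in the rollout below: agent 1's IPD payoffs stay fixed while
# only the opponent's column is perturbed with uniform noise in [-0.5, 0.5),
# and the same noise is shared across opponents (it differs per environment).
# num_envs and num_opps are illustrative assumptions.
import jax
import jax.numpy as jnp

num_envs, num_opps = 2, 3
rng = jax.random.PRNGKey(0)

base = jnp.tile(jnp.array([1.0, 3.0, 0.0, 2.0]), (num_envs, 1))[..., None]   # [num_envs, 4, 1]
noise = jax.random.uniform(rng, minval=-0.5, maxval=0.5, shape=(num_envs, 4, 1))
payoff = -jnp.concatenate((base, base + noise), axis=-1)                      # [num_envs, 4, 2]
payoff = jnp.tile(payoff, (num_opps, 1, 1, 1))                                # [num_opps, num_envs, 4, 2]
print(payoff.shape)  # (3, 2, 4, 2); column 0 is the unperturbed IPD payoff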
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + # set payoff matrix to random integers of shape [4,2] + _rng_run, payoff_rng = jax.random.split(_rng_run) + # jnp.array([T, R, P, S], dtype=jnp.int8) + payoff_matrix_opp = jax.random.uniform(payoff_rng, minval=-0.5, maxval=0.5, shape=(args.num_envs,4,1)) #, dtype=jnp.int8 + payoff_matrix_ag1 = jnp.expand_dims(jnp.tile(jnp.array([1, 3, 0, 2], dtype=jnp.int8), (args.num_envs,1)), axis=-1) + payoff_matrix_ag2 = payoff_matrix_opp + payoff_matrix_ag1 + payoff_matrix = -jnp.concatenate((payoff_matrix_ag1, payoff_matrix_ag2), axis=-1) + + # payoff_matrix = -jnp.array([[R, R], [S, T], [T, S], [P, P]], dtype=jnp.int8) + # payoff_matrix = -jnp.array([[1, payoff_matrix_opp[1]], + # [3, payoff_matrix_opp[0]], + # [0, payoff_matrix_opp[3]], + # [2, payoff_matrix_opp[2]]]) #, dtype=jnp.int8 + payoff_matrix = jnp.tile(payoff_matrix, (args.num_opps, 1, 1, 1)) + + _env_params.payoff_matrix = payoff_matrix + + obs, 
env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, 
es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + # jax.debug.breakpoint() + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": 
log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/runner_eval.py b/pax/runners/runner_eval.py index fb63f98d..44648dc9 100644 --- a/pax/runners/runner_eval.py +++ b/pax/runners/runner_eval.py @@ -410,6 +410,15 @@ def run_loop(self, env, env_params, agents, num_episodes, watchers): ) = vals traj_1, traj_2, env_states, a2_metrics = stack + # reset second agent memory + a2_mem = agent2.batch_reset(a2_mem, False) + # jax.debug.breakpoint() + traj_1_rewards = traj_1.rewards.mean(axis=(1,3)) + traj_2_rewards = traj_2.rewards.mean(axis=(1,3)) + for i in range(len(traj_1_rewards)): + wandb.log({"r1": traj_1_rewards[i].item()}, step=i) + wandb.log({"r2": traj_2_rewards[i].item()}, step=i) + rewards_1 = jnp.concatenate([traj.rewards for traj in traj_1]) rewards_2 = jnp.concatenate([traj.rewards for traj in traj_2]) diff --git a/pax/runners/runner_eval_hardstop.py b/pax/runners/runner_eval_hardstop.py new file mode 100644 index 00000000..c301d9c5 --- /dev/null +++ b/pax/runners/runner_eval_hardstop.py @@ -0,0 +1,474 @@ +import os +import time +from typing import NamedTuple + +import jax +import jax.numpy as jnp + +import wandb +from pax.utils import load +from pax.watchers import cg_visitation, ipd_visitation + +MAX_WANDB_CALLS = 10000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvalHardstopRunner: + """ + Evaluation runner provides a convenient example for quickly writing + a shaping eval runner for PAX. The EvalRunner class can be used to + run any two agents together either in a meta-game or regular game, it composes together agents, + watchers, and the environment. Within the init, we declare vmaps and pmaps for training. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is important for + logic used in the class. + env (gymnax.envs.Environment): + The environment that the agents will run in. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__(self, agents, env, args): + self.train_episodes = 0 + self.start_time = time.time() + self.args = args + self.num_opps = args.num_opps + self.random_key = jax.random.PRNGKey(args.seed) + self.run_path = args.run_path + self.model_path = args.model_path + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(cg_visitation) + # VMAP for num envs: we vmap over the rng but not params + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # VMAP for num opps: we vmap over the rng but not params + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + + self.split = jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)) + + agent1, agent2 = agents + + if args.agent1 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent1.batch_init = jax.jit(jax.vmap(agent1.make_initial_state)) + else: + # batch MemoryState not TrainingState + agent1.batch_init = jax.vmap( + agent1.make_initial_state, + (None, 0), + (None, 0), + ) + agent1.batch_reset = jax.jit( + jax.vmap(agent1.reset_memory, (0, None), 0), static_argnums=1 + ) + + agent1.batch_policy = jax.jit( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)) + ) + + # batch all for Agent2 + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit(jax.vmap(agent2.make_initial_state)) + else: + agent2.batch_init = jax.vmap( + agent2.make_initial_state, (0, None), 0 + ) + agent2.batch_policy = jax.jit(jax.vmap(agent2._policy)) + agent2.batch_reset = jax.jit( + jax.vmap(agent2.reset_memory, (0, None), 0), static_argnums=1 + ) + agent2.batch_update = jax.jit(jax.vmap(agent2.update, (1, 0, 0, 0), 0)) + + if args.agent1 != "NaiveEx": + # NaiveEx requires env first step to init. + init_hidden = jnp.tile(agent1._mem.hidden, (args.num_opps, 1, 1)) + agent1._state, agent1._mem = agent1.batch_init( + agent1._state.random_key, init_hidden + ) + + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + agent2._state, agent2._mem = agent2.batch_init( + jax.random.split(agent2._state.random_key, args.num_opps), + init_hidden, + ) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, 0, :] + # a1_rng = rngs[:, :, 1, :] + # a2_rng = rngs[:, :, 2, :] + rngs = rngs[:, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=self.args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _outer_rollout_fixed(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=self.args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + _, _, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + self.rollout = jax.jit(_outer_rollout) + self.rollout_fixed = jax.jit(_outer_rollout_fixed) + + def run_loop(self, env, env_params, agents, num_episodes, watchers): + """Run evaluation of agents in environment""" + print("Training") + print("-----------------------") + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + a1_state, a1_mem = agent1._state, agent1._mem + a2_state, a2_mem = agent2._state, agent2._mem + + if watchers: + wandb.restore( + name=self.model_path, run_path=self.run_path, root=os.getcwd() + ) + pretrained_params = load(self.model_path) + a1_state = a1_state._replace(params=pretrained_params) + + num_iters = max( + int(num_episodes / (self.args.num_envs * self.args.num_opps)), 1 + ) + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) 
+ print(f"Log Interval {log_interval}") + + # RNG are the same for num_opps but different for num_envs + rngs = jnp.concatenate( + [jax.random.split(rng, self.args.num_envs)] * self.args.num_opps + ).reshape((self.args.num_opps, self.args.num_envs, -1)) + # run actual loop + print('num episodes', num_episodes) + for i in range(num_episodes): + + obs, env_state = env.reset(rngs, env_params) + rewards = [ + jnp.zeros((self.args.num_opps, self.args.num_envs)), + jnp.zeros((self.args.num_opps, self.args.num_envs)), + ] + + if self.args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + elif self.args.env_type in ["meta"]: + # meta-experiments - init 2nd agent per trial + a2_state, a2_mem = agent2.batch_init( + jax.random.split(rng, self.num_opps), a2_mem.hidden + ) + # run trials + + vals, stack = jax.lax.scan( + self.rollout, + ( + rngs, + *obs, + *rewards, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), + None, + length=self.args.stop, + ) + traj_1, traj_2, a2_metrics = stack + ( + rngs, + _, + _, + _, + _, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + vals, stack = jax.lax.scan( + self.rollout_fixed, + ( + rngs, + *obs, + *rewards, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), + None, + length=(self.args.num_steps // self.args.num_inner_steps)-self.args.stop, + ) + + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + traj_1_fixed, traj_2_fixed, a2_metrics_fixed = stack + + # reset second agent memory + a2_mem = agent2.batch_reset(a2_mem, False) + + # logging + traj_1_rewards = jnp.concatenate([traj_1.rewards, traj_1_fixed.rewards], axis=0) + traj_2_rewards = jnp.concatenate([traj_2.rewards, traj_2_fixed.rewards], axis=0) + traj_1_rewards = traj_1_rewards.mean(axis=(1,3)) + traj_2_rewards = traj_2_rewards.mean(axis=(1,3)) + for i in range(len(traj_1_rewards)): + wandb.log({"r1": traj_1_rewards[i].item()}, step=i) + wandb.log({"r2": traj_2_rewards[i].item()}, step=i) + + self.train_episodes += 1 + if i % log_interval == 0: + print(f"Episode {i}") + if self.args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.cg_stats(env_state), + ) + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif self.args.env_type in [ + "meta", + "sequential", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + else: + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + env_stats = {} + + print(f"Env Stats: {env_stats}") + print( + f"Total Episode Reward: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print() + + if watchers: + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + agent2._logger.metrics = ( + agent2._logger.metrics | flattened_metrics + ) + + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb.log( + { + "episodes": self.train_episodes, + "train/episode_reward/player_1": float( + rewards_1.mean() + ), + "train/episode_reward/player_2": float( + rewards_2.mean() + ), + } + | env_stats, + ) + + agents[0]._state = a1_state + agents[1]._state = a2_state + return agents diff --git a/pax/runners/runner_evo.py b/pax/runners/runner_evo.py 
index 9ce590b0..43c4a0bd 100644 --- a/pax/runners/runner_evo.py +++ b/pax/runners/runner_evo.py @@ -337,6 +337,12 @@ def _rollout( a2_rng, agent2._mem.hidden, ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() # run trials vals, stack = jax.lax.scan( diff --git a/pax/runners/runner_evo_hardstop.py b/pax/runners/runner_evo_hardstop.py new file mode 100644 index 00000000..cb5345fd --- /dev/null +++ b/pax/runners/runner_evo_hardstop.py @@ -0,0 +1,648 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoHardstopRunner: + """ + Evolutionary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoHardstopRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against a Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + This runner additionally freezes the opponent's learning after a randomly sampled + number of meta-episodes (the hardstop setting). + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). + """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...)
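As the comments above note, the evo runners batch over three leading axes (num_envs, num_opps, popsize) by stacking `jax.vmap` calls that map over the RNG while broadcasting the env params, with an extra `jax.pmap` over devices applied later to the whole rollout. A minimal sketch of the nested-vmap pattern, using a toy step function rather than the gymnax env:

```python
# Sketch only: `toy_step` stands in for env.step / env.reset.
import jax
import jax.numpy as jnp


def toy_step(rng, params):
    # one "environment transition": RNG is batched, params are shared
    return jax.random.uniform(rng) * params


batched_step = toy_step
for _ in range(3):  # wrap for num_envs, num_opps, popsize
    batched_step = jax.vmap(batched_step, in_axes=(0, None))
batched_step = jax.jit(batched_step)

popsize, num_opps, num_envs = 4, 2, 8
rngs = jax.random.split(jax.random.PRNGKey(0), popsize * num_opps * num_envs)
rngs = rngs.reshape(popsize, num_opps, num_envs, -1)
print(batched_step(rngs, jnp.float32(2.0)).shape)  # (4, 2, 8)
```

Each wrap adds one leading axis, which is why the runner reshapes its split RNGs to `(popsize, num_opps, num_envs, -1)` before calling the env.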
+ # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + counter, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + counter, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + counter, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + # jax.debug.print("Step Size: {x}", x=a2_state.opt_state[2].hyperparams['step_size'][0]) + # jax.debug.print("Counter: {x}", x=counter[0]) + # update second agent + a2_state.opt_state[2].hyperparams['step_size'] = jnp.where(counter <= 0, 0.0, a2_state.opt_state[2].hyperparams['step_size']) + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + counter - 1, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, 
a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + random_numbers = jax.random.uniform(_rng_run, minval=1, maxval=self.num_outer_steps, shape=(10,)) + # repeat the array 1000 times along the first dimension + counter = jnp.tile(random_numbers, (1000, 1)) + # a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + counter, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + counter, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + 
init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + for gen in range(num_gens): + rng, rng_run, rng_evo, rng_key = jax.random.split(rng, 4) + + # Ask + x, evo_state = strategy.ask(rng_evo, evo_state, es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + # Evo Rollout + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + + # Aggregate over devices + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + + # Tell + fitness_re = fit_shaper.apply(x, fitness) + + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + + # Logging + log = es_logging.update(log, x, fitness) + + # Saving + if gen % self.args.save_interval == 0: + log_savepath = os.path.join(self.save_dir, f"generation_{gen}") + if self.args.num_devices > 1: + top_params = param_reshaper.reshape( + log["top_gen_params"][0 : self.args.num_devices] + ) + top_params = jax.tree_util.tree_map( + lambda x: x[0].reshape(x[0].shape[1:]), top_params + ) + else: + top_params = param_reshaper.reshape( + log["top_gen_params"][0:1] + ) + top_params = jax.tree_util.tree_map( + lambda x: x.reshape(x.shape[1:]), top_params + ) + save(top_params, log_savepath) + if watchers: + print(f"Saving generation {gen} locally and to WandB") + wandb.save(log_savepath) + else: + print(f"Saving iteration {gen} locally") + + if gen % log_interval == 0: + print(f"Generation: {gen}") + print( + "--------------------------------------------------------------------------" + ) + print( + f"Fitness: {fitness.mean()} | Other Fitness: {other_fitness.mean()}" + ) + print( + f"Reward Per Timestep: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print( + f"Env Stats: {jax.tree_map(lambda x: x.item(), env_stats)}" + ) + print( + "--------------------------------------------------------------------------" + ) + print( + f"Top 5: Generation | Mean: {log['log_top_gen_mean'][gen]}" + f" | Std: {log['log_top_gen_std'][gen]}" + ) + print( + "--------------------------------------------------------------------------" + ) + print(f"Agent {1} | Fitness: {log['top_gen_fitness'][0]}") + print(f"Agent {2} | Fitness: {log['top_gen_fitness'][1]}") + print(f"Agent {3} | Fitness: {log['top_gen_fitness'][2]}") + print(f"Agent {4} | Fitness: {log['top_gen_fitness'][3]}") + print(f"Agent {5} | Fitness: {log['top_gen_fitness'][4]}") + print() + + if watchers: + wandb_log = { + "train_iteration": gen, + "train/fitness/player_1": float(fitness.mean()), + "train/fitness/player_2": float(other_fitness.mean()), + "train/fitness/top_overall_mean": log["log_top_mean"][gen], + "train/fitness/top_overall_std": log["log_top_std"][gen], + "train/fitness/top_gen_mean": log["log_top_gen_mean"][gen], + "train/fitness/top_gen_std": log["log_top_gen_std"][gen], + "train/fitness/gen_std": log["log_gen_std"][gen], + "train/time/minutes": float( + (time.time() - self.start_time) / 60 + ), + "train/time/seconds": float( + (time.time() - self.start_time) + ), + "train/reward_per_timestep/player_1": float( + rewards_1.mean() + ), + "train/reward_per_timestep/player_2": float( + rewards_2.mean() + ), + } + wandb_log.update(env_stats) + # loop through population + for idx, (overall_fitness, gen_fitness) in 
enumerate( + zip(log["top_fitness"], log["top_gen_fitness"]) + ): + wandb_log[ + f"train/fitness/top_overall_agent_{idx+1}" + ] = overall_fitness + wandb_log[ + f"train/fitness/top_gen_agent_{idx+1}" + ] = gen_fitness + + # player 2 metrics + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + + agent2._logger.metrics.update(flattened_metrics) + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb_log = jax.tree_util.tree_map( + lambda x: x.item() if isinstance(x, jax.Array) else x, + wandb_log, + ) + wandb.log(wandb_log) + + return agents diff --git a/pax/runners/runner_evo_scanned.py b/pax/runners/runner_evo_scanned.py new file mode 100644 index 00000000..e22b73bb --- /dev/null +++ b/pax/runners/runner_evo_scanned.py @@ -0,0 +1,528 @@ +import os +import time +from datetime import datetime +from typing import Any, Callable, NamedTuple + +import jax +import jax.numpy as jnp +from evosax import FitnessShaper + +import wandb +from pax.utils import MemoryState, TrainingState, save + +# TODO: import when evosax library is updated +# from evosax.utils import ESLog +from pax.watchers import ESLog, cg_visitation, ipd_visitation, ipditm_stats + +MAX_WANDB_CALLS = 1000 + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class EvoScannedRunner: + """ + Evolutionary Strategy runner provides a convenient example for quickly writing + a MARL runner for PAX. The EvoScannedRunner class can be used to + run an RL agent (optimised by an Evolutionary Strategy) against a Reinforcement Learner. + It composes together agents, watchers, and the environment. + Within the init, we declare vmaps and pmaps for training. + The environment provided must conform to a meta-environment. + This runner also scans over the evolutionary steps, which leads to longer compilation time + and shorter run time, but makes logging during training impossible. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is + important for logic used in the class. + env (gymnax.envs.Environment): + The meta-environment that the agents will run in. + strategy (evosax.Strategy): + The evolutionary strategy that will be used to train the agents. + param_reshaper (evosax.param_reshaper.ParameterReshaper): + A function that reshapes the parameters of the agents into a format that can be + used by the strategy. + save_dir (string): + The directory to save the model to. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__( + self, agents, env, strategy, es_params, param_reshaper, save_dir, args + ): + self.args = args + self.algo = args.es.algo + self.es_params = es_params + self.generations = 0 + self.num_opps = args.num_opps + self.param_reshaper = param_reshaper + self.popsize = args.popsize + self.random_key = jax.random.PRNGKey(args.seed) + self.start_datetime = datetime.now() + self.save_dir = save_dir + self.start_time = time.time() + self.strategy = strategy + self.top_k = args.top_k + self.train_steps = 0 + self.train_episodes = 0 + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(jax.vmap(cg_visitation)) + self.ipditm_stats = jax.jit( + jax.vmap(ipditm_stats, in_axes=(0, 2, 2, None)) + ) + + # Evo Runner has 3 vmap dims (popsize, num_opps, num_envs) + # Evo Runner also has an additional pmap dim (num_devices, ...) + # For the env we vmap over the rng but not params + + # num envs + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # num opps + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + # pop size + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + self.split = jax.vmap( + jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)), + (0, None), + ) + + self.num_outer_steps = args.num_outer_steps + agent1, agent2 = agents + + # vmap agents accordingly + # agent 1 is batched over popsize and num_opps + agent1.batch_init = jax.vmap( + jax.vmap( + agent1.make_initial_state, + (None, 0), # (params, rng) + (None, 0), # (TrainingState, MemoryState) + ), + # both for Population + ) + agent1.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent1.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent1.batch_policy = jax.jit( + jax.vmap( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)), + ) + ) + + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit( + jax.vmap(jax.vmap(agent2.make_initial_state)) + ) + else: + agent2.batch_init = jax.jit( + jax.vmap( + jax.vmap(agent2.make_initial_state, (0, None), 0), + (0, None), + 0, + ) + ) + + agent2.batch_policy = jax.jit(jax.vmap(jax.vmap(agent2._policy, 0, 0))) + agent2.batch_reset = jax.jit( + jax.vmap( + jax.vmap(agent2.reset_memory, (0, None), 0), (0, None), 0 + ), + static_argnums=1, + ) + + agent2.batch_update = jax.jit( + jax.vmap( + jax.vmap(agent2.update, (1, 0, 0, 0)), + (1, 0, 0, 0), + ) + ) + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + + a2_rng = jnp.concatenate( + [jax.random.split(agent2._state.random_key, args.num_opps)] + * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + + agent2._state, agent2._mem = agent2.batch_init( + a2_rng, + init_hidden, + ) + + # jit evo + strategy.ask = jax.jit(strategy.ask) + strategy.tell = jax.jit(strategy.tell) + param_reshaper.reshape = jax.jit(param_reshaper.reshape) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, :, 0, :] + + # a1_rng = rngs[:, :, :, 1, :] + # a2_rng = rngs[:, :, :, 2, :] + rngs = rngs[:, :, :, 3, :] + + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + ), (*trajectories, a2_metrics) + + def _rollout( + _params: jnp.ndarray, + _rng_run: jnp.ndarray, + _a1_state: TrainingState, + _a1_mem: MemoryState, + _env_params: Any, + ): + # env reset + env_rngs = jnp.concatenate( + [jax.random.split(_rng_run, args.num_envs)] + * args.num_opps + * args.popsize + ).reshape((args.popsize, args.num_opps, args.num_envs, -1)) + + obs, env_state = env.reset(env_rngs, _env_params) + rewards = [ + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + jnp.zeros((args.popsize, args.num_opps, args.num_envs)), + ] + + # Player 1 + _a1_state = _a1_state._replace(params=_params) + _a1_mem = agent1.batch_reset(_a1_mem, False) + # Player 2 + if args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + + else: + # meta-experiments - init 2nd agent per trial + a2_rng = jnp.concatenate( + [jax.random.split(_rng_run, args.num_opps)] * args.popsize + ).reshape(args.popsize, args.num_opps, -1) + a2_state, a2_mem = agent2.batch_init( + a2_rng, + agent2._mem.hidden, + ) + # generate an array of shape [10] + # random_numbers = jax.random.uniform(_rng_run, minval=1e-5, maxval=1.0, shape=(10,)) + # # repeat the array 1000 times along the first dimension + # learning_rates = jnp.tile(random_numbers, (1000, 1)) + # 
a2_state.opt_state[2].hyperparams['step_size'] = learning_rates + # jax.debug.breakpoint() + + # run trials + vals, stack = jax.lax.scan( + _outer_rollout, + ( + env_rngs, + *obs, + *rewards, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ), + None, + length=self.num_outer_steps, + ) + + ( + env_rngs, + obs1, + obs2, + r1, + r2, + _a1_state, + _a1_mem, + a2_state, + a2_mem, + env_state, + _env_params, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # Fitness + fitness = traj_1.rewards.mean(axis=(0, 1, 3, 4)) + other_fitness = traj_2.rewards.mean(axis=(0, 1, 3, 4)) + # Stats + if args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x, + self.cg_stats(env_state), + ) + + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif args.env_id in [ + "iterated_matrix_game", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + elif args.env_id == "InTheMatrix": + env_stats = jax.tree_util.tree_map( + lambda x: x.mean(), + self.ipditm_stats( + env_state, + traj_1, + traj_2, + args.num_envs, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + else: + env_stats = {} + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + return ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) + + self.rollout = jax.pmap( + _rollout, + in_axes=(0, None, None, None, None), + ) + + print( + f"Time to Compile Jax Methods: {time.time() - self.start_time} Seconds" + ) + + def run_loop( + self, + env_params, + agents, + num_iters: int, + watchers: Callable, + ): + """Run training of agents in environment""" + print("Training") + print("------------------------------") + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Number of Generations: {num_iters}") + print(f"Number of Meta Episodes: {self.num_outer_steps}") + print(f"Population Size: {self.popsize}") + print(f"Number of Environments: {self.args.num_envs}") + print(f"Number of Opponent: {self.args.num_opps}") + print(f"Log Interval: {log_interval}") + print("------------------------------") + # Initialize agents and RNG + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + # Initialize evolution + num_gens = num_iters + strategy = self.strategy + es_params = self.es_params + param_reshaper = self.param_reshaper + popsize = self.popsize + num_opps = self.num_opps + evo_state = strategy.initialize(rng, es_params) + fit_shaper = FitnessShaper( + maximize=self.args.es.maximise, + centered_rank=self.args.es.centered_rank, + w_decay=self.args.es.w_decay, + z_score=self.args.es.z_score, + ) + es_logging = ESLog( + param_reshaper.total_params, + num_gens, + top_k=self.top_k, + maximize=True, + ) + log = es_logging.initialize() + + # Reshape a single agent's params before vmapping + init_hidden = jnp.tile( + agent1._mem.hidden, + (popsize, num_opps, 1, 1), + ) + a1_rng = jax.random.split(rng, popsize) + agent1._state, agent1._mem = agent1.batch_init( + a1_rng, + init_hidden, + ) + + a1_state, a1_mem = agent1._state, agent1._mem + + def es_step(state_input, tmp): + rng, rng_run, rng_evo, rng_key = jax.random.split(state_input[0], 4) + x, evo_state = strategy.ask(rng_evo, state_input[1], es_params) + params = param_reshaper.reshape(x) + if self.args.num_devices == 1: + params = 
jax.tree_util.tree_map( + lambda x: jax.lax.expand_dims(x, (0,)), params + ) + ( + fitness, + other_fitness, + env_stats, + rewards_1, + rewards_2, + a2_metrics, + ) = self.rollout(params, rng_run, a1_state, a1_mem, env_params) + fitness = jnp.reshape(fitness, popsize * self.args.num_devices) + env_stats = jax.tree_util.tree_map(lambda x: x.mean(), env_stats) + fitness_re = fit_shaper.apply(x, fitness) + if self.args.es.mean_reduce: + fitness_re = fitness_re - fitness_re.mean() + evo_state = strategy.tell(x, fitness_re, evo_state, es_params) + return (rng, evo_state), (fitness, other_fitness, env_stats, rewards_1, rewards_2, a2_metrics) + + state_input = (rng, evo_state) + _, scan_output = jax.lax.scan(es_step, state_input, None, length=num_gens) + + return agents diff --git a/pax/runners/runner_stevie.py b/pax/runners/runner_stevie.py new file mode 100644 index 00000000..9b8f333e --- /dev/null +++ b/pax/runners/runner_stevie.py @@ -0,0 +1,413 @@ +import os +import time +from typing import NamedTuple + +import jax +import jax.numpy as jnp + +import wandb +from pax.utils import load +from pax.watchers import cg_visitation, ipd_visitation + +MAX_WANDB_CALLS = 10000 +NUM_ENVS = 10 + + + +class Sample(NamedTuple): + """Object containing a batch of data""" + + observations: jnp.ndarray + actions: jnp.ndarray + rewards: jnp.ndarray + behavior_log_probs: jnp.ndarray + behavior_values: jnp.ndarray + dones: jnp.ndarray + hiddens: jnp.ndarray + + +class StevieRunner: + """ + Runner in which the principal agent is blinded in a subset of the parallel environments: + in those environments its observation is replaced by a fixed placeholder state, so it + cannot see what is happening. + Args: + agents (Tuple[agents]): + The set of agents that will run in the experiment. Note, ordering is important for + logic used in the class. + env (gymnax.envs.Environment): + The environment that the agents will run in. + args (NamedTuple): + A tuple of experiment arguments used (usually provided by HydraConfig). 
+ """ + + def __init__(self, agents, env, args): + self.train_episodes = 0 + self.start_time = time.time() + self.args = args + self.num_opps = args.num_opps + self.random_key = jax.random.PRNGKey(args.seed) + self.run_path = args.run_path + self.model_path = args.model_path + self.ipd_stats = jax.jit(ipd_visitation) + self.cg_stats = jax.jit(cg_visitation) + # VMAP for num envs: we vmap over the rng but not params + env.reset = jax.vmap(env.reset, (0, None), 0) + env.step = jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + + # VMAP for num opps: we vmap over the rng but not params + env.reset = jax.jit(jax.vmap(env.reset, (0, None), 0)) + env.step = jax.jit( + jax.vmap( + env.step, (0, 0, 0, None), 0 # rng, state, actions, params + ) + ) + + self.split = jax.vmap(jax.vmap(jax.random.split, (0, None)), (0, None)) + + agent1, agent2 = agents + + if args.agent1 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent1.batch_init = jax.jit(jax.vmap(agent1.make_initial_state)) + else: + # batch MemoryState not TrainingState + agent1.batch_init = jax.vmap( + agent1.make_initial_state, + (None, 0), + (None, 0), + ) + agent1.batch_reset = jax.jit( + jax.vmap(agent1.reset_memory, (0, None), 0), static_argnums=1 + ) + + agent1.batch_policy = jax.jit( + jax.vmap(agent1._policy, (None, 0, 0), (0, None, 0)) + ) + + # batch all for Agent2 + if args.agent2 == "NaiveEx": + # special case where NaiveEx has a different call signature + agent2.batch_init = jax.jit(jax.vmap(agent2.make_initial_state)) + else: + agent2.batch_init = jax.vmap( + agent2.make_initial_state, (0, None), 0 + ) + agent2.batch_policy = jax.jit(jax.vmap(agent2._policy)) + agent2.batch_reset = jax.jit( + jax.vmap(agent2.reset_memory, (0, None), 0), static_argnums=1 + ) + agent2.batch_update = jax.jit(jax.vmap(agent2.update, (1, 0, 0, 0), 0)) + + if args.agent1 != "NaiveEx": + # NaiveEx requires env first step to init. + init_hidden = jnp.tile(agent1._mem.hidden, (args.num_opps, 1, 1)) + agent1._state, agent1._mem = agent1.batch_init( + agent1._state.random_key, init_hidden + ) + + if args.agent2 != "NaiveEx": + # NaiveEx requires env first step to init. 
+ init_hidden = jnp.tile(agent2._mem.hidden, (args.num_opps, 1, 1)) + agent2._state, agent2._mem = agent2.batch_init( + jax.random.split(agent2._state.random_key, args.num_opps), + init_hidden, + ) + + + # BLIND_IDX = [] #For Timon to crank up that conspiracy + # BLIND_MASK = jnp.array( + # [[[0,0,0,0,0] if idx in BLIND_IDX else [1,1,1,1,1] for idx in range(args.num_envs)]], + # dtype=jnp.int8) + + # NOT_BLIND_MASK = jnp.logical_not(BLIND_MASK) + BLIND_STATE = jnp.tile(jnp.array([0, 0, 0, 0, 1]), (args.num_opps, args.num_envs, 1)) + + def _inner_rollout(carry, unused): + """Runner for inner episode""" + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ) = carry + + # unpack rngs + rngs = self.split(rngs, 4) + env_rng = rngs[:, :, 0, :] + # a1_rng = rngs[:, :, 1, :] + + # a2_rng = rngs[:, :, 2, :] + rngs = rngs[:, :, 3, :] + obs1 = BLIND_MASK * obs1 + NOT_BLIND_MASK*BLIND_STATE + a1, a1_state, new_a1_mem = agent1.batch_policy( + a1_state, + obs1, + a1_mem, + ) + a2, a2_state, new_a2_mem = agent2.batch_policy( + a2_state, + obs2, + a2_mem, + ) + (next_obs1, next_obs2), env_state, rewards, done, info = env.step( + env_rng, + env_state, + (a1, a2), + env_params, + ) + + traj1 = Sample( + obs1, + a1, + rewards[0], + new_a1_mem.extras["log_probs"], + new_a1_mem.extras["values"], + done, + a1_mem.hidden, + ) + traj2 = Sample( + obs2, + a2, + rewards[1], + new_a2_mem.extras["log_probs"], + new_a2_mem.extras["values"], + done, + a2_mem.hidden, + ) + return ( + rngs, + next_obs1, + next_obs2, + rewards[0], + rewards[1], + a1_state, + new_a1_mem, + a2_state, + new_a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ), ( + traj1, + traj2, + ) + + def _outer_rollout(carry, unused): + """Runner for trial""" + # play episode of the game + vals, trajectories = jax.lax.scan( + _inner_rollout, + carry, + None, + length=self.args.num_inner_steps, + ) + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ) = vals + # MFOS has to take a meta-action for each episode + if args.agent1 == "MFOS": + a1_mem = agent1.meta_policy(a1_mem) + + # update second agent + a2_state, a2_mem, a2_metrics = agent2.batch_update( + trajectories[1], + obs2, + a2_state, + a2_mem, + ) + return ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ), (*trajectories, a2_metrics) + + self.rollout = jax.jit(_outer_rollout) + + def run_loop(self, env, env_params, agents, num_episodes, watchers): + """Run evaluation of agents in environment""" + print("Eval") + print("-----------------------") + for s in range(self.args.num_envs): + print(f"Number of blind dims: {s}") + BLIND_IDX = jnp.arange(s) #For Timon to crank up that conspiracy + BLIND_MASK = jnp.array( + [[[0,0,0,0,0] if idx in BLIND_IDX else [1,1,1,1,1] for idx in range(self.args.num_envs)]], + dtype=jnp.int8) + + NOT_BLIND_MASK = jnp.logical_not(BLIND_MASK) + agent1, agent2 = agents + rng, _ = jax.random.split(self.random_key) + + a1_state, a1_mem = agent1._state, agent1._mem + a2_state, a2_mem = agent2._state, agent2._mem + + if watchers: + wandb.restore( + name=self.model_path, run_path=self.run_path, root=os.getcwd() + ) + pretrained_params = load(self.model_path) + a1_state = a1_state._replace(params=pretrained_params) + + num_iters = max( + int(num_episodes / (self.args.num_envs * 
self.args.num_opps)), 1 + ) + log_interval = max(num_iters / MAX_WANDB_CALLS, 5) + print(f"Log Interval {log_interval}") + + # RNG are the same for num_opps but different for num_envs + rngs = jnp.concatenate( + [jax.random.split(rng, self.args.num_envs)] * self.args.num_opps + ).reshape((self.args.num_opps, self.args.num_envs, -1)) + # run actual loop + for i in range(num_episodes): + obs, env_state = env.reset(rngs, env_params) + rewards = [ + jnp.zeros((self.args.num_opps, self.args.num_envs)), + jnp.zeros((self.args.num_opps, self.args.num_envs)), + ] + + if self.args.agent2 == "NaiveEx": + a2_state, a2_mem = agent2.batch_init(obs[1]) + elif self.args.env_type in ["meta"]: + # meta-experiments - init 2nd agent per trial + a2_state, a2_mem = agent2.batch_init( + jax.random.split(rng, self.num_opps), a2_mem.hidden + ) + # run trials + vals, stack = jax.lax.scan( + self.rollout, + ( + rngs, + *obs, + *rewards, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ), + None, + length=self.args.num_steps // self.args.num_inner_steps, + ) + + ( + rngs, + obs1, + obs2, + r1, + r2, + a1_state, + a1_mem, + a2_state, + a2_mem, + env_state, + env_params, + BLIND_MASK, + NOT_BLIND_MASK, + ) = vals + traj_1, traj_2, a2_metrics = stack + + # reset second agent memory + a2_mem = agent2.batch_reset(a2_mem, False) + # jax.debug.breakpoint() + # logging + self.train_episodes += 1 + if i % log_interval == 0: + print(f"Episode {i}") + if self.args.env_id == "coin_game": + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.cg_stats(env_state), + ) + rewards_1 = traj_1.rewards.sum(axis=1).mean() + rewards_2 = traj_2.rewards.sum(axis=1).mean() + + elif self.args.env_type in [ + "meta", + "sequential", + ]: + env_stats = jax.tree_util.tree_map( + lambda x: x.item(), + self.ipd_stats( + traj_1.observations, + traj_1.actions, + obs1, + ), + ) + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + + else: + rewards_1 = traj_1.rewards.mean() + rewards_2 = traj_2.rewards.mean() + env_stats = {} + + print(f"Env Stats: {env_stats}") + print( + f"Total Episode Reward: {float(rewards_1.mean()), float(rewards_2.mean())}" + ) + print() + + if watchers: + # metrics [outer_timesteps, num_opps] + flattened_metrics = jax.tree_util.tree_map( + lambda x: jnp.sum(jnp.mean(x, 1)), a2_metrics + ) + agent2._logger.metrics = ( + agent2._logger.metrics | flattened_metrics + ) + + for watcher, agent in zip(watchers, agents): + watcher(agent) + wandb.log( + { + "episodes": s, + "train/episode_reward/player_1": float( + rewards_1.mean() + ), + "train/episode_reward/player_2": float( + rewards_2.mean() + ), + } + | env_stats, + ) + + agents[0]._state = a1_state + agents[1]._state = a2_state + return agents diff --git a/stevie_bash.sh b/stevie_bash.sh new file mode 100755 index 00000000..1f1cfa89 --- /dev/null +++ b/stevie_bash.sh @@ -0,0 +1,70 @@ +#!/bin/bash +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/two=mfos_avg_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_0 ++wandb.log=True +python -m pax.experiment -m 
+experiment/ipd/stevie/mfos_avg/ten=mfos_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/ten=mfos_avg_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_avg/twenty=mfos_avg_4 ++wandb.log=True + + +###### MFOS NOTHING ###### +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/two=mfos_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/ten=mfos_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/mfos_nothing/twenty=mfos_nothing_4 ++wandb.log=True + +###### SHAPER AVG ###### +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/two=shaper_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/two=shaper_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/two=shaper_avg_2 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/ten=shaper_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/ten=shaper_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/ten=shaper_avg_2 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/twenty=shaper_avg_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/twenty=shaper_avg_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_avg/twenty=shaper_avg_2 ++wandb.log=True + +###### SHAPER NOTHING ###### +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_1 ++wandb.log=True +python 
-m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/two=shaper_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/ten=shaper_nothing_4 ++wandb.log=True + +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_0 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_1 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_2 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_3 ++wandb.log=True +python -m pax.experiment -m +experiment/ipd/stevie/shaper_nothing/twenty=shaper_nothing_4 ++wandb.log=True \ No newline at end of file diff --git a/test/runners/test_runners.py b/test/runners/test_runners.py index c74dc3b1..11e96bad 100644 --- a/test/runners/test_runners.py +++ b/test/runners/test_runners.py @@ -73,3 +73,35 @@ def test_runner_marl_nplayer(): _test_runner( ["+experiment/multiplayer_ipd=lola_vs_ppo_ipd", "++num_inner_steps=10"] ) + + +def test_runner_evo_hardstop(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_hardstop"]) + + +def test_runner_evo_mixed_lr(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_lr"]) + + +def test_runner_evo_mixed_payoff(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff"]) + + +def test_runner_evo_mixed_ipd_payoff(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_ipd_payoff"]) + + +def test_runner_evo_mixed_payoff_gen(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff_gen"]) + + +def test_runner_evo_mixed_payoff_input(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff_input"]) + + +def test_runner_evo_scanned(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_scanned"]) + + +def test_runner_evo_mixed_payoff_only_opp(): + _test_runner(["+experiment/ipd=shaper_att_v_tabular", "++runner=evo_mixed_payoff_only_opp"]) \ No newline at end of file
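The `evo_hardstop` runner exercised by `test_runner_evo_hardstop` above freezes the opponent mid-training: a countdown is sampled at the start of each rollout and, once it reaches zero, the opponent's optimizer step size is forced to zero inside `_outer_rollout` via `jnp.where`, so its subsequent updates become no-ops. A minimal sketch of that mechanism with illustrative shapes and names (not the actual PAX optimizer state):

```python
# Sketch only: shapes and variable names are illustrative.
import jax
import jax.numpy as jnp

rng = jax.random.PRNGKey(0)
num_outer_steps, num_opps = 8, 4

# one random stop point per opponent, somewhere inside the meta-episode budget
counter = jax.random.randint(rng, (num_opps,), minval=1, maxval=num_outer_steps)
step_size = jnp.full((num_opps,), 1e-2)

for outer_step in range(num_outer_steps):
    # zero the learning rate for any opponent whose countdown has expired
    effective_lr = jnp.where(counter <= 0, 0.0, step_size)
    # ... the opponent update would use `effective_lr` here ...
    counter = counter - 1
    print(outer_step, effective_lr)
```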