diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index dcdccbc1df..ad6121c4b6 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  # git_branch: # use your branch
+  # git_branch: # use your branch
   # git_commit: # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false # Should be true if using a private repo
@@ -13,8 +13,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: all-eval
 gpu_num: 8
-gpu_type: a100_40gb
-cluster: r7z2 # replace with your cluster here!
+# gpu_type:
+# cluster: # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.0.1_cu118-latest
 
@@ -115,7 +115,7 @@ parameters:
     model_name: mosaicml/mpt-7b-chat
     # Tokenizer
     tokenizer:
-      name: EleutherAI/gpt-neox-20b
+      name: mosaicml/mpt-7b-chat
       kwargs:
         model_max_length: ${max_seq_len}
 
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 5601fb60d3..246d49de24 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import re
 import sys
 import time
@@ -36,7 +37,7 @@ def load_model(model_cfg, tokenizer, num_retries):
     )
 
 
-def evaluate_model(model_cfg):
+def evaluate_model(model_cfg, run_name):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
     tokenizer = build_tokenizer(model_cfg.tokenizer)
@@ -81,6 +82,7 @@ def evaluate_model(model_cfg):
     load_path = model_cfg.get('load_path', None)
 
     trainer = Trainer(
+        run_name=run_name,
         model=composer_model,
         loggers=loggers,
         precision=cfg.precision,
@@ -106,6 +108,8 @@ def evaluate_model(model_cfg):
 
 def main(cfg):
     cfg.dist_timeout = cfg.get('dist_timeout', 600.0)
+    if cfg.get('run_name') is None:
+        cfg.run_name = os.environ.get('RUN_NAME', 'llm')
 
     reproducibility.seed_all(cfg.seed)
     dist.initialize_dist(get_device(None), timeout=cfg.dist_timeout)
@@ -116,7 +120,7 @@ def main(cfg):
 
         try:
             (in_memory_logger, logger_keys, model_gauntlet_callback,
-             model_gauntlet) = evaluate_model(model_cfg)
+             model_gauntlet) = evaluate_model(model_cfg, cfg.run_name)
 
             composite_scores = model_gauntlet_callback.eval_end(
                 None, in_memory_logger)
diff --git a/scripts/eval/yamls/model_gauntlet.yaml b/scripts/eval/yamls/model_gauntlet.yaml
index 08eb902405..11aa1381e9 100644
--- a/scripts/eval/yamls/model_gauntlet.yaml
+++ b/scripts/eval/yamls/model_gauntlet.yaml
@@ -2,7 +2,7 @@ model_gauntlet:
   weighting: EQUAL
   subtract_random_baseline: true
   rescale_accuracy: true
-  tasks:
+  categories:
   - name: world_knowledge
     benchmarks:
     - name: jeopardy
@@ -112,8 +112,3 @@ model_gauntlet:
     - name: boolq
       num_fewshot: 10
       random_baseline: 0.5
-  - name: programming
-    benchmarks:
-    - name: humaneval
-      num_fewshot: 0
-      random_baseline: 0.0
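
Note (not part of the patch): a minimal sketch of how the new run_name fallback in eval.py resolves, assuming cfg is an omegaconf DictConfig as eval.py uses elsewhere; the standalone snippet and the example seed value below are illustrative only.

    import os
    from omegaconf import OmegaConf

    # Simulate a config whose YAML omits run_name, as eval.py may receive.
    cfg = OmegaConf.create({'seed': 17})

    if cfg.get('run_name') is None:
        # Mosaic Cloud populates $RUN_NAME; fall back to 'llm' for local runs.
        cfg.run_name = os.environ.get('RUN_NAME', 'llm')

    # The resolved name is then forwarded to Trainer(run_name=...) per model.
    print(cfg.run_name)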