From 7295930b2412ef67f0477b231185056a6b9efb63 Mon Sep 17 00:00:00 2001
From: bmosaicml
Date: Wed, 28 Jun 2023 19:33:51 -0400
Subject: [PATCH 1/2] bug fix

---
 mcli/mcli-hf-eval.yaml                 | 82 +++++++++++++-------------
 scripts/eval/eval.py                   | 12 +++-
 scripts/eval/yamls/model_gauntlet.yaml |  7 +--
 3 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index dcdccbc1df..9718349c42 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -13,8 +13,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: all-eval
 gpu_num: 8
-gpu_type: a100_40gb
-cluster: r7z2 # replace with your cluster here!
+# gpu_type:
+# cluster: # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.0.1_cu118-latest
 
@@ -97,48 +97,48 @@ parameters:
   #     device: cpu
   #     pretrained: true
   #     use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # -
+  #   model_name: mosaicml/mpt-7b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-neox-20b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b-chat
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: mosaicml/mpt-7b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: mosaicml/mpt-7b-chat
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: mosaicml/mpt-7b-chat
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-chat
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b-instruct
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: mosaicml/mpt-7b-chat
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: mosaicml/mpt-7b-instruct
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-neox-20b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
-      device: cpu
-      pretrained: true
-      use_auth_token: false
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
   # -
   #   model_name: tiiuae/falcon-7b
   #   # Tokenizer
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 5601fb60d3..942f464eb0 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import re
 import sys
 import time
@@ -36,7 +37,7 @@ def load_model(model_cfg, tokenizer, num_retries):
     )
 
 
-def evaluate_model(model_cfg):
+def evaluate_model(model_cfg, run_name, model_gauntlet_df):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
     tokenizer = build_tokenizer(model_cfg.tokenizer)
@@ -81,6 +82,7 @@ def evaluate_model(model_cfg):
     load_path = model_cfg.get('load_path', None)
 
     trainer = Trainer(
+        run_name=run_name,
         model=composer_model,
         loggers=loggers,
         precision=cfg.precision,
@@ -101,11 +103,13 @@ def evaluate_model(model_cfg):
     b = time.time()
     print(f'Ran {model_cfg.model_name} eval in: {b-a} seconds')
     return (in_memory_logger, logger_keys, model_gauntlet_callback,
-            model_gauntlet)
+            model_gauntlet, model_gauntlet_df)
 
 
 def main(cfg):
     cfg.dist_timeout = cfg.get('dist_timeout', 600.0)
+    if cfg.get('run_name') is None:
+        cfg.run_name = os.environ.get('RUN_NAME', 'llm')
     reproducibility.seed_all(cfg.seed)
     dist.initialize_dist(get_device(None), timeout=cfg.dist_timeout)
 
@@ -116,7 +120,9 @@ def main(cfg):
 
         try:
             (in_memory_logger, logger_keys, model_gauntlet_callback,
-             model_gauntlet) = evaluate_model(model_cfg)
+             model_gauntlet,
+             model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
+                                                 model_gauntlet_df)
 
             composite_scores = model_gauntlet_callback.eval_end(
                 None, in_memory_logger)
diff --git a/scripts/eval/yamls/model_gauntlet.yaml b/scripts/eval/yamls/model_gauntlet.yaml
index 08eb902405..11aa1381e9 100644
--- a/scripts/eval/yamls/model_gauntlet.yaml
+++ b/scripts/eval/yamls/model_gauntlet.yaml
@@ -2,7 +2,7 @@ model_gauntlet:
   weighting: EQUAL
   subtract_random_baseline: true
   rescale_accuracy: true
-  tasks:
+  categories:
   - name: world_knowledge
     benchmarks:
     - name: jeopardy
@@ -112,8 +112,3 @@ model_gauntlet:
     - name: boolq
       num_fewshot: 10
       random_baseline: 0.5
-  - name: programming
-    benchmarks:
-    - name: humaneval
-      num_fewshot: 0
-      random_baseline: 0.0

From 90ff7dd76709329943a4ebc20d70cc984f63fa7e Mon Sep 17 00:00:00 2001
From: bmosaicml
Date: Wed, 28 Jun 2023 21:33:59 -0400
Subject: [PATCH 2/2] change key name from tasks to categories

---
 mcli/mcli-hf-eval.yaml | 6 +++---
 scripts/eval/eval.py   | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 9718349c42..d554047a6e 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  # git_branch: # use your branch
+  # git_branch:  # use your branch
   # git_commit: # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false  # Should be true if using a private repo
@@ -13,8 +13,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: all-eval
 gpu_num: 8
-# gpu_type: 
-# cluster: # replace with your cluster here! 
+# gpu_type:
+# cluster:  # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.0.1_cu118-latest
 
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 942f464eb0..9edeeea9bf 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -66,7 +66,7 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
 
     if model_gauntlet_df is None and model_gauntlet is not None:
         model_gauntlet_df = pd.DataFrame(columns=['model_name', 'average'] +
-                                         [t.name for t in model_gauntlet.tasks])
+                                         [t.name for t in model_gauntlet.categories])
 
     in_memory_logger = InMemoryLogger()  # track metrics in the in_memory_logger
     loggers: List[LoggerDestination] = [
@@ -128,11 +128,11 @@ def main(cfg):
                 None, in_memory_logger)
 
             benchmark_to_taxonomy = {}
-            for t in model_gauntlet.tasks:
+            for t in model_gauntlet.categories:
                 for b in t.benchmarks:
                     benchmark_to_taxonomy[b.name] = t.name
 
-            [t.name for t in model_gauntlet.tasks]
+
             model_results = calculate_markdown_results(logger_keys,
                                                        in_memory_logger.data,
                                                        benchmark_to_taxonomy,
@@ -148,7 +148,7 @@ def main(cfg):
             row.update({
                 t.name:
                     composite_scores[f'metrics/model_gauntlet/{t.name}']
-                for t in model_gauntlet.tasks
+                for t in model_gauntlet.categories
             })
             row.update({
                 'average': composite_scores[f'metrics/model_gauntlet/average']