Merge branch 'model_gauntlet' of github.com:mosaicml/llm-foundry into model_gauntlet
codestar12 committed Jun 29, 2023
2 parents 743518b + 90ff7dd commit 2ccb445
Showing 2 changed files with 49 additions and 47 deletions.
mcli/mcli-hf-eval.yaml: 80 changes (40 additions, 40 deletions)
@@ -14,7 +14,7 @@ command: |
 run_name: all-eval
 gpu_num: 8
 # gpu_type:
-# cluster: # replace with your cluster here!
+# cluster: # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.0.1_cu118-latest

@@ -97,48 +97,48 @@ parameters:
   #     device: cpu
   #     pretrained: true
   #     use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # -
+  #   model_name: mosaicml/mpt-7b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-neox-20b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b-chat
-    # Tokenizer
-    tokenizer:
-      name: mosaicml/mpt-7b-chat
-      kwargs:
-        model_max_length: ${max_seq_len}
+  #   model:
+  #     name: hf_causal_lm
+  #     pretrained_model_name_or_path: mosaicml/mpt-7b
+  #     device: cpu
+  #     pretrained: true
+  #     use_auth_token: false
+  # -
+  #   model_name: mosaicml/mpt-7b-chat
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: mosaicml/mpt-7b-chat
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-chat
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b-instruct
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  #   model:
+  #     name: hf_causal_lm
+  #     pretrained_model_name_or_path: mosaicml/mpt-7b-chat
+  #     device: cpu
+  #     pretrained: true
+  #     use_auth_token: false
+  # -
+  #   model_name: mosaicml/mpt-7b-instruct
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-neox-20b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
-      device: cpu
-      pretrained: true
-      use_auth_token: false
+  #   model:
+  #     name: hf_causal_lm
+  #     pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+  #     device: cpu
+  #     pretrained: true
+  #     use_auth_token: false
   # -
   #   model_name: tiiuae/falcon-7b
   #   # Tokenizer
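
Note on the yaml change: each list entry pairs a model_name and tokenizer with an hf_causal_lm model sub-config, and the diff only toggles which entries are commented out. A minimal sketch (not the repo's code) of how an eval script can consume such a list, assuming the entries sit under a models key and that omegaconf is installed:

```python
from omegaconf import OmegaConf as om

# mcli materializes the `parameters` block as a yaml file for the run,
# so the eval script would see the `models` list at the top level.
cfg = om.load('/mnt/config/parameters.yaml')  # path is an assumption
for model_cfg in cfg.models:
    # each entry carries its own tokenizer and hf_causal_lm settings
    print(f'would evaluate {model_cfg.model_name} '
          f'({model_cfg.model.pretrained_model_name_or_path})')
```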
scripts/eval/eval.py: 16 changes (9 additions, 7 deletions)
@@ -37,7 +37,7 @@ def load_model(model_cfg, tokenizer, num_retries):
     )
 
 
-def evaluate_model(model_cfg, run_name):
+def evaluate_model(model_cfg, run_name, model_gauntlet_df):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
     tokenizer = build_tokenizer(model_cfg.tokenizer)
@@ -66,7 +66,7 @@ def evaluate_model(model_cfg, run_name):

     if model_gauntlet_df is None and model_gauntlet is not None:
         model_gauntlet_df = pd.DataFrame(columns=['model_name', 'average'] +
-                                         [t.name for t in model_gauntlet.tasks])
+                                         [t.name for t in model_gauntlet.categories])
 
     in_memory_logger = InMemoryLogger()  # track metrics in the in_memory_logger
     loggers: List[LoggerDestination] = [
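
The tasks to categories rename here tracks the gauntlet object's attribute; the DataFrame it initializes has one column per gauntlet category plus model_name and average. A self-contained sketch with stand-in category names (the real names come from the gauntlet yaml):

```python
import pandas as pd

# stand-ins for [t.name for t in model_gauntlet.categories]
category_names = ['world_knowledge', 'commonsense_reasoning', 'reading_comprehension']

model_gauntlet_df = pd.DataFrame(columns=['model_name', 'average'] + category_names)
print(list(model_gauntlet_df.columns))
# ['model_name', 'average', 'world_knowledge', 'commonsense_reasoning', 'reading_comprehension']
```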
@@ -103,7 +103,7 @@ def evaluate_model(model_cfg, run_name):
     b = time.time()
     print(f'Ran {model_cfg.model_name} eval in: {b-a} seconds')
     return (in_memory_logger, logger_keys, model_gauntlet_callback,
-            model_gauntlet)
+            model_gauntlet, model_gauntlet_df)
 
 
 def main(cfg):
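
Returning model_gauntlet_df from evaluate_model, together with the parameter added to the signature in the first hunk, turns the DataFrame into an accumulator that main() threads through one call per model. A toy sketch of that pattern (evaluate_model_stub is hypothetical, not the repo's function):

```python
import pandas as pd

def evaluate_model_stub(model_name, df):
    # stand-in for evaluate_model: create the accumulator on first use,
    # append one result row, and hand the DataFrame back to the caller
    if df is None:
        df = pd.DataFrame(columns=['model_name', 'average'])
    row = {'model_name': model_name, 'average': 0.0}  # placeholder score
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

df = None
for name in ['mosaicml/mpt-7b', 'mosaicml/mpt-7b-chat']:
    df = evaluate_model_stub(name, df)
print(df)
```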
@@ -120,17 +120,19 @@ def main(cfg):

         try:
             (in_memory_logger, logger_keys, model_gauntlet_callback,
-             model_gauntlet) = evaluate_model(model_cfg, cfg.run_name)
+             model_gauntlet,
+             model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
+                                                 model_gauntlet_df)
 
             composite_scores = model_gauntlet_callback.eval_end(
                 None, in_memory_logger)
 
             benchmark_to_taxonomy = {}
-            for t in model_gauntlet.tasks:
+            for t in model_gauntlet.categories:
                 for b in t.benchmarks:
                     benchmark_to_taxonomy[b.name] = t.name
 
-            [t.name for t in model_gauntlet.tasks]
 
             model_results = calculate_markdown_results(logger_keys,
                                                        in_memory_logger.data,
                                                        benchmark_to_taxonomy,
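
benchmark_to_taxonomy maps each benchmark to the category it belongs to, which is what calculate_markdown_results uses to group rows; the stray [t.name for t in model_gauntlet.tasks] line was a no-op expression that still referenced the old attribute name. A sketch of the mapping with hypothetical category objects (names chosen to resemble the gauntlet, not read from it):

```python
from types import SimpleNamespace

# hypothetical stand-ins for model_gauntlet.categories
categories = [
    SimpleNamespace(name='world_knowledge',
                    benchmarks=[SimpleNamespace(name='jeopardy'),
                                SimpleNamespace(name='triviaqa')]),
    SimpleNamespace(name='commonsense_reasoning',
                    benchmarks=[SimpleNamespace(name='piqa')]),
]

benchmark_to_taxonomy = {}
for t in categories:
    for b in t.benchmarks:
        benchmark_to_taxonomy[b.name] = t.name

print(benchmark_to_taxonomy)
# {'jeopardy': 'world_knowledge', 'triviaqa': 'world_knowledge', 'piqa': 'commonsense_reasoning'}
```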
@@ -146,7 +148,7 @@ def main(cfg):

             row.update({
                 t.name: composite_scores[f'metrics/model_gauntlet/{t.name}']
-                for t in model_gauntlet.tasks
+                for t in model_gauntlet.categories
             })
             row.update({
                 'average': composite_scores[f'metrics/model_gauntlet/average']
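
Each composite score is looked up under metrics/model_gauntlet/<category>, so the row's keys line up exactly with the DataFrame columns created in evaluate_model. A hedged, self-contained sketch of assembling one such row (scores and names are invented):

```python
import pandas as pd

# invented scores, shaped like the f'metrics/model_gauntlet/{t.name}' keys above
composite_scores = {
    'metrics/model_gauntlet/world_knowledge': 0.41,
    'metrics/model_gauntlet/commonsense_reasoning': 0.38,
    'metrics/model_gauntlet/average': 0.395,
}
category_names = ['world_knowledge', 'commonsense_reasoning']

row = {'model_name': 'mosaicml/mpt-7b'}
row.update({
    name: composite_scores[f'metrics/model_gauntlet/{name}']
    for name in category_names
})
row.update({'average': composite_scores['metrics/model_gauntlet/average']})

model_gauntlet_df = pd.DataFrame([row],
                                 columns=['model_name', 'average'] + category_names)
print(model_gauntlet_df)
```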