From 7295930b2412ef67f0477b231185056a6b9efb63 Mon Sep 17 00:00:00 2001
From: bmosaicml
Date: Wed, 28 Jun 2023 19:33:51 -0400
Subject: [PATCH 1/2] bug fix

---
 mcli/mcli-hf-eval.yaml                 | 82 +++++++++++++-------------
 scripts/eval/eval.py                   | 12 +++-
 scripts/eval/yamls/model_gauntlet.yaml |  7 +--
 3 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index dcdccbc1df..9718349c42 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -13,8 +13,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: all-eval
 gpu_num: 8
-gpu_type: a100_40gb
-cluster: r7z2 # replace with your cluster here!
+# gpu_type:
+# cluster: # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.0.1_cu118-latest
 
@@ -97,48 +97,48 @@ parameters:
   #     device: cpu
   #     pretrained: true
   #     use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # -
+  #   model_name: mosaicml/mpt-7b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-neox-20b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b-chat
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: mosaicml/mpt-7b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: mosaicml/mpt-7b-chat
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: mosaicml/mpt-7b-chat
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-chat
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: mosaicml/mpt-7b-instruct
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: mosaicml/mpt-7b-chat
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: mosaicml/mpt-7b-instruct
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-neox-20b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}
 
-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
-      device: cpu
-      pretrained: true
-      use_auth_token: false
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
   # -
   #   model_name: tiiuae/falcon-7b
   #   # Tokenizer
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 5601fb60d3..942f464eb0 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import re
 import sys
 import time
@@ -36,7 +37,7 @@ def load_model(model_cfg, tokenizer, num_retries):
     )
 
 
-def evaluate_model(model_cfg):
+def evaluate_model(model_cfg, run_name, model_gauntlet_df):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
     tokenizer = build_tokenizer(model_cfg.tokenizer)
@@ -81,6 +82,7 @@ def evaluate_model(model_cfg):
     load_path = model_cfg.get('load_path', None)
 
     trainer = Trainer(
+        run_name=run_name,
         model=composer_model,
         loggers=loggers,
         precision=cfg.precision,
@@ -101,11 +103,13 @@ def evaluate_model(model_cfg):
     b = time.time()
     print(f'Ran {model_cfg.model_name} eval in: {b-a} seconds')
     return (in_memory_logger, logger_keys, model_gauntlet_callback,
-            model_gauntlet)
+            model_gauntlet, model_gauntlet_df)
 
 
 def main(cfg):
     cfg.dist_timeout = cfg.get('dist_timeout', 600.0)
+    if cfg.get('run_name') is None:
+        cfg.run_name = os.environ.get('RUN_NAME', 'llm')
     reproducibility.seed_all(cfg.seed)
     dist.initialize_dist(get_device(None), timeout=cfg.dist_timeout)
 
@@ -116,7 +120,9 @@ def main(cfg):
 
         try:
             (in_memory_logger, logger_keys, model_gauntlet_callback,
-             model_gauntlet) = evaluate_model(model_cfg)
+             model_gauntlet,
+             model_gauntlet_df) = evaluate_model(model_cfg, cfg.run_name,
+                                                 model_gauntlet_df)
 
             composite_scores = model_gauntlet_callback.eval_end(
                 None, in_memory_logger)
diff --git a/scripts/eval/yamls/model_gauntlet.yaml b/scripts/eval/yamls/model_gauntlet.yaml
index 08eb902405..11aa1381e9 100644
--- a/scripts/eval/yamls/model_gauntlet.yaml
+++ b/scripts/eval/yamls/model_gauntlet.yaml
@@ -2,7 +2,7 @@ model_gauntlet:
   weighting: EQUAL
   subtract_random_baseline: true
   rescale_accuracy: true
-  tasks:
+  categories:
   - name: world_knowledge
     benchmarks:
     - name: jeopardy
@@ -112,8 +112,3 @@ model_gauntlet:
     - name: boolq
       num_fewshot: 10
       random_baseline: 0.5
-  - name: programming
-    benchmarks:
-    - name: humaneval
-      num_fewshot: 0
-      random_baseline: 0.0

From 90ff7dd76709329943a4ebc20d70cc984f63fa7e Mon Sep 17 00:00:00 2001
From: bmosaicml
Date: Wed, 28 Jun 2023 21:33:59 -0400
Subject: [PATCH 2/2] change key name from tasks to categories

---
 mcli/mcli-hf-eval.yaml | 6 +++---
 scripts/eval/eval.py   | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 9718349c42..d554047a6e 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  # git_branch: # use your branch
+  # git_branch:  # use your branch
   # git_commit: # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false  # Should be true if using a private repo
@@ -13,8 +13,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: all-eval
 gpu_num: 8
-# gpu_type: 
-# cluster: # replace with your cluster here! 
+# gpu_type:
+# cluster:  # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.0.1_cu118-latest
 
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 942f464eb0..9edeeea9bf 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -66,7 +66,7 @@ def evaluate_model(model_cfg, run_name, model_gauntlet_df):
 
     if model_gauntlet_df is None and model_gauntlet is not None:
         model_gauntlet_df = pd.DataFrame(columns=['model_name', 'average'] +
-                                         [t.name for t in model_gauntlet.tasks])
+                                         [t.name for t in model_gauntlet.categories])
 
     in_memory_logger = InMemoryLogger()  # track metrics in the in_memory_logger
     loggers: List[LoggerDestination] = [
@@ -128,11 +128,11 @@ def main(cfg):
                 None, in_memory_logger)
 
             benchmark_to_taxonomy = {}
-            for t in model_gauntlet.tasks:
+            for t in model_gauntlet.categories:
                 for b in t.benchmarks:
                     benchmark_to_taxonomy[b.name] = t.name
 
-            [t.name for t in model_gauntlet.tasks]
+
             model_results = calculate_markdown_results(logger_keys,
                                                        in_memory_logger.data,
                                                        benchmark_to_taxonomy,
@@ -148,7 +148,7 @@ def main(cfg):
             row.update({
                 t.name:
                     composite_scores[f'metrics/model_gauntlet/{t.name}']
-                for t in model_gauntlet.tasks
+                for t in model_gauntlet.categories
             })
             row.update({
                 'average': composite_scores[f'metrics/model_gauntlet/average']