diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index dcdccbc1df..ad6121c4b6 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  # git_branch: # use your branch
+  # git_branch: # use your branch
   # git_commit: # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false # Should be true if using a private repo
@@ -13,8 +13,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: all-eval
 gpu_num: 8
-gpu_type: a100_40gb
-cluster: r7z2 # replace with your cluster here!
+# gpu_type:
+# cluster: # replace with your cluster here!
 
 image: mosaicml/llm-foundry:2.0.1_cu118-latest
 
@@ -115,7 +115,7 @@ parameters:
     model_name: mosaicml/mpt-7b-chat
     # Tokenizer
     tokenizer:
-      name: EleutherAI/gpt-neox-20b
+      name: mosaicml/mpt-7b-chat
       kwargs:
         model_max_length: ${max_seq_len}
 
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 5601fb60d3..246d49de24 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import re
 import sys
 import time
@@ -36,7 +37,7 @@ def load_model(model_cfg, tokenizer, num_retries):
     )
 
 
-def evaluate_model(model_cfg):
+def evaluate_model(model_cfg, run_name):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
     tokenizer = build_tokenizer(model_cfg.tokenizer)
@@ -81,6 +82,7 @@ def evaluate_model(model_cfg):
     load_path = model_cfg.get('load_path', None)
 
     trainer = Trainer(
+        run_name=run_name,
         model=composer_model,
         loggers=loggers,
         precision=cfg.precision,
@@ -106,6 +108,8 @@ def evaluate_model(model_cfg):
 
 def main(cfg):
     cfg.dist_timeout = cfg.get('dist_timeout', 600.0)
+    if cfg.get('run_name') is None:
+        cfg.run_name = os.environ.get('RUN_NAME', 'llm')
 
     reproducibility.seed_all(cfg.seed)
     dist.initialize_dist(get_device(None), timeout=cfg.dist_timeout)
@@ -116,7 +120,7 @@ def main(cfg):
 
         try:
             (in_memory_logger, logger_keys, model_gauntlet_callback,
-             model_gauntlet) = evaluate_model(model_cfg)
+             model_gauntlet) = evaluate_model(model_cfg, cfg.run_name)
 
             composite_scores = model_gauntlet_callback.eval_end(
                 None, in_memory_logger)
diff --git a/scripts/eval/yamls/model_gauntlet.yaml b/scripts/eval/yamls/model_gauntlet.yaml
index 08eb902405..11aa1381e9 100644
--- a/scripts/eval/yamls/model_gauntlet.yaml
+++ b/scripts/eval/yamls/model_gauntlet.yaml
@@ -2,7 +2,7 @@ model_gauntlet:
   weighting: EQUAL
   subtract_random_baseline: true
   rescale_accuracy: true
-  tasks:
+  categories:
   - name: world_knowledge
     benchmarks:
     - name: jeopardy
@@ -112,8 +112,3 @@ model_gauntlet:
     - name: boolq
       num_fewshot: 10
       random_baseline: 0.5
-  - name: programming
-    benchmarks:
-    - name: humaneval
-      num_fewshot: 0
-      random_baseline: 0.0
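
Note (not part of the patch): a minimal sketch of how the new run_name fallback in eval.py resolves, assuming cfg is an omegaconf DictConfig as eval.py uses elsewhere; the standalone snippet and the example seed value below are illustrative only.

    import os
    from omegaconf import OmegaConf

    # Simulate a config whose YAML omits run_name, as eval.py may receive.
    cfg = OmegaConf.create({'seed': 17})

    if cfg.get('run_name') is None:
        # Mosaic Cloud populates $RUN_NAME; fall back to 'llm' for local runs.
        cfg.run_name = os.environ.get('RUN_NAME', 'llm')

    # The resolved name is then forwarded to Trainer(run_name=...) per model.
    print(cfg.run_name)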