diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 37b9a1323e..ad6121c4b6 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  # git_branch: # use your branch
+  # git_branch: # use your branch
   # git_commit: # OR use your commit hash
   pip_install: -e ".[gpu]"
   ssh_clone: false # Should be true if using a private repo
@@ -13,8 +13,8 @@ command: |
 # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME
 run_name: all-eval
 gpu_num: 8
-gpu_type: a100_80gb
-# cluster: TODO #replace with your cluster here!
+# gpu_type:
+# cluster: # replace with your cluster here!

 image: mosaicml/llm-foundry:2.0.1_cu118-latest

@@ -41,62 +41,62 @@ parameters:
       device: cpu
       pretrained: true
       use_auth_token: false
-  -
-    model_name: falcon-40b
-    # Tokenizer
-    tokenizer:
-      name: tiiuae/falcon-40b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # -
+  #   model_name: falcon-40b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: tiiuae/falcon-40b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: tiiuae/falcon-40b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: mpt-30b
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-neox-20b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: tiiuae/falcon-40b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: mpt-30b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-neox-20b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: mosaicml/mpt-30b
-      device: cpu
-      pretrained: true
-      use_auth_token: true
-  -
-    model_name: falcon-40b-instruct
-    # Tokenizer
-    tokenizer:
-      name: tiiuae/falcon-40b-instruct
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: mosaicml/mpt-30b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: true
+  # -
+  #   model_name: falcon-40b-instruct
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: tiiuae/falcon-40b-instruct
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: tiiuae/falcon-40b-instruct
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: llama-30b
-    # Tokenizer
-    tokenizer:
-      name: huggyllama/llama-30b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: tiiuae/falcon-40b-instruct
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: llama-30b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: huggyllama/llama-30b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: huggyllama/llama-30b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: huggyllama/llama-30b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
   -
     model_name: mosaicml/mpt-7b
     # Tokenizer
     tokenizer:
       name: EleutherAI/gpt-neox-20b
       kwargs:
         model_max_length: ${max_seq_len}
@@ -115,7 +115,7 @@ parameters:
     model_name: mosaicml/mpt-7b-chat
     # Tokenizer
     tokenizer:
-      name: EleutherAI/gpt-neox-20b
+      name: mosaicml/mpt-7b-chat
       kwargs:
         model_max_length: ${max_seq_len}
@@ -139,146 +139,146 @@ parameters:
       device: cpu
       pretrained: true
       use_auth_token: false
-  -
-    model_name: tiiuae/falcon-7b
-    # Tokenizer
-    tokenizer:
-      name: tiiuae/falcon-7b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # -
+  #   model_name: tiiuae/falcon-7b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: tiiuae/falcon-7b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: tiiuae/falcon-7b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: togethercomputer/RedPajama-INCITE-7B-Instruct
-    # Tokenizer
-    tokenizer:
-      name: togethercomputer/RedPajama-INCITE-7B-Instruct
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: tiiuae/falcon-7b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: togethercomputer/RedPajama-INCITE-7B-Instruct
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: togethercomputer/RedPajama-INCITE-7B-Instruct
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: togethercomputer/RedPajama-INCITE-7B-Instruct
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: togethercomputer/RedPajama-INCITE-7B-Base
-    # Tokenizer
-    tokenizer:
-      name: togethercomputer/RedPajama-INCITE-7B-Base
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: togethercomputer/RedPajama-INCITE-7B-Instruct
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: togethercomputer/RedPajama-INCITE-7B-Base
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: togethercomputer/RedPajama-INCITE-7B-Base
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: togethercomputer/RedPajama-INCITE-7B-Base
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: huggyllama/llama-13b
-    # Tokenizer
-    tokenizer:
-      name: huggyllama/llama-13b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: togethercomputer/RedPajama-INCITE-7B-Base
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: huggyllama/llama-13b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: huggyllama/llama-13b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: huggyllama/llama-13b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: huggyllama/llama-7b
-    # Tokenizer
-    tokenizer:
-      name: huggyllama/llama-7b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: huggyllama/llama-13b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: huggyllama/llama-7b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: huggyllama/llama-7b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: huggyllama/llama-7b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: EleutherAI/pythia-12b
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/pythia-12b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: huggyllama/llama-7b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: EleutherAI/pythia-12b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/pythia-12b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: EleutherAI/pythia-12b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: EleutherAI/pythia-6.9b
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/pythia-6.9b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: EleutherAI/pythia-12b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: EleutherAI/pythia-6.9b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/pythia-6.9b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: EleutherAI/pythia-6.9b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: facebook/opt-6.7b
-    # Tokenizer
-    tokenizer:
-      name: facebook/opt-6.7b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: EleutherAI/pythia-6.9b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: facebook/opt-6.7b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: facebook/opt-6.7b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: facebook/opt-6.7b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: stabilityai/stablelm-tuned-alpha-7b
-    # Tokenizer
-    tokenizer:
-      name: stabilityai/stablelm-tuned-alpha-7b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: facebook/opt-6.7b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: stabilityai/stablelm-tuned-alpha-7b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: stabilityai/stablelm-tuned-alpha-7b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: stabilityai/stablelm-tuned-alpha-7b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
-  -
-    model_name: EleutherAI/gpt-j-6b
-    # Tokenizer
-    tokenizer:
-      name: EleutherAI/gpt-j-6b
-      kwargs:
-        model_max_length: ${max_seq_len}
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: stabilityai/stablelm-tuned-alpha-7b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false
+  # -
+  #   model_name: EleutherAI/gpt-j-6b
+  #   # Tokenizer
+  #   tokenizer:
+  #     name: EleutherAI/gpt-j-6b
+  #     kwargs:
+  #       model_max_length: ${max_seq_len}

-    model:
-      name: hf_causal_lm
-      pretrained_model_name_or_path: EleutherAI/gpt-j-6b
-      device: cpu
-      pretrained: true
-      use_auth_token: false
+  # model:
+  #   name: hf_causal_lm
+  #   pretrained_model_name_or_path: EleutherAI/gpt-j-6b
+  #   device: cpu
+  #   pretrained: true
+  #   use_auth_token: false

   # FSDP config for model sharding
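Note on the mcli-hf-eval.yaml change above: only the MPT-7B variants stay active, while the larger and third-party models are commented out rather than deleted, so any entry can be re-enabled by stripping its leading `# `. The `${max_seq_len}` references are OmegaConf interpolations, which llm-foundry resolves when the parameters are loaded. A minimal sketch of that resolution (the `max_seq_len: 1024` value is an illustrative assumption, not part of this diff):

```python
# Sketch: how OmegaConf resolves the ${max_seq_len} interpolation used in
# the tokenizer kwargs above. Requires `pip install omegaconf`.
from omegaconf import OmegaConf

cfg = OmegaConf.create("""
max_seq_len: 1024  # assumed value, for illustration only
models:
- model_name: mosaicml/mpt-7b
  tokenizer:
    name: EleutherAI/gpt-neox-20b
    kwargs:
      model_max_length: ${max_seq_len}
""")

# Interpolations resolve on access, relative to the config root.
assert cfg.models[0].tokenizer.kwargs.model_max_length == 1024
```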
diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py
index 5956f7cd32..246d49de24 100644
--- a/scripts/eval/eval.py
+++ b/scripts/eval/eval.py
@@ -1,6 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0

+import os
 import re
 import sys
 import time
@@ -36,7 +37,7 @@ def load_model(model_cfg, tokenizer, num_retries):
     )


-def evaluate_model(model_cfg):
+def evaluate_model(model_cfg, run_name):
     print(f'Evaluating model: {model_cfg.model_name}', flush=True)
     # Build tokenizer and model
     tokenizer = build_tokenizer(model_cfg.tokenizer)
@@ -81,6 +82,7 @@ def evaluate_model(model_cfg):
     load_path = model_cfg.get('load_path', None)

     trainer = Trainer(
+        run_name=run_name,
         model=composer_model,
         loggers=loggers,
         precision=cfg.precision,
@@ -106,6 +108,8 @@ def evaluate_model(model_cfg):

 def main(cfg):
     cfg.dist_timeout = cfg.get('dist_timeout', 600.0)
+    if cfg.get('run_name') is None:
+        cfg.run_name = os.environ.get('RUN_NAME', 'llm')

     reproducibility.seed_all(cfg.seed)
     dist.initialize_dist(get_device(None), timeout=cfg.dist_timeout)
@@ -116,7 +120,7 @@ def main(cfg):

         try:
             (in_memory_logger, logger_keys, model_gauntlet_callback,
-             model_gauntlet) = evaluate_model()
+             model_gauntlet) = evaluate_model(model_cfg, cfg.run_name)

             composite_scores = model_gauntlet_callback.eval_end(
                 None, in_memory_logger)
diff --git a/scripts/eval/yamls/model_gauntlet.yaml b/scripts/eval/yamls/model_gauntlet.yaml
index 08eb902405..11aa1381e9 100644
--- a/scripts/eval/yamls/model_gauntlet.yaml
+++ b/scripts/eval/yamls/model_gauntlet.yaml
@@ -2,7 +2,7 @@ model_gauntlet:
   weighting: EQUAL
   subtract_random_baseline: true
   rescale_accuracy: true
-  tasks:
+  categories:
   - name: world_knowledge
     benchmarks:
     - name: jeopardy
@@ -112,8 +112,3 @@ model_gauntlet:
     - name: boolq
       num_fewshot: 10
       random_baseline: 0.5
-  - name: programming
-    benchmarks:
-    - name: humaneval
-      num_fewshot: 0
-      random_baseline: 0.0
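The eval.py change threads one run name through every `Trainer` built in the multi-model loop, and also fixes the call site, which previously invoked `evaluate_model()` with no arguments. If the eval config does not set `run_name`, main() falls back to the `$RUN_NAME` env var (which, per the comment in mcli-hf-eval.yaml, Mosaic Cloud populates from the mcli `run_name`), and finally to `'llm'`. A standalone sketch of just that fallback (the `{'seed': 17}` config is a stand-in, not the real eval config):

```python
# Sketch of the run_name fallback added in eval.py's main().
import os

from omegaconf import OmegaConf

cfg = OmegaConf.create({'seed': 17})  # stand-in for the loaded eval config
if cfg.get('run_name') is None:       # no run_name key in the YAML
    # Mosaic Cloud exports RUN_NAME; default to 'llm' elsewhere.
    cfg.run_name = os.environ.get('RUN_NAME', 'llm')

print(cfg.run_name)  # -> 'llm' unless RUN_NAME is set in the environment
```

The model_gauntlet.yaml hunk renames the top-level `tasks:` key to `categories:` and drops the programming category (HumanEval), so anything that still reads `tasks` from this file needs the matching rename.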