From 7a8e99c073506c01b7fa34ba4739c53f61b85bf4 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Thu, 26 Sep 2024 11:52:55 +0000
Subject: [PATCH] ci: update llm cache files

---
 .github/workflows/inference_cache_llm.yml |  4 +-
 benchmark/text-generation/llama2-7b.py    | 43 ----------------------
 .../{llama2-13b.py => mistral_small.py}   | 10 ++---
 benchmark/text-generation/mistralv2.py    | 43 ----------------------
 4 files changed, 6 insertions(+), 94 deletions(-)
 delete mode 100644 benchmark/text-generation/llama2-7b.py
 rename benchmark/text-generation/{llama2-13b.py => mistral_small.py} (76%)
 delete mode 100644 benchmark/text-generation/mistralv2.py

diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml
index 798b29173..bc8cd7bfb 100644
--- a/.github/workflows/inference_cache_llm.yml
+++ b/.github/workflows/inference_cache_llm.yml
@@ -21,9 +21,9 @@ jobs:
       matrix:
         config: [
           gpt2,
-          llama3-8b,
+          llama,
+          llama3.1-70b,
           llama3-70b,
-          llama2-7b-13b,
           llama2-70b,
           mistral,
           llama-variants,
diff --git a/benchmark/text-generation/llama2-7b.py b/benchmark/text-generation/llama2-7b.py
deleted file mode 100644
index e4a7541b6..000000000
--- a/benchmark/text-generation/llama2-7b.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from tempfile import TemporaryDirectory
-
-from transformers import AutoTokenizer
-
-from benchmark import run
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.modeling_decoder import get_available_cores
-
-
-def main():
-    NUM_CORES = 12
-    num_cores = get_available_cores()
-    if num_cores < NUM_CORES:
-        raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
-
-    model_configurations = {
-        "Llama-2-7B-BS1": ["meta-llama/Llama-2-7b-chat-hf", 1, 4096],
-        "Llama-2-7B-BS4": ["meta-llama/Llama-2-7b-chat-hf", 4, 4096],
-        "Llama-2-7B-BS8": ["meta-llama/Llama-2-7b-chat-hf", 8, 4096],
-        "Llama-2-7B-BS16": ["meta-llama/Llama-2-7b-chat-hf", 16, 4096],
-        "Llama-2-7B-BS24": ["meta-llama/Llama-2-7b-chat-hf", 24, 4096],
-    }
-
-    for model_name, model_configuration in model_configurations.items():
-        model_id, batch_size, seq_length = model_configuration
-        model = NeuronModelForCausalLM.from_pretrained(
-            model_id,
-            export=True,
-            batch_size=batch_size,
-            sequence_length=seq_length,
-            auto_cast_type="fp16",
-            num_cores=NUM_CORES,
-        )
-        with TemporaryDirectory() as tmpdir:
-            model.save_pretrained(tmpdir)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(tmpdir)
-            json_path = f"{model_name}.json"
-            run(tmpdir, 256, 2048, json_path=json_path)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/text-generation/llama2-13b.py b/benchmark/text-generation/mistral_small.py
similarity index 76%
rename from benchmark/text-generation/llama2-13b.py
rename to benchmark/text-generation/mistral_small.py
index 03bcf70c2..15743d793 100644
--- a/benchmark/text-generation/llama2-13b.py
+++ b/benchmark/text-generation/mistral_small.py
@@ -8,16 +8,14 @@
 
 
 def main():
-    NUM_CORES = 8
+    NUM_CORES = 12
     num_cores = get_available_cores()
     if num_cores < NUM_CORES:
         raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
 
     model_configurations = {
-        "Llama-2-13B-BS1": ["meta-llama/Llama-2-13b-chat-hf", 1, 4096],
-        "Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
-        "Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
-        "Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
+        "Mistral-Small-2409-BS1": ["mistralai/Mistral-Small-Instruct-2409", 1, 4096],
+        "Mistral-Small-2409-BS4": ["mistralai/Mistral-Small-Instruct-2409", 4, 4096],
     }
 
     for model_name, model_configuration in model_configurations.items():
@@ -27,7 +25,7 @@ def main():
             export=True,
             batch_size=batch_size,
             sequence_length=seq_length,
-            auto_cast_type="fp16",
+            auto_cast_type="bf16",
             num_cores=NUM_CORES,
         )
         with TemporaryDirectory() as tmpdir:
diff --git a/benchmark/text-generation/mistralv2.py b/benchmark/text-generation/mistralv2.py
deleted file mode 100644
index 21182a5de..000000000
--- a/benchmark/text-generation/mistralv2.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from tempfile import TemporaryDirectory
-
-from transformers import AutoTokenizer
-
-from benchmark import run
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.modeling_decoder import get_available_cores
-
-
-def main():
-    NUM_CORES = 8
-    num_cores = get_available_cores()
-    if num_cores < NUM_CORES:
-        raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
-
-    model_configurations = {
-        "Mistral-7B-v2-BS1": ["mistralai/Mistral-7B-Instruct-v0.2", 1, 4096],
-        "Mistral-7B-v2-BS4": ["mistralai/Mistral-7B-Instruct-v0.2", 4, 4096],
-        "Mistral-7B-v2-BS8": ["mistralai/Mistral-7B-Instruct-v0.2", 8, 4096],
-        "Mistral-7B-v2-BS16": ["mistralai/Mistral-7B-Instruct-v0.2", 16, 4096],
-        "Mistral-7B-v2-BS32": ["mistralai/Mistral-7B-Instruct-v0.2", 32, 4096],
-    }
-
-    for model_name, model_configuration in model_configurations.items():
-        model_id, batch_size, seq_length = model_configuration
-        model = NeuronModelForCausalLM.from_pretrained(
-            model_id,
-            export=True,
-            batch_size=batch_size,
-            sequence_length=seq_length,
-            auto_cast_type="bf16",
-            num_cores=NUM_CORES,
-        )
-        with TemporaryDirectory() as tmpdir:
-            model.save_pretrained(tmpdir)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(tmpdir)
-            json_path = f"{model_name}.json"
-            run(tmpdir, 256, 2048, json_path=json_path)
-
-
-if __name__ == "__main__":
-    main()