From 7a8e99c073506c01b7fa34ba4739c53f61b85bf4 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Thu, 26 Sep 2024 11:52:55 +0000
Subject: [PATCH] ci: update llm cache files

---
 .github/workflows/inference_cache_llm.yml |  4 +-
 benchmark/text-generation/llama2-7b.py    | 43 ----------------------
 .../{llama2-13b.py => mistral_small.py}   | 10 ++---
 benchmark/text-generation/mistralv2.py    | 43 ----------------------
 4 files changed, 6 insertions(+), 94 deletions(-)
 delete mode 100644 benchmark/text-generation/llama2-7b.py
 rename benchmark/text-generation/{llama2-13b.py => mistral_small.py} (76%)
 delete mode 100644 benchmark/text-generation/mistralv2.py

diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml
index 798b29173..bc8cd7bfb 100644
--- a/.github/workflows/inference_cache_llm.yml
+++ b/.github/workflows/inference_cache_llm.yml
@@ -21,9 +21,9 @@ jobs:
       matrix:
         config: [
           gpt2,
-          llama3-8b,
+          llama,
+          llama3.1-70b,
           llama3-70b,
-          llama2-7b-13b,
           llama2-70b,
           mistral,
           llama-variants,
diff --git a/benchmark/text-generation/llama2-7b.py b/benchmark/text-generation/llama2-7b.py
deleted file mode 100644
index e4a7541b6..000000000
--- a/benchmark/text-generation/llama2-7b.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from tempfile import TemporaryDirectory
-
-from transformers import AutoTokenizer
-
-from benchmark import run
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.modeling_decoder import get_available_cores
-
-
-def main():
-    NUM_CORES = 12
-    num_cores = get_available_cores()
-    if num_cores < NUM_CORES:
-        raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
-
-    model_configurations = {
-        "Llama-2-7B-BS1": ["meta-llama/Llama-2-7b-chat-hf", 1, 4096],
-        "Llama-2-7B-BS4": ["meta-llama/Llama-2-7b-chat-hf", 4, 4096],
-        "Llama-2-7B-BS8": ["meta-llama/Llama-2-7b-chat-hf", 8, 4096],
-        "Llama-2-7B-BS16": ["meta-llama/Llama-2-7b-chat-hf", 16, 4096],
-        "Llama-2-7B-BS24": ["meta-llama/Llama-2-7b-chat-hf", 24, 4096],
-    }
-
-    for model_name, model_configuration in model_configurations.items():
-        model_id, batch_size, seq_length = model_configuration
-        model = NeuronModelForCausalLM.from_pretrained(
-            model_id,
-            export=True,
-            batch_size=batch_size,
-            sequence_length=seq_length,
-            auto_cast_type="fp16",
-            num_cores=NUM_CORES,
-        )
-        with TemporaryDirectory() as tmpdir:
-            model.save_pretrained(tmpdir)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(tmpdir)
-            json_path = f"{model_name}.json"
-            run(tmpdir, 256, 2048, json_path=json_path)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark/text-generation/llama2-13b.py b/benchmark/text-generation/mistral_small.py
similarity index 76%
rename from benchmark/text-generation/llama2-13b.py
rename to benchmark/text-generation/mistral_small.py
index 03bcf70c2..15743d793 100644
--- a/benchmark/text-generation/llama2-13b.py
+++ b/benchmark/text-generation/mistral_small.py
@@ -8,16 +8,14 @@
 
 
 def main():
-    NUM_CORES = 8
+    NUM_CORES = 12
     num_cores = get_available_cores()
     if num_cores < NUM_CORES:
         raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
 
     model_configurations = {
-        "Llama-2-13B-BS1": ["meta-llama/Llama-2-13b-chat-hf", 1, 4096],
-        "Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
-        "Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
-        "Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
+        "Mistral-Small-2409-BS1": ["mistralai/Mistral-Small-Instruct-2409", 1, 4096],
+        "Mistral-Small-2409-BS4": ["mistralai/Mistral-Small-Instruct-2409", 4, 4096],
     }
 
     for model_name, model_configuration in model_configurations.items():
@@ -27,7 +25,7 @@ def main():
             export=True,
             batch_size=batch_size,
             sequence_length=seq_length,
-            auto_cast_type="fp16",
+            auto_cast_type="bf16",
             num_cores=NUM_CORES,
         )
         with TemporaryDirectory() as tmpdir:
diff --git a/benchmark/text-generation/mistralv2.py b/benchmark/text-generation/mistralv2.py
deleted file mode 100644
index 21182a5de..000000000
--- a/benchmark/text-generation/mistralv2.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from tempfile import TemporaryDirectory
-
-from transformers import AutoTokenizer
-
-from benchmark import run
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.modeling_decoder import get_available_cores
-
-
-def main():
-    NUM_CORES = 8
-    num_cores = get_available_cores()
-    if num_cores < NUM_CORES:
-        raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
-
-    model_configurations = {
-        "Mistral-7B-v2-BS1": ["mistralai/Mistral-7B-Instruct-v0.2", 1, 4096],
-        "Mistral-7B-v2-BS4": ["mistralai/Mistral-7B-Instruct-v0.2", 4, 4096],
-        "Mistral-7B-v2-BS8": ["mistralai/Mistral-7B-Instruct-v0.2", 8, 4096],
-        "Mistral-7B-v2-BS16": ["mistralai/Mistral-7B-Instruct-v0.2", 16, 4096],
-        "Mistral-7B-v2-BS32": ["mistralai/Mistral-7B-Instruct-v0.2", 32, 4096],
-    }
-
-    for model_name, model_configuration in model_configurations.items():
-        model_id, batch_size, seq_length = model_configuration
-        model = NeuronModelForCausalLM.from_pretrained(
-            model_id,
-            export=True,
-            batch_size=batch_size,
-            sequence_length=seq_length,
-            auto_cast_type="bf16",
-            num_cores=NUM_CORES,
-        )
-        with TemporaryDirectory() as tmpdir:
-            model.save_pretrained(tmpdir)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            tokenizer.save_pretrained(tmpdir)
-            json_path = f"{model_name}.json"
-            run(tmpdir, 256, 2048, json_path=json_path)
-
-
-if __name__ == "__main__":
-    main()