
Add a tool to fill neuron inference cache and update benchmarks (#496)
* feat(tools): add tool to fill neuronx inference cache

* feat(benchmark): use higher batch sizes

* doc(benchmark): split llama2 benchmarks
dacorvo authored Feb 22, 2024
1 parent aa56b10 commit 881d399
Showing 15 changed files with 199 additions and 31 deletions.
28 changes: 28 additions & 0 deletions benchmark/text-generation/llama2-13b.py
@@ -0,0 +1,28 @@
import os
from tempfile import TemporaryDirectory

from transformers import AutoTokenizer

from benchmark import run
from optimum.neuron import NeuronModelForCausalLM


model_configurations = {
    "Llama-2-13B-BS1": ["meta-llama/Llama-2-13b-chat-hf", 1, 4096],
    "Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
    "Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
    "Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
}


for model_name, model_configuration in model_configurations.items():
    model_id, batch_size, seq_length = model_configuration
    model = NeuronModelForCausalLM.from_pretrained(
        model_id, export=True, batch_size=batch_size, sequence_length=seq_length, auto_cast_type="fp16"
    )
    with TemporaryDirectory() as tmpdir:
        model.save_pretrained(tmpdir)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdir)
        json_path = f"{model_name}.json"
        run(tmpdir, 256, 1024, json_path=json_path)
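
For a quick sanity check of an export produced this way, the saved directory can be reloaded and used for generation. The snippet below is an illustrative sketch rather than part of the benchmark script: the local path, prompt and `max_new_tokens` value are placeholders.

```python
from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

# Placeholder path: a directory where an exported model and its tokenizer were saved.
model_path = "./llama2-13b-neuron"
model = NeuronModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

inputs = tokenizer("What is AWS Inferentia2?", return_tensors="pt")
# Generation runs with the static batch_size and sequence_length chosen at export time.
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```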
@@ -8,15 +8,15 @@


model_configurations = {
-    "Llama-2-7BL": ["meta-llama/Llama-2-7b-chat-hf", 1, 2048],
-    "Llama-2-7BT": ["meta-llama/Llama-2-7b-chat-hf", 4, 2048],
+    "Llama-2-7B-BS1": ["meta-llama/Llama-2-7b-chat-hf", 1, 4096],
+    "Llama-2-7B-BS4": ["meta-llama/Llama-2-7b-chat-hf", 4, 4096],
}

num_cores = len(os.listdir("/sys/class/neuron_device/")) * 2
if num_cores >= 4:
    extra_model_configurations = {
-        "Llama-2-13BL": ["meta-llama/Llama-2-13b-chat-hf", 1, 2048],
-        "Llama-2-13BT": ["meta-llama/Llama-2-13b-chat-hf", 4, 2048],
+        "Llama-2-7B-BS8": ["meta-llama/Llama-2-7b-chat-hf", 8, 4096],
+        "Llama-2-7B-BS16": ["meta-llama/Llama-2-7b-chat-hf", 16, 4096],
    }
    model_configurations = {**model_configurations, **extra_model_configurations}

[9 binary files changed (presumably the updated benchmark plot images); contents not displayed]
10 changes: 6 additions & 4 deletions docs/source/_toctree.yml
@@ -43,6 +43,12 @@
- local: guides/pipelines
title: Inference pipelines with AWS Neuron
title: How-To Guides
+- sections:
+  - local: benchmarks/inferentia-llama2-7b
+    title: Llama2 7b on AWS Inferentia2
+  - local: benchmarks/inferentia-llama2-13b
+    title: Llama2 13b on AWS Inferentia2
+  title: Benchmarks
- sections:
- local: community/contributing
title: Add support for a new model architecture
@@ -57,9 +63,5 @@
- local: package_reference/modeling
title: Neuron Models
title: Reference
-- sections:
-  - local: benchmarks/inferentia-llama2
-    title: Llama on AWS Inferentia2
-  title: Benchmarks
title: Optimum Neuron
isExpanded: true
66 changes: 66 additions & 0 deletions docs/source/benchmarks/inferentia-llama2-13b.mdx
@@ -0,0 +1,66 @@
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Llama-2-13b performance on AWS Inferentia2 (Latency & Throughput)

How fast is Llama-2-13b on Inferentia2? Let's find out!

For this benchmark we will use the following configurations:

| Model type | batch_size | sequence_length |
|-----------------|------------|-----------------|
| Llama2 13b BS1 | 1 | 4096 |
| Llama2 13b BS4 | 4 | 4096 |
| Llama2 13b BS8 | 8 | 4096 |

*Note: all models are compiled to use the full extent of cores available on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

To evaluate the models, we generate tokens up to a total sequence length of 1024, starting from
256 input tokens (i.e. we generate 256, 512 and 768 tokens).

## Encoding time (time to first token)

The encoding time or time to first token is the time required to process the input tokens and generate the first output token.
It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.

We test the encoding time for increasing context sizes: 256 input tokens corresponds roughly to a typical Q/A usage,
while 768 is more typical of a Retrieval Augmented Generation (RAG) use-case.

Encoding time is expressed in **seconds**.

![Llama2 13b inferentia2 encoding-time](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/encoding_times.png "Encoding time")
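
To reproduce this metric on your own deployment, a simple approximation is to time a generation call limited to a single new token. The sketch below assumes an already exported model and tokenizer saved locally; the path and the synthetic 256-token prompt are placeholders.

```python
import time

from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

model_path = "./llama2-13b-neuron"  # placeholder: directory of an exported model
model = NeuronModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Build a 256-token prompt (the smallest context size used in this benchmark).
prompt = " ".join(["hello"] * 256)
inputs = tokenizer(prompt, truncation=True, max_length=256, return_tensors="pt")

start = time.perf_counter()
# Generating exactly one token approximates the encoding time / time to first token.
model.generate(**inputs, max_new_tokens=1)
print(f"Time to first token: {time.perf_counter() - start:.2f} s")
```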

## End-to-end Latency

The end-to-end latency corresponds to the total time to reach a sequence length of 1024 tokens.

It therefore includes the encoding and generation time.

Latency is expressed in **seconds**.

![Llama2 13b inferentia2 end-to-end latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/latencies.png "Latency")

### Throughput

We adopt the same convention as other benchmarks to evaluate the throughput: the total number of tokens
(both input and output) divided by the end-to-end latency.
In other words, we divide `batch_size * sequence_length` by the end-to-end latency to obtain the number of tokens per second.

Throughput is expressed in **tokens/second**.

![Llama2 13b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/throughputs.png "Throughput")
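
As a concrete illustration of this convention, with made-up numbers rather than measured results:

```python
# Hypothetical example of the throughput convention used above (not measured values).
batch_size = 4
sequence_length = 1024     # 256 input tokens + 768 generated tokens
end_to_end_latency = 20.0  # seconds, illustrative only

throughput = batch_size * sequence_length / end_to_end_latency
print(f"{throughput:.1f} tokens/second")  # 204.8 tokens/second
```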
@@ -14,27 +14,23 @@ See the License for the specific language governing permissions and
limitations under the License.
-->

-# Llama performance on AWS Inferentia2 (Latency & Througput)
+# Llama-2-7b performance on AWS Inferentia2 (Latency & Throughput)

-How fast is Llama on Inferentia2? Let's figure out!
+How fast is Llama-2-7b on Inferentia2? Let's find out!

-For this benchmark we will use the LLama 2 7B and 13B models with different configurations:
+For this benchmark we will use the following configurations:

-| Model type                 | num cores | batch_size |
-|----------------------------|-----------|------------|
-| Llama2 7B - L (latency)    | 24        | 1          |
-| Llama2 7B - T (throughput) | 24        | 4          |
-| Llama2 13B - L (latency)   | 24        | 1          |
-| Llama2 13B - T (throughput)| 24        | 4          |
+| Model type     | batch_size | sequence_length |
+|----------------|------------|-----------------|
+| Llama2 7B BS1  | 1          | 4096            |
+| Llama2 7B BS4  | 4          | 4096            |
+| Llama2 7B BS8  | 8          | 4096            |
+| Llama2 7B BS16 | 16         | 4096            |

-*Note: all models are compiled with a maximum sequence length of 2048.*

-All models are compiled to use the full extent of cores available on the `inf2.48xlarge` instance.
+*Note: all models are compiled to use the full extent of cores available on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

-We created two "latency" oriented configurations for the `llama2 7B` and `llama2 13B` models that can serve only one request at a time, but at full speed and two "throughput" oriented configurations to serve up to four requests in parallel.

To evaluate the models, we generate tokens up to a total sequence length of 1024, starting from
256 input tokens (i.e. we generate 256, 512 and 768 tokens).

@@ -48,9 +44,7 @@ while 768 is more typical of a Retrieval Augmented Generation (RAG) use-case.

Encoding time is expressed in **seconds**.

-![Llama2 inferentia2 encoding-time](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2/encoding_times.png "Encoding time")
-
-We can see that all deployed models exhibit excellent response times, even for long contexts.
+![Llama2 7b inferentia2 encoding-time](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/encoding_times.png "Encoding time")

## End-to-end Latency

@@ -60,9 +54,7 @@ It therefore includes the encoding and generation time.

Latency is expressed in **seconds**.

-![Llama2 inferentia2 end-to-end latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2/latencies.png "Latency")
-
-All models deployed on the high-end instance exhibit a good latency, even those actually configured to optimize throughput.
+![Llama2 7b inferentia2 end-to-end latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/latencies.png "Latency")

### Throughput

@@ -72,6 +64,4 @@ In other words, we divide the end-to-end latency by `batch_size * sequence_length`

Throughput is expressed in **tokens/second**.

-![Llama2 inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2/throughputs.png "Throughput")
-
-Again, the models deployed on the high-end instance have a very good throughput, even those optimized for latency.
+![Llama2 7b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/throughputs.png "Throughput")
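
To measure the end-to-end latency and derive the throughput yourself, you can time a full generation from 256 input tokens up to the total sequence length. This is an illustrative sketch, not the benchmark harness used for the plots above; the model path and prompt are placeholders.

```python
import time

from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

# Placeholder path: a directory containing an exported Llama-2-7b model and tokenizer.
model_path = "./llama2-7b-neuron"
model = NeuronModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 256 input tokens, then generate up to a total sequence length of 1024 tokens.
prompt = " ".join(["hello"] * 256)
inputs = tokenizer(prompt, truncation=True, max_length=256, return_tensors="pt")

start = time.perf_counter()
# Note: greedy decoding may stop early on an end-of-sequence token,
# in which case fewer than 768 new tokens are produced.
outputs = model.generate(**inputs, max_new_tokens=768, do_sample=False)
latency = time.perf_counter() - start

total_tokens = outputs.shape[0] * outputs.shape[1]  # batch_size * sequence_length
print(f"End-to-end latency: {latency:.2f} s")
print(f"Throughput: {total_tokens / latency:.1f} tokens/s")
```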
82 changes: 82 additions & 0 deletions tools/auto_fill_neuronx_inference_cache.py
@@ -0,0 +1,82 @@
import argparse
import subprocess
import time
from tempfile import TemporaryDirectory

from optimum.neuron.utils import synchronize_hub_cache


MODEL_CONFIGURATIONS = {
    "openai-community/gpt2": [
        {"batch_size": 1, "sequence_length": 1024, "num_cores": 1, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 1024, "num_cores": 1, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-7b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-13b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-70b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "mistralai/Mistral-7B-Instruct-v0.1": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
    ],
    "HuggingFaceH4/zephyr-7b-beta": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
    ],
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", type=str, default=None)
    parser.add_argument("--cache-repo-id", type=str, default=None)
    args = parser.parse_args()
    model_ids = [args.model_id] if args.model_id else MODEL_CONFIGURATIONS.keys()
    for model_id in model_ids:
        for export_kwargs in MODEL_CONFIGURATIONS[model_id]:
            print(f"Exporting {model_id} with parameters {export_kwargs}")
            start = time.time()
            # Export in a separate process to reset the number of used cores
            with TemporaryDirectory() as tmpdir:
                command = f"optimum-cli export neuron -m {model_id}"
                for kwarg, value in export_kwargs.items():
                    command += f" --{kwarg} {value}"
                command += f" {tmpdir}"
                print(command)
                p = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
                p.communicate()
                assert p.returncode == 0
            end = time.time()
            print(f"Model successfully exported in {end - start:.2f} s.")
            synchronize_hub_cache(args.cache_repo_id)


if __name__ == "__main__":
    main()
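
Usage note: based on the argument parser above, the script can be run for a single model with `python tools/auto_fill_neuronx_inference_cache.py --model-id meta-llama/Llama-2-7b-chat-hf --cache-repo-id <cache-repo>`, or for every listed configuration by omitting `--model-id`. Once the cache repository has been synchronized, an export with matching parameters should be able to reuse the cached compilation artifacts instead of recompiling, provided the environment is configured to use the same cache repository (for example via the `CUSTOM_CACHE_REPO` environment variable). An illustrative sketch of such a consumer, using one of the Llama-2-7b configurations listed above:

```python
from optimum.neuron import NeuronModelForCausalLM

# Exporting with parameters that match a cached configuration should fetch the
# precompiled artifacts from the hub cache instead of recompiling locally
# (assuming the environment points at the same cache repository).
model = NeuronModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    export=True,
    batch_size=4,
    sequence_length=4096,
    num_cores=24,
    auto_cast_type="fp16",
)
```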
