
Add a tool to fill neuron inference cache and update benchmarks (#496)
* feat(tools): add tool to fill neuronx inference cache

* feat(benchmark): use higher batch sizes

* doc(benchmark): split llama2 benchmarks
dacorvo authored Feb 22, 2024
1 parent aa56b10 commit 881d399
Showing 15 changed files with 199 additions and 31 deletions.
28 changes: 28 additions & 0 deletions benchmark/text-generation/llama2-13b.py
@@ -0,0 +1,28 @@
import os
from tempfile import TemporaryDirectory

from transformers import AutoTokenizer

from benchmark import run
from optimum.neuron import NeuronModelForCausalLM


model_configurations = {
    "Llama-2-13B-BS1": ["meta-llama/Llama-2-13b-chat-hf", 1, 4096],
    "Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
    "Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
    "Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
}


for model_name, model_configuration in model_configurations.items():
    model_id, batch_size, seq_length = model_configuration
    model = NeuronModelForCausalLM.from_pretrained(
        model_id, export=True, batch_size=batch_size, sequence_length=seq_length, auto_cast_type="fp16"
    )
    with TemporaryDirectory() as tmpdir:
        model.save_pretrained(tmpdir)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdir)
        json_path = f"{model_name}.json"
        run(tmpdir, 256, 1024, json_path=json_path)
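
For a quick sanity check of an export produced this way, the saved directory can be reloaded and used for generation. The snippet below is an illustrative sketch rather than part of the benchmark script: the local path, prompt and `max_new_tokens` value are placeholders.

```python
from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

# Placeholder path: a directory where an exported model and its tokenizer were saved.
model_path = "./llama2-13b-neuron"
model = NeuronModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

inputs = tokenizer("What is AWS Inferentia2?", return_tensors="pt")
# Generation runs with the static batch_size and sequence_length chosen at export time.
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```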
@@ -8,15 +8,15 @@


model_configurations = {
-    "Llama-2-7BL": ["meta-llama/Llama-2-7b-chat-hf", 1, 2048],
-    "Llama-2-7BT": ["meta-llama/Llama-2-7b-chat-hf", 4, 2048],
+    "Llama-2-7B-BS1": ["meta-llama/Llama-2-7b-chat-hf", 1, 4096],
+    "Llama-2-7B-BS4": ["meta-llama/Llama-2-7b-chat-hf", 4, 4096],
}

num_cores = len(os.listdir("/sys/class/neuron_device/")) * 2
if num_cores >= 4:
    extra_model_configurations = {
-        "Llama-2-13BL": ["meta-llama/Llama-2-13b-chat-hf", 1, 2048],
-        "Llama-2-13BT": ["meta-llama/Llama-2-13b-chat-hf", 4, 2048],
+        "Llama-2-7B-BS8": ["meta-llama/Llama-2-7b-chat-hf", 8, 4096],
+        "Llama-2-7B-BS16": ["meta-llama/Llama-2-7b-chat-hf", 16, 4096],
    }
    model_configurations = {**model_configurations, **extra_model_configurations}

[9 binary files changed (presumably the updated benchmark plot images); contents not displayed]
10 changes: 6 additions & 4 deletions docs/source/_toctree.yml
@@ -43,6 +43,12 @@
- local: guides/pipelines
title: Inference pipelines with AWS Neuron
title: How-To Guides
+- sections:
+  - local: benchmarks/inferentia-llama2-7b
+    title: Llama2 7b on AWS Inferentia2
+  - local: benchmarks/inferentia-llama2-13b
+    title: Llama2 13b on AWS Inferentia2
+  title: Benchmarks
- sections:
- local: community/contributing
title: Add support for a new model architecture
@@ -57,9 +63,5 @@
- local: package_reference/modeling
title: Neuron Models
title: Reference
-- sections:
-  - local: benchmarks/inferentia-llama2
-    title: Llama on AWS Inferentia2
-  title: Benchmarks
title: Optimum Neuron
isExpanded: true
66 changes: 66 additions & 0 deletions docs/source/benchmarks/inferentia-llama2-13b.mdx
@@ -0,0 +1,66 @@
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Llama-2-13b performance on AWS Inferentia2 (Latency & Throughput)

How fast is Llama-2-13b on Inferentia2? Let's find out!

For this benchmark we will use the following configurations:

| Model type | batch_size | sequence_length |
|-----------------|------------|-----------------|
| Llama2 13b BS1 | 1 | 4096 |
| Llama2 13b BS4 | 4 | 4096 |
| Llama2 13b BS8 | 8 | 4096 |

*Note: all models are compiled to use the full extent of cores available on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

To evaluate the models, we generate tokens up to a total sequence length of 1024, starting from
256 input tokens (i.e. we generate 256, 512 and 768 tokens).

## Encoding time (time to first token)

The encoding time or time to first token is the time required to process the input tokens and generate the first output token.
It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.

We test the encoding time for increasing context sizes: 256 input tokens corresponds roughly to a typical Q/A usage,
while 768 is more typical of a Retrieval Augmented Generation (RAG) use-case.

Encoding time is expressed in **seconds**.

![Llama2 13b inferentia2 encoding-time](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/encoding_times.png "Encoding time")
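
To reproduce this metric on your own deployment, a simple approximation is to time a generation call limited to a single new token. The sketch below assumes an already exported model and tokenizer saved locally; the path and the synthetic 256-token prompt are placeholders.

```python
import time

from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

model_path = "./llama2-13b-neuron"  # placeholder: directory of an exported model
model = NeuronModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Build a 256-token prompt (the smallest context size used in this benchmark).
prompt = " ".join(["hello"] * 256)
inputs = tokenizer(prompt, truncation=True, max_length=256, return_tensors="pt")

start = time.perf_counter()
# Generating exactly one token approximates the encoding time / time to first token.
model.generate(**inputs, max_new_tokens=1)
print(f"Time to first token: {time.perf_counter() - start:.2f} s")
```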

## End-to-end Latency

The end-to-end latency corresponds to the total time to reach a sequence length of 1024 tokens.

It therefore includes the encoding and generation time.

Latency is expressed in **seconds**.

![Llama2 13b inferentia2 end-to-end latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/latencies.png "Latency")

### Throughput

We adopt the same convention as other benchmarks to evaluate the throughput: the total number of tokens
(both input and output) divided by the end-to-end latency.
In other words, we divide `batch_size * sequence_length` by the end-to-end latency to obtain the number of tokens per second.

Throughput is expressed in **tokens/second**.

![Llama2 13b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/throughputs.png "Throughput")
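
As a concrete illustration of this convention, with made-up numbers rather than measured results:

```python
# Hypothetical example of the throughput convention used above (not measured values).
batch_size = 4
sequence_length = 1024     # 256 input tokens + 768 generated tokens
end_to_end_latency = 20.0  # seconds, illustrative only

throughput = batch_size * sequence_length / end_to_end_latency
print(f"{throughput:.1f} tokens/second")  # 204.8 tokens/second
```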
@@ -14,27 +14,23 @@ See the License for the specific language governing permissions and
limitations under the License.
-->

-# Llama performance on AWS Inferentia2 (Latency & Througput)
+# Llama-2-7b performance on AWS Inferentia2 (Latency & Throughput)

-How fast is Llama on Inferentia2? Let's figure out!
+How fast is Llama-2-7b on Inferentia2? Let's find out!

-For this benchmark we will use the LLama 2 7B and 13B models with different configurations:
+For this benchmark we will use the following configurations:

-| Model type                 | num cores | batch_size |
-|----------------------------|-----------|------------|
-| Llama2 7B - L (latency)    | 24        | 1          |
-| Llama2 7B - T (throughput) | 24        | 4          |
-| Llama2 13B - L (latency)   | 24        | 1          |
-| Llama2 13B - T (throughput)| 24        | 4          |
+| Model type     | batch_size | sequence_length |
+|----------------|------------|-----------------|
+| Llama2 7B BS1  | 1          | 4096            |
+| Llama2 7B BS4  | 4          | 4096            |
+| Llama2 7B BS8  | 8          | 4096            |
+| Llama2 7B BS16 | 16         | 4096            |

-*Note: all models are compiled with a maximum sequence length of 2048.*

-All models are compiled to use the full extent of cores available on the `inf2.48xlarge` instance.
+*Note: all models are compiled to use the full extent of cores available on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

-We created two "latency" oriented configurations for the `llama2 7B` and `llama2 13B` models that can serve only one request at a time, but at full speed and two "throughput" oriented configurations to serve up to four requests in parallel.

To evaluate the models, we generate tokens up to a total sequence length of 1024, starting from
256 input tokens (i.e. we generate 256, 512 and 768 tokens).

@@ -48,9 +44,7 @@ while 768 is more typical of a Retrieval Augmented Generation (RAG) use-case.

Encoding time is expressed in **seconds**.

-![Llama2 inferentia2 encoding-time](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2/encoding_times.png "Encoding time")
-
-We can see that all deployed models exhibit excellent response times, even for long contexts.
+![Llama2 7b inferentia2 encoding-time](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/encoding_times.png "Encoding time")

## End-to-end Latency

@@ -60,9 +54,7 @@ It therefore includes the encoding and generation time.

Latency is expressed in **seconds**.

-![Llama2 inferentia2 end-to-end latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2/latencies.png "Latency")
-
-All models deployed on the high-end instance exhibit a good latency, even those actually configured to optimize throughput.
+![Llama2 7b inferentia2 end-to-end latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/latencies.png "Latency")

### Throughput

@@ -72,6 +64,4 @@ In other words, we divide the end-to-end latency by `batch_size * sequence_length`

Throughput is expressed in **tokens/second**.

-![Llama2 inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2/throughputs.png "Throughput")
-
-Again, the models deployed on the high-end instance have a very good throughput, even those optimized for latency.
+![Llama2 7b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/throughputs.png "Throughput")
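
To measure the end-to-end latency and derive the throughput yourself, you can time a full generation from 256 input tokens up to the total sequence length. This is an illustrative sketch, not the benchmark harness used for the plots above; the model path and prompt are placeholders.

```python
import time

from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

# Placeholder path: a directory containing an exported Llama-2-7b model and tokenizer.
model_path = "./llama2-7b-neuron"
model = NeuronModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# 256 input tokens, then generate up to a total sequence length of 1024 tokens.
prompt = " ".join(["hello"] * 256)
inputs = tokenizer(prompt, truncation=True, max_length=256, return_tensors="pt")

start = time.perf_counter()
# Note: greedy decoding may stop early on an end-of-sequence token,
# in which case fewer than 768 new tokens are produced.
outputs = model.generate(**inputs, max_new_tokens=768, do_sample=False)
latency = time.perf_counter() - start

total_tokens = outputs.shape[0] * outputs.shape[1]  # batch_size * sequence_length
print(f"End-to-end latency: {latency:.2f} s")
print(f"Throughput: {total_tokens / latency:.1f} tokens/s")
```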
82 changes: 82 additions & 0 deletions tools/auto_fill_neuronx_inference_cache.py
@@ -0,0 +1,82 @@
import argparse
import subprocess
import time
from tempfile import TemporaryDirectory

from optimum.neuron.utils import synchronize_hub_cache


MODEL_CONFIGURATIONS = {
    "openai-community/gpt2": [
        {"batch_size": 1, "sequence_length": 1024, "num_cores": 1, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 1024, "num_cores": 1, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-7b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-13b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-70b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "mistralai/Mistral-7B-Instruct-v0.1": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
    ],
    "HuggingFaceH4/zephyr-7b-beta": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
    ],
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", type=str, default=None)
    parser.add_argument("--cache-repo-id", type=str, default=None)
    args = parser.parse_args()
    model_ids = [args.model_id] if args.model_id else MODEL_CONFIGURATIONS.keys()
    for model_id in model_ids:
        for export_kwargs in MODEL_CONFIGURATIONS[model_id]:
            print(f"Exporting {model_id} with parameters {export_kwargs}")
            start = time.time()
            # Export in a separate process to reset the number of used cores
            with TemporaryDirectory() as tmpdir:
                command = f"optimum-cli export neuron -m {model_id}"
                for kwarg, value in export_kwargs.items():
                    command += f" --{kwarg} {value}"
                command += f" {tmpdir}"
                print(command)
                p = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
                p.communicate()
                assert p.returncode == 0
            end = time.time()
            print(f"Model successfully exported in {end - start:.2f} s.")
            synchronize_hub_cache(args.cache_repo_id)


if __name__ == "__main__":
    main()
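
Usage note: based on the argument parser above, the script can be run for a single model with `python tools/auto_fill_neuronx_inference_cache.py --model-id meta-llama/Llama-2-7b-chat-hf --cache-repo-id <cache-repo>`, or for every listed configuration by omitting `--model-id`. Once the cache repository has been synchronized, an export with matching parameters should be able to reuse the cached compilation artifacts instead of recompiling, provided the environment is configured to use the same cache repository (for example via the `CUSTOM_CACHE_REPO` environment variable). An illustrative sketch of such a consumer, using one of the Llama-2-7b configurations listed above:

```python
from optimum.neuron import NeuronModelForCausalLM

# Exporting with parameters that match a cached configuration should fetch the
# precompiled artifacts from the hub cache instead of recompiling locally
# (assuming the environment points at the same cache repository).
model = NeuronModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    export=True,
    batch_size=4,
    sequence_length=4096,
    num_cores=24,
    auto_cast_type="fp16",
)
```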
