Commit: Add a tool to fill neuron inference cache and update benchmarks (#496)

* feat(tools): add tool to fill neuronx inference cache
* feat(benchmark): use higher batch sizes
* doc(benchmark): split llama2 benchmarks
Showing 15 changed files with 199 additions and 31 deletions.
New file (@@ -0,0 +1,28 @@): Llama-2-13b benchmark script.

```python
import os
from tempfile import TemporaryDirectory

from transformers import AutoTokenizer

from benchmark import run
from optimum.neuron import NeuronModelForCausalLM


# Each entry maps a benchmark name to [model_id, batch_size, sequence_length].
model_configurations = {
    "Llama-2-13B-BS1": ["meta-llama/Llama-2-13b-chat-hf", 1, 4096],
    "Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
    "Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
    "Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
}


for model_name, model_configuration in model_configurations.items():
    model_id, batch_size, seq_length = model_configuration
    # Export the model to Neuron with the requested static shapes.
    model = NeuronModelForCausalLM.from_pretrained(
        model_id, export=True, batch_size=batch_size, sequence_length=seq_length, auto_cast_type="fp16"
    )
    with TemporaryDirectory() as tmpdir:
        # Save the exported model and its tokenizer, then benchmark from that directory.
        model.save_pretrained(tmpdir)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdir)
        json_path = f"{model_name}.json"
        run(tmpdir, 256, 1024, json_path=json_path)
```
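For context, the artifacts saved with `save_pretrained` can be reloaded later without re-compiling the model. Below is a minimal sketch of such a reload (not part of this commit; the directory name and prompt are hypothetical):

```python
from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

# Hypothetical directory containing a previously exported model and its tokenizer.
saved_dir = "./llama2-13b-bs1-neuron"

# No export=True here: the model was already compiled and saved to saved_dir.
model = NeuronModelForCausalLM.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)

inputs = tokenizer("What is AWS Inferentia2?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```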
New file (@@ -0,0 +1,66 @@): Llama-2-13b benchmark documentation.

<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Llama-2-13b performance on AWS Inferentia2 (Latency & Throughput)

How fast is Llama-2-13b on Inferentia2? Let's figure it out!
For this benchmark we will use the following configurations:

| Model type      | batch_size | sequence_length |
|-----------------|------------|-----------------|
| Llama2 13b BS1  | 1          | 4096            |
| Llama2 13b BS4  | 4          | 4096            |
| Llama2 13b BS8  | 8          | 4096            |

*Note: all models are compiled to use the full extent of cores available on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

To evaluate the models, we generate tokens up to a total sequence length of 1024, starting from
256 input tokens (i.e. we generate 256, 512 and 768 tokens).
## Encoding time (time to first token)

The encoding time, or time to first token, is the time required to process the input tokens and generate the first output token.
It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.

We test the encoding time for increasing context sizes: 256 input tokens corresponds roughly to a typical Q/A usage,
while 768 is more typical of a Retrieval Augmented Generation (RAG) use case.

Encoding time is expressed in **seconds**.

![Llama2 13b inferentia2 encoding-time](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/encoding_times.png "Encoding time")
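As an illustration of how this metric can be measured, one can time a single-token generation with the exported model. This is only a sketch under assumptions (hypothetical model directory and prompt), not the code used to produce the figure above:

```python
import time

from transformers import AutoTokenizer

from optimum.neuron import NeuronModelForCausalLM

# Hypothetical directory produced by an earlier export + save_pretrained step.
saved_dir = "./llama2-13b-bs1-neuron"
model = NeuronModelForCausalLM.from_pretrained(saved_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_dir)

# Hypothetical prompt standing in for a context of a few hundred tokens.
inputs = tokenizer("Summarize the benefits of AWS Inferentia2. " * 40, return_tensors="pt")

# Generating exactly one new token isolates the encoding time (time to first token).
start = time.time()
model.generate(**inputs, max_new_tokens=1, do_sample=False)
print(f"Time to first token: {time.time() - start:.3f} s")
```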
## End-to-end Latency

The end-to-end latency corresponds to the total time to reach a sequence length of 1024 tokens.

It therefore includes the encoding and generation time.

Latency is expressed in **seconds**.

![Llama2 13b inferentia2 end-to-end latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/latencies.png "Latency")
## Throughput

We adopt the same convention as other benchmarks to evaluate the throughput: we divide the total number of tokens
(both input and output) by the end-to-end latency.
In other words, we divide `batch_size * sequence_length` by the end-to-end latency to obtain the number of generated tokens per second.

Throughput is expressed in **tokens/second**.

![Llama2 13b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/throughputs.png "Throughput")
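As a worked example with purely hypothetical numbers (not taken from the benchmark results), the convention above gives:

```python
# Hypothetical values, for illustration of the formula only.
batch_size = 4
sequence_length = 1024
end_to_end_latency_s = 20.0

throughput = batch_size * sequence_length / end_to_end_latency_s
print(f"{throughput:.1f} tokens/s")  # 204.8 tokens/s
```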
New file (@@ -0,0 +1,82 @@): tool to pre-fill the Neuron inference cache.

```python
import argparse
import subprocess
import time
from tempfile import TemporaryDirectory

from optimum.neuron.utils import synchronize_hub_cache


# Export configurations to compile and cache for each supported model.
MODEL_CONFIGURATIONS = {
    "openai-community/gpt2": [
        {"batch_size": 1, "sequence_length": 1024, "num_cores": 1, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 1024, "num_cores": 1, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-7b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-13b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "fp16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "meta-llama/Llama-2-70b-chat-hf": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 24, "auto_cast_type": "fp16"},
    ],
    "mistralai/Mistral-7B-Instruct-v0.1": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
    ],
    "HuggingFaceH4/zephyr-7b-beta": [
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 1, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
        {"batch_size": 4, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 8, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
        {"batch_size": 16, "sequence_length": 4096, "num_cores": 8, "auto_cast_type": "bf16"},
    ],
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", type=str, default=None)
    parser.add_argument("--cache-repo-id", type=str, default=None)
    args = parser.parse_args()
    # Either export a single model or all configured models.
    model_ids = [args.model_id] if args.model_id else MODEL_CONFIGURATIONS.keys()
    for model_id in model_ids:
        for export_kwargs in MODEL_CONFIGURATIONS[model_id]:
            print(f"Exporting {model_id} with parameters {export_kwargs}")
            start = time.time()
            # Export in a separate process to reset the number of used cores
            with TemporaryDirectory() as tmpdir:
                # Build an `optimum-cli export neuron` command from the export kwargs,
                # e.g. --batch_size 1 --sequence_length 4096 --num_cores 2 --auto_cast_type fp16.
                command = f"optimum-cli export neuron -m {model_id}"
                for kwarg, value in export_kwargs.items():
                    command += f" --{kwarg} {value}"
                command += f" {tmpdir}"
                print(command)
                p = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
                p.communicate()
                assert p.returncode == 0
            end = time.time()
            print(f"Model successfully exported in {end - start:.2f} s.")
            # Push the newly compiled artifacts to the Hub cache repository.
            synchronize_hub_cache(args.cache_repo_id)


if __name__ == "__main__":
    main()
```
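A possible invocation, assuming the script is saved as `fill_inference_cache.py` (the actual file path is not shown in this view) and that a writable cache repository exists on the Hub:

`python fill_inference_cache.py --model-id meta-llama/Llama-2-13b-chat-hf --cache-repo-id <your-cache-repo>`

Omitting `--model-id` exports every configuration listed in `MODEL_CONFIGURATIONS`; after each export, `synchronize_hub_cache` pushes the newly compiled artifacts to the given cache repository.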