TGI benchmark with llmperf (#564)
* feat(tgi): add TGI benchmark on multiple replicas

* feat(tgi): add mistral 7b results

* feat(tgi): add llama-7b benchmark results

* review: apply suggestions

Co-authored-by: Michael Benayoun <[email protected]>

* fix(bench): wrap in main + style

---------

Co-authored-by: Michael Benayoun <[email protected]>
dacorvo and michaelbenayoun authored Apr 12, 2024
1 parent c8f15f9 commit eacf343
Showing 10 changed files with 291 additions and 0 deletions.
36 changes: 36 additions & 0 deletions benchmark/text-generation-inference/README.md
@@ -0,0 +1,36 @@
# NeuronX TGI benchmark using multiple replicas

## Select model and configuration

Edit the `.env` file to select the model to use for the benchmark and its configuration.
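
For example, the `llama-7b/.env` file provided with this benchmark selects the Llama-2 7B chat model and its compilation parameters:

```shell
MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
```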

## Start the servers

```shell
$ docker compose --env-file llama-7b/.env up
```

Note: pass a different `.env` file to `--env-file` to change the model and its configuration.
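
For example, to benchmark Mistral 7B instead:

```shell
$ docker compose --env-file mistral-7b/.env up
```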

## Run the benchmark

### Install `llmperf`

Follow the installation [instructions](https://github.com/ray-project/llmperf/tree/main?tab=readme-ov-file#installation) for `llmperf`.

### Setup test environment

```shell
$ export LLMPerf=<path-to-llmperf>
```

### Launch benchmark run

The benchmark script takes the `model_id` and the number of concurrent users as parameters.
The `model_id` must match the `MODEL_ID` defined in the selected `.env` file.

```shell
$ ./benchmark.sh NousResearch/Llama-2-7b-chat-hf 128
```
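
Each run writes an `llmperf` summary file under `tgi_bench_results/<timestamp>/`. The `generate_csv.py` script located next to `benchmark.sh` can then aggregate these summaries into a single `tgi-results.csv` file (run it from the same directory so it finds the `tgi_bench_results` folder):

```shell
$ python generate_csv.py
```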


29 changes: 29 additions & 0 deletions benchmark/text-generation-inference/benchmark.sh
@@ -0,0 +1,29 @@
#!/bin/bash

model=${1:-NousResearch/Llama-2-7b-chat-hf}
vu=${2:-1}

export HUGGINGFACE_API_BASE=http://127.0.0.1:8080
export HUGGINGFACE_API_KEY=EMPTY

benchmark_script=${LLMPerf}/token_benchmark_ray.py

if ! test -f ${benchmark_script}; then
  echo "LLMPerf script not found, please export LLMPerf=<path-to-llmperf>."
  exit 1
fi

max_requests=$(expr ${vu} \* 8 )
date_str=$(date '+%Y-%m-%d-%H-%M-%S')

python ${benchmark_script} \
  --model "huggingface/${model}" \
  --mean-input-tokens 1500 \
  --stddev-input-tokens 150 \
  --mean-output-tokens 245 \
  --stddev-output-tokens 20 \
  --max-num-completed-requests ${max_requests} \
  --timeout 7200 \
  --num-concurrent-requests ${vu} \
  --results-dir "tgi_bench_results/${date_str}" \
  --llm-api "litellm" \
  --additional-sampling-params '{}'
79 changes: 79 additions & 0 deletions benchmark/text-generation-inference/docker-compose.yaml
@@ -0,0 +1,79 @@
version: '3.7'

services:
  tgi-1:
    image: neuronx-tgi:latest
    ports:
      - "8081:8081"
    environment:
      - PORT=8081
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron0"
      - "/dev/neuron1"
      - "/dev/neuron2"
      - "/dev/neuron3"

  tgi-2:
    image: neuronx-tgi:latest
    ports:
      - "8082:8082"
    environment:
      - PORT=8082
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron4"
      - "/dev/neuron5"
      - "/dev/neuron6"
      - "/dev/neuron7"

  tgi-3:
    image: neuronx-tgi:latest
    ports:
      - "8083:8083"
    environment:
      - PORT=8083
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron8"
      - "/dev/neuron9"
      - "/dev/neuron10"
      - "/dev/neuron11"

  loadbalancer:
    image: nginx:alpine
    ports:
      - "8080:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - tgi-1
      - tgi-2
      - tgi-3
    deploy:
      placement:
        constraints: [node.role == manager]
29 changes: 29 additions & 0 deletions benchmark/text-generation-inference/generate_csv.py
@@ -0,0 +1,29 @@
import glob
import json

import pandas as pd


def main():
    # Collect the llmperf summary files produced by the benchmark runs
    filenames = glob.glob("tgi_bench_results/*/*summary.json")

    results = []

    for filename in filenames:
        with open(filename) as f:
            summary = json.load(f)
            d = {
                "model_id": summary["model"],
                "concurrent requests": summary["num_concurrent_requests"],
                "throughput (t/s)": summary["results_mean_output_throughput_token_per_s"],
                "Time-to-first-token @ P50 (s)": summary["results_ttft_s_quantiles_p50"],
                # median inter-token latency, converted to milliseconds
                "average latency (ms)": summary["results_inter_token_latency_s_quantiles_p50"] * 1000,
            }
            results.append(pd.DataFrame.from_dict(d, orient="index").transpose())

    df = pd.concat(results).sort_values(by="concurrent requests")
    df.to_csv("tgi-results.csv", index=False)


if __name__ == "__main__":
    main()
7 changes: 7 additions & 0 deletions benchmark/text-generation-inference/llama-7b/.env
@@ -0,0 +1,7 @@
MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
11 changes: 11 additions & 0 deletions benchmark/text-generation-inference/llama-7b/tgi-results.csv
@@ -0,0 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.84493535907894,0.435653425001874,70.64353697527179
huggingface/NousResearch/Llama-2-7b-chat-hf,2,25.213946432976638,0.4359589194991713,70.55276283551507
huggingface/NousResearch/Llama-2-7b-chat-hf,4,43.26619632041904,0.43764654000005976,71.40762554352298
huggingface/NousResearch/Llama-2-7b-chat-hf,8,81.7002047989417,0.46597404000203824,74.66130438229308
huggingface/NousResearch/Llama-2-7b-chat-hf,16,148.73365777295837,0.8807341205010744,79.46121462672393
huggingface/NousResearch/Llama-2-7b-chat-hf,32,241.07605116636378,2.58607812900118,91.31557495460669
huggingface/NousResearch/Llama-2-7b-chat-hf,64,338.6319898631105,6.262418706501194,118.3833058616551
huggingface/NousResearch/Llama-2-7b-chat-hf,128,410.3968188304912,12.920248634000018,167.830813359929
huggingface/NousResearch/Llama-2-7b-chat-hf,256,478.76738958996015,29.621474924000722,257.6998219293685
huggingface/NousResearch/Llama-2-7b-chat-hf,512,496.5535875105274,44.485632503998204,329.7294857727593
7 changes: 7 additions & 0 deletions benchmark/text-generation-inference/mistral-7b/.env
@@ -0,0 +1,7 @@
MODEL_ID='mistralai/Mistral-7B-Instruct-v0.2'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='bf16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
11 changes: 11 additions & 0 deletions benchmark/text-generation-inference/mistral-7b/tgi-results.csv
@@ -0,0 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/mistralai/Mistral-7B-Instruct-v0.2,1,34.662810045679024,0.46342812800048705,27.74296394585929
huggingface/mistralai/Mistral-7B-Instruct-v0.2,2,67.55520390730916,0.46188541100036673,27.32067234909958
huggingface/mistralai/Mistral-7B-Instruct-v0.2,4,115.9644253080536,0.4719622849997904,29.599952973112146
huggingface/mistralai/Mistral-7B-Instruct-v0.2,8,177.15609277817416,0.51119948700034,33.335737027419185
huggingface/mistralai/Mistral-7B-Instruct-v0.2,16,156.52392957214906,0.9595348704997377,86.39206521348669
huggingface/mistralai/Mistral-7B-Instruct-v0.2,32,247.29299604071295,2.5056241824995595,100.72862078096863
huggingface/mistralai/Mistral-7B-Instruct-v0.2,64,384.5781500641263,4.886728052500075,108.16498200178273
huggingface/mistralai/Mistral-7B-Instruct-v0.2,128,560.878982504929,10.410015015499994,130.6066071497773
huggingface/mistralai/Mistral-7B-Instruct-v0.2,256,623.9707062587075,23.141914837000513,190.67140038075857
huggingface/mistralai/Mistral-7B-Instruct-v0.2,512,572.8680705363325,41.84460775000116,283.4274198954966
15 changes: 15 additions & 0 deletions benchmark/text-generation-inference/nginx.conf
@@ -0,0 +1,15 @@
### Nginx TGI Load Balancer
events {}
http {
  upstream tgicluster {
    server tgi-1:8081;
    server tgi-2:8082;
    server tgi-3:8083;
  }
  server {
    listen 80;
    location / {
      proxy_pass http://tgicluster;
    }
  }
}
67 changes: 67 additions & 0 deletions benchmark/text-generation-inference/tgi_live_metrics.py
@@ -0,0 +1,67 @@

import requests
from prometheus_client.parser import text_string_to_metric_families


def get_node_results(node_url):
    # Fetch the Prometheus metrics exposed by a single TGI server
    metrics = requests.get(node_url + "/metrics").text

    counters = {
        "tgi_queue_size": {},
        "tgi_batch_current_size": {},
        "tgi_request_input_length": {},
        "tgi_request_generated_tokens": {},
        "tgi_request_mean_time_per_token_duration": {},
        "tgi_batch_inference_duration": {},
        "tgi_request_queue_duration": {},
    }

    # Extract the sum/count samples of the metrics we are interested in,
    # keeping prefill/decode samples separate when a "method" label is present
    for family in text_string_to_metric_families(metrics):
        if family.name in counters:
            for sample in family.samples:
                if sample.name == family.name + "_sum":
                    if len(sample.labels) == 0:
                        counters[family.name]["sum"] = sample.value
                    elif "method" in sample.labels:
                        counters[family.name][sample.labels["method"] + "_sum"] = sample.value
                elif sample.name == family.name + "_count":
                    if len(sample.labels) == 0:
                        counters[family.name]["count"] = sample.value
                    elif "method" in sample.labels:
                        counters[family.name][sample.labels["method"] + "_count"] = sample.value
                elif sample.name == family.name:
                    # Gauges expose a single unlabelled sample
                    counters[family.name] = sample.value

    queue_size = counters["tgi_queue_size"]
    batch_size = counters["tgi_batch_current_size"]
    num_requests = counters["tgi_request_mean_time_per_token_duration"]["count"]
    input_tokens = counters["tgi_request_input_length"]["sum"]
    avg_time_per_token = counters["tgi_request_mean_time_per_token_duration"]["sum"] * 1000 / num_requests
    prefill_time = counters["tgi_batch_inference_duration"]["prefill_sum"]
    decode_time = counters["tgi_batch_inference_duration"]["decode_sum"]
    total_time = prefill_time + decode_time
    decode_tokens = counters["tgi_request_generated_tokens"]["sum"]
    avg_queue_duration = counters["tgi_request_queue_duration"]["sum"] / num_requests

    return {
        "queue_size": queue_size,
        "batch_size": batch_size,
        "requests": num_requests,
        "avg_input_tokens": input_tokens / num_requests,
        "avg_time_per_token": avg_time_per_token,
        "throughput": (input_tokens + decode_tokens) / total_time,
        "prefill_throughput": input_tokens / prefill_time,
        "decode_throughput": decode_tokens / decode_time,
        "avg_time_to_first_token": avg_queue_duration + (prefill_time / num_requests),
    }


# Query each TGI replica behind the load balancer
results = []
for port in [8081, 8082, 8083]:
    results.append(get_node_results(f"http://0.0.0.0:{port}"))

# Sum metrics across replicas, averaging the per-request ("avg_*") ones
for metric in results[0]:
    value = sum([result[metric] for result in results])
    if metric.startswith("avg"):
        value /= len(results)
    print(f"{metric} : {value:.3f}")
