Merge branch 'main' into bump-to-torch2
Showing 42 changed files with 13,036 additions and 463 deletions.
@@ -0,0 +1,36 @@
# NeuronX TGI benchmark using multiple replicas

## Select model and configuration

Edit the `.env` file to select the model to use for the benchmark and its configuration.
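For reference, the `llama-7b/.env` file added in this commit configures a Llama-2 7B chat model; the other configurations use the same keys:

```shell
MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
```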
## Start the servers

```shell
$ docker compose --env-file llama-7b/.env up
```

Note: point `--env-file` at a different `.env` file to change the model configuration.
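For instance, assuming the Mistral configuration in this commit is stored as `mistral-7b/.env`:

```shell
$ docker compose --env-file mistral-7b/.env up
```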
## Run the benchmark

### Install `llmperf`

Follow the installation [instructions](https://github.com/ray-project/llmperf/tree/main?tab=readme-ov-file#installation) for `llmperf`.

### Set up the test environment

```shell
$ export LLMPerf=<path-to-llmperf>
```
### Launch a benchmark run

The benchmark script takes the `model_id` and the number of concurrent users as parameters.
The `model_id` must match the one corresponding to the selected `.env` file.

```shell
$ ./benchmark.sh NousResearch/Llama-2-7b-chat-hf 128
```
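To build a full results table like `tgi-results.csv`, one option is to sweep the concurrency levels in a loop; this is a sketch, not part of the committed scripts:

```shell
# Hypothetical sweep over the concurrency levels reported in tgi-results.csv
for vu in 1 2 4 8 16 32 64 128 256 512; do
  ./benchmark.sh NousResearch/Llama-2-7b-chat-hf ${vu}
done
```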
@@ -0,0 +1,29 @@
#!/bin/bash

# Model and number of concurrent (virtual) users, with defaults
model=${1:-NousResearch/Llama-2-7b-chat-hf}
vu=${2:-1}

# Point llmperf (through litellm) at the local nginx load balancer
export HUGGINGFACE_API_BASE=http://127.0.0.1:8080
export HUGGINGFACE_API_KEY=EMPTY

benchmark_script=${LLMPerf}/token_benchmark_ray.py

if ! test -f "${benchmark_script}"; then
  echo "LLMPerf script not found, please export LLMPerf=<path-to-llmperf>."
  exit 1
fi

# Complete 8 requests per virtual user
max_requests=$(expr ${vu} \* 8)
date_str=$(date '+%Y-%m-%d-%H-%M-%S')

python ${benchmark_script} \
  --model "huggingface/${model}" \
  --mean-input-tokens 1500 \
  --stddev-input-tokens 150 \
  --mean-output-tokens 245 \
  --stddev-output-tokens 20 \
  --max-num-completed-requests ${max_requests} \
  --timeout 7200 \
  --num-concurrent-requests ${vu} \
  --results-dir "tgi_bench_results/${date_str}" \
  --llm-api "litellm" \
  --additional-sampling-params '{}'
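Both arguments are optional, so a bare invocation benchmarks the default Llama-2 7B chat model with a single concurrent user:

```shell
$ ./benchmark.sh
```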
@@ -0,0 +1,79 @@
version: '3.7'

services:
  tgi-1:
    image: neuronx-tgi:latest
    ports:
      - "8081:8081"
    environment:
      - PORT=8081
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron0"
      - "/dev/neuron1"
      - "/dev/neuron2"
      - "/dev/neuron3"

  tgi-2:
    image: neuronx-tgi:latest
    ports:
      - "8082:8082"
    environment:
      - PORT=8082
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron4"
      - "/dev/neuron5"
      - "/dev/neuron6"
      - "/dev/neuron7"

  tgi-3:
    image: neuronx-tgi:latest
    ports:
      - "8083:8083"
    environment:
      - PORT=8083
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron8"
      - "/dev/neuron9"
      - "/dev/neuron10"
      - "/dev/neuron11"

  loadbalancer:
    image: nginx:alpine
    ports:
      - "8080:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - tgi-1
      - tgi-2
      - tgi-3
    deploy:
      placement:
        constraints: [node.role == manager]
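Each replica is pinned to four distinct Neuron devices, which matches `HF_NUM_CORES=8` given the two NeuronCores per Inferentia2 device; the twelve devices suggest an inf2.48xlarge-class host, though the diff does not say so. Assuming the AWS Neuron tools are installed, the devices visible on the host can be checked before starting the stack:

```shell
$ neuron-ls
```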
@@ -0,0 +1,29 @@
import glob
import json

import pandas as pd


def main():
    # Collect the summary files produced by llmperf for each run
    filenames = glob.glob("tgi_bench_results/*/*summary.json")

    results = []

    for filename in filenames:
        with open(filename) as f:
            summary = json.load(f)
        d = {
            "model_id": summary["model"],
            "concurrent requests": summary["num_concurrent_requests"],
            "throughput (t/s)": summary["results_mean_output_throughput_token_per_s"],
            "Time-to-first-token @ P50 (s)": summary["results_ttft_s_quantiles_p50"],
            # P50 inter-token latency, converted from seconds to milliseconds
            "average latency (ms)": summary["results_inter_token_latency_s_quantiles_p50"] * 1000,
        }
        results.append(pd.DataFrame.from_dict(d, orient="index").transpose())

    df = pd.concat(results).sort_values(by="concurrent requests")
    df.to_csv("tgi-results.csv", index=False)


if __name__ == "__main__":
    main()
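The script's file name is not visible in this diff; assuming it is saved as `generate_csv.py` next to the `tgi_bench_results/` directory, it would be run after one or more benchmark runs:

```shell
$ python generate_csv.py  # writes tgi-results.csv in the current directory
```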
@@ -0,0 +1,7 @@
MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
benchmark/text-generation-inference/llama-7b/tgi-results.csv
11 changes: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.84493535907894,0.435653425001874,70.64353697527179
huggingface/NousResearch/Llama-2-7b-chat-hf,2,25.213946432976638,0.4359589194991713,70.55276283551507
huggingface/NousResearch/Llama-2-7b-chat-hf,4,43.26619632041904,0.43764654000005976,71.40762554352298
huggingface/NousResearch/Llama-2-7b-chat-hf,8,81.7002047989417,0.46597404000203824,74.66130438229308
huggingface/NousResearch/Llama-2-7b-chat-hf,16,148.73365777295837,0.8807341205010744,79.46121462672393
huggingface/NousResearch/Llama-2-7b-chat-hf,32,241.07605116636378,2.58607812900118,91.31557495460669
huggingface/NousResearch/Llama-2-7b-chat-hf,64,338.6319898631105,6.262418706501194,118.3833058616551
huggingface/NousResearch/Llama-2-7b-chat-hf,128,410.3968188304912,12.920248634000018,167.830813359929
huggingface/NousResearch/Llama-2-7b-chat-hf,256,478.76738958996015,29.621474924000722,257.6998219293685
huggingface/NousResearch/Llama-2-7b-chat-hf,512,496.5535875105274,44.485632503998204,329.7294857727593
@@ -0,0 +1,7 @@
MODEL_ID='mistralai/Mistral-7B-Instruct-v0.2'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='bf16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
benchmark/text-generation-inference/mistral-7b/tgi-results.csv
11 changes: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/mistralai/Mistral-7B-Instruct-v0.2,1,34.662810045679024,0.46342812800048705,27.74296394585929
huggingface/mistralai/Mistral-7B-Instruct-v0.2,2,67.55520390730916,0.46188541100036673,27.32067234909958
huggingface/mistralai/Mistral-7B-Instruct-v0.2,4,115.9644253080536,0.4719622849997904,29.599952973112146
huggingface/mistralai/Mistral-7B-Instruct-v0.2,8,177.15609277817416,0.51119948700034,33.335737027419185
huggingface/mistralai/Mistral-7B-Instruct-v0.2,16,156.52392957214906,0.9595348704997377,86.39206521348669
huggingface/mistralai/Mistral-7B-Instruct-v0.2,32,247.29299604071295,2.5056241824995595,100.72862078096863
huggingface/mistralai/Mistral-7B-Instruct-v0.2,64,384.5781500641263,4.886728052500075,108.16498200178273
huggingface/mistralai/Mistral-7B-Instruct-v0.2,128,560.878982504929,10.410015015499994,130.6066071497773
huggingface/mistralai/Mistral-7B-Instruct-v0.2,256,623.9707062587075,23.141914837000513,190.67140038075857
huggingface/mistralai/Mistral-7B-Instruct-v0.2,512,572.8680705363325,41.84460775000116,283.4274198954966
@@ -0,0 +1,15 @@
### Nginx TGI Load Balancer
events {}
http {
  upstream tgicluster {
    server tgi-1:8081;
    server tgi-2:8082;
    server tgi-3:8083;
  }
  server {
    listen 80;
    location / {
      proxy_pass http://tgicluster;
    }
  }
}
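nginx balances the three upstreams round-robin by default. A quick smoke test against the front-end, using TGI's standard `/generate` endpoint (the prompt and parameters are illustrative):

```shell
$ curl http://127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}'
```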
@@ -0,0 +1,67 @@
import requests
from prometheus_client.parser import text_string_to_metric_families


def get_node_results(node_url):
    # Scrape the Prometheus metrics exposed by a single TGI replica
    metrics = requests.get(node_url + "/metrics").text

    counters = {
        "tgi_queue_size": {},
        "tgi_batch_current_size": {},
        "tgi_request_input_length": {},
        "tgi_request_generated_tokens": {},
        "tgi_request_mean_time_per_token_duration": {},
        "tgi_batch_inference_duration": {},
        "tgi_request_queue_duration": {},
    }

    # Keep the _sum/_count samples of each histogram (split by "method" label
    # when present) and the raw value of each gauge
    for family in text_string_to_metric_families(metrics):
        if family.name in counters:
            for sample in family.samples:
                if sample.name == family.name + "_sum":
                    if len(sample.labels) == 0:
                        counters[family.name]["sum"] = sample.value
                    elif "method" in sample.labels:
                        counters[family.name][sample.labels["method"] + "_sum"] = sample.value
                elif sample.name == family.name + "_count":
                    if len(sample.labels) == 0:
                        counters[family.name]["count"] = sample.value
                    elif "method" in sample.labels:
                        counters[family.name][sample.labels["method"] + "_count"] = sample.value
                elif sample.name == family.name:
                    counters[family.name] = sample.value

    queue_size = counters["tgi_queue_size"]
    batch_size = counters["tgi_batch_current_size"]
    num_requests = counters["tgi_request_mean_time_per_token_duration"]["count"]
    input_tokens = counters["tgi_request_input_length"]["sum"]
    avg_time_per_token = counters["tgi_request_mean_time_per_token_duration"]["sum"] * 1000 / num_requests
    prefill_time = counters["tgi_batch_inference_duration"]["prefill_sum"]
    decode_time = counters["tgi_batch_inference_duration"]["decode_sum"]
    total_time = prefill_time + decode_time
    decode_tokens = counters["tgi_request_generated_tokens"]["sum"]
    avg_queue_duration = counters["tgi_request_queue_duration"]["sum"] / num_requests

    return {
        "queue_size": queue_size,
        "batch_size": batch_size,
        "requests": num_requests,
        "avg_input_tokens": input_tokens / num_requests,
        "avg_time_per_token": avg_time_per_token,
        "throughput": (input_tokens + decode_tokens) / total_time,
        "prefill_throughput": input_tokens / prefill_time,
        "decode_throughput": decode_tokens / decode_time,
        "avg_time_to_first_token": avg_queue_duration + (prefill_time / num_requests),
    }


# Query each TGI replica directly, bypassing the load balancer
results = []
for port in [8081, 8082, 8083]:
    results.append(get_node_results(f"http://0.0.0.0:{port}"))

# Sum metrics across replicas; average the per-request ("avg_*") ones instead
for metric in results[0]:
    value = sum([result[metric] for result in results])
    if metric.startswith("avg"):
        value /= len(results)
    print(f"{metric} : {value:.3f}")
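As with the CSV helper, this script's file name is not shown in the diff; assuming it is saved as `tgi_metrics.py`, it would be run on the host while the benchmark drives traffic:

```shell
$ python tgi_metrics.py
```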