Merge branch 'main' into bump-to-torch2
Showing 42 changed files with 13,036 additions and 463 deletions.
@@ -0,0 +1,36 @@
# NeuronX TGI benchmark using multiple replicas

## Select model and configuration

Edit the `.env` file to select the model to use for the benchmark and its configuration.
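For reference, the `llama-7b/.env` file added in this commit configures a Llama-2 7B chat model; the other configurations use the same keys:

```shell
MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
```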
## Start the servers

```shell
$ docker compose --env-file llama-7b/.env up
```

Note: point `--env-file` at a different `.env` file to change the model configuration.
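For instance, assuming the Mistral configuration in this commit is stored as `mistral-7b/.env`:

```shell
$ docker compose --env-file mistral-7b/.env up
```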
## Run the benchmark

### Install `llmperf`

Follow the installation [instructions](https://github.com/ray-project/llmperf/tree/main?tab=readme-ov-file#installation) for `llmperf`.

### Set up the test environment

```shell
$ export LLMPerf=<path-to-llmperf>
```
### Launch a benchmark run

The benchmark script takes the `model_id` and the number of concurrent users as parameters.
The `model_id` must match the one corresponding to the selected `.env` file.

```shell
$ ./benchmark.sh NousResearch/Llama-2-7b-chat-hf 128
```
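To build a full results table like `tgi-results.csv`, one option is to sweep the concurrency levels in a loop; this is a sketch, not part of the committed scripts:

```shell
# Hypothetical sweep over the concurrency levels reported in tgi-results.csv
for vu in 1 2 4 8 16 32 64 128 256 512; do
  ./benchmark.sh NousResearch/Llama-2-7b-chat-hf ${vu}
done
```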
@@ -0,0 +1,29 @@
#!/bin/bash

# Model and number of concurrent (virtual) users, with defaults
model=${1:-NousResearch/Llama-2-7b-chat-hf}
vu=${2:-1}

# Point llmperf (through litellm) at the local nginx load balancer
export HUGGINGFACE_API_BASE=http://127.0.0.1:8080
export HUGGINGFACE_API_KEY=EMPTY

benchmark_script=${LLMPerf}/token_benchmark_ray.py

if ! test -f "${benchmark_script}"; then
  echo "LLMPerf script not found, please export LLMPerf=<path-to-llmperf>."
  exit 1
fi

# Complete 8 requests per virtual user
max_requests=$(expr ${vu} \* 8)
date_str=$(date '+%Y-%m-%d-%H-%M-%S')

python ${benchmark_script} \
  --model "huggingface/${model}" \
  --mean-input-tokens 1500 \
  --stddev-input-tokens 150 \
  --mean-output-tokens 245 \
  --stddev-output-tokens 20 \
  --max-num-completed-requests ${max_requests} \
  --timeout 7200 \
  --num-concurrent-requests ${vu} \
  --results-dir "tgi_bench_results/${date_str}" \
  --llm-api "litellm" \
  --additional-sampling-params '{}'
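Both arguments are optional, so a bare invocation benchmarks the default Llama-2 7B chat model with a single concurrent user:

```shell
$ ./benchmark.sh
```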
@@ -0,0 +1,79 @@
version: '3.7'

services:
  tgi-1:
    image: neuronx-tgi:latest
    ports:
      - "8081:8081"
    environment:
      - PORT=8081
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron0"
      - "/dev/neuron1"
      - "/dev/neuron2"
      - "/dev/neuron3"

  tgi-2:
    image: neuronx-tgi:latest
    ports:
      - "8082:8082"
    environment:
      - PORT=8082
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron4"
      - "/dev/neuron5"
      - "/dev/neuron6"
      - "/dev/neuron7"

  tgi-3:
    image: neuronx-tgi:latest
    ports:
      - "8083:8083"
    environment:
      - PORT=8083
      - MODEL_ID=${MODEL_ID}
      - HF_BATCH_SIZE=${HF_BATCH_SIZE}
      - HF_SEQUENCE_LENGTH=${HF_SEQUENCE_LENGTH}
      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
      - HF_NUM_CORES=8
      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
      - MAX_CONCURRENT_REQUESTS=512
    devices:
      - "/dev/neuron8"
      - "/dev/neuron9"
      - "/dev/neuron10"
      - "/dev/neuron11"

  loadbalancer:
    image: nginx:alpine
    ports:
      - "8080:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - tgi-1
      - tgi-2
      - tgi-3
    deploy:
      placement:
        constraints: [node.role == manager]
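Each replica is pinned to four distinct Neuron devices, which matches `HF_NUM_CORES=8` given the two NeuronCores per Inferentia2 device; the twelve devices suggest an inf2.48xlarge-class host, though the diff does not say so. Assuming the AWS Neuron tools are installed, the devices visible on the host can be checked before starting the stack:

```shell
$ neuron-ls
```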
@@ -0,0 +1,29 @@
import glob
import json

import pandas as pd


def main():
    # Collect the summary files produced by llmperf for each run
    filenames = glob.glob("tgi_bench_results/*/*summary.json")

    results = []

    for filename in filenames:
        with open(filename) as f:
            summary = json.load(f)
        d = {
            "model_id": summary["model"],
            "concurrent requests": summary["num_concurrent_requests"],
            "throughput (t/s)": summary["results_mean_output_throughput_token_per_s"],
            "Time-to-first-token @ P50 (s)": summary["results_ttft_s_quantiles_p50"],
            # P50 inter-token latency, converted from seconds to milliseconds
            "average latency (ms)": summary["results_inter_token_latency_s_quantiles_p50"] * 1000,
        }
        results.append(pd.DataFrame.from_dict(d, orient="index").transpose())

    df = pd.concat(results).sort_values(by="concurrent requests")
    df.to_csv("tgi-results.csv", index=False)


if __name__ == "__main__":
    main()
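The script's file name is not visible in this diff; assuming it is saved as `generate_csv.py` next to the `tgi_bench_results/` directory, it would be run after one or more benchmark runs:

```shell
$ python generate_csv.py  # writes tgi-results.csv in the current directory
```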
@@ -0,0 +1,7 @@
MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
benchmark/text-generation-inference/llama-7b/tgi-results.csv
11 changes: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.84493535907894,0.435653425001874,70.64353697527179
huggingface/NousResearch/Llama-2-7b-chat-hf,2,25.213946432976638,0.4359589194991713,70.55276283551507
huggingface/NousResearch/Llama-2-7b-chat-hf,4,43.26619632041904,0.43764654000005976,71.40762554352298
huggingface/NousResearch/Llama-2-7b-chat-hf,8,81.7002047989417,0.46597404000203824,74.66130438229308
huggingface/NousResearch/Llama-2-7b-chat-hf,16,148.73365777295837,0.8807341205010744,79.46121462672393
huggingface/NousResearch/Llama-2-7b-chat-hf,32,241.07605116636378,2.58607812900118,91.31557495460669
huggingface/NousResearch/Llama-2-7b-chat-hf,64,338.6319898631105,6.262418706501194,118.3833058616551
huggingface/NousResearch/Llama-2-7b-chat-hf,128,410.3968188304912,12.920248634000018,167.830813359929
huggingface/NousResearch/Llama-2-7b-chat-hf,256,478.76738958996015,29.621474924000722,257.6998219293685
huggingface/NousResearch/Llama-2-7b-chat-hf,512,496.5535875105274,44.485632503998204,329.7294857727593
@@ -0,0 +1,7 @@
MODEL_ID='mistralai/Mistral-7B-Instruct-v0.2'
HF_BATCH_SIZE=32
HF_SEQUENCE_LENGTH=4096
HF_AUTO_CAST_TYPE='bf16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
benchmark/text-generation-inference/mistral-7b/tgi-results.csv
11 changes: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/mistralai/Mistral-7B-Instruct-v0.2,1,34.662810045679024,0.46342812800048705,27.74296394585929
huggingface/mistralai/Mistral-7B-Instruct-v0.2,2,67.55520390730916,0.46188541100036673,27.32067234909958
huggingface/mistralai/Mistral-7B-Instruct-v0.2,4,115.9644253080536,0.4719622849997904,29.599952973112146
huggingface/mistralai/Mistral-7B-Instruct-v0.2,8,177.15609277817416,0.51119948700034,33.335737027419185
huggingface/mistralai/Mistral-7B-Instruct-v0.2,16,156.52392957214906,0.9595348704997377,86.39206521348669
huggingface/mistralai/Mistral-7B-Instruct-v0.2,32,247.29299604071295,2.5056241824995595,100.72862078096863
huggingface/mistralai/Mistral-7B-Instruct-v0.2,64,384.5781500641263,4.886728052500075,108.16498200178273
huggingface/mistralai/Mistral-7B-Instruct-v0.2,128,560.878982504929,10.410015015499994,130.6066071497773
huggingface/mistralai/Mistral-7B-Instruct-v0.2,256,623.9707062587075,23.141914837000513,190.67140038075857
huggingface/mistralai/Mistral-7B-Instruct-v0.2,512,572.8680705363325,41.84460775000116,283.4274198954966
@@ -0,0 +1,15 @@
### Nginx TGI Load Balancer
events {}
http {
  upstream tgicluster {
    server tgi-1:8081;
    server tgi-2:8082;
    server tgi-3:8083;
  }
  server {
    listen 80;
    location / {
      proxy_pass http://tgicluster;
    }
  }
}
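nginx balances the three upstreams round-robin by default. A quick smoke test against the front-end, using TGI's standard `/generate` endpoint (the prompt and parameters are illustrative):

```shell
$ curl http://127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}'
```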
@@ -0,0 +1,67 @@
import requests
from prometheus_client.parser import text_string_to_metric_families


def get_node_results(node_url):
    # Scrape the Prometheus metrics exposed by a single TGI replica
    metrics = requests.get(node_url + "/metrics").text

    counters = {
        "tgi_queue_size": {},
        "tgi_batch_current_size": {},
        "tgi_request_input_length": {},
        "tgi_request_generated_tokens": {},
        "tgi_request_mean_time_per_token_duration": {},
        "tgi_batch_inference_duration": {},
        "tgi_request_queue_duration": {},
    }

    # Keep the _sum/_count samples of each histogram (split by "method" label
    # when present) and the raw value of each gauge
    for family in text_string_to_metric_families(metrics):
        if family.name in counters:
            for sample in family.samples:
                if sample.name == family.name + "_sum":
                    if len(sample.labels) == 0:
                        counters[family.name]["sum"] = sample.value
                    elif "method" in sample.labels:
                        counters[family.name][sample.labels["method"] + "_sum"] = sample.value
                elif sample.name == family.name + "_count":
                    if len(sample.labels) == 0:
                        counters[family.name]["count"] = sample.value
                    elif "method" in sample.labels:
                        counters[family.name][sample.labels["method"] + "_count"] = sample.value
                elif sample.name == family.name:
                    counters[family.name] = sample.value

    queue_size = counters["tgi_queue_size"]
    batch_size = counters["tgi_batch_current_size"]
    num_requests = counters["tgi_request_mean_time_per_token_duration"]["count"]
    input_tokens = counters["tgi_request_input_length"]["sum"]
    avg_time_per_token = counters["tgi_request_mean_time_per_token_duration"]["sum"] * 1000 / num_requests
    prefill_time = counters["tgi_batch_inference_duration"]["prefill_sum"]
    decode_time = counters["tgi_batch_inference_duration"]["decode_sum"]
    total_time = prefill_time + decode_time
    decode_tokens = counters["tgi_request_generated_tokens"]["sum"]
    avg_queue_duration = counters["tgi_request_queue_duration"]["sum"] / num_requests

    return {
        "queue_size": queue_size,
        "batch_size": batch_size,
        "requests": num_requests,
        "avg_input_tokens": input_tokens / num_requests,
        "avg_time_per_token": avg_time_per_token,
        "throughput": (input_tokens + decode_tokens) / total_time,
        "prefill_throughput": input_tokens / prefill_time,
        "decode_throughput": decode_tokens / decode_time,
        "avg_time_to_first_token": avg_queue_duration + (prefill_time / num_requests),
    }


# Query each TGI replica directly, bypassing the load balancer
results = []
for port in [8081, 8082, 8083]:
    results.append(get_node_results(f"http://0.0.0.0:{port}"))

# Sum metrics across replicas; average the per-request ("avg_*") ones instead
for metric in results[0]:
    value = sum([result[metric] for result in results])
    if metric.startswith("avg"):
        value /= len(results)
    print(f"{metric} : {value:.3f}")
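As with the CSV helper, this script's file name is not shown in the diff; assuming it is saved as `tgi_metrics.py`, it would be run on the host while the benchmark drives traffic:

```shell
$ python tgi_metrics.py
```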