This repository has been archived by the owner on May 28, 2024. It is now read-only.

Release 0.5.0 #111

Merged · 20 commits · Jan 18, 2024
12 changes: 9 additions & 3 deletions Dockerfile
@@ -1,7 +1,7 @@
# syntax=docker/dockerfile:1.4

ARG RAY_IMAGE="anyscale/ray"
ARG RAY_TAG="2.7.0oss-py39-cu118"
ARG RAY_TAG="2.9.0-py39-cu121"

# Use Anyscale base image
FROM ${RAY_IMAGE}:${RAY_TAG} AS aviary
@@ -16,18 +16,24 @@ ARG RAY_GID=100
ENV RAY_SERVE_ENABLE_NEW_HANDLE_API=1
ENV RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1
ENV RAY_SERVE_ENABLE_JSON_LOGGING=1
ENV RAY_SERVE_PROXY_PREFER_LOCAL_NODE_ROUTING=1
ENV RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S=310
ENV RAY_metrics_report_batch_size=400

ENV FORCE_CUDA=1
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV SAFETENSORS_FAST_GPU=1
ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH
ENV OMPI_ALLOW_RUN_AS_ROOT=1
ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1

# Remove this line if we need the CUDA packages
# and NVIDIA fixes their repository #ir-gleaming-sky
RUN sudo rm -v /etc/apt/sources.list.d/cuda.list

# Install torch first
RUN pip install --no-cache-dir -U pip \
&& pip install --no-cache-dir -i https://download.pytorch.org/whl/cu118 torch torchvision torchaudio \
&& pip install --no-cache-dir -i https://download.pytorch.org/whl/cu121 torch~=2.1.0 torchvision torchaudio \
&& pip install --no-cache-dir tensorboard ninja

# The build context should be the root of the repo
@@ -40,7 +46,7 @@ COPY --chown=${RAY_UID}:${RAY_GID} "./models/README.md" "${RAY_MODELS_DIR}/READM
RUN cd "${RAY_DIST_DIR}" \
# Update accelerate so transformers doesn't complain.
&& pip install --no-cache-dir -U accelerate \
&& pip install --no-cache-dir -U "$(ls aviary-*.whl | head -n1)[frontend,backend]" \
&& pip install --no-cache-dir -U "$(ls rayllm-*.whl | head -n1)[frontend,backend]" \
# Purge caches
&& pip cache purge || true \
&& conda clean -a \
5 changes: 3 additions & 2 deletions README.md
@@ -15,10 +15,11 @@ a variety of open source LLMs, built on [Ray Serve](https://docs.ray.io/en/lates
- Fully supporting multi-GPU & multi-node model deployments.
- Offering high performance features like continuous batching, quantization and streaming.
- Providing a REST API that is similar to OpenAI's, making it easy to migrate workloads and cross-test models.
- Supporting multiple LLM backends out of the box, including [vLLM](https://github.com/vllm-project/vllm) and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).

In addition to LLM serving, it also includes a CLI and a web frontend (Aviary Explorer) that you can use to compare the outputs of different models directly, rank them by quality, get a cost and latency estimate, and more.

RayLLM supports continuous batching and quantization by integrating with [vLLM](https://github.com/vllm-project/vllm). Continuous batching allows you to get much better throughput and latency than static batching. Quantization allows you to deploy compressed models with cheaper hardware requirements and lower inference costs. See [quantization guide](models/continuous_batching/quantization/README.md) for more details on running quantized models on RayLLM.
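
Because the REST API mirrors OpenAI's chat completions schema, an existing OpenAI-style client call usually only needs a different base URL. Below is a minimal sketch, assuming a RayLLM deployment that exposes the OpenAI-compatible route at `http://localhost:8000/v1` and serves `meta-llama/Llama-2-7b-chat-hf`; adjust the URL, token, and model id to your own deployment.

```python
import requests

# Hypothetical local deployment; change the URL, token, and model id to match yours.
BASE_URL = "http://localhost:8000/v1"
API_KEY = "not-needed-for-local-deployments"

response = requests.post(
    f"{BASE_URL}/chat/completions",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "model": "meta-llama/Llama-2-7b-chat-hf",
        "messages": [{"role": "user", "content": "What is Ray Serve?"}],
        "temperature": 0.7,
    },
    timeout=60,
)
response.raise_for_status()
# The response follows the OpenAI chat completions schema.
print(response.json()["choices"][0]["message"]["content"])
```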

RayLLM leverages [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), which has native support for autoscaling
and multi-node deployments. RayLLM can scale to zero and create
@@ -368,4 +369,4 @@ Feel free to post an issue first to get our feedback on a proposal first, or just

We use `pre-commit` hooks to ensure that all code is formatted correctly.
Make sure to `pip install pre-commit` and then run `pre-commit install`.
You can also run `./format` to run the hooks manually.
36 changes: 30 additions & 6 deletions models/README.md
@@ -30,19 +30,43 @@ Ray Actors during deployments (using `ray_actor_options`). We recommend using th

Engine is the abstraction for interacting with a model. It is responsible for scheduling and running the model inside a Ray Actor worker group.

The `engine_config` section specifies the Hugging Face model ID (`model_id`), how to initialize it and what parameters to use when generating tokens with an LLM.
The `engine_config` section specifies the model ID (`model_id`), how to initialize it and what parameters to use when generating tokens with an LLM.

RayLLM supports continuous batching, meaning incoming requests are processed as soon as they arrive, and can be added to batches that are already being processed. This means that the model is not slowed down by certain sentences taking longer to generate than others. RayLLM also supports quantization, meaning compressed models can be deployed with cheaper hardware requirements. For more details on using quantized models in RayLLM, see the [quantization guide](continuous_batching/quantization/README.md).

#### vLLM Engine Config
* `model_id` is the ID that refers to the model in the RayLLM or OpenAI API.
* `type` is the type of inference engine. Only `VLLMEngine` is currently supported.
* `engine_kwargs` and `max_total_tokens` are configuration options for the inference engine (e.g. gpu memory utilization, quantization, max number of concurrent sequences). These options may vary depending on the hardware accelerator type and model size. We have tuned the parameters in the configuration files included in RayLLM for you to use as reference.
* `type` is the type of inference engine. `VLLMEngine`, `TRTLLMEngine` and `EmbeddingEngine` are currently supported.
* `engine_kwargs` and `max_total_tokens` are configuration options for the inference engine (e.g. `gpu_memory_utilization`, `quantization`, `max_num_seqs`, and so on; see [more options](https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py#L11)). These options may vary depending on the hardware accelerator type and model size. We have tuned the parameters in the configuration files included in RayLLM for you to use as a reference.
* `generation` contains configurations related to default generation parameters, such as `prompt_format` and `stopping_sequences` (a sketch of how a prompt format is applied follows this list).
* `hf_model_id` is the Hugging Face model ID. This can also be a path to a local directory. If not specified, defaults to `model_id`.
* `runtime_env` is a dictionary that contains Ray runtime environment configuration. It allows you to set per-model pip packages and environment variables. See [Ray documentation on Runtime Environments](https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments) for more information.
* `s3_mirror_config` is a dictionary that contains configuration for loading the model from S3 instead of Hugging Face Hub. You can use this to speed up downloads.
* `gcs_mirror_config` is a dictionary that contains configuration for loading the model from Google Cloud Storage instead of Hugging Face Hub. You can use this to speed up downloads.
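
As a concrete illustration of how the `prompt_format` fields fit together, here is a rough sketch in plain Python of how a Llama-2-style format with `system_in_user: true` could be applied. This is a simplified illustration, not RayLLM's actual implementation; the field values are copied from the example configs in this repository, and `render_prompt` is a hypothetical helper.

```python
# Illustrative only -- a simplified assembly of a Llama-2-style prompt_format.
# The real engine also handles multi-turn conversations and stopping sequences.
prompt_format = {
    "system": "<<SYS>>\n{instruction}\n<</SYS>>\n\n",
    "user": "[INST] {system}{instruction} [/INST]",
    "trailing_assistant": "",
    "system_in_user": True,
    "default_system_message": "",
}


def render_prompt(system_message: str, user_message: str) -> str:
    # Render the system block, falling back to the default system message.
    system_message = system_message or prompt_format["default_system_message"]
    system_text = (
        prompt_format["system"].format(instruction=system_message)
        if system_message
        else ""
    )
    # With system_in_user=True, the rendered system block is substituted
    # into the {system} slot of the user template.
    user_text = prompt_format["user"].format(
        system=system_text, instruction=user_message
    )
    return user_text + prompt_format["trailing_assistant"]


print(render_prompt("You are a helpful assistant.", "What is Ray Serve?"))
```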

#### TRTLLM Engine Config
* `model_local_path` is the path to the TensorRT-LLM model directory.
* `s3_mirror_config` is a dictionary that contains configuration for loading the model from S3 instead of Hugging Face Hub. You can use this to speed up downloads.
* `generation` contains configurations related to default generation parameters such as `prompt_format` and `stopping_sequences`.
* `scheduler_policy` selects the scheduler policy, either `max_utilization` or `guaranteed_no_evict`.
(`MAX_UTILIZATION` packs as many requests as the underlying TRT engine can support into each iteration of the in-flight batching generation loop. While this is expected to maximize GPU throughput, it might require that some requests be paused and restarted depending on peak KV cache memory availability.
`GUARANTEED_NO_EVICT` uses the KV cache more conservatively, guaranteeing that a request, once started, will run to completion without eviction.)
* `logger_level` configures the log level for the TensorRT-LLM engine ("INFO", "ERROR", "VERBOSE", or "WARNING").
* `max_num_sequences` is the maximum number of requests/sequences for which the backend can maintain state.
* `max_tokens_in_paged_kv_cache` sets the maximum number of tokens held in the paged KV cache.
* `kv_cache_free_gpu_mem_fraction` sets the fraction of free GPU memory that may be used for the KV cache.

#### Embedding Engine Config
* `model_id` is the ID that refers to the model in the RayLLM or OpenAI API.
* `type` is the type of inference engine. `VLLMEngine`, `TRTLLMEngine` and `EmbeddingEngine` are currently supported.
* `hf_model_id` is the Hugging Face model ID. This can also be a path to a local directory. If not specified, defaults to `model_id`.
* `max_total_tokens` sets the maximum length, in tokens, of each query.
* `max_batch_size` sets the maximum batch size when queries are batched in the backend.

#### Prepare TensorRT-LLM models
You can follow the [TensorRT-LLM Llama example](https://github.com/NVIDIA/TensorRT-LLM/tree/v0.6.1/examples/llama) to generate the model. After generating the model, you can upload the model artifacts to S3 and use `s3_mirror_config` to load the model from S3 (a sketch of one way to upload follows), or place the model artifacts in a local directory and use `model_local_path` to load the model from that directory. See the [llama example](continuous_batching/trtllm-meta-llama--Llama-2-7b-chat-hf.yaml) for more details.
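
For example, once the TensorRT-LLM engine directory has been built, one way to upload it is with `boto3`. This is a sketch with a placeholder bucket and paths; any S3 tooling (such as `aws s3 sync`) works equally well.

```python
import os

import boto3

# Placeholders -- replace with your own build output directory and bucket.
local_model_dir = "/tmp/trtllm-llama2-7b"
bucket = "YOUR_BUCKET_NAME"
prefix = "llama2-7b-tp2"

s3 = boto3.client("s3")
for root, _dirs, files in os.walk(local_model_dir):
    for name in files:
        local_path = os.path.join(root, name)
        # Preserve the directory layout under the chosen prefix.
        key = f"{prefix}/{os.path.relpath(local_path, local_model_dir)}"
        s3.upload_file(local_path, bucket, key)
        print(f"uploaded s3://{bucket}/{key}")

# The resulting s3://YOUR_BUCKET_NAME/llama2-7b-tp2/ URI can then be used as
# engine_config.s3_mirror_config.bucket_uri in the model config.
```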


### Scaling config

Finally, the `scaling_config` section specifies what resources should be used to serve the model - this corresponds to Ray AIR [ScalingConfig](https://docs.ray.io/en/latest/train/api/doc/ray.train.ScalingConfig.html). Note that the `scaling_config` applies to each model replica, and not the entire model deployment (in other words, each replica will have `num_workers` workers).
@@ -88,7 +112,7 @@ engine_config:
model_id: mosaicml/mpt-7b-instruct
# Id of the model on Hugging Face Hub. Can also be a disk path. Defaults to model_id if not specified.
hf_model_id: mosaicml/mpt-7b-instruct
# vLLM keyword arguments passed when constructing the model.
# LLM engine keyword arguments passed when constructing the model.
engine_kwargs:
trust_remote_code: true
# Optional Ray Runtime Environment configuration. See Ray documentation for more details.
@@ -151,4 +175,4 @@ engine_config:
s3_mirror_config:
bucket_uri: s3://YOUR_BUCKET_NAME/YOUR_MODEL_FOLDER

```
@@ -40,4 +40,4 @@ scaling_config:
num_cpus_per_worker: 8
placement_strategy: "STRICT_PACK"
resources_per_worker:
accelerator_type_a100_40g: 0.01
@@ -31,7 +31,7 @@ engine_config:
user: "[INST] {system}{instruction} [/INST]"
system_in_user: true
default_system_message: ""
stopping_sequences: ["<unk>"]
stopping_sequences: []
scaling_config:
num_workers: 2
num_gpus_per_worker: 1
@@ -29,7 +29,7 @@ engine_config:
user: "[INST] {system}{instruction} [/INST]"
system_in_user: true
default_system_message: ""
stopping_sequences: ["<unk>"]
stopping_sequences: []
scaling_config:
num_workers: 1
num_gpus_per_worker: 1
@@ -29,7 +29,7 @@ engine_config:
user: "[INST] {system}{instruction} [/INST]"
system_in_user: true
default_system_message: ""
stopping_sequences: ["<unk>"]
stopping_sequences: []
scaling_config:
num_workers: 4
num_gpus_per_worker: 1
@@ -21,7 +21,7 @@ engine_config:
trust_remote_code: true
max_num_batched_tokens: 4096
max_num_seqs: 64
gpu_memory_utilization: 0.9
gpu_memory_utilization: 0.95
max_total_tokens: 4096
generation:
prompt_format:
@@ -31,7 +31,7 @@ engine_config:
user: "[INST] {system}{instruction} [/INST]"
system_in_user: true
default_system_message: ""
stopping_sequences: ["<unk>"]
stopping_sequences: []
scaling_config:
num_workers: 1
num_gpus_per_worker: 1
@@ -0,0 +1,39 @@
deployment_config:
  autoscaling_config:
    min_replicas: 1
    initial_replicas: 1
    max_replicas: 8
    target_num_ongoing_requests_per_replica: 24
    metrics_interval_s: 10.0
    look_back_period_s: 30.0
    smoothing_factor: 0.5
    downscale_delay_s: 300.0
    upscale_delay_s: 15.0
  max_concurrent_queries: 64
  ray_actor_options:
    resources:
      accelerator_type_a10: 0.01
engine_config:
  model_id: meta-llama/Llama-2-70b-chat-hf
  type: TRTLLMEngine
  s3_mirror_config:
    bucket_uri: s3://trtllm-models/llama2-70b-tp2/ # Change to your own model s3 path
  # If you want to test with local file path, you can comment out s3_mirror_config section
  # and add following
  # model_local_path: <your local path>
  generation:
    prompt_format:
      system: "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
      assistant: " {instruction} </s><s>"
      trailing_assistant: ""
      user: "[INST] {system}{instruction} [/INST]"
      system_in_user: true
      default_system_message: ""
    stopping_sequences: ["<unk>"]
scaling_config:
  num_workers: 8 # mpi size
  num_gpus_per_worker: 1
  num_cpus_per_worker: 8
  placement_strategy: "STRICT_PACK"
  resources_per_worker:
    accelerator_type_a10: 0.01
@@ -0,0 +1,39 @@
deployment_config:
  autoscaling_config:
    min_replicas: 1
    initial_replicas: 1
    max_replicas: 8
    target_num_ongoing_requests_per_replica: 24
    metrics_interval_s: 10.0
    look_back_period_s: 30.0
    smoothing_factor: 0.5
    downscale_delay_s: 300.0
    upscale_delay_s: 15.0
  max_concurrent_queries: 64
  ray_actor_options:
    resources:
      accelerator_type_a10: 0.01
engine_config:
  model_id: meta-llama/Llama-2-7b-chat-hf
  type: TRTLLMEngine
  s3_mirror_config:
    bucket_uri: s3://trtllm-models/llama2-7b-tp2/ # Change to your own model s3 path
  # If you want to test with local file path, you can comment out s3_mirror_config section
  # and add following
  # model_local_path: <your local path>
  generation:
    prompt_format:
      system: "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
      assistant: " {instruction} </s><s>"
      trailing_assistant: ""
      user: "[INST] {system}{instruction} [/INST]"
      system_in_user: true
      default_system_message: ""
    stopping_sequences: ["<unk>"]
scaling_config:
  num_workers: 2 # mpi size
  num_gpus_per_worker: 1
  num_cpus_per_worker: 8
  placement_strategy: "STRICT_PACK"
  resources_per_worker:
    accelerator_type_a10: 0.01
21 changes: 11 additions & 10 deletions rayllm/backend/llm/dict_utils.py
@@ -1,14 +1,15 @@
def merge_dicts(overwrite: dict, base: dict) -> dict:
def merge_dicts(base: dict, overwrite: dict) -> dict:
    """
    Merge two dictionaries recursively, with keys from overwrite taking precedence.
    Merge overwrite into base. Modify base inplace.
    """
    base = base.copy()
    for key, value in overwrite.items():
        if isinstance(value, dict):
            # get node or create one
            node = base.setdefault(key, {})
            merge_dicts(value, node)
        else:
            base[key] = value

    for key in overwrite:
        if (
            key in base
            and isinstance(base[key], dict)
            and isinstance(overwrite[key], dict)
        ):
            merge_dicts(base[key], overwrite[key])
        else:
            base[key] = overwrite[key]
    return base
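
For clarity, a quick illustration of the new behavior (not part of this diff): keys from `overwrite` win, nested dictionaries are merged recursively, and `base` is now modified in place rather than copied.

```python
# Illustrative usage of the new merge_dicts defined above.
base = {"a": 1, "nested": {"x": 1, "y": 2}}
overwrite = {"b": 2, "nested": {"y": 20, "z": 30}}

merged = merge_dicts(base, overwrite)
print(merged)          # {'a': 1, 'nested': {'x': 1, 'y': 20, 'z': 30}, 'b': 2}
print(merged is base)  # True -- unlike the old version, base is mutated in place
```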