diff --git a/models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml b/models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml
new file mode 100644
index 00000000..b49c201f
--- /dev/null
+++ b/models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml
@@ -0,0 +1,40 @@
+deployment_config:
+  autoscaling_config:
+    min_replicas: 1
+    initial_replicas: 1
+    max_replicas: 2
+    target_num_ongoing_requests_per_replica: 100
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 0.5
+    downscale_delay_s: 300.0
+    upscale_delay_s: 10.0
+  max_concurrent_queries: 256
+  ray_actor_options:
+    resources:
+      accelerator_type_a100_80g: 0.01
+engine_config:
+  model_id: TheBloke/Llama-2-70B-chat-AWQ
+  hf_model_id: TheBloke/Llama-2-70B-chat-AWQ
+  type: VLLMEngine
+  engine_kwargs:
+    quantization: awq
+    max_num_batched_tokens: 65536
+    max_num_seqs: 256
+  max_total_tokens: 4096
+  generation:
+    prompt_format:
+      system: "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
+      assistant: " {instruction} </s><s>"
+      trailing_assistant: ""
+      user: "[INST] {system}{instruction} [/INST]"
+      system_in_user: true
+      default_system_message: ""
+    stopping_sequences: ["<unk>"]
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 1
+  num_cpus_per_worker: 8
+  placement_strategy: "STRICT_PACK"
+  resources_per_worker:
+    accelerator_type_a100_80g: 0.01
diff --git a/serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml b/serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml
new file mode 100644
index 00000000..523ed0d0
--- /dev/null
+++ b/serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml
@@ -0,0 +1,7 @@
+applications:
+- name: ray-llm
+  route_prefix: /
+  import_path: rayllm.backend:router_application
+  args:
+    models:
+      - "./models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml"
\ No newline at end of file
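
Once this serve config is deployed (for example with serve run serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml), the router application serves the model behind an OpenAI-compatible endpoint. Below is a minimal client sketch, assuming the Serve app is reachable at http://localhost:8000 and using the openai Python client (v1 interface); the base URL, port, and placeholder API key are assumptions, not part of this diff.

# Minimal client sketch (assumptions: the Serve app from the config
# above is running locally on port 8000, and the router exposes the
# OpenAI-compatible /v1 API).
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",  # assumed local Serve endpoint
    api_key="placeholder",                # assumed not validated by the router
)

response = client.chat.completions.create(
    # Must match the model_id declared in the engine_config above.
    model="TheBloke/Llama-2-70B-chat-AWQ",
    messages=[{"role": "user", "content": "What does AWQ quantization do?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)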