Skip to content
This repository has been archived by the owner on May 28, 2024. It is now read-only.

Commit

Permalink
Merge pull request #82 from YQ-Wang/awq-model
Browse files Browse the repository at this point in the history
Add AWQ Quantized Llama 2 70B Model Config & Update README
  • Loading branch information
shrekris-anyscale authored Nov 13, 2023
2 parents 8fd2dc9 + 98bad43 commit ae910a2
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 0 deletions.
40 changes: 40 additions & 0 deletions models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Model config for TheBloke/Llama-2-70B-chat-AWQ served via vLLM with AWQ
# 4-bit quantization. NOTE(review): indentation reconstructed from the rayllm
# model-config schema — the source paste had lost all nesting; verify against
# a sibling config file in models/continuous_batching/.
deployment_config:
  autoscaling_config:
    min_replicas: 1
    initial_replicas: 1
    max_replicas: 2
    # Replica count scales to keep ongoing requests per replica near this target.
    target_num_ongoing_requests_per_replica: 100
    metrics_interval_s: 10.0
    look_back_period_s: 30.0
    smoothing_factor: 0.5
    # Downscale slowly (5 min) but upscale quickly (10 s).
    downscale_delay_s: 300.0
    upscale_delay_s: 10.0
  max_concurrent_queries: 256
  ray_actor_options:
    resources:
      # Fractional custom resource: pins the actor to an A100-80G node
      # without reserving a whole GPU (the worker below takes the GPU).
      accelerator_type_a100_80g: 0.01
engine_config:
  model_id: TheBloke/Llama-2-70B-chat-AWQ
  hf_model_id: TheBloke/Llama-2-70B-chat-AWQ
  type: VLLMEngine
  engine_kwargs:
    # AWQ 4-bit weights let the 70B model fit on a single 80 GB GPU.
    quantization: awq
    max_num_batched_tokens: 65536
    max_num_seqs: 256
  # Per-request cap on prompt + generated tokens (Llama 2 context window).
  max_total_tokens: 4096
  generation:
    # Llama-2-chat prompt template; system block is folded into the user turn.
    prompt_format:
      system: "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
      assistant: " {instruction} </s><s>"
      trailing_assistant: ""
      user: "[INST] {system}{instruction} [/INST]"
      system_in_user: true
      default_system_message: ""
    stopping_sequences: ["<unk>"]
scaling_config:
  num_workers: 1
  num_gpus_per_worker: 1
  num_cpus_per_worker: 8
  placement_strategy: "STRICT_PACK"
  resources_per_worker:
    accelerator_type_a100_80g: 0.01
7 changes: 7 additions & 0 deletions serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Ray Serve config: deploys the rayllm router serving the AWQ-quantized
# Llama 2 70B chat model defined in models/continuous_batching/.
# NOTE(review): nesting reconstructed — the source paste had lost indentation;
# verify against a sibling file in serve_configs/.
applications:
  - name: ray-llm
    route_prefix: /
    import_path: rayllm.backend:router_application
    args:
      models:
        - "./models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml"

0 comments on commit ae910a2

Please sign in to comment.