diff --git a/models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml b/models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml
new file mode 100644
index 00000000..b49c201f
--- /dev/null
+++ b/models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml
@@ -0,0 +1,40 @@
+deployment_config:
+  autoscaling_config:
+    min_replicas: 1
+    initial_replicas: 1
+    max_replicas: 2
+    target_num_ongoing_requests_per_replica: 100
+    metrics_interval_s: 10.0
+    look_back_period_s: 30.0
+    smoothing_factor: 0.5
+    downscale_delay_s: 300.0
+    upscale_delay_s: 10.0
+  max_concurrent_queries: 256
+  ray_actor_options:
+    resources:
+      accelerator_type_a100_80g: 0.01
+engine_config:
+  model_id: TheBloke/Llama-2-70B-chat-AWQ
+  hf_model_id: TheBloke/Llama-2-70B-chat-AWQ
+  type: VLLMEngine
+  engine_kwargs:
+    quantization: awq
+    max_num_batched_tokens: 65536
+    max_num_seqs: 256
+  max_total_tokens: 4096
+  generation:
+    prompt_format:
+      system: "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
+      assistant: " {instruction} </s><s>"
+      trailing_assistant: ""
+      user: "[INST] {system}{instruction} [/INST]"
+      system_in_user: true
+      default_system_message: ""
+    stopping_sequences: ["<unk>"]
+scaling_config:
+  num_workers: 1
+  num_gpus_per_worker: 1
+  num_cpus_per_worker: 8
+  placement_strategy: "STRICT_PACK"
+  resources_per_worker:
+    accelerator_type_a100_80g: 0.01
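
The `prompt_format` block above composes the Llama-2 chat template: with `system_in_user: true`, the rendered system block is substituted into the `{system}` slot of the `user` template rather than emitted as a separate segment. A minimal sketch of that composition, using the template strings copied from the YAML (the helper itself is illustrative, not RayLLM's actual rendering code):

```python
# Illustrative reconstruction of the prompt_format above; the template
# strings come from the YAML, the render_turn helper is hypothetical.
SYSTEM_TMPL = "<<SYS>>\n{instruction}\n<</SYS>>\n\n"
USER_TMPL = "[INST] {system}{instruction} [/INST]"

def render_turn(user_msg: str, system_msg: str = "") -> str:
    # system_in_user: true -> inject the system block into the user turn.
    system = SYSTEM_TMPL.format(instruction=system_msg) if system_msg else ""
    return USER_TMPL.format(system=system, instruction=user_msg)

print(render_turn("What does AWQ quantization do?", "You are concise."))
# [INST] <<SYS>>
# You are concise.
# <</SYS>>
#
# What does AWQ quantization do? [/INST]
```
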
diff --git a/serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml b/serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml
new file mode 100644
index 00000000..523ed0d0
--- /dev/null
+++ b/serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml
@@ -0,0 +1,7 @@
+applications:
+- name: ray-llm
+  route_prefix: /
+  import_path: rayllm.backend:router_application
+  args:
+    models:
+      - "./models/continuous_batching/TheBloke--Llama-2-70B-chat-AWQ.yaml"
\ No newline at end of file
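
With both files in place, the application can be started with Ray Serve (`serve run serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml`) and queried over RayLLM's OpenAI-compatible HTTP API. A minimal query sketch, assuming Serve listens on its default `localhost:8000` and no auth is configured:

```python
# Minimal smoke test, assuming the app was started with
#   serve run serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml
# and that Serve is reachable on the default localhost:8000.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "TheBloke/Llama-2-70B-chat-AWQ",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64,
    },
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```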