run.sh · 95 lines (88 loc) · 3.8 KB
#!/bin/bash
# Check that Docker and Docker Compose are installed
if ! command -v docker &> /dev/null || ! command -v docker-compose &> /dev/null
then
    echo "Docker and Docker Compose could not be found. Please install them and try again."
    exit 1
fi
# Parse command line arguments for model value
while [[ "$#" -gt 0 ]]; do
    case $1 in
        --model) model="$2"; shift ;;
        *) echo "Unknown parameter passed: $1"; exit 1 ;;
    esac
    shift
done
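# For reference, a typical invocation of this script might look like the following
# (assuming it is run from the repository root where the docker-compose files live):
#   ./run.sh --model 13b
#   ./run.sh --model code-34b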
# Check if a model value was provided
if [ -z "$model" ]
then
    echo "No model value provided. Defaulting to 7b. To use a different model, exit the script and rerun it with --model."
    echo "Supported models are 7b, 13b, 70b, code-7b, code-13b, code-34b."
    model="7b"
fi
model_type="gguf"
# Export the selected model's settings as environment variables
case $model in
    7b)
        export MODEL_NAME="llama-2-7b-chat.bin"
        export MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin"
        export WAIT_TIMEOUT=3600
        export N_GQA=1
        model_type="ggml"
        ;;
    13b)
        export MODEL_NAME="llama-2-13b-chat.bin"
        export MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Nous-Hermes-Llama2-GGML/resolve/main/nous-hermes-llama2-13b.ggmlv3.q4_0.bin"
        export WAIT_TIMEOUT=10800
        export N_GQA=1
        model_type="ggml"
        ;;
    70b)
        export MODEL_NAME="llama-2-70b-chat.bin"
        export MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Nous-Hermes-Llama2-70B-GGML/resolve/main/nous-hermes-llama2-70b.ggmlv3.Q4_0.bin"
        export WAIT_TIMEOUT=21600
        # Llama 2 70B's grouping factor is 8, compared to 1 for the 7B and 13B models.
        # It is currently not possible to set this via --n_gqa with llama-cpp-python
        # from run.sh, so we expose it as an environment variable.
        # See: https://github.com/abetlen/llama-cpp-python/issues/528
        # and: https://github.com/facebookresearch/llama/issues/407
        export N_GQA=8
        model_type="ggml"
        ;;
    code-7b)
        export MODEL_NAME="code-llama-7b-chat.gguf"
        export MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q4_K_M.gguf"
        export WAIT_TIMEOUT=3600
        export DEFAULT_SYSTEM_PROMPT="You are a helpful coding assistant. Use markdown when responding with code."
        export N_GQA=1
        ;;
    code-13b)
        export MODEL_NAME="code-llama-13b-chat.gguf"
        export MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/CodeLlama-13B-Instruct-GGUF/resolve/main/codellama-13b-instruct.Q4_K_M.gguf"
        export DEFAULT_SYSTEM_PROMPT="You are a helpful coding assistant. Use markdown when responding with code."
        export WAIT_TIMEOUT=10800
        export N_GQA=1
        ;;
    code-34b)
        export MODEL_NAME="code-llama-34b-chat.gguf"
        export MODEL_DOWNLOAD_URL="https://huggingface.co/TheBloke/Phind-CodeLlama-34B-v1-GGUF/resolve/main/phind-codellama-34b-v1.Q4_K_M.gguf"
        export DEFAULT_SYSTEM_PROMPT="You are a helpful coding assistant. Use markdown when responding with code."
        export WAIT_TIMEOUT=21600
        # Code Llama 34B's grouping factor is 8, compared to 1 for the 7B and 13B models.
        # It is currently not possible to set this via --n_gqa with llama-cpp-python
        # from run.sh, so we expose it as an environment variable.
        # See: https://github.com/abetlen/llama-cpp-python/issues/528
        export N_GQA=8
        ;;
    *)
        echo "Invalid model value provided. Supported models are 7b, 13b, 70b, code-7b, code-13b, code-34b."
        exit 1
        ;;
esac
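# At this point the selected model's settings (MODEL_NAME, MODEL_DOWNLOAD_URL,
# WAIT_TIMEOUT, N_GQA and, for the code models, DEFAULT_SYSTEM_PROMPT) have been
# exported, presumably so the docker-compose files below can substitute them.
# A quick sanity check could be added here if desired, for example:
#   echo "Selected model: $MODEL_NAME (timeout ${WAIT_TIMEOUT}s, n_gqa=$N_GQA)"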
# Run docker compose with docker-compose.yml (GGML models) or docker-compose-gguf.yml (GGUF models)
if [ "$model_type" = "ggml" ]
then
    docker-compose -f docker-compose.yml up --build
else
    docker-compose -f docker-compose-gguf.yml up --build
fi
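# Note: the commands above use the standalone docker-compose v1 binary. On
# installations that only ship the "docker compose" plugin, the equivalent
# invocation would presumably be, for example:
#   docker compose -f docker-compose-gguf.yml up --build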