Merge branch 'main' into main
airaria authored Apr 13, 2023
2 parents 53abdbc + 4232d29 commit 0b6718a
Showing 2 changed files with 190 additions and 0 deletions.
39 changes: 39 additions & 0 deletions README.md
@@ -287,6 +287,45 @@ python server.py --model llama-7b-hf --lora chinese-alpaca-lora-7b

```

### Inference with Transformers

To quickly try out the models without installing any additional libraries or Python packages, you can use [scripts/inference_hf.py](scripts/inference_hf.py) to launch a model without quantization. The script supports single-device inference on either CPU or GPU. Taking the Chinese-Alpaca 7B model as an example, run the script as follows:

(**Because decoding is implemented differently across frameworks, this script is not guaranteed to reproduce the decoding results of llama.cpp.**)

```
CUDA_VISIBLE_DEVICES={device_id} python scripts/inference_hf.py \
    --base_model path_to_original_llama_hf_dir \
    --lora_model path_to_chinese_llama_or_alpaca_lora \
    --with_prompt \
    --interactive
```

If you have already merged the LoRA weights with the `merge_llama_with_chinese_lora_to_hf.py` script, there is no need to specify `lora_model`, and the launch command is simpler:

```
CUDA_VISIBLE_DEVICES={device_id} python scripts/inference_hf.py \
    --base_model path_to_merged_llama_or_alpaca_hf_dir \
    --with_prompt \
    --interactive
```

The arguments, along with other optional ones, are described below:

* `{device_id}`: the CUDA device ID. If left empty, inference runs on the CPU.
* `--base_model {base_model}`: directory containing the LLaMA model weights and configuration files in HF format.
* `--lora_model {lora_model}`: directory containing the extracted Chinese LLaMA/Alpaca LoRA files; a [🤗Model Hub model name](#Model-Hub) can also be used. If this argument is omitted, only the `base_model` is loaded.
* `--tokenizer_path {tokenizer_path}`: directory containing the corresponding tokenizer. If omitted, it defaults to `lora_model`; if `lora_model` is also omitted, it defaults to `base_model`.
* `--with_prompt`: whether to wrap the input in the prompt template. **Be sure to enable this option when loading an Alpaca model!**
* `--interactive`: launch in interactive mode. **Unlike llama.cpp, this script does not carry context across turns of a multi-turn conversation.**
* `--data_file {file_name}`: in non-interactive mode, read `file_name` line by line and run prediction on each line (see the example after this list).
* `--predictions_file {file_name}`: in non-interactive mode, write the prediction results to `file_name` in JSON format.
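
For example, a non-interactive batch run could look like the following (a minimal sketch: `my_instructions.txt` and `output/predictions.json` are placeholder file names chosen only for illustration):

```
CUDA_VISIBLE_DEVICES={device_id} python scripts/inference_hf.py \
    --base_model path_to_merged_llama_or_alpaca_hf_dir \
    --with_prompt \
    --data_file my_instructions.txt \
    --predictions_file output/predictions.json
```

Each line of `my_instructions.txt` is treated as one instruction; the predictions are written to `output/predictions.json`, and the decoding parameters used are saved next to it as `generation_config.json`.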

⚠️ **Note: this script is intended only for quick experimentation and has not been optimized for multi-GPU, low-RAM, or low-VRAM setups.** ⚠️

⚠️ **When running 7B model inference on the CPU, make sure you have 32GB of RAM; when running it on the GPU, make sure you have 20GB of VRAM.** ⚠️



## System Performance

151 changes: 151 additions & 0 deletions scripts/inference_hf.py
@@ -0,0 +1,151 @@
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel
import argparse
import json, os
parser = argparse.ArgumentParser()
parser.add_argument('--base_model', default=None, type=str, required=True)
parser.add_argument('--lora_model', default=None, type=str, help="If None, perform inference on the base model")
parser.add_argument('--tokenizer_path', default=None, type=str)
parser.add_argument('--data_file', default=None, type=str, help="file that contains instructions (one instruction per line).")
parser.add_argument('--with_prompt', action='store_true')
parser.add_argument('--interactive', action='store_true')
parser.add_argument('--predictions_file', default='./predictions.json', type=str)
args = parser.parse_args()

generation_config = dict(
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.3,  # Transformers' name for what llama.cpp calls repeat_penalty
    max_new_tokens=400
)


# The prompt template below is taken from llama.cpp
# and is slightly different from the one used in training.
# But we find it gives better results
prompt_input = (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)

sample_data = ["为什么要减少污染,保护环境?"]

def generate_prompt(instruction, input=None):
    if input:
        instruction = instruction + '\n' + input
    return prompt_input.format_map({'instruction': instruction})


if __name__ == '__main__':
    load_type = torch.float16
    if torch.cuda.is_available():
        device = torch.device(0)
    else:
        device = torch.device('cpu')
    if args.tokenizer_path is None:
        args.tokenizer_path = args.lora_model
        if args.lora_model is None:
            args.tokenizer_path = args.base_model
    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path)

    base_model = LlamaForCausalLM.from_pretrained(
        args.base_model,
        load_in_8bit=False,
        torch_dtype=load_type,
        low_cpu_mem_usage=True,
    )

    model_vocab_size = base_model.get_input_embeddings().weight.size(0)
    tokenizer_vocab_size = len(tokenizer)
    print(f"Vocab of the base model: {model_vocab_size}")
    print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")
    if model_vocab_size != tokenizer_vocab_size:
        # the Chinese tokenizer extends the original vocab, so the embedding matrix must grow to match
        assert tokenizer_vocab_size > model_vocab_size
        print("Resize model embeddings to fit tokenizer")
        base_model.resize_token_embeddings(tokenizer_vocab_size)
    if args.lora_model is not None:
        print("loading peft model")
        model = PeftModel.from_pretrained(base_model, args.lora_model, torch_dtype=load_type)
    else:
        model = base_model

    if device == torch.device('cpu'):
        model.float()
    # test data
    if args.data_file is None:
        examples = sample_data
    else:
        with open(args.data_file, 'r') as f:
            examples = [l.strip() for l in f.readlines()]
        print("first 10 examples:")
        for example in examples[:10]:
            print(example)

    model.to(device)
    model.eval()


    with torch.no_grad():
        if args.interactive:
            while True:
                raw_input_text = input("Input:")
                if len(raw_input_text.strip()) == 0:
                    break
                if args.with_prompt:
                    input_text = generate_prompt(instruction=raw_input_text)
                else:
                    input_text = raw_input_text
                inputs = tokenizer(input_text, return_tensors="pt")  # add_special_tokens=False ?
                generation_output = model.generate(
                    input_ids=inputs["input_ids"].to(device),
                    attention_mask=inputs['attention_mask'].to(device),
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    **generation_config
                )
                s = generation_output[0]
                output = tokenizer.decode(s, skip_special_tokens=True)
                if args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print("Response: ", response)
                print("\n")
        else:
            results = []
            for index, example in enumerate(examples):
                if args.with_prompt is True:
                    input_text = generate_prompt(instruction=example)
                else:
                    input_text = example
                inputs = tokenizer(input_text, return_tensors="pt")  # add_special_tokens=False ?
                generation_output = model.generate(
                    input_ids=inputs["input_ids"].to(device),
                    attention_mask=inputs['attention_mask'].to(device),
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    **generation_config
                )
                s = generation_output[0]
                output = tokenizer.decode(s, skip_special_tokens=True)
                if args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print(f"======={index}=======")
                print(f"Input: {example}\n")
                print(f"Output: {response}\n")

                results.append({"Input": input_text, "Output": response})

            dirname = os.path.dirname(args.predictions_file) or '.'  # fall back to the current directory for bare file names
            os.makedirs(dirname, exist_ok=True)
            with open(args.predictions_file, 'w') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            with open(dirname + '/generation_config.json', 'w') as f:
                json.dump(generation_config, f, ensure_ascii=False, indent=2)
