Merge branch 'main' into main
airaria authored Apr 13, 2023
2 parents 53abdbc + 4232d29 commit 0b6718a
Showing 2 changed files with 190 additions and 0 deletions.
39 changes: 39 additions & 0 deletions README.md
@@ -287,6 +287,45 @@ python server.py --model llama-7b-hf --lora chinese-alpaca-lora-7b

```

### Inference with Transformers

To quickly try out the models without installing any additional libraries or Python packages, you can use [scripts/inference_hf.py](scripts/inference_hf.py) to launch a model without quantization. The script supports single-device inference on either CPU or GPU. Taking the Chinese-Alpaca 7B model as an example, run the script as follows:

(**Because decoding is implemented differently across frameworks, this script is not guaranteed to reproduce the decoding results of llama.cpp.**)

```
CUDA_VISIBLE_DEVICES={device_id} python scripts/inference_hf.py \
    --base_model path_to_original_llama_hf_dir \
    --lora_model path_to_chinese_llama_or_alpaca_lora \
    --with_prompt \
    --interactive
```

If you have already merged the LoRA weights with the `merge_llama_with_chinese_lora_to_hf.py` script, there is no need to specify `lora_model`, and the launch command is simpler:

```
CUDA_VISIBLE_DEVICES={device_id} python scripts/inference_hf.py \
    --base_model path_to_merged_llama_or_alpaca_hf_dir \
    --with_prompt \
    --interactive
```

The arguments, along with other optional ones, are described below:

* `{device_id}`: the CUDA device ID. If left empty, inference runs on the CPU.
* `--base_model {base_model}`: directory containing the LLaMA model weights and configuration files in HF format.
* `--lora_model {lora_model}`: directory containing the extracted Chinese LLaMA/Alpaca LoRA files; a [🤗Model Hub model name](#Model-Hub) can also be used. If this argument is omitted, only the `base_model` is loaded.
* `--tokenizer_path {tokenizer_path}`: directory containing the corresponding tokenizer. If omitted, it defaults to `lora_model`; if `lora_model` is also omitted, it defaults to `base_model`.
* `--with_prompt`: whether to wrap the input in the prompt template. **Be sure to enable this option when loading an Alpaca model!**
* `--interactive`: launch in interactive mode. **Unlike llama.cpp, this script does not carry context across turns of a multi-turn conversation.**
* `--data_file {file_name}`: in non-interactive mode, read `file_name` line by line and run prediction on each line (see the example after this list).
* `--predictions_file {file_name}`: in non-interactive mode, write the prediction results to `file_name` in JSON format.
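
For example, a non-interactive batch run could look like the following (a minimal sketch: `my_instructions.txt` and `output/predictions.json` are placeholder file names chosen only for illustration):

```
CUDA_VISIBLE_DEVICES={device_id} python scripts/inference_hf.py \
    --base_model path_to_merged_llama_or_alpaca_hf_dir \
    --with_prompt \
    --data_file my_instructions.txt \
    --predictions_file output/predictions.json
```

Each line of `my_instructions.txt` is treated as one instruction; the predictions are written to `output/predictions.json`, and the decoding parameters used are saved next to it as `generation_config.json`.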

⚠️ **Note: this script is intended only for quick experimentation and has not been optimized for multi-GPU, low-RAM, or low-VRAM setups.** ⚠️

⚠️ **When running 7B model inference on the CPU, make sure you have 32GB of RAM; when running it on the GPU, make sure you have 20GB of VRAM.** ⚠️



## System Performance

151 changes: 151 additions & 0 deletions scripts/inference_hf.py
@@ -0,0 +1,151 @@
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel
import argparse
import json, os
parser = argparse.ArgumentParser()
parser.add_argument('--base_model', default=None, type=str, required=True)
parser.add_argument('--lora_model', default=None, type=str, help="If None, perform inference on the base model")
parser.add_argument('--tokenizer_path', default=None, type=str)
parser.add_argument('--data_file', default=None, type=str, help="file that contains instructions (one instruction per line).")
parser.add_argument('--with_prompt', action='store_true')
parser.add_argument('--interactive', action='store_true')
parser.add_argument('--predictions_file', default='./predictions.json', type=str)
args = parser.parse_args()

generation_config = dict(
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.3,  # Transformers' name for what llama.cpp calls repeat_penalty
    max_new_tokens=400
)


# The prompt template below is taken from llama.cpp
# and is slightly different from the one used in training.
# But we find it gives better results
prompt_input = (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)

sample_data = ["为什么要减少污染,保护环境?"]

def generate_prompt(instruction, input=None):
    if input:
        instruction = instruction + '\n' + input
    return prompt_input.format_map({'instruction': instruction})


if __name__ == '__main__':
    load_type = torch.float16
    if torch.cuda.is_available():
        device = torch.device(0)
    else:
        device = torch.device('cpu')
    if args.tokenizer_path is None:
        args.tokenizer_path = args.lora_model
        if args.lora_model is None:
            args.tokenizer_path = args.base_model
    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path)

    base_model = LlamaForCausalLM.from_pretrained(
        args.base_model,
        load_in_8bit=False,
        torch_dtype=load_type,
        low_cpu_mem_usage=True,
    )

    model_vocab_size = base_model.get_input_embeddings().weight.size(0)
    tokenizer_vocab_size = len(tokenizer)
    print(f"Vocab of the base model: {model_vocab_size}")
    print(f"Vocab of the tokenizer: {tokenizer_vocab_size}")
    if model_vocab_size != tokenizer_vocab_size:
        # the Chinese tokenizer extends the original vocab, so the embedding matrix must grow to match
        assert tokenizer_vocab_size > model_vocab_size
        print("Resize model embeddings to fit tokenizer")
        base_model.resize_token_embeddings(tokenizer_vocab_size)
    if args.lora_model is not None:
        print("loading peft model")
        model = PeftModel.from_pretrained(base_model, args.lora_model, torch_dtype=load_type)
    else:
        model = base_model

    if device == torch.device('cpu'):
        model.float()
    # test data
    if args.data_file is None:
        examples = sample_data
    else:
        with open(args.data_file, 'r') as f:
            examples = [l.strip() for l in f.readlines()]
        print("first 10 examples:")
        for example in examples[:10]:
            print(example)

    model.to(device)
    model.eval()


    with torch.no_grad():
        if args.interactive:
            while True:
                raw_input_text = input("Input:")
                if len(raw_input_text.strip()) == 0:
                    break
                if args.with_prompt:
                    input_text = generate_prompt(instruction=raw_input_text)
                else:
                    input_text = raw_input_text
                inputs = tokenizer(input_text, return_tensors="pt")  # add_special_tokens=False ?
                generation_output = model.generate(
                    input_ids=inputs["input_ids"].to(device),
                    attention_mask=inputs['attention_mask'].to(device),
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    **generation_config
                )
                s = generation_output[0]
                output = tokenizer.decode(s, skip_special_tokens=True)
                if args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print("Response: ", response)
                print("\n")
        else:
            results = []
            for index, example in enumerate(examples):
                if args.with_prompt is True:
                    input_text = generate_prompt(instruction=example)
                else:
                    input_text = example
                inputs = tokenizer(input_text, return_tensors="pt")  # add_special_tokens=False ?
                generation_output = model.generate(
                    input_ids=inputs["input_ids"].to(device),
                    attention_mask=inputs['attention_mask'].to(device),
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    **generation_config
                )
                s = generation_output[0]
                output = tokenizer.decode(s, skip_special_tokens=True)
                if args.with_prompt:
                    response = output.split("### Response:")[1].strip()
                else:
                    response = output
                print(f"======={index}=======")
                print(f"Input: {example}\n")
                print(f"Output: {response}\n")

                results.append({"Input": input_text, "Output": response})

            dirname = os.path.dirname(args.predictions_file) or '.'  # fall back to the current directory for bare file names
            os.makedirs(dirname, exist_ok=True)
            with open(args.predictions_file, 'w') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
            with open(dirname + '/generation_config.json', 'w') as f:
                json.dump(generation_config, f, ensure_ascii=False, indent=2)
