From 0836aec67b5ebdcc1c6c318674f95b5920002d47 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Wed, 17 Jan 2024 11:09:50 +0800 Subject: [PATCH] [Feature] Update evaluate turbomind (#804) * update * fix * fix * fix --- configs/eval_internlm_chat_turbomind.py | 24 +++++++------- configs/eval_internlm_turbomind.py | 12 +++---- .../advanced_guides/evaluation_turbomind.md | 33 ++++++++++--------- .../advanced_guides/evaluation_turbomind.md | 29 ++++++++-------- opencompass/models/turbomind.py | 14 ++++---- 5 files changed, 58 insertions(+), 54 deletions(-) diff --git a/configs/eval_internlm_chat_turbomind.py b/configs/eval_internlm_chat_turbomind.py index a09a67da3..34667249a 100644 --- a/configs/eval_internlm_chat_turbomind.py +++ b/configs/eval_internlm_chat_turbomind.py @@ -48,7 +48,7 @@ internlm_chat_7b = dict( type=TurboMindModel, abbr='internlm-chat-7b-turbomind', - path='./turbomind', + path='internlm/internlm-chat-7b', max_out_len=100, max_seq_len=2048, batch_size=32, @@ -60,7 +60,7 @@ internlm_chat_7b_w4 = dict( type=TurboMindModel, abbr='internlm-chat-7b-w4-turbomind', - path='./turbomind', + path='internlm/internlm-chat-7b-w4', max_out_len=100, max_seq_len=2048, batch_size=32, @@ -73,7 +73,7 @@ internlm_chat_7b_w4kv8 = dict( type=TurboMindModel, abbr='internlm-chat-7b-w4kv8-turbomind', - path='./turbomind', + path='internlm/internlm-chat-7b-w4kv8', max_out_len=100, max_seq_len=2048, batch_size=32, @@ -86,7 +86,7 @@ internlm_chat_20b = dict( type=TurboMindModel, abbr='internlm-chat-20b-turbomind', - path='./turbomind', + path='internlm/internlm-chat-20b', max_out_len=100, max_seq_len=2048, batch_size=8, @@ -99,7 +99,7 @@ internlm_chat_20b_w4 = dict( type=TurboMindModel, abbr='internlm-chat-20b-w4-turbomind', - path='./turbomind', + path='internlm/internlm-chat-20b-w4', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -112,7 +112,7 @@ internlm_chat_20b_w4kv8 = dict( type=TurboMindModel, abbr='internlm-chat-20b-w4kv8-turbomind', - path='./turbomind', + path='internlm/internlm-chat-20b-w4kv8', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -125,7 +125,7 @@ llama2_chat_7b = dict( type=TurboMindModel, abbr='llama2-chat-7b-turbomind', - path='./turbomind', + path='meta-llama/Llama-2-7b-chat-hf', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -138,7 +138,7 @@ llama2_chat_13b = dict( type=TurboMindModel, abbr='llama2-chat-13b-turbomind', - path='./turbomind', + path='meta-llama/Llama-2-13b-chat-hf', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -151,7 +151,7 @@ llama2_chat_70b = dict( type=TurboMindModel, abbr='llama2-chat-70b-turbomind', - path='./turbomind', + path='meta-llama/Llama-2-70b-chat-hf', max_out_len=100, max_seq_len=2048, batch_size=8, @@ -164,7 +164,7 @@ qwen_chat_7b = dict( type=TurboMindModel, abbr='qwen-chat-7b-turbomind', - path='./turbomind', + path='Qwen/Qwen-7B-Chat', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -177,7 +177,7 @@ qwen_chat_14b = dict( type=TurboMindModel, abbr='qwen-chat-14b-turbomind', - path='./turbomind', + path='Qwen/Qwen-14B-Chat', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -190,7 +190,7 @@ baichuan2_chat_7b = dict( type=TurboMindModel, abbr='baichuan2-chat-7b-turbomind', - path='./turbomind', + path='baichuan-inc/Baichuan2-7B-Chat', max_out_len=100, max_seq_len=2048, batch_size=16, diff --git a/configs/eval_internlm_turbomind.py b/configs/eval_internlm_turbomind.py index a2396a5be..8e62fa344 100644 --- a/configs/eval_internlm_turbomind.py +++ b/configs/eval_internlm_turbomind.py @@ -19,7 +19,7 @@ internlm_7b = 
dict(
     type=TurboMindModel,
     abbr='internlm-7b-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -31,7 +31,7 @@ internlm_7b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-7b-w4-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b-w4",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -43,7 +43,7 @@ internlm_7b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-7b-w4kv8-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b-w4kv8",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -55,7 +55,7 @@ internlm_20b = dict(
     type=TurboMindModel,
     abbr='internlm-20b-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=8,
@@ -67,7 +67,7 @@ internlm_20b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-20b-w4-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b-w4",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -80,7 +80,7 @@ internlm_20b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-20b-w4kv8-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b-w4kv8",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md
index 00b57226e..0fa75fc31 100644
--- a/docs/en/advanced_guides/evaluation_turbomind.md
+++ b/docs/en/advanced_guides/evaluation_turbomind.md
@@ -20,27 +20,14 @@ pip install lmdeploy
 
 OpenCompass integrates both turbomind's python API and gRPC API for evaluation, and the former is highly recommended.
 
-We take the InternLM-20B as example. Please download it from huggingface and convert it to turbomind's model format:
+We take InternLM-20B as an example. Please download it from huggingface:
 
 ```shell
-# 1. Download InternLM model(or use the cached model's checkpoint)
+# Download the InternLM model (or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
 git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
-
-# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
-lmdeploy convert internlm /path/to/internlm-20b \
-  --dst-path {/home/folder/of/opencompass}/turbomind
-```
-
-**Note**:
-
-If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. The specific command is:
-
-```shell
-lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
-  --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
 ### Evaluation with Turbomind Python API (recommended)
@@ -61,6 +48,22 @@ You are expected to get the evaluation results after the inference and evaluatio
 
 ### Evaluation with Turbomind gRPC API (optional)
 
+Convert the model to TurboMind format using lmdeploy:
+
+```shell
+lmdeploy convert internlm /path/to/internlm-20b \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
+**Note**:
+
+If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. 
The specific command is:
+
+```shell
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
 In the home folder of OpenCompass, launch the Triton Inference Server:
 
 ```shell
diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
index 4c8714e1e..240b2f382 100644
--- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md
+++ b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
@@ -20,25 +20,14 @@ pip install lmdeploy
 
 OpenCompass 支持分别通过 turbomind python API 和 gRPC API 评测数据集。我们强烈推荐使用前者进行评测。
 
-下文以 InternLM-20B 模型为例,介绍如何评测。首先,从 huggingface 上下载 InternLM 模型,并转换为 turbomind 模型格式:
+下文以 InternLM-20B 模型为例,介绍如何评测。首先,从 huggingface 上下载 InternLM 模型:
 
 ```shell
-# 1. Download InternLM model(or use the cached model's checkpoint)
+# Download the InternLM model (or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
 git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
-
-# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
-lmdeploy convert internlm /path/to/internlm-20b \
-  --dst-path {/home/folder/of/opencompass}/turbomind
-```
-
-注意:如果评测 InternLM Chat 模型,那么在转换模型格式的时候,模型名字要填写 `internlm-chat`。具体命令是:
-
-```shell
-lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
-  --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
 ### 通过 TurboMind Python API 评测(推荐)
@@ -57,6 +46,20 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2
 
 ### 通过 TurboMind gRPC API 评测(可选)
 
+首先需要将模型转换为 turbomind 格式:
+
+```shell
+lmdeploy convert internlm /path/to/internlm-20b \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
+注意:如果评测 InternLM Chat 模型,那么在转换模型格式的时候,模型名字要填写 `internlm-chat`。具体命令是:
+
+```shell
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
 在 OpenCompass 的项目目录下,启动 triton inference server:
 
 ```shell
diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py
index b75c3e021..f435fe86c 100644
--- a/opencompass/models/turbomind.py
+++ b/opencompass/models/turbomind.py
@@ -1,4 +1,3 @@
-import os.path as osp
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Union
 
@@ -40,16 +39,14 @@ def __init__(
         max_seq_len: int = 2048,
         meta_template: Optional[Dict] = None,
     ):
-        from lmdeploy import turbomind as tm
-        from lmdeploy.tokenizer import Tokenizer
+        from lmdeploy.turbomind import TurboMind
 
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
         self.logger = get_logger()
-        tokenizer_model_path = osp.join(path, 'triton_models', 'tokenizer')
-        self.tokenizer = Tokenizer(tokenizer_model_path)
-        tm_model = tm.TurboMind(path)
+        tm_model = TurboMind.from_pretrained(path)
+        self.tokenizer = tm_model.tokenizer
         self.generators = [
             tm_model.create_instance() for i in range(concurrency)
         ]
@@ -134,9 +131,10 @@ def _generate(self, generator, session_id, prompt: str or PromptList,
             sequence_start=True,
             sequence_end=True,
             top_k=1,
+            top_p=0.8,
             step=0,
             stream_output=False):
-            output_ids, _ = outputs[0]
-            response = self.tokenizer.decode(output_ids.tolist())
+            _, output_ids, _ = outputs
+            response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
         return response
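For reference, the TurboMind Python API flow that this patch moves `TurboMindModel` onto can be sketched as a standalone script. The sketch below is assembled only from calls that appear in the diff (`TurboMind.from_pretrained`, `tm_model.tokenizer`, `create_instance`, and `stream_infer` with the `top_p=0.8` sampling added here); the model id, session id, prompt, and output length are illustrative placeholders, and it assumes the lmdeploy version this PR targets:

```python
# Minimal sketch of the updated loading/inference path (not the full
# TurboMindModel wrapper): the engine is built directly from a
# HuggingFace model id or local path, and the tokenizer now comes from
# the engine itself instead of a converted triton_models/tokenizer dir.
from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained('internlm/internlm-20b')  # placeholder id
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()

input_ids = tokenizer.encode('The capital of France is')  # placeholder prompt
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      request_output_len=100,
                                      sequence_start=True,
                                      sequence_end=True,
                                      top_k=1,
                                      top_p=0.8,
                                      step=0,
                                      stream_output=False):
    # stream_infer yields (status, output token ids, length), hence the
    # three-way unpack that replaces `output_ids, _ = outputs[0]` above.
    _, output_ids, _ = outputs

print(tokenizer.decode(output_ids))
```

Because loading now starts from `from_pretrained`, the separate `lmdeploy convert` workspace (and the old `osp.join(path, 'triton_models', 'tokenizer')` lookup) is only needed for the gRPC route, which is why the configs above can point `path` at hub ids such as `internlm/internlm-20b` instead of a local `./turbomind` folder.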