From 0836aec67b5ebdcc1c6c318674f95b5920002d47 Mon Sep 17 00:00:00 2001 From: RunningLeon Date: Wed, 17 Jan 2024 11:09:50 +0800 Subject: [PATCH] [Feature] Update evaluate turbomind (#804) * update * fix * fix * fix --- configs/eval_internlm_chat_turbomind.py | 24 +++++++------- configs/eval_internlm_turbomind.py | 12 +++---- .../advanced_guides/evaluation_turbomind.md | 33 ++++++++++--------- .../advanced_guides/evaluation_turbomind.md | 29 ++++++++-------- opencompass/models/turbomind.py | 14 ++++---- 5 files changed, 58 insertions(+), 54 deletions(-) diff --git a/configs/eval_internlm_chat_turbomind.py b/configs/eval_internlm_chat_turbomind.py index a09a67da3..34667249a 100644 --- a/configs/eval_internlm_chat_turbomind.py +++ b/configs/eval_internlm_chat_turbomind.py @@ -48,7 +48,7 @@ internlm_chat_7b = dict( type=TurboMindModel, abbr='internlm-chat-7b-turbomind', - path='./turbomind', + path='internlm/internlm-chat-7b', max_out_len=100, max_seq_len=2048, batch_size=32, @@ -60,7 +60,7 @@ internlm_chat_7b_w4 = dict( type=TurboMindModel, abbr='internlm-chat-7b-w4-turbomind', - path='./turbomind', + path='internlm/internlm-chat-7b-w4', max_out_len=100, max_seq_len=2048, batch_size=32, @@ -73,7 +73,7 @@ internlm_chat_7b_w4kv8 = dict( type=TurboMindModel, abbr='internlm-chat-7b-w4kv8-turbomind', - path='./turbomind', + path='internlm/internlm-chat-7b-w4kv8', max_out_len=100, max_seq_len=2048, batch_size=32, @@ -86,7 +86,7 @@ internlm_chat_20b = dict( type=TurboMindModel, abbr='internlm-chat-20b-turbomind', - path='./turbomind', + path='internlm/internlm-chat-20b', max_out_len=100, max_seq_len=2048, batch_size=8, @@ -99,7 +99,7 @@ internlm_chat_20b_w4 = dict( type=TurboMindModel, abbr='internlm-chat-20b-w4-turbomind', - path='./turbomind', + path='internlm/internlm-chat-20b-w4', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -112,7 +112,7 @@ internlm_chat_20b_w4kv8 = dict( type=TurboMindModel, abbr='internlm-chat-20b-w4kv8-turbomind', - path='./turbomind', + path='internlm/internlm-chat-20b-w4kv8', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -125,7 +125,7 @@ llama2_chat_7b = dict( type=TurboMindModel, abbr='llama2-chat-7b-turbomind', - path='./turbomind', + path='meta-llama/Llama-2-7b-chat-hf', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -138,7 +138,7 @@ llama2_chat_13b = dict( type=TurboMindModel, abbr='llama2-chat-13b-turbomind', - path='./turbomind', + path='meta-llama/Llama-2-13b-chat-hf', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -151,7 +151,7 @@ llama2_chat_70b = dict( type=TurboMindModel, abbr='llama2-chat-70b-turbomind', - path='./turbomind', + path='meta-llama/Llama-2-70b-chat-hf', max_out_len=100, max_seq_len=2048, batch_size=8, @@ -164,7 +164,7 @@ qwen_chat_7b = dict( type=TurboMindModel, abbr='qwen-chat-7b-turbomind', - path='./turbomind', + path='Qwen/Qwen-7B-Chat', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -177,7 +177,7 @@ qwen_chat_14b = dict( type=TurboMindModel, abbr='qwen-chat-14b-turbomind', - path='./turbomind', + path='Qwen/Qwen-14B-Chat', max_out_len=100, max_seq_len=2048, batch_size=16, @@ -190,7 +190,7 @@ baichuan2_chat_7b = dict( type=TurboMindModel, abbr='baichuan2-chat-7b-turbomind', - path='./turbomind', + path='baichuan-inc/Baichuan2-7B-Chat', max_out_len=100, max_seq_len=2048, batch_size=16, diff --git a/configs/eval_internlm_turbomind.py b/configs/eval_internlm_turbomind.py index a2396a5be..8e62fa344 100644 --- a/configs/eval_internlm_turbomind.py +++ b/configs/eval_internlm_turbomind.py @@ -19,7 +19,7 @@ internlm_7b = 
dict(
     type=TurboMindModel,
     abbr='internlm-7b-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -31,7 +31,7 @@ internlm_7b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-7b-w4-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b-w4",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -43,7 +43,7 @@ internlm_7b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-7b-w4kv8-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-7b-w4kv8",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=32,
@@ -55,7 +55,7 @@ internlm_20b = dict(
     type=TurboMindModel,
     abbr='internlm-20b-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=8,
@@ -67,7 +67,7 @@ internlm_20b_w4 = dict(
     type=TurboMindModel,
     abbr='internlm-20b-w4-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b-w4",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
@@ -80,7 +80,7 @@ internlm_20b_w4kv8 = dict(
     type=TurboMindModel,
     abbr='internlm-20b-w4kv8-turbomind',
-    path="./turbomind",
+    path="internlm/internlm-20b-w4kv8",
     max_out_len=100,
     max_seq_len=2048,
     batch_size=16,
diff --git a/docs/en/advanced_guides/evaluation_turbomind.md b/docs/en/advanced_guides/evaluation_turbomind.md
index 00b57226e..0fa75fc31 100644
--- a/docs/en/advanced_guides/evaluation_turbomind.md
+++ b/docs/en/advanced_guides/evaluation_turbomind.md
@@ -20,27 +20,14 @@ pip install lmdeploy
 
 OpenCompass integrates both turbomind's python API and gRPC API for evaluation, and the former is highly recommended.
 
-We take the InternLM-20B as example. Please download it from huggingface and convert it to turbomind's model format:
+We take InternLM-20B as an example. Please download it from huggingface:
 
 ```shell
-# 1. Download InternLM model(or use the cached model's checkpoint)
+# Download the InternLM model (or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
 git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
-
-# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
-lmdeploy convert internlm /path/to/internlm-20b \
-  --dst-path {/home/folder/of/opencompass}/turbomind
-```
-
-**Note**:
-
-If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. The specific command is:
-
-```shell
-lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
-  --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
 ### Evaluation with Turbomind Python API (recommended)
@@ -61,6 +48,22 @@ You are expected to get the evaluation results after the inference and evaluatio
 
 ### Evaluation with Turbomind gRPC API (optional)
 
+Convert the model to TurboMind format using lmdeploy:
+
+```shell
+lmdeploy convert internlm /path/to/internlm-20b \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
+**Note**:
+
+If evaluating the InternLM Chat model, make sure to pass `internlm-chat` as the model name instead of `internlm` when converting the model format. 
The specific command is:
+
+```shell
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
 In the home folder of OpenCompass, launch the Triton Inference Server:
 
 ```shell
diff --git a/docs/zh_cn/advanced_guides/evaluation_turbomind.md b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
index 4c8714e1e..240b2f382 100644
--- a/docs/zh_cn/advanced_guides/evaluation_turbomind.md
+++ b/docs/zh_cn/advanced_guides/evaluation_turbomind.md
@@ -20,25 +20,14 @@ pip install lmdeploy
 
 OpenCompass 支持分别通过 turbomind python API 和 gRPC API 评测数据集。我们强烈推荐使用前者进行评测。
 
-下文以 InternLM-20B 模型为例,介绍如何评测。首先,从 huggingface 上下载 InternLM 模型,并转换为 turbomind 模型格式:
+下文以 InternLM-20B 模型为例,介绍如何评测。首先,从 huggingface 上下载 InternLM 模型:
 
 ```shell
-# 1. Download InternLM model(or use the cached model's checkpoint)
+# Download the InternLM model (or use the cached model's checkpoint)
 # Make sure you have git-lfs installed (https://git-lfs.com)
 git lfs install
 git clone https://huggingface.co/internlm/internlm-20b /path/to/internlm-20b
-
-# 2. Convert InternLM model to turbomind's format, and save it in the home folder of opencompass
-lmdeploy convert internlm /path/to/internlm-20b \
-  --dst-path {/home/folder/of/opencompass}/turbomind
-```
-
-注意:如果评测 InternLM Chat 模型,那么在转换模型格式的时候,模型名字要填写 `internlm-chat`。具体命令是:
-
-```shell
-lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
-  --dst-path {/home/folder/of/opencompass}/turbomind
 ```
 
 ### 通过 TurboMind Python API 评测(推荐)
@@ -57,6 +46,20 @@ python run.py configs/eval_internlm_turbomind.py -w outputs/turbomind/internlm-2
 
 ### 通过 TurboMind gRPC API 评测(可选)
 
+首先需要将模型转换为 turbomind 格式:
+
+```shell
+lmdeploy convert internlm /path/to/internlm-20b \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
+注意:如果评测 InternLM Chat 模型,那么在转换模型格式的时候,模型名字要填写 `internlm-chat`。具体命令是:
+
+```shell
+lmdeploy convert internlm-chat /path/to/internlm-20b-chat \
+  --dst-path {/home/folder/of/opencompass}/turbomind
+```
+
 在 OpenCompass 的项目目录下,启动 triton inference server:
 
 ```shell
diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py
index b75c3e021..f435fe86c 100644
--- a/opencompass/models/turbomind.py
+++ b/opencompass/models/turbomind.py
@@ -1,4 +1,3 @@
-import os.path as osp
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Union
 
@@ -40,16 +39,14 @@ def __init__(
         max_seq_len: int = 2048,
         meta_template: Optional[Dict] = None,
     ):
-        from lmdeploy import turbomind as tm
-        from lmdeploy.tokenizer import Tokenizer
+        from lmdeploy.turbomind import TurboMind
 
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
         self.logger = get_logger()
-        tokenizer_model_path = osp.join(path, 'triton_models', 'tokenizer')
-        self.tokenizer = Tokenizer(tokenizer_model_path)
-        tm_model = tm.TurboMind(path)
+        tm_model = TurboMind.from_pretrained(path)
+        self.tokenizer = tm_model.tokenizer
         self.generators = [
             tm_model.create_instance() for i in range(concurrency)
         ]
@@ -134,9 +131,10 @@ def _generate(self, generator, session_id, prompt: str or PromptList,
             sequence_start=True,
             sequence_end=True,
             top_k=1,
+            top_p=0.8,
             step=0,
             stream_output=False):
-            output_ids, _ = outputs[0]
-            response = self.tokenizer.decode(output_ids.tolist())
+            _, output_ids, _ = outputs
+            response = self.tokenizer.decode(output_ids)
         response = valid_str(response)
         return response
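For reference, the TurboMind Python API flow that this patch moves `TurboMindModel` onto can be sketched as a standalone script. The sketch below is assembled only from calls that appear in the diff (`TurboMind.from_pretrained`, `tm_model.tokenizer`, `create_instance`, and `stream_infer` with the `top_p=0.8` sampling added here); the model id, session id, prompt, and output length are illustrative placeholders, and it assumes the lmdeploy version this PR targets:

```python
# Minimal sketch of the updated loading/inference path (not the full
# TurboMindModel wrapper): the engine is built directly from a
# HuggingFace model id or local path, and the tokenizer now comes from
# the engine itself instead of a converted triton_models/tokenizer dir.
from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained('internlm/internlm-20b')  # placeholder id
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()

input_ids = tokenizer.encode('The capital of France is')  # placeholder prompt
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      request_output_len=100,
                                      sequence_start=True,
                                      sequence_end=True,
                                      top_k=1,
                                      top_p=0.8,
                                      step=0,
                                      stream_output=False):
    # stream_infer yields (status, output token ids, length), hence the
    # three-way unpack that replaces `output_ids, _ = outputs[0]` above.
    _, output_ids, _ = outputs

print(tokenizer.decode(output_ids))
```

Because loading now starts from `from_pretrained`, the separate `lmdeploy convert` workspace (and the old `osp.join(path, 'triton_models', 'tokenizer')` lookup) is only needed for the gRPC route, which is why the configs above can point `path` at hub ids such as `internlm/internlm-20b` instead of a local `./turbomind` folder.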