diff --git a/configs/eval_internlm_chat_turbomind_api.py b/configs/eval_internlm_chat_lmdeploy_apiserver.py
similarity index 59%
rename from configs/eval_internlm_chat_turbomind_api.py
rename to configs/eval_internlm_chat_lmdeploy_apiserver.py
index 40483f012..3dd4326bf 100644
--- a/configs/eval_internlm_chat_turbomind_api.py
+++ b/configs/eval_internlm_chat_lmdeploy_apiserver.py
@@ -6,9 +6,9 @@
     from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
     from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
     from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
+    from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
     from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
-    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
     from .datasets.race.race_gen_69ee4f import race_datasets
     from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
     # and output the results in a choosen format
@@ -24,16 +24,29 @@
     ],
     eos_token_id=103028)
 
-models = [
-    dict(
-        type=TurboMindAPIModel,
-        abbr='internlm-chat-20b-turbomind',
-        path="internlm-chat-20b",
-        api_addr='http://0.0.0.0:23333',
-        max_out_len=100,
-        max_seq_len=2048,
-        batch_size=8,
-        meta_template=meta_template,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+internlm_chat_20b = dict(
+    type=TurboMindAPIModel,
+    abbr='internlm-chat-20b-turbomind',
+    api_addr='http://0.0.0.0:23333',
+    max_out_len=100,
+    max_seq_len=2048,
+    batch_size=8,
+    meta_template=meta_template,
+    run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='<eoa>',
+)
+
+internlm_chat_7b = dict(
+    type=TurboMindAPIModel,
+    abbr='internlm-chat-7b-turbomind',
+    api_addr='http://0.0.0.0:23333',
+    max_out_len=100,
+    max_seq_len=2048,
+    batch_size=16,
+    meta_template=meta_template,
+    run_cfg=dict(num_gpus=1, num_procs=1),
+    end_str='<eoa>',
+)
+
+models = [internlm_chat_20b]
+
diff --git a/configs/eval_internlm_turbomind_api.py b/configs/eval_internlm_lmdeploy_apiserver.py
similarity index 59%
rename from configs/eval_internlm_turbomind_api.py
rename to configs/eval_internlm_lmdeploy_apiserver.py
index 193fd4d92..36d3b8606 100644
--- a/configs/eval_internlm_turbomind_api.py
+++ b/configs/eval_internlm_lmdeploy_apiserver.py
@@ -14,15 +14,25 @@
 
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
 
-models = [
-    dict(
-        type=TurboMindAPIModel,
-        abbr='internlm-chat-20b-turbomind',
-        path="internlm-chat-20b",
-        api_addr='http://0.0.0.0:23333',
-        max_out_len=100,
-        max_seq_len=2048,
-        batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+internlm_chat_20b = dict(
+    type=TurboMindAPIModel,
+    abbr='internlm-chat-20b-turbomind',
+    api_addr='http://0.0.0.0:23333',
+    max_out_len=100,
+    max_seq_len=2048,
+    batch_size=8,
+    run_cfg=dict(num_gpus=1, num_procs=1),
+)
+
+internlm_chat_7b = dict(
+    type=TurboMindAPIModel,
+    abbr='internlm-chat-7b-turbomind',
+    api_addr='http://0.0.0.0:23333',
+    max_out_len=100,
+    max_seq_len=2048,
+    batch_size=16,
+    run_cfg=dict(num_gpus=1, num_procs=1),
+)
+
+models = [internlm_chat_20b]
+
diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py
index b47ab4194..506e43bd3 100644
--- a/opencompass/models/lmdeploy_pytorch.py
+++ b/opencompass/models/lmdeploy_pytorch.py
@@ -54,6 +54,10 @@ def __init__(self,
         if engine_config is not None:
             from lmdeploy.messages import PytorchEngineConfig
             engine_config = PytorchEngineConfig(**engine_config)
+            # set thread_safe
+            if hasattr(engine_config, 'thread_safe'):
+                engine_config.thread_safe = True
+
         if gen_config is not None:
             from lmdeploy.messages import EngineGenerationConfig
             gen_config = EngineGenerationConfig(**gen_config)
diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py
index c5606d93f..9a4023e00 100644
--- a/opencompass/models/turbomind.py
+++ b/opencompass/models/turbomind.py
@@ -1,6 +1,8 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Union
 
+import numpy as np
+
 from opencompass.models.base import BaseModel
 from opencompass.utils.logging import get_logger
 from opencompass.utils.prompt import PromptList
@@ -161,3 +163,29 @@ def _generate(self,
         if end_str:
             response = response.split(end_str)[0]
         return response
+
+    def get_ppl(self,
+                inputs: List[str],
+                mask_length: Optional[List[int]] = None) -> List[float]:
+        """Get perplexity scores given a list of inputs.
+
+        Args:
+            inputs (List[str]): A list of strings.
+            mask_length (Optional[List[int]]): A list of mask lengths. If
+                provided, the perplexity scores will be calculated with the
+                first mask_length[i] tokens masked out. It's okay to skip
+                its implementation if advanced features in PPLInferencer are
+                not needed.
+
+        Returns:
+            np.ndarray: The perplexity scores in shape of (N,)
+        """
+        assert isinstance(
+            inputs, List), f'List(str) is expected, but got {type(inputs)}'
+        results = []
+        for text in inputs:
+            input_ids = self.tokenizer.encode(text)
+            res = self.generators[0].get_ppl(input_ids)
+            results.append(res)
+        results = np.concatenate(results)
+        return results
diff --git a/opencompass/models/turbomind_api.py b/opencompass/models/turbomind_api.py
index 6399a1911..75db216e0 100644
--- a/opencompass/models/turbomind_api.py
+++ b/opencompass/models/turbomind_api.py
@@ -20,30 +20,31 @@ def valid_str(string, coding='utf-8'):
 
 
 class TurboMindAPIModel(BaseModel):
-    """Model wrapper for TurboMind Triton Inference Server gRPC API.
+    """Model wrapper for lmdeploy api server.
 
     Args:
-        path (str): The name of OpenAI's model.
-        tis_addr (str): The address (ip:port format) of turbomind's
-            triton inference server
+        api_addr (str): The address (ip:port format) of lmdeploy's
+            api server.
         max_seq_len (int): The maximum allowed sequence length of a model.
             Note that the length of prompt + generated tokens shall not exceed
             this value. Defaults to 2048.
         meta_template (Dict, optional): The model's meta prompt
             template if needed, in case the requirement of injecting or
             wrapping of any meta instructions.
+        end_str (str, optional): Whether to trim generated strings with end_str
+            if the model has special ending strings that are not handled well.
+            Defaults to None.
     """
 
     is_api: bool = True
 
-    def __init__(
-        self,
-        path: str,
-        api_addr: str = 'http://0.0.0.0:23333',
-        max_seq_len: int = 2048,
-        meta_template: Optional[Dict] = None,
-    ):
-        super().__init__(path=path,
+    def __init__(self,
+                 api_addr: str = 'http://0.0.0.0:23333',
+                 max_seq_len: int = 2048,
+                 meta_template: Optional[Dict] = None,
+                 end_str: Optional[str] = None,
+                 **kwargs):
+        super().__init__(path='',
                          max_seq_len=max_seq_len,
                          meta_template=meta_template)
         from lmdeploy.serve.openai.api_client import APIClient
@@ -55,6 +56,7 @@ def __init__(
         if meta_template and 'eos_token_id' in meta_template:
             self.eos_token_id = meta_template['eos_token_id']
         self.api_addr = api_addr
+        self.end_str = end_str
 
     def generate(
         self,
@@ -73,7 +75,10 @@
                 between 0 and 2. Higher values like 0.8 will make the output
                 more random, while lower values like 0.2 will make it more
                 focused and deterministic. Defaults to 0.7.
-
+            end_str (str, optional): Whether to trim generated strings
+                with end_str if the model has special ending strings
+                that are not handled well.
+                Defaults to None.
         Returns:
             List[str]: A list of generated strings.
         """
@@ -82,7 +87,8 @@
             results = list(
                 executor.map(self._generate, inputs,
                              [max_out_len] * len(inputs),
-                             [temperature] * len(inputs)))
+                             [temperature] * len(inputs),
+                             [self.end_str] * len(inputs)))
         return results
 
     def get_token_len(self, prompt: str) -> int:
@@ -97,7 +103,7 @@ def wait(self):
         return self.token_bucket.get_token()
 
     def _generate(self, prompt: str or PromptList, max_out_len: int,
-                  temperature: float) -> str:
+                  temperature: float, end_str: str) -> str:
         """Generate results given a list of inputs.
 
         Args:
@@ -127,4 +133,6 @@ def _generate(self, prompt: str or PromptList, max_out_len: int,
                 top_k=1):
             response += output['choices'][0]['text']
         response = valid_str(response)
+        if end_str:
+            response = response.split(end_str)[0]
         return response
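Note on the end_str plumbing above: TurboMindAPIModel now trims generated text at the model's special ending string, matching what the offline TurboMind wrapper already does in its _generate. A minimal standalone sketch of that trimming behaviour (plain Python; the helper name and sample strings are illustrative only, not part of the patch):

    from typing import Optional

    def trim_at_end_str(response: str, end_str: Optional[str]) -> str:
        # Keep only the text before the special ending string, if one is set.
        if end_str:
            response = response.split(end_str)[0]
        return response

    assert trim_at_end_str('The answer is 42.<eoa>leftover tokens', '<eoa>') == 'The answer is 42.'
    assert trim_at_end_str('The answer is 42.', None) == 'The answer is 42.'

With the renamed configs, choosing which served model to evaluate is just a matter of editing the list at the bottom of the config, e.g. models = [internlm_chat_20b] or models = [internlm_chat_20b, internlm_chat_7b].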