Support get_ppl for TurbomindModel (#878)
* update ppl for turbomindmodel

* update api_server

* rename config and set thread_safe for pytorch engine if possible
RunningLeon committed Mar 6, 2024
1 parent caf1cf8 commit c54a5d3
Showing 5 changed files with 104 additions and 41 deletions.
@@ -6,9 +6,9 @@
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.race.race_gen_69ee4f import race_datasets
from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format
@@ -24,16 +24,29 @@
],
eos_token_id=103028)

models = [
dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
path="internlm-chat-20b",
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
internlm_chat_20b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)

internlm_chat_7b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-7b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)

models = [internlm_chat_20b]
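With the api_server config refactored into named per-model dicts, choosing which models to evaluate becomes a one-line change at the bottom of the file; for example (assuming both models are served at the configured api_addr):

# evaluate the 7B entry instead, or both at once
models = [internlm_chat_7b]
# models = [internlm_chat_20b, internlm_chat_7b]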

@@ -14,15 +14,25 @@

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

models = [
dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
path="internlm-chat-20b",
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
internlm_chat_20b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)

internlm_chat_7b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-7b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1, num_procs=1),
)

models = [internlm_chat_20b]

4 changes: 4 additions & 0 deletions opencompass/models/lmdeploy_pytorch.py
@@ -54,6 +54,10 @@ def __init__(self,
if engine_config is not None:
from lmdeploy.messages import PytorchEngineConfig
engine_config = PytorchEngineConfig(**engine_config)
# set thread_safe
if hasattr(engine_config, 'thread_safe'):
engine_config.thread_safe = True

if gen_config is not None:
from lmdeploy.messages import EngineGenerationConfig
gen_config = EngineGenerationConfig(**gen_config)
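The thread_safe flag matters because OpenCompass drives generation from a ThreadPoolExecutor, and the wrapper now enables it automatically whenever the installed lmdeploy version exposes the field. A minimal sketch of a pytorch-engine model entry that would go through this code path; the class name is assumed from the lmdeploy_pytorch.py file path, and the engine_config/gen_config keys are illustrative, not exhaustive:

from opencompass.models import LmdeployPytorchModel  # class name assumed from the file path

internlm_chat_7b_pytorch = dict(
    type=LmdeployPytorchModel,
    abbr='internlm-chat-7b-pytorch',
    path='internlm/internlm-chat-7b',                       # placeholder model id
    engine_config=dict(session_len=2048, max_batch_size=16),
    gen_config=dict(top_k=1, max_new_tokens=100),
    max_seq_len=2048,
    batch_size=16,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
# thread_safe does not need to be set here: the wrapper forces
# engine_config.thread_safe = True when the field is available.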
28 changes: 28 additions & 0 deletions opencompass/models/turbomind.py
@@ -1,6 +1,8 @@
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union

import numpy as np

from opencompass.models.base import BaseModel
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
@@ -161,3 +163,29 @@ def _generate(self,
if end_str:
response = response.split(end_str)[0]
return response

def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
np.ndarray: The perplexity scores in shape of (N,)
"""
assert isinstance(
inputs, List), f'List(str) is expected, but got {type(inputs)}'
results = []
for text in inputs:
input_ids = self.tokenizer.encode(text)
res = self.generators[0].get_ppl(input_ids)
results.append(res)
results = np.concatenate(results)
return results
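A minimal sketch of calling the new method directly; only get_ppl itself is part of this diff, so the wrapper class name and its constructor arguments are assumed from the surrounding OpenCompass code, and the model path and prompts are placeholders:

from opencompass.models.turbomind import TurboMindModel  # class name assumed

model = TurboMindModel(path='internlm/internlm-chat-20b', max_seq_len=2048)

# get_ppl tokenizes each string and asks the first TurboMind generator for its
# perplexity; the result has one score per input.
ppl = model.get_ppl([
    'The capital of France is Paris.',
    'The capital of France is Berlin.',
])
print(ppl)  # lower perplexity marks the continuation the model finds likelier,
            # which is what PPLInferencer-style datasets rely on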
38 changes: 23 additions & 15 deletions opencompass/models/turbomind_api.py
@@ -20,30 +20,31 @@ def valid_str(string, coding='utf-8'):


class TurboMindAPIModel(BaseModel):
"""Model wrapper for TurboMind Triton Inference Server gRPC API.
"""Model wrapper for lmdeploy api server.
Args:
path (str): The name of OpenAI's model.
tis_addr (str): The address (ip:port format) of turbomind's
triton inference server
api_addr (str): The address (ip:port format) of lmdeploy's
api server.
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
end_str (str, optional): Whether to trim generated strings with end_str
if the model has special ending strings that are not handled well.
Defaults to None.
"""

is_api: bool = True

def __init__(
self,
path: str,
api_addr: str = 'http://0.0.0.0:23333',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
):
super().__init__(path=path,
def __init__(self,
api_addr: str = 'http://0.0.0.0:23333',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
end_str: Optional[str] = None,
**kwargs):
super().__init__(path='',
max_seq_len=max_seq_len,
meta_template=meta_template)
from lmdeploy.serve.openai.api_client import APIClient
@@ -55,6 +56,7 @@ def __init__(
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.api_addr = api_addr
self.end_str = end_str

def generate(
self,
@@ -73,7 +75,10 @@ def generate(
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 0.7.
end_str (str, optional): Whether to trim generated strings
with end_str if the model has special ending strings
that are not handled well.
Defaults to None.
Returns:
List[str]: A list of generated strings.
"""
@@ -82,7 +87,8 @@
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
[temperature] * len(inputs),
[self.end_str] * len(inputs)))
return results

def get_token_len(self, prompt: str) -> int:
@@ -97,7 +103,7 @@ def wait(self):
return self.token_bucket.get_token()

def _generate(self, prompt: str or PromptList, max_out_len: int,
temperature: float) -> str:
temperature: float, end_str: str) -> str:
"""Generate results given a list of inputs.
Args:
@@ -127,4 +133,6 @@ def _generate(self, prompt: str or PromptList, max_out_len: int,
top_k=1):
response += output['choices'][0]['text']
response = valid_str(response)
if end_str:
response = response.split(end_str)[0]
return response
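The new end_str plumbing can be exercised directly through the wrapper once an lmdeploy api_server is running; a minimal sketch, assuming the server listens on the default address and with a placeholder prompt:

from opencompass.models.turbomind_api import TurboMindAPIModel

# Assumes `lmdeploy serve api_server <model>` is already listening on :23333.
model = TurboMindAPIModel(
    api_addr='http://0.0.0.0:23333',
    max_seq_len=2048,
    end_str='<eoa>',   # everything from '<eoa>' onwards is trimmed per response
)

outputs = model.generate(['Hello, who are you?'], max_out_len=100)
print(outputs[0])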
