Support get_ppl for TurbomindModel (#878)
* update ppl for turbomindmodel

* update api_server

* rename config and set thread_safe for pytorch engine if possible
RunningLeon committed Mar 6, 2024
1 parent caf1cf8 commit c54a5d3
Showing 5 changed files with 104 additions and 41 deletions.
@@ -6,9 +6,9 @@
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
from .datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.race.race_gen_69ee4f import race_datasets
from .datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets
# and output the results in a chosen format
@@ -24,16 +24,29 @@
],
eos_token_id=103028)

models = [
dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
path="internlm-chat-20b",
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
internlm_chat_20b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)

internlm_chat_7b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-7b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
meta_template=meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)

models = [internlm_chat_20b]
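With the api_server config refactored into named per-model dicts, choosing which models to evaluate becomes a one-line change at the bottom of the file; for example (assuming both models are served at the configured api_addr):

# evaluate the 7B entry instead, or both at once
models = [internlm_chat_7b]
# models = [internlm_chat_20b, internlm_chat_7b]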

@@ -14,15 +14,25 @@

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

models = [
dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
path="internlm-chat-20b",
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
internlm_chat_20b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-20b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)

internlm_chat_7b = dict(
type=TurboMindAPIModel,
abbr='internlm-chat-7b-turbomind',
api_addr='http://0.0.0.0:23333',
max_out_len=100,
max_seq_len=2048,
batch_size=16,
run_cfg=dict(num_gpus=1, num_procs=1),
)

models = [internlm_chat_20b]

4 changes: 4 additions & 0 deletions opencompass/models/lmdeploy_pytorch.py
@@ -54,6 +54,10 @@ def __init__(self,
if engine_config is not None:
from lmdeploy.messages import PytorchEngineConfig
engine_config = PytorchEngineConfig(**engine_config)
# set thread_safe
if hasattr(engine_config, 'thread_safe'):
engine_config.thread_safe = True

if gen_config is not None:
from lmdeploy.messages import EngineGenerationConfig
gen_config = EngineGenerationConfig(**gen_config)
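The thread_safe flag matters because OpenCompass drives generation from a ThreadPoolExecutor, and the wrapper now enables it automatically whenever the installed lmdeploy version exposes the field. A minimal sketch of a pytorch-engine model entry that would go through this code path; the class name is assumed from the lmdeploy_pytorch.py file path, and the engine_config/gen_config keys are illustrative, not exhaustive:

from opencompass.models import LmdeployPytorchModel  # class name assumed from the file path

internlm_chat_7b_pytorch = dict(
    type=LmdeployPytorchModel,
    abbr='internlm-chat-7b-pytorch',
    path='internlm/internlm-chat-7b',                       # placeholder model id
    engine_config=dict(session_len=2048, max_batch_size=16),
    gen_config=dict(top_k=1, max_new_tokens=100),
    max_seq_len=2048,
    batch_size=16,
    run_cfg=dict(num_gpus=1, num_procs=1),
)
# thread_safe does not need to be set here: the wrapper forces
# engine_config.thread_safe = True when the field is available.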
28 changes: 28 additions & 0 deletions opencompass/models/turbomind.py
@@ -1,6 +1,8 @@
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union

import numpy as np

from opencompass.models.base import BaseModel
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
@@ -161,3 +163,29 @@ def _generate(self,
if end_str:
response = response.split(end_str)[0]
return response

def get_ppl(self,
inputs: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get perplexity scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
np.ndarray: The perplexity scores in shape of (N,)
"""
assert isinstance(
inputs, List), f'List(str) is expected, but got {type(inputs)}'
results = []
for text in inputs:
input_ids = self.tokenizer.encode(text)
res = self.generators[0].get_ppl(input_ids)
results.append(res)
results = np.concatenate(results)
return results
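A minimal sketch of calling the new method directly; only get_ppl itself is part of this diff, so the wrapper class name and its constructor arguments are assumed from the surrounding OpenCompass code, and the model path and prompts are placeholders:

from opencompass.models.turbomind import TurboMindModel  # class name assumed

model = TurboMindModel(path='internlm/internlm-chat-20b', max_seq_len=2048)

# get_ppl tokenizes each string and asks the first TurboMind generator for its
# perplexity; the result has one score per input.
ppl = model.get_ppl([
    'The capital of France is Paris.',
    'The capital of France is Berlin.',
])
print(ppl)  # lower perplexity marks the continuation the model finds likelier,
            # which is what PPLInferencer-style datasets rely on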
38 changes: 23 additions & 15 deletions opencompass/models/turbomind_api.py
@@ -20,30 +20,31 @@ def valid_str(string, coding='utf-8'):


class TurboMindAPIModel(BaseModel):
"""Model wrapper for TurboMind Triton Inference Server gRPC API.
"""Model wrapper for lmdeploy api server.
Args:
path (str): The name of OpenAI's model.
tis_addr (str): The address (ip:port format) of turbomind's
triton inference server
api_addr (str): The address (ip:port format) of lmdeploy's
api server.
max_seq_len (int): The maximum allowed sequence length of a model.
Note that the length of prompt + generated tokens shall not exceed
this value. Defaults to 2048.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
end_str (str, optional): Whether to trim generated strings with end_str
if the model has special ending strings that are not handled well.
Defaults to None.
"""

is_api: bool = True

def __init__(
self,
path: str,
api_addr: str = 'http://0.0.0.0:23333',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
):
super().__init__(path=path,
def __init__(self,
api_addr: str = 'http://0.0.0.0:23333',
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
end_str: Optional[str] = None,
**kwargs):
super().__init__(path='',
max_seq_len=max_seq_len,
meta_template=meta_template)
from lmdeploy.serve.openai.api_client import APIClient
@@ -55,6 +56,7 @@ def __init__(
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.api_addr = api_addr
self.end_str = end_str

def generate(
self,
@@ -73,7 +75,10 @@ def generate(
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic. Defaults to 0.7.
end_str (str, optional): Whether to trim generated strings
with end_str if the model has special ending strings
that are not handled well.
Defaults to None.
Returns:
List[str]: A list of generated strings.
"""
@@ -82,7 +87,8 @@
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
[temperature] * len(inputs),
[self.end_str] * len(inputs)))
return results

def get_token_len(self, prompt: str) -> int:
@@ -97,7 +103,7 @@ def wait(self):
return self.token_bucket.get_token()

def _generate(self, prompt: str or PromptList, max_out_len: int,
temperature: float) -> str:
temperature: float, end_str: str) -> str:
"""Generate results given a list of inputs.
Args:
@@ -127,4 +133,6 @@ def _generate(self, prompt: str or PromptList, max_out_len: int,
top_k=1):
response += output['choices'][0]['text']
response = valid_str(response)
if end_str:
response = response.split(end_str)[0]
return response
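The new end_str plumbing can be exercised directly through the wrapper once an lmdeploy api_server is running; a minimal sketch, assuming the server listens on the default address and with a placeholder prompt:

from opencompass.models.turbomind_api import TurboMindAPIModel

# Assumes `lmdeploy serve api_server <model>` is already listening on :23333.
model = TurboMindAPIModel(
    api_addr='http://0.0.0.0:23333',
    max_seq_len=2048,
    end_str='<eoa>',   # everything from '<eoa>' onwards is trimmed per response
)

outputs = model.generate(['Hello, who are you?'], max_out_len=100)
print(outputs[0])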
