diff --git a/opencompass/models/lmdeploy_pytorch.py b/opencompass/models/lmdeploy_pytorch.py
deleted file mode 100644
index 80924c276..000000000
--- a/opencompass/models/lmdeploy_pytorch.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, List, Optional, Union
-
-from opencompass.models.base import BaseModel
-from opencompass.utils.logging import get_logger
-from opencompass.utils.prompt import PromptList
-
-PromptType = Union[PromptList, str]
-
-
-def valid_str(string, coding='utf-8'):
-    """decode text according to its encoding type."""
-    invalid_chars = [b'\xef\xbf\xbd']
-    bstr = bytes(string, coding)
-    for invalid_char in invalid_chars:
-        bstr = bstr.replace(invalid_char, b'')
-    ret = bstr.decode(encoding=coding, errors='ignore')
-    return ret
-
-
-class LmdeployPytorchModel(BaseModel):
-    """Model wrapper for lmdeploy pytorch engine through python API.
-
-    Args:
-        path (str): path of the supported pytorch model.
-        max_seq_len (int): The maximum allowed sequence length of a model.
-            Note that the length of prompt + generated tokens shall not exceed
-            this value. Defaults to 2048.
-        meta_template (Dict, optional): The model's meta prompt
-            template if needed, in case the requirement of injecting or
-            wrapping of any meta instructions.
-        engine_config (Dict, optional): The engine config to set
-            arguments like session_len, max_batch_size for TurboMind.
-        gen_config (Dict, optional): Generation config to set
-            arguments like top_k, top_p, temperature.
-        end_str (str, optional): Whether to trim generated strings with end_str
-            if the model has special ending strings that are not handled well.
-            Defaults to None.
-    """
-
-    def __init__(self,
-                 path: str,
-                 concurrency: int = 8,
-                 max_seq_len: int = 2048,
-                 meta_template: Optional[Dict] = None,
-                 engine_config: Optional[Dict] = None,
-                 gen_config: Optional[Dict] = None,
-                 end_str: Optional[str] = None):
-        super().__init__(path=path,
-                         max_seq_len=max_seq_len,
-                         meta_template=meta_template)
-        from lmdeploy.pytorch import engine as tm
-        from lmdeploy.version import version_info
-
-        if engine_config is not None:
-            from lmdeploy.messages import PytorchEngineConfig
-            engine_config = PytorchEngineConfig(**engine_config)
-            # set thread_safe
-            if hasattr(engine_config, 'thread_safe'):
-                engine_config.thread_safe = True
-
-        if gen_config is not None:
-            from lmdeploy.messages import GenerationConfig
-            gen_config = GenerationConfig(**gen_config)
-
-        self.logger = get_logger()
-        tm_model = tm.Engine(path, engine_config)
-        self.tokenizer = tm_model.tokenizer
-        self.generators = [
-            tm_model.create_instance() for i in range(concurrency)
-        ]
-        self.generator_ids = [i + 1 for i in range(concurrency)]
-
-        from transformers import GenerationConfig
-        try:
-            generation_config = GenerationConfig.from_pretrained(path)
-        except Exception:
-            generation_config = None
-        if generation_config and hasattr(generation_config, 'eos_token_id'):
-            if gen_config.stop_words is None:
-                stop_words = []
-                if isinstance(generation_config.eos_token_id, int):
-                    stop_words.append(generation_config.eos_token_id)
-                else:
-                    assert isinstance(generation_config.eos_token_id, list)
-                    for token_id in generation_config.eos_token_id:
-                        stop_words.append(token_id)
-                gen_config.stop_words = stop_words
-                if version_info >= (0, 6, 0):
-                    gen_config.stop_token_ids = stop_words
-        self.gen_config = gen_config
-        self.end_str = end_str
-        self.major_version, self.minor_version = version_info[:2]
-
-    def generate(
-        self,
-        inputs: List[str],
-        max_out_len: int = 512,
-    ) -> List[str]:
-        """Generate results given a list of inputs.
-
-        Args:
-            inputs (List[str]): A list of prompts
-            max_out_len (int): The maximum length of the output.
-
-        Returns:
-            List[str]: A list of generated strings.
-        """
-        assert isinstance(
-            inputs, List), f'List(str) is expected, but got {type(inputs)}'
-
-        # split inputs into batches
-        batch_size = len(self.generators)
-        batch_inputs = [
-            inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)
-        ]
-
-        results = []
-        for batch_input in batch_inputs:
-            with ThreadPoolExecutor() as executor:
-                _results = list(
-                    executor.map(
-                        self._generate,
-                        self.generators[:len(batch_input)],
-                        self.generator_ids[:len(batch_input)],
-                        batch_input,
-                        [self.gen_config] * len(batch_input),
-                        [self.end_str] * len(batch_input),
-                    ))
-            results += _results
-        return results
-
-    def get_token_len(self, prompt: str) -> int:
-        input_ids = self.tokenizer.encode(prompt)
-        return len(input_ids)
-
-    def wait(self):
-        """Wait till the next query can be sent.
-
-        Applicable in both single-thread and multi-thread environments.
-        """
-        return self.token_bucket.get_token()
-
-    def _generate(self,
-                  generator,
-                  session_id,
-                  prompt: PromptType,
-                  gen_config=None,
-                  end_str: Optional[str] = None) -> str:
-        """Generate results given a list of inputs.
-
-        Args:
-            prompt (PromptType): A string or PromptDict.
-                The PromptDict should be organized in OpenCompass'
-                API format.
-            gen_config (GenerationConfig, optional): Generation
-                config to set arguments like top_k, top_p, temperature.
-            end_str (str, optional): Whether to trim generated strings
-                with end_str if the model has special ending strings
-                that are not handled well.
-                Defaults to None.
-        Returns:
-            str: The generated string.
-        """
-        assert type(
-            prompt) is str, 'We only support string for TurboMind Python API'
-        input_ids = self.tokenizer.encode(prompt)
-        if self.major_version >= 0 and self.minor_version >= 4:
-            outputs = generator.infer(session_id,
-                                      input_ids,
-                                      gen_config=gen_config)
-            output_ids = outputs.token_ids
-        else:
-            _, output_ids, _ = generator.infer(session_id,
-                                               input_ids,
-                                               gen_config=gen_config)
-
-        # stop engine
-        if hasattr(generator, 'end'):
-            generator.end(session_id)
-        # decode output
-        response_all = self.tokenizer.decode(output_ids)
-        # trim output
-        if end_str:
-            response_all = response_all.split(end_str)[0]
-        # remove invalid characters
-        response_all = valid_str(response_all)
-        return response_all
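
Note for reviewers: any downstream model config that still points at the removed LmdeployPytorchModel will stop importing after this change and presumably needs to be migrated to one of the remaining lmdeploy wrappers. A minimal sketch of such a config follows; the abbr, model path, engine/generation settings and stop string are placeholders, and only the keyword names (concurrency, max_seq_len, engine_config, gen_config, end_str) come from the deleted __init__ signature, with the trailing fields following the usual OpenCompass runner conventions.

# Hypothetical config that exercised the removed wrapper; shown only to help
# spot configs that need migration, not as a supported example.
from opencompass.models import LmdeployPytorchModel  # no longer importable after this change

models = [
    dict(
        type=LmdeployPytorchModel,
        abbr='internlm2-7b-pytorch',   # placeholder abbreviation
        path='internlm/internlm2-7b',  # placeholder model path
        concurrency=8,                 # number of engine instances created in __init__
        max_seq_len=2048,
        engine_config=dict(session_len=2048, max_batch_size=8),
        gen_config=dict(top_k=1, top_p=0.8, temperature=1.0),
        end_str='<eoa>',               # placeholder stop string trimmed from outputs
        max_out_len=512,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]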