diff --git a/configs/datasets/subjective_cmp/alignment_bench.py b/configs/datasets/subjective_cmp/alignment_bench.py
index e27d8f7af..9ad4b21b9 100644
--- a/configs/datasets/subjective_cmp/alignment_bench.py
+++ b/configs/datasets/subjective_cmp/alignment_bench.py
@@ -1,5 +1,3 @@
-from os import getenv as gv
-
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -15,10 +13,10 @@ subjective_all_sets = [
     "alignment_bench",
 ]
 
-data_path =gv('WORKDIR')+"data/subjective/alignment_bench"
+data_path ="data/subjective/alignment_bench"
 
-alignment_bench_config_path = gv('WORKDIR')+"data/subjective/alignment_bench/config"
-alignment_bench_config_name = 'multi-dimension'
+alignment_bench_config_path = "data/subjective/alignment_bench/"
+alignment_bench_config_name = 'config/multi-dimension'
 
 subjective_datasets = []
 
diff --git a/configs/alignment_bench.py b/configs/eval_subjective_alignbench.py
similarity index 67%
rename from configs/alignment_bench.py
rename to configs/eval_subjective_alignbench.py
index 8950f5e02..78a3ae5c7 100644
--- a/configs/alignment_bench.py
+++ b/configs/eval_subjective_alignbench.py
@@ -11,7 +11,7 @@
 
 datasets = [*subjective_datasets]
 
-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
 from opencompass.partitioners import NaivePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
@@ -19,17 +19,11 @@ from opencompass.tasks import OpenICLInferTask
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 from opencompass.summarizers import AlignmentBenchSummarizer
 
-models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]
-
-api_meta_template = dict(
-    round=[
-        dict(role='HUMAN', api_role='HUMAN'),
-        dict(role='BOT', api_role='BOT', generate=True)
-    ],
-    reserved_roles=[
-        dict(role='SYSTEM', api_role='SYSTEM'),
-    ],
-)
+
+# -------------Inference Stage ----------------------------------------
+
+models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]
 
 infer = dict(
     partitioner=dict(type=NaivePartitioner),
@@ -42,6 +36,10 @@
 )
 
+# -------------Evaluation Stage ----------------------------------------
+
+
+## ------------- JudgeLLM Configuration
 api_meta_template = dict(
     round=[
         dict(role='HUMAN', api_role='HUMAN'),
@@ -50,26 +48,18 @@
 )
 
 judge_model = dict(
-    type=HuggingFaceChatGLM3,
-    abbr='chatglm3-6b-hf',
-    path='THUDM/chatglm3-6b',
-    tokenizer_path='THUDM/chatglm3-6b',
-    model_kwargs=dict(
-        device_map='auto',
-        trust_remote_code=True,
-    ),
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        trust_remote_code=True,
-    ),
+    abbr='GPT4-Turbo',
+    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
+    key='xxxx',  # The AllesAPIN token; replace the placeholder with your own key
+    url='xxxx',
     meta_template=api_meta_template,
-    max_out_len=100,
-    max_seq_len=4096,
-    batch_size=1,
-    run_cfg=dict(num_gpus=1, num_procs=1)
-)
+    query_per_second=16,
+    max_out_len=2048,
+    max_seq_len=2048,
+    batch_size=8
+)
 
+## ------------- Evaluation Configuration
 eval = dict(
     partitioner=dict(
         type=SubjectiveNaivePartitioner,
@@ -77,17 +67,16 @@
         models = [*hf_baichuan2_7b]
     ),
     runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
+        type=LocalRunner,
+        max_num_workers=2,
         task=dict(
             type=SubjectiveEvalTask,
             judge_cfg=judge_model
         )),
 )
-work_dir = gv('WORKDIR')+'alignment_bench/'
 
 summarizer = dict(
     type=AlignmentBenchSummarizer,
-)
\ No newline at end of file
+)
+
+work_dir = 'outputs/alignment_bench/'
diff --git a/configs/subjective_compare.py b/configs/eval_subjective_compare.py
similarity index 100%
rename from configs/subjective_compare.py
rename to configs/eval_subjective_compare.py
diff --git a/configs/subjective_score.py b/configs/eval_subjective_score.py
similarity index 100%
rename from configs/subjective_score.py
rename to configs/eval_subjective_score.py
diff --git a/configs/models/hf_internlm/hf_internlm_chat_20b.py b/configs/models/hf_internlm/hf_internlm_chat_20b.py
index e8631aded..ba9277eb2 100644
--- a/configs/models/hf_internlm/hf_internlm_chat_20b.py
+++ b/configs/models/hf_internlm/hf_internlm_chat_20b.py
@@ -3,7 +3,7 @@
 
 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True),
     ],
 )
diff --git a/configs/models/hf_internlm/hf_internlm_chat_7b.py b/configs/models/hf_internlm/hf_internlm_chat_7b.py
index 53102e865..a5015b8e1 100644
--- a/configs/models/hf_internlm/hf_internlm_chat_7b.py
+++ b/configs/models/hf_internlm/hf_internlm_chat_7b.py
@@ -3,7 +3,7 @@
 
 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True),
     ],
 )
diff --git a/configs/models/hf_internlm/hf_internlm_chat_7b_8k.py b/configs/models/hf_internlm/hf_internlm_chat_7b_8k.py
index f1ae5c12f..5e0152d56 100644
--- a/configs/models/hf_internlm/hf_internlm_chat_7b_8k.py
+++ b/configs/models/hf_internlm/hf_internlm_chat_7b_8k.py
@@ -3,7 +3,7 @@
 
 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True),
     ],
 )
diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py
index 92f0ce1af..fedaf904c 100644
--- a/opencompass/models/__init__.py
+++ b/opencompass/models/__init__.py
@@ -17,7 +17,7 @@ from .mixtral import Mixtral  # noqa: F401
 from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401, F403
 from .moonshot_api import MoonShot  # noqa: F401
-from .openai_api import OpenAI  # noqa: F401
+from .openai_api import OpenAI, OpenAIAllesAPIN  # noqa: F401
 from .pangu_api import PanGu  # noqa: F401
 from .sensetime_api import SenseTime  # noqa: F401
 from .turbomind import TurboMindModel  # noqa: F401
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 8c2eb7586..50b1a7209 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -310,3 +310,121 @@ def bin_trim(self, prompt: str, num_token: int) -> str:
         elif self.mode == 'rear':
             prompt = sep.join(words[:l])
         return prompt
+
+
+@MODELS.register_module(name=['OpenAIAllesAPIN'])
+class OpenAIAllesAPIN(OpenAI):
+    """Model wrapper around OpenAI-AllesAPIN.
+
+    Args:
+        path (str): The name of OpenAI's model.
+        url (str): URL to AllesAPIN.
+        key (str): AllesAPIN key.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case any meta instructions need to be
+            injected or wrapped around the prompt.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+    """
+
+    is_api: bool = True
+
+    def __init__(self,
+                 path: str,
+                 url: str,
+                 key: str,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 max_seq_len: int = 2048,
+                 meta_template: Optional[Dict] = None,
+                 retry: int = 2):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         rpm_verbose=rpm_verbose,
+                         meta_template=meta_template,
+                         retry=retry)
+        self.url = url
+        self.headers = {
+            'alles-apin-token': key,
+            'content-type': 'application/json',
+        }
+
+    def _generate(self, input: str or PromptList, max_out_len: int,
+                  temperature: float) -> str:
+        """Generate results given an input.
+
+        Args:
+            input (str or PromptList): A string or PromptList.
+                The PromptList should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+            temperature (float): What sampling temperature to use,
+                between 0 and 2. Higher values like 0.8 will make the output
+                more random, while lower values like 0.2 will make it more
+                focused and deterministic.
+
+        Returns:
+            str: The generated string.
+        """
+        assert isinstance(input, (str, PromptList))
+
+        if isinstance(input, str):
+            messages = [{'role': 'user', 'content': input}]
+        else:
+            messages = []
+            for item in input:
+                msg = {'content': item['prompt']}
+                if item['role'] == 'HUMAN':
+                    msg['role'] = 'user'
+                elif item['role'] == 'BOT':
+                    msg['role'] = 'assistant'
+                elif item['role'] == 'SYSTEM':
+                    msg['role'] = 'system'
+                messages.append(msg)
+            # When an agent is involved, the last message is expected
+            # to come from the user or system role.
+            assert msg['role'] in ['user', 'system']
+        data = {
+            'model': self.path,
+            'messages': messages,
+        }
+
+        for _ in range(self.retry):
+            self.wait()
+            raw_response = requests.post(self.url,
+                                         headers=self.headers,
+                                         data=json.dumps(data))
+            try:
+                response = raw_response.json()
+            except requests.JSONDecodeError:
+                self.logger.error('JsonDecode error, got %s',
+                                  str(raw_response.content))
+                continue
+            if raw_response.status_code == 200 and response[
+                    'msgCode'] == '10000':
+                data = response['data']
+                choices = data['choices']
+                if choices is None:
+                    self.logger.error(data)
+                else:
+                    return choices[0]['message']['content'].strip()
+            self.logger.error(response['msg'])
+
+        raise RuntimeError('API call failed.')
+
+    def get_token_len(self, prompt: str) -> int:
+        """Get the length of the tokenized string, using the tiktoken
+        encoding that matches the model. Users are encouraged to override
+        this method if a more accurate length is needed.
+
+        Args:
+            prompt (str): Input string.
+
+        Returns:
+            int: Length of the tokenized input.
+        """
+        enc = self.tiktoken.encoding_for_model(self.path)
+        return len(enc.encode(prompt))
diff --git a/opencompass/runners/local_api.py b/opencompass/runners/local_api.py
index 04f78cf97..8ec3df55f 100644
--- a/opencompass/runners/local_api.py
+++ b/opencompass/runners/local_api.py
@@ -172,7 +172,8 @@ def __init__(self,
         self.max_num_workers = max_num_workers
         self.concurrent_users = concurrent_users
         assert task['type'] in [
-            'OpenICLInferTask', 'opencompass.tasks.OpenICLInferTask'
+            'OpenICLInferTask',
+            'opencompass.tasks.OpenICLInferTask',
         ], 'Only supported for api infer task.'
 
     def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
diff --git a/opencompass/summarizers/alignmentbench.py b/opencompass/summarizers/alignmentbench.py
index 4265e671e..69a11d144 100644
--- a/opencompass/summarizers/alignmentbench.py
+++ b/opencompass/summarizers/alignmentbench.py
@@ -15,7 +15,7 @@
 except ImportError:
     from_csv = None
 
-from opencompass.utils import dataset_abbr_from_cfg
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 
 CATEGORIES = {
     '中文推理': ['数学计算', '逻辑推理'],
@@ -91,6 +91,10 @@ class AlignmentBenchSummarizer:
     def __init__(self, config: ConfigDict) -> None:
         self.tasks = []
         self.cfg = config
+        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+        self.eval_model_abbrs = [
+            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+        ]
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -116,6 +120,8 @@ def summarize(self,
         fout2 = osp.join(output_dir, 'capability.csv')
         fout_flag, fout_flag2 = 0, 0
         for subdir in os.listdir(results_folder):
+            if subdir not in self.eval_model_abbrs:
+                continue
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
                 model = subdir