[Fix] Update alignmentbench (open-compass#704)
* update alignmentbench

* update alignmentbench

* update alignmentbench
tonysy authored Dec 14, 2023
1 parent bbf33e2 commit bdc0d10
Showing 11 changed files with 158 additions and 46 deletions.
8 changes: 3 additions & 5 deletions configs/datasets/subjective_cmp/alignment_bench.py
@@ -1,5 +1,3 @@
-from os import getenv as gv

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
@@ -15,10 +13,10 @@
subjective_all_sets = [
"alignment_bench",
]
-data_path =gv('WORKDIR')+"data/subjective/alignment_bench"
+data_path ="data/subjective/alignment_bench"

-alignment_bench_config_path = gv('WORKDIR')+"data/subjective/alignment_bench/config"
-alignment_bench_config_name = 'multi-dimension'
+alignment_bench_config_path = "data/subjective/alignment_bench/"
+alignment_bench_config_name = 'config/multi-dimension'

subjective_datasets = []

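Note on the path change above: the judge-config file resolves to the same location as before; only the gv('WORKDIR') prefix is gone, and the 'config/' segment moved from the path into the name. A quick sanity check, assuming the dataset joins the two values with os.path.join (the '/workdir/' prefix below is illustrative):

import os.path as osp

# Before: 'config' lived in the path, the dimension name stood alone.
old = osp.join('/workdir/data/subjective/alignment_bench/config', 'multi-dimension')
# After: the path is repo-relative and 'config/' moved into the name.
new = osp.join('data/subjective/alignment_bench/', 'config/multi-dimension')

print(old)  # /workdir/data/subjective/alignment_bench/config/multi-dimension
print(new)  # data/subjective/alignment_bench/config/multi-dimension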
@@ -11,25 +11,19 @@

datasets = [*subjective_datasets]

-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlignmentBenchSummarizer
-models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]

-api_meta_template = dict(
-    round=[
-        dict(role='HUMAN', api_role='HUMAN'),
-        dict(role='BOT', api_role='BOT', generate=True)
-    ],
-    reserved_roles=[
-        dict(role='SYSTEM', api_role='SYSTEM'),
-    ],
-)

+# ------------- Inference Stage ----------------------------------------

+models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
@@ -42,6 +36,10 @@
)


+# ------------- Evaluation Stage ----------------------------------------


+## ------------- JudgeLLM Configuration
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
@@ -50,44 +48,35 @@
)

judge_model = dict(
-    type=HuggingFaceChatGLM3,
-    abbr='chatglm3-6b-hf',
-    path='THUDM/chatglm3-6b',
-    tokenizer_path='THUDM/chatglm3-6b',
-    model_kwargs=dict(
-        device_map='auto',
-        trust_remote_code=True,
-    ),
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        trust_remote_code=True,
-    ),
+    abbr='GPT4-Turbo',
+    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
+    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    url='xxxx',
    meta_template=api_meta_template,
-    max_out_len=100,
-    max_seq_len=4096,
-    batch_size=1,
-    run_cfg=dict(num_gpus=1, num_procs=1)
-)
+    query_per_second=16,
+    max_out_len=2048,
+    max_seq_len=2048,
+    batch_size=8
+)

+## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='singlescore',
        models = [*hf_baichuan2_7b]
    ),
    runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
+        type=LocalRunner,
+        max_num_workers=2,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model
        )),
)
-work_dir = gv('WORKDIR')+'alignment_bench/'

summarizer = dict(
    type=AlignmentBenchSummarizer,
)

+work_dir = 'outputs/alignment_bench/'
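The key='xxxx' and url='xxxx' placeholders above must be replaced at run time. A minimal sketch of one way to keep real credentials out of the config, assuming plain os.environ lookups (the ALLES_APIN_URL variable name is hypothetical, not part of OpenCompass):

import os

judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAIAllesAPIN,
    path='gpt-4-1106-preview',
    # Fall back to the placeholders when the variables are unset.
    key=os.environ.get('OPENAI_API_KEY', 'xxxx'),
    url=os.environ.get('ALLES_APIN_URL', 'xxxx'),  # hypothetical variable name
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
)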
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion configs/models/hf_internlm/hf_internlm_chat_20b.py
@@ -3,7 +3,7 @@

_meta_template = dict(
    round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
)
2 changes: 1 addition & 1 deletion configs/models/hf_internlm/hf_internlm_chat_7b.py
@@ -3,7 +3,7 @@

_meta_template = dict(
    round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
)
2 changes: 1 addition & 1 deletion configs/models/hf_internlm/hf_internlm_chat_7b_8k.py
@@ -3,7 +3,7 @@

_meta_template = dict(
    round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
)
2 changes: 1 addition & 1 deletion opencompass/models/__init__.py
@@ -17,7 +17,7 @@
from .mixtral import Mixtral # noqa: F401
from .modelscope import ModelScope, ModelScopeCausalLM # noqa: F401, F403
from .moonshot_api import MoonShot # noqa: F401
-from .openai_api import OpenAI  # noqa: F401
+from .openai_api import OpenAI, OpenAIAllesAPIN  # noqa: F401
from .pangu_api import PanGu # noqa: F401
from .sensetime_api import SenseTime # noqa: F401
from .turbomind import TurboMindModel # noqa: F401
118 changes: 118 additions & 0 deletions opencompass/models/openai_api.py
@@ -310,3 +310,121 @@ def bin_trim(self, prompt: str, num_token: int) -> str:
        elif self.mode == 'rear':
            prompt = sep.join(words[:l])
        return prompt


@MODELS.register_module(name=['OpenAIAllesAPIN'])
class OpenAIAllesAPIN(OpenAI):
    """Model wrapper around OpenAI-AllesAPIN.

    Args:
        path (str): The name of OpenAI's model.
        url (str): URL to AllesAPIN.
        key (str): AllesAPIN key.
        query_per_second (int): The maximum queries allowed per second
            between two consecutive calls of the API. Defaults to 1.
        max_seq_len (int): Unused here.
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case any meta instructions must be
            injected or wrapped.
        retry (int): Number of retries if the API call fails. Defaults to 2.
    """

    is_api: bool = True

    def __init__(self,
                 path: str,
                 url: str,
                 key: str,
                 query_per_second: int = 1,
                 rpm_verbose: bool = False,
                 max_seq_len: int = 2048,
                 meta_template: Optional[Dict] = None,
                 retry: int = 2):
        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         query_per_second=query_per_second,
                         rpm_verbose=rpm_verbose,
                         meta_template=meta_template,
                         retry=retry)
        self.url = url
        self.headers = {
            'alles-apin-token': key,
            'content-type': 'application/json',
        }

    def _generate(self, input: str or PromptList, max_out_len: int,
                  temperature: float) -> str:
        """Generate results given an input.

        Args:
            input (str or PromptList): A string or PromptDict.
                The PromptDict should be organized in OpenCompass'
                API format.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic.

        Returns:
            str: The generated string.
        """
        assert isinstance(input, (str, PromptList))

        if isinstance(input, str):
            messages = [{'role': 'user', 'content': input}]
        else:
            messages = []
            for item in input:
                msg = {'content': item['prompt']}
                if item['role'] == 'HUMAN':
                    msg['role'] = 'user'
                elif item['role'] == 'BOT':
                    msg['role'] = 'assistant'
                elif item['role'] == 'SYSTEM':
                    msg['role'] = 'system'
                messages.append(msg)
            # The last message must come from the user or the system,
            # which can happen when an agent is involved.
            assert msg['role'] in ['user', 'system']
        data = {
            'model': self.path,
            'messages': messages,
        }

        for _ in range(self.retry):
            self.wait()
            raw_response = requests.post(self.url,
                                         headers=self.headers,
                                         data=json.dumps(data))
            try:
                response = raw_response.json()
            except requests.JSONDecodeError:
                self.logger.error('JsonDecode error, got %s',
                                  str(raw_response.content))
                continue
            if raw_response.status_code == 200 and response[
                    'msgCode'] == '10000':
                data = response['data']
                choices = data['choices']
                if choices is None:
                    self.logger.error(data)
                else:
                    return choices[0]['message']['content'].strip()
            self.logger.error(response['msg'])

        raise RuntimeError('API call failed.')

    def get_token_len(self, prompt: str) -> int:
        """Get lengths of the tokenized string. Only English and Chinese
        characters are counted for now. Users are encouraged to override this
        method if more accurate length is needed.

        Args:
            prompt (str): Input string.

        Returns:
            int: Length of the input tokens
        """
        enc = self.tiktoken.encoding_for_model(self.path)
        return len(enc.encode(prompt))
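For orientation, a sketch of how this wrapper might be exercised directly, outside a config file; the endpoint URL and token below are placeholders, and generate() is the batched entry point inherited from the base API model:

from opencompass.models import OpenAIAllesAPIN

model = OpenAIAllesAPIN(
    path='gpt-4-1106-preview',
    url='https://example.invalid/chat/completions',  # placeholder endpoint
    key='your-alles-apin-token',                     # placeholder token
    query_per_second=16,
    max_seq_len=2048,
)

# generate() fans each prompt out to the _generate() defined above.
outputs = model.generate(['Say hello.'], max_out_len=64)
print(outputs[0])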
3 changes: 2 additions & 1 deletion opencompass/runners/local_api.py
@@ -172,7 +172,8 @@ def __init__(self,
        self.max_num_workers = max_num_workers
        self.concurrent_users = concurrent_users
        assert task['type'] in [
-            'OpenICLInferTask', 'opencompass.tasks.OpenICLInferTask'
+            'OpenICLInferTask',
+            'opencompass.tasks.OpenICLInferTask',
        ], 'Only supported for api infer task.'

    def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
8 changes: 7 additions & 1 deletion opencompass/summarizers/alignmentbench.py
@@ -15,7 +15,7 @@
except ImportError:
    from_csv = None

-from opencompass.utils import dataset_abbr_from_cfg
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg

CATEGORIES = {
    '中文推理': ['数学计算', '逻辑推理'],
@@ -91,6 +91,10 @@ class AlignmentBenchSummarizer:
    def __init__(self, config: ConfigDict) -> None:
        self.tasks = []
        self.cfg = config
+        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+        self.eval_model_abbrs = [
+            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+        ]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -116,6 +120,8 @@ def summarize(self,
        fout2 = osp.join(output_dir, 'capability.csv')
        fout_flag, fout_flag2 = 0, 0
        for subdir in os.listdir(results_folder):
+            if subdir not in self.eval_model_abbrs:
+                continue
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model = subdir
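Effect of the added guard: only result subdirectories whose names match the abbreviation of a model listed under eval['partitioner']['models'] are summarized, so leftover runs from other models in the same work_dir no longer leak into the CSVs. A toy illustration with made-up directory names:

eval_model_abbrs = ['baichuan2-7b-hf']  # as produced by model_abbr_from_cfg

for subdir in ['baichuan2-7b-hf', 'some-older-run']:
    if subdir not in eval_model_abbrs:
        continue  # 'some-older-run' is skipped
    print('summarizing', subdir)  # only 'baichuan2-7b-hf' is processed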
