[Fix] Update alignmentbench #704

Merged (3 commits, Dec 14, 2023)
Changes from all commits

configs/datasets/subjective_cmp/alignment_bench.py (3 additions, 5 deletions)

@@ -1,5 +1,3 @@
-from os import getenv as gv
-
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -15,10 +13,10 @@
 subjective_all_sets = [
     "alignment_bench",
 ]
-data_path =gv('WORKDIR')+"data/subjective/alignment_bench"
+data_path ="data/subjective/alignment_bench"

-alignment_bench_config_path = gv('WORKDIR')+"data/subjective/alignment_bench/config"
-alignment_bench_config_name = 'multi-dimension'
+alignment_bench_config_path = "data/subjective/alignment_bench/"
+alignment_bench_config_name = 'config/multi-dimension'

 subjective_datasets = []
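
Note on the change above: the dataset paths no longer depend on a WORKDIR environment variable; they now resolve relative to the directory OpenCompass is launched from. A minimal pre-flight check you could run from the repo root (illustrative only, not part of the PR):

# Illustrative sanity check, assuming the repo-root layout used above;
# adjust the paths if your data lives elsewhere.
import os

for p in ('data/subjective/alignment_bench',
          'data/subjective/alignment_bench/config'):
    if not os.path.isdir(p):
        raise FileNotFoundError(f'missing dataset directory: {p}')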
(second file: the subjective evaluation config; file path not shown in this view)

@@ -11,25 +11,19 @@

 datasets = [*subjective_datasets]

-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
 from opencompass.partitioners import NaivePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.runners import LocalRunner
 from opencompass.runners import SlurmSequentialRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 from opencompass.summarizers import AlignmentBenchSummarizer
+models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]

-api_meta_template = dict(
-    round=[
-        dict(role='HUMAN', api_role='HUMAN'),
-        dict(role='BOT', api_role='BOT', generate=True)
-    ],
-    reserved_roles=[
-        dict(role='SYSTEM', api_role='SYSTEM'),
-    ],
-)

+# -------------Inference Stage ----------------------------------------

-models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]

 infer = dict(
     partitioner=dict(type=NaivePartitioner),
@@ -42,6 +36,10 @@
 )

+# -------------Evaluation Stage ----------------------------------------

+## ------------- JudgeLLM Configuration
 api_meta_template = dict(
     round=[
         dict(role='HUMAN', api_role='HUMAN'),
@@ -50,44 +48,35 @@
 )

 judge_model = dict(
-    type=HuggingFaceChatGLM3,
-    abbr='chatglm3-6b-hf',
-    path='THUDM/chatglm3-6b',
-    tokenizer_path='THUDM/chatglm3-6b',
-    model_kwargs=dict(
-        device_map='auto',
-        trust_remote_code=True,
-    ),
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        trust_remote_code=True,
-    ),
+    abbr='GPT4-Turbo',
+    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
+    key='xxxx',  # the key is read from $OPENAI_API_KEY, but you can also write it here
+    url='xxxx',
     meta_template=api_meta_template,
-    max_out_len=100,
-    max_seq_len=4096,
-    batch_size=1,
-    run_cfg=dict(num_gpus=1, num_procs=1)
-)
+    query_per_second=16,
+    max_out_len=2048,
+    max_seq_len=2048,
+    batch_size=8
+)

+## ------------- Evaluation Configuration
 eval = dict(
     partitioner=dict(
         type=SubjectiveNaivePartitioner,
         mode='singlescore',
+        models = [*hf_baichuan2_7b]
     ),
     runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
+        type=LocalRunner,
+        max_num_workers=2,
         task=dict(
             type=SubjectiveEvalTask,
             judge_cfg=judge_model
         )),
 )
-work_dir = gv('WORKDIR')+'alignment_bench/'

 summarizer = dict(
     type=AlignmentBenchSummarizer,
 )

+work_dir = 'outputs/alignment_bench/'
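
Note: the judge entry ships with placeholder key/url values. One way to fill them at runtime, sketched below; the environment-variable names are assumptions, not part of the PR or of OpenCompass:

# Sketch: populate the judge placeholders from the environment instead of
# hard-coding secrets; ALLES_APIN_TOKEN / ALLES_APIN_URL are hypothetical names.
import os

judge_model['key'] = os.environ.get('ALLES_APIN_TOKEN', judge_model['key'])
judge_model['url'] = os.environ.get('ALLES_APIN_URL', judge_model['url'])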
File renamed without changes.
File renamed without changes.
configs/models/hf_internlm/hf_internlm_chat_20b.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@

 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
     ],
 )
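
The template tweak above (repeated for the other InternLM chat configs below) only changes how a user turn is closed: the `<eoh>` token is dropped in favor of a bare newline. Roughly, a single round now renders as follows (sketch, not actual OpenCompass code):

# Illustrative rendering of the user turn under the old and new templates.
def render(question: str, end: str) -> str:
    return f'<|User|>:{question}{end}<|Bot|>:'

old = render('What is 2+2?', '<eoh>\n')   # before this PR
new = render('What is 2+2?', '\n')        # after this PR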
configs/models/hf_internlm/hf_internlm_chat_7b.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@

 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
     ],
 )
configs/models/hf_internlm/hf_internlm_chat_7b_8k.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@

 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
     ],
 )
opencompass/models/__init__.py (1 addition, 1 deletion)

@@ -17,7 +17,7 @@
 from .mixtral import Mixtral  # noqa: F401
 from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401, F403
 from .moonshot_api import MoonShot  # noqa: F401
-from .openai_api import OpenAI  # noqa: F401
+from .openai_api import OpenAI, OpenAIAllesAPIN  # noqa: F401
 from .pangu_api import PanGu  # noqa: F401
 from .sensetime_api import SenseTime  # noqa: F401
 from .turbomind import TurboMindModel  # noqa: F401
opencompass/models/openai_api.py (118 additions)
@@ -310,3 +310,121 @@ def bin_trim(self, prompt: str, num_token: int) -> str:
         elif self.mode == 'rear':
             prompt = sep.join(words[:l])
         return prompt
+
+
+@MODELS.register_module(name=['OpenAIAllesAPIN'])
+class OpenAIAllesAPIN(OpenAI):
+    """Model wrapper around OpenAI-AllesAPIN.
+
+    Args:
+        path (str): The name of OpenAI's model.
+        url (str): URL to AllesAPIN.
+        key (str): AllesAPIN key.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, used to inject or wrap meta instructions.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+    """
+
+    is_api: bool = True
+
+    def __init__(self,
+                 path: str,
+                 url: str,
+                 key: str,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 max_seq_len: int = 2048,
+                 meta_template: Optional[Dict] = None,
+                 retry: int = 2):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         rpm_verbose=rpm_verbose,
+                         meta_template=meta_template,
+                         retry=retry)
+        self.url = url
+        self.headers = {
+            'alles-apin-token': key,
+            'content-type': 'application/json',
+        }
+
+    def _generate(self, input: str or PromptList, max_out_len: int,
+                  temperature: float) -> str:
+        """Generate results given an input.
+
+        Args:
+            input (str or PromptList): A string or PromptDict.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+            temperature (float): What sampling temperature to use,
+                between 0 and 2. Higher values like 0.8 will make the output
+                more random, while lower values like 0.2 will make it more
+                focused and deterministic.
+
+        Returns:
+            str: The generated string.
+        """
+        assert isinstance(input, (str, PromptList))
+
+        if isinstance(input, str):
+            messages = [{'role': 'user', 'content': input}]
+        else:
+            messages = []
+            for item in input:
+                msg = {'content': item['prompt']}
+                if item['role'] == 'HUMAN':
+                    msg['role'] = 'user'
+                elif item['role'] == 'BOT':
+                    msg['role'] = 'assistant'
+                elif item['role'] == 'SYSTEM':
+                    msg['role'] = 'system'
+                messages.append(msg)
+            # when an agent is involved, the conversation may end with a
+            # 'user' or 'system' message rather than an assistant turn
+            assert msg['role'] in ['user', 'system']
+        data = {
+            'model': self.path,
+            'messages': messages,
+        }
+
+        for _ in range(self.retry):
+            self.wait()
+            raw_response = requests.post(self.url,
+                                         headers=self.headers,
+                                         data=json.dumps(data))
+            try:
+                response = raw_response.json()
+            except requests.JSONDecodeError:
+                self.logger.error('JsonDecode error, got %s',
+                                  str(raw_response.content))
+                continue
+            if raw_response.status_code == 200 and response[
+                    'msgCode'] == '10000':
+                data = response['data']
+                choices = data['choices']
+                if choices is None:
+                    self.logger.error(data)
+                else:
+                    return choices[0]['message']['content'].strip()
+            self.logger.error(response['msg'])
+
+        raise RuntimeError('API call failed.')
+
+    def get_token_len(self, prompt: str) -> int:
+        """Get the length of the tokenized string, using the tiktoken
+        encoding that matches ``self.path``. Override this method if a
+        more accurate length is needed.
+
+        Args:
+            prompt (str): Input string.
+
+        Returns:
+            int: Length of the input tokens
+        """
+        enc = self.tiktoken.encoding_for_model(self.path)
+        return len(enc.encode(prompt))
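
A minimal usage sketch for the new wrapper; the endpoint and token below are placeholders, and `generate` is inherited from the API base class:

# Sketch only: values are placeholders, and the url must point at a live
# AllesAPIN deployment for this to return anything.
from opencompass.models import OpenAIAllesAPIN

judge = OpenAIAllesAPIN(
    path='gpt-4-1106-preview',
    url='https://your-alles-apin-host/v1/chat/completions',  # hypothetical
    key='your-token',
    query_per_second=1,
)
print(judge.generate(['Reply with one word: ok'], max_out_len=8))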
opencompass/runners/local_api.py (2 additions, 1 deletion)

@@ -172,7 +172,8 @@ def __init__(self,
         self.max_num_workers = max_num_workers
         self.concurrent_users = concurrent_users
         assert task['type'] in [
-            'OpenICLInferTask', 'opencompass.tasks.OpenICLInferTask'
+            'OpenICLInferTask',
+            'opencompass.tasks.OpenICLInferTask',
         ], 'Only supported for api infer task.'

     def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
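
With the reflowed assertion, both spellings of the task type are accepted in a config (sketch):

# Both of these now pass the runner's task-type check.
task = dict(type='OpenICLInferTask')
task = dict(type='opencompass.tasks.OpenICLInferTask')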
opencompass/summarizers/alignmentbench.py (7 additions, 1 deletion)

@@ -15,7 +15,7 @@
 except ImportError:
     from_csv = None

-from opencompass.utils import dataset_abbr_from_cfg
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg

 CATEGORIES = {
     '中文推理': ['数学计算', '逻辑推理'],
@@ -91,6 +91,10 @@ class AlignmentBenchSummarizer:
     def __init__(self, config: ConfigDict) -> None:
         self.tasks = []
         self.cfg = config
+        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+        self.eval_model_abbrs = [
+            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+        ]

     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -116,6 +120,8 @@ def summarize(self,
         fout2 = osp.join(output_dir, 'capability.csv')
         fout_flag, fout_flag2 = 0, 0
         for subdir in os.listdir(results_folder):
+            if subdir not in self.eval_model_abbrs:
+                continue
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
                 model = subdir