diff --git a/configs/datasets/IFEval/IFEval.md b/configs/datasets/IFEval/IFEval.md
index dc1864527..17ac85775 100644
--- a/configs/datasets/IFEval/IFEval.md
+++ b/configs/datasets/IFEval/IFEval.md
@@ -36,10 +36,12 @@ Hark! Hearken to the tale of thy journey to the land of the rising sun, Japan. \
 ## Evaluation results
 
 ```
-dataset    version    metric      mode      qwen-72b-chat-hf    mistral-7b-instruct-v0.2-hf    mixtral-8x7b-instruct-v0.1    chatglm3-6b-hf
----------  ---------  ----------  ------  ------------------  -----------------------------  ----------------------------  ----------------
-IFEval     27a9cc     strict_acc  gen                  43.62                          49.17                         48.98             29.76
-IFEval     27a9cc     loose_acc   gen                  45.47                          53.97                         54.71             32.16
+dataset    version    metric                        mode      baichuan2-7b-chat-hf    baichuan2-13b-chat-hf    internlm2-chat-7b-hf    internlm2-chat-20b-hf    llama-2-7b-chat-hf    llama-2-13b-chat-hf
+---------  ---------  ----------------------------  ------  ----------------------  -----------------------  ----------------------  -----------------------  --------------------  ---------------------
+IFEval     3321a3     Prompt-level-strict-accuracy  gen                      36.04                    35.49                   38.26                    33.09                 33.46                  33.64
+IFEval     3321a3     Inst-level-strict-accuracy    gen                      46.76                    46.76                   49.16                    45.32                 45.68                  45.44
+IFEval     3321a3     Prompt-level-loose-accuracy   gen                      37.52                    37.71                   42.51                    39.37                 43.81                  47.32
+IFEval     3321a3     Inst-level-loose-accuracy     gen                      48.44                    49.16                   53.72                    51.08                 55.64                  58.03
 ```
 
 ## Reference
diff --git a/opencompass/datasets/IFEval/ifeval.py b/opencompass/datasets/IFEval/ifeval.py
index b2eb23479..472f6a7b9 100644
--- a/opencompass/datasets/IFEval/ifeval.py
+++ b/opencompass/datasets/IFEval/ifeval.py
@@ -27,7 +27,9 @@ def load(path):
 class IFEvaluator(BaseEvaluator):
 
     def score(self, predictions, references):
-        results = []
+        results = dict()
+        for metric in ('strict', 'loose'):
+            results[metric] = []
         for pred, refer in zip(predictions, references):
             input = InputExample(
                 key=refer['key'],
@@ -38,15 +40,29 @@ def score(self, predictions, references):
             for k in list(kwarg.keys()):
                 if kwarg[k] is None:
                     kwarg.pop(k, None)
-            result = dict(
-                strict=test_instruction_following_strict(input, pred),
-                loose=test_instruction_following_loose(input, pred),
-            )
-            results.append(result)
-        strict = sum(
-            [result['strict'].follow_all_instructions
-             for result in results]) / len(results)
-        loose = sum(
-            [result['loose'].follow_all_instructions
-             for result in results]) / len(results)
-        return dict(strict_acc=strict * 100, loose_acc=loose * 100)
+            results['strict'].append(
+                test_instruction_following_strict(input, pred))
+            results['loose'].append(
+                test_instruction_following_loose(input, pred))
+        final_scores = dict()
+        for metric in ('strict', 'loose'):
+            prompt_total = 0
+            prompt_correct = 0
+            inst_total = 0
+            inst_correct = 0
+
+            for example in results[metric]:
+                follow_instruction_list = example.follow_instruction_list
+                instruction_id_list = example.instruction_id_list
+
+                prompt_total += 1
+                if all(follow_instruction_list):
+                    prompt_correct += 1
+
+                inst_total += len(instruction_id_list)
+                inst_correct += sum(follow_instruction_list)
+            prompt_score = f'Prompt-level-{metric}-accuracy'
+            inst_score = f'Inst-level-{metric}-accuracy'
+            final_scores[prompt_score] = prompt_correct / prompt_total * 100
+            final_scores[inst_score] = inst_correct / inst_total * 100
+        return final_scores
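
For reviewers, here is a minimal standalone sketch (not part of the patch) of the aggregation the new `score()` performs: prompt-level accuracy counts a prompt as correct only when all of its instructions are followed, while instruction-level accuracy counts each instruction on its own. `FakeExample` and `aggregate` are illustrative names only; the real evaluator operates on the result objects of `test_instruction_following_strict` / `test_instruction_following_loose`, which expose the same `instruction_id_list` and `follow_instruction_list` attributes used here.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class FakeExample:
    """Hypothetical stand-in for the per-prompt result objects returned by
    test_instruction_following_strict / test_instruction_following_loose."""
    instruction_id_list: List[str]
    follow_instruction_list: List[bool]


def aggregate(examples, metric='strict'):
    # Prompt-level: a prompt is correct only if every instruction in it is
    # followed. Inst-level: each instruction is scored individually.
    prompt_total = prompt_correct = inst_total = inst_correct = 0
    for example in examples:
        prompt_total += 1
        if all(example.follow_instruction_list):
            prompt_correct += 1
        inst_total += len(example.instruction_id_list)
        inst_correct += sum(example.follow_instruction_list)
    return {
        f'Prompt-level-{metric}-accuracy': prompt_correct / prompt_total * 100,
        f'Inst-level-{metric}-accuracy': inst_correct / inst_total * 100,
    }


# Toy data: one prompt follows its single instruction, the other follows 1 of 2.
toy = [
    FakeExample(['length_constraints:number_words'], [True]),
    FakeExample(['detectable_format:title', 'keywords:existence'], [True, False]),
]
print(aggregate(toy))
# -> Prompt-level-strict-accuracy: 50.0, Inst-level-strict-accuracy: ~66.7
```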