[Fix] fix ifeval (#909)
jingmingzhuo authored Feb 23, 2024
1 parent 45c606b commit 53fe788
Showing 2 changed files with 35 additions and 17 deletions.
10 changes: 6 additions & 4 deletions configs/datasets/IFEval/IFEval.md
@@ -36,10 +36,12 @@ Hark! Hearken to the tale of thy journey to the land of the rising sun, Japan. \
 ## Evaluation results
 
 ```
-dataset    version    metric      mode    qwen-72b-chat-hf    mistral-7b-instruct-v0.2-hf    mixtral-8x7b-instruct-v0.1    chatglm3-6b-hf
----------  ---------  ----------  ------  ------------------  -----------------------------  ----------------------------  ----------------
-IFEval     27a9cc     strict_acc  gen     43.62               49.17                          48.98                         29.76
-IFEval     27a9cc     loose_acc   gen     45.47               53.97                          54.71                         32.16
+dataset    version    metric                        mode    baichuan2-7b-chat-hf    baichuan2-13b-chat-hf    internlm2-chat-7b-hf    internlm2-chat-20b-hf    llama-2-7b-chat-hf    llama-2-13b-chat-hf
+---------  ---------  ----------------------------  ------  ----------------------  -----------------------  ----------------------  -----------------------  --------------------  ---------------------
+IFEval     3321a3     Prompt-level-strict-accuracy  gen     36.04                   35.49                    38.26                   33.09                    33.46                 33.64
+IFEval     3321a3     Inst-level-strict-accuracy    gen     46.76                   46.76                    49.16                   45.32                    45.68                 45.44
+IFEval     3321a3     Prompt-level-loose-accuracy   gen     37.52                   37.71                    42.51                   39.37                    43.81                 47.32
+IFEval     3321a3     Inst-level-loose-accuracy     gen     48.44                   49.16                    53.72                   51.08                    55.64                 58.03
 ```
 
 ## Reference
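For readers comparing the old and new tables: the replacement metrics are IFEval's standard four. Prompt-level accuracy credits a response only when every instruction attached to its prompt is followed, while Inst-level accuracy credits each satisfied instruction individually. A minimal sketch of the difference, using made-up per-prompt booleans (purely illustrative, not dataset values):

```python
# One list per prompt; one boolean per instruction in that prompt
# (hypothetical values for illustration only).
follow_lists = [
    [True, True],         # every instruction followed -> prompt counts
    [False],              # not followed
    [True, True, False],  # 2 of 3 followed -> prompt does not count
]

# Prompt-level: all-or-nothing per prompt.
prompt_acc = sum(all(fl) for fl in follow_lists) / len(follow_lists) * 100

# Inst-level: each instruction counts on its own.
inst_acc = sum(map(sum, follow_lists)) / sum(map(len, follow_lists)) * 100

print(f'{prompt_acc:.2f}, {inst_acc:.2f}')  # 33.33, 66.67
```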
42 changes: 29 additions & 13 deletions opencompass/datasets/IFEval/ifeval.py
@@ -27,7 +27,9 @@ def load(path):
 class IFEvaluator(BaseEvaluator):
 
     def score(self, predictions, references):
-        results = []
+        results = dict()
+        for metric in ('strict', 'loose'):
+            results[metric] = []
         for pred, refer in zip(predictions, references):
             input = InputExample(
                 key=refer['key'],
@@ -38,15 +40,29 @@ def score(self, predictions, references):
                 for k in list(kwarg.keys()):
                     if kwarg[k] is None:
                         kwarg.pop(k, None)
-            result = dict(
-                strict=test_instruction_following_strict(input, pred),
-                loose=test_instruction_following_loose(input, pred),
-            )
-            results.append(result)
-        strict = sum(
-            [result['strict'].follow_all_instructions
-             for result in results]) / len(results)
-        loose = sum(
-            [result['loose'].follow_all_instructions
-             for result in results]) / len(results)
-        return dict(strict_acc=strict * 100, loose_acc=loose * 100)
+            results['strict'].append(
+                test_instruction_following_strict(input, pred))
+            results['loose'].append(
+                test_instruction_following_loose(input, pred))
+        final_scores = dict()
+        for metric in ('strict', 'loose'):
+            prompt_total = 0
+            prompt_correct = 0
+            inst_total = 0
+            inst_correct = 0
+
+            for example in results[metric]:
+                follow_instruction_list = example.follow_instruction_list
+                instruction_id_list = example.instruction_id_list
+
+                prompt_total += 1
+                if all(follow_instruction_list):
+                    prompt_correct += 1
+
+                inst_total += len(instruction_id_list)
+                inst_correct += sum(follow_instruction_list)
+            prompt_score = f'Prompt-level-{metric}-accuracy'
+            inst_score = f'Inst-level-{metric}-accuracy'
+            final_scores[prompt_score] = prompt_correct / prompt_total * 100
+            final_scores[inst_score] = inst_correct / inst_total * 100
+        return final_scores
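To see what the reworked score() returns, here is a self-contained sketch of the same aggregation using stand-in objects. SimpleNamespace substitutes for the result type returned by test_instruction_following_strict / test_instruction_following_loose, and the boolean values are made up; only the field names (instruction_id_list, follow_instruction_list) and the loop logic mirror the diff:

```python
from types import SimpleNamespace

# Stand-ins for the per-example objects that
# test_instruction_following_strict/loose return (values are made up).
results = {
    'strict': [
        SimpleNamespace(instruction_id_list=['keywords', 'length'],
                        follow_instruction_list=[True, False]),
        SimpleNamespace(instruction_id_list=['format'],
                        follow_instruction_list=[True]),
    ],
    'loose': [
        SimpleNamespace(instruction_id_list=['keywords', 'length'],
                        follow_instruction_list=[True, True]),
        SimpleNamespace(instruction_id_list=['format'],
                        follow_instruction_list=[True]),
    ],
}

final_scores = dict()
for metric in ('strict', 'loose'):
    prompt_total = prompt_correct = 0  # prompts seen / fully-followed prompts
    inst_total = inst_correct = 0      # instructions seen / followed
    for example in results[metric]:
        prompt_total += 1
        if all(example.follow_instruction_list):
            prompt_correct += 1
        inst_total += len(example.instruction_id_list)
        inst_correct += sum(example.follow_instruction_list)
    final_scores[f'Prompt-level-{metric}-accuracy'] = (
        prompt_correct / prompt_total * 100)
    final_scores[f'Inst-level-{metric}-accuracy'] = (
        inst_correct / inst_total * 100)

print(final_scores)
# {'Prompt-level-strict-accuracy': 50.0,
#  'Inst-level-strict-accuracy': 66.66666666666666,
#  'Prompt-level-loose-accuracy': 100.0,
#  'Inst-level-loose-accuracy': 100.0}
```

Note how the fix changes the output shape: the old code returned two keys (strict_acc, loose_acc) counting only fully-followed prompts, while the new loop reports both prompt-level and instruction-level accuracy for each mode, matching the metric names in the updated IFEval.md table.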
