[Fix] fix ifeval (#909)
jingmingzhuo authored Feb 23, 2024
1 parent 45c606b commit 53fe788
Showing 2 changed files with 35 additions and 17 deletions.
10 changes: 6 additions & 4 deletions configs/datasets/IFEval/IFEval.md
@@ -36,10 +36,12 @@ Hark! Hearken to the tale of thy journey to the land of the rising sun, Japan. \
 ## Evaluation results
 
 ```
-dataset    version    metric      mode    qwen-72b-chat-hf    mistral-7b-instruct-v0.2-hf    mixtral-8x7b-instruct-v0.1    chatglm3-6b-hf
----------  ---------  ----------  ------  ------------------  -----------------------------  ----------------------------  ----------------
-IFEval     27a9cc     strict_acc  gen     43.62               49.17                          48.98                         29.76
-IFEval     27a9cc     loose_acc   gen     45.47               53.97                          54.71                         32.16
+dataset    version    metric                        mode    baichuan2-7b-chat-hf    baichuan2-13b-chat-hf    internlm2-chat-7b-hf    internlm2-chat-20b-hf    llama-2-7b-chat-hf    llama-2-13b-chat-hf
+---------  ---------  ----------------------------  ------  ----------------------  -----------------------  ----------------------  -----------------------  --------------------  ---------------------
+IFEval     3321a3     Prompt-level-strict-accuracy  gen     36.04                   35.49                    38.26                   33.09                    33.46                 33.64
+IFEval     3321a3     Inst-level-strict-accuracy    gen     46.76                   46.76                    49.16                   45.32                    45.68                 45.44
+IFEval     3321a3     Prompt-level-loose-accuracy   gen     37.52                   37.71                    42.51                   39.37                    43.81                 47.32
+IFEval     3321a3     Inst-level-loose-accuracy     gen     48.44                   49.16                    53.72                   51.08                    55.64                 58.03
 ```
 
 ## Reference
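For readers comparing the old and new tables: the replacement metrics are IFEval's standard four. Prompt-level accuracy credits a response only when every instruction attached to its prompt is followed, while Inst-level accuracy credits each satisfied instruction individually. A minimal sketch of the difference, using made-up per-prompt booleans (purely illustrative, not dataset values):

```python
# One list per prompt; one boolean per instruction in that prompt
# (hypothetical values for illustration only).
follow_lists = [
    [True, True],         # every instruction followed -> prompt counts
    [False],              # not followed
    [True, True, False],  # 2 of 3 followed -> prompt does not count
]

# Prompt-level: all-or-nothing per prompt.
prompt_acc = sum(all(fl) for fl in follow_lists) / len(follow_lists) * 100

# Inst-level: each instruction counts on its own.
inst_acc = sum(map(sum, follow_lists)) / sum(map(len, follow_lists)) * 100

print(f'{prompt_acc:.2f}, {inst_acc:.2f}')  # 33.33, 66.67
```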
42 changes: 29 additions & 13 deletions opencompass/datasets/IFEval/ifeval.py
@@ -27,7 +27,9 @@ def load(path):
 class IFEvaluator(BaseEvaluator):
 
     def score(self, predictions, references):
-        results = []
+        results = dict()
+        for metric in ('strict', 'loose'):
+            results[metric] = []
         for pred, refer in zip(predictions, references):
             input = InputExample(
                 key=refer['key'],
@@ -38,15 +40,29 @@ def score(self, predictions, references):
                 for k in list(kwarg.keys()):
                     if kwarg[k] is None:
                         kwarg.pop(k, None)
-            result = dict(
-                strict=test_instruction_following_strict(input, pred),
-                loose=test_instruction_following_loose(input, pred),
-            )
-            results.append(result)
-        strict = sum(
-            [result['strict'].follow_all_instructions
-             for result in results]) / len(results)
-        loose = sum(
-            [result['loose'].follow_all_instructions
-             for result in results]) / len(results)
-        return dict(strict_acc=strict * 100, loose_acc=loose * 100)
+            results['strict'].append(
+                test_instruction_following_strict(input, pred))
+            results['loose'].append(
+                test_instruction_following_loose(input, pred))
+        final_scores = dict()
+        for metric in ('strict', 'loose'):
+            prompt_total = 0
+            prompt_correct = 0
+            inst_total = 0
+            inst_correct = 0
+
+            for example in results[metric]:
+                follow_instruction_list = example.follow_instruction_list
+                instruction_id_list = example.instruction_id_list
+
+                prompt_total += 1
+                if all(follow_instruction_list):
+                    prompt_correct += 1
+
+                inst_total += len(instruction_id_list)
+                inst_correct += sum(follow_instruction_list)
+            prompt_score = f'Prompt-level-{metric}-accuracy'
+            inst_score = f'Inst-level-{metric}-accuracy'
+            final_scores[prompt_score] = prompt_correct / prompt_total * 100
+            final_scores[inst_score] = inst_correct / inst_total * 100
+        return final_scores
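To see what the reworked score() returns, here is a self-contained sketch of the same aggregation using stand-in objects. SimpleNamespace substitutes for the result type returned by test_instruction_following_strict / test_instruction_following_loose, and the boolean values are made up; only the field names (instruction_id_list, follow_instruction_list) and the loop logic mirror the diff:

```python
from types import SimpleNamespace

# Stand-ins for the per-example objects that
# test_instruction_following_strict/loose return (values are made up).
results = {
    'strict': [
        SimpleNamespace(instruction_id_list=['keywords', 'length'],
                        follow_instruction_list=[True, False]),
        SimpleNamespace(instruction_id_list=['format'],
                        follow_instruction_list=[True]),
    ],
    'loose': [
        SimpleNamespace(instruction_id_list=['keywords', 'length'],
                        follow_instruction_list=[True, True]),
        SimpleNamespace(instruction_id_list=['format'],
                        follow_instruction_list=[True]),
    ],
}

final_scores = dict()
for metric in ('strict', 'loose'):
    prompt_total = prompt_correct = 0  # prompts seen / fully-followed prompts
    inst_total = inst_correct = 0      # instructions seen / followed
    for example in results[metric]:
        prompt_total += 1
        if all(example.follow_instruction_list):
            prompt_correct += 1
        inst_total += len(example.instruction_id_list)
        inst_correct += sum(example.follow_instruction_list)
    final_scores[f'Prompt-level-{metric}-accuracy'] = (
        prompt_correct / prompt_total * 100)
    final_scores[f'Inst-level-{metric}-accuracy'] = (
        inst_correct / inst_total * 100)

print(final_scores)
# {'Prompt-level-strict-accuracy': 50.0,
#  'Inst-level-strict-accuracy': 66.66666666666666,
#  'Prompt-level-loose-accuracy': 100.0,
#  'Inst-level-loose-accuracy': 100.0}
```

Note how the fix changes the output shape: the old code returned two keys (strict_acc, loose_acc) counting only fully-followed prompts, while the new loop reports both prompt-level and instruction-level accuracy for each mode, matching the metric names in the updated IFEval.md table.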
