[Sync] deprecate old mbpps (#1064)

open-compass · Apr 19, 2024 · 8c85edd · 8c85edd
1 parent c172401
commit 8c85edd
Show file tree

Hide file tree

Showing 95 changed files with 1,506 additions and 408 deletions.
diff --git a/configs/datasets/collections/base_core.py b/configs/datasets/collections/base_core.py
@@ -15,6 +15,6 @@
     from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
     from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
     from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
-    from ..mbpp.sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
+    from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
 
 datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
diff --git a/configs/datasets/collections/base_medium.py b/configs/datasets/collections/base_medium.py
@@ -7,7 +7,7 @@
     from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
     from ..bbh.bbh_gen_5b92b0 import bbh_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets

diff --git a/configs/datasets/collections/base_medium_llama.py b/configs/datasets/collections/base_medium_llama.py
@@ -7,7 +7,7 @@
     from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
     from ..bbh.bbh_gen_5b92b0 import bbh_datasets
     from ..humaneval.humaneval_gen_a82cae import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets

diff --git a/configs/datasets/collections/base_small.py b/configs/datasets/collections/base_small.py
@@ -11,7 +11,7 @@
     from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
     from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..lambada.lambada_gen_217e11 import lambada_datasets
     from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
     from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets

diff --git a/configs/datasets/collections/chat_core.py b/configs/datasets/collections/chat_core.py
@@ -15,6 +15,6 @@
     from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
     from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
+    from ..mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
 
 datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
diff --git a/configs/datasets/collections/chat_medium.py b/configs/datasets/collections/chat_medium.py
@@ -7,7 +7,7 @@
     from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
     from ..bbh.bbh_gen_5b92b0 import bbh_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets

diff --git a/configs/datasets/collections/chat_small.py b/configs/datasets/collections/chat_small.py
@@ -12,7 +12,7 @@
     from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
     from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..lambada.lambada_gen_217e11 import lambada_datasets
     from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
     from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets

diff --git a/configs/datasets/collections/leaderboard/qwen.py b/configs/datasets/collections/leaderboard/qwen.py
@@ -44,7 +44,7 @@
     from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
     from ...drop.drop_gen_8a9ed9 import drop_datasets
     from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
-    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ...bbh.bbh_gen_5bf00b import bbh_datasets
 
 

diff --git a/configs/datasets/collections/leaderboard/qwen_chat.py b/configs/datasets/collections/leaderboard/qwen_chat.py
@@ -44,7 +44,7 @@
     from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
     from ...drop.drop_gen_8a9ed9 import drop_datasets
     from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
-    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ...bbh.bbh_gen_5b92b0 import bbh_datasets
 
 

diff --git a/configs/datasets/gpqa/gpqa_gen_015262.py b/configs/datasets/gpqa/gpqa_gen_015262.py
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GPQADataset, GPQAEvaluator
+from opencompass.utils import first_option_postprocess
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+gpqa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
+                                          '(A){A}\n'
+                                          '(B){B}\n'
+                                          '(C){C}\n'
+                                          '(D){D}\n'
+                                          'Format your response as follows: "The correct answer is (insert answer here)"'),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
+                     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+gpqa_datasets = []
+gpqa_subsets = {
+    'extended': 'gpqa_extended.csv',
+    'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in list(gpqa_subsets.keys()):
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg)
+    )
diff --git a/configs/datasets/mbpp/mbpp_gen_1e1056.py → ...tasets/mbpp/deprecated_mbpp_gen_1e1056.py b/configs/datasets/mbpp/mbpp_gen_1e1056.py → ...tasets/mbpp/deprecated_mbpp_gen_1e1056.py
diff --git a/configs/datasets/mbpp/mbpp_gen_6590b0.py → ...tasets/mbpp/deprecated_mbpp_gen_6590b0.py b/configs/datasets/mbpp/mbpp_gen_6590b0.py → ...tasets/mbpp/deprecated_mbpp_gen_6590b0.py
diff --git a/configs/datasets/mbpp/mbpp_gen_caa7ab.py → ...tasets/mbpp/deprecated_mbpp_gen_caa7ab.py b/configs/datasets/mbpp/mbpp_gen_caa7ab.py → ...tasets/mbpp/deprecated_mbpp_gen_caa7ab.py
diff --git a/...gs/datasets/mbpp/mbpp_passk_gen_1e1056.py → .../mbpp/deprecated_mbpp_passk_gen_1e1056.py b/...gs/datasets/mbpp/mbpp_passk_gen_1e1056.py → .../mbpp/deprecated_mbpp_passk_gen_1e1056.py
diff --git a/...datasets/mbpp/mbpp_repeat10_gen_1e1056.py → ...pp/deprecated_mbpp_repeat10_gen_1e1056.py b/...datasets/mbpp/mbpp_repeat10_gen_1e1056.py → ...pp/deprecated_mbpp_repeat10_gen_1e1056.py
diff --git a/...atasets/mbpp/sanitized_mbpp_gen_1e1056.py → ...p/deprecated_sanitized_mbpp_gen_1e1056.py b/...atasets/mbpp/sanitized_mbpp_gen_1e1056.py → ...p/deprecated_sanitized_mbpp_gen_1e1056.py
diff --git a/...atasets/mbpp/sanitized_mbpp_gen_cb43ef.py → ...p/deprecated_sanitized_mbpp_gen_cb43ef.py b/...atasets/mbpp/sanitized_mbpp_gen_cb43ef.py → ...p/deprecated_sanitized_mbpp_gen_cb43ef.py
diff --git a/...s/mbpp/sanitized_mbpp_passk_gen_1e1056.py → ...ecated_sanitized_mbpp_passk_gen_1e1056.py b/...s/mbpp/sanitized_mbpp_passk_gen_1e1056.py → ...ecated_sanitized_mbpp_passk_gen_1e1056.py
diff --git a/...bpp/sanitized_mbpp_repeat10_gen_1e1056.py → ...ted_sanitized_mbpp_repeat10_gen_1e1056.py b/...bpp/sanitized_mbpp_repeat10_gen_1e1056.py → ...ted_sanitized_mbpp_repeat10_gen_1e1056.py
diff --git a/configs/datasets/mbpp/mbpp_gen.py b/configs/datasets/mbpp/mbpp_gen.py
@@ -1,4 +1,4 @@
 from mmengine.config import read_base
 
 with read_base():
-    from .mbpp_gen_1e1056 import mbpp_datasets  # noqa: F401, F403
+    from .mbpp_gen_830460 import mbpp_datasets  # noqa: F401, F403
diff --git a/configs/datasets/mbpp/mbpp_gen_5d6316.py b/configs/datasets/mbpp/mbpp_gen_5d6316.py
diff --git a/configs/datasets/mbpp/mbpp_gen_78c1bc.py → configs/datasets/mbpp/mbpp_gen_830460.py b/configs/datasets/mbpp/mbpp_gen_78c1bc.py → configs/datasets/mbpp/mbpp_gen_830460.py
@@ -10,13 +10,13 @@
         type=PromptTemplate,
         template=dict(
             round=[
-                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
                 dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n  res = tuple(set(test_tup1) & set(test_tup2))\r\n  return (res)' \n[DONE] \n\n "),
 
-                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"),
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
                 dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n    result = False\r\n    for i in range(2,int(math.sqrt(n)) + 1):\r\n        if n % i == 0:\r\n            result = True\r\n    return result' \n[DONE] \n\n "),
 
-                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
                 dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n  largest_nums = hq.nlargest(n, nums)\r\n  return largest_nums' \n[DONE] \n\n "),
 
                 dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list}  \n"),
@@ -25,7 +25,7 @@
         ),
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=GenInferencer),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
 )
 
 mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")

diff --git a/configs/datasets/mbpp/mbpp_passk_gen_830460.py b/configs/datasets/mbpp/mbpp_passk_gen_830460.py
@@ -0,0 +1,42 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
+
+mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
+
+mbpp_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
+                dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n  res = tuple(set(test_tup1) & set(test_tup2))\r\n  return (res)' \n[DONE] \n\n "),
+
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
+                dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n    result = False\r\n    for i in range(2,int(math.sqrt(n)) + 1):\r\n        if n % i == 0:\r\n            result = True\r\n    return result' \n[DONE] \n\n "),
+
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
+                dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n  largest_nums = hq.nlargest(n, nums)\r\n  return largest_nums' \n[DONE] \n\n "),
+
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list}  \n"),
+                dict(role="BOT", prompt="[BEGIN]\n"),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
+
+mbpp_datasets = [
+    dict(
+        type=MBPPDataset_V2,
+        abbr="mbpp_passk",
+        path="./data/mbpp/mbpp.jsonl",
+        reader_cfg=mbpp_reader_cfg,
+        infer_cfg=mbpp_infer_cfg,
+        eval_cfg=mbpp_eval_cfg,
+    )
+]
diff --git a/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py b/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py
@@ -0,0 +1,45 @@
+# This config is used for pass@k evaluation with dataset repetition,
+# for models that cannot generate multiple responses for a single input
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
+
+mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
+
+mbpp_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
+                dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n  res = tuple(set(test_tup1) & set(test_tup2))\r\n  return (res)' \n[DONE] \n\n "),
+
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
+                dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n    result = False\r\n    for i in range(2,int(math.sqrt(n)) + 1):\r\n        if n % i == 0:\r\n            result = True\r\n    return result' \n[DONE] \n\n "),
+
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
+                dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n  largest_nums = hq.nlargest(n, nums)\r\n  return largest_nums' \n[DONE] \n\n "),
+
+                dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list}  \n"),
+                dict(role="BOT", prompt="[BEGIN]\n"),
+            ],
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
+
+mbpp_datasets = [
+    dict(
+        type=MBPPDataset_V2,
+        abbr="mbpp_repeat10",
+        path="./data/mbpp/mbpp.jsonl",
+        num_repeats=10,
+        reader_cfg=mbpp_reader_cfg,
+        infer_cfg=mbpp_infer_cfg,
+        eval_cfg=mbpp_eval_cfg,
+    )
+]