
[Sync] Sync Internal (#941)
Leymore committed Mar 4, 2024
1 parent bbec7d8 commit b03d5dc
Showing 73 changed files with 2,264 additions and 903 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -91,8 +91,12 @@ docs/zh_cn/_build/

# sft config ignore list
configs/sft_cfg/*B_*
configs/sft_cfg/1B/*
configs/sft_cfg/7B/*
configs/sft_cfg/20B/*
configs/sft_cfg/60B/*
configs/sft_cfg/100B/*

configs/cky/
# in case llama clone in the opencompass
llama/
@@ -120,3 +124,6 @@
*.csv
*.npy
*.c

# aliyun
core.*
@@ -0,0 +1,42 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts

GaokaoBench_datasets = []
for folder, prompts in [
    ("Multiple-choice_Questions", MCQ_prompts),
    ("Fill-in-the-blank_Questions", FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            "input_columns": ["question"],
            "output_column": "answer",
        }
        infer_cfg = {
            "ice_template": {
                "type": PromptTemplate,
                "template": {"round": [{"role": "HUMAN", "prompt": p["prefix_prompt"] + "{question}"}]},
                "ice_token": "</E>",
            },
            "retriever": {"type": ZeroRetriever},
            "inferencer": {"type": GenInferencer, "max_out_len": 1024},
        }
        eval_cfg = {
            "evaluator": {"type": "GaokaoBenchEvaluator" + "_" + p["type"]},
            "pred_role": "BOT",
        }
        dataset = {
            "type": GaokaoBenchDataset,
            "abbr": "GaokaoBench_" + p["keyword"],
            "path": os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"),
            "reader_cfg": reader_cfg,
            "infer_cfg": infer_cfg,
            "eval_cfg": eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
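The dataset entries generated above are meant to be pulled into a top-level OpenCompass evaluation config rather than run on their own. A minimal sketch of that usage, assuming this file is importable as configs/datasets/GaokaoBench/GaokaoBench_gen.py and that some model config exists (the internlm path below is only a placeholder, not part of this commit):

from mmengine.config import read_base

with read_base():
    # dataset list built by the loop above (module path assumed)
    from .datasets.GaokaoBench.GaokaoBench_gen import GaokaoBench_datasets
    # any model config works here; this path is a placeholder
    from .models.hf_internlm.hf_internlm_7b import models

# OpenCompass picks up the module-level `datasets` and `models` lists
datasets = [*GaokaoBench_datasets]

# typically launched with something like: python run.py configs/eval_gaokao_bench.py (command assumed)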
@@ -0,0 +1,41 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts

GaokaoBench_datasets = []
for folder, prompts in [
    ("Multiple-choice_Questions", MCQ_prompts),
    ("Fill-in-the-blank_Questions", FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            "input_columns": ["question"],
            "output_column": "answer",
        }
        infer_cfg = {
            "prompt_template": {
                "type": PromptTemplate,
                "template": p["prefix_prompt"] + "{question}",
            },
            "retriever": {"type": ZeroRetriever},
            "inferencer": {"type": GenInferencer, "max_out_len": 1024},
        }
        eval_cfg = {
            "evaluator": {"type": "GaokaoBenchEvaluator" + "_" + p["type"]},
            "pred_role": "BOT",
        }
        dataset = {
            "type": GaokaoBenchDataset,
            "abbr": "GaokaoBench_" + p["keyword"],
            "path": os.path.join("data", "GAOKAO-BENCH", "data", folder, p["keyword"] + ".json"),
            "reader_cfg": reader_cfg,
            "infer_cfg": infer_cfg,
            "eval_cfg": eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
191 changes: 191 additions & 0 deletions configs/datasets/GaokaoBench/GaokaoBench_prompts.py

Large diffs are not rendered by default.

41 changes: 20 additions & 21 deletions configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py
@@ -4,37 +4,36 @@
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess

TheoremQA_reader_cfg = dict(
    input_columns=['Question', 'Answer_type'],
    output_column='Answer',
    train_split='test')
TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = "Please read a math problem, and then think step by step to derive the answer. The answer is decided by Answer Type. " \
    "If the Answer type in [bool], the answer needs to be True or False. " \
    "Else if the Answer type in [integer, float] , The answer needs to be in numerical form. " \
    "Else if the Answer type in [list of integer, list of float] , the answer needs to be a list of number like [2, 3, 4]. " \
    "Else if the Answer type in [option], the answer needs to be an option like (a), (b), (c), (d)." \
    "You need to output the answer in your final sentence like 'Therefore, the answer is ...'."
TheoremQA_prompt2 = f"Below is an instruction that describes a task, paired with an input that provides further context. " \
    f"Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n"
TheoremQA_prompt1 = (
    "Please read a math problem, and then think step by step to derive the answer. The answer is decided by Answer Type. "
    "If the Answer type in [bool], the answer needs to be True or False. "
    "Else if the Answer type in [integer, float] , The answer needs to be in numerical form. "
    "Else if the Answer type in [list of integer, list of float] , the answer needs to be a list of number like [2, 3, 4]. "
    "Else if the Answer type in [option], the answer needs to be an option like (a), (b), (c), (d)."
    "You need to output the answer in your final sentence like 'Therefore, the answer is ...'."
)
TheoremQA_prompt2 = (
    f"Below is an instruction that describes a task, paired with an input that provides further context. "
    f"Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n"
)

TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=TheoremQA_prompt2),
    prompt_template=dict(type=PromptTemplate, template=TheoremQA_prompt2),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

TheoremQA_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=TheoremQA_postprocess))
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))

TheoremQA_datasets = [
    dict(
        abbr='TheoremQA',
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg)
        eval_cfg=TheoremQA_eval_cfg,
    )
]
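As a side note on how these templates are filled: the {Question} and {Answer_type} fields in TheoremQA_prompt2 are substituted from the reader's input_columns for each test row. OpenCompass's PromptTemplate does this internally; the plain-Python illustration below (sample row invented for the example, assuming the parenthesized TheoremQA_prompt2 defined above) only shows the effect:

# Illustration only: PromptTemplate performs the real substitution inside OpenCompass.
row = {"Question": "What is 2 + 2?", "Answer_type": "integer"}  # invented sample row
filled_prompt = TheoremQA_prompt2.format(**row)
# filled_prompt now ends with "### Input:\nWhat is 2 + 2?\nAnswer_type:integer\n### Response:\n"
print(filled_prompt)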
30 changes: 13 additions & 17 deletions configs/datasets/TheoremQA/TheoremQA_gen_7009de.py
@@ -4,45 +4,41 @@
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess

TheoremQA_reader_cfg = dict(
    input_columns=['Question', 'Answer_type'],
    output_column='Answer',
    train_split='test')
TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = 'Question: {Question}\nLet\'s think step by step.'
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."

TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt=TheoremQA_prompt1),
                dict(role="SYSTEM", fallback_role="HUMAN", prompt=TheoremQA_prompt1),
            ],
            round=[
                dict(role='HUMAN', prompt=TheoremQA_prompt2),
            ])),
                dict(role="HUMAN", prompt=TheoremQA_prompt2),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

TheoremQA_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=TheoremQA_postprocess))
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))

TheoremQA_datasets = [
    dict(
        abbr='TheoremQA',
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg)
        eval_cfg=TheoremQA_eval_cfg,
    )
]
41 changes: 24 additions & 17 deletions configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py
@@ -4,34 +4,41 @@
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess

TheoremQA_reader_cfg = dict(
    input_columns=['Question', 'Answer_type'],
    output_column='Answer',
    train_split='test')
TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."

TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:\n1. a numerical value like 0.1, no symbol and no unit at all.\n2. a list of number like [2, 3, 4].\n3. True/False.\n4. an option like (a), (b), (c), (d)\nQuestion: {Question}\nLet\'s think step by step."""
            ),
        ])),
        template=dict(
            round=[
                dict(
                    role="HUMAN",
                    prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

TheoremQA_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=TheoremQA_postprocess))
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))

TheoremQA_datasets = [
    dict(
        abbr='TheoremQA',
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg)
        eval_cfg=TheoremQA_eval_cfg,
    )
]
38 changes: 38 additions & 0 deletions configs/datasets/TheoremQA/TheoremQA_post_v2_gen_2c2583.py
@@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2

TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."

TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=TheoremQA_prompt1 + TheoremQA_prompt2,
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# A proper evaluator would need an LLM to extract the answer; this evaluation logic also yields a fair number of false negatives (FN).
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))

TheoremQA_datasets = [
    dict(
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg,
    )
]
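TheoremQA_postprocess_v2 is expected to pull the final answer out of the model's free-form reasoning before AccEvaluator compares it with the reference; as the comment above notes, a fully reliable extractor would need an LLM, so a rule-based one leaves false negatives. A rough, purely illustrative sketch of that kind of rule-based extraction (not the implementation shipped in opencompass.datasets):

def extract_final_answer(text: str) -> str:
    # Illustrative only: take whatever follows the last "the answer is" marker.
    marker = "the answer is"
    idx = text.lower().rfind(marker)
    if idx == -1:
        return text.strip()
    return text[idx + len(marker):].strip().rstrip(".")

print(extract_final_answer("Let's compute step by step... Therefore, the answer is 0.1."))  # -> 0.1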
45 changes: 45 additions & 0 deletions configs/datasets/TheoremQA/TheoremQA_post_v2_gen_ef26ca.py
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2

TheoremQA_reader_cfg = dict(input_columns=["Question", "Answer_type"], output_column="Answer", train_split="test")

TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."

TheoremQA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role="HUMAN",
                    prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# A proper evaluator would need an LLM to extract the answer; this evaluation logic also yields a fair number of false negatives (FN).
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))

TheoremQA_datasets = [
    dict(
        abbr="TheoremQA",
        type=TheoremQADataset,
        path="./data/TheoremQA/test.csv",
        reader_cfg=TheoremQA_reader_cfg,
        infer_cfg=TheoremQA_infer_cfg,
        eval_cfg=TheoremQA_eval_cfg,
    )
]