Skip to content

Commit

Permalink
Update PRM series dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
liushz committed Nov 11, 2024
1 parent 75ab00b commit e375303
Show file tree
Hide file tree
Showing 71 changed files with 752 additions and 163 deletions.
163 changes: 0 additions & 163 deletions .pre-commit-config-zh-cn.yaml

This file was deleted.

Empty file modified configs/datasets/bbh/lib_prompt/boolean_expressions.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/causal_judgement.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/date_understanding.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/disambiguation_qa.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/dyck_languages.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/formal_fallacies.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/geometric_shapes.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/hyperbaton.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/movie_recommendation.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/multistep_arithmetic_two.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/navigate.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/object_counting.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/penguins_in_a_table.txt
100755 → 100644
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/ruin_names.txt
100755 → 100644
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/snarks.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/sports_understanding.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/temporal_sequences.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/web_of_lies.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/word_sorting.txt
100755 → 100644
Empty file.
Empty file modified docs/zh_cn/cp_origin_docs.sh
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/dyck_languages.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/hyperbaton.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/navigate.txt
100755 → 100644
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/object_counting.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/ruin_names.txt
100755 → 100644
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/snarks.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/web_of_lies.txt
100755 → 100644
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/word_sorting.txt
100755 → 100644
Empty file.
76 changes: 76 additions & 0 deletions opencompass/configs/datasets/prm800k/prm800k_gen_v01.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import PRM800kDataset, PRM800kEvaluator


# Zero-shot judge prompt for PRM800k (v01). The model is told the final answer
# is correct and must locate the first erroneous intermediate step, replying in
# a fixed "First Error Step Number / Detailed Correct/Error Explanation" layout.
# `{question}` and `{steps}` are filled in from the dataset reader columns
# declared in `prm800k_datasets` below.
# NOTE(review): this text is sent to the model verbatim (typos included).
# PRM800kEvaluator presumably parses the "First Error Step Number:" line from
# the output — confirm before editing the wording here.
EVAl_INIT_PROMPT = """You are an Math evaluator that evaluates the response step by step. Please follow the steps below:
1. The final answer of the response is correct, but there may be some errors in the steps. Please evaluate the response step by step and identify the first step that contains an error.
2. Read the original question and response carefully, and evaluate the response based on the original question step by step
3. Identify the first step in the response that contains an error and explain the error in detail. Write in which solution step number of the response the error first occured and then explain the error in the response detaily, how it impacts the response like a judgement to the response. like:
4. If the response is correct every step, or there no serious error that affects the response, please provide a detailed explanation of why the response is correct or why the error is not serious, like:
Please reply strictly in the following format:
First Error Step Number: (None if no error, e.g. None, otherwise the step number, e.g. Step 2)
Detailed Correct/Error Explanation: (...Combine your previous content with step-by-step detailed analysis...)
[Original Question]: The original question that was asked.
{question}
[Response]: The original response.
{steps}
[Your Evaluation]: Your step-by-step evaluation of the response.
"""

# EVAl_INIT_PROMPT = """You are a Math evaluator that evaluates the response step by step. Please follow the steps below:

# 1. Read the original question and response carefully, and evaluate the response based on the original question step by step.
# 2. Identify the first step in the response that contains an error and explain the error in detail. State the solution step number at which the error first occurred, then explain the error in detail and how it impacts the response, like:
# Detailed Error Explanation: ...
# First Error Step Number: ... e.g. Step 2
# 3. If the response is correct every step, or there no serious error that affects the response, please provide a detailed explanation of why the response is correct or why the error is not serious, like:
# Detailed Correct Explanation: ...
# First Error Step Number: None

# Please reply strictly in the following format:

# Detailed Correct/Error Explanation: (...Combine your previous content with step-by-step detailed analysis...)
# First Error Step Number: ('None' if no error, otherwise the step number, e.g. 'Step 2')

# [Original Question]: The original question that was asked.
# {question}

# [Response]: The original response.
# {steps}

# [Your Evaluation]: Your step-by-step evaluation of the response.
# """



# Inference config: render the judge prompt once per sample as a single HUMAN
# turn, with no in-context examples (ZeroRetriever = zero-shot).
prm800k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt=EVAl_INIT_PROMPT)
        ])),
    retriever=dict(type=ZeroRetriever),
    # 2048 output tokens leaves room for a full step-by-step written judgement.
    inferencer=dict(type=GenInferencer, max_out_len=2048))

# Scoring config: PRM800kEvaluator compares the generated judgement against the
# gold label in the `answer` column.
prm800k_eval_cfg = dict(
    evaluator=dict(type=PRM800kEvaluator))

# Dataset entry consumed by OpenCompass: `question` and `steps` feed the prompt
# placeholders above; `answer` is the evaluation target.
# (Removed a stale commented-out debug path that pointed at a local scratch
# file, `./data/prm_new/math50.json`.)
prm800k_datasets = [
    dict(
        type=PRM800kDataset,
        abbr='PRM800k',
        path='./data/prm800k/test.jsonl',
        reader_cfg=dict(
            input_columns=['question', 'steps'],
            output_column='answer',
        ),
        infer_cfg=prm800k_infer_cfg,
        eval_cfg=prm800k_eval_cfg)
]
82 changes: 82 additions & 0 deletions opencompass/configs/datasets/prm800k/prm800k_gen_v02.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import PRM800kDataset, PRM800kEvaluator


# EVAl_INIT_PROMPT = """You are a Math evaluator that evaluates the response step by step. Please follow the steps below:
# 1. There may be some errors in the solution steps. Please evaluate the response step by step and identify the first step that contains an error.
# 2. Read the original question and response carefully, and analyze the response based on the question step by step.
# 3. Identify the first erroneous step in the response, or give None at the end of your evaluation if there is no error.
# 4. The response may be unfinished (i.e. only part of a solution); please evaluate it as well.

# Please reply strictly in the following format:
# Detailed Step-by-step Analysis: (...Step-by-step detailed analysis of the response, split by ';\n')
# First Error Step Number: (None if no error, e.g. None, otherwise the step number, e.g. Step 2)

# Example evaluation response:
# Example 1:
# Detailed Step-by-step Analysis: xxx;\nxxx;\nxxx;
# First Error Step Number: None

# Example 2:
# Detailed Step-by-step Analysis: xxx;\nxxx;\nxxx;
# First Error Step Number: Step 2

# [Original Question]: The original question that was asked.
# {question}

# [Response]: The original response.
# {steps}

# [Your Evaluation]: Your step-by-step evaluation of the response.
# Detailed Step-by-step Analysis:"""

# Zero-shot judge prompt for PRM800k (v02). Differences from v01: the model is
# NOT told the final answer is correct, and the reply format puts "First Error
# Step Number" before the explanation. `{question}` and `{steps}` are filled in
# from the dataset reader columns declared in `prm800k_datasets` below.
# NOTE(review): this text is sent to the model verbatim (typos included);
# editing it may break PRM800kEvaluator's output parsing — confirm first.
EVAl_INIT_PROMPT = """You are an Math evaluator that evaluates the response step by step. Please follow the steps below:
1. Read the original question and response carefully, and evaluate the response based on the original question step by step
2. Identify the first step in the response that contains an error and explain the error in detail. Write in which solution step number of the response the error first occured and then explain the error in the response detaily, how it impacts the response like a judgement to the response. like:
First Error Step Number: ... e.g. Step 2
Detailed Error Explanation: ...
3. If the response is correct every step, or there no serious error that affects the response, please provide a detailed explanation of why the response is correct or why the error is not serious, like:
First Error Step Number: None
Detailed Correct Explanation: ...
Please reply strictly in the following format:
First Error Step Number: ('None' if no error, otherwise the step number, e.g. 'Step 2')
Detailed Correct/Error Explanation: (...Combine your previous content with step-by-step detailed analysis...)
[Original Question]: The original question that was asked.
{question}
[Response]: The original response.
{steps}
[Your Evaluation]: Your step-by-step evaluation of the response.
"""

# Single-turn zero-shot judging: one HUMAN message carrying the filled-in
# evaluation prompt, with no retrieved in-context examples.
_judge_round = [dict(role='HUMAN', prompt=EVAl_INIT_PROMPT)]

prm800k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=_judge_round),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048),
)

prm800k_eval_cfg = dict(evaluator=dict(type=PRM800kEvaluator))

# Each sample exposes the problem text and candidate solution steps as prompt
# inputs; the gold label lives in the `answer` column.
prm800k_reader_cfg = dict(
    input_columns=['question', 'steps'],
    output_column='answer',
)

prm800k_datasets = [
    dict(
        type=PRM800kDataset,
        abbr='PRM800k',
        path='./data/prm800k/test.jsonl',
        reader_cfg=prm800k_reader_cfg,
        infer_cfg=prm800k_infer_cfg,
        eval_cfg=prm800k_eval_cfg,
    ),
]
Loading

0 comments on commit e375303

Please sign in to comment.