Skip to content

Commit

Permalink
Update PRM series dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
liushz committed Nov 11, 2024
1 parent 75ab00b commit e375303
Show file tree
Hide file tree
Showing 71 changed files with 752 additions and 163 deletions.
163 changes: 0 additions & 163 deletions .pre-commit-config-zh-cn.yaml

This file was deleted.

Empty file modified configs/datasets/bbh/lib_prompt/boolean_expressions.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/causal_judgement.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/date_understanding.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/disambiguation_qa.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/dyck_languages.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/formal_fallacies.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/geometric_shapes.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/hyperbaton.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/movie_recommendation.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/multistep_arithmetic_two.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/navigate.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/object_counting.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/penguins_in_a_table.txt
100755 → 100644
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/ruin_names.txt
100755 → 100644
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/snarks.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/sports_understanding.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/temporal_sequences.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/web_of_lies.txt
100755 → 100644
Empty file.
Empty file modified configs/datasets/bbh/lib_prompt/word_sorting.txt
100755 → 100644
Empty file.
Empty file modified docs/zh_cn/cp_origin_docs.sh
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/dyck_languages.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/hyperbaton.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/navigate.txt
100755 → 100644
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/object_counting.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/ruin_names.txt
100755 → 100644
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/snarks.txt
100755 → 100644
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/web_of_lies.txt
100755 → 100644
Empty file.
Empty file modified opencompass/configs/datasets/bbh/lib_prompt/word_sorting.txt
100755 → 100644
Empty file.
76 changes: 76 additions & 0 deletions opencompass/configs/datasets/prm800k/prm800k_gen_v01.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import PRM800kDataset, PRM800kEvaluator


# Zero-shot judge prompt for PRM800k (v01). The model is told the final answer
# is correct and must locate the first erroneous intermediate step, replying in
# a fixed "First Error Step Number / Detailed Correct/Error Explanation" layout.
# `{question}` and `{steps}` are filled in from the dataset reader columns
# declared in `prm800k_datasets` below.
# NOTE(review): this text is sent to the model verbatim (typos included).
# PRM800kEvaluator presumably parses the "First Error Step Number:" line from
# the output — confirm before editing the wording here.
EVAl_INIT_PROMPT = """You are an Math evaluator that evaluates the response step by step. Please follow the steps below:
1. The final answer of the response is correct, but there may be some errors in the steps. Please evaluate the response step by step and identify the first step that contains an error.
2. Read the original question and response carefully, and evaluate the response based on the original question step by step
3. Identify the first step in the response that contains an error and explain the error in detail. Write in which solution step number of the response the error first occured and then explain the error in the response detaily, how it impacts the response like a judgement to the response. like:
4. If the response is correct every step, or there no serious error that affects the response, please provide a detailed explanation of why the response is correct or why the error is not serious, like:
Please reply strictly in the following format:
First Error Step Number: (None if no error, e.g. None, otherwise the step number, e.g. Step 2)
Detailed Correct/Error Explanation: (...Combine your previous content with step-by-step detailed analysis...)
[Original Question]: The original question that was asked.
{question}
[Response]: The original response.
{steps}
[Your Evaluation]: Your step-by-step evaluation of the response.
"""

# EVAl_INIT_PROMPT = """You are a Math evaluator that evaluates the response step by step. Please follow the steps below:

# 1. Read the original question and response carefully, and evaluate the response based on the original question step by step.
# 2. Identify the first step in the response that contains an error and explain the error in detail. State the solution step number at which the error first occurred, then explain the error in detail and how it impacts the response, like:
# Detailed Error Explanation: ...
# First Error Step Number: ... e.g. Step 2
# 3. If the response is correct every step, or there no serious error that affects the response, please provide a detailed explanation of why the response is correct or why the error is not serious, like:
# Detailed Correct Explanation: ...
# First Error Step Number: None

# Please reply strictly in the following format:

# Detailed Correct/Error Explanation: (...Combine your previous content with step-by-step detailed analysis...)
# First Error Step Number: ('None' if no error, otherwise the step number, e.g. 'Step 2')

# [Original Question]: The original question that was asked.
# {question}

# [Response]: The original response.
# {steps}

# [Your Evaluation]: Your step-by-step evaluation of the response.
# """



# Inference config: render the judge prompt once per sample as a single HUMAN
# turn, with no in-context examples (ZeroRetriever = zero-shot).
prm800k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt=EVAl_INIT_PROMPT)
        ])),
    retriever=dict(type=ZeroRetriever),
    # 2048 output tokens leaves room for a full step-by-step written judgement.
    inferencer=dict(type=GenInferencer, max_out_len=2048))

# Scoring config: PRM800kEvaluator compares the generated judgement against the
# gold label in the `answer` column.
prm800k_eval_cfg = dict(
    evaluator=dict(type=PRM800kEvaluator))

# Dataset entry consumed by OpenCompass: `question` and `steps` feed the prompt
# placeholders above; `answer` is the evaluation target.
# (Removed a stale commented-out debug path that pointed at a local scratch
# file, `./data/prm_new/math50.json`.)
prm800k_datasets = [
    dict(
        type=PRM800kDataset,
        abbr='PRM800k',
        path='./data/prm800k/test.jsonl',
        reader_cfg=dict(
            input_columns=['question', 'steps'],
            output_column='answer',
        ),
        infer_cfg=prm800k_infer_cfg,
        eval_cfg=prm800k_eval_cfg)
]
82 changes: 82 additions & 0 deletions opencompass/configs/datasets/prm800k/prm800k_gen_v02.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import PRM800kDataset, PRM800kEvaluator


# EVAl_INIT_PROMPT = """You are a Math evaluator that evaluates the response step by step. Please follow the steps below:
# 1. There may be some errors in the solution steps. Please evaluate the response step by step and identify the first step that contains an error.
# 2. Read the original question and response carefully, and analyze the response based on the question step by step.
# 3. Identify the first erroneous step in the response, or give None at the end of your evaluation if there is no error.
# 4. The response may be unfinished (i.e. only part of a solution); please evaluate it as well.

# Please reply strictly in the following format:
# Detailed Step-by-step Analysis: (...Step-by-step detailed analysis of the response, split by ';\n')
# First Error Step Number: (None if no error, e.g. None, otherwise the step number, e.g. Step 2)

# Example evaluation response:
# Example 1:
# Detailed Step-by-step Analysis: xxx;\nxxx;\nxxx;
# First Error Step Number: None

# Example 2:
# Detailed Step-by-step Analysis: xxx;\nxxx;\nxxx;
# First Error Step Number: Step 2

# [Original Question]: The original question that was asked.
# {question}

# [Response]: The original response.
# {steps}

# [Your Evaluation]: Your step-by-step evaluation of the response.
# Detailed Step-by-step Analysis:"""

# Zero-shot judge prompt for PRM800k (v02). Differences from v01: the model is
# NOT told the final answer is correct, and the reply format puts "First Error
# Step Number" before the explanation. `{question}` and `{steps}` are filled in
# from the dataset reader columns declared in `prm800k_datasets` below.
# NOTE(review): this text is sent to the model verbatim (typos included);
# editing it may break PRM800kEvaluator's output parsing — confirm first.
EVAl_INIT_PROMPT = """You are an Math evaluator that evaluates the response step by step. Please follow the steps below:
1. Read the original question and response carefully, and evaluate the response based on the original question step by step
2. Identify the first step in the response that contains an error and explain the error in detail. Write in which solution step number of the response the error first occured and then explain the error in the response detaily, how it impacts the response like a judgement to the response. like:
First Error Step Number: ... e.g. Step 2
Detailed Error Explanation: ...
3. If the response is correct every step, or there no serious error that affects the response, please provide a detailed explanation of why the response is correct or why the error is not serious, like:
First Error Step Number: None
Detailed Correct Explanation: ...
Please reply strictly in the following format:
First Error Step Number: ('None' if no error, otherwise the step number, e.g. 'Step 2')
Detailed Correct/Error Explanation: (...Combine your previous content with step-by-step detailed analysis...)
[Original Question]: The original question that was asked.
{question}
[Response]: The original response.
{steps}
[Your Evaluation]: Your step-by-step evaluation of the response.
"""

# Single-turn zero-shot judging: one HUMAN message carrying the filled-in
# evaluation prompt, with no retrieved in-context examples.
_judge_round = [dict(role='HUMAN', prompt=EVAl_INIT_PROMPT)]

prm800k_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=_judge_round),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048),
)

prm800k_eval_cfg = dict(evaluator=dict(type=PRM800kEvaluator))

# Each sample exposes the problem text and candidate solution steps as prompt
# inputs; the gold label lives in the `answer` column.
prm800k_reader_cfg = dict(
    input_columns=['question', 'steps'],
    output_column='answer',
)

prm800k_datasets = [
    dict(
        type=PRM800kDataset,
        abbr='PRM800k',
        path='./data/prm800k/test.jsonl',
        reader_cfg=prm800k_reader_cfg,
        infer_cfg=prm800k_infer_cfg,
        eval_cfg=prm800k_eval_cfg,
    ),
]
Loading

0 comments on commit e375303

Please sign in to comment.