Created new task for testing Llama on Asdiv #2236

Merged (12 commits) on Aug 23, 2024
142 changes: 142 additions & 0 deletions lm_eval/api/metrics.py
@@ -5,6 +5,7 @@
import string
from collections.abc import Iterable
from typing import List
import signal

import numpy as np
import sacrebleu
@@ -373,6 +374,147 @@ def acc_all(items):
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc

@register_metric(
    metric="compiles@1",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="mean",
)
def compiles_at_1_metric(predictions, references=None, **kwargs):
    """
    Checks whether the Python code in the `predictions` compiles without any syntax errors,
    including the associated assert statements from the `references`.
    Returns the ratio of successfully compiled code to the total number of items.
    """

    success_count = 0

    for pred, ref in zip(predictions, references):
        try:
            # Extract the code snippet from the prediction
            code_snippet = pred.strip().split("```python")[1].split("```")[0].strip()

            # Append the assert statements from the references to the code snippet
            code_with_asserts = code_snippet + "\n" + ref.strip()

            # Try to compile the combined code snippet with assert statements
            compile(code_with_asserts, "<string>", "exec")

            # If compilation is successful, increment the success count
            success_count += 1

        except Exception as e:
            # Log the error and continue
            eval_logger.error(f"Compilation failed, error: {e}")
            continue

    return {"compiles@1": success_count / len(predictions)}

@register_metric(
    metric="pass@1",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="mean",
)
def pass_at_1_metric(predictions, references=None, **kwargs):
    """
    Executes the Python code provided in the `predictions` and checks whether it passes
    all test cases from the `references`, aborting any run that exceeds the configured
    timeout. Returns the ratio of successful executions to the total number of items.
    """

    timeout = kwargs.get("timeout", 5.0)

    def handler(signum, frame):
        raise TimeoutError("Execution timed out")

    success_count = 0

    for pred, ref in zip(predictions, references):
        try:
            # Extract the code snippet from the prediction
            code_snippet = pred.strip().split("```python")[1].split("```")[0].strip()

            # Extract the assert statements from the references
            assert_statements = ref.strip().split("\n")

            # Set the timeout handler (SIGALRM is only available on Unix)
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(int(timeout))

            # Shared namespace so the asserts can see the names defined by the
            # generated code, and so recursive/helper definitions resolve correctly
            namespace = {}

            try:
                # Execute the generated code
                exec(code_snippet, namespace)

                # Run through each assertion in the references
                for assertion in assert_statements:
                    exec(assertion, namespace)

                # If no exception is raised, consider it a success
                success_count += 1

            except TimeoutError as e:
                eval_logger.error(f"Execution timed out: {e}")
            finally:
                signal.alarm(0)  # Disable the alarm

        except Exception as e:
            # Log the error and continue
            eval_logger.error(f"Execution failed, error: {e}")
            continue

    return {"pass@1": success_count / len(predictions)}

def acc_all_stderr(items):
    # Only count as correct if all answers are labeled correctly for each question
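For reference, here is a minimal sketch of how the two new metrics behave on a toy prediction/reference pair. It assumes a checkout containing this patch is installed (so the functions are importable from lm_eval.api.metrics) and that it runs on a Unix system, since the pass@1 timeout relies on SIGALRM; the example strings are invented.

```python
from lm_eval.api.metrics import compiles_at_1_metric, pass_at_1_metric

# A model completion wrapped in the markdown fence the metrics expect,
# and a reference made of newline-separated assert statements.
prediction = "```python\ndef add(a, b):\n    return a + b\n```"
reference = "assert add(1, 2) == 3\nassert add(0, 0) == 0"

print(compiles_at_1_metric([prediction], [reference]))           # {'compiles@1': 1.0}
print(pass_at_1_metric([prediction], [reference], timeout=2.0))  # {'pass@1': 1.0}
```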
88 changes: 88 additions & 0 deletions lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
@@ -0,0 +1,88 @@
dataset_path: EleutherAI/asdiv
doc_to_target: "{{answer.split(' (')[0] if answer is defined else target}}"
doc_to_text: "Given the following problem, reason and give a final answer to the problem.\nProblem: {{body if body is defined}} {{question}}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem.\n"
fewshot_config:
  sampler: first_n
  samples:
    - question: There are 15 trees in the grove. Grove workers will plant trees in the
        grove today. After they are done, there will be 21 trees. How many trees did
        the grove workers plant today?
      target: There are 15 trees originally. Then there were 21 trees after some more
        were planted. So there must have been 21 - 15 = 6. The final answer is 6
    - question: If there are 3 cars in the parking lot and 2 more cars arrive, how many
        cars are in the parking lot?
      target: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The final answer
        is 5
    - question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many
        pieces do they have left in total?
      target: Originally, Leah had 32 chocolates. Her sister had 42. So in total they
        had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The final answer is 39
    - question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12
        lollipops. How many lollipops did Jason give to Denny?
      target: Jason started with 20 lollipops. Then he had 12 after giving some to Denny.
        So he gave Denny 20 - 12 = 8. The final answer is 8
    - question: Shawn has five toys. For Christmas, he got two toys each from his mom and
        dad. How many toys does he have now?
      target: Shawn started with 5 toys. If he got 2 toys each from his mom and dad,
        then that is 4 more toys. 5 + 4 = 9. The final answer is 9
    - question: There were nine computers in the server room. Five more computers were
        installed each day, from monday to thursday. How many computers are now in the
        server room?
      target: There were originally 9 computers. For each of 4 days, 5 more computers
        were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The final answer is
        29
    - question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday,
        he lost 2 more. How many golf balls did he have at the end of wednesday?
      target: Michael started with 58 golf balls. After losing 23 on tuesday, he had
        58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The final answer
        is 33
    - question: Olivia has $23. She bought five bagels for $3 each. How much money does
        she have left?
      target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
        dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
filter_list:
  - filter:
      - function: regex
        group_select: -1
        regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
      - function: take_first
    name: strict-match
  - filter:
      - function: regex
        group_select: -1
        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
      - function: take_first
    name: flexible-extract
generation_kwargs:
  do_sample: false
  until:
    - '<|eot_id|>'
    - '<|start_header_id|>user<|end_header_id|>'
    - 'Q:'
    - </s>
    - <|im_end|>
tag:
  - chain_of_thought
metadata:
  version: 1.0
metric_list:
  - aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    metric: exact_match
    regexes_to_ignore:
      - ','
      - \$
      - '(?s).*#### '
      - \.$
num_fewshot: 8
output_type: generate_until
repeats: 1
task: asdiv_cot_llama
validation_split: validation
test_split: validation
should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}"
dataset_kwargs:
  trust_remote_code: true
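As a rough illustration of what the strict-match filter above is meant to extract, here is a small sketch that applies just the regex from the YAML to an invented model response; it bypasses the harness's actual filter pipeline, so treat it only as a reading aid.

```python
import re

# Regex copied from the strict-match filter above.
pattern = r"The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))"
response = (
    "There are 15 trees originally and 21 afterwards, "
    "so 21 - 15 = 6 were planted. The final answer is 6"
)

matches = re.findall(pattern, response)
# group_select: -1 is intended to keep the last match in the response;
# take_first then keeps the first non-empty capture group.
print(matches[-1][0])  # -> 6
```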
74 changes: 74 additions & 0 deletions lm_eval/tasks/mbpp/mbpp.yaml
@@ -0,0 +1,74 @@
dataset_name: full
dataset_path: google-research-datasets/mbpp
doc_to_target: |
  {{ target if target is defined else test_list | join('\n') }}
doc_to_text: |
  You are an expert Python programmer, and here is your task:
  {{ text }}
  Your code should pass the following tests:
  {{ test_list | join('\n') }}
fewshot_config:
  sampler: first_n
  samples:
    - text: Write a function to find the similar elements from the given two tuple lists.
      target: |
        ```python
        def similar_elements(test_tup1, test_tup2):
            res = tuple(set(test_tup1) & set(test_tup2))
            return (res)
        ```
      test_list:
        - assert similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) == (4, 5)
        - assert similar_elements((1, 2, 3, 4), (5, 4, 3, 7)) == (3, 4)
        - assert similar_elements((11, 12, 14, 13), (17, 15, 14, 13)) == (13, 14)
    - text: Write a python function to identify non-prime numbers.
      target: |
        ```python
        import math
        def is_not_prime(n):
            result = False
            for i in range(2, int(math.sqrt(n)) + 1):
                if n % i == 0:
                    result = True
            return result
        ```
      test_list:
        - assert is_not_prime(2) == False
        - assert is_not_prime(10) == True
        - assert is_not_prime(35) == True
    - text: Write a function to find the largest integers from a given list of numbers using heap queue algorithm.
      target: |
        ```python
        import heapq as hq
        def heap_queue_largest(nums, n):
            largest_nums = hq.nlargest(n, nums)
            return largest_nums
        ```
      test_list:
        - assert heap_queue_largest([25, 35, 22, 85, 14, 65, 75, 22, 58], 3) == [85, 75, 65]
        - assert heap_queue_largest([25, 35, 22, 85, 14, 65, 75, 22, 58], 2) == [85, 75]
        - assert heap_queue_largest([25, 35, 22, 85, 14, 65, 75, 22, 58], 5) == [85, 75, 65, 58, 35]
generation_kwargs:
  do_sample: false
  until:
    - 'Q:'
    - '</s>'
    - '<|eot_id|>'
    - '<|start_header_id|>user<|end_header_id|>'
tag:
  - mbpp
metadata:
  version: 1.0
metric_list:
  - aggregation: mean
    higher_is_better: true
    metric: compiles@1
  - aggregation: mean
    higher_is_better: true
    timeout: 2.0
    metric: pass@1
num_fewshot: 3
output_type: generate_until
repeats: 1
task: mbpp
test_split: test
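To make the prompt format concrete, here is a rough sketch of what the doc_to_text template above renders for a single record. It assumes jinja2 is available and uses an invented MBPP-style example; the harness's internal template wiring is not reproduced here.

```python
from jinja2 import Template

# The doc_to_text template from the YAML above, as a plain string.
doc_to_text = (
    "You are an expert Python programmer, and here is your task:\n"
    "{{ text }}\n"
    "Your code should pass the following tests:\n"
    "{{ test_list | join('\\n') }}\n"
)

# Invented record carrying the fields the template expects.
doc = {
    "text": "Write a function to add two numbers.",
    "test_list": ["assert add(1, 2) == 3", "assert add(0, 0) == 0"],
}

print(Template(doc_to_text).render(**doc))
```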