Created new task for testing Llama on Asdiv #2236

Merged (12 commits) on Aug 23, 2024
142 changes: 142 additions & 0 deletions lm_eval/api/metrics.py
@@ -5,6 +5,7 @@
import string
from collections.abc import Iterable
from typing import List
import signal

import numpy as np
import sacrebleu
@@ -373,6 +374,147 @@ def acc_all(items):
    acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
    return acc

@register_metric(
    metric="compiles@1",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="mean",
)
def compiles_at_1_metric(predictions, references=None, **kwargs):
    """
    Checks whether the Python code in the `predictions` compiles without any syntax errors,
    including the associated assert statements from the `references`.
    Returns the ratio of successfully compiled code to the total number of items.
    """

    success_count = 0

    for pred, ref in zip(predictions, references):
        try:
            # Extract the code snippet from the prediction
            code_snippet = pred.strip().split("```python")[1].split("```")[0].strip()

            # Append the assert statements from the references to the code snippet
            code_with_asserts = code_snippet + "\n" + ref.strip()

            # Try to compile the combined code snippet with assert statements
            compile(code_with_asserts, "<string>", "exec")

            # If compilation is successful, increment the success count
            success_count += 1

        except Exception as e:
            # Log the error and continue
            eval_logger.error(f"Compilation failed, error: {e}")
            continue

    return {"compiles@1": success_count / len(predictions)}

@register_metric(
    metric="pass@1",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="mean",
)
def pass_at_1_metric(predictions, references=None, **kwargs):
    """
    Executes the Python code provided in the `predictions` and checks whether it passes
    all test cases from the `references`, aborting any run that exceeds the configured
    timeout. Returns the ratio of successful executions to the total number of items.
    """

    timeout = kwargs.get("timeout", 5.0)

    def handler(signum, frame):
        raise TimeoutError("Execution timed out")

    success_count = 0

    for pred, ref in zip(predictions, references):
        try:
            # Extract the code snippet from the prediction
            code_snippet = pred.strip().split("```python")[1].split("```")[0].strip()

            # Extract the assert statements from the references
            assert_statements = ref.strip().split("\n")

            # Set the timeout handler (SIGALRM is only available on Unix)
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(int(timeout))

            # Shared namespace so the asserts can see the names defined by the
            # generated code, and so recursive/helper definitions resolve correctly
            namespace = {}

            try:
                # Execute the generated code
                exec(code_snippet, namespace)

                # Run through each assertion in the references
                for assertion in assert_statements:
                    exec(assertion, namespace)

                # If no exception is raised, consider it a success
                success_count += 1

            except TimeoutError as e:
                eval_logger.error(f"Execution timed out: {e}")
            finally:
                signal.alarm(0)  # Disable the alarm

        except Exception as e:
            # Log the error and continue
            eval_logger.error(f"Execution failed, error: {e}")
            continue

    return {"pass@1": success_count / len(predictions)}

def acc_all_stderr(items):
    # Only count as correct if all answers are labeled correctly for each question
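For reference, here is a minimal sketch of how the two new metrics behave on a toy prediction/reference pair. It assumes a checkout containing this patch is installed (so the functions are importable from lm_eval.api.metrics) and that it runs on a Unix system, since the pass@1 timeout relies on SIGALRM; the example strings are invented.

```python
from lm_eval.api.metrics import compiles_at_1_metric, pass_at_1_metric

# A model completion wrapped in the markdown fence the metrics expect,
# and a reference made of newline-separated assert statements.
prediction = "```python\ndef add(a, b):\n    return a + b\n```"
reference = "assert add(1, 2) == 3\nassert add(0, 0) == 0"

print(compiles_at_1_metric([prediction], [reference]))           # {'compiles@1': 1.0}
print(pass_at_1_metric([prediction], [reference], timeout=2.0))  # {'pass@1': 1.0}
```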
88 changes: 88 additions & 0 deletions lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
@@ -0,0 +1,88 @@
dataset_path: EleutherAI/asdiv
doc_to_target: "{{answer.split(' (')[0] if answer is defined else target}}"
doc_to_text: "Given the following problem, reason and give a final answer to the problem.\nProblem: {{body if body is defined}} {{question}}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem.\n"
fewshot_config:
  sampler: first_n
  samples:
    - question: There are 15 trees in the grove. Grove workers will plant trees in the
        grove today. After they are done, there will be 21 trees. How many trees did
        the grove workers plant today?
      target: There are 15 trees originally. Then there were 21 trees after some more
        were planted. So there must have been 21 - 15 = 6. The final answer is 6
    - question: If there are 3 cars in the parking lot and 2 more cars arrive, how many
        cars are in the parking lot?
      target: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The final answer
        is 5
    - question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many
        pieces do they have left in total?
      target: Originally, Leah had 32 chocolates. Her sister had 42. So in total they
        had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The final answer is 39
    - question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12
        lollipops. How many lollipops did Jason give to Denny?
      target: Jason started with 20 lollipops. Then he had 12 after giving some to Denny.
        So he gave Denny 20 - 12 = 8. The final answer is 8
    - question: Shawn has five toys. For Christmas, he got two toys each from his mom and
        dad. How many toys does he have now?
      target: Shawn started with 5 toys. If he got 2 toys each from his mom and dad,
        then that is 4 more toys. 5 + 4 = 9. The final answer is 9
    - question: There were nine computers in the server room. Five more computers were
        installed each day, from monday to thursday. How many computers are now in the
        server room?
      target: There were originally 9 computers. For each of 4 days, 5 more computers
        were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The final answer is
        29
    - question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday,
        he lost 2 more. How many golf balls did he have at the end of wednesday?
      target: Michael started with 58 golf balls. After losing 23 on tuesday, he had
        58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The final answer
        is 33
    - question: Olivia has $23. She bought five bagels for $3 each. How much money does
        she have left?
      target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
        dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
filter_list:
  - filter:
      - function: regex
        group_select: -1
        regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
      - function: take_first
    name: strict-match
  - filter:
      - function: regex
        group_select: -1
        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
      - function: take_first
    name: flexible-extract
generation_kwargs:
  do_sample: false
  until:
    - '<|eot_id|>'
    - '<|start_header_id|>user<|end_header_id|>'
    - 'Q:'
    - </s>
    - <|im_end|>
tag:
  - chain_of_thought
metadata:
  version: 1.0
metric_list:
  - aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    metric: exact_match
    regexes_to_ignore:
      - ','
      - \$
      - '(?s).*#### '
      - \.$
num_fewshot: 8
output_type: generate_until
repeats: 1
task: asdiv_cot_llama
validation_split: validation
test_split: validation
should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}"
dataset_kwargs:
  trust_remote_code: true
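As a rough illustration of what the strict-match filter above is meant to extract, here is a small sketch that applies just the regex from the YAML to an invented model response; it bypasses the harness's actual filter pipeline, so treat it only as a reading aid.

```python
import re

# Regex copied from the strict-match filter above.
pattern = r"The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))"
response = (
    "There are 15 trees originally and 21 afterwards, "
    "so 21 - 15 = 6 were planted. The final answer is 6"
)

matches = re.findall(pattern, response)
# group_select: -1 is intended to keep the last match in the response;
# take_first then keeps the first non-empty capture group.
print(matches[-1][0])  # -> 6
```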
74 changes: 74 additions & 0 deletions lm_eval/tasks/mbpp/mbpp.yaml
@@ -0,0 +1,74 @@
dataset_name: full
dataset_path: google-research-datasets/mbpp
doc_to_target: |
  {{ target if target is defined else test_list | join('\n') }}
doc_to_text: |
  You are an expert Python programmer, and here is your task:
  {{ text }}
  Your code should pass the following tests:
  {{ test_list | join('\n') }}
fewshot_config:
  sampler: first_n
  samples:
    - text: Write a function to find the similar elements from the given two tuple lists.
      target: |
        ```python
        def similar_elements(test_tup1, test_tup2):
            res = tuple(set(test_tup1) & set(test_tup2))
            return (res)
        ```
      test_list:
        - assert similar_elements((3, 4, 5, 6), (5, 7, 4, 10)) == (4, 5)
        - assert similar_elements((1, 2, 3, 4), (5, 4, 3, 7)) == (3, 4)
        - assert similar_elements((11, 12, 14, 13), (17, 15, 14, 13)) == (13, 14)
    - text: Write a python function to identify non-prime numbers.
      target: |
        ```python
        import math
        def is_not_prime(n):
            result = False
            for i in range(2, int(math.sqrt(n)) + 1):
                if n % i == 0:
                    result = True
            return result
        ```
      test_list:
        - assert is_not_prime(2) == False
        - assert is_not_prime(10) == True
        - assert is_not_prime(35) == True
    - text: Write a function to find the largest integers from a given list of numbers using heap queue algorithm.
      target: |
        ```python
        import heapq as hq
        def heap_queue_largest(nums, n):
            largest_nums = hq.nlargest(n, nums)
            return largest_nums
        ```
      test_list:
        - assert heap_queue_largest([25, 35, 22, 85, 14, 65, 75, 22, 58], 3) == [85, 75, 65]
        - assert heap_queue_largest([25, 35, 22, 85, 14, 65, 75, 22, 58], 2) == [85, 75]
        - assert heap_queue_largest([25, 35, 22, 85, 14, 65, 75, 22, 58], 5) == [85, 75, 65, 58, 35]
generation_kwargs:
  do_sample: false
  until:
    - 'Q:'
    - '</s>'
    - '<|eot_id|>'
    - '<|start_header_id|>user<|end_header_id|>'
tag:
  - mbpp
metadata:
  version: 1.0
metric_list:
  - aggregation: mean
    higher_is_better: true
    metric: compiles@1
  - aggregation: mean
    higher_is_better: true
    timeout: 2.0
    metric: pass@1
num_fewshot: 3
output_type: generate_until
repeats: 1
task: mbpp
test_split: test
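To make the prompt format concrete, here is a rough sketch of what the doc_to_text template above renders for a single record. It assumes jinja2 is available and uses an invented MBPP-style example; the harness's internal template wiring is not reproduced here.

```python
from jinja2 import Template

# The doc_to_text template from the YAML above, as a plain string.
doc_to_text = (
    "You are an expert Python programmer, and here is your task:\n"
    "{{ text }}\n"
    "Your code should pass the following tests:\n"
    "{{ test_list | join('\\n') }}\n"
)

# Invented record carrying the fields the template expects.
doc = {
    "text": "Write a function to add two numbers.",
    "test_list": ["assert add(1, 2) == 3", "assert add(0, 0) == 0"],
}

print(Template(doc_to_text).render(**doc))
```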