Add Programming to Foundry (#441)

* add programming to the gauntlet * fix pre-commit * add to builders * remove programming from gauntlet yaml before v0.1 * add language to human eval jsonl * test multilingual * add data * fix typo * revert yamls * add beams to yaml * final fixes * final fixes * upgrade composer * change beam #s * upgrade tensorbaord * remove c dataset: --------- Co-authored-by: bcui19 <[email protected]>
mosaicml · Aug 24, 2023 · 52a3500 · 52a3500
1 parent 2f30418
commit 52a3500
Show file tree

Hide file tree

Showing 10 changed files with 546 additions and 4 deletions.
diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
@@ -8,7 +8,8 @@
 
 # required for loading a python model into composer
 import transformers
-from composer.metrics.nlp import (InContextLearningLMAccuracy,
+from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy,
+                                  InContextLearningLMAccuracy,
                                   InContextLearningLMExpectedCalibrationError,
                                   InContextLearningMCExpectedCalibrationError,
                                   InContextLearningMultipleChoiceAccuracy,
@@ -74,6 +75,7 @@ def __init__(
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
             InContextLearningQAAccuracy(),
+            InContextLearningCodeEvalAccuracy(),
             InContextLearningLMExpectedCalibrationError(),
             InContextLearningMCExpectedCalibrationError()
         ]

diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
@@ -13,7 +13,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from composer.metrics import (InContextLearningLMAccuracy,
+from composer.metrics import (InContextLearningCodeEvalAccuracy,
+                              InContextLearningLMAccuracy,
                               InContextLearningLMExpectedCalibrationError,
                               InContextLearningMCExpectedCalibrationError,
                               InContextLearningMultipleChoiceAccuracy,
@@ -698,6 +699,7 @@ def __init__(
             InContextLearningLMAccuracy(),
             InContextLearningMultipleChoiceAccuracy(),
             InContextLearningQAAccuracy(),
+            InContextLearningCodeEvalAccuracy(),
             InContextLearningLMExpectedCalibrationError(),
             InContextLearningMCExpectedCalibrationError(),
         ]

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
@@ -174,6 +174,8 @@ def _validate_cfg(icl_cfg: DictConfig):
                 ]
             elif icl_cfg.icl_task_type == 'question_answering':
                 icl_cfg.metric_names = ['InContextLearningQAAccuracy']
+            elif icl_cfg.icl_task_type == 'code_evaluation':
+                icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
             else:
                 raise ValueError(
                     f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.'
@@ -189,6 +191,8 @@ def _validate_cfg(icl_cfg: DictConfig):
             icl_cfg.max_seq_len = default_max_seq_len
         if 'batch_size' not in icl_cfg:
             icl_cfg.batch_size = default_batch_size
+        if 'num_beams' not in icl_cfg:
+            icl_cfg.num_beams = 1
 
     for icl_cfg in icl_tasks_list:
         _validate_cfg(icl_cfg)
@@ -218,6 +222,7 @@ def _validate_cfg(icl_cfg: DictConfig):
                 example_delimiter=icl_cfg.example_delimiter,
                 continuation_delimiter=icl_cfg.continuation_delimiter,
                 destination_path=destination_path,
+                generations_per_sample=icl_cfg.num_beams,
                 has_categories=icl_cfg.get('has_categories', False),
             )
             if hasattr(

diff --git a/scripts/eval/local_data/programming/human_eval.jsonl b/scripts/eval/local_data/programming/human_eval.jsonl
diff --git a/scripts/eval/local_data/programming/processed_humaneval_c++.jsonl b/scripts/eval/local_data/programming/processed_humaneval_c++.jsonl
diff --git a/scripts/eval/local_data/programming/processed_humaneval_c.jsonl b/scripts/eval/local_data/programming/processed_humaneval_c.jsonl
@@ -0,0 +1,9 @@
+{"task_id": "C/1", "prompt": "/*\nGiven a positive floating point number, it can be decomposed into\nand integer part (largest integer smaller than given number) and decimals\n(leftover part always smaller than 1).\n\nReturn the decimal part of the number.\n>>> truncate_number(3.5)\n0.5\n*/\n#include<stdio.h>\n#include<math.h>\nfloat truncate_number(float number){\n", "canonical_solution": "    return number-(int)(number);\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (truncate_number(3.5) == 0.5); \n assert (abs(truncate_number(1.33) - 0.33) < 1e-4);\n  assert (abs(truncate_number(123.456) - 0.456) < 1e-4);\n}", "entry_point": "truncate_number", "test_inputs": ["3.5", "1.33", "123.456"], "test_outputs": ["0.5", "0.33", "0.456"], "language": "c"}
+{"task_id": "C/2", "prompt": "/*\nReturn a greatest common divisor of two integers a and b\n>>> greatest_common_divisor(3, 5)\n1\n>>> greatest_common_divisor(25, 15)\n5\n*/\n#include<stdio.h>\n#include<stdbool.h>\nint greatest_common_divisor(int a, int b){\n", "canonical_solution": "    int out,m;\n    while (true){\n        if (a<b) \n        {\n            m=a;a=b;b=m;\n        }\n        a=a%b;\n        if (a==0) return b;\n    }\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (greatest_common_divisor(3, 7) == 1);\n     assert (greatest_common_divisor(10, 15) == 5);\n      assert (greatest_common_divisor(49, 14) == 7);\n     assert (greatest_common_divisor(144, 60) == 12);\n}\n", "entry_point": "greatest_common_divisor", "test_inputs": ["3, 7", "10, 15", "49, 14", "144, 60"], "test_outputs": ["1", "5", "7", "12"], "language": "c"}
+{"task_id": "C/3", "prompt": "/*\nFor a given number n, find the largest number that divides n evenly, smaller than n\n>>> largest_divisor(15)\n5\n*/\n#include<stdio.h>\nint largest_divisor(int n){\n", "canonical_solution": "    for (int i=2;i*i<=n;i++)\n        if (n%i==0) return  n/i;\n    return 1;\n\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (largest_divisor(3) == 1);\n    assert (largest_divisor(7) == 1);\n    assert (largest_divisor(10) == 5);\n    assert (largest_divisor(100) == 50);\n    assert (largest_divisor(49) == 7);\n}\n", "entry_point": "largest_divisor", "test_inputs": ["3", "7", "10", "100", "49"], "test_outputs": ["1", "1", "5", "50", "7"], "language": "c"}
+{"task_id": "C/4", "prompt": "/*\nReturn true if a given number is prime, and false otherwise.\n>>> is_prime(6)\nfalse\n>>> is_prime(101)\ntrue\n>>> is_prime(11)\ntrue\n>>> is_prime(13441)\ntrue\n>>> is_prime(61)\ntrue\n>>> is_prime(4)\nfalse\n>>> is_prime(1)\nfalse\n*/\n#include<stdio.h>\n#include<stdbool.h>\nbool is_prime(long long n){\n", "canonical_solution": "    if (n<2) return false;\n    for (long long i=2;i*i<=n;i++)\n        if (n%i==0) return false;\n    return true;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (is_prime(6) == false);\n    assert (is_prime(101) == true);\n    assert (is_prime(11) == true);\n    assert (is_prime(13441) == true);\n    assert (is_prime(61) == true);\n    assert (is_prime(4) == false);\n    assert (is_prime(1) == false);\n    assert (is_prime(5) == true);\n    assert (is_prime(11) == true);\n    assert (is_prime(17) == true);\n    assert (is_prime(5 * 17) == false);\n    assert (is_prime(11 * 7) == false);\n    assert (is_prime(13441 * 19) == false);\n}\n", "entry_point": "is_prime", "test_inputs": ["6", "101", "11", "13441", "61", "4", "1", "5", "11", "17", "5 * 17", "11 * 7", "13441 * 19"], "test_outputs": ["false", "true", "true", "true", "true", "false", "false", "true", "true", "true", "false", "false", "false"], "language": "c"}
+{"task_id": "C/5", "prompt": "/*\nReturn the number of times the digit 7 appears in integers less than n which are divisible by 11 or 13.\n>>> fizz_buzz(50)\n0\n>>> fizz_buzz(78)\n2\n>>> fizz_buzz(79)\n3\n*/\n#include<stdio.h>\nint fizz_buzz(int n){\n", "canonical_solution": "    int count=0;\n    for (int i=0;i<n;i++)\n    if (i%11==0 || i%13==0)\n    {\n        int q=i;\n        while (q>0)\n        {\n            if (q%10==7) count+=1;\n            q=q/10;\n        }\n    } \n    return count;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (fizz_buzz(50) == 0);\n    assert (fizz_buzz(78) == 2);\n    assert (fizz_buzz(79) == 3);\n    assert (fizz_buzz(100) == 3);\n    assert (fizz_buzz(200) == 6);\n    assert (fizz_buzz(4000) == 192);\n    assert (fizz_buzz(10000) == 639);\n    assert (fizz_buzz(100000) == 8026);\n}\n", "entry_point": "fizz_buzz", "test_inputs": ["50", "78", "79", "100", "200", "4000", "10000", "100000"], "test_outputs": ["0", "2", "3", "3", "6", "192", "639", "8026"], "language": "c"}
+{"task_id": "C/6", "prompt": "/*\nprime_fib returns n-th number that is a Fibonacci number and it's also prime.\n>>> prime_fib(1)\n2\n>>> prime_fib(2)\n3\n>>> prime_fib(3)\n5\n>>> prime_fib(4)\n13\n>>> prime_fib(5)\n89\n*/\n#include<stdio.h>\n#include<stdbool.h>\nint prime_fib(int n){\n", "canonical_solution": "    int f1,f2,m;\n    f1=1;f2=2;\n    int count=0;\n    while (count<n)\n    {\n        f1=f1+f2;\n        m=f1;f1=f2;f2=m;\n        bool isprime=true;\n        for (int w=2;w*w<=f1;w++)\n            if (f1%w==0)\n            {\n             isprime=false; break;\n            }\n        if (isprime) count+=1;\n        if (count==n) return f1;\n    }\n\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (prime_fib(1) == 2);\n    assert (prime_fib(2) == 3);\n    assert (prime_fib(3) == 5);\n    assert (prime_fib(4) == 13);\n    assert (prime_fib(5) == 89);\n    assert (prime_fib(6) == 233);\n    assert (prime_fib(7) == 1597);\n    assert (prime_fib(8) == 28657);\n    assert (prime_fib(9) == 514229);\n    assert (prime_fib(10) == 433494437);\n}\n", "entry_point": "prime_fib", "test_inputs": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], "test_outputs": ["2", "3", "5", "13", "89", "233", "1597", "28657", "514229", "433494437"], "language": "c"}
+{"task_id": "C/7", "prompt": "/*\nImagine a road that's a perfectly straight infinitely long line.\nn cars are driving left to right;  simultaneously, a different set of n cars\nare driving right to left.   The two sets of cars start out being very far from\neach other.  All cars move in the same speed.  Two cars are said to collide\nwhen a car that's moving left to right hits a car that's moving right to left.\nHowever, the cars are infinitely sturdy and strong; as a result, they continue moving\nin their trajectory as if they did not collide.\n\nThis function outputs the number of such collisions.\n*/\n#include<stdio.h>\nint car_race_collision(int n){\n", "canonical_solution": "    return n*n;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (car_race_collision(2) == 4);\n    assert (car_race_collision(3) == 9);\n    assert (car_race_collision(4) == 16);\n    assert (car_race_collision(8) == 64);\n    assert (car_race_collision(10) == 100);\n}\n", "entry_point": "car_race_collision", "test_inputs": ["2", "3", "4", "8", "10"], "test_outputs": ["4", "9", "16", "64", "100"], "language": "c"}
+{"task_id": "C/8", "prompt": "/*\nThe Fib4 number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:\nfib4(0) -> 0\nfib4(1) -> 0\nfib4(2) -> 2\nfib4(3) -> 0\nfib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\nPlease write a function to efficiently compute the n-th element of the fib4 number sequence.  Do not use recursion.\n>>> fib4(5)\n4\n>>> fib4(6)\n8\n>>> fib4(7)\n14\n*/\n#include<stdio.h>\nint fib4(int n){\n", "canonical_solution": "    int f[100];\n    f[0]=0;\n    f[1]=0;\n    f[2]=2;\n    f[3]=0;\n    for (int i=4;i<=n;i++)\n    {\n        f[i]=f[i-1]+f[i-2]+f[i-3]+f[i-4];\n    }\n    return f[n];\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (fib4(5) == 4);\n    assert (fib4(8) == 28);\n    assert (fib4(10) == 104);\n    assert (fib4(12) == 386);\n}\n", "entry_point": "fib4", "test_inputs": ["5", "8", "10", "12"], "test_outputs": ["4", "28", "104", "386"], "language": "c"}
+{"task_id": "C/9", "prompt": "/*\nReturn 2^n modulo p (be aware of numerics).\n>>> modp(3, 5)\n3\n>>> modp(1101, 101)\n2\n>>> modp(0, 101)\n1\n>>> modp(3, 11)\n8\n>>> modp(100, 101)\n1\n*/\n#include<stdio.h>\nint modp(int n,int p){\n", "canonical_solution": "    int out=1;\n    for (int i=0;i<n;i++)\n        out=(out*2)%p;\n    return out;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n    assert (modp(3, 5) == 3);\n    assert (modp(1101, 101) == 2);\n    assert (modp(0, 101) == 1);\n    assert (modp(3, 11) == 8);\n    assert (modp(100, 101) == 1);\n    assert (modp(30, 5) == 4);\n    assert (modp(31, 5) == 3);\n}\n", "entry_point": "modp", "test_inputs": ["3, 5", "1101, 101", "0, 101", "3, 11", "100, 101", "30, 5", "31, 5"], "test_outputs": ["3", "2", "1", "8", "1", "4", "3"], "language": "c"}